From 4573f00a0d3e767eb0864f1748da5503242de411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 31 May 2020 14:21:56 +0200 Subject: [PATCH 0001/1889] Initial commit --- LICENSE | 21 +++++++++++++++++++++ README.md | 2 ++ 2 files changed, 23 insertions(+) create mode 100644 LICENSE create mode 100644 README.md diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..17a0f0781 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Clément Renault + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 000000000..18e5d74b8 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# mega-mini-indexer +A prototype of concurrent indexing, only contains postings ids From 91ba938953e202d8ede5a67ec1888c03a9ab239a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 25 May 2020 20:39:53 +0200 Subject: [PATCH 0002/1889] Initial commit --- .gitignore | 1 + Cargo.lock | 749 ++++++++++++++++++++++++++++++++++ Cargo.toml | 27 ++ qc_loop.sh | 11 + src/bp_vec.rs | 197 +++++++++ src/codec/bitpacker_sorted.rs | 84 ++++ src/codec/mod.rs | 3 + src/main.rs | 186 +++++++++ 8 files changed, 1258 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100755 qc_loop.sh create mode 100644 src/bp_vec.rs create mode 100644 src/codec/bitpacker_sorted.rs create mode 100644 src/codec/mod.rs create mode 100644 src/main.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..ea8c4bf7f --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 000000000..18931ea68 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,749 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+[[package]] +name = "aho-corasick" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada" +dependencies = [ + "memchr", +] + +[[package]] +name = "anyhow" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f" + +[[package]] +name = "autocfg" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" + +[[package]] +name = "bitflags" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" + +[[package]] +name = "bitpacking" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3744aff20a3437a99ebc0bb7733e9e60c7bf590478c9b897e95b38d57e5acb68" +dependencies = [ + "crunchy", +] + +[[package]] +name = "bstr" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31accafdb70df7871592c058eca3985b71104e15ac32f64706022c58867da931" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "byteorder" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" + +[[package]] +name = "cc" +version = "1.0.54" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311" + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "clap" +version = "2.33.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129" +dependencies = [ + "bitflags", + "textwrap", + "unicode-width", +] + +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +dependencies = [ + "bitflags", +] + +[[package]] +name = "crc32fast" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", + "maybe-uninit", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "lazy_static", + "maybe-uninit", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c695eeca1e7173472a32221542ae469b3e9aac3a4fc81f7696bcad82029493db" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] 
+name = "crossbeam-utils" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" +dependencies = [ + "autocfg", + "cfg-if", + "lazy_static", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "csv" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00affe7f6ab566df61b4be3ce8cf16bc2576bca0963ceb0955e45d514bf9a279" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "either" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" + +[[package]] +name = "env_logger" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" +dependencies = [ + "log", + "regex", +] + +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "fs_extra" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674" + +[[package]] +name = "fst" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7293de202dbfe786c0b3fe6110a027836c5438ed06db7b715c9955ff4bfea51" + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "heck" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71" +dependencies = [ + "libc", +] + +[[package]] +name = "itoa" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e" + +[[package]] +name = "jemalloc-sys" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" +dependencies = [ + "cc", + "fs_extra", + "libc", +] + +[[package]] +name = "jemallocator" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" +dependencies = [ + "jemalloc-sys", + "libc", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "libc" +version = "0.2.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f" + +[[package]] +name = "lock_api" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "maybe-uninit" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" + +[[package]] +name = "mega-mini-indexer" +version = "0.1.0" +dependencies = [ + "anyhow", + "bitpacking", + "byteorder", + "csv", + "fst", + "fxhash", + "jemallocator", + "quickcheck", + "rayon", + "sdset", + "sled", + "slice-group-by", + "smallstr", + "structopt", + "zerocopy", +] + +[[package]] +name = "memchr" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" + +[[package]] +name = "memoffset" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "parking_lot" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3" +dependencies = [ + "cfg-if", + "cloudabi", + "libc", + "redox_syscall", + "smallvec", + "winapi", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea" + +[[package]] +name = "proc-macro-error" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "syn-mid", + "version_check", +] + +[[package]] +name = "proc-macro2" 
+version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1502d12e458c49a4c9cbff560d0fe0060c252bc29799ed94ca2ed4bb665a0101" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quickcheck" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44883e74aa97ad63db83c4bf8ca490f02b2fc02f92575e720c8551e843c945f" +dependencies = [ + "env_logger", + "log", + "rand", + "rand_core", +] + +[[package]] +name = "quote" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom", + "libc", + "rand_chacha", + "rand_core", + "rand_hc", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core", +] + +[[package]] +name = "rayon" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098" +dependencies = [ + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9" +dependencies = [ + "crossbeam-deque", + "crossbeam-queue", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.1.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" + +[[package]] +name = "regex" +version = "1.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", + "thread_local", +] + +[[package]] +name = "regex-automata" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" +dependencies = [ + "byteorder", +] + +[[package]] +name = "regex-syntax" +version = "0.6.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" + +[[package]] +name = "ryu" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed3d612bc64430efeb3f7ee6ef26d590dce0c43249217bddc62112540c7941e1" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "sdset" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbb21fe0588557792176c89bc7b943027b14f346d03c6be6a199c2860277d93a" + +[[package]] +name = "serde" +version = "1.0.110" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99e7b308464d16b56eba9964e4972a3eee817760ab60d88c3f86e1fecb08204c" + +[[package]] +name = "sled" +version = "0.31.0" +source = "git+https://github.com/spacejam/sled.git?rev=2fe05c9#2fe05c933a4a68d4dbbc06a16a3058236fcc6350" +dependencies = [ + "crc32fast", + "crossbeam-epoch", + "crossbeam-utils", + "fs2", + "fxhash", + "libc", + "log", + "parking_lot", +] + +[[package]] +name = "slice-group-by" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb" + +[[package]] +name = "smallstr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e922794d168678729ffc7e07182721a14219c65814e66e91b839a272fe5ae4f" +dependencies = [ + "smallvec", +] + +[[package]] +name = "smallvec" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4" + +[[package]] +name = "structopt" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef" +dependencies = [ + "clap", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "syn" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95b5f192649e48a5302a13f2feb224df883b98933222369e4b3b0fe2a5447269" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "syn-mid" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "synstructure" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67656ea1dc1b41b1451851562ea232ec2e5a80242139f7e679ceccfb5d61f545" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "unicode-xid", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + +[[package]] +name = "thread_local" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "unicode-segmentation" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" + +[[package]] +name = "unicode-width" +version = "0.1.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479" + +[[package]] +name = "unicode-xid" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" + +[[package]] +name = "version_check" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "winapi" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "zerocopy" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" +dependencies = [ + "proc-macro2", + "syn", + "synstructure", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 000000000..cfa000799 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "mega-mini-indexer" +version = "0.1.0" +authors = ["Kerollmops "] +edition = "2018" + +[dependencies] +anyhow = "1.0.28" +bitpacking = "0.8.2" +byteorder = "1.3.4" +csv = "1.1.3" +fst = "0.4.3" +fxhash = "0.2.1" +jemallocator = "0.3.2" +rayon = "1.3.0" +sdset = "0.4.0" +sled = { git = "https://github.com/spacejam/sled.git", rev = "2fe05c9"} +slice-group-by = "0.2.6" +smallstr = "0.2.0" +structopt = { version = "0.3.14", default-features = false } +zerocopy = "0.3.0" + +[dev-dependencies] +quickcheck = "0.9.2" + +[profile.release] +debug = true diff --git a/qc_loop.sh b/qc_loop.sh new file mode 100755 index 000000000..c479307cc --- /dev/null +++ b/qc_loop.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +export RUST_BACKTRACE=1 + +while true +do + cargo test qc_ --release -- --nocapture + if [[ x$? != x0 ]] ; then + exit $? + fi +done diff --git a/src/bp_vec.rs b/src/bp_vec.rs new file mode 100644 index 000000000..d567ac0a5 --- /dev/null +++ b/src/bp_vec.rs @@ -0,0 +1,197 @@ +use byteorder::{ByteOrder, NativeEndian}; +use bitpacking::{BitPacker, BitPacker4x}; + +/// An append only bitpacked u32 vector that ignore order of insertion. 
+#[derive(Default)]
+pub struct BpVec {
+    compressed: Vec<u8>,
+    uncompressed: Vec<u32>,
+}
+
+impl BpVec {
+    pub fn new() -> BpVec {
+        BpVec::default()
+    }
+
+    pub fn push(&mut self, elem: u32) {
+        self.uncompressed.push(elem);
+        if self.uncompressed.len() == BitPacker4x::BLOCK_LEN {
+            encode(&mut self.uncompressed[..], &mut self.compressed);
+            self.uncompressed.clear();
+        }
+    }
+
+    pub fn extend_from_slice(&mut self, elems: &[u32]) {
+        self.uncompressed.extend_from_slice(elems);
+        let remaining = self.uncompressed.len() % BitPacker4x::BLOCK_LEN;
+        for chunk in self.uncompressed[remaining..].chunks_exact_mut(BitPacker4x::BLOCK_LEN) {
+            encode(chunk, &mut self.compressed);
+        }
+        self.uncompressed.truncate(remaining);
+        self.uncompressed.shrink_to_fit();
+    }
+
+    pub fn to_vec(self) -> Vec<u32> {
+        let BpVec { compressed, mut uncompressed } = self;
+        decode(&compressed, &mut uncompressed);
+        uncompressed
+    }
+
+    pub fn capacity(&self) -> usize {
+        self.compressed.capacity() + self.uncompressed.capacity()
+    }
+}
+
+fn encode(items: &mut [u32], encoded: &mut Vec<u8>) {
+    assert_eq!(items.len(), BitPacker4x::BLOCK_LEN);
+
+    let bitpacker = BitPacker4x::new();
+
+    // We reserve enough space in the output buffer, filled with zeroes.
+    let len = encoded.len();
+    // initial_value + num_bits + encoded numbers
+    let max_possible_length = 4 + 1 + 4 * BitPacker4x::BLOCK_LEN;
+    encoded.resize(len + max_possible_length, 0);
+
+    // We sort the items to be able to efficiently bitpack them.
+    items.sort_unstable();
+    // We save the initial value to use for this block, the lowest one.
+    let initial_value = items[0];
+    // We compute the number of bits necessary to encode this block
+    let num_bits = bitpacker.num_bits_sorted(initial_value, items);
+
+    // We write the initial value for this block.
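+    // Each block is serialized as [initial_value: 4 bytes][num_bits: 1 byte]
+    // [bitpacked payload: compressed_len bytes], which is what decode() reads back.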
+    let buffer = &mut encoded[len..];
+    NativeEndian::write_u32(buffer, initial_value);
+    // We write the num_bits that will be read to decode this block
+    let buffer = &mut buffer[4..];
+    buffer[0] = num_bits;
+    // We encode the block numbers into the buffer using the num_bits
+    let buffer = &mut buffer[1..];
+    let compressed_len = bitpacker.compress_sorted(initial_value, items, buffer, num_bits);
+
+    // We truncate the buffer to avoid leaking padding zeroes
+    encoded.truncate(len + 4 + 1 + compressed_len);
+}
+
+fn decode(mut encoded: &[u8], decoded: &mut Vec<u32>) {
+    let bitpacker = BitPacker4x::new();
+
+    // initial_value + num_bits
+    while let Some(header) = encoded.get(0..4 + 1) {
+        // We extract the header information
+        let initial_value = NativeEndian::read_u32(header);
+        let num_bits = header[4];
+        let bytes = &encoded[4 + 1..];
+
+        // If num_bits is equal to zero it means that all the numbers of this block are equal to the initial value
+        if num_bits == 0 {
+            decoded.resize(decoded.len() + BitPacker4x::BLOCK_LEN, initial_value);
+            encoded = bytes;
+            continue;
+        }
+
+        // We guess the block size based on the num_bits used for this block
+        let block_size = BitPacker4x::compressed_block_size(num_bits);
+
+        // We pad the decoded vector with zeroes
+        let new_len = decoded.len() + BitPacker4x::BLOCK_LEN;
+        decoded.resize(new_len, 0);
+
+        // Create a view into the decoded buffer and decode into it
+        let to_decompress = &mut decoded[new_len - BitPacker4x::BLOCK_LEN..new_len];
+        bitpacker.decompress_sorted(initial_value, &bytes[..block_size], to_decompress, num_bits);
+
+        // Advance the bytes offset to read the next block (+ num_bits)
+        encoded = &bytes[block_size..];
+    }
+}
+
+impl sdset::Collection<u32> for BpVec {
+    fn push(&mut self, elem: u32) {
+        BpVec::push(self, elem);
+    }
+
+    fn extend_from_slice(&mut self, elems: &[u32]) {
+        BpVec::extend_from_slice(self, elems);
+    }
+
+    fn extend<I>(&mut self, elems: I) where I: IntoIterator<Item = u32> {
+        elems.into_iter().for_each(|x| BpVec::push(self, x));
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    quickcheck! {
+        fn qc_push(xs: Vec<u32>) -> bool {
+            let mut xs: Vec<_> = xs.iter().cloned().cycle().take(1300).collect();
+
+            let mut bpvec = BpVec::new();
+            xs.iter().for_each(|x| bpvec.push(*x));
+            let mut result = bpvec.to_vec();
+
+            result.sort_unstable();
+            xs.sort_unstable();
+
+            xs == result
+        }
+    }
+
+    quickcheck! {
+        fn qc_extend_from_slice(xs: Vec<u32>) -> bool {
+            let mut xs: Vec<_> = xs.iter().cloned().cycle().take(1300).collect();
+
+            let mut bpvec = BpVec::new();
+            bpvec.extend_from_slice(&xs);
+            let mut result = bpvec.to_vec();
+
+            result.sort_unstable();
+            xs.sort_unstable();
+
+            xs == result
+        }
+    }
+
+    #[test]
+    fn empty() {
+        let mut bpvec = BpVec::new();
+        bpvec.extend_from_slice(&[]);
+        let result = bpvec.to_vec();
+
+        assert!(result.is_empty());
+    }
+
+    #[test]
+    fn one_zero() {
+        let mut bpvec = BpVec::new();
+        bpvec.extend_from_slice(&[0]);
+        let result = bpvec.to_vec();
+
+        assert_eq!(&[0], &*result);
+    }
+
+    #[test]
+    fn many_zeros() {
+        let xs: Vec<_> = std::iter::repeat(0).take(1300).collect();
+
+        let mut bpvec = BpVec::new();
+        bpvec.extend_from_slice(&xs);
+        let result = bpvec.to_vec();
+
+        assert_eq!(xs, result);
+    }
+
+    #[test]
+    fn many_ones() {
+        let xs: Vec<_> = std::iter::repeat(1).take(1300).collect();
+
+        let mut bpvec = BpVec::new();
+        bpvec.extend_from_slice(&xs);
+        let result = bpvec.to_vec();
+
+        assert_eq!(xs, result);
+    }
+}
diff --git a/src/codec/bitpacker_sorted.rs b/src/codec/bitpacker_sorted.rs
new file mode 100644
index 000000000..274e2c2bb
--- /dev/null
+++ b/src/codec/bitpacker_sorted.rs
@@ -0,0 +1,84 @@
+use bitpacking::{BitPacker, BitPacker4x};
+use byteorder::{ReadBytesExt, NativeEndian};
+use zerocopy::AsBytes;
+
+pub struct CodecBitPacker4xSorted;
+
+impl CodecBitPacker4xSorted {
+    pub fn bytes_encode(item: &[u32]) -> Option<Vec<u8>> {
+        // This is a hotfix to the SIGSEGV
+        // https://github.com/tantivy-search/bitpacking/issues/23
+        if item.is_empty() {
+            return Some(Vec::default())
+        }
+
+        let bitpacker = BitPacker4x::new();
+        let mut compressed = Vec::new();
+        let mut initial_value = 0;
+
+        // The number of remaining numbers that don't fit in the block size.
+        compressed.push((item.len() % BitPacker4x::BLOCK_LEN) as u8);
+
+        // we cannot use a mut slice here because of #68630, TooGeneric error.
+        // we can probably avoid this new allocation by directly using the compressed final Vec.
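+        // (4 * BLOCK_LEN bytes is the worst case: a full block where every
+        // value needs all 32 bits)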
+        let mut buffer = vec![0u8; 4 * BitPacker4x::BLOCK_LEN];
+
+        for chunk in item.chunks(BitPacker4x::BLOCK_LEN) {
+            if chunk.len() == BitPacker4x::BLOCK_LEN {
+                // compute the number of bits necessary to encode this block
+                let num_bits = bitpacker.num_bits_sorted(initial_value, chunk);
+                // Encode the block numbers into the buffer using the num_bits
+                let compressed_len = bitpacker.compress_sorted(initial_value, chunk, &mut buffer, num_bits);
+                // Write the num_bits that will be read to decode this block
+                compressed.push(num_bits);
+                // Write the bytes of the compressed block numbers
+                compressed.extend_from_slice(&buffer[..compressed_len]);
+                // Save the initial_value, which is the last value of block n-1 and is used for block n
+                initial_value = *chunk.last().unwrap();
+            } else {
+                // Save the remaining numbers which don't fit inside of a BLOCK_LEN
+                compressed.extend_from_slice(chunk.as_bytes());
+            }
+        }
+
+        Some(compressed)
+    }
+
+    pub fn bytes_decode(bytes: &[u8]) -> Option<Vec<u32>> {
+        if bytes.is_empty() {
+            return Some(Vec::new())
+        }
+
+        let bitpacker = BitPacker4x::new();
+        let (remaining, bytes) = bytes.split_first().unwrap();
+        let remaining = *remaining as usize;
+
+        let (mut bytes, mut remaining_bytes) = bytes.split_at(bytes.len() - remaining * 4);
+        let mut decompressed = Vec::new();
+        let mut initial_value = 0;
+
+        while let Some(num_bits) = bytes.get(0) {
+            let block_size = BitPacker4x::compressed_block_size(*num_bits);
+
+            let new_len = decompressed.len() + BitPacker4x::BLOCK_LEN;
+            decompressed.resize(new_len, 0);
+
+            // Create a view into the decompressed buffer and decompress into it
+            let to_decompress = &mut decompressed[new_len - BitPacker4x::BLOCK_LEN..new_len];
+            bitpacker.decompress_sorted(initial_value, &bytes[1..block_size + 1], to_decompress, *num_bits);
+
+            // Set the new initial_value for the next block
+            initial_value = *decompressed.last().unwrap();
+            // Advance the bytes offset to read the next block (+ num_bits)
+            bytes = &bytes[block_size + 1..];
+        }
+
+        // We add the remaining uncompressed numbers.
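+        // These were stored verbatim as native-endian u32s by bytes_encode;
+        // their count comes from the leading `remaining` byte read above.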
+        let new_len = decompressed.len() + remaining;
+        decompressed.resize(new_len, 0);
+        let to_decompress = &mut decompressed[new_len - remaining..new_len];
+        remaining_bytes.read_u32_into::<NativeEndian>(to_decompress).ok()?;
+
+        Some(decompressed)
+    }
+}
diff --git a/src/codec/mod.rs b/src/codec/mod.rs
new file mode 100644
index 000000000..451839fea
--- /dev/null
+++ b/src/codec/mod.rs
@@ -0,0 +1,3 @@
+mod bitpacker_sorted;
+
+pub use self::bitpacker_sorted::CodecBitPacker4xSorted;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 000000000..c8d345baf
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,186 @@
+#[cfg(test)]
+#[macro_use] extern crate quickcheck;
+
+mod codec;
+mod bp_vec;
+
+use std::collections::{HashMap, BTreeSet};
+use std::convert::TryFrom;
+use std::fs::File;
+use std::hash::BuildHasherDefault;
+use std::path::PathBuf;
+
+use anyhow::{ensure, Context};
+use fst::IntoStreamer;
+use fxhash::FxHasher32;
+use rayon::prelude::*;
+use sdset::{SetOperation, SetBuf};
+use slice_group_by::StrGroupBy;
+use structopt::StructOpt;
+
+use self::codec::CodecBitPacker4xSorted;
+use self::bp_vec::BpVec;
+
+pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
+pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
+
+#[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
+#[derive(Debug, StructOpt)]
+#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")]
+struct Opt {
+    /// The path where the database is located.
+    /// It is created if it doesn't already exist.
+    #[structopt(long = "db", parse(from_os_str))]
+    database: PathBuf,
+
+    /// Files to index in parallel.
+    files_to_index: Vec<PathBuf>,
+}
+
+fn union_bitpacked_postings_ids(_key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
+    if old_value.is_none() {
+        return Some(new_value.to_vec())
+    }
+
+    let old_value = old_value.unwrap_or_default();
+    let old_value = CodecBitPacker4xSorted::bytes_decode(&old_value).unwrap();
+    let new_value = CodecBitPacker4xSorted::bytes_decode(&new_value).unwrap();
+
+    let old_set = SetBuf::new(old_value).unwrap();
+    let new_set = SetBuf::new(new_value).unwrap();
+
+    let result = sdset::duo::Union::new(&old_set, &new_set).into_set_buf();
+    let compressed = CodecBitPacker4xSorted::bytes_encode(&result).unwrap();
+
+    Some(compressed)
+}
+
+fn union_words_fst(key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
+    if key != b"words-fst" { unimplemented!() }
+
+    let old_value = match old_value {
+        Some(old_value) => old_value,
+        None => return Some(new_value.to_vec()),
+    };
+
+    eprintln!("old_words size: {}", old_value.len());
+    eprintln!("new_words size: {}", new_value.len());
+
+    let old_words = fst::Set::new(old_value).unwrap();
+    let new_words = fst::Set::new(new_value).unwrap();
+
+    // Do a union of the old and the new set of words.
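+    // Both fst sets stream their words in sorted order, so the union is
+    // streamed key by key instead of allocating an intermediate merged list.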
+    let op = old_words.op().add(new_words.into_stream()).r#union();
+    let mut build = fst::SetBuilder::memory();
+    build.extend_stream(op.into_stream()).unwrap();
+
+    Some(build.into_inner().unwrap())
+}
+
+fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
+    let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
+    string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
+}
+
+fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Result<usize> {
+    const MAX_POSITION: usize = 1000;
+    const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
+
+    let main = &*db;
+    let postings_ids = db.open_tree("postings-ids")?;
+    let documents = db.open_tree("documents")?;
+
+    let mut document = csv::StringRecord::new();
+    let mut new_postings_ids = FastMap4::default();
+    let mut new_words = BTreeSet::default();
+    let mut number_of_documents = 0;
+
+    // Write the headers into a Vec of bytes.
+    let headers = rdr.headers()?;
+    let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
+    writer.write_byte_record(headers.as_byte_record())?;
+    let headers = writer.into_inner()?;
+
+    if let Some(old_headers) = main.insert("headers", headers.as_slice())? {
+        ensure!(old_headers == headers, "headers differ from the previous ones");
+    }
+
+    while rdr.read_record(&mut document)? {
+        let document_id = db.generate_id()?;
+        let document_id = u32::try_from(document_id).context("Generated id is too big")?;
+
+        for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
+            for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
+                new_postings_ids.entry(SmallString32::from(word)).or_insert_with(BpVec::new).push(document_id);
+            }
+        }
+
+        // We write the document in the database.
+        let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
+        writer.write_byte_record(document.as_byte_record())?;
+        let document = writer.into_inner()?;
+        documents.insert(document_id.to_be_bytes(), document)?;
+
+        number_of_documents += 1;
+        if number_of_documents % 100000 == 0 {
+            let postings_ids_size = new_postings_ids.iter().map(|(_, v)| v.capacity() * 4).sum::<usize>();
+            eprintln!("{}, documents seen {}, postings size {}",
+                tid, number_of_documents, postings_ids_size);
+        }
+    }
+
+    eprintln!("Start collecting the postings lists and words");
+
+    // We compute and store the postings list into the DB.
+    for (word, new_ids) in new_postings_ids {
+        let new_ids = SetBuf::from_dirty(new_ids.to_vec());
+        let compressed = CodecBitPacker4xSorted::bytes_encode(&new_ids)
+            .context("error while compressing using CodecBitPacker4xSorted")?;
+
+        postings_ids.merge(word.as_bytes(), compressed)?;
+
+        new_words.insert(word);
+    }
+
+    eprintln!("Finished collecting the postings lists and words");
+
+    eprintln!("Start merging the words-fst");
+
+    let new_words_fst = fst::Set::from_iter(new_words.iter().map(|s| s.as_str()))?;
+    drop(new_words);
+    main.merge("words-fst", new_words_fst.as_fst().as_bytes())?;
+
+    eprintln!("Finished merging the words-fst");
+
+    Ok(number_of_documents)
+}
+
+fn main() -> anyhow::Result<()> {
+    let opt = Opt::from_args();
+
+    let db = sled::open(opt.database)?;
+    let main = &*db;
+
+    // Setup the merge operators
+    main.set_merge_operator(union_words_fst);
+    let postings_ids = db.open_tree("postings-ids")?;
+    postings_ids.set_merge_operator(union_bitpacked_postings_ids);
+    // ...
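+    // (the "documents" tree only receives plain inserts, so it needs no merge operator)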
+    let _documents = db.open_tree("documents")?;
+
+    let res = opt.files_to_index
+        .into_par_iter()
+        .enumerate()
+        .map(|(tid, path)| {
+            let rdr = csv::Reader::from_path(path)?;
+            index_csv(tid, db.clone(), rdr)
+        })
+        .try_reduce(|| 0, |a, b| Ok(a + b));
+
+    println!("{:?}", res);
+
+    Ok(())
+}

From a81f201fad0ea02c07289be0d1ec66dcf0aec494 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 26 May 2020 12:18:29 +0200
Subject: [PATCH 0003/1889] Introduce the use of RocksDB instead of sled (RAM)

---
 Cargo.lock                    | 273 +++++++++++++++++++++++++---
 Cargo.toml                    |   2 +-
 src/bp_vec.rs                 |   8 +-
 src/codec/bitpacker_sorted.rs |   6 +
 src/main.rs                   | 115 +++++++++-----
 5 files changed, 288 insertions(+), 116 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 18931ea68..1660704b0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9,18 +9,62 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "ansi_term"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
+dependencies = [
+ "winapi",
+]
+
 [[package]]
 name = "anyhow"
 version = "1.0.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f"
 
+[[package]]
+name = "atty"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "winapi",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d"
 
+[[package]]
+name = "bindgen"
+version = "0.53.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c72a978d268b1d70b0e963217e60fdabd9523a941457a6c42a7315d15c7e89e5"
+dependencies = [
+ "bitflags",
+ "cexpr",
+ "cfg-if",
+ "clang-sys",
+ "clap",
+ "env_logger",
+ "lazy_static",
+ "lazycell",
+ "log",
+ "peeking_take_while",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+ "which",
+]
+
 [[package]]
 name = "bitflags"
 version = "1.2.1"
@@ -59,6 +103,18 @@ name = "cc"
 version = "1.0.54"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311"
+dependencies = [
+ "jobserver",
+]
+
+[[package]]
+name = "cexpr"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27"
+dependencies = [
+ "nom",
+]
 
 [[package]]
 name = "cfg-if"
@@ -66,33 +122,30 @@ version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
 
+[[package]]
+name = "clang-sys"
+version = "0.29.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a"
+dependencies = [
+ "glob",
+ "libc",
+ "libloading",
+]
+
 [[package]]
 name = "clap"
 version = "2.33.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129"
 dependencies = [
+ "ansi_term",
+ "atty",
  "bitflags",
+ "strsim",
  "textwrap",
  "unicode-width",
-]
-
-[[package]]
-name = "cloudabi"
-version = "0.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags", -] - -[[package]] -name = "crc32fast" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" -dependencies = [ - "cfg-if", + "vec_map", ] [[package]] @@ -182,18 +235,11 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" dependencies = [ + "atty", + "humantime", "log", "regex", -] - -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", + "termcolor", ] [[package]] @@ -228,6 +274,12 @@ dependencies = [ "wasi", ] +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "heck" version = "0.3.1" @@ -246,6 +298,15 @@ dependencies = [ "libc", ] +[[package]] +name = "humantime" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" +dependencies = [ + "quick-error", +] + [[package]] name = "itoa" version = "0.4.5" @@ -273,12 +334,27 @@ dependencies = [ "libc", ] +[[package]] +name = "jobserver" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c71313ebb9439f74b00d9d2dcec36440beaf57a6aa0623068441dd7cd81a7f2" +dependencies = [ + "libc", +] + [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lazycell" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b294d6fa9ee409a054354afc4352b0b9ef7ca222c69b8812cbea9e7d2bf3783f" + [[package]] name = "libc" version = "0.2.70" @@ -286,12 +362,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f" [[package]] -name = "lock_api" -version = "0.3.4" +name = "libloading" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75" +checksum = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753" dependencies = [ - "scopeguard", + "cc", + "winapi", +] + +[[package]] +name = "librocksdb-sys" +version = "6.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "883213ae3d09bfc3d104aefe94b25ebb183b6f4d3a515b23b14817e1f4854005" +dependencies = [ + "bindgen", + "cc", + "glob", + "libc", ] [[package]] @@ -322,8 +411,8 @@ dependencies = [ "jemallocator", "quickcheck", "rayon", + "rocksdb", "sdset", - "sled", "slice-group-by", "smallstr", "structopt", @@ -345,6 +434,16 @@ dependencies = [ "autocfg", ] +[[package]] +name = "nom" +version = "5.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b471253da97532da4b61552249c521e01e736071f71c1a4f7ebbfbf0a06aad6" +dependencies = [ + "memchr", + "version_check", +] + [[package]] name = "num_cpus" version = "1.13.0" @@ -356,28 +455,10 @@ dependencies = [ 
] [[package]] -name = "parking_lot" -version = "0.10.2" +name = "peeking_take_while" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3" -dependencies = [ - "cfg-if", - "cloudabi", - "libc", - "redox_syscall", - "smallvec", - "winapi", -] +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "ppv-lite86" @@ -420,6 +501,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quickcheck" version = "0.9.2" @@ -506,12 +593,6 @@ dependencies = [ "num_cpus", ] -[[package]] -name = "redox_syscall" -version = "0.1.56" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" - [[package]] name = "regex" version = "1.3.7" @@ -539,6 +620,22 @@ version = "0.6.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" +[[package]] +name = "rocksdb" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61aa17a99a2413cd71c1106691bf59dad7de0cd5099127f90e9d99c429c40d4a" +dependencies = [ + "libc", + "librocksdb-sys", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "ryu" version = "1.0.4" @@ -564,19 +661,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99e7b308464d16b56eba9964e4972a3eee817760ab60d88c3f86e1fecb08204c" [[package]] -name = "sled" -version = "0.31.0" -source = "git+https://github.com/spacejam/sled.git?rev=2fe05c9#2fe05c933a4a68d4dbbc06a16a3058236fcc6350" -dependencies = [ - "crc32fast", - "crossbeam-epoch", - "crossbeam-utils", - "fs2", - "fxhash", - "libc", - "log", - "parking_lot", -] +name = "shlex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" [[package]] name = "slice-group-by" @@ -599,6 +687,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4" +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + [[package]] name = "structopt" version = "0.3.14" @@ -657,6 +751,15 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "termcolor" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f" +dependencies = [ + "winapi-util", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -693,6 +796,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "version_check" version = "0.9.2" @@ -705,6 +814,15 @@ version = "0.9.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" +[[package]] +name = "which" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724" +dependencies = [ + "libc", +] + [[package]] name = "winapi" version = "0.3.8" @@ -721,6 +839,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index cfa000799..5186d5964 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,8 +13,8 @@ fst = "0.4.3" fxhash = "0.2.1" jemallocator = "0.3.2" rayon = "1.3.0" +rocksdb = "0.14.0" sdset = "0.4.0" -sled = { git = "https://github.com/spacejam/sled.git", rev = "2fe05c9"} slice-group-by = "0.2.6" smallstr = "0.2.0" structopt = { version = "0.3.14", default-features = false } diff --git a/src/bp_vec.rs b/src/bp_vec.rs index d567ac0a5..f91e6aa22 100644 --- a/src/bp_vec.rs +++ b/src/bp_vec.rs @@ -37,8 +37,12 @@ impl BpVec { uncompressed } - pub fn capacity(&self) -> usize { - self.compressed.capacity() + self.uncompressed.capacity() + pub fn compressed_capacity(&self) -> usize { + self.compressed.capacity() + } + + pub fn uncompressed_capacity(&self) -> usize { + self.uncompressed.capacity() } } diff --git a/src/codec/bitpacker_sorted.rs b/src/codec/bitpacker_sorted.rs index 274e2c2bb..c51b4d71c 100644 --- a/src/codec/bitpacker_sorted.rs +++ b/src/codec/bitpacker_sorted.rs @@ -58,6 +58,12 @@ impl CodecBitPacker4xSorted { let mut initial_value = 0; while let Some(num_bits) = bytes.get(0) { + if *num_bits == 0 { + decompressed.resize(decompressed.len() + BitPacker4x::BLOCK_LEN, initial_value); + bytes = &bytes[1..]; + continue; + } + let block_size = BitPacker4x::compressed_block_size(*num_bits); let new_len = decompressed.len() + BitPacker4x::BLOCK_LEN; diff --git a/src/main.rs b/src/main.rs index c8d345baf..e4ce5aeca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,8 @@ use std::convert::TryFrom; use std::fs::File; use std::hash::BuildHasherDefault; use std::path::PathBuf; +use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; use anyhow::{ensure, Context}; use fst::IntoStreamer; @@ -28,6 +30,8 @@ pub type SmallString32 = smallstr::SmallString<[u8; 32]>; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +static ID_GENERATOR: AtomicUsize = AtomicUsize::new(0); + #[derive(Debug, StructOpt)] #[structopt(name = "mm-indexer", about = "The server side of the daugt project.")] struct Opt { @@ -40,40 +44,58 @@ struct Opt { files_to_index: Vec, } -fn union_bitpacked_postings_ids(_key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option> { - if 
old_value.is_none() {
-        return Some(new_value.to_vec())
+fn union_bitpacked_postings_ids(
+    _key: &[u8],
+    old_value: Option<&[u8]>,
+    operands: &mut rocksdb::MergeOperands,
+) -> Option<Vec<u8>>
+{
+    let mut sets_bufs = Vec::new();
+
+    if let Some(old_value) = old_value {
+        let old_value = CodecBitPacker4xSorted::bytes_decode(old_value).unwrap();
+        sets_bufs.push(SetBuf::new(old_value).unwrap());
     }
 
-    let old_value = old_value.unwrap_or_default();
-    let old_value = CodecBitPacker4xSorted::bytes_decode(&old_value).unwrap();
-    let new_value = CodecBitPacker4xSorted::bytes_decode(&new_value).unwrap();
+    for operand in operands {
+        let new_value = CodecBitPacker4xSorted::bytes_decode(operand).unwrap();
+        sets_bufs.push(SetBuf::new(new_value).unwrap());
+    }
 
-    let old_set = SetBuf::new(old_value).unwrap();
-    let new_set = SetBuf::new(new_value).unwrap();
-
-    let result = sdset::duo::Union::new(&old_set, &new_set).into_set_buf();
+    let sets = sets_bufs.iter().map(|s| s.as_set()).collect();
+    let result = sdset::multi::Union::new(sets).into_set_buf();
     let compressed = CodecBitPacker4xSorted::bytes_encode(&result).unwrap();
 
     Some(compressed)
 }
 
-fn union_words_fst(key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
+fn union_words_fst(
+    key: &[u8],
+    old_value: Option<&[u8]>,
+    operands: &mut rocksdb::MergeOperands,
+) -> Option<Vec<u8>>
+{
     if key != b"words-fst" { unimplemented!() }
 
-    let old_value = match old_value {
-        Some(old_value) => old_value,
-        None => return Some(new_value.to_vec()),
-    };
-
-    eprintln!("old_words size: {}", old_value.len());
-    eprintln!("new_words size: {}", new_value.len());
-
-    let old_words = fst::Set::new(old_value).unwrap();
-    let new_words = fst::Set::new(new_value).unwrap();
+    let mut fst_operands = Vec::new();
+    for operand in operands {
+        fst_operands.push(fst::Set::new(operand).unwrap());
+    }
 
     // Do a union of the old and the new set of words.
-    let op = old_words.op().add(new_words.into_stream()).r#union();
+    let mut builder = fst::set::OpBuilder::new();
+
+    let old_words = old_value.map(|v| fst::Set::new(v).unwrap());
+    let old_words = old_words.as_ref().map(|v| v.into_stream());
+    if let Some(old_words) = old_words {
+        builder.push(old_words);
+    }
+
+    for new_words in &fst_operands {
+        builder.push(new_words.into_stream());
+    }
+
+    let op = builder.r#union();
     let mut build = fst::SetBuilder::memory();
     build.extend_stream(op.into_stream()).unwrap();
 
@@ -85,13 +107,18 @@ fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
     string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
 }
 
-fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Result<usize> {
+fn index_csv(
+    tid: usize,
+    db: Arc<rocksdb::DB>,
+    mut rdr: csv::Reader<File>,
+) -> anyhow::Result<usize>
+{
     const MAX_POSITION: usize = 1000;
     const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
 
-    let main = &*db;
-    let postings_ids = db.open_tree("postings-ids")?;
-    let documents = db.open_tree("documents")?;
+    let main = db.cf_handle("main").context("cf \"main\" not found")?;
+    let postings_ids = db.cf_handle("postings-ids").context("cf \"postings-ids\" not found")?;
+    let documents = db.cf_handle("documents").context("cf \"documents\" not found")?;
 
     let mut document = csv::StringRecord::new();
     let mut new_postings_ids = FastMap4::default();
@@ -104,12 +131,13 @@ fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Re
     writer.write_byte_record(headers.as_byte_record())?;
     let headers = writer.into_inner()?;
 
-    if let Some(old_headers) = main.insert("headers", headers.as_slice())? {
+    if let Some(old_headers) = db.get_cf(&main, "headers")? {
         ensure!(old_headers == headers, "headers differ from the previous ones");
     }
+    db.put_cf(&main, "headers", headers.as_slice())?;
 
     while rdr.read_record(&mut document)? {
-        let document_id = db.generate_id()?;
+        let document_id = ID_GENERATOR.fetch_add(1, Ordering::SeqCst);
         let document_id = u32::try_from(document_id).context("Generated id is too big")?;
 
         for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
@@ -122,11 +150,13 @@ fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Re
         let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
         writer.write_byte_record(document.as_byte_record())?;
         let document = writer.into_inner()?;
-        documents.insert(document_id.to_be_bytes(), document)?;
+        db.put_cf(&documents, document_id.to_be_bytes(), document)?;
 
         number_of_documents += 1;
         if number_of_documents % 100000 == 0 {
-            let postings_ids_size = new_postings_ids.iter().map(|(_, v)| v.capacity() * 4).sum::<usize>();
+            let postings_ids_size = new_postings_ids.iter().map(|(_, v)| {
+                v.compressed_capacity() + v.uncompressed_capacity() * 4
+            }).sum::<usize>();
             eprintln!("{}, documents seen {}, postings size {}",
                 tid, number_of_documents, postings_ids_size);
         }
@@ -140,7 +170,7 @@ fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Re
         let compressed = CodecBitPacker4xSorted::bytes_encode(&new_ids)
             .context("error while compressing using CodecBitPacker4xSorted")?;
 
-        postings_ids.merge(word.as_bytes(), compressed)?;
+        db.merge_cf(&postings_ids, word.as_bytes(), compressed)?;
 
         new_words.insert(word);
     }
@@ -151,7 +181,7 @@ fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Re
 
     let new_words_fst = fst::Set::from_iter(new_words.iter().map(|s| s.as_str()))?;
     drop(new_words);
-    main.merge("words-fst", new_words_fst.as_fst().as_bytes())?;
+    db.merge_cf(&main, "words-fst", new_words_fst.as_fst().as_bytes())?;
 
     eprintln!("Finished merging the words-fst");
 
@@ -161,16 +191,21 @@
 fn main() -> anyhow::Result<()> {
     let opt = Opt::from_args();
 
-    let db = sled::open(opt.database)?;
-    let main = &*db;
-
+    let mut opts = rocksdb::Options::default();
+    opts.create_if_missing(true);
+    opts.create_missing_column_families(true);
     // Setup the merge operators
-    main.set_merge_operator(union_words_fst);
-    let postings_ids = db.open_tree("postings-ids")?;
-    postings_ids.set_merge_operator(union_bitpacked_postings_ids);
-    // ...
- let _documents = db.open_tree("documents")?;
+ opts.set_merge_operator("main", union_words_fst, Some(union_words_fst));
+ opts.set_merge_operator("postings-ids", union_bitpacked_postings_ids, Some(union_bitpacked_postings_ids));
+ let mut db = rocksdb::DB::open(&opts, &opt.database)?;
+
+ let cfs = &["main", "postings-ids", "documents"];
+ for cf in cfs.into_iter() {
+ db.create_cf(cf, &opts).unwrap();
+ }
+
+ let db = Arc::new(db);
 let res = opt.files_to_index
 .into_par_iter()
 .enumerate()

From 3668627e031c13c5946d888ec9f33371555fa4ee Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 26 May 2020 17:41:44 +0200
Subject: [PATCH 0004/1889] Use zerocopy without bitpacking as a first step

---
 src/main.rs | 59 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 43 insertions(+), 16 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index e4ce5aeca..2d3cda946 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -19,8 +19,9 @@ use rayon::prelude::*;
 use sdset::{SetOperation, SetBuf};
 use slice_group_by::StrGroupBy;
 use structopt::StructOpt;
+use zerocopy::{LayoutVerified, AsBytes};

-use self::codec::CodecBitPacker4xSorted;
+// use self::codec::CodecBitPacker4xSorted;
 use self::bp_vec::BpVec;

 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
@@ -44,7 +45,35 @@ struct Opt {
 files_to_index: Vec<PathBuf>,
 }

-fn union_bitpacked_postings_ids(
+fn bytes_to_u32s(bytes: &[u8]) -> Vec<u32> {
+ fn aligned_to(bytes: &[u8], align: usize) -> bool {
+ (bytes as *const _ as *const () as usize) % align == 0
+ }
+
+ match LayoutVerified::new_slice(bytes) {
+ Some(slice) => slice.to_vec(),
+ None => {
+ let len = bytes.len();
+
+ // ensure that it is the alignment that is wrong and the length is valid
+ assert!(len % 4 == 0, "length is {} and is not modulo 4", len);
+ assert!(!aligned_to(bytes, std::mem::align_of::<u32>()), "bytes are already aligned");
+
+ let elems = len / 4;
+ let mut vec = Vec::<u32>::with_capacity(elems);
+
+ unsafe {
+ let dst = vec.as_mut_ptr() as *mut u8;
+ std::ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len);
+ vec.set_len(elems);
+ }
+
+ vec
+ },
+ }
+}
+
+fn union_postings_ids(
 _key: &[u8],
 old_value: Option<&[u8]>,
 operands: &mut rocksdb::MergeOperands,
) -> Option<Vec<u8>>
{
 let mut sets_bufs = Vec::new();

 if let Some(old_value) = old_value {
- let old_value = CodecBitPacker4xSorted::bytes_decode(old_value).unwrap();
- sets_bufs.push(SetBuf::new(old_value).unwrap());
+ let old_value = bytes_to_u32s(old_value);
+ sets_bufs.push(SetBuf::new_unchecked(old_value.to_vec()));
 }

 for operand in operands {
- let new_value = CodecBitPacker4xSorted::bytes_decode(operand).unwrap();
- sets_bufs.push(SetBuf::new(new_value).unwrap());
+ let new_value = bytes_to_u32s(operand);
+ sets_bufs.push(SetBuf::new_unchecked(new_value.to_vec()));
 }

 let sets = sets_bufs.iter().map(|s| s.as_set()).collect();
- let result = sdset::multi::Union::new(sets).into_set_buf();
- let compressed = CodecBitPacker4xSorted::bytes_encode(&result).unwrap();
+ let result: SetBuf<u32> = sdset::multi::Union::new(sets).into_set_buf();

- Some(compressed)
+ assert!(result.as_bytes().len() % 4 == 0);
+
+ Some(result.as_bytes().to_vec())
 }

 fn union_words_fst(
@@ -167,11 +197,7 @@ fn index_csv(
 // We compute and store the postings list into the DB.
 for (word, new_ids) in new_postings_ids {
 let new_ids = SetBuf::from_dirty(new_ids.to_vec());
- let compressed = CodecBitPacker4xSorted::bytes_encode(&new_ids)
- .context("error while compressing using CodecBitPacker4xSorted")?;
-
- db.merge_cf(&postings_ids, word.as_bytes(), compressed)?;
-
+ db.merge_cf(&postings_ids, word.as_bytes(), new_ids.as_bytes())?;
 new_words.insert(word);
 }
@@ -184,6 +210,7 @@ fn index_csv(
 db.merge_cf(&main, "words-fst", new_words_fst.as_fst().as_bytes())?;

 eprintln!("Finished merging the words-fst");
+ eprintln!("Total number of documents seen is {}", ID_GENERATOR.load(Ordering::Relaxed));

 Ok(number_of_documents)
 }
@@ -195,8 +222,8 @@ fn main() -> anyhow::Result<()> {
 opts.create_if_missing(true);
 opts.create_missing_column_families(true);
 // Setup the merge operators
- opts.set_merge_operator("main", union_words_fst, Some(union_words_fst));
- opts.set_merge_operator("postings-ids", union_bitpacked_postings_ids, Some(union_bitpacked_postings_ids));
+ opts.set_merge_operator("main", union_words_fst, None); // Some(union_words_fst));
+ opts.set_merge_operator("postings-ids", union_postings_ids, None); // Some(union_postings_ids));

 let mut db = rocksdb::DB::open(&opts, &opt.database)?;

From 1237306ca8254d7d7b828a2e2bcde8a9eb12c674 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Sat, 30 May 2020 15:35:33 +0200
Subject: [PATCH 0005/1889] Introduce a thread that writes to heed

---
 Cargo.lock | 448 ++++++++++------------------
 Cargo.toml | 9 +-
 src/bp_vec.rs | 201 ---------------
 src/codec/bitpacker_sorted.rs | 90 -------
 src/codec/mod.rs | 3 -
 src/main.rs | 251 ++++++++++---------
 6 files changed, 269 insertions(+), 733 deletions(-)
 delete mode 100644 src/bp_vec.rs
 delete mode 100644 src/codec/bitpacker_sorted.rs
 delete mode 100644 src/codec/mod.rs

diff --git a/Cargo.lock b/Cargo.lock
index 1660704b0..24008fc08 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1,40 +1,11 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
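Patch 0004 above drops the bitpacked codec and keeps each posting list as the raw bytes of a sorted u32 set, merging values with sdset. A minimal sketch of that union in isolation, assuming sdset 0.4 and inputs that are already sorted and deduplicated (the function name is illustrative, not from the patch):

use sdset::{Set, SetBuf, SetOperation};

// Union two posting lists the way the merge operator does with the stored
// value and each incoming operand; `Set::new` checks the sorted/dedup invariant.
fn union_two_postings(old: &[u32], new: &[u32]) -> SetBuf<u32> {
    let old = Set::new(old).expect("old posting list is not a sorted set");
    let new = Set::new(new).expect("new posting list is not a sorted set");
    sdset::duo::Union::new(old, new).into_set_buf()
}

The `assert!(... % 4 == 0)` in the merge operator holds by construction here too: a `SetBuf<u32>` reinterpreted as bytes is always a multiple of four bytes long.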
-[[package]] -name = "aho-corasick" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada" -dependencies = [ - "memchr", -] - -[[package]] -name = "ansi_term" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" -dependencies = [ - "winapi", -] - [[package]] name = "anyhow" version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f" -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi", -] - [[package]] name = "autocfg" version = "1.0.0" @@ -42,27 +13,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" [[package]] -name = "bindgen" -version = "0.53.3" +name = "bincode" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c72a978d268b1d70b0e963217e60fdabd9523a941457a6c42a7315d15c7e89e5" +checksum = "5753e2a71534719bf3f4e57006c3a4f0d2c672a4b676eec84161f763eca87dbf" dependencies = [ - "bitflags", - "cexpr", - "cfg-if", - "clang-sys", - "clap", - "env_logger", - "lazy_static", - "lazycell", - "log", - "peeking_take_while", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "which", + "byteorder 1.3.4", + "serde", ] [[package]] @@ -92,6 +49,12 @@ dependencies = [ "serde", ] +[[package]] +name = "byteorder" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" + [[package]] name = "byteorder" version = "1.3.4" @@ -103,18 +66,6 @@ name = "cc" version = "1.0.54" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311" -dependencies = [ - "jobserver", -] - -[[package]] -name = "cexpr" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27" -dependencies = [ - "nom", -] [[package]] name = "cfg-if" @@ -122,30 +73,25 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" -[[package]] -name = "clang-sys" -version = "0.29.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a" -dependencies = [ - "glob", - "libc", - "libloading", -] - [[package]] name = "clap" version = "2.33.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129" dependencies = [ - "ansi_term", - "atty", "bitflags", - "strsim", "textwrap", "unicode-width", - "vec_map", +] + +[[package]] +name = "crossbeam-channel" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cced8691919c02aac3cb0a1bc2e9b73d89e832bf9a06fc579d4e71b68a2da061" +dependencies = [ + "crossbeam-utils", + "maybe-uninit", ] [[package]] @@ -176,9 +122,9 @@ dependencies = [ 
[[package]] name = "crossbeam-queue" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c695eeca1e7173472a32221542ae469b3e9aac3a4fc81f7696bcad82029493db" +checksum = "ab6bffe714b6bb07e42f201352c34f51fefd355ace793f9e638ebd52d23f98d2" dependencies = [ "cfg-if", "crossbeam-utils", @@ -229,19 +175,6 @@ version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" -[[package]] -name = "env_logger" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" -dependencies = [ - "atty", - "humantime", - "log", - "regex", - "termcolor", -] - [[package]] name = "fs_extra" version = "1.1.0" @@ -260,26 +193,9 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" dependencies = [ - "byteorder", + "byteorder 1.3.4", ] -[[package]] -name = "getrandom" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "glob" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" - [[package]] name = "heck" version = "0.3.1" @@ -289,6 +205,42 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "heed" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd7882b766b4be1b90d8ce5ce4c7aca2539b43176a708dbc8e79576dbbdbba93" +dependencies = [ + "byteorder 1.3.4", + "heed-traits", + "heed-types", + "libc", + "lmdb-rkv-sys", + "once_cell", + "page_size", + "url", + "zerocopy", +] + +[[package]] +name = "heed-traits" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b328f6260a7e51bdb0ca6b68e6ea27ee3d11fba5dee930896ee7ff6ad5fc072c" + +[[package]] +name = "heed-types" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e100387815256b00dbb4f48db990f7fa03e9b88b4a89c2a1661b7d9d77b77c46" +dependencies = [ + "bincode", + "heed-traits", + "serde", + "serde_json", + "zerocopy", +] + [[package]] name = "hermit-abi" version = "0.1.13" @@ -299,12 +251,14 @@ dependencies = [ ] [[package]] -name = "humantime" -version = "1.3.0" +name = "idna" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" +checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" dependencies = [ - "quick-error", + "matches", + "unicode-bidi", + "unicode-normalization", ] [[package]] @@ -334,27 +288,12 @@ dependencies = [ "libc", ] -[[package]] -name = "jobserver" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c71313ebb9439f74b00d9d2dcec36440beaf57a6aa0623068441dd7cd81a7f2" -dependencies = [ - "libc", -] - [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" -[[package]] -name = "lazycell" -version = "1.2.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b294d6fa9ee409a054354afc4352b0b9ef7ca222c69b8812cbea9e7d2bf3783f" - [[package]] name = "libc" version = "0.2.70" @@ -362,35 +301,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f" [[package]] -name = "libloading" -version = "0.5.2" +name = "lmdb-rkv-sys" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753" +checksum = "b27470ac25167b3afdfb6af8fcd3bc1be67de50ffbdaf4073378cfded6ae24a5" dependencies = [ "cc", - "winapi", -] - -[[package]] -name = "librocksdb-sys" -version = "6.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "883213ae3d09bfc3d104aefe94b25ebb183b6f4d3a515b23b14817e1f4854005" -dependencies = [ - "bindgen", - "cc", - "glob", "libc", + "pkg-config", ] [[package]] -name = "log" -version = "0.4.8" +name = "matches" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" -dependencies = [ - "cfg-if", -] +checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "maybe-uninit" @@ -404,19 +329,18 @@ version = "0.1.0" dependencies = [ "anyhow", "bitpacking", - "byteorder", + "byteorder 1.3.4", + "crossbeam-channel", "csv", "fst", "fxhash", + "heed", "jemallocator", - "quickcheck", "rayon", - "rocksdb", - "sdset", + "roaring", "slice-group-by", "smallstr", "structopt", - "zerocopy", ] [[package]] @@ -434,16 +358,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "nom" -version = "5.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b471253da97532da4b61552249c521e01e736071f71c1a4f7ebbfbf0a06aad6" -dependencies = [ - "memchr", - "version_check", -] - [[package]] name = "num_cpus" version = "1.13.0" @@ -455,16 +369,32 @@ dependencies = [ ] [[package]] -name = "peeking_take_while" -version = "0.1.2" +name = "once_cell" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +checksum = "0b631f7e854af39a1739f401cf34a8a013dfe09eac4fa4dba91e9768bd28168d" [[package]] -name = "ppv-lite86" -version = "0.2.8" +name = "page_size" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea" +checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "pkg-config" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677" [[package]] name = "proc-macro-error" @@ -501,24 +431,6 @@ dependencies = [ "unicode-xid", ] -[[package]] -name = "quick-error" -version = "1.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" - -[[package]] -name = "quickcheck" -version = "0.9.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44883e74aa97ad63db83c4bf8ca490f02b2fc02f92575e720c8551e843c945f" -dependencies = [ - "env_logger", - "log", - "rand", - "rand_core", -] - [[package]] name = "quote" version = "1.0.6" @@ -528,47 +440,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom", - "libc", - "rand_chacha", - "rand_core", - "rand_hc", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core", -] - [[package]] name = "rayon" version = "1.3.0" @@ -593,49 +464,24 @@ dependencies = [ "num_cpus", ] -[[package]] -name = "regex" -version = "1.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", - "thread_local", -] - [[package]] name = "regex-automata" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" dependencies = [ - "byteorder", + "byteorder 1.3.4", ] [[package]] -name = "regex-syntax" -version = "0.6.17" +name = "roaring" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" - -[[package]] -name = "rocksdb" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61aa17a99a2413cd71c1106691bf59dad7de0cd5099127f90e9d99c429c40d4a" +checksum = "4af20e5d3e44732a57489fa297768ca29361b54fbc3b20cdeb738fa6932cc22d" dependencies = [ - "libc", - "librocksdb-sys", + "byteorder 0.5.3", ] -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "ryu" version = "1.0.4" @@ -648,12 +494,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "sdset" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbb21fe0588557792176c89bc7b943027b14f346d03c6be6a199c2860277d93a" - [[package]] name = "serde" version = "1.0.110" @@ -661,10 +501,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99e7b308464d16b56eba9964e4972a3eee817760ab60d88c3f86e1fecb08204c" [[package]] -name = "shlex" -version = "0.1.1" +name = "serde_json" +version = "1.0.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" +checksum = "993948e75b189211a9b31a7528f950c6adc21f9720b6438ff80a7fa2f864cea2" +dependencies = [ + "itoa", + "ryu", + "serde", +] [[package]] name = "slice-group-by" @@ -687,12 +532,6 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4" -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - [[package]] name = "structopt" version = "0.3.14" @@ -751,15 +590,6 @@ dependencies = [ "unicode-xid", ] -[[package]] -name = "termcolor" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f" -dependencies = [ - "winapi-util", -] - [[package]] name = "textwrap" version = "0.11.0" @@ -770,12 +600,21 @@ dependencies = [ ] [[package]] -name = "thread_local" -version = "1.0.1" +name = "unicode-bidi" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" dependencies = [ - "lazy_static", + "matches", +] + +[[package]] +name = "unicode-normalization" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5479532badd04e128284890390c1e876ef7a993d0570b3597ae43dfa1d59afa4" +dependencies = [ + "smallvec", ] [[package]] @@ -797,10 +636,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" [[package]] -name = "vec_map" -version = "0.8.2" +name = "url" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +checksum = "829d4a8476c35c9bf0bbce5a3b23f4106f79728039b726d292bb93bc106787cb" +dependencies = [ + "idna", + "matches", + "percent-encoding", +] [[package]] name = "version_check" @@ -808,21 +652,6 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - -[[package]] -name = "which" -version = "3.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d011071ae14a2f6671d0b74080ae0cd8ebf3a6f8c9589a2cd45f23126fe29724" -dependencies = [ - "libc", -] - [[package]] name = "winapi" version = "0.3.8" @@ -839,15 +668,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi", -] - [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -860,7 +680,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" dependencies = [ - "byteorder", + "byteorder 1.3.4", "zerocopy-derive", ] diff --git a/Cargo.toml b/Cargo.toml index 5186d5964..51966ebf9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,20 +8,17 @@ edition = "2018" anyhow = "1.0.28" bitpacking = "0.8.2" byteorder = "1.3.4" +roaring = "0.5.2" +crossbeam-channel = "0.4.2" csv = "1.1.3" fst = "0.4.3" fxhash = "0.2.1" +heed = { version = "0.8.0", default-features = false, features = ["lmdb"] } jemallocator = "0.3.2" rayon = "1.3.0" -rocksdb = "0.14.0" -sdset = "0.4.0" slice-group-by = "0.2.6" smallstr = "0.2.0" structopt = { version = "0.3.14", default-features = false } -zerocopy = "0.3.0" - -[dev-dependencies] -quickcheck = "0.9.2" [profile.release] debug = true diff --git a/src/bp_vec.rs b/src/bp_vec.rs deleted file mode 100644 index f91e6aa22..000000000 --- a/src/bp_vec.rs +++ /dev/null @@ -1,201 +0,0 @@ -use byteorder::{ByteOrder, NativeEndian}; -use bitpacking::{BitPacker, BitPacker4x}; - -/// An append only bitpacked u32 vector that ignore order of insertion. -#[derive(Default)] -pub struct BpVec { - compressed: Vec, - uncompressed: Vec, -} - -impl BpVec { - pub fn new() -> BpVec { - BpVec::default() - } - - pub fn push(&mut self, elem: u32) { - self.uncompressed.push(elem); - if self.uncompressed.len() == BitPacker4x::BLOCK_LEN { - encode(&mut self.uncompressed[..], &mut self.compressed); - self.uncompressed.clear(); - } - } - - pub fn extend_from_slice(&mut self, elems: &[u32]) { - self.uncompressed.extend_from_slice(elems); - let remaining = self.uncompressed.len() % BitPacker4x::BLOCK_LEN; - for chunk in self.uncompressed[remaining..].chunks_exact_mut(BitPacker4x::BLOCK_LEN) { - encode(chunk, &mut self.compressed); - } - self.uncompressed.truncate(remaining); - self.uncompressed.shrink_to_fit(); - } - - pub fn to_vec(self) -> Vec { - let BpVec { compressed, mut uncompressed } = self; - decode(&compressed, &mut uncompressed); - uncompressed - } - - pub fn compressed_capacity(&self) -> usize { - self.compressed.capacity() - } - - pub fn uncompressed_capacity(&self) -> usize { - self.uncompressed.capacity() - } -} - -fn encode(items: &mut [u32], encoded: &mut Vec) { - assert_eq!(items.len(), BitPacker4x::BLOCK_LEN); - - let bitpacker = BitPacker4x::new(); - - // We reserve enough space in the output buffer, filled with zeroes. - let len = encoded.len(); - // initial_value + num_bits + encoded numbers - let max_possible_length = 4 + 1 + 4 * BitPacker4x::BLOCK_LEN; - encoded.resize(len + max_possible_length, 0); - - // We sort the items to be able to efficiently bitpack them. - items.sort_unstable(); - // We save the initial value to us for this block, the lowest one. - let initial_value = items[0]; - // We compute the number of bits necessary to encode this block - let num_bits = bitpacker.num_bits_sorted(initial_value, items); - - // We write the initial value for this block. 
- let buffer = &mut encoded[len..]; - NativeEndian::write_u32(buffer, initial_value); - // We write the num_bits that will be read to decode this block - let buffer = &mut buffer[4..]; - buffer[0] = num_bits; - // We encode the block numbers into the buffer using the num_bits - let buffer = &mut buffer[1..]; - let compressed_len = bitpacker.compress_sorted(initial_value, items, buffer, num_bits); - - // We truncate the buffer to the avoid leaking padding zeroes - encoded.truncate(len + 4 + 1 + compressed_len); -} - -fn decode(mut encoded: &[u8], decoded: &mut Vec) { - let bitpacker = BitPacker4x::new(); - - // initial_value + num_bits - while let Some(header) = encoded.get(0..4 + 1) { - // We extract the header informations - let initial_value = NativeEndian::read_u32(header); - let num_bits = header[4]; - let bytes = &encoded[4 + 1..]; - - // If the num_bits is equal to zero it means that all encoded numbers were zeroes - if num_bits == 0 { - decoded.resize(decoded.len() + BitPacker4x::BLOCK_LEN, initial_value); - encoded = bytes; - continue; - } - - // We guess the block size based on the num_bits used for this block - let block_size = BitPacker4x::compressed_block_size(num_bits); - - // We pad the decoded vector with zeroes - let new_len = decoded.len() + BitPacker4x::BLOCK_LEN; - decoded.resize(new_len, 0); - - // Create a view into the decoded buffer and decode into it - let to_decompress = &mut decoded[new_len - BitPacker4x::BLOCK_LEN..new_len]; - bitpacker.decompress_sorted(initial_value, &bytes[..block_size], to_decompress, num_bits); - - // Advance the bytes offset to read the next block (+ num_bits) - encoded = &bytes[block_size..]; - } -} - -impl sdset::Collection for BpVec { - fn push(&mut self, elem: u32) { - BpVec::push(self, elem); - } - - fn extend_from_slice(&mut self, elems: &[u32]) { - BpVec::extend_from_slice(self, elems); - } - - fn extend(&mut self, elems: I) where I: IntoIterator { - elems.into_iter().for_each(|x| BpVec::push(self, x)); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - quickcheck! { - fn qc_push(xs: Vec) -> bool { - let mut xs: Vec<_> = xs.iter().cloned().cycle().take(1300).collect(); - - let mut bpvec = BpVec::new(); - xs.iter().for_each(|x| bpvec.push(*x)); - let mut result = bpvec.to_vec(); - - result.sort_unstable(); - xs.sort_unstable(); - - xs == result - } - } - - quickcheck! 
{ - fn qc_extend_from_slice(xs: Vec) -> bool { - let mut xs: Vec<_> = xs.iter().cloned().cycle().take(1300).collect(); - - let mut bpvec = BpVec::new(); - bpvec.extend_from_slice(&xs); - let mut result = bpvec.to_vec(); - - result.sort_unstable(); - xs.sort_unstable(); - - xs == result - } - } - - #[test] - fn empty() { - let mut bpvec = BpVec::new(); - bpvec.extend_from_slice(&[]); - let result = bpvec.to_vec(); - - assert!(result.is_empty()); - } - - #[test] - fn one_zero() { - let mut bpvec = BpVec::new(); - bpvec.extend_from_slice(&[0]); - let result = bpvec.to_vec(); - - assert_eq!(&[0], &*result); - } - - #[test] - fn many_zeros() { - let xs: Vec<_> = std::iter::repeat(0).take(1300).collect(); - - let mut bpvec = BpVec::new(); - bpvec.extend_from_slice(&xs); - let result = bpvec.to_vec(); - - assert_eq!(xs, result); - } - - #[test] - fn many_ones() { - let xs: Vec<_> = std::iter::repeat(1).take(1300).collect(); - - let mut bpvec = BpVec::new(); - bpvec.extend_from_slice(&xs); - let result = bpvec.to_vec(); - - assert_eq!(xs, result); - } -} diff --git a/src/codec/bitpacker_sorted.rs b/src/codec/bitpacker_sorted.rs deleted file mode 100644 index c51b4d71c..000000000 --- a/src/codec/bitpacker_sorted.rs +++ /dev/null @@ -1,90 +0,0 @@ -use bitpacking::{BitPacker, BitPacker4x}; -use byteorder::{ReadBytesExt, NativeEndian}; -use zerocopy::AsBytes; - -pub struct CodecBitPacker4xSorted; - -impl CodecBitPacker4xSorted { - pub fn bytes_encode(item: &[u32]) -> Option> { - // This is a hotfix to the SIGSEGV - // https://github.com/tantivy-search/bitpacking/issues/23 - if item.is_empty() { - return Some(Vec::default()) - } - - let bitpacker = BitPacker4x::new(); - let mut compressed = Vec::new(); - let mut initial_value = 0; - - // The number of remaining numbers that don't fit in the block size. - compressed.push((item.len() % BitPacker4x::BLOCK_LEN) as u8); - - // we cannot use a mut slice here because of #68630, TooGeneric error. - // we can probably avoid this new allocation by directly using the compressed final Vec. 
- let mut buffer = vec![0u8; 4 * BitPacker4x::BLOCK_LEN]; - - for chunk in item.chunks(BitPacker4x::BLOCK_LEN) { - if chunk.len() == BitPacker4x::BLOCK_LEN { - // compute the number of bits necessary to encode this block - let num_bits = bitpacker.num_bits_sorted(initial_value, chunk); - // Encode the block numbers into the buffer using the num_bits - let compressed_len = bitpacker.compress_sorted(initial_value, chunk, &mut buffer, num_bits); - // Write the num_bits that will be read to decode this block - compressed.push(num_bits); - // Wrtie the bytes of the compressed block numbers - compressed.extend_from_slice(&buffer[..compressed_len]); - // Save the initial_value, which is the last value of the n-1 used for the n block - initial_value = *chunk.last().unwrap(); - } else { - // Save the remaining numbers which don't fit inside of a BLOCK_LEN - compressed.extend_from_slice(chunk.as_bytes()); - } - } - - Some(compressed) - } - - pub fn bytes_decode(bytes: &[u8]) -> Option> { - if bytes.is_empty() { - return Some(Vec::new()) - } - - let bitpacker = BitPacker4x::new(); - let (remaining, bytes) = bytes.split_first().unwrap(); - let remaining = *remaining as usize; - - let (mut bytes, mut remaining_bytes) = bytes.split_at(bytes.len() - remaining * 4); - let mut decompressed = Vec::new(); - let mut initial_value = 0; - - while let Some(num_bits) = bytes.get(0) { - if *num_bits == 0 { - decompressed.resize(decompressed.len() + BitPacker4x::BLOCK_LEN, initial_value); - bytes = &bytes[1..]; - continue; - } - - let block_size = BitPacker4x::compressed_block_size(*num_bits); - - let new_len = decompressed.len() + BitPacker4x::BLOCK_LEN; - decompressed.resize(new_len, 0); - - // Create a view into the decompressed buffer and decomress into it - let to_decompress = &mut decompressed[new_len - BitPacker4x::BLOCK_LEN..new_len]; - bitpacker.decompress_sorted(initial_value, &bytes[1..block_size + 1], to_decompress, *num_bits); - - // Set the new initial_value for the next block - initial_value = *decompressed.last().unwrap(); - // Advance the bytes offset to read the next block (+ num_bits) - bytes = &bytes[block_size + 1..]; - } - - // We add the remaining uncompressed numbers. 
- let new_len = decompressed.len() + remaining; - decompressed.resize(new_len, 0); - let to_decompress = &mut decompressed[new_len - remaining..new_len]; - remaining_bytes.read_u32_into::(to_decompress).ok()?; - - Some(decompressed) - } -} diff --git a/src/codec/mod.rs b/src/codec/mod.rs deleted file mode 100644 index 451839fea..000000000 --- a/src/codec/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ -mod bitpacker_sorted; - -pub use self::bitpacker_sorted::CodecBitPacker4xSorted; diff --git a/src/main.rs b/src/main.rs index 2d3cda946..a2784cae3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,37 +1,32 @@ -#[cfg(test)] -#[macro_use] extern crate quickcheck; - -mod codec; -mod bp_vec; - use std::collections::{HashMap, BTreeSet}; use std::convert::TryFrom; use std::fs::File; use std::hash::BuildHasherDefault; use std::path::PathBuf; -use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; +use std::thread; use anyhow::{ensure, Context}; +use roaring::RoaringBitmap; +use crossbeam_channel::{select, Sender, Receiver}; use fst::IntoStreamer; use fxhash::FxHasher32; +use heed::{EnvOpenOptions, Database}; +use heed::types::*; use rayon::prelude::*; -use sdset::{SetOperation, SetBuf}; use slice_group_by::StrGroupBy; use structopt::StructOpt; -use zerocopy::{LayoutVerified, AsBytes}; - -// use self::codec::CodecBitPacker4xSorted; -use self::bp_vec::BpVec; pub type FastMap4 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; +pub type BEU32 = heed::zerocopy::U32; +pub type DocumentId = u32; #[cfg(target_os = "linux")] #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; -static ID_GENERATOR: AtomicUsize = AtomicUsize::new(0); +static ID_GENERATOR: AtomicUsize = AtomicUsize::new(0); // AtomicU32 ? #[derive(Debug, StructOpt)] #[structopt(name = "mm-indexer", about = "The server side of the daugt project.")] @@ -45,73 +40,24 @@ struct Opt { files_to_index: Vec, } -fn bytes_to_u32s(bytes: &[u8]) -> Vec { - fn aligned_to(bytes: &[u8], align: usize) -> bool { - (bytes as *const _ as *const () as usize) % align == 0 - } - - match LayoutVerified::new_slice(bytes) { - Some(slice) => slice.to_vec(), - None => { - let len = bytes.len(); - - // ensure that it is the alignment that is wrong and the length is valid - assert!(len % 4 == 0, "length is {} and is not modulo 4", len); - assert!(!aligned_to(bytes, std::mem::align_of::()), "bytes are already aligned"); - - let elems = len / 4; - let mut vec = Vec::::with_capacity(elems); - - unsafe { - let dst = vec.as_mut_ptr() as *mut u8; - std::ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len); - vec.set_len(elems); - } - - vec +fn union_postings_ids(_key: &[u8], old_value: Option<&[u8]>, new_value: RoaringBitmap) -> Option> { + let result = match old_value { + Some(bytes) => { + let mut old_value = RoaringBitmap::deserialize_from(bytes).unwrap(); + old_value.union_with(&new_value); + old_value }, - } + None => new_value, + }; + + let mut vec = Vec::new(); + result.serialize_into(&mut vec).unwrap(); + Some(vec) } -fn union_postings_ids( - _key: &[u8], - old_value: Option<&[u8]>, - operands: &mut rocksdb::MergeOperands, -) -> Option> -{ - let mut sets_bufs = Vec::new(); - - if let Some(old_value) = old_value { - let old_value = bytes_to_u32s(old_value); - sets_bufs.push(SetBuf::new_unchecked(old_value.to_vec())); - } - - for operand in operands { - let new_value = bytes_to_u32s(operand); - sets_bufs.push(SetBuf::new_unchecked(new_value.to_vec())); - } - - let sets = sets_bufs.iter().map(|s| s.as_set()).collect(); - 
let result: SetBuf<u32> = sdset::multi::Union::new(sets).into_set_buf();
-
- assert!(result.as_bytes().len() % 4 == 0);
-
- Some(result.as_bytes().to_vec())
-}
-
-fn union_words_fst(
- key: &[u8],
- old_value: Option<&[u8]>,
- operands: &mut rocksdb::MergeOperands,
-) -> Option<Vec<u8>>
-{
+fn union_words_fst(key: &[u8], old_value: Option<&[u8]>, new_value: &fst::Set<Vec<u8>>) -> Option<Vec<u8>> {
 if key != b"words-fst" { unimplemented!() }

- let mut fst_operands = Vec::new();
- for operand in operands {
- fst_operands.push(fst::Set::new(operand).unwrap());
- }
-
 // Do an union of the old and the new set of words.
 let mut builder = fst::set::OpBuilder::new();
@@ -121,9 +67,7 @@ fn union_words_fst(
 builder.push(old_words);
 }

- for new_words in &fst_operands {
- builder.push(new_words.into_stream());
- }
+ builder.push(new_value);

 let op = builder.r#union();
 let mut build = fst::SetBuilder::memory();
@@ -137,19 +81,94 @@ fn alphanumeric_tokens(string: &str) -> impl Iterator {
 string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
 }

-fn index_csv(
- tid: usize,
- db: Arc<rocksdb::DB>,
- mut rdr: csv::Reader<File>,
-) -> anyhow::Result<usize>
-{
+enum MainKey {
+ WordsFst(fst::Set<Vec<u8>>),
+ Headers(Vec<u8>),
+}
+
+#[derive(Clone)]
+struct DbSender {
+ main: Sender<MainKey>,
+ postings_ids: Sender<(SmallString32, RoaringBitmap)>,
+ documents: Sender<(DocumentId, Vec<u8>)>,
+}
+
+struct DbReceiver {
+ main: Receiver<MainKey>,
+ postings_ids: Receiver<(SmallString32, RoaringBitmap)>,
+ documents: Receiver<(DocumentId, Vec<u8>)>,
+}
+
+fn thread_channel() -> (DbSender, DbReceiver) {
+ let (sd_main, rc_main) = crossbeam_channel::bounded(4);
+ let (sd_postings, rc_postings) = crossbeam_channel::bounded(10);
+ let (sd_documents, rc_documents) = crossbeam_channel::bounded(10);
+
+ let sender = DbSender { main: sd_main, postings_ids: sd_postings, documents: sd_documents };
+ let receiver = DbReceiver { main: rc_main, postings_ids: rc_postings, documents: rc_documents };
+
+ (sender, receiver)
+}
+
+fn writer_thread(env: heed::Env, receiver: DbReceiver) -> anyhow::Result<()> {
+ let main = env.create_poly_database(None)?;
+ let postings_ids: Database<Str, ByteSlice> = env.create_database(Some("postings-ids"))?;
+ let documents: Database<OwnedType<BEU32>, ByteSlice> = env.create_database(Some("documents"))?;
+
+ let mut wtxn = env.write_txn()?;
+
+ loop {
+ select! {
+ recv(receiver.main) -> msg => {
+ let msg = match msg {
+ Err(_) => break,
+ Ok(msg) => msg,
+ };
+
+ match msg {
+ MainKey::WordsFst(new_fst) => {
+ let old_value = main.get::<_, Str, ByteSlice>(&wtxn, "words-fst")?;
+ let new_value = union_words_fst(b"words-fst", old_value, &new_fst)
+ .context("error while do a words-fst union")?;
+ main.put::<_, Str, ByteSlice>(&mut wtxn, "words-fst", &new_value)?;
+ },
+ MainKey::Headers(headers) => {
+ if let Some(old_headers) = main.get::<_, Str, ByteSlice>(&wtxn, "headers")?
{ + ensure!(old_headers == &*headers, "headers differs from the previous ones"); + } + main.put::<_, Str, ByteSlice>(&mut wtxn, "headers", &headers)?; + }, + } + }, + recv(receiver.postings_ids) -> msg => { + let (word, postings) = match msg { + Err(_) => break, + Ok(msg) => msg, + }; + + let old_value = postings_ids.get(&wtxn, &word)?; + let new_value = union_postings_ids(word.as_bytes(), old_value, postings) + .context("error while do a words-fst union")?; + postings_ids.put(&mut wtxn, &word, &new_value)?; + }, + recv(receiver.documents) -> msg => { + let (id, content) = match msg { + Err(_) => break, + Ok(msg) => msg, + }; + documents.put(&mut wtxn, &BEU32::new(id), &content)?; + }, + } + } + + wtxn.commit()?; + Ok(()) +} + +fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader) -> anyhow::Result { const MAX_POSITION: usize = 1000; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; - let main = db.cf_handle("main").context("cf \"main\" not found")?; - let postings_ids = db.cf_handle("postings-ids").context("cf \"postings-ids\" not found")?; - let documents = db.cf_handle("documents").context("cf \"documents\" not found")?; - let mut document = csv::StringRecord::new(); let mut new_postings_ids = FastMap4::default(); let mut new_words = BTreeSet::default(); @@ -160,19 +179,19 @@ fn index_csv( let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new()); writer.write_byte_record(headers.as_byte_record())?; let headers = writer.into_inner()?; - - if let Some(old_headers) = db.get_cf(&main, "headers")? { - ensure!(old_headers == headers, "headers differs from the previous ones"); - } - db.put_cf(&main, "headers", headers.as_slice())?; + db_sender.main.send(MainKey::Headers(headers))?; while rdr.read_record(&mut document)? { let document_id = ID_GENERATOR.fetch_add(1, Ordering::SeqCst); - let document_id = u32::try_from(document_id).context("Generated id is too big")?; + let document_id = DocumentId::try_from(document_id).context("Generated id is too big")?; for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { - new_postings_ids.entry(SmallString32::from(word)).or_insert_with(BpVec::new).push(document_id); + if !word.is_empty() && word.len() < 500 { // LMDB limits + new_postings_ids.entry(SmallString32::from(word)) + .or_insert_with(RoaringBitmap::new) + .insert(document_id); + } } } @@ -180,15 +199,11 @@ fn index_csv( let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new()); writer.write_byte_record(document.as_byte_record())?; let document = writer.into_inner()?; - db.put_cf(&documents, document_id.to_be_bytes(), document)?; + db_sender.documents.send((document_id, document))?; number_of_documents += 1; if number_of_documents % 100000 == 0 { - let postings_ids_size = new_postings_ids.iter().map(|(_, v)| { - v.compressed_capacity() + v.uncompressed_capacity() * 4 - }).sum::(); - eprintln!("{}, documents seen {}, postings size {}", - tid, number_of_documents, postings_ids_size); + eprintln!("{}, documents seen {}", tid, number_of_documents); } } @@ -196,8 +211,7 @@ fn index_csv( // We compute and store the postings list into the DB. 
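// The select! loop above is the single-writer pattern: indexing workers only
// send over bounded channels (which also gives backpressure), and one thread
// owns the LMDB write transaction. A reduced sketch of that shape, collapsing
// the patch's three channels into one; the `Msg` type, capacity, and handler
// body are illustrative assumptions, not the patch's code.

use crossbeam_channel::{bounded, Receiver};
use std::thread;

enum Msg {
    Postings(String, Vec<u32>),
    Stop,
}

fn spawn_writer(receiver: Receiver<Msg>) -> thread::JoinHandle<()> {
    thread::spawn(move || {
        // Only this thread touches the database, so one long-lived write
        // transaction can batch every incoming message before a final commit.
        while let Ok(msg) = receiver.recv() {
            match msg {
                Msg::Postings(_word, _ids) => { /* merge and put into LMDB here */ }
                Msg::Stop => break,
            }
        }
    })
}

fn main() {
    let (sender, receiver) = bounded(10); // bounded => senders block when full
    let writer = spawn_writer(receiver);
    sender.send(Msg::Postings("hello".into(), vec![1, 2, 3])).unwrap();
    sender.send(Msg::Stop).unwrap();
    writer.join().unwrap();
}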
for (word, new_ids) in new_postings_ids { - let new_ids = SetBuf::from_dirty(new_ids.to_vec()); - db.merge_cf(&postings_ids, word.as_bytes(), new_ids.as_bytes())?; + db_sender.postings_ids.send((word.clone(), new_ids))?; new_words.insert(word); } @@ -207,7 +221,7 @@ fn index_csv( let new_words_fst = fst::Set::from_iter(new_words.iter().map(|s| s.as_str()))?; drop(new_words); - db.merge_cf(&main, "words-fst", new_words_fst.as_fst().as_bytes())?; + db_sender.main.send(MainKey::WordsFst(new_words_fst))?; eprintln!("Finished merging the words-fst"); eprintln!("Total number of documents seen is {}", ID_GENERATOR.load(Ordering::Relaxed)); @@ -218,31 +232,30 @@ fn index_csv( fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); - let mut opts = rocksdb::Options::default(); - opts.create_if_missing(true); - opts.create_missing_column_families(true); - // Setup the merge operators - opts.set_merge_operator("main", union_words_fst, None); // Some(union_words_fst)); - opts.set_merge_operator("postings-ids", union_postings_ids, None); // Some(union_postings_ids)); + std::fs::create_dir_all(&opt.database)?; + let env = EnvOpenOptions::new() + .map_size(100 * 1024 * 1024 * 1024) // 100 GB + .max_readers(10) + .max_dbs(5) + .open(opt.database)?; - let mut db = rocksdb::DB::open(&opts, &opt.database)?; + let (sender, receiver) = thread_channel(); + let writing_child = thread::spawn(move || writer_thread(env, receiver)); - let cfs = &["main", "postings-ids", "documents"]; - for cf in cfs.into_iter() { - db.create_cf(cf, &opts).unwrap(); - } - - let db = Arc::new(db); let res = opt.files_to_index .into_par_iter() .enumerate() .map(|(tid, path)| { let rdr = csv::Reader::from_path(path)?; - index_csv(tid, db.clone(), rdr) + index_csv(tid, sender.clone(), rdr) }) .try_reduce(|| 0, |a, b| Ok(a + b)); - println!("{:?}", res); + + eprintln!("witing the writing thread..."); + writing_child.join().unwrap().unwrap(); + + println!("indexed {:?} documents", res); Ok(()) } From 3a998cf39c878ec49c708223430ae3885d596fc0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sat, 30 May 2020 19:56:57 +0200 Subject: [PATCH 0006/1889] Far better usage of rayon to fold indexed data --- Cargo.lock | 11 --- Cargo.toml | 1 - src/main.rs | 208 +++++++++++++++++++++++++++------------------------- 3 files changed, 107 insertions(+), 113 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24008fc08..abc44ba56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,16 +84,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "crossbeam-channel" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cced8691919c02aac3cb0a1bc2e9b73d89e832bf9a06fc579d4e71b68a2da061" -dependencies = [ - "crossbeam-utils", - "maybe-uninit", -] - [[package]] name = "crossbeam-deque" version = "0.7.3" @@ -330,7 +320,6 @@ dependencies = [ "anyhow", "bitpacking", "byteorder 1.3.4", - "crossbeam-channel", "csv", "fst", "fxhash", diff --git a/Cargo.toml b/Cargo.toml index 51966ebf9..bf5f76152 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,6 @@ anyhow = "1.0.28" bitpacking = "0.8.2" byteorder = "1.3.4" roaring = "0.5.2" -crossbeam-channel = "0.4.2" csv = "1.1.3" fst = "0.4.3" fxhash = "0.2.1" diff --git a/src/main.rs b/src/main.rs index a2784cae3..553357612 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,17 +1,16 @@ +use std::collections::hash_map::Entry; use std::collections::{HashMap, BTreeSet}; use std::convert::TryFrom; use std::fs::File; use std::hash::BuildHasherDefault; use std::path::PathBuf; use 
std::sync::atomic::{AtomicUsize, Ordering}; -use std::thread; use anyhow::{ensure, Context}; use roaring::RoaringBitmap; -use crossbeam_channel::{select, Sender, Receiver}; use fst::IntoStreamer; use fxhash::FxHasher32; -use heed::{EnvOpenOptions, Database}; +use heed::{EnvOpenOptions, PolyDatabase, Database}; use heed::types::*; use rayon::prelude::*; use slice_group_by::StrGroupBy; @@ -81,97 +80,67 @@ fn alphanumeric_tokens(string: &str) -> impl Iterator { string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric) } -enum MainKey { - WordsFst(fst::Set>), - Headers(Vec), +#[derive(Default)] +struct Indexed { + fst: fst::Set>, + postings_ids: FastMap4, + headers: Vec, + documents: Vec<(DocumentId, Vec)>, } -#[derive(Clone)] -struct DbSender { - main: Sender, - postings_ids: Sender<(SmallString32, RoaringBitmap)>, - documents: Sender<(DocumentId, Vec)>, -} +impl Indexed { + fn merge_with(mut self, mut other: Indexed) -> Indexed { -struct DbReceiver { - main: Receiver, - postings_ids: Receiver<(SmallString32, RoaringBitmap)>, - documents: Receiver<(DocumentId, Vec)>, -} + // Union of the two FSTs + let op = fst::set::OpBuilder::new() + .add(self.fst.into_stream()) + .add(other.fst.into_stream()) + .r#union(); -fn thread_channel() -> (DbSender, DbReceiver) { - let (sd_main, rc_main) = crossbeam_channel::bounded(4); - let (sd_postings, rc_postings) = crossbeam_channel::bounded(10); - let (sd_documents, rc_documents) = crossbeam_channel::bounded(10); + let mut build = fst::SetBuilder::memory(); + build.extend_stream(op.into_stream()).unwrap(); + let fst = build.into_set(); - let sender = DbSender { main: sd_main, postings_ids: sd_postings, documents: sd_documents }; - let receiver = DbReceiver { main: rc_main, postings_ids: rc_postings, documents: rc_documents }; + // Merge the postings by unions + for (word, mut postings) in other.postings_ids { + match self.postings_ids.entry(word) { + Entry::Occupied(mut entry) => { + let old = entry.get(); + postings.union_with(&old); + entry.insert(postings); + }, + Entry::Vacant(entry) => { + entry.insert(postings); + }, + } + } - (sender, receiver) -} + // assert headers are valid + assert_eq!(self.headers, other.headers); -fn writer_thread(env: heed::Env, receiver: DbReceiver) -> anyhow::Result<()> { - let main = env.create_poly_database(None)?; - let postings_ids: Database = env.create_database(Some("postings-ids"))?; - let documents: Database, ByteSlice> = env.create_database(Some("documents"))?; + // extend the documents + self.documents.append(&mut other.documents); - let mut wtxn = env.write_txn()?; - - loop { - select! { - recv(receiver.main) -> msg => { - let msg = match msg { - Err(_) => break, - Ok(msg) => msg, - }; - - match msg { - MainKey::WordsFst(new_fst) => { - let old_value = main.get::<_, Str, ByteSlice>(&wtxn, "words-fst")?; - let new_value = union_words_fst(b"words-fst", old_value, &new_fst) - .context("error while do a words-fst union")?; - main.put::<_, Str, ByteSlice>(&mut wtxn, "words-fst", &new_value)?; - }, - MainKey::Headers(headers) => { - if let Some(old_headers) = main.get::<_, Str, ByteSlice>(&wtxn, "headers")? 
{ - ensure!(old_headers == &*headers, "headers differs from the previous ones"); - } - main.put::<_, Str, ByteSlice>(&mut wtxn, "headers", &headers)?; - }, - } - }, - recv(receiver.postings_ids) -> msg => { - let (word, postings) = match msg { - Err(_) => break, - Ok(msg) => msg, - }; - - let old_value = postings_ids.get(&wtxn, &word)?; - let new_value = union_postings_ids(word.as_bytes(), old_value, postings) - .context("error while do a words-fst union")?; - postings_ids.put(&mut wtxn, &word, &new_value)?; - }, - recv(receiver.documents) -> msg => { - let (id, content) = match msg { - Err(_) => break, - Ok(msg) => msg, - }; - documents.put(&mut wtxn, &BEU32::new(id), &content)?; - }, + Indexed { + fst, + postings_ids: self.postings_ids, + headers: self.headers, + documents: self.documents, } } - - wtxn.commit()?; - Ok(()) } -fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader) -> anyhow::Result { +fn index_csv( + tid: usize, + mut rdr: csv::Reader, +) -> anyhow::Result +{ const MAX_POSITION: usize = 1000; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; let mut document = csv::StringRecord::new(); - let mut new_postings_ids = FastMap4::default(); - let mut new_words = BTreeSet::default(); + let mut postings_ids = FastMap4::default(); + let mut documents = Vec::new(); let mut number_of_documents = 0; // Write the headers into a Vec of bytes. @@ -179,7 +148,6 @@ fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader) -> any let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new()); writer.write_byte_record(headers.as_byte_record())?; let headers = writer.into_inner()?; - db_sender.main.send(MainKey::Headers(headers))?; while rdr.read_record(&mut document)? { let document_id = ID_GENERATOR.fetch_add(1, Ordering::SeqCst); @@ -188,7 +156,7 @@ fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader) -> any for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { if !word.is_empty() && word.len() < 500 { // LMDB limits - new_postings_ids.entry(SmallString32::from(word)) + postings_ids.entry(SmallString32::from(word)) .or_insert_with(RoaringBitmap::new) .insert(document_id); } @@ -199,7 +167,7 @@ fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader) -> any let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new()); writer.write_byte_record(document.as_byte_record())?; let document = writer.into_inner()?; - db_sender.documents.send((document_id, document))?; + documents.push((document_id, document)); number_of_documents += 1; if number_of_documents % 100000 == 0 { @@ -207,26 +175,57 @@ fn index_csv(tid: usize, db_sender: DbSender, mut rdr: csv::Reader) -> any } } - eprintln!("Start collecting the postings lists and words"); + eprintln!("Start collecting the words into an FST"); // We compute and store the postings list into the DB. 
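// Patch 0006 replaces the writer thread with rayon's fold/reduce, visible in
// main() below: each worker folds the files it steals into one mergeable
// accumulator, and the per-worker accumulators are reduced pairwise before a
// single LMDB write pass. A reduced sketch of that shape with a stand-in
// accumulator; `Counted` and the line counting are illustrative assumptions.

use rayon::prelude::*;
use std::path::PathBuf;

#[derive(Default)]
struct Counted(usize);

impl Counted {
    // Merging must be associative, since rayon reduces in an arbitrary order.
    fn merge_with(self, other: Counted) -> Counted {
        Counted(self.0 + other.0)
    }
}

fn index_all(paths: Vec<PathBuf>) -> anyhow::Result<Counted> {
    paths.into_par_iter()
        // Each worker folds its share of the input into one accumulator...
        .try_fold(Counted::default, |acc: Counted, path: PathBuf| -> anyhow::Result<Counted> {
            let indexed = Counted(std::fs::read_to_string(&path)?.lines().count());
            Ok(acc.merge_with(indexed))
        })
        // ...and the per-worker accumulators are merged pairwise.
        .try_reduce(Counted::default, |a, b| Ok(a.merge_with(b)))
}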
- for (word, new_ids) in new_postings_ids { - db_sender.postings_ids.send((word.clone(), new_ids))?; - new_words.insert(word); + let mut new_words = BTreeSet::default(); + for (word, _new_ids) in &postings_ids { + new_words.insert(word.clone()); } - eprintln!("Finished collecting the postings lists and words"); + let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallString32::as_str))?; - eprintln!("Start merging the words-fst"); + eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)); - let new_words_fst = fst::Set::from_iter(new_words.iter().map(|s| s.as_str()))?; - drop(new_words); - db_sender.main.send(MainKey::WordsFst(new_words_fst))?; + Ok(Indexed { fst: new_words_fst, headers, postings_ids, documents }) +} - eprintln!("Finished merging the words-fst"); - eprintln!("Total number of documents seen is {}", ID_GENERATOR.load(Ordering::Relaxed)); +fn writer( + wtxn: &mut heed::RwTxn, + main: PolyDatabase, + postings_ids: Database, + documents: Database, ByteSlice>, + indexed: Indexed, +) -> anyhow::Result +{ + // Write and merge the words fst + let old_value = main.get::<_, Str, ByteSlice>(wtxn, "words-fst")?; + let new_value = union_words_fst(b"words-fst", old_value, &indexed.fst) + .context("error while do a words-fst union")?; + main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &new_value)?; - Ok(number_of_documents) + // Write and merge the headers + if let Some(old_headers) = main.get::<_, Str, ByteSlice>(wtxn, "headers")? { + ensure!(old_headers == &*indexed.headers, "headers differs from the previous ones"); + } + main.put::<_, Str, ByteSlice>(wtxn, "headers", &indexed.headers)?; + + // Write and merge the postings lists + for (word, postings) in indexed.postings_ids { + let old_value = postings_ids.get(wtxn, word.as_str())?; + let new_value = union_postings_ids(word.as_bytes(), old_value, postings) + .context("error while do a words-fst union")?; + postings_ids.put(wtxn, &word, &new_value)?; + } + + let count = indexed.documents.len(); + + // Write the documents + for (id, content) in indexed.documents { + documents.put(wtxn, &BEU32::new(id), &content)?; + } + + Ok(count) } fn main() -> anyhow::Result<()> { @@ -239,22 +238,29 @@ fn main() -> anyhow::Result<()> { .max_dbs(5) .open(opt.database)?; - let (sender, receiver) = thread_channel(); - let writing_child = thread::spawn(move || writer_thread(env, receiver)); + let main = env.create_poly_database(None)?; + let postings_ids: Database = env.create_database(Some("postings-ids"))?; + let documents: Database, ByteSlice> = env.create_database(Some("documents"))?; let res = opt.files_to_index .into_par_iter() .enumerate() - .map(|(tid, path)| { + .try_fold(|| Indexed::default(), |acc, (tid, path)| { let rdr = csv::Reader::from_path(path)?; - index_csv(tid, sender.clone(), rdr) + let indexed = index_csv(tid, rdr)?; + Ok(acc.merge_with(indexed)) as anyhow::Result + }) + .map(|indexed| match indexed { + Ok(indexed) => { + let mut wtxn = env.write_txn()?; + let count = writer(&mut wtxn, main, postings_ids, documents, indexed)?; + wtxn.commit()?; + Ok(count) + }, + Err(e) => Err(e), }) .try_reduce(|| 0, |a, b| Ok(a + b)); - - eprintln!("witing the writing thread..."); - writing_child.join().unwrap().unwrap(); - println!("indexed {:?} documents", res); Ok(()) From 6762c2d08f004be1f124fbf8d1656596e9d1571c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 31 May 2020 12:29:19 +0200 Subject: [PATCH 0007/1889] Clean up a little bit --- src/main.rs | 31 ++++++++++++------------------- 1 
file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/main.rs b/src/main.rs index 553357612..1dd6551ef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -116,7 +116,9 @@ impl Indexed { } // assert headers are valid - assert_eq!(self.headers, other.headers); + if !self.headers.is_empty() { + assert_eq!(self.headers, other.headers); + } // extend the documents self.documents.append(&mut other.documents); @@ -130,18 +132,13 @@ impl Indexed { } } -fn index_csv( - tid: usize, - mut rdr: csv::Reader, -) -> anyhow::Result -{ +fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { const MAX_POSITION: usize = 1000; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; let mut document = csv::StringRecord::new(); let mut postings_ids = FastMap4::default(); let mut documents = Vec::new(); - let mut number_of_documents = 0; // Write the headers into a Vec of bytes. let headers = rdr.headers()?; @@ -168,15 +165,8 @@ fn index_csv( writer.write_byte_record(document.as_byte_record())?; let document = writer.into_inner()?; documents.push((document_id, document)); - - number_of_documents += 1; - if number_of_documents % 100000 == 0 { - eprintln!("{}, documents seen {}", tid, number_of_documents); - } } - eprintln!("Start collecting the words into an FST"); - // We compute and store the postings list into the DB. let mut new_words = BTreeSet::default(); for (word, _new_ids) in &postings_ids { @@ -185,8 +175,6 @@ fn index_csv( let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallString32::as_str))?; - eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)); - Ok(Indexed { fst: new_words_fst, headers, postings_ids, documents }) } @@ -244,21 +232,26 @@ fn main() -> anyhow::Result<()> { let res = opt.files_to_index .into_par_iter() - .enumerate() - .try_fold(|| Indexed::default(), |acc, (tid, path)| { + .try_fold(|| Indexed::default(), |acc, path| { let rdr = csv::Reader::from_path(path)?; - let indexed = index_csv(tid, rdr)?; + let indexed = index_csv(rdr)?; Ok(acc.merge_with(indexed)) as anyhow::Result }) .map(|indexed| match indexed { Ok(indexed) => { + let tid = rayon::current_thread_index(); + eprintln!("{:?}: A new step to write into LMDB", tid); let mut wtxn = env.write_txn()?; let count = writer(&mut wtxn, main, postings_ids, documents, indexed)?; wtxn.commit()?; + eprintln!("{:?}: Wrote {} documents into LMDB", tid, count); Ok(count) }, Err(e) => Err(e), }) + .inspect(|_| { + eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)) + }) .try_reduce(|| 0, |a, b| Ok(a + b)); println!("indexed {:?} documents", res); From 24587148fdb8009d78e13d254616b39c9b091575 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 31 May 2020 14:20:17 +0200 Subject: [PATCH 0008/1889] Introduce MTBL parallel merging before LMDB writing --- Cargo.lock | 225 ++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 6 +- src/main.rs | 255 +++++++++++++++++++++++++++++----------------------- 3 files changed, 372 insertions(+), 114 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index abc44ba56..1fa4b92e8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,5 +1,11 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
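Patch 0008's subject names a classic external-merge step: each rayon worker writes its partial postings as a sorted run (an MTBL file on disk), and the runs are merged key by key before the one sequential LMDB write pass. A std-only sketch of the underlying k-way merge, with a last-run-wins policy standing in for the real posting-list union (the actual patch uses oxidized-mtbl, whose API is not shown here):

use std::cmp::Reverse;
use std::collections::BinaryHeap;

// Merge several runs of (key, value) pairs, each sorted by key, into one
// sorted stream; equal keys across runs collapse, with the last run winning.
fn merge_sorted_runs(runs: &[Vec<(Vec<u8>, Vec<u8>)>]) -> Vec<(Vec<u8>, Vec<u8>)> {
    let mut heap = BinaryHeap::new();
    for (run, entries) in runs.iter().enumerate() {
        if let Some((key, _)) = entries.first() {
            heap.push(Reverse((key.clone(), run, 0usize)));
        }
    }

    let mut merged: Vec<(Vec<u8>, Vec<u8>)> = Vec::new();
    while let Some(Reverse((key, run, idx))) = heap.pop() {
        let value = runs[run][idx].1.clone();
        match merged.last_mut() {
            // A real merger would union the two posting lists here instead.
            Some((last_key, slot)) if *last_key == key => *slot = value,
            _ => merged.push((key, value)),
        }
        if let Some((next_key, _)) = runs[run].get(idx + 1) {
            heap.push(Reverse((next_key.clone(), run, idx + 1)));
        }
    }
    merged
}

Because every run is sorted, the heap always yields the globally smallest remaining key, so the output needs no final sort before being streamed into LMDB in key order.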
+[[package]] +name = "adler32" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d2e7343e7fc9de883d1b0341e0b13970f764c14101234857d2ddafa1cb1cac2" + [[package]] name = "anyhow" version = "1.0.31" @@ -66,6 +72,9 @@ name = "cc" version = "1.0.54" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311" +dependencies = [ + "jobserver", +] [[package]] name = "cfg-if" @@ -84,6 +93,21 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "crc32c" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ba37ef26c12988c1cee882d522d65e1d5d2ad8c3864665b88ee92767ed84c5" + +[[package]] +name = "crc32fast" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-deque" version = "0.7.3" @@ -165,6 +189,18 @@ version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" +[[package]] +name = "flate2" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cfff41391129e0a856d6d822600b8d71179d46879e310417eb9c762eb178b42" +dependencies = [ + "cfg-if", + "crc32fast", + "libc", + "miniz_oxide", +] + [[package]] name = "fs_extra" version = "1.1.0" @@ -186,6 +222,23 @@ dependencies = [ "byteorder 1.3.4", ] +[[package]] +name = "getrandom" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + [[package]] name = "heck" version = "0.3.1" @@ -251,6 +304,15 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "itertools" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.5" @@ -278,6 +340,15 @@ dependencies = [ "libc", ] +[[package]] +name = "jobserver" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c71313ebb9439f74b00d9d2dcec36440beaf57a6aa0623068441dd7cd81a7f2" +dependencies = [ + "libc", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -325,11 +396,15 @@ dependencies = [ "fxhash", "heed", "jemallocator", + "memmap", + "oxidized-mtbl", "rayon", "roaring", "slice-group-by", "smallstr", + "smallvec", "structopt", + "tempfile", ] [[package]] @@ -338,6 +413,16 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +[[package]] +name = "memmap" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "memoffset" version = "0.5.4" @@ -347,6 +432,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "miniz_oxide" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa679ff6578b1cddee93d7e82e263b94a575e0bfced07284eb0c037c1d2416a5" +dependencies = [ + "adler32", +] + [[package]] name = "num_cpus" version = "1.13.0" @@ -363,6 +457,18 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b631f7e854af39a1739f401cf34a8a013dfe09eac4fa4dba91e9768bd28168d" +[[package]] +name = "oxidized-mtbl" +version = "0.1.0" +source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=8918476#8918476f61f4430890d067db7b4a6cfb2d549c43" +dependencies = [ + "byteorder 1.3.4", + "crc32c", + "flate2", + "snap", + "zstd", +] + [[package]] name = "page_size" version = "0.4.2" @@ -385,6 +491,12 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677" +[[package]] +name = "ppv-lite86" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea" + [[package]] name = "proc-macro-error" version = "1.0.2" @@ -429,6 +541,47 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom", + "libc", + "rand_chacha", + "rand_core", + "rand_hc", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core", +] + [[package]] name = "rayon" version = "1.3.0" @@ -453,6 +606,12 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "redox_syscall" +version = "0.1.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" + [[package]] name = "regex-automata" version = "0.1.9" @@ -462,6 +621,15 @@ dependencies = [ "byteorder 1.3.4", ] +[[package]] +name = "remove_dir_all" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a83fa3702a688b9359eccba92d153ac33fd2e8462f9e0e3fdf155239ea7792e" +dependencies = [ + "winapi", +] + [[package]] name = "roaring" version = "0.5.2" @@ -521,6 +689,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4" +[[package]] +name = "snap" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7fb9b0bb877b35a1cc1474a3b43d9c226a2625311760cdda2cbccbc0c7a8376" + [[package]] name = "structopt" version = "0.3.14" @@ -579,6 +753,20 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "tempfile" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" +dependencies = [ + "cfg-if", + "libc", + "rand", + "redox_syscall", + "remove_dir_all", + "winapi", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -641,6 +829,12 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "winapi" version = "0.3.8" @@ -683,3 +877,34 @@ dependencies = [ "syn", "synstructure", ] + +[[package]] +name = "zstd" +version = "0.5.2+zstd.1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "644352b10ce7f333d6e0af85bd4f5322dc449416dc1211c6308e95bca8923db4" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "2.0.4+zstd.1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7113c0c9aed2c55181f2d9f5b0a36e7d2c0183b11c058ab40b35987479efe4d7" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "1.4.16+zstd.1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c442965efc45353be5a9b9969c9b0872fff6828c7e06d118dda2cb2d0bb11d5a" +dependencies = [ + "cc", + "glob", + "itertools", + "libc", +] diff --git a/Cargo.toml b/Cargo.toml index bf5f76152..abe475a1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,16 +8,20 @@ edition = "2018" anyhow = "1.0.28" bitpacking = "0.8.2" byteorder = "1.3.4" -roaring = "0.5.2" csv = "1.1.3" fst = "0.4.3" fxhash = "0.2.1" heed = { version = "0.8.0", default-features = false, features = ["lmdb"] } jemallocator = "0.3.2" +memmap = "0.7.0" +oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "8918476" } rayon = "1.3.0" +roaring = "0.5.2" slice-group-by = "0.2.6" smallstr = "0.2.0" +smallvec = "1.4.0" structopt = { version = "0.3.14", default-features = false } +tempfile = "3.1.0" [profile.release] debug = true diff --git a/src/main.rs b/src/main.rs index 1dd6551ef..7a22d16d7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,23 +1,25 @@ -use std::collections::hash_map::Entry; use std::collections::{HashMap, BTreeSet}; use std::convert::TryFrom; +use std::convert::TryInto; use std::fs::File; use std::hash::BuildHasherDefault; use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; -use anyhow::{ensure, Context}; -use roaring::RoaringBitmap; -use fst::IntoStreamer; +use anyhow::Context; +use fst::{Streamer, IntoStreamer}; use fxhash::FxHasher32; -use heed::{EnvOpenOptions, PolyDatabase, Database}; use heed::types::*; +use heed::{EnvOpenOptions, PolyDatabase, Database}; +use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions}; use rayon::prelude::*; +use roaring::RoaringBitmap; use slice_group_by::StrGroupBy; use structopt::StructOpt; pub type FastMap4 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; +pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>; pub type BEU32 = heed::zerocopy::U32; pub type DocumentId = u32; @@ -39,100 +41,126 @@ struct Opt { files_to_index: Vec, } -fn union_postings_ids(_key: &[u8], old_value: Option<&[u8]>, new_value: RoaringBitmap) -> Option> { - let result = match old_value { - Some(bytes) => { - let mut old_value = RoaringBitmap::deserialize_from(bytes).unwrap(); - 
old_value.union_with(&new_value); - old_value - }, - None => new_value, - }; - - let mut vec = Vec::new(); - result.serialize_into(&mut vec).unwrap(); - Some(vec) -} - -fn union_words_fst(key: &[u8], old_value: Option<&[u8]>, new_value: &fst::Set>) -> Option> { - if key != b"words-fst" { unimplemented!() } - - // Do an union of the old and the new set of words. - let mut builder = fst::set::OpBuilder::new(); - - let old_words = old_value.map(|v| fst::Set::new(v).unwrap()); - let old_words = old_words.as_ref().map(|v| v.into_stream()); - if let Some(old_words) = old_words { - builder.push(old_words); - } - - builder.push(new_value); - - let op = builder.r#union(); - let mut build = fst::SetBuilder::memory(); - build.extend_stream(op.into_stream()).unwrap(); - - Some(build.into_inner().unwrap()) -} - fn alphanumeric_tokens(string: &str) -> impl Iterator { let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric); string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric) } -#[derive(Default)] struct Indexed { fst: fst::Set>, - postings_ids: FastMap4, + postings_ids: FastMap4, headers: Vec, documents: Vec<(DocumentId, Vec)>, } -impl Indexed { - fn merge_with(mut self, mut other: Indexed) -> Indexed { +#[derive(Default)] +struct MtblKvStore(Option); - // Union of the two FSTs - let op = fst::set::OpBuilder::new() - .add(self.fst.into_stream()) - .add(other.fst.into_stream()) - .r#union(); +impl MtblKvStore { + fn from_indexed(mut indexed: Indexed) -> anyhow::Result { + let outfile = tempfile::tempfile()?; + let mut out = Writer::new(outfile, None)?; - let mut build = fst::SetBuilder::memory(); - build.extend_stream(op.into_stream()).unwrap(); - let fst = build.into_set(); + out.add(b"\0headers", indexed.headers)?; + out.add(b"\0words-fst", indexed.fst.as_fst().as_bytes())?; - // Merge the postings by unions - for (word, mut postings) in other.postings_ids { - match self.postings_ids.entry(word) { - Entry::Occupied(mut entry) => { - let old = entry.get(); - postings.union_with(&old); - entry.insert(postings); - }, - Entry::Vacant(entry) => { - entry.insert(postings); - }, + // postings ids keys are all prefixed by a '1' + let mut key = vec![1]; + let mut buffer = Vec::new(); + // We must write the postings ids in order for mtbl, therefore + // we iterate over the fst to read the words in order + let mut stream = indexed.fst.stream(); + while let Some(word) = stream.next() { + key.truncate(1); + key.extend_from_slice(word); + if let Some(ids) = indexed.postings_ids.remove(word) { + buffer.clear(); + ids.serialize_into(&mut buffer)?; + out.add(&key, &buffer).unwrap(); } } - // assert headers are valid - if !self.headers.is_empty() { - assert_eq!(self.headers, other.headers); + // documents ids keys are all prefixed by a '2' + key[0] = 2; + indexed.documents.sort_unstable(); + for (id, content) in indexed.documents { + key.truncate(1); + key.extend_from_slice(&id.to_be_bytes()); + out.add(&key, content).unwrap(); } - // extend the documents - self.documents.append(&mut other.documents); + let out = out.into_inner()?; + Ok(MtblKvStore(Some(out))) + } - Indexed { - fst, - postings_ids: self.postings_ids, - headers: self.headers, - documents: self.documents, + fn merge_with(self, other: MtblKvStore) -> anyhow::Result { + let (left, right) = match (self.0, other.0) { + (Some(left), Some(right)) => (left, right), + (Some(left), None) => return Ok(MtblKvStore(Some(left))), + (None, Some(right)) => return Ok(MtblKvStore(Some(right))), + (None, None) => return
Ok(MtblKvStore(None)), + }; + + let left = unsafe { memmap::Mmap::map(&left)? }; + let right = unsafe { memmap::Mmap::map(&right)? }; + + let left = Reader::new(&left, ReaderOptions::default()).unwrap(); + let right = Reader::new(&right, ReaderOptions::default()).unwrap(); + + fn merge(key: &[u8], left: &[u8], right: &[u8]) -> Option> { + if key == b"\0words-fst" { + let left_fst = fst::Set::new(left).unwrap(); + let right_fst = fst::Set::new(right).unwrap(); + + // Union of the two FSTs + let op = fst::set::OpBuilder::new() + .add(left_fst.into_stream()) + .add(right_fst.into_stream()) + .r#union(); + + let mut build = fst::SetBuilder::memory(); + build.extend_stream(op.into_stream()).unwrap(); + Some(build.into_inner().unwrap()) + } + else if key == b"\0headers" { + assert_eq!(left, right); + Some(left.to_vec()) + } + else if key.starts_with(&[1]) { + let mut left = RoaringBitmap::deserialize_from(left).unwrap(); + let right = RoaringBitmap::deserialize_from(right).unwrap(); + left.union_with(&right); + let mut vec = Vec::new(); + left.serialize_into(&mut vec).unwrap(); + Some(vec) + } + else if key.starts_with(&[2]) { + assert_eq!(left, right); + Some(left.to_vec()) + } + else { + panic!("wut? {:?}", key) + } } + + let outfile = tempfile::tempfile()?; + let mut out = Writer::new(outfile, None)?; + + let sources = vec![left, right]; + let opt = MergerOptions { merge }; + let mut merger = Merger::new(sources, opt); + + let mut iter = merger.iter(); + while let Some((k, v)) = iter.next() { + out.add(k, v).unwrap(); + } + + let out = out.into_inner()?; + Ok(MtblKvStore(Some(out))) } } -fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { +fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { const MAX_POSITION: usize = 1000; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; @@ -153,7 +181,7 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { if !word.is_empty() && word.len() < 500 { // LMDB limits - postings_ids.entry(SmallString32::from(word)) + postings_ids.entry(SmallVec32::from(word.as_bytes())) .or_insert_with(RoaringBitmap::new) .insert(document_id); } @@ -173,44 +201,51 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { new_words.insert(word.clone()); } - let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallString32::as_str))?; + let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallVec32::as_ref))?; - Ok(Indexed { fst: new_words_fst, headers, postings_ids, documents }) + let indexed = Indexed { fst: new_words_fst, headers, postings_ids, documents }; + + MtblKvStore::from_indexed(indexed) } +// TODO merge with the previous values fn writer( wtxn: &mut heed::RwTxn, main: PolyDatabase, postings_ids: Database, documents: Database, ByteSlice>, - indexed: Indexed, + mtbl_store: MtblKvStore, ) -> anyhow::Result { - // Write and merge the words fst - let old_value = main.get::<_, Str, ByteSlice>(wtxn, "words-fst")?; - let new_value = union_words_fst(b"words-fst", old_value, &indexed.fst) - .context("error while doing a words-fst union")?; - main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &new_value)?; + let mtbl_store = match mtbl_store.0 { + Some(store) => unsafe { memmap::Mmap::map(&store)?
}, + None => return Ok(0), + }; + let mtbl_store = Reader::new(&mtbl_store, ReaderOptions::default()).unwrap(); + + // Write the words fst + let fst = mtbl_store.get(b"\0words-fst").unwrap(); + let fst = fst::Set::new(fst)?; + main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &fst.as_fst().as_bytes())?; // Write and merge the headers - if let Some(old_headers) = main.get::<_, Str, ByteSlice>(wtxn, "headers")? { - ensure!(old_headers == &*indexed.headers, "headers differ from the previous ones"); - } - main.put::<_, Str, ByteSlice>(wtxn, "headers", &indexed.headers)?; + let headers = mtbl_store.get(b"\0headers").unwrap(); + main.put::<_, Str, ByteSlice>(wtxn, "headers", headers.as_ref())?; // Write and merge the postings lists - for (word, postings) in indexed.postings_ids { - let old_value = postings_ids.get(wtxn, word.as_str())?; - let new_value = union_postings_ids(word.as_bytes(), old_value, postings) - .context("error while doing a postings union")?; - postings_ids.put(wtxn, &word, &new_value)?; + let mut iter = mtbl_store.iter_prefix(&[1]).unwrap(); + while let Some((word, postings)) = iter.next() { + let word = std::str::from_utf8(&word[1..]).unwrap(); + postings_ids.put(wtxn, &word, &postings)?; } - let count = indexed.documents.len(); - // Write the documents - for (id, content) in indexed.documents { + let mut count = 0; + let mut iter = mtbl_store.iter_prefix(&[2]).unwrap(); + while let Some((id_bytes, content)) = iter.next() { + let id = id_bytes[1..].try_into().map(u32::from_be_bytes).unwrap(); documents.put(wtxn, &BEU32::new(id), &content)?; + count += 1; } Ok(count) @@ -232,29 +267,23 @@ fn main() -> anyhow::Result<()> { let res = opt.files_to_index .into_par_iter() - .try_fold(|| Indexed::default(), |acc, path| { + .try_fold(MtblKvStore::default, |acc, path| { let rdr = csv::Reader::from_path(path)?; - let indexed = index_csv(rdr)?; - Ok(acc.merge_with(indexed)) as anyhow::Result - }) - .map(|indexed| match indexed { - Ok(indexed) => { - let tid = rayon::current_thread_index(); - eprintln!("{:?}: A new step to write into LMDB", tid); - let mut wtxn = env.write_txn()?; - let count = writer(&mut wtxn, main, postings_ids, documents, indexed)?; - wtxn.commit()?; - eprintln!("{:?}: Wrote {} documents into LMDB", tid, count); - Ok(count) - }, - Err(e) => Err(e), + let mtbl_store = index_csv(rdr)?; + acc.merge_with(mtbl_store) }) .inspect(|_| { eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)) }) - .try_reduce(|| 0, |a, b| Ok(a + b)); + .try_reduce(MtblKvStore::default, MtblKvStore::merge_with); - println!("indexed {:?} documents", res); + let mtbl_store = res?; + + eprintln!("We are writing into LMDB..."); + let mut wtxn = env.write_txn()?; + let count = writer(&mut wtxn, main, postings_ids, documents, mtbl_store)?; + wtxn.commit()?; + eprintln!("Wrote {} documents into LMDB", count); Ok(()) } From 6c726df9b976d5f8e5e15b1639083759e1e9f53b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 31 May 2020 16:09:34 +0200 Subject: [PATCH 0009/1889] Support multiple space separated words --- Cargo.lock | 7 +++ Cargo.toml | 2 + src/{main.rs => bin/indexer.rs} | 22 +++------ src/bin/search.rs | 85 +++++++++++++++++++++++++++++++++ src/lib.rs | 16 +++++++ 5 files changed, 116 insertions(+), 16 deletions(-) rename src/{main.rs => bin/indexer.rs} (93%) create mode 100644 src/bin/search.rs create mode 100644 src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 1fa4b92e8..e5f05b909 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -93,6 +93,12 @@
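The interesting case in patch 0008's merge callback above is the '1'-prefixed postings entries, where both sides carry a serialized RoaringBitmap and merging means deserialize, union in place, and re-serialize. A standalone sketch of that step, using anyhow for error propagation instead of the unwraps used in the callback:

use roaring::RoaringBitmap;

// Union two serialized postings lists, as done for keys starting with byte 1.
fn merge_postings(left: &[u8], right: &[u8]) -> anyhow::Result<Vec<u8>> {
    let mut left = RoaringBitmap::deserialize_from(left)?;
    let right = RoaringBitmap::deserialize_from(right)?;
    // In-place union of the two document id sets.
    left.union_with(&right);
    let mut bytes = Vec::new();
    left.serialize_into(&mut bytes)?;
    Ok(bytes)
}

Because MTBL sources are already sorted, the merger only ever calls this for the same key on both sides, which is what makes the pairwise merge_with cheap.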
dependencies = [ "unicode-width", ] +[[package]] +name = "cow-utils" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" + [[package]] name = "crc32c" version = "0.4.0" @@ -391,6 +397,7 @@ dependencies = [ "anyhow", "bitpacking", "byteorder 1.3.4", + "cow-utils", "csv", "fst", "fxhash", diff --git a/Cargo.toml b/Cargo.toml index abe475a1d..45e71778f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,11 +3,13 @@ name = "mega-mini-indexer" version = "0.1.0" authors = ["Kerollmops "] edition = "2018" +default-run = "indexer" [dependencies] anyhow = "1.0.28" bitpacking = "0.8.2" byteorder = "1.3.4" +cow-utils = "0.1.2" csv = "1.1.3" fst = "0.4.3" fxhash = "0.2.1" diff --git a/src/main.rs b/src/bin/indexer.rs similarity index 93% rename from src/main.rs rename to src/bin/indexer.rs index 7a22d16d7..a7d2c01f1 100644 --- a/src/main.rs +++ b/src/bin/indexer.rs @@ -1,27 +1,21 @@ -use std::collections::{HashMap, BTreeSet}; -use std::convert::TryFrom; -use std::convert::TryInto; +use std::collections::BTreeSet; +use std::convert::{TryInto, TryFrom}; use std::fs::File; -use std::hash::BuildHasherDefault; use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; use anyhow::Context; +use cow_utils::CowUtils; use fst::{Streamer, IntoStreamer}; -use fxhash::FxHasher32; use heed::types::*; use heed::{EnvOpenOptions, PolyDatabase, Database}; use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions}; use rayon::prelude::*; use roaring::RoaringBitmap; -use slice_group_by::StrGroupBy; use structopt::StructOpt; -pub type FastMap4 = HashMap>; -pub type SmallString32 = smallstr::SmallString<[u8; 32]>; -pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>; -pub type BEU32 = heed::zerocopy::U32; -pub type DocumentId = u32; +use mega_mini_indexer::alphanumeric_tokens; +use mega_mini_indexer::{FastMap4, SmallVec32, BEU32, DocumentId}; #[cfg(target_os = "linux")] #[global_allocator] @@ -41,11 +35,6 @@ struct Opt { files_to_index: Vec, } -fn alphanumeric_tokens(string: &str) -> impl Iterator { - let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric); - string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric) -} - struct Indexed { fst: fst::Set>, postings_ids: FastMap4, @@ -181,6 +170,7 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) { if !word.is_empty() && word.len() < 500 { // LMDB limits + let word = word.cow_to_lowercase(); postings_ids.entry(SmallVec32::from(word.as_bytes())) .or_insert_with(RoaringBitmap::new) .insert(document_id); diff --git a/src/bin/search.rs b/src/bin/search.rs new file mode 100644 index 000000000..aea12610c --- /dev/null +++ b/src/bin/search.rs @@ -0,0 +1,85 @@ +use std::io::{self, Write}; +use std::path::PathBuf; +use std::time::Instant; + +use cow_utils::CowUtils; +use heed::types::*; +use heed::{EnvOpenOptions, Database}; +use roaring::RoaringBitmap; +use structopt::StructOpt; + +use mega_mini_indexer::alphanumeric_tokens; +use mega_mini_indexer::BEU32; + +#[derive(Debug, StructOpt)] +#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")] +struct Opt { + /// The database path where the database is located. + /// It is created if it doesn't already exist. 
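One invariant behind patch 0009's split into indexer and search binaries: both sides must normalize words identically, otherwise the lowercased keys written by the indexer can never match a query token. A minimal sketch of that shared path, combining the alphanumeric grouping from src/lib.rs with the lazy cow_to_lowercase call both binaries apply:

use cow_utils::CowUtils;
use slice_group_by::StrGroupBy;

// Split on non-alphanumeric boundaries, keep the alphanumeric groups,
// and lowercase them; cow_to_lowercase only allocates when needed.
fn normalized_tokens(text: &str) -> impl Iterator<Item = String> + '_ {
    text.linear_group_by_key(|c| c.is_alphanumeric())
        .filter(|s| s.chars().next().map_or(false, char::is_alphanumeric))
        .map(|word| word.cow_to_lowercase().into_owned())
}

fn main() {
    let tokens: Vec<_> = normalized_tokens("Hello, World 42!").collect();
    assert_eq!(tokens, vec!["hello", "world", "42"]);
}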
+ #[structopt(long = "db", parse(from_os_str))] + database: PathBuf, + + /// The query string to search for (doesn't support prefix search yet). + query: String, +} + +fn main() -> anyhow::Result<()> { + let opt = Opt::from_args(); + + std::fs::create_dir_all(&opt.database)?; + let env = EnvOpenOptions::new() + .map_size(100 * 1024 * 1024 * 1024) // 100 GB + .max_readers(10) + .max_dbs(5) + .open(opt.database)?; + + let main = env.create_poly_database(None)?; + let postings_ids: Database = env.create_database(Some("postings-ids"))?; + let documents: Database, ByteSlice> = env.create_database(Some("documents"))?; + + let rtxn = env.read_txn()?; + + let before = Instant::now(); + let mut result: Option = None; + for word in alphanumeric_tokens(&opt.query) { + let word = word.cow_to_lowercase(); + match postings_ids.get(&rtxn, &word)? { + Some(ids) => { + let before = Instant::now(); + let right = RoaringBitmap::deserialize_from(ids)?; + eprintln!("deserialized bitmap for {:?} took {:.02?}", word, before.elapsed()); + result = match result.take() { + Some(mut left) => { + let before = Instant::now(); + let left_len = left.len(); + left.intersect_with(&right); + eprintln!("intersect between {:?} and {:?} took {:.02?}", + left_len, right.len(), before.elapsed()); + Some(left) + }, + None => Some(right), + }; + }, + None => result = Some(RoaringBitmap::default()), + } + } + + let headers = match main.get::<_, Str, ByteSlice>(&rtxn, "headers")? { + Some(headers) => headers, + None => return Ok(()), + }; + + let mut stdout = io::stdout(); + stdout.write_all(&headers)?; + + let total_length = result.as_ref().map_or(0, |x| x.len()); + for id in result.unwrap_or_default().iter().take(20) { + if let Some(content) = documents.get(&rtxn, &BEU32::new(id))? { + stdout.write_all(&content)?; + } + } + + eprintln!("Took {:.02?} to find {} documents", before.elapsed(), total_length); + + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 000000000..9e5baf1fd --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,16 @@ +use std::collections::HashMap; +use std::hash::BuildHasherDefault; + +use fxhash::FxHasher32; +use slice_group_by::StrGroupBy; + +pub type FastMap4 = HashMap>; +pub type SmallString32 = smallstr::SmallString<[u8; 32]>; +pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>; +pub type BEU32 = heed::zerocopy::U32; +pub type DocumentId = u32; + +pub fn alphanumeric_tokens(string: &str) -> impl Iterator { + let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric); + string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric) +} From ba9527abc07bc9b1300317f74a233edf8c6abe51 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 31 May 2020 17:01:11 +0200 Subject: [PATCH 0010/1889] Support typos with a levenshtein automaton --- Cargo.lock | 10 ++++++ Cargo.toml | 1 + src/bin/search.rs | 77 ++++++++++++++++++++++++++++++----------------- 3 files changed, 60 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e5f05b909..91f07e95f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -361,6 +361,15 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "levenshtein_automata" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f44db4199cdb049b494a92d105acbfa43c25b3925e33803923ba9580b7bc9e1a" +dependencies = [ + "fst", +] + [[package]] name = "libc" version = "0.2.70" @@
-403,6 +412,7 @@ dependencies = [ "fxhash", "heed", "jemallocator", + "levenshtein_automata", "memmap", "oxidized-mtbl", "rayon", diff --git a/Cargo.toml b/Cargo.toml index 45e71778f..f6eeb778d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,6 +15,7 @@ fst = "0.4.3" fxhash = "0.2.1" heed = { version = "0.8.0", default-features = false, features = ["lmdb"] } jemallocator = "0.3.2" +levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } memmap = "0.7.0" oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "8918476" } rayon = "1.3.0" diff --git a/src/bin/search.rs b/src/bin/search.rs index aea12610c..9a25309d3 100644 --- a/src/bin/search.rs +++ b/src/bin/search.rs @@ -3,8 +3,10 @@ use std::path::PathBuf; use std::time::Instant; use cow_utils::CowUtils; +use fst::{Streamer, IntoStreamer}; use heed::types::*; use heed::{EnvOpenOptions, Database}; +use levenshtein_automata::LevenshteinAutomatonBuilder; use roaring::RoaringBitmap; use structopt::StructOpt; @@ -38,42 +40,61 @@ fn main() -> anyhow::Result<()> { let documents: Database, ByteSlice> = env.create_database(Some("documents"))?; let rtxn = env.read_txn()?; - - let before = Instant::now(); - let mut result: Option = None; - for word in alphanumeric_tokens(&opt.query) { - let word = word.cow_to_lowercase(); - match postings_ids.get(&rtxn, &word)? { - Some(ids) => { - let before = Instant::now(); - let right = RoaringBitmap::deserialize_from(ids)?; - eprintln!("deserialized bitmap for {:?} took {:.02?}", word, before.elapsed()); - result = match result.take() { - Some(mut left) => { - let before = Instant::now(); - let left_len = left.len(); - left.intersect_with(&right); - eprintln!("intersect between {:?} and {:?} took {:.02?}", - left_len, right.len(), before.elapsed()); - Some(left) - }, - None => Some(right), - }; - }, - None => result = Some(RoaringBitmap::default()), - } - } - let headers = match main.get::<_, Str, ByteSlice>(&rtxn, "headers")? { Some(headers) => headers, None => return Ok(()), }; + let fst = match main.get::<_, Str, ByteSlice>(&rtxn, "words-fst")? { + Some(bytes) => fst::Set::new(bytes)?, + None => return Ok(()), + }; + + // Building this factory is not free. + let lev_0_builder = LevenshteinAutomatonBuilder::new(0, true); + let lev_1_builder = LevenshteinAutomatonBuilder::new(1, true); + let lev_2_builder = LevenshteinAutomatonBuilder::new(2, true); + + let dfas = alphanumeric_tokens(&opt.query).map(|word| { + let word = word.cow_to_lowercase(); + match word.len() { + 0..=4 => lev_0_builder.build_dfa(&word), + 5..=8 => lev_1_builder.build_dfa(&word), + _ => lev_2_builder.build_dfa(&word), + } + }); + + let before = Instant::now(); + let mut intersect_result: Option = None; + for dfa in dfas { + let mut union_result = RoaringBitmap::default(); + let mut stream = fst.search(dfa).into_stream(); + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word)?; + if let Some(ids) = postings_ids.get(&rtxn, word)? 
{ + let right = RoaringBitmap::deserialize_from(ids)?; + union_result.union_with(&right); + } + } + + intersect_result = match intersect_result.take() { + Some(mut left) => { + let before = Instant::now(); + let left_len = left.len(); + left.intersect_with(&union_result); + eprintln!("intersect between {:?} and {:?} took {:.02?}", + left_len, union_result.len(), before.elapsed()); + Some(left) + }, + None => Some(union_result), + }; + } + let mut stdout = io::stdout(); stdout.write_all(&headers)?; - let total_length = result.as_ref().map_or(0, |x| x.len()); - for id in result.unwrap_or_default().iter().take(20) { + let total_length = intersect_result.as_ref().map_or(0, |x| x.len()); + for id in intersect_result.unwrap_or_default().iter().take(20) { if let Some(content) = documents.get(&rtxn, &BEU32::new(id))? { stdout.write_all(&content)?; } From 2a10b2275ed694e52e890bc0659073fcb9ee145d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 31 May 2020 17:11:58 +0200 Subject: [PATCH 0011/1889] Support prefix typo tolerant search --- src/bin/search.rs | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/bin/search.rs b/src/bin/search.rs index 9a25309d3..81860fd9f 100644 --- a/src/bin/search.rs +++ b/src/bin/search.rs @@ -50,23 +50,28 @@ fn main() -> anyhow::Result<()> { None => return Ok(()), }; - // Building this factory is not free. - let lev_0_builder = LevenshteinAutomatonBuilder::new(0, true); - let lev_1_builder = LevenshteinAutomatonBuilder::new(1, true); - let lev_2_builder = LevenshteinAutomatonBuilder::new(2, true); + // Building these factories is not free. + let lev0 = LevenshteinAutomatonBuilder::new(0, true); + let lev1 = LevenshteinAutomatonBuilder::new(1, true); + let lev2 = LevenshteinAutomatonBuilder::new(2, true); - let dfas = alphanumeric_tokens(&opt.query).map(|word| { + let words: Vec<_> = alphanumeric_tokens(&opt.query).collect(); + let number_of_words = words.len(); + let dfas = words.into_iter().enumerate().map(|(i, word)| { let word = word.cow_to_lowercase(); - match word.len() { - 0..=4 => lev_0_builder.build_dfa(&word), - 5..=8 => lev_1_builder.build_dfa(&word), - _ => lev_2_builder.build_dfa(&word), - } + let is_last = i + 1 == number_of_words; + let dfa = match word.len() { + 0..=4 => if is_last { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) }, + 5..=8 => if is_last { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) }, + _ => if is_last { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) }, + }; + (word, dfa) }); let before = Instant::now(); let mut intersect_result: Option = None; - for dfa in dfas { + for (word, dfa) in dfas { + let before = Instant::now(); let mut union_result = RoaringBitmap::default(); let mut stream = fst.search(dfa).into_stream(); while let Some(word) = stream.next() { @@ -76,6 +81,7 @@ fn main() -> anyhow::Result<()> { union_result.union_with(&right); } } + eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); intersect_result = match intersect_result.take() { Some(mut left) => { From a26553c90a5fa33579a18c22a31d647ffcaf270d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 31 May 2020 17:48:13 +0200 Subject: [PATCH 0012/1889] Reintroduce a simple HTTP server --- Cargo.lock | 1048 +++++++++++++++++++++++++++++++++++- Cargo.toml | 5 + public/bulma.min.css | 1 + public/index.html | 199 +++++++ public/jquery-3.4.1.min.js | 2 + public/papaparse.min.js | 7 + src/bin/search.rs | 78 +-- src/bin/serve.rs | 115 ++++ src/lib.rs | 86 +++ 9 files 
changed, 1458 insertions(+), 83 deletions(-) create mode 100644 public/bulma.min.css create mode 100644 public/index.html create mode 100644 public/jquery-3.4.1.min.js create mode 100755 public/papaparse.min.js create mode 100644 src/bin/serve.rs diff --git a/Cargo.lock b/Cargo.lock index 91f07e95f..6acef59fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12,12 +12,36 @@ version = "1.0.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f" +[[package]] +name = "arc-swap" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b585a98a234c46fc563103e9278c9391fde1f4e6850334da895d27edb9580f62" + +[[package]] +name = "autocfg" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" + [[package]] name = "autocfg" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" +[[package]] +name = "base64" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b41b7ea54a0c9d92199de89e20e58d49f02f8e699814ef3fdf266f6f748d15c7" + +[[package]] +name = "base64" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d1ccbaf7d9ec9537465a97bf19edc1a4e158ecb49fc16178202238c569cc42" + [[package]] name = "bincode" version = "1.2.1" @@ -43,6 +67,27 @@ dependencies = [ "crunchy", ] +[[package]] +name = "block-buffer" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" +dependencies = [ + "block-padding", + "byte-tools", + "byteorder 1.3.4", + "generic-array", +] + +[[package]] +name = "block-padding" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" +dependencies = [ + "byte-tools", +] + [[package]] name = "bstr" version = "0.2.13" @@ -55,6 +100,22 @@ dependencies = [ "serde", ] +[[package]] +name = "buf_redux" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" +dependencies = [ + "memchr", + "safemem", +] + +[[package]] +name = "byte-tools" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" + [[package]] name = "byteorder" version = "0.5.3" @@ -67,6 +128,12 @@ version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +[[package]] +name = "bytes" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "130aac562c0dd69c56b3b1cc8ffd2e17be31d0b6c25b61c96b76231aa23e39e1" + [[package]] name = "cc" version = "1.0.54" @@ -93,6 +160,15 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "cloudabi" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +dependencies = [ + "bitflags", +] + [[package]] name = "cow-utils" version = "0.1.2" @@ -131,7 +207,7 @@ version = "0.8.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace" dependencies = [ - "autocfg", + "autocfg 1.0.0", "cfg-if", "crossbeam-utils", "lazy_static", @@ -156,7 +232,7 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" dependencies = [ - "autocfg", + "autocfg 1.0.0", "cfg-if", "lazy_static", ] @@ -189,12 +265,33 @@ dependencies = [ "memchr", ] +[[package]] +name = "digest" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" +dependencies = [ + "generic-array", +] + +[[package]] +name = "dtoa" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4358a9e11b9a09cf52383b451b49a169e8d797b68aa02301ff586d70d9661ea3" + [[package]] name = "either" version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" +[[package]] +name = "fake-simd" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" + [[package]] name = "flate2" version = "1.0.14" @@ -207,6 +304,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "fs_extra" version = "1.1.0" @@ -219,6 +322,123 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7293de202dbfe786c0b3fe6110a027836c5438ed06db7b715c9955ff4bfea51" +[[package]] +name = "fuchsia-cprng" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" + +[[package]] +name = "fuchsia-zircon" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" +dependencies = [ + "bitflags", + "fuchsia-zircon-sys", +] + +[[package]] +name = "fuchsia-zircon-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" + +[[package]] +name = "futures" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e05b85ec287aac0dc34db7d4a569323df697f9c55b99b15d6b4ef8cde49f613" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f366ad74c28cca6ba456d95e6422883cfb4b252a83bed929c83abfdbbf2967d5" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f5fff90fd5d971f936ad674802482ba441b6f09ba5e15fd8b39145582ca399" + +[[package]] +name = "futures-executor" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"10d6bb888be1153d3abeb9006b11b02cf5e9b209fda28693c31ae1e4e012e314" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de27142b013a8e869c14957e6d2edeef89e97c289e69d042ee3a49acd8b51789" + +[[package]] +name = "futures-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0b5a30a4328ab5473878237c447333c093297bded83a4983d10f4deea240d39" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f2032893cb734c7a05d85ce0cc8b8c4075278e93b24b66f9de99d6eb0fa8acc" + +[[package]] +name = "futures-task" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb66b5f09e22019b1ab0830f7785bcea8e7a42148683f99214f73f8ec21a626" +dependencies = [ + "once_cell", +] + +[[package]] +name = "futures-util" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8764574ff08b701a084482c3c7031349104b07ac897393010494beaa18ce32c6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project", + "pin-utils", + "proc-macro-hack", + "proc-macro-nested", + "slab", +] + [[package]] name = "fxhash" version = "0.2.1" @@ -228,6 +448,15 @@ dependencies = [ "byteorder 1.3.4", ] +[[package]] +name = "generic-array" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec" +dependencies = [ + "typenum", +] + [[package]] name = "getrandom" version = "0.1.14" @@ -245,6 +474,50 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" +[[package]] +name = "h2" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79b7246d7e4b979c03fa093da39cfb3617a96bbeee6310af63991668d7e843ff" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "log 0.4.8", + "slab", + "tokio", + "tokio-util", +] + +[[package]] +name = "headers" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed18eb2459bf1a09ad2d6b1547840c3e5e62882fa09b9a6a20b1de8e3228848f" +dependencies = [ + "base64 0.12.1", + "bitflags", + "bytes", + "headers-core", + "http", + "mime 0.3.16", + "sha-1", + "time", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http", +] + [[package]] name = "heck" version = "0.3.1" @@ -299,6 +572,57 @@ dependencies = [ "libc", ] +[[package]] +name = "http" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d569972648b2c512421b5f2a405ad6ac9666547189d0c5477a3f200f3e02f9" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13d5ff830006f7646652e057693569bfe0d51760c0085a071769d142a205111b" +dependencies = [ + 
"bytes", + "http", +] + +[[package]] +name = "httparse" +version = "1.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd179ae861f0c2e53da70d892f5f3029f9594be0c41dc5269cd371691b1dc2f9" + +[[package]] +name = "hyper" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6e7655b9594024ad0ee439f3b5a7299369dc2a3f459b47c696f9ff676f9aa1f" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "itoa", + "log 0.4.8", + "pin-project", + "socket2", + "time", + "tokio", + "tower-service", + "want", +] + [[package]] name = "idna" version = "0.2.0" @@ -310,6 +634,33 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indexmap" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "076f042c5b7b98f31d205f1249267e12a6518c1481e9dae9764af19b707d2292" +dependencies = [ + "autocfg 1.0.0", +] + +[[package]] +name = "input_buffer" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19a8a95243d5a0398cae618ec29477c6e3cb631152be5c19481f80bc71559754" +dependencies = [ + "bytes", +] + +[[package]] +name = "iovec" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" +dependencies = [ + "libc", +] + [[package]] name = "itertools" version = "0.9.0" @@ -355,6 +706,16 @@ dependencies = [ "libc", ] +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -387,6 +748,24 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "log" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" +dependencies = [ + "log 0.4.8", +] + +[[package]] +name = "log" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" +dependencies = [ + "cfg-if", +] + [[package]] name = "matches" version = "0.1.8" @@ -417,11 +796,14 @@ dependencies = [ "oxidized-mtbl", "rayon", "roaring", + "serde", "slice-group-by", "smallstr", "smallvec", "structopt", "tempfile", + "tokio", + "warp", ] [[package]] @@ -437,7 +819,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" dependencies = [ "libc", - "winapi", + "winapi 0.3.8", ] [[package]] @@ -446,7 +828,44 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8" dependencies = [ - "autocfg", + "autocfg 1.0.0", +] + +[[package]] +name = "mime" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba626b8a6de5da682e1caa06bdb42a335aee5a84db8e5046a3e8ab17ba0a3ae0" +dependencies = [ + "log 0.3.9", +] + +[[package]] +name = "mime" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" + +[[package]] +name = "mime_guess" +version 
= "1.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "216929a5ee4dd316b1702eedf5e74548c123d370f47841ceaac38ca154690ca3" +dependencies = [ + "mime 0.2.6", + "phf", + "phf_codegen", + "unicase 1.4.2", +] + +[[package]] +name = "mime_guess" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" +dependencies = [ + "mime 0.3.16", + "unicase 2.6.0", ] [[package]] @@ -458,6 +877,99 @@ dependencies = [ "adler32", ] +[[package]] +name = "mio" +version = "0.6.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fce347092656428bc8eaf6201042cb551b8d67855af7374542a92a0fbfcac430" +dependencies = [ + "cfg-if", + "fuchsia-zircon", + "fuchsia-zircon-sys", + "iovec", + "kernel32-sys", + "libc", + "log 0.4.8", + "miow 0.2.1", + "net2", + "slab", + "winapi 0.2.8", +] + +[[package]] +name = "mio-named-pipes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5e374eff525ce1c5b7687c4cef63943e7686524a387933ad27ca7ec43779cb3" +dependencies = [ + "log 0.4.8", + "mio", + "miow 0.3.4", + "winapi 0.3.8", +] + +[[package]] +name = "mio-uds" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" +dependencies = [ + "iovec", + "libc", + "mio", +] + +[[package]] +name = "miow" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c1f2f3b1cf331de6896aabf6e9d55dca90356cc9960cca7eaaf408a355ae919" +dependencies = [ + "kernel32-sys", + "net2", + "winapi 0.2.8", + "ws2_32-sys", +] + +[[package]] +name = "miow" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22dfdd1d51b2639a5abd17ed07005c3af05fb7a2a3b1a1d0d7af1000a520c1c7" +dependencies = [ + "socket2", + "winapi 0.3.8", +] + +[[package]] +name = "multipart" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136eed74cadb9edd2651ffba732b19a450316b680e4f48d6c79e905799e19d01" +dependencies = [ + "buf_redux", + "httparse", + "log 0.4.8", + "mime 0.2.6", + "mime_guess 1.8.8", + "quick-error", + "rand 0.6.5", + "safemem", + "tempfile", + "twoway", +] + +[[package]] +name = "net2" +version = "0.2.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ba7c918ac76704fb42afcbbb43891e72731f3dcca3bef2a19786297baf14af7" +dependencies = [ + "cfg-if", + "libc", + "winapi 0.3.8", +] + [[package]] name = "num_cpus" version = "1.13.0" @@ -474,6 +986,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b631f7e854af39a1739f401cf34a8a013dfe09eac4fa4dba91e9768bd28168d" +[[package]] +name = "opaque-debug" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" + [[package]] name = "oxidized-mtbl" version = "0.1.0" @@ -493,7 +1011,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" dependencies = [ "libc", - "winapi", + "winapi 0.3.8", ] [[package]] @@ -502,6 +1020,77 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" +[[package]] +name = "phf" 
+version = "0.7.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.7.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.7.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662" +dependencies = [ + "phf_shared", + "rand 0.6.5", +] + +[[package]] +name = "phf_shared" +version = "0.7.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0" +dependencies = [ + "siphasher", + "unicase 1.4.2", +] + +[[package]] +name = "pin-project" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc93aeee735e60ecb40cf740eb319ff23eab1c5748abfdb5c180e4ce49f7791" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "0.4.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e58db2081ba5b4c93bd6be09c40fd36cb9193a8336c384f3b40012e531aa7e40" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9df32da11d84f3a7d70205549562966279adb900e080fad3dccd8e64afccf0ad" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.17" @@ -524,7 +1113,7 @@ dependencies = [ "proc-macro2", "quote", "syn", - "version_check", + "version_check 0.9.2", ] [[package]] @@ -537,9 +1126,21 @@ dependencies = [ "quote", "syn", "syn-mid", - "version_check", + "version_check 0.9.2", ] +[[package]] +name = "proc-macro-hack" +version = "0.5.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e0456befd48169b9f13ef0f0ad46d492cf9d2dbb918bcf38e01eed4ce3ec5e4" + +[[package]] +name = "proc-macro-nested" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e946095f9d3ed29ec38de908c22f95d9ac008e424c7bcae54c75a79c527c694" + [[package]] name = "proc-macro2" version = "1.0.17" @@ -549,6 +1150,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.6" @@ -558,6 +1165,25 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" +dependencies = [ + "autocfg 0.1.7", + "libc", + "rand_chacha 0.1.1", + "rand_core 0.4.2", + "rand_hc 0.1.0", + "rand_isaac", + "rand_jitter", + "rand_os", + "rand_pcg", + "rand_xorshift", + "winapi 0.3.8", +] + [[package]] name = "rand" version = "0.7.3" @@ -566,9 +1192,19 @@ checksum = 
"6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ "getrandom", "libc", - "rand_chacha", - "rand_core", - "rand_hc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc 0.2.0", +] + +[[package]] +name = "rand_chacha" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" +dependencies = [ + "autocfg 0.1.7", + "rand_core 0.3.1", ] [[package]] @@ -578,9 +1214,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.5.1", ] +[[package]] +name = "rand_core" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" +dependencies = [ + "rand_core 0.4.2", +] + +[[package]] +name = "rand_core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" + [[package]] name = "rand_core" version = "0.5.1" @@ -590,13 +1241,75 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rand_hc" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" +dependencies = [ + "rand_core 0.3.1", +] + [[package]] name = "rand_hc" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" dependencies = [ - "rand_core", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_isaac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" +dependencies = [ + "rand_core 0.3.1", +] + +[[package]] +name = "rand_jitter" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" +dependencies = [ + "libc", + "rand_core 0.4.2", + "winapi 0.3.8", +] + +[[package]] +name = "rand_os" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" +dependencies = [ + "cloudabi", + "fuchsia-cprng", + "libc", + "rand_core 0.4.2", + "rdrand", + "winapi 0.3.8", +] + +[[package]] +name = "rand_pcg" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" +dependencies = [ + "autocfg 0.1.7", + "rand_core 0.4.2", +] + +[[package]] +name = "rand_xorshift" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" +dependencies = [ + "rand_core 0.3.1", ] [[package]] @@ -623,6 +1336,15 @@ dependencies = [ "num_cpus", ] +[[package]] +name = "rdrand" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" +dependencies = [ + "rand_core 0.3.1", +] + [[package]] name = "redox_syscall" version = "0.1.56" @@ -644,7 +1366,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"4a83fa3702a688b9359eccba92d153ac33fd2e8462f9e0e3fdf155239ea7792e" dependencies = [ - "winapi", + "winapi 0.3.8", ] [[package]] @@ -662,6 +1384,18 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed3d612bc64430efeb3f7ee6ef26d590dce0c43249217bddc62112540c7941e1" +[[package]] +name = "safemem" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" + +[[package]] +name = "scoped-tls" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" + [[package]] name = "scopeguard" version = "1.1.0" @@ -673,6 +1407,20 @@ name = "serde" version = "1.0.110" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99e7b308464d16b56eba9964e4972a3eee817760ab60d88c3f86e1fecb08204c" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.110" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "818fbf6bfa9a42d3bfcaca148547aa00c7b915bec71d1757aa2d44ca68771984" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "serde_json" @@ -685,6 +1433,52 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ec5d77e2d4c73717816afac02670d5c4f534ea95ed430442cad02e7a6e32c97" +dependencies = [ + "dtoa", + "itoa", + "serde", + "url", +] + +[[package]] +name = "sha-1" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" +dependencies = [ + "block-buffer", + "digest", + "fake-simd", + "opaque-debug", +] + +[[package]] +name = "signal-hook-registry" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94f478ede9f64724c5d173d7bb56099ec3e2d9fc2774aac65d34b8b890405f41" +dependencies = [ + "arc-swap", + "libc", +] + +[[package]] +name = "siphasher" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac" + +[[package]] +name = "slab" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" + [[package]] name = "slice-group-by" version = "0.2.6" @@ -712,6 +1506,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7fb9b0bb877b35a1cc1474a3b43d9c226a2625311760cdda2cbccbc0c7a8376" +[[package]] +name = "socket2" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03088793f677dce356f3ccc2edb1b314ad191ab702a5de3faf49304f7e104918" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "winapi 0.3.8", +] + [[package]] name = "structopt" version = "0.3.14" @@ -778,10 +1584,10 @@ checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" dependencies = [ "cfg-if", "libc", - "rand", + "rand 0.7.3", "redox_syscall", "remove_dir_all", - "winapi", + "winapi 0.3.8", ] [[package]] @@ -793,6 +1599,142 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "time" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +dependencies = [ + "libc", + "winapi 0.3.8", +] + +[[package]] +name = "tokio" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d099fa27b9702bed751524694adbe393e18b36b204da91eb1cbbbbb4a5ee2d58" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "iovec", + "lazy_static", + "libc", + "memchr", + "mio", + "mio-named-pipes", + "mio-uds", + "num_cpus", + "pin-project-lite", + "signal-hook-registry", + "slab", + "tokio-macros", + "winapi 0.3.8", +] + +[[package]] +name = "tokio-macros" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c3acc6aa564495a0f2e1d59fab677cd7f81a19994cfc7f3ad0e64301560389" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8b8fe88007ebc363512449868d7da4389c9400072a3f666f212c7280082882a" +dependencies = [ + "futures", + "log 0.4.8", + "pin-project", + "tokio", + "tungstenite", +] + +[[package]] +name = "tokio-util" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8242891f2b6cbef26a2d7e8605133c2c554cd35b3e4948ea892d6d68436499" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "log 0.4.8", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower-service" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e987b6bf443f4b5b3b6f38704195592cca41c5bb7aedd3c3693c7081f8289860" + +[[package]] +name = "try-lock" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e604eb7b43c06650e854be16a2a03155743d3752dd1c943f6829e26b7a36e382" + +[[package]] +name = "tungstenite" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfea31758bf674f990918962e8e5f07071a3161bd7c4138ed23e416e1ac4264e" +dependencies = [ + "base64 0.11.0", + "byteorder 1.3.4", + "bytes", + "http", + "httparse", + "input_buffer", + "log 0.4.8", + "rand 0.7.3", + "sha-1", + "url", + "utf-8", +] + +[[package]] +name = "twoway" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" +dependencies = [ + "memchr", +] + +[[package]] +name = "typenum" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" + +[[package]] +name = "unicase" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4765f83163b74f957c797ad9253caf97f103fb064d3999aea9568d09fc8a33" +dependencies = [ + "version_check 0.1.5", +] + +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check 0.9.2", +] + [[package]] name = "unicode-bidi" version = "0.3.4" @@ -840,18 +1782,78 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3df3561629a8bb4c57e5a2e4c43348d9e29c7c29d9b1c4c1f47166deca8f37ed" + +[[package]] +name = "utf-8" +version = "0.7.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" + +[[package]] +name = "version_check" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" + [[package]] name = "version_check" version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +[[package]] +name = "want" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +dependencies = [ + "log 0.4.8", + "try-lock", +] + +[[package]] +name = "warp" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e95175b7a927258ecbb816bdada3cc469cb68593e7940b96a60f4af366a9970" +dependencies = [ + "bytes", + "futures", + "headers", + "http", + "hyper", + "log 0.4.8", + "mime 0.3.16", + "mime_guess 2.0.3", + "multipart", + "pin-project", + "scoped-tls", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-tungstenite", + "tower-service", + "urlencoding", +] + [[package]] name = "wasi" version = "0.9.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + [[package]] name = "winapi" version = "0.3.8" @@ -862,6 +1864,12 @@ dependencies = [ "winapi-x86_64-pc-windows-gnu", ] +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" @@ -874,6 +1882,16 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "ws2_32-sys" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + [[package]] name = "zerocopy" version = "0.3.0" diff --git a/Cargo.toml b/Cargo.toml index f6eeb778d..5087f59bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,5 +26,10 @@ smallvec = "1.4.0" structopt = { version = "0.3.14", default-features = false } tempfile = "3.1.0" +# http server +serde = { version = "1.0", features = ["derive"] } +tokio = { version = "0.2.15", features = ["full"] } +warp = "0.2.2" + [profile.release] debug = true diff --git a/public/bulma.min.css b/public/bulma.min.css new file mode 100644 index 000000000..2649eaa9c --- /dev/null +++ b/public/bulma.min.css @@ -0,0 +1 @@ +/*! 
bulma.io v0.8.0 | MIT License | github.com/jgthms/bulma */@-webkit-keyframes spinAround{from{transform:rotate(0)}to{transform:rotate(359deg)}}@keyframes spinAround{from{transform:rotate(0)}to{transform:rotate(359deg)}}.breadcrumb,.button,.delete,.file,.is-unselectable,.modal-close,.pagination-ellipsis,.pagination-link,.pagination-next,.pagination-previous,.tabs{-webkit-touch-callout:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.navbar-link:not(.is-arrowless)::after,.select:not(.is-multiple):not(.is-loading)::after{border:3px solid transparent;border-radius:2px;border-right:0;border-top:0;content:" ";display:block;height:.625em;margin-top:-.4375em;pointer-events:none;position:absolute;top:50%;transform:rotate(-45deg);transform-origin:center;width:.625em}.block:not(:last-child),.box:not(:last-child),.breadcrumb:not(:last-child),.content:not(:last-child),.highlight:not(:last-child),.level:not(:last-child),.list:not(:last-child),.message:not(:last-child),.notification:not(:last-child),.pagination:not(:last-child),.progress:not(:last-child),.subtitle:not(:last-child),.table-container:not(:last-child),.table:not(:last-child),.tabs:not(:last-child),.title:not(:last-child){margin-bottom:1.5rem}.delete,.modal-close{-moz-appearance:none;-webkit-appearance:none;background-color:rgba(10,10,10,.2);border:none;border-radius:290486px;cursor:pointer;pointer-events:auto;display:inline-block;flex-grow:0;flex-shrink:0;font-size:0;height:20px;max-height:20px;max-width:20px;min-height:20px;min-width:20px;outline:0;position:relative;vertical-align:top;width:20px}.delete::after,.delete::before,.modal-close::after,.modal-close::before{background-color:#fff;content:"";display:block;left:50%;position:absolute;top:50%;transform:translateX(-50%) translateY(-50%) rotate(45deg);transform-origin:center center}.delete::before,.modal-close::before{height:2px;width:50%}.delete::after,.modal-close::after{height:50%;width:2px}.delete:focus,.delete:hover,.modal-close:focus,.modal-close:hover{background-color:rgba(10,10,10,.3)}.delete:active,.modal-close:active{background-color:rgba(10,10,10,.4)}.is-small.delete,.is-small.modal-close{height:16px;max-height:16px;max-width:16px;min-height:16px;min-width:16px;width:16px}.is-medium.delete,.is-medium.modal-close{height:24px;max-height:24px;max-width:24px;min-height:24px;min-width:24px;width:24px}.is-large.delete,.is-large.modal-close{height:32px;max-height:32px;max-width:32px;min-height:32px;min-width:32px;width:32px}.button.is-loading::after,.control.is-loading::after,.loader,.select.is-loading::after{-webkit-animation:spinAround .5s infinite linear;animation:spinAround .5s infinite linear;border:2px solid #dbdbdb;border-radius:290486px;border-right-color:transparent;border-top-color:transparent;content:"";display:block;height:1em;position:relative;width:1em}.hero-video,.image.is-16by9 .has-ratio,.image.is-16by9 img,.image.is-1by1 .has-ratio,.image.is-1by1 img,.image.is-1by2 .has-ratio,.image.is-1by2 img,.image.is-1by3 .has-ratio,.image.is-1by3 img,.image.is-2by1 .has-ratio,.image.is-2by1 img,.image.is-2by3 .has-ratio,.image.is-2by3 img,.image.is-3by1 .has-ratio,.image.is-3by1 img,.image.is-3by2 .has-ratio,.image.is-3by2 img,.image.is-3by4 .has-ratio,.image.is-3by4 img,.image.is-3by5 .has-ratio,.image.is-3by5 img,.image.is-4by3 .has-ratio,.image.is-4by3 img,.image.is-4by5 .has-ratio,.image.is-4by5 img,.image.is-5by3 .has-ratio,.image.is-5by3 img,.image.is-5by4 .has-ratio,.image.is-5by4 img,.image.is-9by16 .has-ratio,.image.is-9by16 
img,.image.is-square .has-ratio,.image.is-square img,.is-overlay,.modal,.modal-background{bottom:0;left:0;position:absolute;right:0;top:0}.button,.file-cta,.file-name,.input,.pagination-ellipsis,.pagination-link,.pagination-next,.pagination-previous,.select select,.textarea{-moz-appearance:none;-webkit-appearance:none;align-items:center;border:1px solid transparent;border-radius:4px;box-shadow:none;display:inline-flex;font-size:1rem;height:2.5em;justify-content:flex-start;line-height:1.5;padding-bottom:calc(.5em - 1px);padding-left:calc(.75em - 1px);padding-right:calc(.75em - 1px);padding-top:calc(.5em - 1px);position:relative;vertical-align:top}.button:active,.button:focus,.file-cta:active,.file-cta:focus,.file-name:active,.file-name:focus,.input:active,.input:focus,.is-active.button,.is-active.file-cta,.is-active.file-name,.is-active.input,.is-active.pagination-ellipsis,.is-active.pagination-link,.is-active.pagination-next,.is-active.pagination-previous,.is-active.textarea,.is-focused.button,.is-focused.file-cta,.is-focused.file-name,.is-focused.input,.is-focused.pagination-ellipsis,.is-focused.pagination-link,.is-focused.pagination-next,.is-focused.pagination-previous,.is-focused.textarea,.pagination-ellipsis:active,.pagination-ellipsis:focus,.pagination-link:active,.pagination-link:focus,.pagination-next:active,.pagination-next:focus,.pagination-previous:active,.pagination-previous:focus,.select select.is-active,.select select.is-focused,.select select:active,.select select:focus,.textarea:active,.textarea:focus{outline:0}.button[disabled],.file-cta[disabled],.file-name[disabled],.input[disabled],.pagination-ellipsis[disabled],.pagination-link[disabled],.pagination-next[disabled],.pagination-previous[disabled],.select fieldset[disabled] select,.select select[disabled],.textarea[disabled],fieldset[disabled] .button,fieldset[disabled] .file-cta,fieldset[disabled] .file-name,fieldset[disabled] .input,fieldset[disabled] .pagination-ellipsis,fieldset[disabled] .pagination-link,fieldset[disabled] .pagination-next,fieldset[disabled] .pagination-previous,fieldset[disabled] .select select,fieldset[disabled] .textarea{cursor:not-allowed}/*! 
minireset.css v0.0.6 | MIT License | github.com/jgthms/minireset.css */blockquote,body,dd,dl,dt,fieldset,figure,h1,h2,h3,h4,h5,h6,hr,html,iframe,legend,li,ol,p,pre,textarea,ul{margin:0;padding:0}h1,h2,h3,h4,h5,h6{font-size:100%;font-weight:400}ul{list-style:none}button,input,select,textarea{margin:0}html{box-sizing:border-box}*,::after,::before{box-sizing:inherit}img,video{height:auto;max-width:100%}iframe{border:0}table{border-collapse:collapse;border-spacing:0}td,th{padding:0}td:not([align]),th:not([align]){text-align:left}html{background-color:#fff;font-size:16px;-moz-osx-font-smoothing:grayscale;-webkit-font-smoothing:antialiased;min-width:300px;overflow-x:hidden;overflow-y:scroll;text-rendering:optimizeLegibility;-webkit-text-size-adjust:100%;-moz-text-size-adjust:100%;-ms-text-size-adjust:100%;text-size-adjust:100%}article,aside,figure,footer,header,hgroup,section{display:block}body,button,input,select,textarea{font-family:BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif}code,pre{-moz-osx-font-smoothing:auto;-webkit-font-smoothing:auto;font-family:monospace}body{color:#4a4a4a;font-size:1em;font-weight:400;line-height:1.5}a{color:#3273dc;cursor:pointer;text-decoration:none}a strong{color:currentColor}a:hover{color:#363636}code{background-color:#f5f5f5;color:#f14668;font-size:.875em;font-weight:400;padding:.25em .5em .25em}hr{background-color:#f5f5f5;border:none;display:block;height:2px;margin:1.5rem 0}img{height:auto;max-width:100%}input[type=checkbox],input[type=radio]{vertical-align:baseline}small{font-size:.875em}span{font-style:inherit;font-weight:inherit}strong{color:#363636;font-weight:700}fieldset{border:none}pre{-webkit-overflow-scrolling:touch;background-color:#f5f5f5;color:#4a4a4a;font-size:.875em;overflow-x:auto;padding:1.25rem 1.5rem;white-space:pre;word-wrap:normal}pre code{background-color:transparent;color:currentColor;font-size:1em;padding:0}table td,table th{vertical-align:top}table td:not([align]),table th:not([align]){text-align:left}table th{color:#363636}.is-clearfix::after{clear:both;content:" ";display:table}.is-pulled-left{float:left!important}.is-pulled-right{float:right!important}.is-clipped{overflow:hidden!important}.is-size-1{font-size:3rem!important}.is-size-2{font-size:2.5rem!important}.is-size-3{font-size:2rem!important}.is-size-4{font-size:1.5rem!important}.is-size-5{font-size:1.25rem!important}.is-size-6{font-size:1rem!important}.is-size-7{font-size:.75rem!important}@media screen and (max-width:768px){.is-size-1-mobile{font-size:3rem!important}.is-size-2-mobile{font-size:2.5rem!important}.is-size-3-mobile{font-size:2rem!important}.is-size-4-mobile{font-size:1.5rem!important}.is-size-5-mobile{font-size:1.25rem!important}.is-size-6-mobile{font-size:1rem!important}.is-size-7-mobile{font-size:.75rem!important}}@media screen and (min-width:769px),print{.is-size-1-tablet{font-size:3rem!important}.is-size-2-tablet{font-size:2.5rem!important}.is-size-3-tablet{font-size:2rem!important}.is-size-4-tablet{font-size:1.5rem!important}.is-size-5-tablet{font-size:1.25rem!important}.is-size-6-tablet{font-size:1rem!important}.is-size-7-tablet{font-size:.75rem!important}}@media screen and 
(max-width:1023px){.is-size-1-touch{font-size:3rem!important}.is-size-2-touch{font-size:2.5rem!important}.is-size-3-touch{font-size:2rem!important}.is-size-4-touch{font-size:1.5rem!important}.is-size-5-touch{font-size:1.25rem!important}.is-size-6-touch{font-size:1rem!important}.is-size-7-touch{font-size:.75rem!important}}@media screen and (min-width:1024px){.is-size-1-desktop{font-size:3rem!important}.is-size-2-desktop{font-size:2.5rem!important}.is-size-3-desktop{font-size:2rem!important}.is-size-4-desktop{font-size:1.5rem!important}.is-size-5-desktop{font-size:1.25rem!important}.is-size-6-desktop{font-size:1rem!important}.is-size-7-desktop{font-size:.75rem!important}}@media screen and (min-width:1216px){.is-size-1-widescreen{font-size:3rem!important}.is-size-2-widescreen{font-size:2.5rem!important}.is-size-3-widescreen{font-size:2rem!important}.is-size-4-widescreen{font-size:1.5rem!important}.is-size-5-widescreen{font-size:1.25rem!important}.is-size-6-widescreen{font-size:1rem!important}.is-size-7-widescreen{font-size:.75rem!important}}@media screen and (min-width:1408px){.is-size-1-fullhd{font-size:3rem!important}.is-size-2-fullhd{font-size:2.5rem!important}.is-size-3-fullhd{font-size:2rem!important}.is-size-4-fullhd{font-size:1.5rem!important}.is-size-5-fullhd{font-size:1.25rem!important}.is-size-6-fullhd{font-size:1rem!important}.is-size-7-fullhd{font-size:.75rem!important}}.has-text-centered{text-align:center!important}.has-text-justified{text-align:justify!important}.has-text-left{text-align:left!important}.has-text-right{text-align:right!important}@media screen and (max-width:768px){.has-text-centered-mobile{text-align:center!important}}@media screen and (min-width:769px),print{.has-text-centered-tablet{text-align:center!important}}@media screen and (min-width:769px) and (max-width:1023px){.has-text-centered-tablet-only{text-align:center!important}}@media screen and (max-width:1023px){.has-text-centered-touch{text-align:center!important}}@media screen and (min-width:1024px){.has-text-centered-desktop{text-align:center!important}}@media screen and (min-width:1024px) and (max-width:1215px){.has-text-centered-desktop-only{text-align:center!important}}@media screen and (min-width:1216px){.has-text-centered-widescreen{text-align:center!important}}@media screen and (min-width:1216px) and (max-width:1407px){.has-text-centered-widescreen-only{text-align:center!important}}@media screen and (min-width:1408px){.has-text-centered-fullhd{text-align:center!important}}@media screen and (max-width:768px){.has-text-justified-mobile{text-align:justify!important}}@media screen and (min-width:769px),print{.has-text-justified-tablet{text-align:justify!important}}@media screen and (min-width:769px) and (max-width:1023px){.has-text-justified-tablet-only{text-align:justify!important}}@media screen and (max-width:1023px){.has-text-justified-touch{text-align:justify!important}}@media screen and (min-width:1024px){.has-text-justified-desktop{text-align:justify!important}}@media screen and (min-width:1024px) and (max-width:1215px){.has-text-justified-desktop-only{text-align:justify!important}}@media screen and (min-width:1216px){.has-text-justified-widescreen{text-align:justify!important}}@media screen and (min-width:1216px) and (max-width:1407px){.has-text-justified-widescreen-only{text-align:justify!important}}@media screen and (min-width:1408px){.has-text-justified-fullhd{text-align:justify!important}}@media screen and (max-width:768px){.has-text-left-mobile{text-align:left!important}}@media screen and 
(min-width:769px),print{.has-text-left-tablet{text-align:left!important}}@media screen and (min-width:769px) and (max-width:1023px){.has-text-left-tablet-only{text-align:left!important}}@media screen and (max-width:1023px){.has-text-left-touch{text-align:left!important}}@media screen and (min-width:1024px){.has-text-left-desktop{text-align:left!important}}@media screen and (min-width:1024px) and (max-width:1215px){.has-text-left-desktop-only{text-align:left!important}}@media screen and (min-width:1216px){.has-text-left-widescreen{text-align:left!important}}@media screen and (min-width:1216px) and (max-width:1407px){.has-text-left-widescreen-only{text-align:left!important}}@media screen and (min-width:1408px){.has-text-left-fullhd{text-align:left!important}}@media screen and (max-width:768px){.has-text-right-mobile{text-align:right!important}}@media screen and (min-width:769px),print{.has-text-right-tablet{text-align:right!important}}@media screen and (min-width:769px) and (max-width:1023px){.has-text-right-tablet-only{text-align:right!important}}@media screen and (max-width:1023px){.has-text-right-touch{text-align:right!important}}@media screen and (min-width:1024px){.has-text-right-desktop{text-align:right!important}}@media screen and (min-width:1024px) and (max-width:1215px){.has-text-right-desktop-only{text-align:right!important}}@media screen and (min-width:1216px){.has-text-right-widescreen{text-align:right!important}}@media screen and (min-width:1216px) and (max-width:1407px){.has-text-right-widescreen-only{text-align:right!important}}@media screen and (min-width:1408px){.has-text-right-fullhd{text-align:right!important}}.is-capitalized{text-transform:capitalize!important}.is-lowercase{text-transform:lowercase!important}.is-uppercase{text-transform:uppercase!important}.is-italic{font-style:italic!important}.has-text-white{color:#fff!important}a.has-text-white:focus,a.has-text-white:hover{color:#e6e6e6!important}.has-background-white{background-color:#fff!important}.has-text-black{color:#0a0a0a!important}a.has-text-black:focus,a.has-text-black:hover{color:#000!important}.has-background-black{background-color:#0a0a0a!important}.has-text-light{color:#f5f5f5!important}a.has-text-light:focus,a.has-text-light:hover{color:#dbdbdb!important}.has-background-light{background-color:#f5f5f5!important}.has-text-dark{color:#363636!important}a.has-text-dark:focus,a.has-text-dark:hover{color:#1c1c1c!important}.has-background-dark{background-color:#363636!important}.has-text-primary{color:#00d1b2!important}a.has-text-primary:focus,a.has-text-primary:hover{color:#009e86!important}.has-background-primary{background-color:#00d1b2!important}.has-text-link{color:#3273dc!important}a.has-text-link:focus,a.has-text-link:hover{color:#205bbc!important}.has-background-link{background-color:#3273dc!important}.has-text-info{color:#3298dc!important}a.has-text-info:focus,a.has-text-info:hover{color:#207dbc!important}.has-background-info{background-color:#3298dc!important}.has-text-success{color:#48c774!important}a.has-text-success:focus,a.has-text-success:hover{color:#34a85c!important}.has-background-success{background-color:#48c774!important}.has-text-warning{color:#ffdd57!important}a.has-text-warning:focus,a.has-text-warning:hover{color:#ffd324!important}.has-background-warning{background-color:#ffdd57!important}.has-text-danger{color:#f14668!important}a.has-text-danger:focus,a.has-text-danger:hover{color:#ee1742!important}.has-background-danger{background-color:#f14668!important}.has-text-black-bis{color:#121212!
important}.has-background-black-bis{background-color:#121212!important}.has-text-black-ter{color:#242424!important}.has-background-black-ter{background-color:#242424!important}.has-text-grey-darker{color:#363636!important}.has-background-grey-darker{background-color:#363636!important}.has-text-grey-dark{color:#4a4a4a!important}.has-background-grey-dark{background-color:#4a4a4a!important}.has-text-grey{color:#7a7a7a!important}.has-background-grey{background-color:#7a7a7a!important}.has-text-grey-light{color:#b5b5b5!important}.has-background-grey-light{background-color:#b5b5b5!important}.has-text-grey-lighter{color:#dbdbdb!important}.has-background-grey-lighter{background-color:#dbdbdb!important}.has-text-white-ter{color:#f5f5f5!important}.has-background-white-ter{background-color:#f5f5f5!important}.has-text-white-bis{color:#fafafa!important}.has-background-white-bis{background-color:#fafafa!important}.has-text-weight-light{font-weight:300!important}.has-text-weight-normal{font-weight:400!important}.has-text-weight-medium{font-weight:500!important}.has-text-weight-semibold{font-weight:600!important}.has-text-weight-bold{font-weight:700!important}.is-family-primary{font-family:BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif!important}.is-family-secondary{font-family:BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif!important}.is-family-sans-serif{font-family:BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif!important}.is-family-monospace{font-family:monospace!important}.is-family-code{font-family:monospace!important}.is-block{display:block!important}@media screen and (max-width:768px){.is-block-mobile{display:block!important}}@media screen and (min-width:769px),print{.is-block-tablet{display:block!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-block-tablet-only{display:block!important}}@media screen and (max-width:1023px){.is-block-touch{display:block!important}}@media screen and (min-width:1024px){.is-block-desktop{display:block!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-block-desktop-only{display:block!important}}@media screen and (min-width:1216px){.is-block-widescreen{display:block!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-block-widescreen-only{display:block!important}}@media screen and (min-width:1408px){.is-block-fullhd{display:block!important}}.is-flex{display:flex!important}@media screen and (max-width:768px){.is-flex-mobile{display:flex!important}}@media screen and (min-width:769px),print{.is-flex-tablet{display:flex!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-flex-tablet-only{display:flex!important}}@media screen and (max-width:1023px){.is-flex-touch{display:flex!important}}@media screen and (min-width:1024px){.is-flex-desktop{display:flex!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-flex-desktop-only{display:flex!important}}@media screen and (min-width:1216px){.is-flex-widescreen{display:flex!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-flex-widescreen-only{display:flex!important}}@media screen and (min-width:1408px){.is-flex-fullhd{display:flex!important}}.is-inline{display:inline!important}@media screen and 
(max-width:768px){.is-inline-mobile{display:inline!important}}@media screen and (min-width:769px),print{.is-inline-tablet{display:inline!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-inline-tablet-only{display:inline!important}}@media screen and (max-width:1023px){.is-inline-touch{display:inline!important}}@media screen and (min-width:1024px){.is-inline-desktop{display:inline!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-inline-desktop-only{display:inline!important}}@media screen and (min-width:1216px){.is-inline-widescreen{display:inline!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-inline-widescreen-only{display:inline!important}}@media screen and (min-width:1408px){.is-inline-fullhd{display:inline!important}}.is-inline-block{display:inline-block!important}@media screen and (max-width:768px){.is-inline-block-mobile{display:inline-block!important}}@media screen and (min-width:769px),print{.is-inline-block-tablet{display:inline-block!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-inline-block-tablet-only{display:inline-block!important}}@media screen and (max-width:1023px){.is-inline-block-touch{display:inline-block!important}}@media screen and (min-width:1024px){.is-inline-block-desktop{display:inline-block!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-inline-block-desktop-only{display:inline-block!important}}@media screen and (min-width:1216px){.is-inline-block-widescreen{display:inline-block!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-inline-block-widescreen-only{display:inline-block!important}}@media screen and (min-width:1408px){.is-inline-block-fullhd{display:inline-block!important}}.is-inline-flex{display:inline-flex!important}@media screen and (max-width:768px){.is-inline-flex-mobile{display:inline-flex!important}}@media screen and (min-width:769px),print{.is-inline-flex-tablet{display:inline-flex!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-inline-flex-tablet-only{display:inline-flex!important}}@media screen and (max-width:1023px){.is-inline-flex-touch{display:inline-flex!important}}@media screen and (min-width:1024px){.is-inline-flex-desktop{display:inline-flex!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-inline-flex-desktop-only{display:inline-flex!important}}@media screen and (min-width:1216px){.is-inline-flex-widescreen{display:inline-flex!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-inline-flex-widescreen-only{display:inline-flex!important}}@media screen and (min-width:1408px){.is-inline-flex-fullhd{display:inline-flex!important}}.is-hidden{display:none!important}.is-sr-only{border:none!important;clip:rect(0,0,0,0)!important;height:.01em!important;overflow:hidden!important;padding:0!important;position:absolute!important;white-space:nowrap!important;width:.01em!important}@media screen and (max-width:768px){.is-hidden-mobile{display:none!important}}@media screen and (min-width:769px),print{.is-hidden-tablet{display:none!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-hidden-tablet-only{display:none!important}}@media screen and (max-width:1023px){.is-hidden-touch{display:none!important}}@media screen and (min-width:1024px){.is-hidden-desktop{display:none!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-hidden-desktop-only{display:none!important}}@media screen and 
(min-width:1216px){.is-hidden-widescreen{display:none!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-hidden-widescreen-only{display:none!important}}@media screen and (min-width:1408px){.is-hidden-fullhd{display:none!important}}.is-invisible{visibility:hidden!important}@media screen and (max-width:768px){.is-invisible-mobile{visibility:hidden!important}}@media screen and (min-width:769px),print{.is-invisible-tablet{visibility:hidden!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-invisible-tablet-only{visibility:hidden!important}}@media screen and (max-width:1023px){.is-invisible-touch{visibility:hidden!important}}@media screen and (min-width:1024px){.is-invisible-desktop{visibility:hidden!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-invisible-desktop-only{visibility:hidden!important}}@media screen and (min-width:1216px){.is-invisible-widescreen{visibility:hidden!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-invisible-widescreen-only{visibility:hidden!important}}@media screen and (min-width:1408px){.is-invisible-fullhd{visibility:hidden!important}}.is-marginless{margin:0!important}.is-paddingless{padding:0!important}.is-radiusless{border-radius:0!important}.is-shadowless{box-shadow:none!important}.is-relative{position:relative!important}.box{background-color:#fff;border-radius:6px;box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.02);color:#4a4a4a;display:block;padding:1.25rem}a.box:focus,a.box:hover{box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px #3273dc}a.box:active{box-shadow:inset 0 1px 2px rgba(10,10,10,.2),0 0 0 1px #3273dc}.button{background-color:#fff;border-color:#dbdbdb;border-width:1px;color:#363636;cursor:pointer;justify-content:center;padding-bottom:calc(.5em - 1px);padding-left:1em;padding-right:1em;padding-top:calc(.5em - 1px);text-align:center;white-space:nowrap}.button strong{color:inherit}.button .icon,.button .icon.is-large,.button .icon.is-medium,.button .icon.is-small{height:1.5em;width:1.5em}.button .icon:first-child:not(:last-child){margin-left:calc(-.5em - 1px);margin-right:.25em}.button .icon:last-child:not(:first-child){margin-left:.25em;margin-right:calc(-.5em - 1px)}.button .icon:first-child:last-child{margin-left:calc(-.5em - 1px);margin-right:calc(-.5em - 1px)}.button.is-hovered,.button:hover{border-color:#b5b5b5;color:#363636}.button.is-focused,.button:focus{border-color:#3273dc;color:#363636}.button.is-focused:not(:active),.button:focus:not(:active){box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.button.is-active,.button:active{border-color:#4a4a4a;color:#363636}.button.is-text{background-color:transparent;border-color:transparent;color:#4a4a4a;text-decoration:underline}.button.is-text.is-focused,.button.is-text.is-hovered,.button.is-text:focus,.button.is-text:hover{background-color:#f5f5f5;color:#363636}.button.is-text.is-active,.button.is-text:active{background-color:#e8e8e8;color:#363636}.button.is-text[disabled],fieldset[disabled] .button.is-text{background-color:transparent;border-color:transparent;box-shadow:none}.button.is-white{background-color:#fff;border-color:transparent;color:#0a0a0a}.button.is-white.is-hovered,.button.is-white:hover{background-color:#f9f9f9;border-color:transparent;color:#0a0a0a}.button.is-white.is-focused,.button.is-white:focus{border-color:transparent;color:#0a0a0a}.button.is-white.is-focused:not(:active),.button.is-white:focus:not(:active){box-shadow:0 0 0 .125em 
rgba(255,255,255,.25)}.button.is-white.is-active,.button.is-white:active{background-color:#f2f2f2;border-color:transparent;color:#0a0a0a}.button.is-white[disabled],fieldset[disabled] .button.is-white{background-color:#fff;border-color:transparent;box-shadow:none}.button.is-white.is-inverted{background-color:#0a0a0a;color:#fff}.button.is-white.is-inverted.is-hovered,.button.is-white.is-inverted:hover{background-color:#000}.button.is-white.is-inverted[disabled],fieldset[disabled] .button.is-white.is-inverted{background-color:#0a0a0a;border-color:transparent;box-shadow:none;color:#fff}.button.is-white.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-white.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-white.is-outlined.is-focused,.button.is-white.is-outlined.is-hovered,.button.is-white.is-outlined:focus,.button.is-white.is-outlined:hover{background-color:#fff;border-color:#fff;color:#0a0a0a}.button.is-white.is-outlined.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-white.is-outlined.is-loading.is-focused::after,.button.is-white.is-outlined.is-loading.is-hovered::after,.button.is-white.is-outlined.is-loading:focus::after,.button.is-white.is-outlined.is-loading:hover::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-white.is-outlined[disabled],fieldset[disabled] .button.is-white.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-white.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-white.is-inverted.is-outlined.is-focused,.button.is-white.is-inverted.is-outlined.is-hovered,.button.is-white.is-inverted.is-outlined:focus,.button.is-white.is-inverted.is-outlined:hover{background-color:#0a0a0a;color:#fff}.button.is-white.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-white.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-white.is-inverted.is-outlined.is-loading:focus::after,.button.is-white.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-white.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-white.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black{background-color:#0a0a0a;border-color:transparent;color:#fff}.button.is-black.is-hovered,.button.is-black:hover{background-color:#040404;border-color:transparent;color:#fff}.button.is-black.is-focused,.button.is-black:focus{border-color:transparent;color:#fff}.button.is-black.is-focused:not(:active),.button.is-black:focus:not(:active){box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.button.is-black.is-active,.button.is-black:active{background-color:#000;border-color:transparent;color:#fff}.button.is-black[disabled],fieldset[disabled] .button.is-black{background-color:#0a0a0a;border-color:transparent;box-shadow:none}.button.is-black.is-inverted{background-color:#fff;color:#0a0a0a}.button.is-black.is-inverted.is-hovered,.button.is-black.is-inverted:hover{background-color:#f2f2f2}.button.is-black.is-inverted[disabled],fieldset[disabled] .button.is-black.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#0a0a0a}.button.is-black.is-loading::after{border-color:transparent transparent #fff 
#fff!important}.button.is-black.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-black.is-outlined.is-focused,.button.is-black.is-outlined.is-hovered,.button.is-black.is-outlined:focus,.button.is-black.is-outlined:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.button.is-black.is-outlined.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-black.is-outlined.is-loading.is-focused::after,.button.is-black.is-outlined.is-loading.is-hovered::after,.button.is-black.is-outlined.is-loading:focus::after,.button.is-black.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-black.is-outlined[disabled],fieldset[disabled] .button.is-black.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-black.is-inverted.is-outlined.is-focused,.button.is-black.is-inverted.is-outlined.is-hovered,.button.is-black.is-inverted.is-outlined:focus,.button.is-black.is-inverted.is-outlined:hover{background-color:#fff;color:#0a0a0a}.button.is-black.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-black.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-black.is-inverted.is-outlined.is-loading:focus::after,.button.is-black.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-black.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-black.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-light{background-color:#f5f5f5;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-light.is-hovered,.button.is-light:hover{background-color:#eee;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-light.is-focused,.button.is-light:focus{border-color:transparent;color:rgba(0,0,0,.7)}.button.is-light.is-focused:not(:active),.button.is-light:focus:not(:active){box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.button.is-light.is-active,.button.is-light:active{background-color:#e8e8e8;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-light[disabled],fieldset[disabled] .button.is-light{background-color:#f5f5f5;border-color:transparent;box-shadow:none}.button.is-light.is-inverted{background-color:rgba(0,0,0,.7);color:#f5f5f5}.button.is-light.is-inverted.is-hovered,.button.is-light.is-inverted:hover{background-color:rgba(0,0,0,.7)}.button.is-light.is-inverted[disabled],fieldset[disabled] .button.is-light.is-inverted{background-color:rgba(0,0,0,.7);border-color:transparent;box-shadow:none;color:#f5f5f5}.button.is-light.is-loading::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-light.is-outlined{background-color:transparent;border-color:#f5f5f5;color:#f5f5f5}.button.is-light.is-outlined.is-focused,.button.is-light.is-outlined.is-hovered,.button.is-light.is-outlined:focus,.button.is-light.is-outlined:hover{background-color:#f5f5f5;border-color:#f5f5f5;color:rgba(0,0,0,.7)}.button.is-light.is-outlined.is-loading::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-light.is-outlined.is-loading.is-focused::after,.button.is-light.is-outlined.is-loading.is-hovered::after,.button.is-light.is-outlined.is-loading:focus::after,.button.is-light.is-outlined.is-loading:hover::after{border-color:transparent transparent 
rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-light.is-outlined[disabled],fieldset[disabled] .button.is-light.is-outlined{background-color:transparent;border-color:#f5f5f5;box-shadow:none;color:#f5f5f5}.button.is-light.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);color:rgba(0,0,0,.7)}.button.is-light.is-inverted.is-outlined.is-focused,.button.is-light.is-inverted.is-outlined.is-hovered,.button.is-light.is-inverted.is-outlined:focus,.button.is-light.is-inverted.is-outlined:hover{background-color:rgba(0,0,0,.7);color:#f5f5f5}.button.is-light.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-light.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-light.is-inverted.is-outlined.is-loading:focus::after,.button.is-light.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-light.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-light.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);box-shadow:none;color:rgba(0,0,0,.7)}.button.is-dark{background-color:#363636;border-color:transparent;color:#fff}.button.is-dark.is-hovered,.button.is-dark:hover{background-color:#2f2f2f;border-color:transparent;color:#fff}.button.is-dark.is-focused,.button.is-dark:focus{border-color:transparent;color:#fff}.button.is-dark.is-focused:not(:active),.button.is-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.button.is-dark.is-active,.button.is-dark:active{background-color:#292929;border-color:transparent;color:#fff}.button.is-dark[disabled],fieldset[disabled] .button.is-dark{background-color:#363636;border-color:transparent;box-shadow:none}.button.is-dark.is-inverted{background-color:#fff;color:#363636}.button.is-dark.is-inverted.is-hovered,.button.is-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-dark.is-inverted[disabled],fieldset[disabled] .button.is-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#363636}.button.is-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-dark.is-outlined{background-color:transparent;border-color:#363636;color:#363636}.button.is-dark.is-outlined.is-focused,.button.is-dark.is-outlined.is-hovered,.button.is-dark.is-outlined:focus,.button.is-dark.is-outlined:hover{background-color:#363636;border-color:#363636;color:#fff}.button.is-dark.is-outlined.is-loading::after{border-color:transparent transparent #363636 #363636!important}.button.is-dark.is-outlined.is-loading.is-focused::after,.button.is-dark.is-outlined.is-loading.is-hovered::after,.button.is-dark.is-outlined.is-loading:focus::after,.button.is-dark.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-dark.is-outlined[disabled],fieldset[disabled] 
.button.is-dark.is-outlined{background-color:transparent;border-color:#363636;box-shadow:none;color:#363636}.button.is-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-dark.is-inverted.is-outlined.is-focused,.button.is-dark.is-inverted.is-outlined.is-hovered,.button.is-dark.is-inverted.is-outlined:focus,.button.is-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#363636}.button.is-dark.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-dark.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-dark.is-inverted.is-outlined.is-loading:focus::after,.button.is-dark.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #363636 #363636!important}.button.is-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-primary{background-color:#00d1b2;border-color:transparent;color:#fff}.button.is-primary.is-hovered,.button.is-primary:hover{background-color:#00c4a7;border-color:transparent;color:#fff}.button.is-primary.is-focused,.button.is-primary:focus{border-color:transparent;color:#fff}.button.is-primary.is-focused:not(:active),.button.is-primary:focus:not(:active){box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.button.is-primary.is-active,.button.is-primary:active{background-color:#00b89c;border-color:transparent;color:#fff}.button.is-primary[disabled],fieldset[disabled] .button.is-primary{background-color:#00d1b2;border-color:transparent;box-shadow:none}.button.is-primary.is-inverted{background-color:#fff;color:#00d1b2}.button.is-primary.is-inverted.is-hovered,.button.is-primary.is-inverted:hover{background-color:#f2f2f2}.button.is-primary.is-inverted[disabled],fieldset[disabled] .button.is-primary.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#00d1b2}.button.is-primary.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-primary.is-outlined{background-color:transparent;border-color:#00d1b2;color:#00d1b2}.button.is-primary.is-outlined.is-focused,.button.is-primary.is-outlined.is-hovered,.button.is-primary.is-outlined:focus,.button.is-primary.is-outlined:hover{background-color:#00d1b2;border-color:#00d1b2;color:#fff}.button.is-primary.is-outlined.is-loading::after{border-color:transparent transparent #00d1b2 #00d1b2!important}.button.is-primary.is-outlined.is-loading.is-focused::after,.button.is-primary.is-outlined.is-loading.is-hovered::after,.button.is-primary.is-outlined.is-loading:focus::after,.button.is-primary.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-primary.is-outlined[disabled],fieldset[disabled] 
.button.is-primary.is-outlined{background-color:transparent;border-color:#00d1b2;box-shadow:none;color:#00d1b2}.button.is-primary.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-primary.is-inverted.is-outlined.is-focused,.button.is-primary.is-inverted.is-outlined.is-hovered,.button.is-primary.is-inverted.is-outlined:focus,.button.is-primary.is-inverted.is-outlined:hover{background-color:#fff;color:#00d1b2}.button.is-primary.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-primary.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-primary.is-inverted.is-outlined.is-loading:focus::after,.button.is-primary.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #00d1b2 #00d1b2!important}.button.is-primary.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-primary.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-primary.is-light{background-color:#ebfffc;color:#00947e}.button.is-primary.is-light.is-hovered,.button.is-primary.is-light:hover{background-color:#defffa;border-color:transparent;color:#00947e}.button.is-primary.is-light.is-active,.button.is-primary.is-light:active{background-color:#d1fff8;border-color:transparent;color:#00947e}.button.is-link{background-color:#3273dc;border-color:transparent;color:#fff}.button.is-link.is-hovered,.button.is-link:hover{background-color:#276cda;border-color:transparent;color:#fff}.button.is-link.is-focused,.button.is-link:focus{border-color:transparent;color:#fff}.button.is-link.is-focused:not(:active),.button.is-link:focus:not(:active){box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.button.is-link.is-active,.button.is-link:active{background-color:#2366d1;border-color:transparent;color:#fff}.button.is-link[disabled],fieldset[disabled] .button.is-link{background-color:#3273dc;border-color:transparent;box-shadow:none}.button.is-link.is-inverted{background-color:#fff;color:#3273dc}.button.is-link.is-inverted.is-hovered,.button.is-link.is-inverted:hover{background-color:#f2f2f2}.button.is-link.is-inverted[disabled],fieldset[disabled] .button.is-link.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#3273dc}.button.is-link.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-link.is-outlined{background-color:transparent;border-color:#3273dc;color:#3273dc}.button.is-link.is-outlined.is-focused,.button.is-link.is-outlined.is-hovered,.button.is-link.is-outlined:focus,.button.is-link.is-outlined:hover{background-color:#3273dc;border-color:#3273dc;color:#fff}.button.is-link.is-outlined.is-loading::after{border-color:transparent transparent #3273dc #3273dc!important}.button.is-link.is-outlined.is-loading.is-focused::after,.button.is-link.is-outlined.is-loading.is-hovered::after,.button.is-link.is-outlined.is-loading:focus::after,.button.is-link.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-link.is-outlined[disabled],fieldset[disabled] 
.button.is-link.is-outlined{background-color:transparent;border-color:#3273dc;box-shadow:none;color:#3273dc}.button.is-link.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-link.is-inverted.is-outlined.is-focused,.button.is-link.is-inverted.is-outlined.is-hovered,.button.is-link.is-inverted.is-outlined:focus,.button.is-link.is-inverted.is-outlined:hover{background-color:#fff;color:#3273dc}.button.is-link.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-link.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-link.is-inverted.is-outlined.is-loading:focus::after,.button.is-link.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #3273dc #3273dc!important}.button.is-link.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-link.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-link.is-light{background-color:#eef3fc;color:#2160c4}.button.is-link.is-light.is-hovered,.button.is-link.is-light:hover{background-color:#e3ecfa;border-color:transparent;color:#2160c4}.button.is-link.is-light.is-active,.button.is-link.is-light:active{background-color:#d8e4f8;border-color:transparent;color:#2160c4}.button.is-info{background-color:#3298dc;border-color:transparent;color:#fff}.button.is-info.is-hovered,.button.is-info:hover{background-color:#2793da;border-color:transparent;color:#fff}.button.is-info.is-focused,.button.is-info:focus{border-color:transparent;color:#fff}.button.is-info.is-focused:not(:active),.button.is-info:focus:not(:active){box-shadow:0 0 0 .125em rgba(50,152,220,.25)}.button.is-info.is-active,.button.is-info:active{background-color:#238cd1;border-color:transparent;color:#fff}.button.is-info[disabled],fieldset[disabled] .button.is-info{background-color:#3298dc;border-color:transparent;box-shadow:none}.button.is-info.is-inverted{background-color:#fff;color:#3298dc}.button.is-info.is-inverted.is-hovered,.button.is-info.is-inverted:hover{background-color:#f2f2f2}.button.is-info.is-inverted[disabled],fieldset[disabled] .button.is-info.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#3298dc}.button.is-info.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-info.is-outlined{background-color:transparent;border-color:#3298dc;color:#3298dc}.button.is-info.is-outlined.is-focused,.button.is-info.is-outlined.is-hovered,.button.is-info.is-outlined:focus,.button.is-info.is-outlined:hover{background-color:#3298dc;border-color:#3298dc;color:#fff}.button.is-info.is-outlined.is-loading::after{border-color:transparent transparent #3298dc #3298dc!important}.button.is-info.is-outlined.is-loading.is-focused::after,.button.is-info.is-outlined.is-loading.is-hovered::after,.button.is-info.is-outlined.is-loading:focus::after,.button.is-info.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-info.is-outlined[disabled],fieldset[disabled] 
.button.is-info.is-outlined{background-color:transparent;border-color:#3298dc;box-shadow:none;color:#3298dc}.button.is-info.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-info.is-inverted.is-outlined.is-focused,.button.is-info.is-inverted.is-outlined.is-hovered,.button.is-info.is-inverted.is-outlined:focus,.button.is-info.is-inverted.is-outlined:hover{background-color:#fff;color:#3298dc}.button.is-info.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-info.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-info.is-inverted.is-outlined.is-loading:focus::after,.button.is-info.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #3298dc #3298dc!important}.button.is-info.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-info.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-info.is-light{background-color:#eef6fc;color:#1d72aa}.button.is-info.is-light.is-hovered,.button.is-info.is-light:hover{background-color:#e3f1fa;border-color:transparent;color:#1d72aa}.button.is-info.is-light.is-active,.button.is-info.is-light:active{background-color:#d8ebf8;border-color:transparent;color:#1d72aa}.button.is-success{background-color:#48c774;border-color:transparent;color:#fff}.button.is-success.is-hovered,.button.is-success:hover{background-color:#3ec46d;border-color:transparent;color:#fff}.button.is-success.is-focused,.button.is-success:focus{border-color:transparent;color:#fff}.button.is-success.is-focused:not(:active),.button.is-success:focus:not(:active){box-shadow:0 0 0 .125em rgba(72,199,116,.25)}.button.is-success.is-active,.button.is-success:active{background-color:#3abb67;border-color:transparent;color:#fff}.button.is-success[disabled],fieldset[disabled] .button.is-success{background-color:#48c774;border-color:transparent;box-shadow:none}.button.is-success.is-inverted{background-color:#fff;color:#48c774}.button.is-success.is-inverted.is-hovered,.button.is-success.is-inverted:hover{background-color:#f2f2f2}.button.is-success.is-inverted[disabled],fieldset[disabled] .button.is-success.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#48c774}.button.is-success.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-success.is-outlined{background-color:transparent;border-color:#48c774;color:#48c774}.button.is-success.is-outlined.is-focused,.button.is-success.is-outlined.is-hovered,.button.is-success.is-outlined:focus,.button.is-success.is-outlined:hover{background-color:#48c774;border-color:#48c774;color:#fff}.button.is-success.is-outlined.is-loading::after{border-color:transparent transparent #48c774 #48c774!important}.button.is-success.is-outlined.is-loading.is-focused::after,.button.is-success.is-outlined.is-loading.is-hovered::after,.button.is-success.is-outlined.is-loading:focus::after,.button.is-success.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-success.is-outlined[disabled],fieldset[disabled] 
.button.is-success.is-outlined{background-color:transparent;border-color:#48c774;box-shadow:none;color:#48c774}.button.is-success.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-success.is-inverted.is-outlined.is-focused,.button.is-success.is-inverted.is-outlined.is-hovered,.button.is-success.is-inverted.is-outlined:focus,.button.is-success.is-inverted.is-outlined:hover{background-color:#fff;color:#48c774}.button.is-success.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-success.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-success.is-inverted.is-outlined.is-loading:focus::after,.button.is-success.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #48c774 #48c774!important}.button.is-success.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-success.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-success.is-light{background-color:#effaf3;color:#257942}.button.is-success.is-light.is-hovered,.button.is-success.is-light:hover{background-color:#e6f7ec;border-color:transparent;color:#257942}.button.is-success.is-light.is-active,.button.is-success.is-light:active{background-color:#dcf4e4;border-color:transparent;color:#257942}.button.is-warning{background-color:#ffdd57;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-hovered,.button.is-warning:hover{background-color:#ffdb4a;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-focused,.button.is-warning:focus{border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-focused:not(:active),.button.is-warning:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.button.is-warning.is-active,.button.is-warning:active{background-color:#ffd83d;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning[disabled],fieldset[disabled] .button.is-warning{background-color:#ffdd57;border-color:transparent;box-shadow:none}.button.is-warning.is-inverted{background-color:rgba(0,0,0,.7);color:#ffdd57}.button.is-warning.is-inverted.is-hovered,.button.is-warning.is-inverted:hover{background-color:rgba(0,0,0,.7)}.button.is-warning.is-inverted[disabled],fieldset[disabled] .button.is-warning.is-inverted{background-color:rgba(0,0,0,.7);border-color:transparent;box-shadow:none;color:#ffdd57}.button.is-warning.is-loading::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-warning.is-outlined{background-color:transparent;border-color:#ffdd57;color:#ffdd57}.button.is-warning.is-outlined.is-focused,.button.is-warning.is-outlined.is-hovered,.button.is-warning.is-outlined:focus,.button.is-warning.is-outlined:hover{background-color:#ffdd57;border-color:#ffdd57;color:rgba(0,0,0,.7)}.button.is-warning.is-outlined.is-loading::after{border-color:transparent transparent #ffdd57 #ffdd57!important}.button.is-warning.is-outlined.is-loading.is-focused::after,.button.is-warning.is-outlined.is-loading.is-hovered::after,.button.is-warning.is-outlined.is-loading:focus::after,.button.is-warning.is-outlined.is-loading:hover::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-warning.is-outlined[disabled],fieldset[disabled] 
.button.is-warning.is-outlined{background-color:transparent;border-color:#ffdd57;box-shadow:none;color:#ffdd57}.button.is-warning.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);color:rgba(0,0,0,.7)}.button.is-warning.is-inverted.is-outlined.is-focused,.button.is-warning.is-inverted.is-outlined.is-hovered,.button.is-warning.is-inverted.is-outlined:focus,.button.is-warning.is-inverted.is-outlined:hover{background-color:rgba(0,0,0,.7);color:#ffdd57}.button.is-warning.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-warning.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-warning.is-inverted.is-outlined.is-loading:focus::after,.button.is-warning.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #ffdd57 #ffdd57!important}.button.is-warning.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-warning.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);box-shadow:none;color:rgba(0,0,0,.7)}.button.is-warning.is-light{background-color:#fffbeb;color:#947600}.button.is-warning.is-light.is-hovered,.button.is-warning.is-light:hover{background-color:#fff8de;border-color:transparent;color:#947600}.button.is-warning.is-light.is-active,.button.is-warning.is-light:active{background-color:#fff6d1;border-color:transparent;color:#947600}.button.is-danger{background-color:#f14668;border-color:transparent;color:#fff}.button.is-danger.is-hovered,.button.is-danger:hover{background-color:#f03a5f;border-color:transparent;color:#fff}.button.is-danger.is-focused,.button.is-danger:focus{border-color:transparent;color:#fff}.button.is-danger.is-focused:not(:active),.button.is-danger:focus:not(:active){box-shadow:0 0 0 .125em rgba(241,70,104,.25)}.button.is-danger.is-active,.button.is-danger:active{background-color:#ef2e55;border-color:transparent;color:#fff}.button.is-danger[disabled],fieldset[disabled] .button.is-danger{background-color:#f14668;border-color:transparent;box-shadow:none}.button.is-danger.is-inverted{background-color:#fff;color:#f14668}.button.is-danger.is-inverted.is-hovered,.button.is-danger.is-inverted:hover{background-color:#f2f2f2}.button.is-danger.is-inverted[disabled],fieldset[disabled] .button.is-danger.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#f14668}.button.is-danger.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-danger.is-outlined{background-color:transparent;border-color:#f14668;color:#f14668}.button.is-danger.is-outlined.is-focused,.button.is-danger.is-outlined.is-hovered,.button.is-danger.is-outlined:focus,.button.is-danger.is-outlined:hover{background-color:#f14668;border-color:#f14668;color:#fff}.button.is-danger.is-outlined.is-loading::after{border-color:transparent transparent #f14668 #f14668!important}.button.is-danger.is-outlined.is-loading.is-focused::after,.button.is-danger.is-outlined.is-loading.is-hovered::after,.button.is-danger.is-outlined.is-loading:focus::after,.button.is-danger.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-danger.is-outlined[disabled],fieldset[disabled] 
.button.is-danger.is-outlined{background-color:transparent;border-color:#f14668;box-shadow:none;color:#f14668}.button.is-danger.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-danger.is-inverted.is-outlined.is-focused,.button.is-danger.is-inverted.is-outlined.is-hovered,.button.is-danger.is-inverted.is-outlined:focus,.button.is-danger.is-inverted.is-outlined:hover{background-color:#fff;color:#f14668}.button.is-danger.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-danger.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-danger.is-inverted.is-outlined.is-loading:focus::after,.button.is-danger.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #f14668 #f14668!important}.button.is-danger.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-danger.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-danger.is-light{background-color:#feecf0;color:#cc0f35}.button.is-danger.is-light.is-hovered,.button.is-danger.is-light:hover{background-color:#fde0e6;border-color:transparent;color:#cc0f35}.button.is-danger.is-light.is-active,.button.is-danger.is-light:active{background-color:#fcd4dc;border-color:transparent;color:#cc0f35}.button.is-small{border-radius:2px;font-size:.75rem}.button.is-normal{font-size:1rem}.button.is-medium{font-size:1.25rem}.button.is-large{font-size:1.5rem}.button[disabled],fieldset[disabled] .button{background-color:#fff;border-color:#dbdbdb;box-shadow:none;opacity:.5}.button.is-fullwidth{display:flex;width:100%}.button.is-loading{color:transparent!important;pointer-events:none}.button.is-loading::after{position:absolute;left:calc(50% - (1em / 2));top:calc(50% - (1em / 2));position:absolute!important}.button.is-static{background-color:#f5f5f5;border-color:#dbdbdb;color:#7a7a7a;box-shadow:none;pointer-events:none}.button.is-rounded{border-radius:290486px;padding-left:calc(1em + .25em);padding-right:calc(1em + .25em)}.buttons{align-items:center;display:flex;flex-wrap:wrap;justify-content:flex-start}.buttons .button{margin-bottom:.5rem}.buttons .button:not(:last-child):not(.is-fullwidth){margin-right:.5rem}.buttons:last-child{margin-bottom:-.5rem}.buttons:not(:last-child){margin-bottom:1rem}.buttons.are-small .button:not(.is-normal):not(.is-medium):not(.is-large){border-radius:2px;font-size:.75rem}.buttons.are-medium .button:not(.is-small):not(.is-normal):not(.is-large){font-size:1.25rem}.buttons.are-large .button:not(.is-small):not(.is-normal):not(.is-medium){font-size:1.5rem}.buttons.has-addons .button:not(:first-child){border-bottom-left-radius:0;border-top-left-radius:0}.buttons.has-addons .button:not(:last-child){border-bottom-right-radius:0;border-top-right-radius:0;margin-right:-1px}.buttons.has-addons .button:last-child{margin-right:0}.buttons.has-addons .button.is-hovered,.buttons.has-addons .button:hover{z-index:2}.buttons.has-addons .button.is-active,.buttons.has-addons .button.is-focused,.buttons.has-addons .button.is-selected,.buttons.has-addons .button:active,.buttons.has-addons .button:focus{z-index:3}.buttons.has-addons .button.is-active:hover,.buttons.has-addons .button.is-focused:hover,.buttons.has-addons .button.is-selected:hover,.buttons.has-addons .button:active:hover,.buttons.has-addons .button:focus:hover{z-index:4}.buttons.has-addons .button.is-expanded{flex-grow:1;flex-shrink:1}.buttons.is-centered{justify-content:center}.buttons.is-centered:not(.has-addons) 
.button:not(.is-fullwidth){margin-left:.25rem;margin-right:.25rem}.buttons.is-right{justify-content:flex-end}.buttons.is-right:not(.has-addons) .button:not(.is-fullwidth){margin-left:.25rem;margin-right:.25rem}.container{flex-grow:1;margin:0 auto;position:relative;width:auto}.container.is-fluid{max-width:none;padding-left:32px;padding-right:32px;width:100%}@media screen and (min-width:1024px){.container{max-width:960px}}@media screen and (max-width:1215px){.container.is-widescreen{max-width:1152px}}@media screen and (max-width:1407px){.container.is-fullhd{max-width:1344px}}@media screen and (min-width:1216px){.container{max-width:1152px}}@media screen and (min-width:1408px){.container{max-width:1344px}}.content li+li{margin-top:.25em}.content blockquote:not(:last-child),.content dl:not(:last-child),.content ol:not(:last-child),.content p:not(:last-child),.content pre:not(:last-child),.content table:not(:last-child),.content ul:not(:last-child){margin-bottom:1em}.content h1,.content h2,.content h3,.content h4,.content h5,.content h6{color:#363636;font-weight:600;line-height:1.125}.content h1{font-size:2em;margin-bottom:.5em}.content h1:not(:first-child){margin-top:1em}.content h2{font-size:1.75em;margin-bottom:.5714em}.content h2:not(:first-child){margin-top:1.1428em}.content h3{font-size:1.5em;margin-bottom:.6666em}.content h3:not(:first-child){margin-top:1.3333em}.content h4{font-size:1.25em;margin-bottom:.8em}.content h5{font-size:1.125em;margin-bottom:.8888em}.content h6{font-size:1em;margin-bottom:1em}.content blockquote{background-color:#f5f5f5;border-left:5px solid #dbdbdb;padding:1.25em 1.5em}.content ol{list-style-position:outside;margin-left:2em;margin-top:1em}.content ol:not([type]){list-style-type:decimal}.content ol:not([type]).is-lower-alpha{list-style-type:lower-alpha}.content ol:not([type]).is-lower-roman{list-style-type:lower-roman}.content ol:not([type]).is-upper-alpha{list-style-type:upper-alpha}.content ol:not([type]).is-upper-roman{list-style-type:upper-roman}.content ul{list-style:disc outside;margin-left:2em;margin-top:1em}.content ul ul{list-style-type:circle;margin-top:.5em}.content ul ul ul{list-style-type:square}.content dd{margin-left:2em}.content figure{margin-left:2em;margin-right:2em;text-align:center}.content figure:not(:first-child){margin-top:2em}.content figure:not(:last-child){margin-bottom:2em}.content figure img{display:inline-block}.content figure figcaption{font-style:italic}.content pre{-webkit-overflow-scrolling:touch;overflow-x:auto;padding:1.25em 1.5em;white-space:pre;word-wrap:normal}.content sub,.content sup{font-size:75%}.content table{width:100%}.content table td,.content table th{border:1px solid #dbdbdb;border-width:0 0 1px;padding:.5em .75em;vertical-align:top}.content table th{color:#363636}.content table th:not([align]){text-align:left}.content table thead td,.content table thead th{border-width:0 0 2px;color:#363636}.content table tfoot td,.content table tfoot th{border-width:2px 0 0;color:#363636}.content table tbody tr:last-child td,.content table tbody tr:last-child th{border-bottom-width:0}.content .tabs li+li{margin-top:0}.content.is-small{font-size:.75rem}.content.is-medium{font-size:1.25rem}.content.is-large{font-size:1.5rem}.icon{align-items:center;display:inline-flex;justify-content:center;height:1.5rem;width:1.5rem}.icon.is-small{height:1rem;width:1rem}.icon.is-medium{height:2rem;width:2rem}.icon.is-large{height:3rem;width:3rem}.image{display:block;position:relative}.image img{display:block;height:auto;width:100%}.image 
img.is-rounded{border-radius:290486px}.image.is-fullwidth{width:100%}.image.is-16by9 .has-ratio,.image.is-16by9 img,.image.is-1by1 .has-ratio,.image.is-1by1 img,.image.is-1by2 .has-ratio,.image.is-1by2 img,.image.is-1by3 .has-ratio,.image.is-1by3 img,.image.is-2by1 .has-ratio,.image.is-2by1 img,.image.is-2by3 .has-ratio,.image.is-2by3 img,.image.is-3by1 .has-ratio,.image.is-3by1 img,.image.is-3by2 .has-ratio,.image.is-3by2 img,.image.is-3by4 .has-ratio,.image.is-3by4 img,.image.is-3by5 .has-ratio,.image.is-3by5 img,.image.is-4by3 .has-ratio,.image.is-4by3 img,.image.is-4by5 .has-ratio,.image.is-4by5 img,.image.is-5by3 .has-ratio,.image.is-5by3 img,.image.is-5by4 .has-ratio,.image.is-5by4 img,.image.is-9by16 .has-ratio,.image.is-9by16 img,.image.is-square .has-ratio,.image.is-square img{height:100%;width:100%}.image.is-1by1,.image.is-square{padding-top:100%}.image.is-5by4{padding-top:80%}.image.is-4by3{padding-top:75%}.image.is-3by2{padding-top:66.6666%}.image.is-5by3{padding-top:60%}.image.is-16by9{padding-top:56.25%}.image.is-2by1{padding-top:50%}.image.is-3by1{padding-top:33.3333%}.image.is-4by5{padding-top:125%}.image.is-3by4{padding-top:133.3333%}.image.is-2by3{padding-top:150%}.image.is-3by5{padding-top:166.6666%}.image.is-9by16{padding-top:177.7777%}.image.is-1by2{padding-top:200%}.image.is-1by3{padding-top:300%}.image.is-16x16{height:16px;width:16px}.image.is-24x24{height:24px;width:24px}.image.is-32x32{height:32px;width:32px}.image.is-48x48{height:48px;width:48px}.image.is-64x64{height:64px;width:64px}.image.is-96x96{height:96px;width:96px}.image.is-128x128{height:128px;width:128px}.notification{background-color:#f5f5f5;border-radius:4px;padding:1.25rem 2.5rem 1.25rem 1.5rem;position:relative}.notification a:not(.button):not(.dropdown-item){color:currentColor;text-decoration:underline}.notification strong{color:currentColor}.notification code,.notification pre{background:#fff}.notification pre code{background:0 0}.notification>.delete{position:absolute;right:.5rem;top:.5rem}.notification .content,.notification .subtitle,.notification .title{color:currentColor}.notification.is-white{background-color:#fff;color:#0a0a0a}.notification.is-black{background-color:#0a0a0a;color:#fff}.notification.is-light{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.notification.is-dark{background-color:#363636;color:#fff}.notification.is-primary{background-color:#00d1b2;color:#fff}.notification.is-link{background-color:#3273dc;color:#fff}.notification.is-info{background-color:#3298dc;color:#fff}.notification.is-success{background-color:#48c774;color:#fff}.notification.is-warning{background-color:#ffdd57;color:rgba(0,0,0,.7)}.notification.is-danger{background-color:#f14668;color:#fff}.progress{-moz-appearance:none;-webkit-appearance:none;border:none;border-radius:290486px;display:block;height:1rem;overflow:hidden;padding:0;width:100%}.progress::-webkit-progress-bar{background-color:#ededed}.progress::-webkit-progress-value{background-color:#4a4a4a}.progress::-moz-progress-bar{background-color:#4a4a4a}.progress::-ms-fill{background-color:#4a4a4a;border:none}.progress.is-white::-webkit-progress-value{background-color:#fff}.progress.is-white::-moz-progress-bar{background-color:#fff}.progress.is-white::-ms-fill{background-color:#fff}.progress.is-white:indeterminate{background-image:linear-gradient(to right,#fff 30%,#ededed 
30%)}.progress.is-black::-webkit-progress-value{background-color:#0a0a0a}.progress.is-black::-moz-progress-bar{background-color:#0a0a0a}.progress.is-black::-ms-fill{background-color:#0a0a0a}.progress.is-black:indeterminate{background-image:linear-gradient(to right,#0a0a0a 30%,#ededed 30%)}.progress.is-light::-webkit-progress-value{background-color:#f5f5f5}.progress.is-light::-moz-progress-bar{background-color:#f5f5f5}.progress.is-light::-ms-fill{background-color:#f5f5f5}.progress.is-light:indeterminate{background-image:linear-gradient(to right,#f5f5f5 30%,#ededed 30%)}.progress.is-dark::-webkit-progress-value{background-color:#363636}.progress.is-dark::-moz-progress-bar{background-color:#363636}.progress.is-dark::-ms-fill{background-color:#363636}.progress.is-dark:indeterminate{background-image:linear-gradient(to right,#363636 30%,#ededed 30%)}.progress.is-primary::-webkit-progress-value{background-color:#00d1b2}.progress.is-primary::-moz-progress-bar{background-color:#00d1b2}.progress.is-primary::-ms-fill{background-color:#00d1b2}.progress.is-primary:indeterminate{background-image:linear-gradient(to right,#00d1b2 30%,#ededed 30%)}.progress.is-link::-webkit-progress-value{background-color:#3273dc}.progress.is-link::-moz-progress-bar{background-color:#3273dc}.progress.is-link::-ms-fill{background-color:#3273dc}.progress.is-link:indeterminate{background-image:linear-gradient(to right,#3273dc 30%,#ededed 30%)}.progress.is-info::-webkit-progress-value{background-color:#3298dc}.progress.is-info::-moz-progress-bar{background-color:#3298dc}.progress.is-info::-ms-fill{background-color:#3298dc}.progress.is-info:indeterminate{background-image:linear-gradient(to right,#3298dc 30%,#ededed 30%)}.progress.is-success::-webkit-progress-value{background-color:#48c774}.progress.is-success::-moz-progress-bar{background-color:#48c774}.progress.is-success::-ms-fill{background-color:#48c774}.progress.is-success:indeterminate{background-image:linear-gradient(to right,#48c774 30%,#ededed 30%)}.progress.is-warning::-webkit-progress-value{background-color:#ffdd57}.progress.is-warning::-moz-progress-bar{background-color:#ffdd57}.progress.is-warning::-ms-fill{background-color:#ffdd57}.progress.is-warning:indeterminate{background-image:linear-gradient(to right,#ffdd57 30%,#ededed 30%)}.progress.is-danger::-webkit-progress-value{background-color:#f14668}.progress.is-danger::-moz-progress-bar{background-color:#f14668}.progress.is-danger::-ms-fill{background-color:#f14668}.progress.is-danger:indeterminate{background-image:linear-gradient(to right,#f14668 30%,#ededed 30%)}.progress:indeterminate{-webkit-animation-duration:1.5s;animation-duration:1.5s;-webkit-animation-iteration-count:infinite;animation-iteration-count:infinite;-webkit-animation-name:moveIndeterminate;animation-name:moveIndeterminate;-webkit-animation-timing-function:linear;animation-timing-function:linear;background-color:#ededed;background-image:linear-gradient(to right,#4a4a4a 30%,#ededed 30%);background-position:top left;background-repeat:no-repeat;background-size:150% 150%}.progress:indeterminate::-webkit-progress-bar{background-color:transparent}.progress:indeterminate::-moz-progress-bar{background-color:transparent}.progress.is-small{height:.75rem}.progress.is-medium{height:1.25rem}.progress.is-large{height:1.5rem}@-webkit-keyframes moveIndeterminate{from{background-position:200% 0}to{background-position:-200% 0}}@keyframes moveIndeterminate{from{background-position:200% 0}to{background-position:-200% 
0}}.table{background-color:#fff;color:#363636}.table td,.table th{border:1px solid #dbdbdb;border-width:0 0 1px;padding:.5em .75em;vertical-align:top}.table td.is-white,.table th.is-white{background-color:#fff;border-color:#fff;color:#0a0a0a}.table td.is-black,.table th.is-black{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.table td.is-light,.table th.is-light{background-color:#f5f5f5;border-color:#f5f5f5;color:rgba(0,0,0,.7)}.table td.is-dark,.table th.is-dark{background-color:#363636;border-color:#363636;color:#fff}.table td.is-primary,.table th.is-primary{background-color:#00d1b2;border-color:#00d1b2;color:#fff}.table td.is-link,.table th.is-link{background-color:#3273dc;border-color:#3273dc;color:#fff}.table td.is-info,.table th.is-info{background-color:#3298dc;border-color:#3298dc;color:#fff}.table td.is-success,.table th.is-success{background-color:#48c774;border-color:#48c774;color:#fff}.table td.is-warning,.table th.is-warning{background-color:#ffdd57;border-color:#ffdd57;color:rgba(0,0,0,.7)}.table td.is-danger,.table th.is-danger{background-color:#f14668;border-color:#f14668;color:#fff}.table td.is-narrow,.table th.is-narrow{white-space:nowrap;width:1%}.table td.is-selected,.table th.is-selected{background-color:#00d1b2;color:#fff}.table td.is-selected a,.table td.is-selected strong,.table th.is-selected a,.table th.is-selected strong{color:currentColor}.table th{color:#363636}.table th:not([align]){text-align:left}.table tr.is-selected{background-color:#00d1b2;color:#fff}.table tr.is-selected a,.table tr.is-selected strong{color:currentColor}.table tr.is-selected td,.table tr.is-selected th{border-color:#fff;color:currentColor}.table thead{background-color:transparent}.table thead td,.table thead th{border-width:0 0 2px;color:#363636}.table tfoot{background-color:transparent}.table tfoot td,.table tfoot th{border-width:2px 0 0;color:#363636}.table tbody{background-color:transparent}.table tbody tr:last-child td,.table tbody tr:last-child th{border-bottom-width:0}.table.is-bordered td,.table.is-bordered th{border-width:1px}.table.is-bordered tr:last-child td,.table.is-bordered tr:last-child th{border-bottom-width:1px}.table.is-fullwidth{width:100%}.table.is-hoverable tbody tr:not(.is-selected):hover{background-color:#fafafa}.table.is-hoverable.is-striped tbody tr:not(.is-selected):hover{background-color:#fafafa}.table.is-hoverable.is-striped tbody tr:not(.is-selected):hover:nth-child(even){background-color:#f5f5f5}.table.is-narrow td,.table.is-narrow th{padding:.25em .5em}.table.is-striped tbody tr:not(.is-selected):nth-child(even){background-color:#fafafa}.table-container{-webkit-overflow-scrolling:touch;overflow:auto;overflow-y:hidden;max-width:100%}.tags{align-items:center;display:flex;flex-wrap:wrap;justify-content:flex-start}.tags .tag{margin-bottom:.5rem}.tags .tag:not(:last-child){margin-right:.5rem}.tags:last-child{margin-bottom:-.5rem}.tags:not(:last-child){margin-bottom:1rem}.tags.are-medium .tag:not(.is-normal):not(.is-large){font-size:1rem}.tags.are-large .tag:not(.is-normal):not(.is-medium){font-size:1.25rem}.tags.is-centered{justify-content:center}.tags.is-centered .tag{margin-right:.25rem;margin-left:.25rem}.tags.is-right{justify-content:flex-end}.tags.is-right .tag:not(:first-child){margin-left:.5rem}.tags.is-right .tag:not(:last-child){margin-right:0}.tags.has-addons .tag{margin-right:0}.tags.has-addons .tag:not(:first-child){margin-left:0;border-bottom-left-radius:0;border-top-left-radius:0}.tags.has-addons 
.tag:not(:last-child){border-bottom-right-radius:0;border-top-right-radius:0}.tag:not(body){align-items:center;background-color:#f5f5f5;border-radius:4px;color:#4a4a4a;display:inline-flex;font-size:.75rem;height:2em;justify-content:center;line-height:1.5;padding-left:.75em;padding-right:.75em;white-space:nowrap}.tag:not(body) .delete{margin-left:.25rem;margin-right:-.375rem}.tag:not(body).is-white{background-color:#fff;color:#0a0a0a}.tag:not(body).is-black{background-color:#0a0a0a;color:#fff}.tag:not(body).is-light{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.tag:not(body).is-dark{background-color:#363636;color:#fff}.tag:not(body).is-primary{background-color:#00d1b2;color:#fff}.tag:not(body).is-primary.is-light{background-color:#ebfffc;color:#00947e}.tag:not(body).is-link{background-color:#3273dc;color:#fff}.tag:not(body).is-link.is-light{background-color:#eef3fc;color:#2160c4}.tag:not(body).is-info{background-color:#3298dc;color:#fff}.tag:not(body).is-info.is-light{background-color:#eef6fc;color:#1d72aa}.tag:not(body).is-success{background-color:#48c774;color:#fff}.tag:not(body).is-success.is-light{background-color:#effaf3;color:#257942}.tag:not(body).is-warning{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tag:not(body).is-warning.is-light{background-color:#fffbeb;color:#947600}.tag:not(body).is-danger{background-color:#f14668;color:#fff}.tag:not(body).is-danger.is-light{background-color:#feecf0;color:#cc0f35}.tag:not(body).is-normal{font-size:.75rem}.tag:not(body).is-medium{font-size:1rem}.tag:not(body).is-large{font-size:1.25rem}.tag:not(body) .icon:first-child:not(:last-child){margin-left:-.375em;margin-right:.1875em}.tag:not(body) .icon:last-child:not(:first-child){margin-left:.1875em;margin-right:-.375em}.tag:not(body) .icon:first-child:last-child{margin-left:-.375em;margin-right:-.375em}.tag:not(body).is-delete{margin-left:1px;padding:0;position:relative;width:2em}.tag:not(body).is-delete::after,.tag:not(body).is-delete::before{background-color:currentColor;content:"";display:block;left:50%;position:absolute;top:50%;transform:translateX(-50%) translateY(-50%) rotate(45deg);transform-origin:center center}.tag:not(body).is-delete::before{height:1px;width:50%}.tag:not(body).is-delete::after{height:50%;width:1px}.tag:not(body).is-delete:focus,.tag:not(body).is-delete:hover{background-color:#e8e8e8}.tag:not(body).is-delete:active{background-color:#dbdbdb}.tag:not(body).is-rounded{border-radius:290486px}a.tag:hover{text-decoration:underline}.subtitle,.title{word-break:break-word}.subtitle em,.subtitle span,.title em,.title span{font-weight:inherit}.subtitle sub,.title sub{font-size:.75em}.subtitle sup,.title sup{font-size:.75em}.subtitle .tag,.title .tag{vertical-align:middle}.title{color:#363636;font-size:2rem;font-weight:600;line-height:1.125}.title strong{color:inherit;font-weight:inherit}.title+.highlight{margin-top:-.75rem}.title:not(.is-spaced)+.subtitle{margin-top:-1.25rem}.title.is-1{font-size:3rem}.title.is-2{font-size:2.5rem}.title.is-3{font-size:2rem}.title.is-4{font-size:1.5rem}.title.is-5{font-size:1.25rem}.title.is-6{font-size:1rem}.title.is-7{font-size:.75rem}.subtitle{color:#4a4a4a;font-size:1.25rem;font-weight:400;line-height:1.25}.subtitle 
strong{color:#363636;font-weight:600}.subtitle:not(.is-spaced)+.title{margin-top:-1.25rem}.subtitle.is-1{font-size:3rem}.subtitle.is-2{font-size:2.5rem}.subtitle.is-3{font-size:2rem}.subtitle.is-4{font-size:1.5rem}.subtitle.is-5{font-size:1.25rem}.subtitle.is-6{font-size:1rem}.subtitle.is-7{font-size:.75rem}.heading{display:block;font-size:11px;letter-spacing:1px;margin-bottom:5px;text-transform:uppercase}.highlight{font-weight:400;max-width:100%;overflow:hidden;padding:0}.highlight pre{overflow:auto;max-width:100%}.number{align-items:center;background-color:#f5f5f5;border-radius:290486px;display:inline-flex;font-size:1.25rem;height:2em;justify-content:center;margin-right:1.5rem;min-width:2.5em;padding:.25rem .5rem;text-align:center;vertical-align:top}.input,.select select,.textarea{background-color:#fff;border-color:#dbdbdb;border-radius:4px;color:#363636}.input::-moz-placeholder,.select select::-moz-placeholder,.textarea::-moz-placeholder{color:rgba(54,54,54,.3)}.input::-webkit-input-placeholder,.select select::-webkit-input-placeholder,.textarea::-webkit-input-placeholder{color:rgba(54,54,54,.3)}.input:-moz-placeholder,.select select:-moz-placeholder,.textarea:-moz-placeholder{color:rgba(54,54,54,.3)}.input:-ms-input-placeholder,.select select:-ms-input-placeholder,.textarea:-ms-input-placeholder{color:rgba(54,54,54,.3)}.input:hover,.is-hovered.input,.is-hovered.textarea,.select select.is-hovered,.select select:hover,.textarea:hover{border-color:#b5b5b5}.input:active,.input:focus,.is-active.input,.is-active.textarea,.is-focused.input,.is-focused.textarea,.select select.is-active,.select select.is-focused,.select select:active,.select select:focus,.textarea:active,.textarea:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.input[disabled],.select fieldset[disabled] select,.select select[disabled],.textarea[disabled],fieldset[disabled] .input,fieldset[disabled] .select select,fieldset[disabled] .textarea{background-color:#f5f5f5;border-color:#f5f5f5;box-shadow:none;color:#7a7a7a}.input[disabled]::-moz-placeholder,.select fieldset[disabled] select::-moz-placeholder,.select select[disabled]::-moz-placeholder,.textarea[disabled]::-moz-placeholder,fieldset[disabled] .input::-moz-placeholder,fieldset[disabled] .select select::-moz-placeholder,fieldset[disabled] .textarea::-moz-placeholder{color:rgba(122,122,122,.3)}.input[disabled]::-webkit-input-placeholder,.select fieldset[disabled] select::-webkit-input-placeholder,.select select[disabled]::-webkit-input-placeholder,.textarea[disabled]::-webkit-input-placeholder,fieldset[disabled] .input::-webkit-input-placeholder,fieldset[disabled] .select select::-webkit-input-placeholder,fieldset[disabled] .textarea::-webkit-input-placeholder{color:rgba(122,122,122,.3)}.input[disabled]:-moz-placeholder,.select fieldset[disabled] select:-moz-placeholder,.select select[disabled]:-moz-placeholder,.textarea[disabled]:-moz-placeholder,fieldset[disabled] .input:-moz-placeholder,fieldset[disabled] .select select:-moz-placeholder,fieldset[disabled] .textarea:-moz-placeholder{color:rgba(122,122,122,.3)}.input[disabled]:-ms-input-placeholder,.select fieldset[disabled] select:-ms-input-placeholder,.select select[disabled]:-ms-input-placeholder,.textarea[disabled]:-ms-input-placeholder,fieldset[disabled] .input:-ms-input-placeholder,fieldset[disabled] .select select:-ms-input-placeholder,fieldset[disabled] .textarea:-ms-input-placeholder{color:rgba(122,122,122,.3)}.input,.textarea{box-shadow:inset 0 .0625em .125em 
rgba(10,10,10,.05);max-width:100%;width:100%}.input[readonly],.textarea[readonly]{box-shadow:none}.is-white.input,.is-white.textarea{border-color:#fff}.is-white.input:active,.is-white.input:focus,.is-white.is-active.input,.is-white.is-active.textarea,.is-white.is-focused.input,.is-white.is-focused.textarea,.is-white.textarea:active,.is-white.textarea:focus{box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.is-black.input,.is-black.textarea{border-color:#0a0a0a}.is-black.input:active,.is-black.input:focus,.is-black.is-active.input,.is-black.is-active.textarea,.is-black.is-focused.input,.is-black.is-focused.textarea,.is-black.textarea:active,.is-black.textarea:focus{box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.is-light.input,.is-light.textarea{border-color:#f5f5f5}.is-light.input:active,.is-light.input:focus,.is-light.is-active.input,.is-light.is-active.textarea,.is-light.is-focused.input,.is-light.is-focused.textarea,.is-light.textarea:active,.is-light.textarea:focus{box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.is-dark.input,.is-dark.textarea{border-color:#363636}.is-dark.input:active,.is-dark.input:focus,.is-dark.is-active.input,.is-dark.is-active.textarea,.is-dark.is-focused.input,.is-dark.is-focused.textarea,.is-dark.textarea:active,.is-dark.textarea:focus{box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.is-primary.input,.is-primary.textarea{border-color:#00d1b2}.is-primary.input:active,.is-primary.input:focus,.is-primary.is-active.input,.is-primary.is-active.textarea,.is-primary.is-focused.input,.is-primary.is-focused.textarea,.is-primary.textarea:active,.is-primary.textarea:focus{box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.is-link.input,.is-link.textarea{border-color:#3273dc}.is-link.input:active,.is-link.input:focus,.is-link.is-active.input,.is-link.is-active.textarea,.is-link.is-focused.input,.is-link.is-focused.textarea,.is-link.textarea:active,.is-link.textarea:focus{box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.is-info.input,.is-info.textarea{border-color:#3298dc}.is-info.input:active,.is-info.input:focus,.is-info.is-active.input,.is-info.is-active.textarea,.is-info.is-focused.input,.is-info.is-focused.textarea,.is-info.textarea:active,.is-info.textarea:focus{box-shadow:0 0 0 .125em rgba(50,152,220,.25)}.is-success.input,.is-success.textarea{border-color:#48c774}.is-success.input:active,.is-success.input:focus,.is-success.is-active.input,.is-success.is-active.textarea,.is-success.is-focused.input,.is-success.is-focused.textarea,.is-success.textarea:active,.is-success.textarea:focus{box-shadow:0 0 0 .125em rgba(72,199,116,.25)}.is-warning.input,.is-warning.textarea{border-color:#ffdd57}.is-warning.input:active,.is-warning.input:focus,.is-warning.is-active.input,.is-warning.is-active.textarea,.is-warning.is-focused.input,.is-warning.is-focused.textarea,.is-warning.textarea:active,.is-warning.textarea:focus{box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.is-danger.input,.is-danger.textarea{border-color:#f14668}.is-danger.input:active,.is-danger.input:focus,.is-danger.is-active.input,.is-danger.is-active.textarea,.is-danger.is-focused.input,.is-danger.is-focused.textarea,.is-danger.textarea:active,.is-danger.textarea:focus{box-shadow:0 0 0 .125em 
rgba(241,70,104,.25)}.is-small.input,.is-small.textarea{border-radius:2px;font-size:.75rem}.is-medium.input,.is-medium.textarea{font-size:1.25rem}.is-large.input,.is-large.textarea{font-size:1.5rem}.is-fullwidth.input,.is-fullwidth.textarea{display:block;width:100%}.is-inline.input,.is-inline.textarea{display:inline;width:auto}.input.is-rounded{border-radius:290486px;padding-left:calc(calc(.75em - 1px) + .375em);padding-right:calc(calc(.75em - 1px) + .375em)}.input.is-static{background-color:transparent;border-color:transparent;box-shadow:none;padding-left:0;padding-right:0}.textarea{display:block;max-width:100%;min-width:100%;padding:calc(.75em - 1px);resize:vertical}.textarea:not([rows]){max-height:40em;min-height:8em}.textarea[rows]{height:initial}.textarea.has-fixed-size{resize:none}.checkbox,.radio{cursor:pointer;display:inline-block;line-height:1.25;position:relative}.checkbox input,.radio input{cursor:pointer}.checkbox:hover,.radio:hover{color:#363636}.checkbox[disabled],.radio[disabled],fieldset[disabled] .checkbox,fieldset[disabled] .radio{color:#7a7a7a;cursor:not-allowed}.radio+.radio{margin-left:.5em}.select{display:inline-block;max-width:100%;position:relative;vertical-align:top}.select:not(.is-multiple){height:2.5em}.select:not(.is-multiple):not(.is-loading)::after{border-color:#3273dc;right:1.125em;z-index:4}.select.is-rounded select{border-radius:290486px;padding-left:1em}.select select{cursor:pointer;display:block;font-size:1em;max-width:100%;outline:0}.select select::-ms-expand{display:none}.select select[disabled]:hover,fieldset[disabled] .select select:hover{border-color:#f5f5f5}.select select:not([multiple]){padding-right:2.5em}.select select[multiple]{height:auto;padding:0}.select select[multiple] option{padding:.5em 1em}.select:not(.is-multiple):not(.is-loading):hover::after{border-color:#363636}.select.is-white:not(:hover)::after{border-color:#fff}.select.is-white select{border-color:#fff}.select.is-white select.is-hovered,.select.is-white select:hover{border-color:#f2f2f2}.select.is-white select.is-active,.select.is-white select.is-focused,.select.is-white select:active,.select.is-white select:focus{box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.select.is-black:not(:hover)::after{border-color:#0a0a0a}.select.is-black select{border-color:#0a0a0a}.select.is-black select.is-hovered,.select.is-black select:hover{border-color:#000}.select.is-black select.is-active,.select.is-black select.is-focused,.select.is-black select:active,.select.is-black select:focus{box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.select.is-light:not(:hover)::after{border-color:#f5f5f5}.select.is-light select{border-color:#f5f5f5}.select.is-light select.is-hovered,.select.is-light select:hover{border-color:#e8e8e8}.select.is-light select.is-active,.select.is-light select.is-focused,.select.is-light select:active,.select.is-light select:focus{box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.select.is-dark:not(:hover)::after{border-color:#363636}.select.is-dark select{border-color:#363636}.select.is-dark select.is-hovered,.select.is-dark select:hover{border-color:#292929}.select.is-dark select.is-active,.select.is-dark select.is-focused,.select.is-dark select:active,.select.is-dark select:focus{box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.select.is-primary:not(:hover)::after{border-color:#00d1b2}.select.is-primary select{border-color:#00d1b2}.select.is-primary select.is-hovered,.select.is-primary select:hover{border-color:#00b89c}.select.is-primary select.is-active,.select.is-primary 
select.is-focused,.select.is-primary select:active,.select.is-primary select:focus{box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.select.is-link:not(:hover)::after{border-color:#3273dc}.select.is-link select{border-color:#3273dc}.select.is-link select.is-hovered,.select.is-link select:hover{border-color:#2366d1}.select.is-link select.is-active,.select.is-link select.is-focused,.select.is-link select:active,.select.is-link select:focus{box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.select.is-info:not(:hover)::after{border-color:#3298dc}.select.is-info select{border-color:#3298dc}.select.is-info select.is-hovered,.select.is-info select:hover{border-color:#238cd1}.select.is-info select.is-active,.select.is-info select.is-focused,.select.is-info select:active,.select.is-info select:focus{box-shadow:0 0 0 .125em rgba(50,152,220,.25)}.select.is-success:not(:hover)::after{border-color:#48c774}.select.is-success select{border-color:#48c774}.select.is-success select.is-hovered,.select.is-success select:hover{border-color:#3abb67}.select.is-success select.is-active,.select.is-success select.is-focused,.select.is-success select:active,.select.is-success select:focus{box-shadow:0 0 0 .125em rgba(72,199,116,.25)}.select.is-warning:not(:hover)::after{border-color:#ffdd57}.select.is-warning select{border-color:#ffdd57}.select.is-warning select.is-hovered,.select.is-warning select:hover{border-color:#ffd83d}.select.is-warning select.is-active,.select.is-warning select.is-focused,.select.is-warning select:active,.select.is-warning select:focus{box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.select.is-danger:not(:hover)::after{border-color:#f14668}.select.is-danger select{border-color:#f14668}.select.is-danger select.is-hovered,.select.is-danger select:hover{border-color:#ef2e55}.select.is-danger select.is-active,.select.is-danger select.is-focused,.select.is-danger select:active,.select.is-danger select:focus{box-shadow:0 0 0 .125em rgba(241,70,104,.25)}.select.is-small{border-radius:2px;font-size:.75rem}.select.is-medium{font-size:1.25rem}.select.is-large{font-size:1.5rem}.select.is-disabled::after{border-color:#7a7a7a}.select.is-fullwidth{width:100%}.select.is-fullwidth select{width:100%}.select.is-loading::after{margin-top:0;position:absolute;right:.625em;top:.625em;transform:none}.select.is-loading.is-small:after{font-size:.75rem}.select.is-loading.is-medium:after{font-size:1.25rem}.select.is-loading.is-large:after{font-size:1.5rem}.file{align-items:stretch;display:flex;justify-content:flex-start;position:relative}.file.is-white .file-cta{background-color:#fff;border-color:transparent;color:#0a0a0a}.file.is-white.is-hovered .file-cta,.file.is-white:hover .file-cta{background-color:#f9f9f9;border-color:transparent;color:#0a0a0a}.file.is-white.is-focused .file-cta,.file.is-white:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(255,255,255,.25);color:#0a0a0a}.file.is-white.is-active .file-cta,.file.is-white:active .file-cta{background-color:#f2f2f2;border-color:transparent;color:#0a0a0a}.file.is-black .file-cta{background-color:#0a0a0a;border-color:transparent;color:#fff}.file.is-black.is-hovered .file-cta,.file.is-black:hover .file-cta{background-color:#040404;border-color:transparent;color:#fff}.file.is-black.is-focused .file-cta,.file.is-black:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(10,10,10,.25);color:#fff}.file.is-black.is-active .file-cta,.file.is-black:active .file-cta{background-color:#000;border-color:transparent;color:#fff}.file.is-light 
.file-cta{background-color:#f5f5f5;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-light.is-hovered .file-cta,.file.is-light:hover .file-cta{background-color:#eee;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-light.is-focused .file-cta,.file.is-light:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(245,245,245,.25);color:rgba(0,0,0,.7)}.file.is-light.is-active .file-cta,.file.is-light:active .file-cta{background-color:#e8e8e8;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-dark .file-cta{background-color:#363636;border-color:transparent;color:#fff}.file.is-dark.is-hovered .file-cta,.file.is-dark:hover .file-cta{background-color:#2f2f2f;border-color:transparent;color:#fff}.file.is-dark.is-focused .file-cta,.file.is-dark:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(54,54,54,.25);color:#fff}.file.is-dark.is-active .file-cta,.file.is-dark:active .file-cta{background-color:#292929;border-color:transparent;color:#fff}.file.is-primary .file-cta{background-color:#00d1b2;border-color:transparent;color:#fff}.file.is-primary.is-hovered .file-cta,.file.is-primary:hover .file-cta{background-color:#00c4a7;border-color:transparent;color:#fff}.file.is-primary.is-focused .file-cta,.file.is-primary:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(0,209,178,.25);color:#fff}.file.is-primary.is-active .file-cta,.file.is-primary:active .file-cta{background-color:#00b89c;border-color:transparent;color:#fff}.file.is-link .file-cta{background-color:#3273dc;border-color:transparent;color:#fff}.file.is-link.is-hovered .file-cta,.file.is-link:hover .file-cta{background-color:#276cda;border-color:transparent;color:#fff}.file.is-link.is-focused .file-cta,.file.is-link:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(50,115,220,.25);color:#fff}.file.is-link.is-active .file-cta,.file.is-link:active .file-cta{background-color:#2366d1;border-color:transparent;color:#fff}.file.is-info .file-cta{background-color:#3298dc;border-color:transparent;color:#fff}.file.is-info.is-hovered .file-cta,.file.is-info:hover .file-cta{background-color:#2793da;border-color:transparent;color:#fff}.file.is-info.is-focused .file-cta,.file.is-info:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(50,152,220,.25);color:#fff}.file.is-info.is-active .file-cta,.file.is-info:active .file-cta{background-color:#238cd1;border-color:transparent;color:#fff}.file.is-success .file-cta{background-color:#48c774;border-color:transparent;color:#fff}.file.is-success.is-hovered .file-cta,.file.is-success:hover .file-cta{background-color:#3ec46d;border-color:transparent;color:#fff}.file.is-success.is-focused .file-cta,.file.is-success:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(72,199,116,.25);color:#fff}.file.is-success.is-active .file-cta,.file.is-success:active .file-cta{background-color:#3abb67;border-color:transparent;color:#fff}.file.is-warning .file-cta{background-color:#ffdd57;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-warning.is-hovered .file-cta,.file.is-warning:hover .file-cta{background-color:#ffdb4a;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-warning.is-focused .file-cta,.file.is-warning:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(255,221,87,.25);color:rgba(0,0,0,.7)}.file.is-warning.is-active .file-cta,.file.is-warning:active .file-cta{background-color:#ffd83d;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-danger 
.file-cta{background-color:#f14668;border-color:transparent;color:#fff}.file.is-danger.is-hovered .file-cta,.file.is-danger:hover .file-cta{background-color:#f03a5f;border-color:transparent;color:#fff}.file.is-danger.is-focused .file-cta,.file.is-danger:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(241,70,104,.25);color:#fff}.file.is-danger.is-active .file-cta,.file.is-danger:active .file-cta{background-color:#ef2e55;border-color:transparent;color:#fff}.file.is-small{font-size:.75rem}.file.is-medium{font-size:1.25rem}.file.is-medium .file-icon .fa{font-size:21px}.file.is-large{font-size:1.5rem}.file.is-large .file-icon .fa{font-size:28px}.file.has-name .file-cta{border-bottom-right-radius:0;border-top-right-radius:0}.file.has-name .file-name{border-bottom-left-radius:0;border-top-left-radius:0}.file.has-name.is-empty .file-cta{border-radius:4px}.file.has-name.is-empty .file-name{display:none}.file.is-boxed .file-label{flex-direction:column}.file.is-boxed .file-cta{flex-direction:column;height:auto;padding:1em 3em}.file.is-boxed .file-name{border-width:0 1px 1px}.file.is-boxed .file-icon{height:1.5em;width:1.5em}.file.is-boxed .file-icon .fa{font-size:21px}.file.is-boxed.is-small .file-icon .fa{font-size:14px}.file.is-boxed.is-medium .file-icon .fa{font-size:28px}.file.is-boxed.is-large .file-icon .fa{font-size:35px}.file.is-boxed.has-name .file-cta{border-radius:4px 4px 0 0}.file.is-boxed.has-name .file-name{border-radius:0 0 4px 4px;border-width:0 1px 1px}.file.is-centered{justify-content:center}.file.is-fullwidth .file-label{width:100%}.file.is-fullwidth .file-name{flex-grow:1;max-width:none}.file.is-right{justify-content:flex-end}.file.is-right .file-cta{border-radius:0 4px 4px 0}.file.is-right .file-name{border-radius:4px 0 0 4px;border-width:1px 0 1px 1px;order:-1}.file-label{align-items:stretch;display:flex;cursor:pointer;justify-content:flex-start;overflow:hidden;position:relative}.file-label:hover .file-cta{background-color:#eee;color:#363636}.file-label:hover .file-name{border-color:#d5d5d5}.file-label:active .file-cta{background-color:#e8e8e8;color:#363636}.file-label:active .file-name{border-color:#cfcfcf}.file-input{height:100%;left:0;opacity:0;outline:0;position:absolute;top:0;width:100%}.file-cta,.file-name{border-color:#dbdbdb;border-radius:4px;font-size:1em;padding-left:1em;padding-right:1em;white-space:nowrap}.file-cta{background-color:#f5f5f5;color:#4a4a4a}.file-name{border-color:#dbdbdb;border-style:solid;border-width:1px 1px 1px 0;display:block;max-width:16em;overflow:hidden;text-align:left;text-overflow:ellipsis}.file-icon{align-items:center;display:flex;height:1em;justify-content:center;margin-right:.5em;width:1em}.file-icon .fa{font-size:14px}.label{color:#363636;display:block;font-size:1rem;font-weight:700}.label:not(:last-child){margin-bottom:.5em}.label.is-small{font-size:.75rem}.label.is-medium{font-size:1.25rem}.label.is-large{font-size:1.5rem}.help{display:block;font-size:.75rem;margin-top:.25rem}.help.is-white{color:#fff}.help.is-black{color:#0a0a0a}.help.is-light{color:#f5f5f5}.help.is-dark{color:#363636}.help.is-primary{color:#00d1b2}.help.is-link{color:#3273dc}.help.is-info{color:#3298dc}.help.is-success{color:#48c774}.help.is-warning{color:#ffdd57}.help.is-danger{color:#f14668}.field:not(:last-child){margin-bottom:.75rem}.field.has-addons{display:flex;justify-content:flex-start}.field.has-addons .control:not(:last-child){margin-right:-1px}.field.has-addons .control:not(:first-child):not(:last-child) .button,.field.has-addons 
.control:not(:first-child):not(:last-child) .input,.field.has-addons .control:not(:first-child):not(:last-child) .select select{border-radius:0}.field.has-addons .control:first-child:not(:only-child) .button,.field.has-addons .control:first-child:not(:only-child) .input,.field.has-addons .control:first-child:not(:only-child) .select select{border-bottom-right-radius:0;border-top-right-radius:0}.field.has-addons .control:last-child:not(:only-child) .button,.field.has-addons .control:last-child:not(:only-child) .input,.field.has-addons .control:last-child:not(:only-child) .select select{border-bottom-left-radius:0;border-top-left-radius:0}.field.has-addons .control .button:not([disabled]).is-hovered,.field.has-addons .control .button:not([disabled]):hover,.field.has-addons .control .input:not([disabled]).is-hovered,.field.has-addons .control .input:not([disabled]):hover,.field.has-addons .control .select select:not([disabled]).is-hovered,.field.has-addons .control .select select:not([disabled]):hover{z-index:2}.field.has-addons .control .button:not([disabled]).is-active,.field.has-addons .control .button:not([disabled]).is-focused,.field.has-addons .control .button:not([disabled]):active,.field.has-addons .control .button:not([disabled]):focus,.field.has-addons .control .input:not([disabled]).is-active,.field.has-addons .control .input:not([disabled]).is-focused,.field.has-addons .control .input:not([disabled]):active,.field.has-addons .control .input:not([disabled]):focus,.field.has-addons .control .select select:not([disabled]).is-active,.field.has-addons .control .select select:not([disabled]).is-focused,.field.has-addons .control .select select:not([disabled]):active,.field.has-addons .control .select select:not([disabled]):focus{z-index:3}.field.has-addons .control .button:not([disabled]).is-active:hover,.field.has-addons .control .button:not([disabled]).is-focused:hover,.field.has-addons .control .button:not([disabled]):active:hover,.field.has-addons .control .button:not([disabled]):focus:hover,.field.has-addons .control .input:not([disabled]).is-active:hover,.field.has-addons .control .input:not([disabled]).is-focused:hover,.field.has-addons .control .input:not([disabled]):active:hover,.field.has-addons .control .input:not([disabled]):focus:hover,.field.has-addons .control .select select:not([disabled]).is-active:hover,.field.has-addons .control .select select:not([disabled]).is-focused:hover,.field.has-addons .control .select select:not([disabled]):active:hover,.field.has-addons .control .select select:not([disabled]):focus:hover{z-index:4}.field.has-addons .control.is-expanded{flex-grow:1;flex-shrink:1}.field.has-addons.has-addons-centered{justify-content:center}.field.has-addons.has-addons-right{justify-content:flex-end}.field.has-addons.has-addons-fullwidth 
.control{flex-grow:1;flex-shrink:0}.field.is-grouped{display:flex;justify-content:flex-start}.field.is-grouped>.control{flex-shrink:0}.field.is-grouped>.control:not(:last-child){margin-bottom:0;margin-right:.75rem}.field.is-grouped>.control.is-expanded{flex-grow:1;flex-shrink:1}.field.is-grouped.is-grouped-centered{justify-content:center}.field.is-grouped.is-grouped-right{justify-content:flex-end}.field.is-grouped.is-grouped-multiline{flex-wrap:wrap}.field.is-grouped.is-grouped-multiline>.control:last-child,.field.is-grouped.is-grouped-multiline>.control:not(:last-child){margin-bottom:.75rem}.field.is-grouped.is-grouped-multiline:last-child{margin-bottom:-.75rem}.field.is-grouped.is-grouped-multiline:not(:last-child){margin-bottom:0}@media screen and (min-width:769px),print{.field.is-horizontal{display:flex}}.field-label .label{font-size:inherit}@media screen and (max-width:768px){.field-label{margin-bottom:.5rem}}@media screen and (min-width:769px),print{.field-label{flex-basis:0;flex-grow:1;flex-shrink:0;margin-right:1.5rem;text-align:right}.field-label.is-small{font-size:.75rem;padding-top:.375em}.field-label.is-normal{padding-top:.375em}.field-label.is-medium{font-size:1.25rem;padding-top:.375em}.field-label.is-large{font-size:1.5rem;padding-top:.375em}}.field-body .field .field{margin-bottom:0}@media screen and (min-width:769px),print{.field-body{display:flex;flex-basis:0;flex-grow:5;flex-shrink:1}.field-body .field{margin-bottom:0}.field-body>.field{flex-shrink:1}.field-body>.field:not(.is-narrow){flex-grow:1}.field-body>.field:not(:last-child){margin-right:.75rem}}.control{box-sizing:border-box;clear:both;font-size:1rem;position:relative;text-align:left}.control.has-icons-left .input:focus~.icon,.control.has-icons-left .select:focus~.icon,.control.has-icons-right .input:focus~.icon,.control.has-icons-right .select:focus~.icon{color:#4a4a4a}.control.has-icons-left .input.is-small~.icon,.control.has-icons-left .select.is-small~.icon,.control.has-icons-right .input.is-small~.icon,.control.has-icons-right .select.is-small~.icon{font-size:.75rem}.control.has-icons-left .input.is-medium~.icon,.control.has-icons-left .select.is-medium~.icon,.control.has-icons-right .input.is-medium~.icon,.control.has-icons-right .select.is-medium~.icon{font-size:1.25rem}.control.has-icons-left .input.is-large~.icon,.control.has-icons-left .select.is-large~.icon,.control.has-icons-right .input.is-large~.icon,.control.has-icons-right .select.is-large~.icon{font-size:1.5rem}.control.has-icons-left .icon,.control.has-icons-right .icon{color:#dbdbdb;height:2.5em;pointer-events:none;position:absolute;top:0;width:2.5em;z-index:4}.control.has-icons-left .input,.control.has-icons-left .select select{padding-left:2.5em}.control.has-icons-left .icon.is-left{left:0}.control.has-icons-right .input,.control.has-icons-right .select select{padding-right:2.5em}.control.has-icons-right .icon.is-right{right:0}.control.is-loading::after{position:absolute!important;right:.625em;top:.625em;z-index:4}.control.is-loading.is-small:after{font-size:.75rem}.control.is-loading.is-medium:after{font-size:1.25rem}.control.is-loading.is-large:after{font-size:1.5rem}.breadcrumb{font-size:1rem;white-space:nowrap}.breadcrumb a{align-items:center;color:#3273dc;display:flex;justify-content:center;padding:0 .75em}.breadcrumb a:hover{color:#363636}.breadcrumb li{align-items:center;display:flex}.breadcrumb li:first-child a{padding-left:0}.breadcrumb li.is-active a{color:#363636;cursor:default;pointer-events:none}.breadcrumb 
li+li::before{color:#b5b5b5;content:"\0002f"}.breadcrumb ol,.breadcrumb ul{align-items:flex-start;display:flex;flex-wrap:wrap;justify-content:flex-start}.breadcrumb .icon:first-child{margin-right:.5em}.breadcrumb .icon:last-child{margin-left:.5em}.breadcrumb.is-centered ol,.breadcrumb.is-centered ul{justify-content:center}.breadcrumb.is-right ol,.breadcrumb.is-right ul{justify-content:flex-end}.breadcrumb.is-small{font-size:.75rem}.breadcrumb.is-medium{font-size:1.25rem}.breadcrumb.is-large{font-size:1.5rem}.breadcrumb.has-arrow-separator li+li::before{content:"\02192"}.breadcrumb.has-bullet-separator li+li::before{content:"\02022"}.breadcrumb.has-dot-separator li+li::before{content:"\000b7"}.breadcrumb.has-succeeds-separator li+li::before{content:"\0227B"}.card{background-color:#fff;box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.02);color:#4a4a4a;max-width:100%;position:relative}.card-header{background-color:transparent;align-items:stretch;box-shadow:0 .125em .25em rgba(10,10,10,.1);display:flex}.card-header-title{align-items:center;color:#363636;display:flex;flex-grow:1;font-weight:700;padding:.75rem 1rem}.card-header-title.is-centered{justify-content:center}.card-header-icon{align-items:center;cursor:pointer;display:flex;justify-content:center;padding:.75rem 1rem}.card-image{display:block;position:relative}.card-content{background-color:transparent;padding:1.5rem}.card-footer{background-color:transparent;border-top:1px solid #ededed;align-items:stretch;display:flex}.card-footer-item{align-items:center;display:flex;flex-basis:0;flex-grow:1;flex-shrink:0;justify-content:center;padding:.75rem}.card-footer-item:not(:last-child){border-right:1px solid #ededed}.card .media:not(:last-child){margin-bottom:1.5rem}.dropdown{display:inline-flex;position:relative;vertical-align:top}.dropdown.is-active .dropdown-menu,.dropdown.is-hoverable:hover .dropdown-menu{display:block}.dropdown.is-right .dropdown-menu{left:auto;right:0}.dropdown.is-up .dropdown-menu{bottom:100%;padding-bottom:4px;padding-top:initial;top:auto}.dropdown-menu{display:none;left:0;min-width:12rem;padding-top:4px;position:absolute;top:100%;z-index:20}.dropdown-content{background-color:#fff;border-radius:4px;box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.02);padding-bottom:.5rem;padding-top:.5rem}.dropdown-item{color:#4a4a4a;display:block;font-size:.875rem;line-height:1.5;padding:.375rem 1rem;position:relative}a.dropdown-item,button.dropdown-item{padding-right:3rem;text-align:left;white-space:nowrap;width:100%}a.dropdown-item:hover,button.dropdown-item:hover{background-color:#f5f5f5;color:#0a0a0a}a.dropdown-item.is-active,button.dropdown-item.is-active{background-color:#3273dc;color:#fff}.dropdown-divider{background-color:#ededed;border:none;display:block;height:1px;margin:.5rem 0}.level{align-items:center;justify-content:space-between}.level code{border-radius:4px}.level img{display:inline-block;vertical-align:top}.level.is-mobile{display:flex}.level.is-mobile .level-left,.level.is-mobile .level-right{display:flex}.level.is-mobile .level-left+.level-right{margin-top:0}.level.is-mobile .level-item:not(:last-child){margin-bottom:0;margin-right:.75rem}.level.is-mobile .level-item:not(.is-narrow){flex-grow:1}@media screen and (min-width:769px),print{.level{display:flex}.level>.level-item:not(.is-narrow){flex-grow:1}}.level-item{align-items:center;display:flex;flex-basis:auto;flex-grow:0;flex-shrink:0;justify-content:center}.level-item .subtitle,.level-item .title{margin-bottom:0}@media 
screen and (max-width:768px){.level-item:not(:last-child){margin-bottom:.75rem}}.level-left,.level-right{flex-basis:auto;flex-grow:0;flex-shrink:0}.level-left .level-item.is-flexible,.level-right .level-item.is-flexible{flex-grow:1}@media screen and (min-width:769px),print{.level-left .level-item:not(:last-child),.level-right .level-item:not(:last-child){margin-right:.75rem}}.level-left{align-items:center;justify-content:flex-start}@media screen and (max-width:768px){.level-left+.level-right{margin-top:1.5rem}}@media screen and (min-width:769px),print{.level-left{display:flex}}.level-right{align-items:center;justify-content:flex-end}@media screen and (min-width:769px),print{.level-right{display:flex}}.list{background-color:#fff;border-radius:4px;box-shadow:0 2px 3px rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.1)}.list-item{display:block;padding:.5em 1em}.list-item:not(a){color:#4a4a4a}.list-item:first-child{border-top-left-radius:4px;border-top-right-radius:4px}.list-item:last-child{border-bottom-left-radius:4px;border-bottom-right-radius:4px}.list-item:not(:last-child){border-bottom:1px solid #dbdbdb}.list-item.is-active{background-color:#3273dc;color:#fff}a.list-item{background-color:#f5f5f5;cursor:pointer}.media{align-items:flex-start;display:flex;text-align:left}.media .content:not(:last-child){margin-bottom:.75rem}.media .media{border-top:1px solid rgba(219,219,219,.5);display:flex;padding-top:.75rem}.media .media .content:not(:last-child),.media .media .control:not(:last-child){margin-bottom:.5rem}.media .media .media{padding-top:.5rem}.media .media .media+.media{margin-top:.5rem}.media+.media{border-top:1px solid rgba(219,219,219,.5);margin-top:1rem;padding-top:1rem}.media.is-large+.media{margin-top:1.5rem;padding-top:1.5rem}.media-left,.media-right{flex-basis:auto;flex-grow:0;flex-shrink:0}.media-left{margin-right:1rem}.media-right{margin-left:1rem}.media-content{flex-basis:auto;flex-grow:1;flex-shrink:1;text-align:left}@media screen and (max-width:768px){.media-content{overflow-x:auto}}.menu{font-size:1rem}.menu.is-small{font-size:.75rem}.menu.is-medium{font-size:1.25rem}.menu.is-large{font-size:1.5rem}.menu-list{line-height:1.25}.menu-list a{border-radius:2px;color:#4a4a4a;display:block;padding:.5em .75em}.menu-list a:hover{background-color:#f5f5f5;color:#363636}.menu-list a.is-active{background-color:#3273dc;color:#fff}.menu-list li ul{border-left:1px solid #dbdbdb;margin:.75em;padding-left:.75em}.menu-label{color:#7a7a7a;font-size:.75em;letter-spacing:.1em;text-transform:uppercase}.menu-label:not(:first-child){margin-top:1em}.menu-label:not(:last-child){margin-bottom:1em}.message{background-color:#f5f5f5;border-radius:4px;font-size:1rem}.message strong{color:currentColor}.message a:not(.button):not(.tag):not(.dropdown-item){color:currentColor;text-decoration:underline}.message.is-small{font-size:.75rem}.message.is-medium{font-size:1.25rem}.message.is-large{font-size:1.5rem}.message.is-white{background-color:#fff}.message.is-white .message-header{background-color:#fff;color:#0a0a0a}.message.is-white .message-body{border-color:#fff}.message.is-black{background-color:#fafafa}.message.is-black .message-header{background-color:#0a0a0a;color:#fff}.message.is-black .message-body{border-color:#0a0a0a}.message.is-light{background-color:#fafafa}.message.is-light .message-header{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.message.is-light .message-body{border-color:#f5f5f5}.message.is-dark{background-color:#fafafa}.message.is-dark 
.message-header{background-color:#363636;color:#fff}.message.is-dark .message-body{border-color:#363636}.message.is-primary{background-color:#ebfffc}.message.is-primary .message-header{background-color:#00d1b2;color:#fff}.message.is-primary .message-body{border-color:#00d1b2;color:#00947e}.message.is-link{background-color:#eef3fc}.message.is-link .message-header{background-color:#3273dc;color:#fff}.message.is-link .message-body{border-color:#3273dc;color:#2160c4}.message.is-info{background-color:#eef6fc}.message.is-info .message-header{background-color:#3298dc;color:#fff}.message.is-info .message-body{border-color:#3298dc;color:#1d72aa}.message.is-success{background-color:#effaf3}.message.is-success .message-header{background-color:#48c774;color:#fff}.message.is-success .message-body{border-color:#48c774;color:#257942}.message.is-warning{background-color:#fffbeb}.message.is-warning .message-header{background-color:#ffdd57;color:rgba(0,0,0,.7)}.message.is-warning .message-body{border-color:#ffdd57;color:#947600}.message.is-danger{background-color:#feecf0}.message.is-danger .message-header{background-color:#f14668;color:#fff}.message.is-danger .message-body{border-color:#f14668;color:#cc0f35}.message-header{align-items:center;background-color:#4a4a4a;border-radius:4px 4px 0 0;color:#fff;display:flex;font-weight:700;justify-content:space-between;line-height:1.25;padding:.75em 1em;position:relative}.message-header .delete{flex-grow:0;flex-shrink:0;margin-left:.75em}.message-header+.message-body{border-width:0;border-top-left-radius:0;border-top-right-radius:0}.message-body{border-color:#dbdbdb;border-radius:4px;border-style:solid;border-width:0 0 0 4px;color:#4a4a4a;padding:1.25em 1.5em}.message-body code,.message-body pre{background-color:#fff}.message-body pre code{background-color:transparent}.modal{align-items:center;display:none;flex-direction:column;justify-content:center;overflow:hidden;position:fixed;z-index:40}.modal.is-active{display:flex}.modal-background{background-color:rgba(10,10,10,.86)}.modal-card,.modal-content{margin:0 20px;max-height:calc(100vh - 160px);overflow:auto;position:relative;width:100%}@media screen and (min-width:769px),print{.modal-card,.modal-content{margin:0 auto;max-height:calc(100vh - 40px);width:640px}}.modal-close{background:0 0;height:40px;position:fixed;right:20px;top:20px;width:40px}.modal-card{display:flex;flex-direction:column;max-height:calc(100vh - 40px);overflow:hidden;-ms-overflow-y:visible}.modal-card-foot,.modal-card-head{align-items:center;background-color:#f5f5f5;display:flex;flex-shrink:0;justify-content:flex-start;padding:20px;position:relative}.modal-card-head{border-bottom:1px solid #dbdbdb;border-top-left-radius:6px;border-top-right-radius:6px}.modal-card-title{color:#363636;flex-grow:1;flex-shrink:0;font-size:1.5rem;line-height:1}.modal-card-foot{border-bottom-left-radius:6px;border-bottom-right-radius:6px;border-top:1px solid #dbdbdb}.modal-card-foot .button:not(:last-child){margin-right:.5em}.modal-card-body{-webkit-overflow-scrolling:touch;background-color:#fff;flex-grow:1;flex-shrink:1;overflow:auto;padding:20px}.navbar{background-color:#fff;min-height:3.25rem;position:relative;z-index:30}.navbar.is-white{background-color:#fff;color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link,.navbar.is-white .navbar-brand>.navbar-item{color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link.is-active,.navbar.is-white .navbar-brand .navbar-link:focus,.navbar.is-white .navbar-brand .navbar-link:hover,.navbar.is-white 
.navbar-brand>a.navbar-item.is-active,.navbar.is-white .navbar-brand>a.navbar-item:focus,.navbar.is-white .navbar-brand>a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link::after{border-color:#0a0a0a}.navbar.is-white .navbar-burger{color:#0a0a0a}@media screen and (min-width:1024px){.navbar.is-white .navbar-end .navbar-link,.navbar.is-white .navbar-end>.navbar-item,.navbar.is-white .navbar-start .navbar-link,.navbar.is-white .navbar-start>.navbar-item{color:#0a0a0a}.navbar.is-white .navbar-end .navbar-link.is-active,.navbar.is-white .navbar-end .navbar-link:focus,.navbar.is-white .navbar-end .navbar-link:hover,.navbar.is-white .navbar-end>a.navbar-item.is-active,.navbar.is-white .navbar-end>a.navbar-item:focus,.navbar.is-white .navbar-end>a.navbar-item:hover,.navbar.is-white .navbar-start .navbar-link.is-active,.navbar.is-white .navbar-start .navbar-link:focus,.navbar.is-white .navbar-start .navbar-link:hover,.navbar.is-white .navbar-start>a.navbar-item.is-active,.navbar.is-white .navbar-start>a.navbar-item:focus,.navbar.is-white .navbar-start>a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white .navbar-end .navbar-link::after,.navbar.is-white .navbar-start .navbar-link::after{border-color:#0a0a0a}.navbar.is-white .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-white .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-white .navbar-item.has-dropdown:hover .navbar-link{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white .navbar-dropdown a.navbar-item.is-active{background-color:#fff;color:#0a0a0a}}.navbar.is-black{background-color:#0a0a0a;color:#fff}.navbar.is-black .navbar-brand .navbar-link,.navbar.is-black .navbar-brand>.navbar-item{color:#fff}.navbar.is-black .navbar-brand .navbar-link.is-active,.navbar.is-black .navbar-brand .navbar-link:focus,.navbar.is-black .navbar-brand .navbar-link:hover,.navbar.is-black .navbar-brand>a.navbar-item.is-active,.navbar.is-black .navbar-brand>a.navbar-item:focus,.navbar.is-black .navbar-brand>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-black .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-black .navbar-end .navbar-link,.navbar.is-black .navbar-end>.navbar-item,.navbar.is-black .navbar-start .navbar-link,.navbar.is-black .navbar-start>.navbar-item{color:#fff}.navbar.is-black .navbar-end .navbar-link.is-active,.navbar.is-black .navbar-end .navbar-link:focus,.navbar.is-black .navbar-end .navbar-link:hover,.navbar.is-black .navbar-end>a.navbar-item.is-active,.navbar.is-black .navbar-end>a.navbar-item:focus,.navbar.is-black .navbar-end>a.navbar-item:hover,.navbar.is-black .navbar-start .navbar-link.is-active,.navbar.is-black .navbar-start .navbar-link:focus,.navbar.is-black .navbar-start .navbar-link:hover,.navbar.is-black .navbar-start>a.navbar-item.is-active,.navbar.is-black .navbar-start>a.navbar-item:focus,.navbar.is-black .navbar-start>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black .navbar-end .navbar-link::after,.navbar.is-black .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-black .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-black .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-black .navbar-item.has-dropdown:hover .navbar-link{background-color:#000;color:#fff}.navbar.is-black .navbar-dropdown 
a.navbar-item.is-active{background-color:#0a0a0a;color:#fff}}.navbar.is-light{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.navbar.is-light .navbar-brand .navbar-link,.navbar.is-light .navbar-brand>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-light .navbar-brand .navbar-link.is-active,.navbar.is-light .navbar-brand .navbar-link:focus,.navbar.is-light .navbar-brand .navbar-link:hover,.navbar.is-light .navbar-brand>a.navbar-item.is-active,.navbar.is-light .navbar-brand>a.navbar-item:focus,.navbar.is-light .navbar-brand>a.navbar-item:hover{background-color:#e8e8e8;color:rgba(0,0,0,.7)}.navbar.is-light .navbar-brand .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-light .navbar-burger{color:rgba(0,0,0,.7)}@media screen and (min-width:1024px){.navbar.is-light .navbar-end .navbar-link,.navbar.is-light .navbar-end>.navbar-item,.navbar.is-light .navbar-start .navbar-link,.navbar.is-light .navbar-start>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-light .navbar-end .navbar-link.is-active,.navbar.is-light .navbar-end .navbar-link:focus,.navbar.is-light .navbar-end .navbar-link:hover,.navbar.is-light .navbar-end>a.navbar-item.is-active,.navbar.is-light .navbar-end>a.navbar-item:focus,.navbar.is-light .navbar-end>a.navbar-item:hover,.navbar.is-light .navbar-start .navbar-link.is-active,.navbar.is-light .navbar-start .navbar-link:focus,.navbar.is-light .navbar-start .navbar-link:hover,.navbar.is-light .navbar-start>a.navbar-item.is-active,.navbar.is-light .navbar-start>a.navbar-item:focus,.navbar.is-light .navbar-start>a.navbar-item:hover{background-color:#e8e8e8;color:rgba(0,0,0,.7)}.navbar.is-light .navbar-end .navbar-link::after,.navbar.is-light .navbar-start .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-light .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-light .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-light .navbar-item.has-dropdown:hover .navbar-link{background-color:#e8e8e8;color:rgba(0,0,0,.7)}.navbar.is-light .navbar-dropdown a.navbar-item.is-active{background-color:#f5f5f5;color:rgba(0,0,0,.7)}}.navbar.is-dark{background-color:#363636;color:#fff}.navbar.is-dark .navbar-brand .navbar-link,.navbar.is-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-dark .navbar-brand .navbar-link.is-active,.navbar.is-dark .navbar-brand .navbar-link:focus,.navbar.is-dark .navbar-brand .navbar-link:hover,.navbar.is-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-dark .navbar-brand>a.navbar-item:focus,.navbar.is-dark .navbar-brand>a.navbar-item:hover{background-color:#292929;color:#fff}.navbar.is-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-dark .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-dark .navbar-end .navbar-link,.navbar.is-dark .navbar-end>.navbar-item,.navbar.is-dark .navbar-start .navbar-link,.navbar.is-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-dark .navbar-end .navbar-link.is-active,.navbar.is-dark .navbar-end .navbar-link:focus,.navbar.is-dark .navbar-end .navbar-link:hover,.navbar.is-dark .navbar-end>a.navbar-item.is-active,.navbar.is-dark .navbar-end>a.navbar-item:focus,.navbar.is-dark .navbar-end>a.navbar-item:hover,.navbar.is-dark .navbar-start .navbar-link.is-active,.navbar.is-dark .navbar-start .navbar-link:focus,.navbar.is-dark .navbar-start .navbar-link:hover,.navbar.is-dark .navbar-start>a.navbar-item.is-active,.navbar.is-dark .navbar-start>a.navbar-item:focus,.navbar.is-dark .navbar-start>a.navbar-item:hover{background-color:#292929;color:#fff}.navbar.is-dark .navbar-end 
.navbar-link::after,.navbar.is-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-dark .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#292929;color:#fff}.navbar.is-dark .navbar-dropdown a.navbar-item.is-active{background-color:#363636;color:#fff}}.navbar.is-primary{background-color:#00d1b2;color:#fff}.navbar.is-primary .navbar-brand .navbar-link,.navbar.is-primary .navbar-brand>.navbar-item{color:#fff}.navbar.is-primary .navbar-brand .navbar-link.is-active,.navbar.is-primary .navbar-brand .navbar-link:focus,.navbar.is-primary .navbar-brand .navbar-link:hover,.navbar.is-primary .navbar-brand>a.navbar-item.is-active,.navbar.is-primary .navbar-brand>a.navbar-item:focus,.navbar.is-primary .navbar-brand>a.navbar-item:hover{background-color:#00b89c;color:#fff}.navbar.is-primary .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-primary .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-primary .navbar-end .navbar-link,.navbar.is-primary .navbar-end>.navbar-item,.navbar.is-primary .navbar-start .navbar-link,.navbar.is-primary .navbar-start>.navbar-item{color:#fff}.navbar.is-primary .navbar-end .navbar-link.is-active,.navbar.is-primary .navbar-end .navbar-link:focus,.navbar.is-primary .navbar-end .navbar-link:hover,.navbar.is-primary .navbar-end>a.navbar-item.is-active,.navbar.is-primary .navbar-end>a.navbar-item:focus,.navbar.is-primary .navbar-end>a.navbar-item:hover,.navbar.is-primary .navbar-start .navbar-link.is-active,.navbar.is-primary .navbar-start .navbar-link:focus,.navbar.is-primary .navbar-start .navbar-link:hover,.navbar.is-primary .navbar-start>a.navbar-item.is-active,.navbar.is-primary .navbar-start>a.navbar-item:focus,.navbar.is-primary .navbar-start>a.navbar-item:hover{background-color:#00b89c;color:#fff}.navbar.is-primary .navbar-end .navbar-link::after,.navbar.is-primary .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-primary .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-primary .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-primary .navbar-item.has-dropdown:hover .navbar-link{background-color:#00b89c;color:#fff}.navbar.is-primary .navbar-dropdown a.navbar-item.is-active{background-color:#00d1b2;color:#fff}}.navbar.is-link{background-color:#3273dc;color:#fff}.navbar.is-link .navbar-brand .navbar-link,.navbar.is-link .navbar-brand>.navbar-item{color:#fff}.navbar.is-link .navbar-brand .navbar-link.is-active,.navbar.is-link .navbar-brand .navbar-link:focus,.navbar.is-link .navbar-brand .navbar-link:hover,.navbar.is-link .navbar-brand>a.navbar-item.is-active,.navbar.is-link .navbar-brand>a.navbar-item:focus,.navbar.is-link .navbar-brand>a.navbar-item:hover{background-color:#2366d1;color:#fff}.navbar.is-link .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-link .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-link .navbar-end .navbar-link,.navbar.is-link .navbar-end>.navbar-item,.navbar.is-link .navbar-start .navbar-link,.navbar.is-link .navbar-start>.navbar-item{color:#fff}.navbar.is-link .navbar-end .navbar-link.is-active,.navbar.is-link .navbar-end .navbar-link:focus,.navbar.is-link .navbar-end .navbar-link:hover,.navbar.is-link .navbar-end>a.navbar-item.is-active,.navbar.is-link .navbar-end>a.navbar-item:focus,.navbar.is-link .navbar-end>a.navbar-item:hover,.navbar.is-link .navbar-start 
.navbar-link.is-active,.navbar.is-link .navbar-start .navbar-link:focus,.navbar.is-link .navbar-start .navbar-link:hover,.navbar.is-link .navbar-start>a.navbar-item.is-active,.navbar.is-link .navbar-start>a.navbar-item:focus,.navbar.is-link .navbar-start>a.navbar-item:hover{background-color:#2366d1;color:#fff}.navbar.is-link .navbar-end .navbar-link::after,.navbar.is-link .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-link .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-link .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-link .navbar-item.has-dropdown:hover .navbar-link{background-color:#2366d1;color:#fff}.navbar.is-link .navbar-dropdown a.navbar-item.is-active{background-color:#3273dc;color:#fff}}.navbar.is-info{background-color:#3298dc;color:#fff}.navbar.is-info .navbar-brand .navbar-link,.navbar.is-info .navbar-brand>.navbar-item{color:#fff}.navbar.is-info .navbar-brand .navbar-link.is-active,.navbar.is-info .navbar-brand .navbar-link:focus,.navbar.is-info .navbar-brand .navbar-link:hover,.navbar.is-info .navbar-brand>a.navbar-item.is-active,.navbar.is-info .navbar-brand>a.navbar-item:focus,.navbar.is-info .navbar-brand>a.navbar-item:hover{background-color:#238cd1;color:#fff}.navbar.is-info .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-info .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-info .navbar-end .navbar-link,.navbar.is-info .navbar-end>.navbar-item,.navbar.is-info .navbar-start .navbar-link,.navbar.is-info .navbar-start>.navbar-item{color:#fff}.navbar.is-info .navbar-end .navbar-link.is-active,.navbar.is-info .navbar-end .navbar-link:focus,.navbar.is-info .navbar-end .navbar-link:hover,.navbar.is-info .navbar-end>a.navbar-item.is-active,.navbar.is-info .navbar-end>a.navbar-item:focus,.navbar.is-info .navbar-end>a.navbar-item:hover,.navbar.is-info .navbar-start .navbar-link.is-active,.navbar.is-info .navbar-start .navbar-link:focus,.navbar.is-info .navbar-start .navbar-link:hover,.navbar.is-info .navbar-start>a.navbar-item.is-active,.navbar.is-info .navbar-start>a.navbar-item:focus,.navbar.is-info .navbar-start>a.navbar-item:hover{background-color:#238cd1;color:#fff}.navbar.is-info .navbar-end .navbar-link::after,.navbar.is-info .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-info .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-info .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-info .navbar-item.has-dropdown:hover .navbar-link{background-color:#238cd1;color:#fff}.navbar.is-info .navbar-dropdown a.navbar-item.is-active{background-color:#3298dc;color:#fff}}.navbar.is-success{background-color:#48c774;color:#fff}.navbar.is-success .navbar-brand .navbar-link,.navbar.is-success .navbar-brand>.navbar-item{color:#fff}.navbar.is-success .navbar-brand .navbar-link.is-active,.navbar.is-success .navbar-brand .navbar-link:focus,.navbar.is-success .navbar-brand .navbar-link:hover,.navbar.is-success .navbar-brand>a.navbar-item.is-active,.navbar.is-success .navbar-brand>a.navbar-item:focus,.navbar.is-success .navbar-brand>a.navbar-item:hover{background-color:#3abb67;color:#fff}.navbar.is-success .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-success .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-success .navbar-end .navbar-link,.navbar.is-success .navbar-end>.navbar-item,.navbar.is-success .navbar-start .navbar-link,.navbar.is-success .navbar-start>.navbar-item{color:#fff}.navbar.is-success .navbar-end 
.navbar-link.is-active,.navbar.is-success .navbar-end .navbar-link:focus,.navbar.is-success .navbar-end .navbar-link:hover,.navbar.is-success .navbar-end>a.navbar-item.is-active,.navbar.is-success .navbar-end>a.navbar-item:focus,.navbar.is-success .navbar-end>a.navbar-item:hover,.navbar.is-success .navbar-start .navbar-link.is-active,.navbar.is-success .navbar-start .navbar-link:focus,.navbar.is-success .navbar-start .navbar-link:hover,.navbar.is-success .navbar-start>a.navbar-item.is-active,.navbar.is-success .navbar-start>a.navbar-item:focus,.navbar.is-success .navbar-start>a.navbar-item:hover{background-color:#3abb67;color:#fff}.navbar.is-success .navbar-end .navbar-link::after,.navbar.is-success .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-success .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-success .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-success .navbar-item.has-dropdown:hover .navbar-link{background-color:#3abb67;color:#fff}.navbar.is-success .navbar-dropdown a.navbar-item.is-active{background-color:#48c774;color:#fff}}.navbar.is-warning{background-color:#ffdd57;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link,.navbar.is-warning .navbar-brand>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link.is-active,.navbar.is-warning .navbar-brand .navbar-link:focus,.navbar.is-warning .navbar-brand .navbar-link:hover,.navbar.is-warning .navbar-brand>a.navbar-item.is-active,.navbar.is-warning .navbar-brand>a.navbar-item:focus,.navbar.is-warning .navbar-brand>a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-burger{color:rgba(0,0,0,.7)}@media screen and (min-width:1024px){.navbar.is-warning .navbar-end .navbar-link,.navbar.is-warning .navbar-end>.navbar-item,.navbar.is-warning .navbar-start .navbar-link,.navbar.is-warning .navbar-start>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-end .navbar-link.is-active,.navbar.is-warning .navbar-end .navbar-link:focus,.navbar.is-warning .navbar-end .navbar-link:hover,.navbar.is-warning .navbar-end>a.navbar-item.is-active,.navbar.is-warning .navbar-end>a.navbar-item:focus,.navbar.is-warning .navbar-end>a.navbar-item:hover,.navbar.is-warning .navbar-start .navbar-link.is-active,.navbar.is-warning .navbar-start .navbar-link:focus,.navbar.is-warning .navbar-start .navbar-link:hover,.navbar.is-warning .navbar-start>a.navbar-item.is-active,.navbar.is-warning .navbar-start>a.navbar-item:focus,.navbar.is-warning .navbar-start>a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-end .navbar-link::after,.navbar.is-warning .navbar-start .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-warning .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-warning .navbar-item.has-dropdown:hover .navbar-link{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-dropdown a.navbar-item.is-active{background-color:#ffdd57;color:rgba(0,0,0,.7)}}.navbar.is-danger{background-color:#f14668;color:#fff}.navbar.is-danger .navbar-brand .navbar-link,.navbar.is-danger .navbar-brand>.navbar-item{color:#fff}.navbar.is-danger .navbar-brand .navbar-link.is-active,.navbar.is-danger .navbar-brand .navbar-link:focus,.navbar.is-danger .navbar-brand .navbar-link:hover,.navbar.is-danger 
.navbar-brand>a.navbar-item.is-active,.navbar.is-danger .navbar-brand>a.navbar-item:focus,.navbar.is-danger .navbar-brand>a.navbar-item:hover{background-color:#ef2e55;color:#fff}.navbar.is-danger .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-danger .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-danger .navbar-end .navbar-link,.navbar.is-danger .navbar-end>.navbar-item,.navbar.is-danger .navbar-start .navbar-link,.navbar.is-danger .navbar-start>.navbar-item{color:#fff}.navbar.is-danger .navbar-end .navbar-link.is-active,.navbar.is-danger .navbar-end .navbar-link:focus,.navbar.is-danger .navbar-end .navbar-link:hover,.navbar.is-danger .navbar-end>a.navbar-item.is-active,.navbar.is-danger .navbar-end>a.navbar-item:focus,.navbar.is-danger .navbar-end>a.navbar-item:hover,.navbar.is-danger .navbar-start .navbar-link.is-active,.navbar.is-danger .navbar-start .navbar-link:focus,.navbar.is-danger .navbar-start .navbar-link:hover,.navbar.is-danger .navbar-start>a.navbar-item.is-active,.navbar.is-danger .navbar-start>a.navbar-item:focus,.navbar.is-danger .navbar-start>a.navbar-item:hover{background-color:#ef2e55;color:#fff}.navbar.is-danger .navbar-end .navbar-link::after,.navbar.is-danger .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-danger .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-danger .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-danger .navbar-item.has-dropdown:hover .navbar-link{background-color:#ef2e55;color:#fff}.navbar.is-danger .navbar-dropdown a.navbar-item.is-active{background-color:#f14668;color:#fff}}.navbar>.container{align-items:stretch;display:flex;min-height:3.25rem;width:100%}.navbar.has-shadow{box-shadow:0 2px 0 0 #f5f5f5}.navbar.is-fixed-bottom,.navbar.is-fixed-top{left:0;position:fixed;right:0;z-index:30}.navbar.is-fixed-bottom{bottom:0}.navbar.is-fixed-bottom.has-shadow{box-shadow:0 -2px 0 0 #f5f5f5}.navbar.is-fixed-top{top:0}body.has-navbar-fixed-top,html.has-navbar-fixed-top{padding-top:3.25rem}body.has-navbar-fixed-bottom,html.has-navbar-fixed-bottom{padding-bottom:3.25rem}.navbar-brand,.navbar-tabs{align-items:stretch;display:flex;flex-shrink:0;min-height:3.25rem}.navbar-brand a.navbar-item:focus,.navbar-brand a.navbar-item:hover{background-color:transparent}.navbar-tabs{-webkit-overflow-scrolling:touch;max-width:100vw;overflow-x:auto;overflow-y:hidden}.navbar-burger{color:#4a4a4a;cursor:pointer;display:block;height:3.25rem;position:relative;width:3.25rem;margin-left:auto}.navbar-burger span{background-color:currentColor;display:block;height:1px;left:calc(50% - 8px);position:absolute;transform-origin:center;transition-duration:86ms;transition-property:background-color,opacity,transform;transition-timing-function:ease-out;width:16px}.navbar-burger span:nth-child(1){top:calc(50% - 6px)}.navbar-burger span:nth-child(2){top:calc(50% - 1px)}.navbar-burger span:nth-child(3){top:calc(50% + 4px)}.navbar-burger:hover{background-color:rgba(0,0,0,.05)}.navbar-burger.is-active span:nth-child(1){transform:translateY(5px) rotate(45deg)}.navbar-burger.is-active span:nth-child(2){opacity:0}.navbar-burger.is-active span:nth-child(3){transform:translateY(-5px) rotate(-45deg)}.navbar-menu{display:none}.navbar-item,.navbar-link{color:#4a4a4a;display:block;line-height:1.5;padding:.5rem .75rem;position:relative}.navbar-item .icon:only-child,.navbar-link 
.icon:only-child{margin-left:-.25rem;margin-right:-.25rem}.navbar-link,a.navbar-item{cursor:pointer}.navbar-link.is-active,.navbar-link:focus,.navbar-link:focus-within,.navbar-link:hover,a.navbar-item.is-active,a.navbar-item:focus,a.navbar-item:focus-within,a.navbar-item:hover{background-color:#fafafa;color:#3273dc}.navbar-item{display:block;flex-grow:0;flex-shrink:0}.navbar-item img{max-height:1.75rem}.navbar-item.has-dropdown{padding:0}.navbar-item.is-expanded{flex-grow:1;flex-shrink:1}.navbar-item.is-tab{border-bottom:1px solid transparent;min-height:3.25rem;padding-bottom:calc(.5rem - 1px)}.navbar-item.is-tab:focus,.navbar-item.is-tab:hover{background-color:transparent;border-bottom-color:#3273dc}.navbar-item.is-tab.is-active{background-color:transparent;border-bottom-color:#3273dc;border-bottom-style:solid;border-bottom-width:3px;color:#3273dc;padding-bottom:calc(.5rem - 3px)}.navbar-content{flex-grow:1;flex-shrink:1}.navbar-link:not(.is-arrowless){padding-right:2.5em}.navbar-link:not(.is-arrowless)::after{border-color:#3273dc;margin-top:-.375em;right:1.125em}.navbar-dropdown{font-size:.875rem;padding-bottom:.5rem;padding-top:.5rem}.navbar-dropdown .navbar-item{padding-left:1.5rem;padding-right:1.5rem}.navbar-divider{background-color:#f5f5f5;border:none;display:none;height:2px;margin:.5rem 0}@media screen and (max-width:1023px){.navbar>.container{display:block}.navbar-brand .navbar-item,.navbar-tabs .navbar-item{align-items:center;display:flex}.navbar-link::after{display:none}.navbar-menu{background-color:#fff;box-shadow:0 8px 16px rgba(10,10,10,.1);padding:.5rem 0}.navbar-menu.is-active{display:block}.navbar.is-fixed-bottom-touch,.navbar.is-fixed-top-touch{left:0;position:fixed;right:0;z-index:30}.navbar.is-fixed-bottom-touch{bottom:0}.navbar.is-fixed-bottom-touch.has-shadow{box-shadow:0 -2px 3px rgba(10,10,10,.1)}.navbar.is-fixed-top-touch{top:0}.navbar.is-fixed-top .navbar-menu,.navbar.is-fixed-top-touch .navbar-menu{-webkit-overflow-scrolling:touch;max-height:calc(100vh - 3.25rem);overflow:auto}body.has-navbar-fixed-top-touch,html.has-navbar-fixed-top-touch{padding-top:3.25rem}body.has-navbar-fixed-bottom-touch,html.has-navbar-fixed-bottom-touch{padding-bottom:3.25rem}}@media screen and (min-width:1024px){.navbar,.navbar-end,.navbar-menu,.navbar-start{align-items:stretch;display:flex}.navbar{min-height:3.25rem}.navbar.is-spaced{padding:1rem 2rem}.navbar.is-spaced .navbar-end,.navbar.is-spaced .navbar-start{align-items:center}.navbar.is-spaced .navbar-link,.navbar.is-spaced a.navbar-item{border-radius:4px}.navbar.is-transparent .navbar-link.is-active,.navbar.is-transparent .navbar-link:focus,.navbar.is-transparent .navbar-link:hover,.navbar.is-transparent a.navbar-item.is-active,.navbar.is-transparent a.navbar-item:focus,.navbar.is-transparent a.navbar-item:hover{background-color:transparent!important}.navbar.is-transparent .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-transparent .navbar-item.has-dropdown.is-hoverable:focus .navbar-link,.navbar.is-transparent .navbar-item.has-dropdown.is-hoverable:focus-within .navbar-link,.navbar.is-transparent .navbar-item.has-dropdown.is-hoverable:hover .navbar-link{background-color:transparent!important}.navbar.is-transparent .navbar-dropdown a.navbar-item:focus,.navbar.is-transparent .navbar-dropdown a.navbar-item:hover{background-color:#f5f5f5;color:#0a0a0a}.navbar.is-transparent .navbar-dropdown 
a.navbar-item.is-active{background-color:#f5f5f5;color:#3273dc}.navbar-burger{display:none}.navbar-item,.navbar-link{align-items:center;display:flex}.navbar-item{display:flex}.navbar-item.has-dropdown{align-items:stretch}.navbar-item.has-dropdown-up .navbar-link::after{transform:rotate(135deg) translate(.25em,-.25em)}.navbar-item.has-dropdown-up .navbar-dropdown{border-bottom:2px solid #dbdbdb;border-radius:6px 6px 0 0;border-top:none;bottom:100%;box-shadow:0 -8px 8px rgba(10,10,10,.1);top:auto}.navbar-item.is-active .navbar-dropdown,.navbar-item.is-hoverable:focus .navbar-dropdown,.navbar-item.is-hoverable:focus-within .navbar-dropdown,.navbar-item.is-hoverable:hover .navbar-dropdown{display:block}.navbar-item.is-active .navbar-dropdown.is-boxed,.navbar-item.is-hoverable:focus .navbar-dropdown.is-boxed,.navbar-item.is-hoverable:focus-within .navbar-dropdown.is-boxed,.navbar-item.is-hoverable:hover .navbar-dropdown.is-boxed,.navbar.is-spaced .navbar-item.is-active .navbar-dropdown,.navbar.is-spaced .navbar-item.is-hoverable:focus .navbar-dropdown,.navbar.is-spaced .navbar-item.is-hoverable:focus-within .navbar-dropdown,.navbar.is-spaced .navbar-item.is-hoverable:hover .navbar-dropdown{opacity:1;pointer-events:auto;transform:translateY(0)}.navbar-menu{flex-grow:1;flex-shrink:0}.navbar-start{justify-content:flex-start;margin-right:auto}.navbar-end{justify-content:flex-end;margin-left:auto}.navbar-dropdown{background-color:#fff;border-bottom-left-radius:6px;border-bottom-right-radius:6px;border-top:2px solid #dbdbdb;box-shadow:0 8px 8px rgba(10,10,10,.1);display:none;font-size:.875rem;left:0;min-width:100%;position:absolute;top:100%;z-index:20}.navbar-dropdown .navbar-item{padding:.375rem 1rem;white-space:nowrap}.navbar-dropdown a.navbar-item{padding-right:3rem}.navbar-dropdown a.navbar-item:focus,.navbar-dropdown a.navbar-item:hover{background-color:#f5f5f5;color:#0a0a0a}.navbar-dropdown a.navbar-item.is-active{background-color:#f5f5f5;color:#3273dc}.navbar-dropdown.is-boxed,.navbar.is-spaced .navbar-dropdown{border-radius:6px;border-top:none;box-shadow:0 8px 8px rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.1);display:block;opacity:0;pointer-events:none;top:calc(100% + (-4px));transform:translateY(-5px);transition-duration:86ms;transition-property:opacity,transform}.navbar-dropdown.is-right{left:auto;right:0}.navbar-divider{display:block}.container>.navbar .navbar-brand,.navbar>.container .navbar-brand{margin-left:-.75rem}.container>.navbar .navbar-menu,.navbar>.container .navbar-menu{margin-right:-.75rem}.navbar.is-fixed-bottom-desktop,.navbar.is-fixed-top-desktop{left:0;position:fixed;right:0;z-index:30}.navbar.is-fixed-bottom-desktop{bottom:0}.navbar.is-fixed-bottom-desktop.has-shadow{box-shadow:0 -2px 3px rgba(10,10,10,.1)}.navbar.is-fixed-top-desktop{top:0}body.has-navbar-fixed-top-desktop,html.has-navbar-fixed-top-desktop{padding-top:3.25rem}body.has-navbar-fixed-bottom-desktop,html.has-navbar-fixed-bottom-desktop{padding-bottom:3.25rem}body.has-spaced-navbar-fixed-top,html.has-spaced-navbar-fixed-top{padding-top:5.25rem}body.has-spaced-navbar-fixed-bottom,html.has-spaced-navbar-fixed-bottom{padding-bottom:5.25rem}.navbar-link.is-active,a.navbar-item.is-active{color:#0a0a0a}.navbar-link.is-active:not(:focus):not(:hover),a.navbar-item.is-active:not(:focus):not(:hover){background-color:transparent}.navbar-item.has-dropdown.is-active .navbar-link,.navbar-item.has-dropdown:focus .navbar-link,.navbar-item.has-dropdown:hover 
.navbar-link{background-color:#fafafa}}.hero.is-fullheight-with-navbar{min-height:calc(100vh - 3.25rem)}.pagination{font-size:1rem;margin:-.25rem}.pagination.is-small{font-size:.75rem}.pagination.is-medium{font-size:1.25rem}.pagination.is-large{font-size:1.5rem}.pagination.is-rounded .pagination-next,.pagination.is-rounded .pagination-previous{padding-left:1em;padding-right:1em;border-radius:290486px}.pagination.is-rounded .pagination-link{border-radius:290486px}.pagination,.pagination-list{align-items:center;display:flex;justify-content:center;text-align:center}.pagination-ellipsis,.pagination-link,.pagination-next,.pagination-previous{font-size:1em;justify-content:center;margin:.25rem;padding-left:.5em;padding-right:.5em;text-align:center}.pagination-link,.pagination-next,.pagination-previous{border-color:#dbdbdb;color:#363636;min-width:2.5em}.pagination-link:hover,.pagination-next:hover,.pagination-previous:hover{border-color:#b5b5b5;color:#363636}.pagination-link:focus,.pagination-next:focus,.pagination-previous:focus{border-color:#3273dc}.pagination-link:active,.pagination-next:active,.pagination-previous:active{box-shadow:inset 0 1px 2px rgba(10,10,10,.2)}.pagination-link[disabled],.pagination-next[disabled],.pagination-previous[disabled]{background-color:#dbdbdb;border-color:#dbdbdb;box-shadow:none;color:#7a7a7a;opacity:.5}.pagination-next,.pagination-previous{padding-left:.75em;padding-right:.75em;white-space:nowrap}.pagination-link.is-current{background-color:#3273dc;border-color:#3273dc;color:#fff}.pagination-ellipsis{color:#b5b5b5;pointer-events:none}.pagination-list{flex-wrap:wrap}@media screen and (max-width:768px){.pagination{flex-wrap:wrap}.pagination-next,.pagination-previous{flex-grow:1;flex-shrink:1}.pagination-list li{flex-grow:1;flex-shrink:1}}@media screen and (min-width:769px),print{.pagination-list{flex-grow:1;flex-shrink:1;justify-content:flex-start;order:1}.pagination-previous{order:2}.pagination-next{order:3}.pagination{justify-content:space-between}.pagination.is-centered .pagination-previous{order:1}.pagination.is-centered .pagination-list{justify-content:center;order:2}.pagination.is-centered .pagination-next{order:3}.pagination.is-right .pagination-previous{order:1}.pagination.is-right .pagination-next{order:2}.pagination.is-right .pagination-list{justify-content:flex-end;order:3}}.panel{border-radius:6px;box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.02);font-size:1rem}.panel:not(:last-child){margin-bottom:1.5rem}.panel.is-white .panel-heading{background-color:#fff;color:#0a0a0a}.panel.is-white .panel-tabs a.is-active{border-bottom-color:#fff}.panel.is-white .panel-block.is-active .panel-icon{color:#fff}.panel.is-black .panel-heading{background-color:#0a0a0a;color:#fff}.panel.is-black .panel-tabs a.is-active{border-bottom-color:#0a0a0a}.panel.is-black .panel-block.is-active .panel-icon{color:#0a0a0a}.panel.is-light .panel-heading{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.panel.is-light .panel-tabs a.is-active{border-bottom-color:#f5f5f5}.panel.is-light .panel-block.is-active .panel-icon{color:#f5f5f5}.panel.is-dark .panel-heading{background-color:#363636;color:#fff}.panel.is-dark .panel-tabs a.is-active{border-bottom-color:#363636}.panel.is-dark .panel-block.is-active .panel-icon{color:#363636}.panel.is-primary .panel-heading{background-color:#00d1b2;color:#fff}.panel.is-primary .panel-tabs a.is-active{border-bottom-color:#00d1b2}.panel.is-primary .panel-block.is-active .panel-icon{color:#00d1b2}.panel.is-link 
.panel-heading{background-color:#3273dc;color:#fff}.panel.is-link .panel-tabs a.is-active{border-bottom-color:#3273dc}.panel.is-link .panel-block.is-active .panel-icon{color:#3273dc}.panel.is-info .panel-heading{background-color:#3298dc;color:#fff}.panel.is-info .panel-tabs a.is-active{border-bottom-color:#3298dc}.panel.is-info .panel-block.is-active .panel-icon{color:#3298dc}.panel.is-success .panel-heading{background-color:#48c774;color:#fff}.panel.is-success .panel-tabs a.is-active{border-bottom-color:#48c774}.panel.is-success .panel-block.is-active .panel-icon{color:#48c774}.panel.is-warning .panel-heading{background-color:#ffdd57;color:rgba(0,0,0,.7)}.panel.is-warning .panel-tabs a.is-active{border-bottom-color:#ffdd57}.panel.is-warning .panel-block.is-active .panel-icon{color:#ffdd57}.panel.is-danger .panel-heading{background-color:#f14668;color:#fff}.panel.is-danger .panel-tabs a.is-active{border-bottom-color:#f14668}.panel.is-danger .panel-block.is-active .panel-icon{color:#f14668}.panel-block:not(:last-child),.panel-tabs:not(:last-child){border-bottom:1px solid #ededed}.panel-heading{background-color:#ededed;border-radius:6px 6px 0 0;color:#363636;font-size:1.25em;font-weight:700;line-height:1.25;padding:.75em 1em}.panel-tabs{align-items:flex-end;display:flex;font-size:.875em;justify-content:center}.panel-tabs a{border-bottom:1px solid #dbdbdb;margin-bottom:-1px;padding:.5em}.panel-tabs a.is-active{border-bottom-color:#4a4a4a;color:#363636}.panel-list a{color:#4a4a4a}.panel-list a:hover{color:#3273dc}.panel-block{align-items:center;color:#363636;display:flex;justify-content:flex-start;padding:.5em .75em}.panel-block input[type=checkbox]{margin-right:.75em}.panel-block>.control{flex-grow:1;flex-shrink:1;width:100%}.panel-block.is-wrapped{flex-wrap:wrap}.panel-block.is-active{border-left-color:#3273dc;color:#363636}.panel-block.is-active .panel-icon{color:#3273dc}.panel-block:last-child{border-bottom-left-radius:6px;border-bottom-right-radius:6px}a.panel-block,label.panel-block{cursor:pointer}a.panel-block:hover,label.panel-block:hover{background-color:#f5f5f5}.panel-icon{display:inline-block;font-size:14px;height:1em;line-height:1em;text-align:center;vertical-align:top;width:1em;color:#7a7a7a;margin-right:.75em}.panel-icon .fa{font-size:inherit;line-height:inherit}.tabs{-webkit-overflow-scrolling:touch;align-items:stretch;display:flex;font-size:1rem;justify-content:space-between;overflow:hidden;overflow-x:auto;white-space:nowrap}.tabs a{align-items:center;border-bottom-color:#dbdbdb;border-bottom-style:solid;border-bottom-width:1px;color:#4a4a4a;display:flex;justify-content:center;margin-bottom:-1px;padding:.5em 1em;vertical-align:top}.tabs a:hover{border-bottom-color:#363636;color:#363636}.tabs li{display:block}.tabs li.is-active a{border-bottom-color:#3273dc;color:#3273dc}.tabs ul{align-items:center;border-bottom-color:#dbdbdb;border-bottom-style:solid;border-bottom-width:1px;display:flex;flex-grow:1;flex-shrink:0;justify-content:flex-start}.tabs ul.is-left{padding-right:.75em}.tabs ul.is-center{flex:none;justify-content:center;padding-left:.75em;padding-right:.75em}.tabs ul.is-right{justify-content:flex-end;padding-left:.75em}.tabs .icon:first-child{margin-right:.5em}.tabs .icon:last-child{margin-left:.5em}.tabs.is-centered ul{justify-content:center}.tabs.is-right ul{justify-content:flex-end}.tabs.is-boxed a{border:1px solid transparent;border-radius:4px 4px 0 0}.tabs.is-boxed a:hover{background-color:#f5f5f5;border-bottom-color:#dbdbdb}.tabs.is-boxed li.is-active 
a{background-color:#fff;border-color:#dbdbdb;border-bottom-color:transparent!important}.tabs.is-fullwidth li{flex-grow:1;flex-shrink:0}.tabs.is-toggle a{border-color:#dbdbdb;border-style:solid;border-width:1px;margin-bottom:0;position:relative}.tabs.is-toggle a:hover{background-color:#f5f5f5;border-color:#b5b5b5;z-index:2}.tabs.is-toggle li+li{margin-left:-1px}.tabs.is-toggle li:first-child a{border-radius:4px 0 0 4px}.tabs.is-toggle li:last-child a{border-radius:0 4px 4px 0}.tabs.is-toggle li.is-active a{background-color:#3273dc;border-color:#3273dc;color:#fff;z-index:1}.tabs.is-toggle ul{border-bottom:none}.tabs.is-toggle.is-toggle-rounded li:first-child a{border-bottom-left-radius:290486px;border-top-left-radius:290486px;padding-left:1.25em}.tabs.is-toggle.is-toggle-rounded li:last-child a{border-bottom-right-radius:290486px;border-top-right-radius:290486px;padding-right:1.25em}.tabs.is-small{font-size:.75rem}.tabs.is-medium{font-size:1.25rem}.tabs.is-large{font-size:1.5rem}.column{display:block;flex-basis:0;flex-grow:1;flex-shrink:1;padding:.75rem}.columns.is-mobile>.column.is-narrow{flex:none}.columns.is-mobile>.column.is-full{flex:none;width:100%}.columns.is-mobile>.column.is-three-quarters{flex:none;width:75%}.columns.is-mobile>.column.is-two-thirds{flex:none;width:66.6666%}.columns.is-mobile>.column.is-half{flex:none;width:50%}.columns.is-mobile>.column.is-one-third{flex:none;width:33.3333%}.columns.is-mobile>.column.is-one-quarter{flex:none;width:25%}.columns.is-mobile>.column.is-one-fifth{flex:none;width:20%}.columns.is-mobile>.column.is-two-fifths{flex:none;width:40%}.columns.is-mobile>.column.is-three-fifths{flex:none;width:60%}.columns.is-mobile>.column.is-four-fifths{flex:none;width:80%}.columns.is-mobile>.column.is-offset-three-quarters{margin-left:75%}.columns.is-mobile>.column.is-offset-two-thirds{margin-left:66.6666%}.columns.is-mobile>.column.is-offset-half{margin-left:50%}.columns.is-mobile>.column.is-offset-one-third{margin-left:33.3333%}.columns.is-mobile>.column.is-offset-one-quarter{margin-left:25%}.columns.is-mobile>.column.is-offset-one-fifth{margin-left:20%}.columns.is-mobile>.column.is-offset-two-fifths{margin-left:40%}.columns.is-mobile>.column.is-offset-three-fifths{margin-left:60%}.columns.is-mobile>.column.is-offset-four-fifths{margin-left:80%}.columns.is-mobile>.column.is-0{flex:none;width:0%}.columns.is-mobile>.column.is-offset-0{margin-left:0}.columns.is-mobile>.column.is-1{flex:none;width:8.33333%}.columns.is-mobile>.column.is-offset-1{margin-left:8.33333%}.columns.is-mobile>.column.is-2{flex:none;width:16.66667%}.columns.is-mobile>.column.is-offset-2{margin-left:16.66667%}.columns.is-mobile>.column.is-3{flex:none;width:25%}.columns.is-mobile>.column.is-offset-3{margin-left:25%}.columns.is-mobile>.column.is-4{flex:none;width:33.33333%}.columns.is-mobile>.column.is-offset-4{margin-left:33.33333%}.columns.is-mobile>.column.is-5{flex:none;width:41.66667%}.columns.is-mobile>.column.is-offset-5{margin-left:41.66667%}.columns.is-mobile>.column.is-6{flex:none;width:50%}.columns.is-mobile>.column.is-offset-6{margin-left:50%}.columns.is-mobile>.column.is-7{flex:none;width:58.33333%}.columns.is-mobile>.column.is-offset-7{margin-left:58.33333%}.columns.is-mobile>.column.is-8{flex:none;width:66.66667%}.columns.is-mobile>.column.is-offset-8{margin-left:66.66667%}.columns.is-mobile>.column.is-9{flex:none;width:75%}.columns.is-mobile>.column.is-offset-9{margin-left:75%}.columns.is-mobile>.column.is-10{flex:none;width:83.33333%}.columns.is-mobile>.column.is-offset-10{margin-left:83.33333%}
.columns.is-mobile>.column.is-11{flex:none;width:91.66667%}.columns.is-mobile>.column.is-offset-11{margin-left:91.66667%}.columns.is-mobile>.column.is-12{flex:none;width:100%}.columns.is-mobile>.column.is-offset-12{margin-left:100%}@media screen and (max-width:768px){.column.is-narrow-mobile{flex:none}.column.is-full-mobile{flex:none;width:100%}.column.is-three-quarters-mobile{flex:none;width:75%}.column.is-two-thirds-mobile{flex:none;width:66.6666%}.column.is-half-mobile{flex:none;width:50%}.column.is-one-third-mobile{flex:none;width:33.3333%}.column.is-one-quarter-mobile{flex:none;width:25%}.column.is-one-fifth-mobile{flex:none;width:20%}.column.is-two-fifths-mobile{flex:none;width:40%}.column.is-three-fifths-mobile{flex:none;width:60%}.column.is-four-fifths-mobile{flex:none;width:80%}.column.is-offset-three-quarters-mobile{margin-left:75%}.column.is-offset-two-thirds-mobile{margin-left:66.6666%}.column.is-offset-half-mobile{margin-left:50%}.column.is-offset-one-third-mobile{margin-left:33.3333%}.column.is-offset-one-quarter-mobile{margin-left:25%}.column.is-offset-one-fifth-mobile{margin-left:20%}.column.is-offset-two-fifths-mobile{margin-left:40%}.column.is-offset-three-fifths-mobile{margin-left:60%}.column.is-offset-four-fifths-mobile{margin-left:80%}.column.is-0-mobile{flex:none;width:0%}.column.is-offset-0-mobile{margin-left:0}.column.is-1-mobile{flex:none;width:8.33333%}.column.is-offset-1-mobile{margin-left:8.33333%}.column.is-2-mobile{flex:none;width:16.66667%}.column.is-offset-2-mobile{margin-left:16.66667%}.column.is-3-mobile{flex:none;width:25%}.column.is-offset-3-mobile{margin-left:25%}.column.is-4-mobile{flex:none;width:33.33333%}.column.is-offset-4-mobile{margin-left:33.33333%}.column.is-5-mobile{flex:none;width:41.66667%}.column.is-offset-5-mobile{margin-left:41.66667%}.column.is-6-mobile{flex:none;width:50%}.column.is-offset-6-mobile{margin-left:50%}.column.is-7-mobile{flex:none;width:58.33333%}.column.is-offset-7-mobile{margin-left:58.33333%}.column.is-8-mobile{flex:none;width:66.66667%}.column.is-offset-8-mobile{margin-left:66.66667%}.column.is-9-mobile{flex:none;width:75%}.column.is-offset-9-mobile{margin-left:75%}.column.is-10-mobile{flex:none;width:83.33333%}.column.is-offset-10-mobile{margin-left:83.33333%}.column.is-11-mobile{flex:none;width:91.66667%}.column.is-offset-11-mobile{margin-left:91.66667%}.column.is-12-mobile{flex:none;width:100%}.column.is-offset-12-mobile{margin-left:100%}}@media screen and
(min-width:769px),print{.column.is-narrow,.column.is-narrow-tablet{flex:none}.column.is-full,.column.is-full-tablet{flex:none;width:100%}.column.is-three-quarters,.column.is-three-quarters-tablet{flex:none;width:75%}.column.is-two-thirds,.column.is-two-thirds-tablet{flex:none;width:66.6666%}.column.is-half,.column.is-half-tablet{flex:none;width:50%}.column.is-one-third,.column.is-one-third-tablet{flex:none;width:33.3333%}.column.is-one-quarter,.column.is-one-quarter-tablet{flex:none;width:25%}.column.is-one-fifth,.column.is-one-fifth-tablet{flex:none;width:20%}.column.is-two-fifths,.column.is-two-fifths-tablet{flex:none;width:40%}.column.is-three-fifths,.column.is-three-fifths-tablet{flex:none;width:60%}.column.is-four-fifths,.column.is-four-fifths-tablet{flex:none;width:80%}.column.is-offset-three-quarters,.column.is-offset-three-quarters-tablet{margin-left:75%}.column.is-offset-two-thirds,.column.is-offset-two-thirds-tablet{margin-left:66.6666%}.column.is-offset-half,.column.is-offset-half-tablet{margin-left:50%}.column.is-offset-one-third,.column.is-offset-one-third-tablet{margin-left:33.3333%}.column.is-offset-one-quarter,.column.is-offset-one-quarter-tablet{margin-left:25%}.column.is-offset-one-fifth,.column.is-offset-one-fifth-tablet{margin-left:20%}.column.is-offset-two-fifths,.column.is-offset-two-fifths-tablet{margin-left:40%}.column.is-offset-three-fifths,.column.is-offset-three-fifths-tablet{margin-left:60%}.column.is-offset-four-fifths,.column.is-offset-four-fifths-tablet{margin-left:80%}.column.is-0,.column.is-0-tablet{flex:none;width:0%}.column.is-offset-0,.column.is-offset-0-tablet{margin-left:0}.column.is-1,.column.is-1-tablet{flex:none;width:8.33333%}.column.is-offset-1,.column.is-offset-1-tablet{margin-left:8.33333%}.column.is-2,.column.is-2-tablet{flex:none;width:16.66667%}.column.is-offset-2,.column.is-offset-2-tablet{margin-left:16.66667%}.column.is-3,.column.is-3-tablet{flex:none;width:25%}.column.is-offset-3,.column.is-offset-3-tablet{margin-left:25%}.column.is-4,.column.is-4-tablet{flex:none;width:33.33333%}.column.is-offset-4,.column.is-offset-4-tablet{margin-left:33.33333%}.column.is-5,.column.is-5-tablet{flex:none;width:41.66667%}.column.is-offset-5,.column.is-offset-5-tablet{margin-left:41.66667%}.column.is-6,.column.is-6-tablet{flex:none;width:50%}.column.is-offset-6,.column.is-offset-6-tablet{margin-left:50%}.column.is-7,.column.is-7-tablet{flex:none;width:58.33333%}.column.is-offset-7,.column.is-offset-7-tablet{margin-left:58.33333%}.column.is-8,.column.is-8-tablet{flex:none;width:66.66667%}.column.is-offset-8,.column.is-offset-8-tablet{margin-left:66.66667%}.column.is-9,.column.is-9-tablet{flex:none;width:75%}.column.is-offset-9,.column.is-offset-9-tablet{margin-left:75%}.column.is-10,.column.is-10-tablet{flex:none;width:83.33333%}.column.is-offset-10,.column.is-offset-10-tablet{margin-left:83.33333%}.column.is-11,.column.is-11-tablet{flex:none;width:91.66667%}.column.is-offset-11,.column.is-offset-11-tablet{margin-left:91.66667%}.column.is-12,.column.is-12-tablet{flex:none;width:100%}.column.is-offset-12,.column.is-offset-12-tablet{margin-left:100%}}@media screen and 
(max-width:1023px){.column.is-narrow-touch{flex:none}.column.is-full-touch{flex:none;width:100%}.column.is-three-quarters-touch{flex:none;width:75%}.column.is-two-thirds-touch{flex:none;width:66.6666%}.column.is-half-touch{flex:none;width:50%}.column.is-one-third-touch{flex:none;width:33.3333%}.column.is-one-quarter-touch{flex:none;width:25%}.column.is-one-fifth-touch{flex:none;width:20%}.column.is-two-fifths-touch{flex:none;width:40%}.column.is-three-fifths-touch{flex:none;width:60%}.column.is-four-fifths-touch{flex:none;width:80%}.column.is-offset-three-quarters-touch{margin-left:75%}.column.is-offset-two-thirds-touch{margin-left:66.6666%}.column.is-offset-half-touch{margin-left:50%}.column.is-offset-one-third-touch{margin-left:33.3333%}.column.is-offset-one-quarter-touch{margin-left:25%}.column.is-offset-one-fifth-touch{margin-left:20%}.column.is-offset-two-fifths-touch{margin-left:40%}.column.is-offset-three-fifths-touch{margin-left:60%}.column.is-offset-four-fifths-touch{margin-left:80%}.column.is-0-touch{flex:none;width:0%}.column.is-offset-0-touch{margin-left:0}.column.is-1-touch{flex:none;width:8.33333%}.column.is-offset-1-touch{margin-left:8.33333%}.column.is-2-touch{flex:none;width:16.66667%}.column.is-offset-2-touch{margin-left:16.66667%}.column.is-3-touch{flex:none;width:25%}.column.is-offset-3-touch{margin-left:25%}.column.is-4-touch{flex:none;width:33.33333%}.column.is-offset-4-touch{margin-left:33.33333%}.column.is-5-touch{flex:none;width:41.66667%}.column.is-offset-5-touch{margin-left:41.66667%}.column.is-6-touch{flex:none;width:50%}.column.is-offset-6-touch{margin-left:50%}.column.is-7-touch{flex:none;width:58.33333%}.column.is-offset-7-touch{margin-left:58.33333%}.column.is-8-touch{flex:none;width:66.66667%}.column.is-offset-8-touch{margin-left:66.66667%}.column.is-9-touch{flex:none;width:75%}.column.is-offset-9-touch{margin-left:75%}.column.is-10-touch{flex:none;width:83.33333%}.column.is-offset-10-touch{margin-left:83.33333%}.column.is-11-touch{flex:none;width:91.66667%}.column.is-offset-11-touch{margin-left:91.66667%}.column.is-12-touch{flex:none;width:100%}.column.is-offset-12-touch{margin-left:100%}}@media screen and 
(min-width:1024px){.column.is-narrow-desktop{flex:none}.column.is-full-desktop{flex:none;width:100%}.column.is-three-quarters-desktop{flex:none;width:75%}.column.is-two-thirds-desktop{flex:none;width:66.6666%}.column.is-half-desktop{flex:none;width:50%}.column.is-one-third-desktop{flex:none;width:33.3333%}.column.is-one-quarter-desktop{flex:none;width:25%}.column.is-one-fifth-desktop{flex:none;width:20%}.column.is-two-fifths-desktop{flex:none;width:40%}.column.is-three-fifths-desktop{flex:none;width:60%}.column.is-four-fifths-desktop{flex:none;width:80%}.column.is-offset-three-quarters-desktop{margin-left:75%}.column.is-offset-two-thirds-desktop{margin-left:66.6666%}.column.is-offset-half-desktop{margin-left:50%}.column.is-offset-one-third-desktop{margin-left:33.3333%}.column.is-offset-one-quarter-desktop{margin-left:25%}.column.is-offset-one-fifth-desktop{margin-left:20%}.column.is-offset-two-fifths-desktop{margin-left:40%}.column.is-offset-three-fifths-desktop{margin-left:60%}.column.is-offset-four-fifths-desktop{margin-left:80%}.column.is-0-desktop{flex:none;width:0%}.column.is-offset-0-desktop{margin-left:0}.column.is-1-desktop{flex:none;width:8.33333%}.column.is-offset-1-desktop{margin-left:8.33333%}.column.is-2-desktop{flex:none;width:16.66667%}.column.is-offset-2-desktop{margin-left:16.66667%}.column.is-3-desktop{flex:none;width:25%}.column.is-offset-3-desktop{margin-left:25%}.column.is-4-desktop{flex:none;width:33.33333%}.column.is-offset-4-desktop{margin-left:33.33333%}.column.is-5-desktop{flex:none;width:41.66667%}.column.is-offset-5-desktop{margin-left:41.66667%}.column.is-6-desktop{flex:none;width:50%}.column.is-offset-6-desktop{margin-left:50%}.column.is-7-desktop{flex:none;width:58.33333%}.column.is-offset-7-desktop{margin-left:58.33333%}.column.is-8-desktop{flex:none;width:66.66667%}.column.is-offset-8-desktop{margin-left:66.66667%}.column.is-9-desktop{flex:none;width:75%}.column.is-offset-9-desktop{margin-left:75%}.column.is-10-desktop{flex:none;width:83.33333%}.column.is-offset-10-desktop{margin-left:83.33333%}.column.is-11-desktop{flex:none;width:91.66667%}.column.is-offset-11-desktop{margin-left:91.66667%}.column.is-12-desktop{flex:none;width:100%}.column.is-offset-12-desktop{margin-left:100%}}@media screen and 
(min-width:1216px){.column.is-narrow-widescreen{flex:none}.column.is-full-widescreen{flex:none;width:100%}.column.is-three-quarters-widescreen{flex:none;width:75%}.column.is-two-thirds-widescreen{flex:none;width:66.6666%}.column.is-half-widescreen{flex:none;width:50%}.column.is-one-third-widescreen{flex:none;width:33.3333%}.column.is-one-quarter-widescreen{flex:none;width:25%}.column.is-one-fifth-widescreen{flex:none;width:20%}.column.is-two-fifths-widescreen{flex:none;width:40%}.column.is-three-fifths-widescreen{flex:none;width:60%}.column.is-four-fifths-widescreen{flex:none;width:80%}.column.is-offset-three-quarters-widescreen{margin-left:75%}.column.is-offset-two-thirds-widescreen{margin-left:66.6666%}.column.is-offset-half-widescreen{margin-left:50%}.column.is-offset-one-third-widescreen{margin-left:33.3333%}.column.is-offset-one-quarter-widescreen{margin-left:25%}.column.is-offset-one-fifth-widescreen{margin-left:20%}.column.is-offset-two-fifths-widescreen{margin-left:40%}.column.is-offset-three-fifths-widescreen{margin-left:60%}.column.is-offset-four-fifths-widescreen{margin-left:80%}.column.is-0-widescreen{flex:none;width:0%}.column.is-offset-0-widescreen{margin-left:0}.column.is-1-widescreen{flex:none;width:8.33333%}.column.is-offset-1-widescreen{margin-left:8.33333%}.column.is-2-widescreen{flex:none;width:16.66667%}.column.is-offset-2-widescreen{margin-left:16.66667%}.column.is-3-widescreen{flex:none;width:25%}.column.is-offset-3-widescreen{margin-left:25%}.column.is-4-widescreen{flex:none;width:33.33333%}.column.is-offset-4-widescreen{margin-left:33.33333%}.column.is-5-widescreen{flex:none;width:41.66667%}.column.is-offset-5-widescreen{margin-left:41.66667%}.column.is-6-widescreen{flex:none;width:50%}.column.is-offset-6-widescreen{margin-left:50%}.column.is-7-widescreen{flex:none;width:58.33333%}.column.is-offset-7-widescreen{margin-left:58.33333%}.column.is-8-widescreen{flex:none;width:66.66667%}.column.is-offset-8-widescreen{margin-left:66.66667%}.column.is-9-widescreen{flex:none;width:75%}.column.is-offset-9-widescreen{margin-left:75%}.column.is-10-widescreen{flex:none;width:83.33333%}.column.is-offset-10-widescreen{margin-left:83.33333%}.column.is-11-widescreen{flex:none;width:91.66667%}.column.is-offset-11-widescreen{margin-left:91.66667%}.column.is-12-widescreen{flex:none;width:100%}.column.is-offset-12-widescreen{margin-left:100%}}@media screen and 
(min-width:1408px){.column.is-narrow-fullhd{flex:none}.column.is-full-fullhd{flex:none;width:100%}.column.is-three-quarters-fullhd{flex:none;width:75%}.column.is-two-thirds-fullhd{flex:none;width:66.6666%}.column.is-half-fullhd{flex:none;width:50%}.column.is-one-third-fullhd{flex:none;width:33.3333%}.column.is-one-quarter-fullhd{flex:none;width:25%}.column.is-one-fifth-fullhd{flex:none;width:20%}.column.is-two-fifths-fullhd{flex:none;width:40%}.column.is-three-fifths-fullhd{flex:none;width:60%}.column.is-four-fifths-fullhd{flex:none;width:80%}.column.is-offset-three-quarters-fullhd{margin-left:75%}.column.is-offset-two-thirds-fullhd{margin-left:66.6666%}.column.is-offset-half-fullhd{margin-left:50%}.column.is-offset-one-third-fullhd{margin-left:33.3333%}.column.is-offset-one-quarter-fullhd{margin-left:25%}.column.is-offset-one-fifth-fullhd{margin-left:20%}.column.is-offset-two-fifths-fullhd{margin-left:40%}.column.is-offset-three-fifths-fullhd{margin-left:60%}.column.is-offset-four-fifths-fullhd{margin-left:80%}.column.is-0-fullhd{flex:none;width:0%}.column.is-offset-0-fullhd{margin-left:0}.column.is-1-fullhd{flex:none;width:8.33333%}.column.is-offset-1-fullhd{margin-left:8.33333%}.column.is-2-fullhd{flex:none;width:16.66667%}.column.is-offset-2-fullhd{margin-left:16.66667%}.column.is-3-fullhd{flex:none;width:25%}.column.is-offset-3-fullhd{margin-left:25%}.column.is-4-fullhd{flex:none;width:33.33333%}.column.is-offset-4-fullhd{margin-left:33.33333%}.column.is-5-fullhd{flex:none;width:41.66667%}.column.is-offset-5-fullhd{margin-left:41.66667%}.column.is-6-fullhd{flex:none;width:50%}.column.is-offset-6-fullhd{margin-left:50%}.column.is-7-fullhd{flex:none;width:58.33333%}.column.is-offset-7-fullhd{margin-left:58.33333%}.column.is-8-fullhd{flex:none;width:66.66667%}.column.is-offset-8-fullhd{margin-left:66.66667%}.column.is-9-fullhd{flex:none;width:75%}.column.is-offset-9-fullhd{margin-left:75%}.column.is-10-fullhd{flex:none;width:83.33333%}.column.is-offset-10-fullhd{margin-left:83.33333%}.column.is-11-fullhd{flex:none;width:91.66667%}.column.is-offset-11-fullhd{margin-left:91.66667%}.column.is-12-fullhd{flex:none;width:100%}.column.is-offset-12-fullhd{margin-left:100%}}.columns{margin-left:-.75rem;margin-right:-.75rem;margin-top:-.75rem}.columns:last-child{margin-bottom:-.75rem}.columns:not(:last-child){margin-bottom:calc(1.5rem - .75rem)}.columns.is-centered{justify-content:center}.columns.is-gapless{margin-left:0;margin-right:0;margin-top:0}.columns.is-gapless>.column{margin:0;padding:0!important}.columns.is-gapless:not(:last-child){margin-bottom:1.5rem}.columns.is-gapless:last-child{margin-bottom:0}.columns.is-mobile{display:flex}.columns.is-multiline{flex-wrap:wrap}.columns.is-vcentered{align-items:center}@media screen and (min-width:769px),print{.columns:not(.is-desktop){display:flex}}@media screen and (min-width:1024px){.columns.is-desktop{display:flex}}.columns.is-variable{--columnGap:0.75rem;margin-left:calc(-1 * var(--columnGap));margin-right:calc(-1 * var(--columnGap))}.columns.is-variable .column{padding-left:var(--columnGap);padding-right:var(--columnGap)}.columns.is-variable.is-0{--columnGap:0rem}@media screen and (max-width:768px){.columns.is-variable.is-0-mobile{--columnGap:0rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-0-tablet{--columnGap:0rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-0-tablet-only{--columnGap:0rem}}@media screen and (max-width:1023px){.columns.is-variable.is-0-touch{--columnGap:0rem}}@media 
screen and (min-width:1024px){.columns.is-variable.is-0-desktop{--columnGap:0rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-0-desktop-only{--columnGap:0rem}}@media screen and (min-width:1216px){.columns.is-variable.is-0-widescreen{--columnGap:0rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-0-widescreen-only{--columnGap:0rem}}@media screen and (min-width:1408px){.columns.is-variable.is-0-fullhd{--columnGap:0rem}}.columns.is-variable.is-1{--columnGap:0.25rem}@media screen and (max-width:768px){.columns.is-variable.is-1-mobile{--columnGap:0.25rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-1-tablet{--columnGap:0.25rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-1-tablet-only{--columnGap:0.25rem}}@media screen and (max-width:1023px){.columns.is-variable.is-1-touch{--columnGap:0.25rem}}@media screen and (min-width:1024px){.columns.is-variable.is-1-desktop{--columnGap:0.25rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-1-desktop-only{--columnGap:0.25rem}}@media screen and (min-width:1216px){.columns.is-variable.is-1-widescreen{--columnGap:0.25rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-1-widescreen-only{--columnGap:0.25rem}}@media screen and (min-width:1408px){.columns.is-variable.is-1-fullhd{--columnGap:0.25rem}}.columns.is-variable.is-2{--columnGap:0.5rem}@media screen and (max-width:768px){.columns.is-variable.is-2-mobile{--columnGap:0.5rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-2-tablet{--columnGap:0.5rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-2-tablet-only{--columnGap:0.5rem}}@media screen and (max-width:1023px){.columns.is-variable.is-2-touch{--columnGap:0.5rem}}@media screen and (min-width:1024px){.columns.is-variable.is-2-desktop{--columnGap:0.5rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-2-desktop-only{--columnGap:0.5rem}}@media screen and (min-width:1216px){.columns.is-variable.is-2-widescreen{--columnGap:0.5rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-2-widescreen-only{--columnGap:0.5rem}}@media screen and (min-width:1408px){.columns.is-variable.is-2-fullhd{--columnGap:0.5rem}}.columns.is-variable.is-3{--columnGap:0.75rem}@media screen and (max-width:768px){.columns.is-variable.is-3-mobile{--columnGap:0.75rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-3-tablet{--columnGap:0.75rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-3-tablet-only{--columnGap:0.75rem}}@media screen and (max-width:1023px){.columns.is-variable.is-3-touch{--columnGap:0.75rem}}@media screen and (min-width:1024px){.columns.is-variable.is-3-desktop{--columnGap:0.75rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-3-desktop-only{--columnGap:0.75rem}}@media screen and (min-width:1216px){.columns.is-variable.is-3-widescreen{--columnGap:0.75rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-3-widescreen-only{--columnGap:0.75rem}}@media screen and (min-width:1408px){.columns.is-variable.is-3-fullhd{--columnGap:0.75rem}}.columns.is-variable.is-4{--columnGap:1rem}@media screen and (max-width:768px){.columns.is-variable.is-4-mobile{--columnGap:1rem}}@media screen and 
(min-width:769px),print{.columns.is-variable.is-4-tablet{--columnGap:1rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-4-tablet-only{--columnGap:1rem}}@media screen and (max-width:1023px){.columns.is-variable.is-4-touch{--columnGap:1rem}}@media screen and (min-width:1024px){.columns.is-variable.is-4-desktop{--columnGap:1rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-4-desktop-only{--columnGap:1rem}}@media screen and (min-width:1216px){.columns.is-variable.is-4-widescreen{--columnGap:1rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-4-widescreen-only{--columnGap:1rem}}@media screen and (min-width:1408px){.columns.is-variable.is-4-fullhd{--columnGap:1rem}}.columns.is-variable.is-5{--columnGap:1.25rem}@media screen and (max-width:768px){.columns.is-variable.is-5-mobile{--columnGap:1.25rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-5-tablet{--columnGap:1.25rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-5-tablet-only{--columnGap:1.25rem}}@media screen and (max-width:1023px){.columns.is-variable.is-5-touch{--columnGap:1.25rem}}@media screen and (min-width:1024px){.columns.is-variable.is-5-desktop{--columnGap:1.25rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-5-desktop-only{--columnGap:1.25rem}}@media screen and (min-width:1216px){.columns.is-variable.is-5-widescreen{--columnGap:1.25rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-5-widescreen-only{--columnGap:1.25rem}}@media screen and (min-width:1408px){.columns.is-variable.is-5-fullhd{--columnGap:1.25rem}}.columns.is-variable.is-6{--columnGap:1.5rem}@media screen and (max-width:768px){.columns.is-variable.is-6-mobile{--columnGap:1.5rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-6-tablet{--columnGap:1.5rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-6-tablet-only{--columnGap:1.5rem}}@media screen and (max-width:1023px){.columns.is-variable.is-6-touch{--columnGap:1.5rem}}@media screen and (min-width:1024px){.columns.is-variable.is-6-desktop{--columnGap:1.5rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-6-desktop-only{--columnGap:1.5rem}}@media screen and (min-width:1216px){.columns.is-variable.is-6-widescreen{--columnGap:1.5rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-6-widescreen-only{--columnGap:1.5rem}}@media screen and (min-width:1408px){.columns.is-variable.is-6-fullhd{--columnGap:1.5rem}}.columns.is-variable.is-7{--columnGap:1.75rem}@media screen and (max-width:768px){.columns.is-variable.is-7-mobile{--columnGap:1.75rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-7-tablet{--columnGap:1.75rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-7-tablet-only{--columnGap:1.75rem}}@media screen and (max-width:1023px){.columns.is-variable.is-7-touch{--columnGap:1.75rem}}@media screen and (min-width:1024px){.columns.is-variable.is-7-desktop{--columnGap:1.75rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-7-desktop-only{--columnGap:1.75rem}}@media screen and (min-width:1216px){.columns.is-variable.is-7-widescreen{--columnGap:1.75rem}}@media screen and (min-width:1216px) and 
(max-width:1407px){.columns.is-variable.is-7-widescreen-only{--columnGap:1.75rem}}@media screen and (min-width:1408px){.columns.is-variable.is-7-fullhd{--columnGap:1.75rem}}.columns.is-variable.is-8{--columnGap:2rem}@media screen and (max-width:768px){.columns.is-variable.is-8-mobile{--columnGap:2rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-8-tablet{--columnGap:2rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-8-tablet-only{--columnGap:2rem}}@media screen and (max-width:1023px){.columns.is-variable.is-8-touch{--columnGap:2rem}}@media screen and (min-width:1024px){.columns.is-variable.is-8-desktop{--columnGap:2rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-8-desktop-only{--columnGap:2rem}}@media screen and (min-width:1216px){.columns.is-variable.is-8-widescreen{--columnGap:2rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-8-widescreen-only{--columnGap:2rem}}@media screen and (min-width:1408px){.columns.is-variable.is-8-fullhd{--columnGap:2rem}}.tile{align-items:stretch;display:block;flex-basis:0;flex-grow:1;flex-shrink:1;min-height:-webkit-min-content;min-height:-moz-min-content;min-height:min-content}.tile.is-ancestor{margin-left:-.75rem;margin-right:-.75rem;margin-top:-.75rem}.tile.is-ancestor:last-child{margin-bottom:-.75rem}.tile.is-ancestor:not(:last-child){margin-bottom:.75rem}.tile.is-child{margin:0!important}.tile.is-parent{padding:.75rem}.tile.is-vertical{flex-direction:column}.tile.is-vertical>.tile.is-child:not(:last-child){margin-bottom:1.5rem!important}@media screen and (min-width:769px),print{.tile:not(.is-child){display:flex}.tile.is-1{flex:none;width:8.33333%}.tile.is-2{flex:none;width:16.66667%}.tile.is-3{flex:none;width:25%}.tile.is-4{flex:none;width:33.33333%}.tile.is-5{flex:none;width:41.66667%}.tile.is-6{flex:none;width:50%}.tile.is-7{flex:none;width:58.33333%}.tile.is-8{flex:none;width:66.66667%}.tile.is-9{flex:none;width:75%}.tile.is-10{flex:none;width:83.33333%}.tile.is-11{flex:none;width:91.66667%}.tile.is-12{flex:none;width:100%}}.hero{align-items:stretch;display:flex;flex-direction:column;justify-content:space-between}.hero .navbar{background:0 0}.hero .tabs ul{border-bottom:none}.hero.is-white{background-color:#fff;color:#0a0a0a}.hero.is-white a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-white strong{color:inherit}.hero.is-white .title{color:#0a0a0a}.hero.is-white .subtitle{color:rgba(10,10,10,.9)}.hero.is-white .subtitle a:not(.button),.hero.is-white .subtitle strong{color:#0a0a0a}@media screen and (max-width:1023px){.hero.is-white .navbar-menu{background-color:#fff}}.hero.is-white .navbar-item,.hero.is-white .navbar-link{color:rgba(10,10,10,.7)}.hero.is-white .navbar-link.is-active,.hero.is-white .navbar-link:hover,.hero.is-white a.navbar-item.is-active,.hero.is-white a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.hero.is-white .tabs a{color:#0a0a0a;opacity:.9}.hero.is-white .tabs a:hover{opacity:1}.hero.is-white .tabs li.is-active a{opacity:1}.hero.is-white .tabs.is-boxed a,.hero.is-white .tabs.is-toggle a{color:#0a0a0a}.hero.is-white .tabs.is-boxed a:hover,.hero.is-white .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-white .tabs.is-boxed li.is-active a,.hero.is-white .tabs.is-boxed li.is-active a:hover,.hero.is-white .tabs.is-toggle li.is-active a,.hero.is-white .tabs.is-toggle li.is-active 
a:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.hero.is-white.is-bold{background-image:linear-gradient(141deg,#e6e6e6 0,#fff 71%,#fff 100%)}@media screen and (max-width:768px){.hero.is-white.is-bold .navbar-menu{background-image:linear-gradient(141deg,#e6e6e6 0,#fff 71%,#fff 100%)}}.hero.is-black{background-color:#0a0a0a;color:#fff}.hero.is-black a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-black strong{color:inherit}.hero.is-black .title{color:#fff}.hero.is-black .subtitle{color:rgba(255,255,255,.9)}.hero.is-black .subtitle a:not(.button),.hero.is-black .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-black .navbar-menu{background-color:#0a0a0a}}.hero.is-black .navbar-item,.hero.is-black .navbar-link{color:rgba(255,255,255,.7)}.hero.is-black .navbar-link.is-active,.hero.is-black .navbar-link:hover,.hero.is-black a.navbar-item.is-active,.hero.is-black a.navbar-item:hover{background-color:#000;color:#fff}.hero.is-black .tabs a{color:#fff;opacity:.9}.hero.is-black .tabs a:hover{opacity:1}.hero.is-black .tabs li.is-active a{opacity:1}.hero.is-black .tabs.is-boxed a,.hero.is-black .tabs.is-toggle a{color:#fff}.hero.is-black .tabs.is-boxed a:hover,.hero.is-black .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-black .tabs.is-boxed li.is-active a,.hero.is-black .tabs.is-boxed li.is-active a:hover,.hero.is-black .tabs.is-toggle li.is-active a,.hero.is-black .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#0a0a0a}.hero.is-black.is-bold{background-image:linear-gradient(141deg,#000 0,#0a0a0a 71%,#181616 100%)}@media screen and (max-width:768px){.hero.is-black.is-bold .navbar-menu{background-image:linear-gradient(141deg,#000 0,#0a0a0a 71%,#181616 100%)}}.hero.is-light{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.hero.is-light a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-light strong{color:inherit}.hero.is-light .title{color:rgba(0,0,0,.7)}.hero.is-light .subtitle{color:rgba(0,0,0,.9)}.hero.is-light .subtitle a:not(.button),.hero.is-light .subtitle strong{color:rgba(0,0,0,.7)}@media screen and (max-width:1023px){.hero.is-light .navbar-menu{background-color:#f5f5f5}}.hero.is-light .navbar-item,.hero.is-light .navbar-link{color:rgba(0,0,0,.7)}.hero.is-light .navbar-link.is-active,.hero.is-light .navbar-link:hover,.hero.is-light a.navbar-item.is-active,.hero.is-light a.navbar-item:hover{background-color:#e8e8e8;color:rgba(0,0,0,.7)}.hero.is-light .tabs a{color:rgba(0,0,0,.7);opacity:.9}.hero.is-light .tabs a:hover{opacity:1}.hero.is-light .tabs li.is-active a{opacity:1}.hero.is-light .tabs.is-boxed a,.hero.is-light .tabs.is-toggle a{color:rgba(0,0,0,.7)}.hero.is-light .tabs.is-boxed a:hover,.hero.is-light .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-light .tabs.is-boxed li.is-active a,.hero.is-light .tabs.is-boxed li.is-active a:hover,.hero.is-light .tabs.is-toggle li.is-active a,.hero.is-light .tabs.is-toggle li.is-active a:hover{background-color:rgba(0,0,0,.7);border-color:rgba(0,0,0,.7);color:#f5f5f5}.hero.is-light.is-bold{background-image:linear-gradient(141deg,#dfd8d9 0,#f5f5f5 71%,#fff 100%)}@media screen and (max-width:768px){.hero.is-light.is-bold .navbar-menu{background-image:linear-gradient(141deg,#dfd8d9 0,#f5f5f5 71%,#fff 100%)}}.hero.is-dark{background-color:#363636;color:#fff}.hero.is-dark a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-dark 
strong{color:inherit}.hero.is-dark .title{color:#fff}.hero.is-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-dark .subtitle a:not(.button),.hero.is-dark .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-dark .navbar-menu{background-color:#363636}}.hero.is-dark .navbar-item,.hero.is-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-dark .navbar-link.is-active,.hero.is-dark .navbar-link:hover,.hero.is-dark a.navbar-item.is-active,.hero.is-dark a.navbar-item:hover{background-color:#292929;color:#fff}.hero.is-dark .tabs a{color:#fff;opacity:.9}.hero.is-dark .tabs a:hover{opacity:1}.hero.is-dark .tabs li.is-active a{opacity:1}.hero.is-dark .tabs.is-boxed a,.hero.is-dark .tabs.is-toggle a{color:#fff}.hero.is-dark .tabs.is-boxed a:hover,.hero.is-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-dark .tabs.is-boxed li.is-active a,.hero.is-dark .tabs.is-boxed li.is-active a:hover,.hero.is-dark .tabs.is-toggle li.is-active a,.hero.is-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#363636}.hero.is-dark.is-bold{background-image:linear-gradient(141deg,#1f191a 0,#363636 71%,#46403f 100%)}@media screen and (max-width:768px){.hero.is-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#1f191a 0,#363636 71%,#46403f 100%)}}.hero.is-primary{background-color:#00d1b2;color:#fff}.hero.is-primary a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-primary strong{color:inherit}.hero.is-primary .title{color:#fff}.hero.is-primary .subtitle{color:rgba(255,255,255,.9)}.hero.is-primary .subtitle a:not(.button),.hero.is-primary .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-primary .navbar-menu{background-color:#00d1b2}}.hero.is-primary .navbar-item,.hero.is-primary .navbar-link{color:rgba(255,255,255,.7)}.hero.is-primary .navbar-link.is-active,.hero.is-primary .navbar-link:hover,.hero.is-primary a.navbar-item.is-active,.hero.is-primary a.navbar-item:hover{background-color:#00b89c;color:#fff}.hero.is-primary .tabs a{color:#fff;opacity:.9}.hero.is-primary .tabs a:hover{opacity:1}.hero.is-primary .tabs li.is-active a{opacity:1}.hero.is-primary .tabs.is-boxed a,.hero.is-primary .tabs.is-toggle a{color:#fff}.hero.is-primary .tabs.is-boxed a:hover,.hero.is-primary .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-primary .tabs.is-boxed li.is-active a,.hero.is-primary .tabs.is-boxed li.is-active a:hover,.hero.is-primary .tabs.is-toggle li.is-active a,.hero.is-primary .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#00d1b2}.hero.is-primary.is-bold{background-image:linear-gradient(141deg,#009e6c 0,#00d1b2 71%,#00e7eb 100%)}@media screen and (max-width:768px){.hero.is-primary.is-bold .navbar-menu{background-image:linear-gradient(141deg,#009e6c 0,#00d1b2 71%,#00e7eb 100%)}}.hero.is-link{background-color:#3273dc;color:#fff}.hero.is-link a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-link strong{color:inherit}.hero.is-link .title{color:#fff}.hero.is-link .subtitle{color:rgba(255,255,255,.9)}.hero.is-link .subtitle a:not(.button),.hero.is-link .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-link .navbar-menu{background-color:#3273dc}}.hero.is-link .navbar-item,.hero.is-link .navbar-link{color:rgba(255,255,255,.7)}.hero.is-link .navbar-link.is-active,.hero.is-link .navbar-link:hover,.hero.is-link a.navbar-item.is-active,.hero.is-link 
a.navbar-item:hover{background-color:#2366d1;color:#fff}.hero.is-link .tabs a{color:#fff;opacity:.9}.hero.is-link .tabs a:hover{opacity:1}.hero.is-link .tabs li.is-active a{opacity:1}.hero.is-link .tabs.is-boxed a,.hero.is-link .tabs.is-toggle a{color:#fff}.hero.is-link .tabs.is-boxed a:hover,.hero.is-link .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-link .tabs.is-boxed li.is-active a,.hero.is-link .tabs.is-boxed li.is-active a:hover,.hero.is-link .tabs.is-toggle li.is-active a,.hero.is-link .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#3273dc}.hero.is-link.is-bold{background-image:linear-gradient(141deg,#1577c6 0,#3273dc 71%,#4366e5 100%)}@media screen and (max-width:768px){.hero.is-link.is-bold .navbar-menu{background-image:linear-gradient(141deg,#1577c6 0,#3273dc 71%,#4366e5 100%)}}.hero.is-info{background-color:#3298dc;color:#fff}.hero.is-info a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-info strong{color:inherit}.hero.is-info .title{color:#fff}.hero.is-info .subtitle{color:rgba(255,255,255,.9)}.hero.is-info .subtitle a:not(.button),.hero.is-info .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-info .navbar-menu{background-color:#3298dc}}.hero.is-info .navbar-item,.hero.is-info .navbar-link{color:rgba(255,255,255,.7)}.hero.is-info .navbar-link.is-active,.hero.is-info .navbar-link:hover,.hero.is-info a.navbar-item.is-active,.hero.is-info a.navbar-item:hover{background-color:#238cd1;color:#fff}.hero.is-info .tabs a{color:#fff;opacity:.9}.hero.is-info .tabs a:hover{opacity:1}.hero.is-info .tabs li.is-active a{opacity:1}.hero.is-info .tabs.is-boxed a,.hero.is-info .tabs.is-toggle a{color:#fff}.hero.is-info .tabs.is-boxed a:hover,.hero.is-info .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-info .tabs.is-boxed li.is-active a,.hero.is-info .tabs.is-boxed li.is-active a:hover,.hero.is-info .tabs.is-toggle li.is-active a,.hero.is-info .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#3298dc}.hero.is-info.is-bold{background-image:linear-gradient(141deg,#159dc6 0,#3298dc 71%,#4389e5 100%)}@media screen and (max-width:768px){.hero.is-info.is-bold .navbar-menu{background-image:linear-gradient(141deg,#159dc6 0,#3298dc 71%,#4389e5 100%)}}.hero.is-success{background-color:#48c774;color:#fff}.hero.is-success a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-success strong{color:inherit}.hero.is-success .title{color:#fff}.hero.is-success .subtitle{color:rgba(255,255,255,.9)}.hero.is-success .subtitle a:not(.button),.hero.is-success .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-success .navbar-menu{background-color:#48c774}}.hero.is-success .navbar-item,.hero.is-success .navbar-link{color:rgba(255,255,255,.7)}.hero.is-success .navbar-link.is-active,.hero.is-success .navbar-link:hover,.hero.is-success a.navbar-item.is-active,.hero.is-success a.navbar-item:hover{background-color:#3abb67;color:#fff}.hero.is-success .tabs a{color:#fff;opacity:.9}.hero.is-success .tabs a:hover{opacity:1}.hero.is-success .tabs li.is-active a{opacity:1}.hero.is-success .tabs.is-boxed a,.hero.is-success .tabs.is-toggle a{color:#fff}.hero.is-success .tabs.is-boxed a:hover,.hero.is-success .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-success .tabs.is-boxed li.is-active a,.hero.is-success .tabs.is-boxed li.is-active a:hover,.hero.is-success .tabs.is-toggle 
li.is-active a,.hero.is-success .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#48c774}.hero.is-success.is-bold{background-image:linear-gradient(141deg,#29b342 0,#48c774 71%,#56d296 100%)}@media screen and (max-width:768px){.hero.is-success.is-bold .navbar-menu{background-image:linear-gradient(141deg,#29b342 0,#48c774 71%,#56d296 100%)}}.hero.is-warning{background-color:#ffdd57;color:rgba(0,0,0,.7)}.hero.is-warning a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-warning strong{color:inherit}.hero.is-warning .title{color:rgba(0,0,0,.7)}.hero.is-warning .subtitle{color:rgba(0,0,0,.9)}.hero.is-warning .subtitle a:not(.button),.hero.is-warning .subtitle strong{color:rgba(0,0,0,.7)}@media screen and (max-width:1023px){.hero.is-warning .navbar-menu{background-color:#ffdd57}}.hero.is-warning .navbar-item,.hero.is-warning .navbar-link{color:rgba(0,0,0,.7)}.hero.is-warning .navbar-link.is-active,.hero.is-warning .navbar-link:hover,.hero.is-warning a.navbar-item.is-active,.hero.is-warning a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.hero.is-warning .tabs a{color:rgba(0,0,0,.7);opacity:.9}.hero.is-warning .tabs a:hover{opacity:1}.hero.is-warning .tabs li.is-active a{opacity:1}.hero.is-warning .tabs.is-boxed a,.hero.is-warning .tabs.is-toggle a{color:rgba(0,0,0,.7)}.hero.is-warning .tabs.is-boxed a:hover,.hero.is-warning .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-warning .tabs.is-boxed li.is-active a,.hero.is-warning .tabs.is-boxed li.is-active a:hover,.hero.is-warning .tabs.is-toggle li.is-active a,.hero.is-warning .tabs.is-toggle li.is-active a:hover{background-color:rgba(0,0,0,.7);border-color:rgba(0,0,0,.7);color:#ffdd57}.hero.is-warning.is-bold{background-image:linear-gradient(141deg,#ffaf24 0,#ffdd57 71%,#fffa70 100%)}@media screen and (max-width:768px){.hero.is-warning.is-bold .navbar-menu{background-image:linear-gradient(141deg,#ffaf24 0,#ffdd57 71%,#fffa70 100%)}}.hero.is-danger{background-color:#f14668;color:#fff}.hero.is-danger a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-danger strong{color:inherit}.hero.is-danger .title{color:#fff}.hero.is-danger .subtitle{color:rgba(255,255,255,.9)}.hero.is-danger .subtitle a:not(.button),.hero.is-danger .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-danger .navbar-menu{background-color:#f14668}}.hero.is-danger .navbar-item,.hero.is-danger .navbar-link{color:rgba(255,255,255,.7)}.hero.is-danger .navbar-link.is-active,.hero.is-danger .navbar-link:hover,.hero.is-danger a.navbar-item.is-active,.hero.is-danger a.navbar-item:hover{background-color:#ef2e55;color:#fff}.hero.is-danger .tabs a{color:#fff;opacity:.9}.hero.is-danger .tabs a:hover{opacity:1}.hero.is-danger .tabs li.is-active a{opacity:1}.hero.is-danger .tabs.is-boxed a,.hero.is-danger .tabs.is-toggle a{color:#fff}.hero.is-danger .tabs.is-boxed a:hover,.hero.is-danger .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-danger .tabs.is-boxed li.is-active a,.hero.is-danger .tabs.is-boxed li.is-active a:hover,.hero.is-danger .tabs.is-toggle li.is-active a,.hero.is-danger .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#f14668}.hero.is-danger.is-bold{background-image:linear-gradient(141deg,#fa0a62 0,#f14668 71%,#f7595f 100%)}@media screen and (max-width:768px){.hero.is-danger.is-bold .navbar-menu{background-image:linear-gradient(141deg,#fa0a62 0,#f14668 71%,#f7595f 
100%)}}.hero.is-small .hero-body{padding-bottom:1.5rem;padding-top:1.5rem}@media screen and (min-width:769px),print{.hero.is-medium .hero-body{padding-bottom:9rem;padding-top:9rem}}@media screen and (min-width:769px),print{.hero.is-large .hero-body{padding-bottom:18rem;padding-top:18rem}}.hero.is-fullheight .hero-body,.hero.is-fullheight-with-navbar .hero-body,.hero.is-halfheight .hero-body{align-items:center;display:flex}.hero.is-fullheight .hero-body>.container,.hero.is-fullheight-with-navbar .hero-body>.container,.hero.is-halfheight .hero-body>.container{flex-grow:1;flex-shrink:1}.hero.is-halfheight{min-height:50vh}.hero.is-fullheight{min-height:100vh}.hero-video{overflow:hidden}.hero-video video{left:50%;min-height:100%;min-width:100%;position:absolute;top:50%;transform:translate3d(-50%,-50%,0)}.hero-video.is-transparent{opacity:.3}@media screen and (max-width:768px){.hero-video{display:none}}.hero-buttons{margin-top:1.5rem}@media screen and (max-width:768px){.hero-buttons .button{display:flex}.hero-buttons .button:not(:last-child){margin-bottom:.75rem}}@media screen and (min-width:769px),print{.hero-buttons{display:flex;justify-content:center}.hero-buttons .button:not(:last-child){margin-right:1.5rem}}.hero-foot,.hero-head{flex-grow:0;flex-shrink:0}.hero-body{flex-grow:1;flex-shrink:0;padding:3rem 1.5rem}.section{padding:3rem 1.5rem}@media screen and (min-width:1024px){.section.is-medium{padding:9rem 1.5rem}.section.is-large{padding:18rem 1.5rem}}.footer{background-color:#fafafa;padding:3rem 1.5rem 6rem} diff --git a/public/index.html b/public/index.html new file mode 100644 index 000000000..062495ac9 --- /dev/null +++ b/public/index.html @@ -0,0 +1,199 @@ + + + + + + + + + The daugt + + + + +
+    <section class="hero">
+      <div class="hero-body">
+        <div class="container">
+          <h1 class="title">
+            Welcome to daugt
+          </h1>
+          <h2 class="subtitle">
+            This dashboard will help you check the search results with ease.
+          </h2>
+        </div>
+      </div>
+    </section>
+
+    <!-- search input and results area -->
+ + + + diff --git a/public/jquery-3.4.1.min.js b/public/jquery-3.4.1.min.js new file mode 100644 index 000000000..a1c07fd80 --- /dev/null +++ b/public/jquery-3.4.1.min.js @@ -0,0 +1,2 @@ +/*! jQuery v3.4.1 | (c) JS Foundation and other contributors | jquery.org/license */ +!function(e,t){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=e.document?t(e,!0):function(e){if(!e.document)throw new Error("jQuery requires a window with a document");return t(e)}:t(e)}("undefined"!=typeof window?window:this,function(C,e){"use strict";var t=[],E=C.document,r=Object.getPrototypeOf,s=t.slice,g=t.concat,u=t.push,i=t.indexOf,n={},o=n.toString,v=n.hasOwnProperty,a=v.toString,l=a.call(Object),y={},m=function(e){return"function"==typeof e&&"number"!=typeof e.nodeType},x=function(e){return null!=e&&e===e.window},c={type:!0,src:!0,nonce:!0,noModule:!0};function b(e,t,n){var r,i,o=(n=n||E).createElement("script");if(o.text=e,t)for(r in c)(i=t[r]||t.getAttribute&&t.getAttribute(r))&&o.setAttribute(r,i);n.head.appendChild(o).parentNode.removeChild(o)}function w(e){return null==e?e+"":"object"==typeof e||"function"==typeof e?n[o.call(e)]||"object":typeof e}var f="3.4.1",k=function(e,t){return new k.fn.init(e,t)},p=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g;function d(e){var t=!!e&&"length"in e&&e.length,n=w(e);return!m(e)&&!x(e)&&("array"===n||0===t||"number"==typeof t&&0+~]|"+M+")"+M+"*"),U=new RegExp(M+"|>"),X=new RegExp($),V=new RegExp("^"+I+"$"),G={ID:new RegExp("^#("+I+")"),CLASS:new RegExp("^\\.("+I+")"),TAG:new RegExp("^("+I+"|[*])"),ATTR:new RegExp("^"+W),PSEUDO:new RegExp("^"+$),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+M+"*(even|odd|(([+-]|)(\\d*)n|)"+M+"*(?:([+-]|)"+M+"*(\\d+)|))"+M+"*\\)|)","i"),bool:new RegExp("^(?:"+R+")$","i"),needsContext:new RegExp("^"+M+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+M+"*((?:-\\d)?\\d*)"+M+"*\\)|)(?=[^-]|$)","i")},Y=/HTML$/i,Q=/^(?:input|select|textarea|button)$/i,J=/^h\d$/i,K=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,ee=/[+~]/,te=new RegExp("\\\\([\\da-f]{1,6}"+M+"?|("+M+")|.)","ig"),ne=function(e,t,n){var r="0x"+t-65536;return r!=r||n?t:r<0?String.fromCharCode(r+65536):String.fromCharCode(r>>10|55296,1023&r|56320)},re=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ie=function(e,t){return t?"\0"===e?"\ufffd":e.slice(0,-1)+"\\"+e.charCodeAt(e.length-1).toString(16)+" ":"\\"+e},oe=function(){T()},ae=be(function(e){return!0===e.disabled&&"fieldset"===e.nodeName.toLowerCase()},{dir:"parentNode",next:"legend"});try{H.apply(t=O.call(m.childNodes),m.childNodes),t[m.childNodes.length].nodeType}catch(e){H={apply:t.length?function(e,t){L.apply(e,O.call(t))}:function(e,t){var n=e.length,r=0;while(e[n++]=t[r++]);e.length=n-1}}}function se(t,e,n,r){var i,o,a,s,u,l,c,f=e&&e.ownerDocument,p=e?e.nodeType:9;if(n=n||[],"string"!=typeof t||!t||1!==p&&9!==p&&11!==p)return n;if(!r&&((e?e.ownerDocument||e:m)!==C&&T(e),e=e||C,E)){if(11!==p&&(u=Z.exec(t)))if(i=u[1]){if(9===p){if(!(a=e.getElementById(i)))return n;if(a.id===i)return n.push(a),n}else if(f&&(a=f.getElementById(i))&&y(e,a)&&a.id===i)return n.push(a),n}else{if(u[2])return H.apply(n,e.getElementsByTagName(t)),n;if((i=u[3])&&d.getElementsByClassName&&e.getElementsByClassName)return H.apply(n,e.getElementsByClassName(i)),n}if(d.qsa&&!A[t+" 
"]&&(!v||!v.test(t))&&(1!==p||"object"!==e.nodeName.toLowerCase())){if(c=t,f=e,1===p&&U.test(t)){(s=e.getAttribute("id"))?s=s.replace(re,ie):e.setAttribute("id",s=k),o=(l=h(t)).length;while(o--)l[o]="#"+s+" "+xe(l[o]);c=l.join(","),f=ee.test(t)&&ye(e.parentNode)||e}try{return H.apply(n,f.querySelectorAll(c)),n}catch(e){A(t,!0)}finally{s===k&&e.removeAttribute("id")}}}return g(t.replace(B,"$1"),e,n,r)}function ue(){var r=[];return function e(t,n){return r.push(t+" ")>b.cacheLength&&delete e[r.shift()],e[t+" "]=n}}function le(e){return e[k]=!0,e}function ce(e){var t=C.createElement("fieldset");try{return!!e(t)}catch(e){return!1}finally{t.parentNode&&t.parentNode.removeChild(t),t=null}}function fe(e,t){var n=e.split("|"),r=n.length;while(r--)b.attrHandle[n[r]]=t}function pe(e,t){var n=t&&e,r=n&&1===e.nodeType&&1===t.nodeType&&e.sourceIndex-t.sourceIndex;if(r)return r;if(n)while(n=n.nextSibling)if(n===t)return-1;return e?1:-1}function de(t){return function(e){return"input"===e.nodeName.toLowerCase()&&e.type===t}}function he(n){return function(e){var t=e.nodeName.toLowerCase();return("input"===t||"button"===t)&&e.type===n}}function ge(t){return function(e){return"form"in e?e.parentNode&&!1===e.disabled?"label"in e?"label"in e.parentNode?e.parentNode.disabled===t:e.disabled===t:e.isDisabled===t||e.isDisabled!==!t&&ae(e)===t:e.disabled===t:"label"in e&&e.disabled===t}}function ve(a){return le(function(o){return o=+o,le(function(e,t){var n,r=a([],e.length,o),i=r.length;while(i--)e[n=r[i]]&&(e[n]=!(t[n]=e[n]))})})}function ye(e){return e&&"undefined"!=typeof e.getElementsByTagName&&e}for(e in d=se.support={},i=se.isXML=function(e){var t=e.namespaceURI,n=(e.ownerDocument||e).documentElement;return!Y.test(t||n&&n.nodeName||"HTML")},T=se.setDocument=function(e){var t,n,r=e?e.ownerDocument||e:m;return r!==C&&9===r.nodeType&&r.documentElement&&(a=(C=r).documentElement,E=!i(C),m!==C&&(n=C.defaultView)&&n.top!==n&&(n.addEventListener?n.addEventListener("unload",oe,!1):n.attachEvent&&n.attachEvent("onunload",oe)),d.attributes=ce(function(e){return e.className="i",!e.getAttribute("className")}),d.getElementsByTagName=ce(function(e){return e.appendChild(C.createComment("")),!e.getElementsByTagName("*").length}),d.getElementsByClassName=K.test(C.getElementsByClassName),d.getById=ce(function(e){return a.appendChild(e).id=k,!C.getElementsByName||!C.getElementsByName(k).length}),d.getById?(b.filter.ID=function(e){var t=e.replace(te,ne);return function(e){return e.getAttribute("id")===t}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n=t.getElementById(e);return n?[n]:[]}}):(b.filter.ID=function(e){var n=e.replace(te,ne);return function(e){var t="undefined"!=typeof e.getAttributeNode&&e.getAttributeNode("id");return t&&t.value===n}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n,r,i,o=t.getElementById(e);if(o){if((n=o.getAttributeNode("id"))&&n.value===e)return[o];i=t.getElementsByName(e),r=0;while(o=i[r++])if((n=o.getAttributeNode("id"))&&n.value===e)return[o]}return[]}}),b.find.TAG=d.getElementsByTagName?function(e,t){return"undefined"!=typeof t.getElementsByTagName?t.getElementsByTagName(e):d.qsa?t.querySelectorAll(e):void 0}:function(e,t){var n,r=[],i=0,o=t.getElementsByTagName(e);if("*"===e){while(n=o[i++])1===n.nodeType&&r.push(n);return r}return o},b.find.CLASS=d.getElementsByClassName&&function(e,t){if("undefined"!=typeof t.getElementsByClassName&&E)return 
t.getElementsByClassName(e)},s=[],v=[],(d.qsa=K.test(C.querySelectorAll))&&(ce(function(e){a.appendChild(e).innerHTML="",e.querySelectorAll("[msallowcapture^='']").length&&v.push("[*^$]="+M+"*(?:''|\"\")"),e.querySelectorAll("[selected]").length||v.push("\\["+M+"*(?:value|"+R+")"),e.querySelectorAll("[id~="+k+"-]").length||v.push("~="),e.querySelectorAll(":checked").length||v.push(":checked"),e.querySelectorAll("a#"+k+"+*").length||v.push(".#.+[+~]")}),ce(function(e){e.innerHTML="";var t=C.createElement("input");t.setAttribute("type","hidden"),e.appendChild(t).setAttribute("name","D"),e.querySelectorAll("[name=d]").length&&v.push("name"+M+"*[*^$|!~]?="),2!==e.querySelectorAll(":enabled").length&&v.push(":enabled",":disabled"),a.appendChild(e).disabled=!0,2!==e.querySelectorAll(":disabled").length&&v.push(":enabled",":disabled"),e.querySelectorAll("*,:x"),v.push(",.*:")})),(d.matchesSelector=K.test(c=a.matches||a.webkitMatchesSelector||a.mozMatchesSelector||a.oMatchesSelector||a.msMatchesSelector))&&ce(function(e){d.disconnectedMatch=c.call(e,"*"),c.call(e,"[s!='']:x"),s.push("!=",$)}),v=v.length&&new RegExp(v.join("|")),s=s.length&&new RegExp(s.join("|")),t=K.test(a.compareDocumentPosition),y=t||K.test(a.contains)?function(e,t){var n=9===e.nodeType?e.documentElement:e,r=t&&t.parentNode;return e===r||!(!r||1!==r.nodeType||!(n.contains?n.contains(r):e.compareDocumentPosition&&16&e.compareDocumentPosition(r)))}:function(e,t){if(t)while(t=t.parentNode)if(t===e)return!0;return!1},D=t?function(e,t){if(e===t)return l=!0,0;var n=!e.compareDocumentPosition-!t.compareDocumentPosition;return n||(1&(n=(e.ownerDocument||e)===(t.ownerDocument||t)?e.compareDocumentPosition(t):1)||!d.sortDetached&&t.compareDocumentPosition(e)===n?e===C||e.ownerDocument===m&&y(m,e)?-1:t===C||t.ownerDocument===m&&y(m,t)?1:u?P(u,e)-P(u,t):0:4&n?-1:1)}:function(e,t){if(e===t)return l=!0,0;var n,r=0,i=e.parentNode,o=t.parentNode,a=[e],s=[t];if(!i||!o)return e===C?-1:t===C?1:i?-1:o?1:u?P(u,e)-P(u,t):0;if(i===o)return pe(e,t);n=e;while(n=n.parentNode)a.unshift(n);n=t;while(n=n.parentNode)s.unshift(n);while(a[r]===s[r])r++;return r?pe(a[r],s[r]):a[r]===m?-1:s[r]===m?1:0}),C},se.matches=function(e,t){return se(e,null,null,t)},se.matchesSelector=function(e,t){if((e.ownerDocument||e)!==C&&T(e),d.matchesSelector&&E&&!A[t+" "]&&(!s||!s.test(t))&&(!v||!v.test(t)))try{var n=c.call(e,t);if(n||d.disconnectedMatch||e.document&&11!==e.document.nodeType)return n}catch(e){A(t,!0)}return 0":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(e){return e[1]=e[1].replace(te,ne),e[3]=(e[3]||e[4]||e[5]||"").replace(te,ne),"~="===e[2]&&(e[3]=" "+e[3]+" "),e.slice(0,4)},CHILD:function(e){return e[1]=e[1].toLowerCase(),"nth"===e[1].slice(0,3)?(e[3]||se.error(e[0]),e[4]=+(e[4]?e[5]+(e[6]||1):2*("even"===e[3]||"odd"===e[3])),e[5]=+(e[7]+e[8]||"odd"===e[3])):e[3]&&se.error(e[0]),e},PSEUDO:function(e){var t,n=!e[6]&&e[2];return G.CHILD.test(e[0])?null:(e[3]?e[2]=e[4]||e[5]||"":n&&X.test(n)&&(t=h(n,!0))&&(t=n.indexOf(")",n.length-t)-n.length)&&(e[0]=e[0].slice(0,t),e[2]=n.slice(0,t)),e.slice(0,3))}},filter:{TAG:function(e){var t=e.replace(te,ne).toLowerCase();return"*"===e?function(){return!0}:function(e){return e.nodeName&&e.nodeName.toLowerCase()===t}},CLASS:function(e){var t=p[e+" "];return t||(t=new RegExp("(^|"+M+")"+e+"("+M+"|$)"))&&p(e,function(e){return t.test("string"==typeof e.className&&e.className||"undefined"!=typeof 
e.getAttribute&&e.getAttribute("class")||"")})},ATTR:function(n,r,i){return function(e){var t=se.attr(e,n);return null==t?"!="===r:!r||(t+="","="===r?t===i:"!="===r?t!==i:"^="===r?i&&0===t.indexOf(i):"*="===r?i&&-1:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function j(e,n,r){return m(n)?k.grep(e,function(e,t){return!!n.call(e,t,e)!==r}):n.nodeType?k.grep(e,function(e){return e===n!==r}):"string"!=typeof n?k.grep(e,function(e){return-1)[^>]*|#([\w-]+))$/;(k.fn.init=function(e,t,n){var r,i;if(!e)return this;if(n=n||q,"string"==typeof e){if(!(r="<"===e[0]&&">"===e[e.length-1]&&3<=e.length?[null,e,null]:L.exec(e))||!r[1]&&t)return!t||t.jquery?(t||n).find(e):this.constructor(t).find(e);if(r[1]){if(t=t instanceof k?t[0]:t,k.merge(this,k.parseHTML(r[1],t&&t.nodeType?t.ownerDocument||t:E,!0)),D.test(r[1])&&k.isPlainObject(t))for(r in t)m(this[r])?this[r](t[r]):this.attr(r,t[r]);return this}return(i=E.getElementById(r[2]))&&(this[0]=i,this.length=1),this}return e.nodeType?(this[0]=e,this.length=1,this):m(e)?void 0!==n.ready?n.ready(e):e(k):k.makeArray(e,this)}).prototype=k.fn,q=k(E);var H=/^(?:parents|prev(?:Until|All))/,O={children:!0,contents:!0,next:!0,prev:!0};function P(e,t){while((e=e[t])&&1!==e.nodeType);return e}k.fn.extend({has:function(e){var t=k(e,this),n=t.length;return this.filter(function(){for(var e=0;e\x20\t\r\n\f]*)/i,he=/^$|^module$|\/(?:java|ecma)script/i,ge={option:[1,""],thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};function ve(e,t){var n;return n="undefined"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||"*"):"undefined"!=typeof e.querySelectorAll?e.querySelectorAll(t||"*"):[],void 0===t||t&&A(e,t)?k.merge([e],n):n}function ye(e,t){for(var n=0,r=e.length;nx",y.noCloneChecked=!!me.cloneNode(!0).lastChild.defaultValue;var Te=/^key/,Ce=/^(?:mouse|pointer|contextmenu|drag|drop)|click/,Ee=/^([^.]*)(?:\.(.+)|)/;function ke(){return!0}function Se(){return!1}function Ne(e,t){return e===function(){try{return E.activeElement}catch(e){}}()==("focus"===t)}function Ae(e,t,n,r,i,o){var a,s;if("object"==typeof t){for(s in"string"!=typeof n&&(r=r||n,n=void 0),t)Ae(e,s,n,r,t[s],o);return e}if(null==r&&null==i?(i=n,r=n=void 0):null==i&&("string"==typeof n?(i=r,r=void 0):(i=r,r=n,n=void 0)),!1===i)i=Se;else if(!i)return e;return 1===o&&(a=i,(i=function(e){return k().off(e),a.apply(this,arguments)}).guid=a.guid||(a.guid=k.guid++)),e.each(function(){k.event.add(this,t,i,r,n)})}function De(e,i,o){o?(Q.set(e,i,!1),k.event.add(e,i,{namespace:!1,handler:function(e){var t,n,r=Q.get(this,i);if(1&e.isTrigger&&this[i]){if(r.length)(k.event.special[i]||{}).delegateType&&e.stopPropagation();else if(r=s.call(arguments),Q.set(this,i,r),t=o(this,i),this[i](),r!==(n=Q.get(this,i))||t?Q.set(this,i,!1):n={},r!==n)return e.stopImmediatePropagation(),e.preventDefault(),n.value}else r.length&&(Q.set(this,i,{value:k.event.trigger(k.extend(r[0],k.Event.prototype),r.slice(1),this)}),e.stopImmediatePropagation())}})):void 0===Q.get(e,i)&&k.event.add(e,i,ke)}k.event={global:{},add:function(t,e,n,r,i){var o,a,s,u,l,c,f,p,d,h,g,v=Q.get(t);if(v){n.handler&&(n=(o=n).handler,i=o.selector),i&&k.find.matchesSelector(ie,i),n.guid||(n.guid=k.guid++),(u=v.events)||(u=v.events={}),(a=v.handle)||(a=v.handle=function(e){return"undefined"!=typeof k&&k.event.triggered!==e.type?k.event.dispatch.apply(t,arguments):void 0}),l=(e=(e||"").match(R)||[""]).length;while(l--)d=g=(s=Ee.exec(e[l])||[])[1],h=(s[2]||"").split(".").sort(),d&&(f=k.event.special[d]||{},d=(i?f.delegateType:f.bindType)||d,f=k.event.special[d]||{},c=k.extend({type:d,origType:g,data:r,handler:n,guid:n.guid,selector:i,needsContext:i&&k.expr.match.needsContext.test(i),namespace:h.join(".")},o),(p=u[d])||((p=u[d]=[]).delegateCount=0,f.setup&&!1!==f.setup.call(t,r,h,a)||t.addEventListener&&t.addEventListener(d,a)),f.add&&(f.add.call(t,c),c.handler.guid||(c.handler.guid=n.guid)),i?p.splice(p.delegateCount++,0,c):p.push(c),k.event.global[d]=!0)}},remove:function(e,t,n,r,i){var o,a,s,u,l,c,f,p,d,h,g,v=Q.hasData(e)&&Q.get(e);if(v&&(u=v.events)){l=(t=(t||"").match(R)||[""]).length;while(l--)if(d=g=(s=Ee.exec(t[l])||[])[1],h=(s[2]||"").split(".").sort(),d){f=k.event.special[d]||{},p=u[d=(r?f.delegateType:f.bindType)||d]||[],s=s[2]&&new RegExp("(^|\\.)"+h.join("\\.(?:.*\\.|)")+"(\\.|$)"),a=o=p.length;while(o--)c=p[o],!i&&g!==c.origType||n&&n.guid!==c.guid||s&&!s.test(c.namespace)||r&&r!==c.selector&&("**"!==r||!c.selector)||(p.splice(o,1),c.selector&&p.delegateCount--,f.remove&&f.remove.call(e,c));a&&!p.length&&(f.teardown&&!1!==f.teardown.call(e,h,v.handle)||k.removeEvent(e,d,v.handle),delete u[d])}else for(d in u)k.event.remove(e,d+t[l],n,r,!0);k.isEmptyObject(u)&&Q.remove(e,"handle events")}},dispatch:function(e){var t,n,r,i,o,a,s=k.event.fix(e),u=new Array(arguments.length),l=(Q.get(this,"events")||{})[s.type]||[],c=k.event.special[s.type]||{};for(u[0]=s,t=1;t\x20\t\r\n\f]*)[^>]*)\/>/gi,qe=/\s*$/g;function Oe(e,t){return 
A(e,"table")&&A(11!==t.nodeType?t:t.firstChild,"tr")&&k(e).children("tbody")[0]||e}function Pe(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function Re(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function Me(e,t){var n,r,i,o,a,s,u,l;if(1===t.nodeType){if(Q.hasData(e)&&(o=Q.access(e),a=Q.set(t,o),l=o.events))for(i in delete a.handle,a.events={},l)for(n=0,r=l[i].length;n")},clone:function(e,t,n){var r,i,o,a,s,u,l,c=e.cloneNode(!0),f=oe(e);if(!(y.noCloneChecked||1!==e.nodeType&&11!==e.nodeType||k.isXMLDoc(e)))for(a=ve(c),r=0,i=(o=ve(e)).length;r").attr(n.scriptAttrs||{}).prop({charset:n.scriptCharset,src:n.url}).on("load error",i=function(e){r.remove(),i=null,e&&t("error"===e.type?404:200,e.type)}),E.head.appendChild(r[0])},abort:function(){i&&i()}}});var Vt,Gt=[],Yt=/(=)\?(?=&|$)|\?\?/;k.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var e=Gt.pop()||k.expando+"_"+kt++;return this[e]=!0,e}}),k.ajaxPrefilter("json jsonp",function(e,t,n){var r,i,o,a=!1!==e.jsonp&&(Yt.test(e.url)?"url":"string"==typeof e.data&&0===(e.contentType||"").indexOf("application/x-www-form-urlencoded")&&Yt.test(e.data)&&"data");if(a||"jsonp"===e.dataTypes[0])return r=e.jsonpCallback=m(e.jsonpCallback)?e.jsonpCallback():e.jsonpCallback,a?e[a]=e[a].replace(Yt,"$1"+r):!1!==e.jsonp&&(e.url+=(St.test(e.url)?"&":"?")+e.jsonp+"="+r),e.converters["script json"]=function(){return o||k.error(r+" was not called"),o[0]},e.dataTypes[0]="json",i=C[r],C[r]=function(){o=arguments},n.always(function(){void 0===i?k(C).removeProp(r):C[r]=i,e[r]&&(e.jsonpCallback=t.jsonpCallback,Gt.push(r)),o&&m(i)&&i(o[0]),o=i=void 0}),"script"}),y.createHTMLDocument=((Vt=E.implementation.createHTMLDocument("").body).innerHTML="
",2===Vt.childNodes.length),k.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(y.createHTMLDocument?((r=(t=E.implementation.createHTMLDocument("")).createElement("base")).href=E.location.href,t.head.appendChild(r)):t=E),o=!n&&[],(i=D.exec(e))?[t.createElement(i[1])]:(i=we([e],t,o),o&&o.length&&k(o).remove(),k.merge([],i.childNodes)));var r,i,o},k.fn.load=function(e,t,n){var r,i,o,a=this,s=e.indexOf(" ");return-1").append(k.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},k.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){k.fn[t]=function(e){return this.on(t,e)}}),k.expr.pseudos.animated=function(t){return k.grep(k.timers,function(e){return t===e.elem}).length},k.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=k.css(e,"position"),c=k(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=k.css(e,"top"),u=k.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,k.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):c.css(f)}},k.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){k.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var e,t,n,r=this[0],i={top:0,left:0};if("fixed"===k.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===k.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=k(e).offset()).top+=k.css(e,"borderTopWidth",!0),i.left+=k.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-k.css(r,"marginTop",!0),left:t.left-i.left-k.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===k.css(e,"position"))e=e.offsetParent;return e||ie})}}),k.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;k.fn[t]=function(e){return _(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),k.each(["top","left"],function(e,n){k.cssHooks[n]=ze(y.pixelPosition,function(e,t){if(t)return t=_e(e,n),$e.test(t)?k(e).position()[n]+"px":t})}),k.each({Height:"height",Width:"width"},function(a,s){k.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){k.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return _(this,function(e,t,n){var r;return x(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?k.css(e,t,i):k.style(e,t,n,i)},s,n?e:void 0,n)}})}),k.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){k.fn[n]=function(e,t){return 
0=this._config.preview;if(o)f.postMessage({results:n,workerId:b.WORKER_ID,finished:a});else if(q(this._config.chunk)&&!t){if(this._config.chunk(n,this._handle),this._handle.paused()||this._handle.aborted())return void(this._halted=!0);n=void 0,this._completeResults=void 0}return this._config.step||this._config.chunk||(this._completeResults.data=this._completeResults.data.concat(n.data),this._completeResults.errors=this._completeResults.errors.concat(n.errors),this._completeResults.meta=n.meta),this._completed||!a||!q(this._config.complete)||n&&n.meta.aborted||(this._config.complete(this._completeResults,this._input),this._completed=!0),a||n&&n.meta.paused||this._nextChunk(),n}this._halted=!0},this._sendError=function(e){q(this._config.error)?this._config.error(e):o&&this._config.error&&f.postMessage({workerId:b.WORKER_ID,error:e,finished:!1})}}function l(e){var i;(e=e||{}).chunkSize||(e.chunkSize=b.RemoteChunkSize),u.call(this,e),this._nextChunk=n?function(){this._readChunk(),this._chunkLoaded()}:function(){this._readChunk()},this.stream=function(e){this._input=e,this._nextChunk()},this._readChunk=function(){if(this._finished)this._chunkLoaded();else{if(i=new XMLHttpRequest,this._config.withCredentials&&(i.withCredentials=this._config.withCredentials),n||(i.onload=y(this._chunkLoaded,this),i.onerror=y(this._chunkError,this)),i.open("GET",this._input,!n),this._config.downloadRequestHeaders){var e=this._config.downloadRequestHeaders;for(var t in e)i.setRequestHeader(t,e[t])}if(this._config.chunkSize){var r=this._start+this._config.chunkSize-1;i.setRequestHeader("Range","bytes="+this._start+"-"+r)}try{i.send()}catch(e){this._chunkError(e.message)}n&&0===i.status?this._chunkError():this._start+=this._config.chunkSize}},this._chunkLoaded=function(){4===i.readyState&&(i.status<200||400<=i.status?this._chunkError():(this._finished=!this._config.chunkSize||this._start>function(e){var t=e.getResponseHeader("Content-Range");if(null===t)return-1;return parseInt(t.substr(t.lastIndexOf("/")+1))}(i),this.parseChunk(i.responseText)))},this._chunkError=function(e){var t=i.statusText||e;this._sendError(new Error(t))}}function c(e){var i,n;(e=e||{}).chunkSize||(e.chunkSize=b.LocalChunkSize),u.call(this,e);var s="undefined"!=typeof FileReader;this.stream=function(e){this._input=e,n=e.slice||e.webkitSlice||e.mozSlice,s?((i=new FileReader).onload=y(this._chunkLoaded,this),i.onerror=y(this._chunkError,this)):i=new FileReaderSync,this._nextChunk()},this._nextChunk=function(){this._finished||this._config.preview&&!(this._rowCount=this._input.size,this.parseChunk(e.target.result)},this._chunkError=function(){this._sendError(i.error)}}function p(e){var r;u.call(this,e=e||{}),this.stream=function(e){return r=e,this._nextChunk()},this._nextChunk=function(){if(!this._finished){var e=this._config.chunkSize,t=e?r.substr(0,e):r;return r=e?r.substr(e):"",this._finished=!r,this.parseChunk(t)}}}function m(e){u.call(this,e=e||{});var t=[],r=!0,i=!1;this.pause=function(){u.prototype.pause.apply(this,arguments),this._input.pause()},this.resume=function(){u.prototype.resume.apply(this,arguments),this._input.resume()},this.stream=function(e){this._input=e,this._input.on("data",this._streamData),this._input.on("end",this._streamEnd),this._input.on("error",this._streamError)},this._checkIsFinished=function(){i&&1===t.length&&(this._finished=!0)},this._nextChunk=function(){this._checkIsFinished(),t.length?this.parseChunk(t.shift()):r=!0},this._streamData=y(function(e){try{t.push("string"==typeof 
e?e:e.toString(this._config.encoding)),r&&(r=!1,this._checkIsFinished(),this.parseChunk(t.shift()))}catch(e){this._streamError(e)}},this),this._streamError=y(function(e){this._streamCleanUp(),this._sendError(e)},this),this._streamEnd=y(function(){this._streamCleanUp(),i=!0,this._streamData("")},this),this._streamCleanUp=y(function(){this._input.removeListener("data",this._streamData),this._input.removeListener("end",this._streamEnd),this._input.removeListener("error",this._streamError)},this)}function r(g){var a,o,h,i=Math.pow(2,53),n=-i,s=/^\s*-?(\d*\.?\d+|\d+\.?\d*)(e[-+]?\d+)?\s*$/i,u=/(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d\.\d+([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))|(\d{4}-[01]\d-[0-3]\dT[0-2]\d:[0-5]\d([+-][0-2]\d:[0-5]\d|Z))/,t=this,r=0,f=0,d=!1,e=!1,l=[],c={data:[],errors:[],meta:{}};if(q(g.step)){var p=g.step;g.step=function(e){if(c=e,_())m();else{if(m(),0===c.data.length)return;r+=e.data.length,g.preview&&r>g.preview?o.abort():p(c,t)}}}function v(e){return"greedy"===g.skipEmptyLines?""===e.join("").trim():1===e.length&&0===e[0].length}function m(){if(c&&h&&(k("Delimiter","UndetectableDelimiter","Unable to auto-detect delimiting character; defaulted to '"+b.DefaultDelimiter+"'"),h=!1),g.skipEmptyLines)for(var e=0;e=l.length?"__parsed_extra":l[r]),g.transform&&(s=g.transform(s,n)),s=y(n,s),"__parsed_extra"===n?(i[n]=i[n]||[],i[n].push(s)):i[n]=s}return g.header&&(r>l.length?k("FieldMismatch","TooManyFields","Too many fields: expected "+l.length+" fields but parsed "+r,f+t):r=i.length/2?"\r\n":"\r"}(e,i)),h=!1,g.delimiter)q(g.delimiter)&&(g.delimiter=g.delimiter(e),c.meta.delimiter=g.delimiter);else{var n=function(e,t,r,i,n){var s,a,o,h;n=n||[",","\t","|",";",b.RECORD_SEP,b.UNIT_SEP];for(var u=0;u=L)return R(!0)}else for(g=M,M++;;){if(-1===(g=a.indexOf(O,g+1)))return t||u.push({type:"Quotes",code:"MissingQuotes",message:"Quoted field unterminated",row:h.length,index:M}),w();if(g===i-1)return w(a.substring(M,g).replace(_,O));if(O!==z||a[g+1]!==z){if(O===z||0===g||a[g-1]!==z){var y=E(-1===m?p:Math.min(p,m));if(a[g+1+y]===D){f.push(a.substring(M,g).replace(_,O)),a[M=g+1+y+e]!==O&&(g=a.indexOf(O,M)),p=a.indexOf(D,M),m=a.indexOf(I,M);break}var k=E(m);if(a.substr(g+1+k,n)===I){if(f.push(a.substring(M,g).replace(_,O)),C(g+1+k+n),p=a.indexOf(D,M),g=a.indexOf(O,M),o&&(S(),j))return R();if(L&&h.length>=L)return R(!0);break}u.push({type:"Quotes",code:"InvalidQuotes",message:"Trailing quote on quoted field is malformed",row:h.length,index:M}),g++}}else g++}return w();function b(e){h.push(e),d=M}function E(e){var t=0;if(-1!==e){var r=a.substring(g+1,e);r&&""===r.trim()&&(t=r.length)}return t}function w(e){return t||(void 0===e&&(e=a.substr(M)),f.push(e),M=i,b(f),o&&S()),R()}function C(e){M=e,b(f),f=[],m=a.indexOf(I,M)}function R(e,t){return{data:t||!1?h[0]:h,errors:u,meta:{delimiter:D,linebreak:I,aborted:j,truncated:!!e,cursor:d+(r||0)}}}function S(){A(R(void 0,!0)),h=[],u=[]}function x(e,t,r){var i={nextDelim:void 0,quoteSearch:void 0},n=a.indexOf(O,t+1);if(t anyhow::Result<()> { .max_dbs(5) .open(opt.database)?; - let main = env.create_poly_database(None)?; - let postings_ids: Database = env.create_database(Some("postings-ids"))?; - let documents: Database, ByteSlice> = env.create_database(Some("documents"))?; + let index = Index::new(&env)?; + let before = Instant::now(); let rtxn = env.read_txn()?; - let headers = match main.get::<_, Str, ByteSlice>(&rtxn, "headers")? 
{ + + let documents_ids = index.search(&rtxn, &opt.query)?; + let headers = match index.headers(&rtxn)? { Some(headers) => headers, None => return Ok(()), }; - let fst = match main.get::<_, Str, ByteSlice>(&rtxn, "words-fst")? { - Some(bytes) => fst::Set::new(bytes)?, - None => return Ok(()), - }; - - // Building these factories is not free. - let lev0 = LevenshteinAutomatonBuilder::new(0, true); - let lev1 = LevenshteinAutomatonBuilder::new(1, true); - let lev2 = LevenshteinAutomatonBuilder::new(2, true); - - let words: Vec<_> = alphanumeric_tokens(&opt.query).collect(); - let number_of_words = words.len(); - let dfas = words.into_iter().enumerate().map(|(i, word)| { - let word = word.cow_to_lowercase(); - let is_last = i + 1 == number_of_words; - let dfa = match word.len() { - 0..=4 => if is_last { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) }, - 5..=8 => if is_last { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) }, - _ => if is_last { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) }, - }; - (word, dfa) - }); - - let before = Instant::now(); - let mut intersect_result: Option = None; - for (word, dfa) in dfas { - let before = Instant::now(); - let mut union_result = RoaringBitmap::default(); - let mut stream = fst.search(dfa).into_stream(); - while let Some(word) = stream.next() { - let word = std::str::from_utf8(word)?; - if let Some(ids) = postings_ids.get(&rtxn, word)? { - let right = RoaringBitmap::deserialize_from(ids)?; - union_result.union_with(&right); - } - } - eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); - - intersect_result = match intersect_result.take() { - Some(mut left) => { - let before = Instant::now(); - let left_len = left.len(); - left.intersect_with(&union_result); - eprintln!("intersect between {:?} and {:?} took {:.02?}", - left_len, union_result.len(), before.elapsed()); - Some(left) - }, - None => Some(union_result), - }; - } - let mut stdout = io::stdout(); stdout.write_all(&headers)?; - let total_length = intersect_result.as_ref().map_or(0, |x| x.len()); - for id in intersect_result.unwrap_or_default().iter().take(20) { - if let Some(content) = documents.get(&rtxn, &BEU32::new(id))? { + for id in &documents_ids { + if let Some(content) = index.documents.get(&rtxn, &BEU32::new(*id))? { stdout.write_all(&content)?; } } - eprintln!("Took {:.02?} to find {} documents", before.elapsed(), total_length); + eprintln!("Took {:.02?} to find {} documents", before.elapsed(), documents_ids.len()); Ok(()) } diff --git a/src/bin/serve.rs b/src/bin/serve.rs new file mode 100644 index 000000000..c631e024e --- /dev/null +++ b/src/bin/serve.rs @@ -0,0 +1,115 @@ +use std::net::SocketAddr; +use std::path::PathBuf; +use std::str::FromStr; +use std::time::Instant; + +use heed::EnvOpenOptions; +use serde::Deserialize; +use structopt::StructOpt; +use warp::{Filter, http::Response}; + +use mega_mini_indexer::{BEU32, Index}; + +#[derive(Debug, StructOpt)] +#[structopt(name = "mmi", about = "The server side of the mmi project.")] +struct Opt { + /// The database path where the LMDB database is located. + /// It is created if it doesn't already exist. + #[structopt(long = "db", parse(from_os_str))] + database: PathBuf, + + /// The maximum size the database can take on disk. It is recommended to specify + /// the whole disk space (value must be a multiple of a page size). 
+ #[structopt(long = "db-size", default_value = "107374182400")] // 100 GB + database_size: usize, + + /// The ip and port on which the database will listen for HTTP requests. + #[structopt(short = "l", long, default_value = "127.0.0.1:9700")] + http_listen_addr: String, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let opt = Opt::from_args(); + + std::fs::create_dir_all(&opt.database)?; + let env = EnvOpenOptions::new() + .map_size(opt.database_size) + .max_dbs(10) + .open(&opt.database)?; + + let index = Index::new(&env)?; + + // We run and wait on the HTTP server + + // Expose an HTML page to debug the search in a browser + let dash_html_route = warp::filters::method::get() + .and(warp::filters::path::end()) + .map(|| warp::reply::html(include_str!("../../public/index.html"))); + + let dash_bulma_route = warp::filters::method::get() + .and(warp::path!("bulma.min.css")) + .map(|| Response::builder() + .header("content-type", "text/css; charset=utf-8") + .body(include_str!("../../public/bulma.min.css")) + ); + + let dash_jquery_route = warp::filters::method::get() + .and(warp::path!("jquery-3.4.1.min.js")) + .map(|| Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + .body(include_str!("../../public/jquery-3.4.1.min.js")) + ); + + let dash_papaparse_route = warp::filters::method::get() + .and(warp::path!("papaparse.min.js")) + .map(|| Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + .body(include_str!("../../public/papaparse.min.js")) + ); + + #[derive(Deserialize)] + struct QueryBody { + query: String, + } + + let env_cloned = env.clone(); + let query_route = warp::filters::method::post() + .and(warp::path!("query")) + .and(warp::body::json()) + .map(move |query: QueryBody| { + let before_search = Instant::now(); + let rtxn = env_cloned.read_txn().unwrap(); + + let documents_ids = index.search(&rtxn, &query.query).unwrap(); + + let mut body = Vec::new(); + if let Some(headers) = index.headers(&rtxn).unwrap() { + // We write the headers + body.extend_from_slice(headers); + + for id in documents_ids { + if let Some(content) = index.documents.get(&rtxn, &BEU32::new(id)).unwrap() { + body.extend_from_slice(&content); + } + } + } + + Response::builder() + .header("Content-Type", "text/csv") + .header("Time-Ms", before_search.elapsed().as_millis().to_string()) + .body(String::from_utf8(body).unwrap()) + }); + + let routes = dash_html_route + .or(dash_bulma_route) + .or(dash_jquery_route) + .or(dash_papaparse_route) + .or(query_route); + + let addr = SocketAddr::from_str(&opt.http_listen_addr).unwrap(); + eprintln!("listening on http://{}", addr); + warp::serve(routes).run(addr).await; + + Ok(()) +} diff --git a/src/lib.rs b/src/lib.rs index 9e5baf1fd..01bc92df9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,14 @@ use std::collections::HashMap; use std::hash::BuildHasherDefault; +use std::time::Instant; +use cow_utils::CowUtils; +use fst::{IntoStreamer, Streamer}; use fxhash::FxHasher32; +use heed::types::*; +use heed::{PolyDatabase, Database}; +use levenshtein_automata::LevenshteinAutomatonBuilder; +use roaring::RoaringBitmap; use slice_group_by::StrGroupBy; pub type FastMap4 = HashMap>; @@ -14,3 +21,82 @@ pub fn alphanumeric_tokens(string: &str) -> impl Iterator { let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric); string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric) } + +#[derive(Clone)] +pub struct Index { + pub main: PolyDatabase, + 
pub postings_ids: Database, + pub documents: Database, ByteSlice>, +} + +impl Index { + pub fn new(env: &heed::Env) -> heed::Result { + let main = env.create_poly_database(None)?; + let postings_ids = env.create_database(Some("postings-ids"))?; + let documents = env.create_database(Some("documents"))?; + + Ok(Index { + main, + postings_ids, + documents, + }) + } + + pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result> { + self.main.get::<_, Str, ByteSlice>(rtxn, "headers") + } + + pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result> { + let fst = match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? { + Some(bytes) => fst::Set::new(bytes)?, + None => return Ok(Vec::new()), + }; + + // Building these factories is not free. + let lev0 = LevenshteinAutomatonBuilder::new(0, true); + let lev1 = LevenshteinAutomatonBuilder::new(1, true); + let lev2 = LevenshteinAutomatonBuilder::new(2, true); + + let words: Vec<_> = alphanumeric_tokens(query).collect(); + let number_of_words = words.len(); + let dfas = words.into_iter().enumerate().map(|(i, word)| { + let word = word.cow_to_lowercase(); + let is_last = i + 1 == number_of_words; + let dfa = match word.len() { + 0..=4 => if is_last { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) }, + 5..=8 => if is_last { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) }, + _ => if is_last { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) }, + }; + (word, dfa) + }); + + let mut intersect_result: Option = None; + for (word, dfa) in dfas { + let before = Instant::now(); + let mut union_result = RoaringBitmap::default(); + let mut stream = fst.search(dfa).into_stream(); + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word)?; + if let Some(ids) = self.postings_ids.get(rtxn, word)? 
{ + let right = RoaringBitmap::deserialize_from(ids)?; + union_result.union_with(&right); + } + } + eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); + + intersect_result = match intersect_result.take() { + Some(mut left) => { + let before = Instant::now(); + let left_len = left.len(); + left.intersect_with(&union_result); + eprintln!("intersect between {:?} and {:?} took {:.02?}", + left_len, union_result.len(), before.elapsed()); + Some(left) + }, + None => Some(union_result), + }; + } + + Ok(intersect_result.unwrap_or_default().iter().take(20).collect()) + } +} From dde3e01a599320f88cb77aa6a238fa0e3af5a183 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 31 May 2020 18:20:49 +0200 Subject: [PATCH 0013/1889] Introduce prefix postings ids for better perfs --- src/bin/indexer.rs | 67 +++++++++++++++++++++++++++++----------------- src/lib.rs | 24 ++++++++++++----- 2 files changed, 59 insertions(+), 32 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index a7d2c01f1..17689823c 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -7,15 +7,15 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use anyhow::Context; use cow_utils::CowUtils; use fst::{Streamer, IntoStreamer}; +use heed::EnvOpenOptions; use heed::types::*; -use heed::{EnvOpenOptions, PolyDatabase, Database}; use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions}; use rayon::prelude::*; use roaring::RoaringBitmap; use structopt::StructOpt; use mega_mini_indexer::alphanumeric_tokens; -use mega_mini_indexer::{FastMap4, SmallVec32, BEU32, DocumentId}; +use mega_mini_indexer::{FastMap4, SmallVec32, BEU32, Index, DocumentId}; #[cfg(target_os = "linux")] #[global_allocator] @@ -38,6 +38,7 @@ struct Opt { struct Indexed { fst: fst::Set>, postings_ids: FastMap4, + prefix_postings_ids: FastMap4, headers: Vec, documents: Vec<(DocumentId, Vec)>, } @@ -69,8 +70,21 @@ impl MtblKvStore { } } - // postings ids keys are all prefixed by a '2' + // We must write the prefix postings ids key[0] = 2; + let mut stream = indexed.fst.stream(); + while let Some(word) = stream.next() { + key.truncate(1); + key.extend_from_slice(word); + if let Some(ids) = indexed.prefix_postings_ids.remove(word) { + buffer.clear(); + ids.serialize_into(&mut buffer)?; + out.add(&key, &buffer).unwrap(); + } + } + + // postings ids keys are all prefixed by a '2' + key[0] = 3; indexed.documents.sort_unstable(); for (id, content) in indexed.documents { key.truncate(1); @@ -115,7 +129,7 @@ impl MtblKvStore { assert_eq!(left, right); Some(left.to_vec()) } - else if key.starts_with(&[1]) { + else if key.starts_with(&[1]) || key.starts_with(&[2]) { let mut left = RoaringBitmap::deserialize_from(left).unwrap(); let right = RoaringBitmap::deserialize_from(right).unwrap(); left.union_with(&right); @@ -123,7 +137,7 @@ impl MtblKvStore { left.serialize_into(&mut vec).unwrap(); Some(vec) } - else if key.starts_with(&[2]) { + else if key.starts_with(&[3]) { assert_eq!(left, right); Some(left.to_vec()) } @@ -155,6 +169,7 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { let mut document = csv::StringRecord::new(); let mut postings_ids = FastMap4::default(); + let mut prefix_postings_ids = FastMap4::default(); let mut documents = Vec::new(); // Write the headers into a Vec of bytes. 
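
// An aside: a self-contained sketch of the union-then-intersect strategy that
// `Index::search` above implements. For every query word, the posting lists of
// all of its candidate spellings are unioned, and the per-word unions are then
// intersected. Only the `roaring` crate is assumed; the hard-coded candidate
// lists stand in for the FST/Levenshtein-DFA lookups, which are elided here.
use roaring::RoaringBitmap;

fn union_then_intersect(candidates_per_word: &[Vec<RoaringBitmap>]) -> RoaringBitmap {
    let mut intersect_result: Option<RoaringBitmap> = None;
    for candidates in candidates_per_word {
        // Documents matching *any* candidate spelling of this word.
        let mut union_result = RoaringBitmap::new();
        for docids in candidates {
            union_result.union_with(docids);
        }
        // Documents matching *all* of the query words seen so far.
        intersect_result = match intersect_result.take() {
            Some(mut left) => {
                left.intersect_with(&union_result);
                Some(left)
            }
            None => Some(union_result),
        };
    }
    intersect_result.unwrap_or_default()
}
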
@@ -174,6 +189,11 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { postings_ids.entry(SmallVec32::from(word.as_bytes())) .or_insert_with(RoaringBitmap::new) .insert(document_id); + if let Some(prefix) = word.as_bytes().get(0..word.len().min(4)) { + prefix_postings_ids.entry(SmallVec32::from(prefix)) + .or_insert_with(RoaringBitmap::new) + .insert(document_id); + } } } } @@ -185,7 +205,7 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { documents.push((document_id, document)); } - // We compute and store the postings list into the DB. + // We store the words from the postings. let mut new_words = BTreeSet::default(); for (word, _new_ids) in &postings_ids { new_words.insert(word.clone()); @@ -193,20 +213,13 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallVec32::as_ref))?; - let indexed = Indexed { fst: new_words_fst, headers, postings_ids, documents }; + let indexed = Indexed { fst: new_words_fst, headers, postings_ids, prefix_postings_ids, documents }; MtblKvStore::from_indexed(indexed) } // TODO merge with the previous values -fn writer( - wtxn: &mut heed::RwTxn, - main: PolyDatabase, - postings_ids: Database, - documents: Database, ByteSlice>, - mtbl_store: MtblKvStore, -) -> anyhow::Result -{ +fn writer(wtxn: &mut heed::RwTxn, index: Index, mtbl_store: MtblKvStore) -> anyhow::Result { let mtbl_store = match mtbl_store.0 { Some(store) => unsafe { memmap::Mmap::map(&store)? }, None => return Ok(0), @@ -216,25 +229,32 @@ fn writer( // Write the words fst let fst = mtbl_store.get(b"\0words-fst").unwrap(); let fst = fst::Set::new(fst)?; - main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &fst.as_fst().as_bytes())?; + index.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &fst.as_fst().as_bytes())?; // Write and merge the headers let headers = mtbl_store.get(b"\0headers").unwrap(); - main.put::<_, Str, ByteSlice>(wtxn, "headers", headers.as_ref())?; + index.main.put::<_, Str, ByteSlice>(wtxn, "headers", headers.as_ref())?; // Write and merge the postings lists let mut iter = mtbl_store.iter_prefix(&[1]).unwrap(); while let Some((word, postings)) = iter.next() { let word = std::str::from_utf8(&word[1..]).unwrap(); - postings_ids.put(wtxn, &word, &postings)?; + index.postings_ids.put(wtxn, &word, &postings)?; + } + + // Write and merge the prefix postings lists + let mut iter = mtbl_store.iter_prefix(&[2]).unwrap(); + while let Some((word, postings)) = iter.next() { + let word = std::str::from_utf8(&word[1..]).unwrap(); + index.prefix_postings_ids.put(wtxn, &word, &postings)?; } // Write the documents let mut count = 0; - let mut iter = mtbl_store.iter_prefix(&[2]).unwrap(); + let mut iter = mtbl_store.iter_prefix(&[3]).unwrap(); while let Some((id_bytes, content)) = iter.next() { let id = id_bytes[1..].try_into().map(u32::from_be_bytes).unwrap(); - documents.put(wtxn, &BEU32::new(id), &content)?; + index.documents.put(wtxn, &BEU32::new(id), &content)?; count += 1; } @@ -251,10 +271,7 @@ fn main() -> anyhow::Result<()> { .max_dbs(5) .open(opt.database)?; - let main = env.create_poly_database(None)?; - let postings_ids: Database = env.create_database(Some("postings-ids"))?; - let documents: Database, ByteSlice> = env.create_database(Some("documents"))?; - + let index = Index::new(&env)?; let res = opt.files_to_index .into_par_iter() .try_fold(MtblKvStore::default, |acc, path| { @@ -271,7 +288,7 @@ fn main() -> anyhow::Result<()> { eprintln!("We are writing into LMDB..."); let mut wtxn = env.write_txn()?; - 
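
// An aside: a standalone sketch of the prefix postings this patch introduces
// in `index_csv`. Each document id is also recorded under every prefix of the
// word (capped at 4 bytes here, as in the patch), so a short query word can
// later be answered from one precomputed list instead of a words-FST scan.
// A plain BTreeMap stands in for FastMap4/SmallVec32; only the `roaring`
// crate is assumed.
use std::collections::BTreeMap;
use roaring::RoaringBitmap;

fn index_word_prefixes(
    prefix_postings_ids: &mut BTreeMap<Vec<u8>, RoaringBitmap>,
    word: &str,
    document_id: u32,
) {
    let bytes = word.as_bytes();
    for len in 1..=bytes.len().min(4) {
        prefix_postings_ids
            .entry(bytes[..len].to_vec())
            .or_insert_with(RoaringBitmap::new)
            .insert(document_id);
    }
}
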
let count = writer(&mut wtxn, main, postings_ids, documents, mtbl_store)?; + let count = writer(&mut wtxn, index, mtbl_store)?; wtxn.commit()?; eprintln!("Wrote {} documents into LMDB", count); diff --git a/src/lib.rs b/src/lib.rs index 01bc92df9..248e360cf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,6 +26,7 @@ pub fn alphanumeric_tokens(string: &str) -> impl Iterator { pub struct Index { pub main: PolyDatabase, pub postings_ids: Database, + pub prefix_postings_ids: Database, pub documents: Database, ByteSlice>, } @@ -33,11 +34,13 @@ impl Index { pub fn new(env: &heed::Env) -> heed::Result { let main = env.create_poly_database(None)?; let postings_ids = env.create_database(Some("postings-ids"))?; + let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?; let documents = env.create_database(Some("documents"))?; Ok(Index { main, postings_ids, + prefix_postings_ids, documents, }) } @@ -73,16 +76,23 @@ impl Index { let mut intersect_result: Option = None; for (word, dfa) in dfas { let before = Instant::now(); + let mut union_result = RoaringBitmap::default(); - let mut stream = fst.search(dfa).into_stream(); - while let Some(word) = stream.next() { - let word = std::str::from_utf8(word)?; - if let Some(ids) = self.postings_ids.get(rtxn, word)? { - let right = RoaringBitmap::deserialize_from(ids)?; - union_result.union_with(&right); + if word.len() <= 4 { + if let Some(ids) = self.prefix_postings_ids.get(rtxn, &word[..word.len().min(4)])? { + union_result = RoaringBitmap::deserialize_from(ids)?; } + } else { + let mut stream = fst.search(dfa).into_stream(); + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word)?; + if let Some(ids) = self.postings_ids.get(rtxn, word)? { + let right = RoaringBitmap::deserialize_from(ids)?; + union_result.union_with(&right); + } + } + eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); } - eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); intersect_result = match intersect_result.take() { Some(mut left) => { From dff68a339a37a2b00c162238530a6d64bbf511e5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 31 May 2020 18:21:24 +0200 Subject: [PATCH 0014/1889] Use OnceCell to cache levenshtein builders --- Cargo.lock | 1 + Cargo.toml | 1 + src/bin/indexer.rs | 8 +++++--- src/lib.rs | 22 +++++++++++----------- 4 files changed, 18 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6acef59fe..be5640e22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -793,6 +793,7 @@ dependencies = [ "jemallocator", "levenshtein_automata", "memmap", + "once_cell", "oxidized-mtbl", "rayon", "roaring", diff --git a/Cargo.toml b/Cargo.toml index 5087f59bc..f3b09dc56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ heed = { version = "0.8.0", default-features = false, features = ["lmdb"] } jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } memmap = "0.7.0" +once_cell = "1.4.0" oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "8918476" } rayon = "1.3.0" roaring = "0.5.2" diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 17689823c..c410e7e99 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -190,9 +190,11 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { .or_insert_with(RoaringBitmap::new) .insert(document_id); if let Some(prefix) = word.as_bytes().get(0..word.len().min(4)) { - prefix_postings_ids.entry(SmallVec32::from(prefix)) - .or_insert_with(RoaringBitmap::new) - 
.insert(document_id); + for i in 0..prefix.len() { + prefix_postings_ids.entry(SmallVec32::from(&prefix[..i])) + .or_insert_with(RoaringBitmap::new) + .insert(document_id); + } } } } diff --git a/src/lib.rs b/src/lib.rs index 248e360cf..f5de08980 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,10 +7,15 @@ use fst::{IntoStreamer, Streamer}; use fxhash::FxHasher32; use heed::types::*; use heed::{PolyDatabase, Database}; -use levenshtein_automata::LevenshteinAutomatonBuilder; +use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; +use once_cell::sync::OnceCell; use roaring::RoaringBitmap; use slice_group_by::StrGroupBy; +static LEVDIST0: OnceCell = OnceCell::new(); +static LEVDIST1: OnceCell = OnceCell::new(); +static LEVDIST2: OnceCell = OnceCell::new(); + pub type FastMap4 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>; @@ -37,12 +42,7 @@ impl Index { let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?; let documents = env.create_database(Some("documents"))?; - Ok(Index { - main, - postings_ids, - prefix_postings_ids, - documents, - }) + Ok(Index { main, postings_ids, prefix_postings_ids, documents }) } pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result> { @@ -56,9 +56,9 @@ impl Index { }; // Building these factories is not free. - let lev0 = LevenshteinAutomatonBuilder::new(0, true); - let lev1 = LevenshteinAutomatonBuilder::new(1, true); - let lev2 = LevenshteinAutomatonBuilder::new(2, true); + let lev0 = LEVDIST0.get_or_init(|| LevBuilder::new(0, true)); + let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true)); + let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true)); let words: Vec<_> = alphanumeric_tokens(query).collect(); let number_of_words = words.len(); @@ -91,8 +91,8 @@ impl Index { union_result.union_with(&right); } } - eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); } + eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); intersect_result = match intersect_result.take() { Some(mut left) => { From 5404776f7a56c851a85eb86c0905cbc09807f119 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 1 Jun 2020 17:52:43 +0200 Subject: [PATCH 0015/1889] Add a little bit more debug --- src/bin/indexer.rs | 31 ++++++++++++++++++++----------- src/lib.rs | 11 ++++++++--- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index c410e7e99..c88e014a7 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -48,6 +48,8 @@ struct MtblKvStore(Option); impl MtblKvStore { fn from_indexed(mut indexed: Indexed) -> anyhow::Result { + eprintln!("{:?}: Creating an MTBL store from an Indexed...", rayon::current_thread_index()); + let outfile = tempfile::tempfile()?; let mut out = Writer::new(outfile, None)?; @@ -73,10 +75,10 @@ impl MtblKvStore { // We must write the prefix postings ids key[0] = 2; let mut stream = indexed.fst.stream(); - while let Some(word) = stream.next() { + while let Some(prefix) = stream.next() { key.truncate(1); - key.extend_from_slice(word); - if let Some(ids) = indexed.prefix_postings_ids.remove(word) { + key.extend_from_slice(prefix); + if let Some(ids) = indexed.prefix_postings_ids.remove(prefix) { buffer.clear(); ids.serialize_into(&mut buffer)?; out.add(&key, &buffer).unwrap(); @@ -93,10 +95,14 @@ impl MtblKvStore { } let out = out.into_inner()?; + + eprintln!("{:?}: MTBL store created!", rayon::current_thread_index()); Ok(MtblKvStore(Some(out))) } fn 
merge_with(self, other: MtblKvStore) -> anyhow::Result { + eprintln!("{:?}: Merging two MTBL stores...", rayon::current_thread_index()); + let (left, right) = match (self.0, other.0) { (Some(left), Some(right)) => (left, right), (Some(left), None) => return Ok(MtblKvStore(Some(left))), @@ -159,11 +165,15 @@ impl MtblKvStore { } let out = out.into_inner()?; + + eprintln!("{:?}: MTBL stores merged!", rayon::current_thread_index()); Ok(MtblKvStore(Some(out))) } } fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { + eprintln!("{:?}: Indexing into an Indexed...", rayon::current_thread_index()); + const MAX_POSITION: usize = 1000; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; @@ -189,8 +199,8 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { postings_ids.entry(SmallVec32::from(word.as_bytes())) .or_insert_with(RoaringBitmap::new) .insert(document_id); - if let Some(prefix) = word.as_bytes().get(0..word.len().min(4)) { - for i in 0..prefix.len() { + if let Some(prefix) = word.as_bytes().get(0..word.len().min(5)) { + for i in 0..=prefix.len() { prefix_postings_ids.entry(SmallVec32::from(&prefix[..i])) .or_insert_with(RoaringBitmap::new) .insert(document_id); @@ -216,6 +226,7 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallVec32::as_ref))?; let indexed = Indexed { fst: new_words_fst, headers, postings_ids, prefix_postings_ids, documents }; + eprintln!("{:?}: Indexed created!", rayon::current_thread_index()); MtblKvStore::from_indexed(indexed) } @@ -274,19 +285,17 @@ fn main() -> anyhow::Result<()> { .open(opt.database)?; let index = Index::new(&env)?; - let res = opt.files_to_index + let mtbl_store = opt.files_to_index .into_par_iter() .try_fold(MtblKvStore::default, |acc, path| { let rdr = csv::Reader::from_path(path)?; - let mtbl_store = index_csv(rdr)?; - acc.merge_with(mtbl_store) + let store = index_csv(rdr)?; + acc.merge_with(store) }) .inspect(|_| { eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)) }) - .try_reduce(MtblKvStore::default, MtblKvStore::merge_with); - - let mtbl_store = res?; + .try_reduce(MtblKvStore::default, MtblKvStore::merge_with)?; eprintln!("We are writing into LMDB..."); let mut wtxn = env.write_txn()?; diff --git a/src/lib.rs b/src/lib.rs index f5de08980..c250d455f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,18 +79,21 @@ impl Index { let mut union_result = RoaringBitmap::default(); if word.len() <= 4 { - if let Some(ids) = self.prefix_postings_ids.get(rtxn, &word[..word.len().min(4)])? { + if let Some(ids) = self.prefix_postings_ids.get(rtxn, &word[..word.len().min(5)])? { union_result = RoaringBitmap::deserialize_from(ids)?; } } else { + let mut count = 0; let mut stream = fst.search(dfa).into_stream(); while let Some(word) = stream.next() { + count += 1; let word = std::str::from_utf8(word)?; if let Some(ids) = self.postings_ids.get(rtxn, word)? 
{ let right = RoaringBitmap::deserialize_from(ids)?; union_result.union_with(&right); } } + eprint!("with {:?} words ", count); } eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); @@ -99,14 +102,16 @@ impl Index { let before = Instant::now(); let left_len = left.len(); left.intersect_with(&union_result); - eprintln!("intersect between {:?} and {:?} took {:.02?}", - left_len, union_result.len(), before.elapsed()); + eprintln!("intersect between {:?} and {:?} gives {:?} took {:.02?}", + left_len, union_result.len(), left.len(), before.elapsed()); Some(left) }, None => Some(union_result), }; } + eprintln!("{} candidates", intersect_result.as_ref().map_or(0, |r| r.len())); + Ok(intersect_result.unwrap_or_default().iter().take(20).collect()) } } From 6a047519f6df8c8bc3414f39ed69637fbcf62c2d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 1 Jun 2020 18:27:26 +0200 Subject: [PATCH 0016/1889] Do a merge two by two --- src/bin/indexer.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index c88e014a7..eb5916091 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -285,7 +285,8 @@ fn main() -> anyhow::Result<()> { .open(opt.database)?; let index = Index::new(&env)?; - let mtbl_store = opt.files_to_index + + let mut stores: Vec<_> = opt.files_to_index .into_par_iter() .try_fold(MtblKvStore::default, |acc, path| { let rdr = csv::Reader::from_path(path)?; @@ -295,7 +296,20 @@ fn main() -> anyhow::Result<()> { .inspect(|_| { eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)) }) - .try_reduce(MtblKvStore::default, MtblKvStore::merge_with)?; + .collect::>()?; + + while stores.len() >= 1 { + let s = std::mem::take(&mut stores); + stores = s.into_par_iter().chunks(2).map(|mut v| { + match (v.pop(), v.pop()) { + (Some(a), Some(b)) => a.merge_with(b), + (Some(a), _) => Ok(a), + _ => unreachable!(), + } + }).collect::>()?; + } + + let mtbl_store = stores.pop().unwrap_or_default(); eprintln!("We are writing into LMDB..."); let mut wtxn = env.write_txn()?; From 5cc81a01793d3d71d4f933fd123e6aed982ae661 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 1 Jun 2020 18:39:58 +0200 Subject: [PATCH 0017/1889] Merge many MTBL into one a the same time --- src/bin/indexer.rs | 114 +++++++++++++++++++-------------------------- 1 file changed, 48 insertions(+), 66 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index eb5916091..270d24f9c 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -100,63 +100,57 @@ impl MtblKvStore { Ok(MtblKvStore(Some(out))) } - fn merge_with(self, other: MtblKvStore) -> anyhow::Result { - eprintln!("{:?}: Merging two MTBL stores...", rayon::current_thread_index()); + fn merge(key: &[u8], left: &[u8], right: &[u8]) -> Option> { + if key == b"\0words-fst" { + let left_fst = fst::Set::new(left).unwrap(); + let right_fst = fst::Set::new(right).unwrap(); - let (left, right) = match (self.0, other.0) { - (Some(left), Some(right)) => (left, right), - (Some(left), None) => return Ok(MtblKvStore(Some(left))), - (None, Some(right)) => return Ok(MtblKvStore(Some(right))), - (None, None) => return Ok(MtblKvStore(None)), - }; + // Union of the two FSTs + let op = fst::set::OpBuilder::new() + .add(left_fst.into_stream()) + .add(right_fst.into_stream()) + .r#union(); - let left = unsafe { memmap::Mmap::map(&left)? }; - let right = unsafe { memmap::Mmap::map(&right)? 
}; - - let left = Reader::new(&left, ReaderOptions::default()).unwrap(); - let right = Reader::new(&right, ReaderOptions::default()).unwrap(); - - fn merge(key: &[u8], left: &[u8], right: &[u8]) -> Option> { - if key == b"\0words-fst" { - let left_fst = fst::Set::new(left).unwrap(); - let right_fst = fst::Set::new(right).unwrap(); - - // Union of the two FSTs - let op = fst::set::OpBuilder::new() - .add(left_fst.into_stream()) - .add(right_fst.into_stream()) - .r#union(); - - let mut build = fst::SetBuilder::memory(); - build.extend_stream(op.into_stream()).unwrap(); - Some(build.into_inner().unwrap()) - } - else if key == b"\0headers" { - assert_eq!(left, right); - Some(left.to_vec()) - } - else if key.starts_with(&[1]) || key.starts_with(&[2]) { - let mut left = RoaringBitmap::deserialize_from(left).unwrap(); - let right = RoaringBitmap::deserialize_from(right).unwrap(); - left.union_with(&right); - let mut vec = Vec::new(); - left.serialize_into(&mut vec).unwrap(); - Some(vec) - } - else if key.starts_with(&[3]) { - assert_eq!(left, right); - Some(left.to_vec()) - } - else { - panic!("wut? {:?}", key) - } + let mut build = fst::SetBuilder::memory(); + build.extend_stream(op.into_stream()).unwrap(); + Some(build.into_inner().unwrap()) } + else if key == b"\0headers" { + assert_eq!(left, right); + Some(left.to_vec()) + } + else if key.starts_with(&[1]) || key.starts_with(&[2]) { + let mut left = RoaringBitmap::deserialize_from(left).unwrap(); + let right = RoaringBitmap::deserialize_from(right).unwrap(); + left.union_with(&right); + let mut vec = Vec::new(); + left.serialize_into(&mut vec).unwrap(); + Some(vec) + } + else if key.starts_with(&[3]) { + assert_eq!(left, right); + Some(left.to_vec()) + } + else { + panic!("wut? {:?}", key) + } + } + + fn from_many(stores: Vec) -> anyhow::Result { + eprintln!("{:?}: Merging {} MTBL stores...", rayon::current_thread_index(), stores.len()); + + let mmaps: Vec<_> = stores.iter().flat_map(|m| { + m.0.as_ref().map(|f| unsafe { memmap::Mmap::map(f).unwrap() }) + }).collect(); + + let sources = mmaps.iter().map(|mmap| { + Reader::new(&mmap, ReaderOptions::default()).unwrap() + }).collect(); let outfile = tempfile::tempfile()?; let mut out = Writer::new(outfile, None)?; - let sources = vec![left, right]; - let opt = MergerOptions { merge }; + let opt = MergerOptions { merge: MtblKvStore::merge }; let mut merger = Merger::new(sources, opt); let mut iter = merger.iter(); @@ -286,30 +280,18 @@ fn main() -> anyhow::Result<()> { let index = Index::new(&env)?; - let mut stores: Vec<_> = opt.files_to_index + let stores: Vec<_> = opt.files_to_index .into_par_iter() - .try_fold(MtblKvStore::default, |acc, path| { + .map(|path| { let rdr = csv::Reader::from_path(path)?; - let store = index_csv(rdr)?; - acc.merge_with(store) + index_csv(rdr) }) .inspect(|_| { eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)) }) .collect::>()?; - while stores.len() >= 1 { - let s = std::mem::take(&mut stores); - stores = s.into_par_iter().chunks(2).map(|mut v| { - match (v.pop(), v.pop()) { - (Some(a), Some(b)) => a.merge_with(b), - (Some(a), _) => Ok(a), - _ => unreachable!(), - } - }).collect::>()?; - } - - let mtbl_store = stores.pop().unwrap_or_default(); + let mtbl_store = MtblKvStore::from_many(stores)?; eprintln!("We are writing into LMDB..."); let mut wtxn = env.write_txn()?; From 217404299433ba1681d38c979afb3555b5809d47 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 1 Jun 2020 19:49:58 +0200 Subject: [PATCH 0018/1889] 
Merge only 3 MTBL at the same time --- src/bin/indexer.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 270d24f9c..98c279d85 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -280,7 +280,7 @@ fn main() -> anyhow::Result<()> { let index = Index::new(&env)?; - let stores: Vec<_> = opt.files_to_index + let mut stores: Vec<_> = opt.files_to_index .into_par_iter() .map(|path| { let rdr = csv::Reader::from_path(path)?; @@ -291,7 +291,14 @@ fn main() -> anyhow::Result<()> { }) .collect::>()?; - let mtbl_store = MtblKvStore::from_many(stores)?; + while stores.len() > 1 { + let s = std::mem::take(&mut stores); + stores = s.into_par_iter().chunks(3) + .map(MtblKvStore::from_many) + .collect::>()?; + } + + let mtbl_store = stores.pop().unwrap_or_default(); eprintln!("We are writing into LMDB..."); let mut wtxn = env.write_txn()?; From 1df1f88fe1bb51677f2728061de2bcabc8c532e5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 1 Jun 2020 21:09:32 +0200 Subject: [PATCH 0019/1889] Directly write to LMDB without intermediate final MTBL --- src/bin/indexer.rs | 97 ++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 54 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 98c279d85..bed058dfc 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -1,5 +1,5 @@ use std::collections::BTreeSet; -use std::convert::{TryInto, TryFrom}; +use std::convert::TryFrom; use std::fs::File; use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -15,7 +15,7 @@ use roaring::RoaringBitmap; use structopt::StructOpt; use mega_mini_indexer::alphanumeric_tokens; -use mega_mini_indexer::{FastMap4, SmallVec32, BEU32, Index, DocumentId}; +use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId}; #[cfg(target_os = "linux")] #[global_allocator] @@ -136,7 +136,9 @@ impl MtblKvStore { } } - fn from_many(stores: Vec) -> anyhow::Result { + fn from_many(stores: Vec, mut f: F) -> anyhow::Result<()> + where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()> + { eprintln!("{:?}: Merging {} MTBL stores...", rayon::current_thread_index(), stores.len()); let mmaps: Vec<_> = stores.iter().flat_map(|m| { @@ -147,21 +149,16 @@ impl MtblKvStore { Reader::new(&mmap, ReaderOptions::default()).unwrap() }).collect(); - let outfile = tempfile::tempfile()?; - let mut out = Writer::new(outfile, None)?; - let opt = MergerOptions { merge: MtblKvStore::merge }; let mut merger = Merger::new(sources, opt); let mut iter = merger.iter(); while let Some((k, v)) = iter.next() { - out.add(k, v).unwrap(); + (f)(k, v)?; } - let out = out.into_inner()?; - eprintln!("{:?}: MTBL stores merged!", rayon::current_thread_index()); - Ok(MtblKvStore(Some(out))) + Ok(()) } } @@ -226,46 +223,32 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result { } // TODO merge with the previous values -fn writer(wtxn: &mut heed::RwTxn, index: Index, mtbl_store: MtblKvStore) -> anyhow::Result { - let mtbl_store = match mtbl_store.0 { - Some(store) => unsafe { memmap::Mmap::map(&store)? 
}, - None => return Ok(0), - }; - let mtbl_store = Reader::new(&mtbl_store, ReaderOptions::default()).unwrap(); - - // Write the words fst - let fst = mtbl_store.get(b"\0words-fst").unwrap(); - let fst = fst::Set::new(fst)?; - index.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", &fst.as_fst().as_bytes())?; - - // Write and merge the headers - let headers = mtbl_store.get(b"\0headers").unwrap(); - index.main.put::<_, Str, ByteSlice>(wtxn, "headers", headers.as_ref())?; - - // Write and merge the postings lists - let mut iter = mtbl_store.iter_prefix(&[1]).unwrap(); - while let Some((word, postings)) = iter.next() { - let word = std::str::from_utf8(&word[1..]).unwrap(); - index.postings_ids.put(wtxn, &word, &postings)?; +fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyhow::Result<()> { + if key == b"\0words-fst" { + // Write the words fst + index.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", val)?; + } + else if key == b"\0headers" { + // Write the headers + index.main.put::<_, Str, ByteSlice>(wtxn, "headers", val)?; + } + else if key.starts_with(&[1]) { + // Write the postings lists + index.postings_ids.as_polymorph() + .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; + } + else if key.starts_with(&[2]) { + // Write the prefix postings lists + index.prefix_postings_ids.as_polymorph() + .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; + } + else if key.starts_with(&[3]) { + // Write the documents + index.documents.as_polymorph() + .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; } - // Write and merge the prefix postings lists - let mut iter = mtbl_store.iter_prefix(&[2]).unwrap(); - while let Some((word, postings)) = iter.next() { - let word = std::str::from_utf8(&word[1..]).unwrap(); - index.prefix_postings_ids.put(wtxn, &word, &postings)?; - } - - // Write the documents - let mut count = 0; - let mut iter = mtbl_store.iter_prefix(&[3]).unwrap(); - while let Some((id_bytes, content)) = iter.next() { - let id = id_bytes[1..].try_into().map(u32::from_be_bytes).unwrap(); - index.documents.put(wtxn, &BEU32::new(id), &content)?; - count += 1; - } - - Ok(count) + Ok(()) } fn main() -> anyhow::Result<()> { @@ -291,18 +274,24 @@ fn main() -> anyhow::Result<()> { }) .collect::>()?; - while stores.len() > 1 { + while stores.len() > 3 { + let chunk_size = (stores.len() / rayon::current_num_threads()).max(2); let s = std::mem::take(&mut stores); - stores = s.into_par_iter().chunks(3) - .map(MtblKvStore::from_many) + stores = s.into_par_iter().chunks(chunk_size) + .map(|v| { + let outfile = tempfile::tempfile()?; + let mut out = Writer::new(outfile, None)?; + MtblKvStore::from_many(v, |k, v| Ok(out.add(k, v).unwrap()))?; + let out = out.into_inner()?; + Ok(MtblKvStore(Some(out))) as anyhow::Result<_> + }) .collect::>()?; } - let mtbl_store = stores.pop().unwrap_or_default(); - eprintln!("We are writing into LMDB..."); let mut wtxn = env.write_txn()?; - let count = writer(&mut wtxn, index, mtbl_store)?; + MtblKvStore::from_many(stores, |k, v| writer(&mut wtxn, &index, k, v))?; + let count = index.documents.len(&wtxn)?; wtxn.commit()?; eprintln!("Wrote {} documents into LMDB", count); From 3a23dc242e4bd1fbcb210c0f8a079c0e3c30a2e9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 4 Jun 2020 16:17:24 +0200 Subject: [PATCH 0020/1889] More efficiently merge MTBLs, more than two at a time --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/bin/indexer.rs | 32 +++++++++++++++++--------------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git 
a/Cargo.lock b/Cargo.lock index be5640e22..69c627947 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -996,7 +996,7 @@ checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" [[package]] name = "oxidized-mtbl" version = "0.1.0" -source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=8918476#8918476f61f4430890d067db7b4a6cfb2d549c43" +source = "git+https://github.com/Kerollmops/oxidized-mtbl.git?rev=6acef3d#6acef3d0fc7fec6a3701038860e51f8bbcee1ee6" dependencies = [ "byteorder 1.3.4", "crc32c", diff --git a/Cargo.toml b/Cargo.toml index f3b09dc56..ce09f5e06 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,7 @@ jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } memmap = "0.7.0" once_cell = "1.4.0" -oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "8918476" } +oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "6acef3d" } rayon = "1.3.0" roaring = "0.5.2" slice-group-by = "0.2.6" diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index bed058dfc..cd3ae66b1 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -100,36 +100,38 @@ impl MtblKvStore { Ok(MtblKvStore(Some(out))) } - fn merge(key: &[u8], left: &[u8], right: &[u8]) -> Option> { + fn merge(key: &[u8], values: &[Vec]) -> Option> { if key == b"\0words-fst" { - let left_fst = fst::Set::new(left).unwrap(); - let right_fst = fst::Set::new(right).unwrap(); + let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect(); // Union of the two FSTs - let op = fst::set::OpBuilder::new() - .add(left_fst.into_stream()) - .add(right_fst.into_stream()) - .r#union(); + let mut op = fst::set::OpBuilder::new(); + fsts.iter().for_each(|fst| op.push(fst.into_stream())); + let op = op.r#union(); let mut build = fst::SetBuilder::memory(); build.extend_stream(op.into_stream()).unwrap(); Some(build.into_inner().unwrap()) } else if key == b"\0headers" { - assert_eq!(left, right); - Some(left.to_vec()) + assert!(values.windows(2).all(|vs| vs[0] == vs[1])); + Some(values[0].to_vec()) } else if key.starts_with(&[1]) || key.starts_with(&[2]) { - let mut left = RoaringBitmap::deserialize_from(left).unwrap(); - let right = RoaringBitmap::deserialize_from(right).unwrap(); - left.union_with(&right); + let mut first = RoaringBitmap::deserialize_from(values[0].as_slice()).unwrap(); + + for value in &values[1..] { + let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap(); + first.union_with(&bitmap); + } + let mut vec = Vec::new(); - left.serialize_into(&mut vec).unwrap(); + first.serialize_into(&mut vec).unwrap(); Some(vec) } else if key.starts_with(&[3]) { - assert_eq!(left, right); - Some(left.to_vec()) + assert!(values.windows(2).all(|vs| vs[0] == vs[1])); + Some(values[0].to_vec()) } else { panic!("wut? 
{:?}", key) From c42d3c19e2383219d452a837830d175eb29cd347 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 4 Jun 2020 17:38:43 +0200 Subject: [PATCH 0021/1889] Merge the whole list of generated MTBL in one go --- src/bin/indexer.rs | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index cd3ae66b1..c52f83f1d 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -265,7 +265,7 @@ fn main() -> anyhow::Result<()> { let index = Index::new(&env)?; - let mut stores: Vec<_> = opt.files_to_index + let stores: Vec<_> = opt.files_to_index .into_par_iter() .map(|path| { let rdr = csv::Reader::from_path(path)?; @@ -276,20 +276,6 @@ fn main() -> anyhow::Result<()> { }) .collect::>()?; - while stores.len() > 3 { - let chunk_size = (stores.len() / rayon::current_num_threads()).max(2); - let s = std::mem::take(&mut stores); - stores = s.into_par_iter().chunks(chunk_size) - .map(|v| { - let outfile = tempfile::tempfile()?; - let mut out = Writer::new(outfile, None)?; - MtblKvStore::from_many(v, |k, v| Ok(out.add(k, v).unwrap()))?; - let out = out.into_inner()?; - Ok(MtblKvStore(Some(out))) as anyhow::Result<_> - }) - .collect::>()?; - } - eprintln!("We are writing into LMDB..."); let mut wtxn = env.write_txn()?; MtblKvStore::from_many(stores, |k, v| writer(&mut wtxn, &index, k, v))?; From 5d1c625b74f4d8934a16935fc62cdb61f621dcc2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 4 Jun 2020 18:19:52 +0200 Subject: [PATCH 0022/1889] Change the page index texts --- public/index.html | 4 ++-- src/bin/indexer.rs | 2 +- src/bin/search.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/public/index.html b/public/index.html index 062495ac9..50669c0f4 100644 --- a/public/index.html +++ b/public/index.html @@ -6,7 +6,7 @@ - The daugt + The mega-mini-indexer - - - -
[public/index.html hunks: the HTML markup was lost in extraction; only the diff markers survive. The recoverable page text: a hero banner reading "Welcome to the mega-mini-indexer also known as the MMI", the subtitle "This dashboard will help you check the search results with ease.", and a note that quoted query strings are available and force the engine to search without typo tolerance or prefixes (e.g. big "black" boat). The search input and results-list markup follow; the inline rendering script removed below is moved into the new public/script.js, and the styles are extracted into public/style.css.]
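
// An aside: the HTTP contract that the extracted script below relies on,
// sketched as a self-contained warp server. The dashboard POSTs JSON
// (`{ "query": "..." }`) to `/query` and expects the matching documents back
// as CSV, with the search duration exposed in a `Time-Ms` response header.
// The warp and serde APIs mirror src/bin/serve.rs; the search itself is
// stubbed out here.
use std::time::Instant;
use serde::Deserialize;
use warp::{Filter, http::Response};

#[derive(Deserialize)]
struct QueryBody { query: String }

#[tokio::main]
async fn main() {
    let query_route = warp::filters::method::post()
        .and(warp::path!("query"))
        .and(warp::body::json())
        .map(|body: QueryBody| {
            let before_search = Instant::now();
            // A stub standing in for Index::search plus document retrieval.
            let csv = format!("id,title\n0,{}\n", body.query);
            Response::builder()
                .header("Content-Type", "text/csv")
                .header("Time-Ms", before_search.elapsed().as_millis().to_string())
                .body(csv)
        });
    warp::serve(query_route).run(([127, 0, 0, 1], 9700)).await;
}
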
- + + + - - for (element of httpResults.data) { - const elem = document.createElement('li'); - elem.classList.add("document"); - - const ol = document.createElement('ol'); - - for (const prop in element) { - const field = document.createElement('li'); - field.classList.add("field"); - - const attribute = document.createElement('div'); - attribute.classList.add("attribute"); - attribute.innerHTML = prop; - - const content = document.createElement('div'); - content.classList.add("content"); - content.innerHTML = element[prop]; - - field.appendChild(attribute); - field.appendChild(content); - - ol.appendChild(field); - } - - elem.appendChild(ol); - results.appendChild(elem) - } - - }, - beforeSend: function () { - if (request !== null) { - request.abort(); - } - }, - }); - }); - diff --git a/public/script.js b/public/script.js new file mode 100644 index 000000000..119d05634 --- /dev/null +++ b/public/script.js @@ -0,0 +1,55 @@ +var request = null; + +$('#search').on('input', function () { + var query = $(this).val(); + request = $.ajax({ + type: "POST", + url: "query", + contentType: 'application/json', + data: JSON.stringify({ 'query': query }), + contentType: 'application/json', + success: function (data, textStatus, request) { + let httpResults = Papa.parse(data, { header: true, skipEmptyLines: true }); + results.innerHTML = ''; + + let timeSpent = request.getResponseHeader('Time-Ms'); + let numberOfDocuments = httpResults.data.length; + count.innerHTML = `${numberOfDocuments}`; + time.innerHTML = `${timeSpent}ms`; + + for (element of httpResults.data) { + const elem = document.createElement('li'); + elem.classList.add("document"); + + const ol = document.createElement('ol'); + + for (const prop in element) { + const field = document.createElement('li'); + field.classList.add("field"); + + const attribute = document.createElement('div'); + attribute.classList.add("attribute"); + attribute.innerHTML = prop; + + const content = document.createElement('div'); + content.classList.add("content"); + content.innerHTML = element[prop]; + + field.appendChild(attribute); + field.appendChild(content); + + ol.appendChild(field); + } + + elem.appendChild(ol); + results.appendChild(elem) + } + + }, + beforeSend: function () { + if (request !== null) { + request.abort(); + } + }, + }); +}); diff --git a/public/style.css b/public/style.css new file mode 100644 index 000000000..310b8dac6 --- /dev/null +++ b/public/style.css @@ -0,0 +1,74 @@ +em { + color: hsl(204, 86%, 25%); + font-style: inherit; + background-color: hsl(204, 86%, 88%); +} + +#results { + max-width: 900px; + margin: 20px auto 0 auto; + padding: 0; +} + +.notification { + display: flex; + justify-content: center; +} + +.level-left { + margin-right: 50px; +} + +.document { + padding: 20px 20px; + background-color: #f5f5f5; + border-radius: 4px; + margin-bottom: 20px; + display: flex; +} + +.document ol { + flex: 0 0 75%; + max-width: 75%; + padding: 0; + margin: 0; +} + +.document .image { + max-width: 25%; + flex: 0 0 25%; + padding-left: 30px; + box-sizing: border-box; +} + +.document .image img { + width: 100%; +} + +.field { + list-style-type: none; + display: flex; + flex-wrap: wrap; +} + +.field:not(:last-child) { + margin-bottom: 7px; +} + +.attribute { + flex: 0 0 35%; + max-width: 35%; + text-align: right; + padding-right: 10px; + box-sizing: border-box; + text-transform: uppercase; + color: rgba(0,0,0,.7); +} + +.content { + max-width: 65%; + flex: 0 0 65%; + box-sizing: border-box; + padding-left: 10px; + color: rgba(0,0,0,.9); +} 
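
// An aside: the serve.rs routes added just after this embed the stylesheet
// and scripts at compile time with `include_str!`, so the binary stays a
// single self-contained file, at the cost of a rebuild whenever an asset
// changes. A minimal runnable sketch of one such route, mirroring the warp
// APIs used by src/bin/serve.rs (the relative asset path is an assumption
// about the source layout):
use warp::{Filter, http::Response};

#[tokio::main]
async fn main() {
    let dash_style_route = warp::filters::method::get()
        .and(warp::path!("style.css"))
        .map(|| Response::builder()
            .header("content-type", "text/css; charset=utf-8")
            .body(include_str!("../../public/style.css")));
    warp::serve(dash_style_route).run(([127, 0, 0, 1], 9700)).await;
}
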
diff --git a/src/bin/serve.rs b/src/bin/serve.rs index 28e5dff71..0a103a29e 100644 --- a/src/bin/serve.rs +++ b/src/bin/serve.rs @@ -58,6 +58,13 @@ async fn main() -> anyhow::Result<()> { .body(include_str!("../../public/bulma.min.css")) ); + let dash_style_route = warp::filters::method::get() + .and(warp::path!("style.css")) + .map(|| Response::builder() + .header("content-type", "text/css; charset=utf-8") + .body(include_str!("../../public/style.css")) + ); + let dash_jquery_route = warp::filters::method::get() .and(warp::path!("jquery-3.4.1.min.js")) .map(|| Response::builder() @@ -72,6 +79,13 @@ async fn main() -> anyhow::Result<()> { .body(include_str!("../../public/papaparse.min.js")) ); + let dash_script_route = warp::filters::method::get() + .and(warp::path!("script.js")) + .map(|| Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + .body(include_str!("../../public/script.js")) + ); + #[derive(Deserialize)] struct QueryBody { query: String, @@ -107,8 +121,10 @@ async fn main() -> anyhow::Result<()> { let routes = dash_html_route .or(dash_bulma_route) + .or(dash_style_route) .or(dash_jquery_route) .or(dash_papaparse_route) + .or(dash_script_route) .or(query_route); let addr = SocketAddr::from_str(&opt.http_listen_addr).unwrap(); From f6eae91c7d7253ccfd950403a8f3b78f84837375 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sat, 11 Jul 2020 14:17:37 +0200 Subject: [PATCH 0092/1889] Pretty print the new dashboard numbers --- Cargo.lock | 86 ++++++++++++++++++++++++++++++++ Cargo.toml | 2 + public/filesize.min.js | 6 +++ public/script.js | 10 ++++ src/bin/serve.rs | 37 +++++++++++++- {public => templates}/index.html | 11 ++-- 6 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 public/filesize.min.js rename {public => templates}/index.html (86%) diff --git a/Cargo.lock b/Cargo.lock index b6f0e8c27..02a09ca2d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -18,6 +18,65 @@ version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b585a98a234c46fc563103e9278c9391fde1f4e6850334da895d27edb9580f62" +[[package]] +name = "askama" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afb6c2f00d120a43d67345bbd3e9e21d1ee090c1fc7db7787c05bd969b83bccf" +dependencies = [ + "askama_derive", + "askama_escape", + "askama_shared", + "mime 0.3.16", + "mime_guess 2.0.3", +] + +[[package]] +name = "askama_derive" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d97b96db4caba8842ec48a537cab1e6d7e809d7a408c97ae99f129e1c982750" +dependencies = [ + "askama_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "askama_escape" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90c108c1a94380c89d2215d0ac54ce09796823cca0fd91b299cfff3b33e346fb" + +[[package]] +name = "askama_shared" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3342887c2f85a336bc8bc4fc79603434539cdae37565e20ccf54599dea8e42ee" +dependencies = [ + "askama_escape", + "humansize", + "nom", + "num-traits", + "percent-encoding", + "proc-macro2", + "quote", + "serde", + "syn", + "toml", +] + +[[package]] +name = "askama_warp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96f410ab17fa08f70b5fda07ce1112418642c914864961630808979343ea226" +dependencies = [ + "askama", + "warp", +] + [[package]] name = "atty" 
version = "0.2.14" @@ -654,6 +713,12 @@ version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cd179ae861f0c2e53da70d892f5f3029f9594be0c41dc5269cd371691b1dc2f9" +[[package]] +name = "humansize" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6cab2627acfc432780848602f3f558f7e9dd427352224b0d9324025796d2a5e" + [[package]] name = "hyper" version = "0.13.6" @@ -847,6 +912,8 @@ name = "mega-mini-indexer" version = "0.1.0" dependencies = [ "anyhow", + "askama", + "askama_warp", "bitpacking", "byteorder", "cow-utils", @@ -1038,6 +1105,16 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "nom" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" +dependencies = [ + "memchr", + "version_check 0.9.2", +] + [[package]] name = "num-traits" version = "0.2.12" @@ -1824,6 +1901,15 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffc92d160b1eef40665be3a05630d003936a3bc7da7421277846c2613e92c71a" +dependencies = [ + "serde", +] + [[package]] name = "tower-service" version = "0.3.0" diff --git a/Cargo.toml b/Cargo.toml index dea1d6086..41e1af998 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,8 @@ indexmap = "1.4.0" itertools = "0.9.0" # http server +askama = "0.10.1" +askama_warp = "0.10.0" serde = { version = "1.0", features = ["derive"] } tokio = { version = "0.2.15", features = ["full"] } warp = "0.2.2" diff --git a/public/filesize.min.js b/public/filesize.min.js new file mode 100644 index 000000000..0d4df433f --- /dev/null +++ b/public/filesize.min.js @@ -0,0 +1,6 @@ +/* + 2020 Jason Mulligan + @version 6.1.0 +*/ +"use strict";!function(e){var x=/^(b|B)$/,M={iec:{bits:["b","Kib","Mib","Gib","Tib","Pib","Eib","Zib","Yib"],bytes:["B","KiB","MiB","GiB","TiB","PiB","EiB","ZiB","YiB"]},jedec:{bits:["b","Kb","Mb","Gb","Tb","Pb","Eb","Zb","Yb"],bytes:["B","KB","MB","GB","TB","PB","EB","ZB","YB"]}},w={iec:["","kibi","mebi","gibi","tebi","pebi","exbi","zebi","yobi"],jedec:["","kilo","mega","giga","tera","peta","exa","zetta","yotta"]};function t(e){var i,t,o,n,b,r,a,l,s,d,u,c,f,p,B,y=1 anyhow::Result<()> { let opt = Opt::from_args(); @@ -44,12 +54,29 @@ async fn main() -> anyhow::Result<()> { let index = Index::new(&env)?; + // Retrieve the database the file stem (w/o the extension) + let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string(); + + // Retrieve the disk file size + let db_size = File::open(opt.database.join("data.mdb"))?.metadata()?.len() as usize; + + // Precompute the number of documents in the database. 
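
// An aside: the askama pattern this patch adopts for the dashboard page (the
// `IndexTemplate` just below), sketched standalone. askama 0.10 with
// askama_warp 0.10 is assumed; the derive re-exported by askama_warp also
// generates a warp::Reply implementation, which is what lets the route
// closure return the template value directly.
use askama_warp::Template;
use warp::Filter;

#[derive(Template)]
#[template(path = "index.html")] // expects {{ db_name }} & co. in templates/index.html
struct IndexTemplate {
    db_name: String,
    db_size: usize,
    docs_count: usize,
}

#[tokio::main]
async fn main() {
    let dash_html_route = warp::filters::method::get()
        .and(warp::filters::path::end())
        .map(|| IndexTemplate {
            // Placeholder values; serve.rs computes these from the LMDB env.
            db_name: "milli".into(),
            db_size: 0,
            docs_count: 0,
        });
    warp::serve(dash_html_route).run(([127, 0, 0, 1], 9700)).await;
}
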
+ let rtxn = env.read_txn().unwrap(); + let docs_count = index.documents.len(&rtxn)?; + drop(rtxn); + // We run and wait on the HTTP server // Expose an HTML page to debug the search in a browser let dash_html_route = warp::filters::method::get() .and(warp::filters::path::end()) - .map(|| warp::reply::html(include_str!("../../public/index.html"))); + .map(move || { + IndexTemplate { + db_name: db_name.clone(), + db_size, + docs_count, + } + }); let dash_bulma_route = warp::filters::method::get() .and(warp::path!("bulma.min.css")) @@ -79,6 +106,13 @@ async fn main() -> anyhow::Result<()> { .body(include_str!("../../public/papaparse.min.js")) ); + let dash_filesize_route = warp::filters::method::get() + .and(warp::path!("filesize.min.js")) + .map(|| Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + .body(include_str!("../../public/filesize.min.js")) + ); + let dash_script_route = warp::filters::method::get() .and(warp::path!("script.js")) .map(|| Response::builder() @@ -124,6 +158,7 @@ async fn main() -> anyhow::Result<()> { .or(dash_style_route) .or(dash_jquery_route) .or(dash_papaparse_route) + .or(dash_filesize_route) .or(dash_script_route) .or(query_route); diff --git a/public/index.html b/templates/index.html similarity index 86% rename from public/index.html rename to templates/index.html index 8f73eb4f7..e45569462 100644 --- a/public/index.html +++ b/templates/index.html @@ -7,7 +7,8 @@ - The milli engine + + {{ db_name }} | The milli engine @@ -20,20 +21,20 @@ From b8a1fc0126140939956d55e3a3b7a5428fe326da Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sat, 11 Jul 2020 14:51:59 +0200 Subject: [PATCH 0093/1889] Clean up the CSS style custom bulma rules --- public/style.css | 4 ---- src/bin/serve.rs | 11 +++-------- templates/index.html | 3 ++- 3 files changed, 5 insertions(+), 13 deletions(-) diff --git a/public/style.css b/public/style.css index 310b8dac6..d2bb0a4a7 100644 --- a/public/style.css +++ b/public/style.css @@ -15,10 +15,6 @@ em { justify-content: center; } -.level-left { - margin-right: 50px; -} - .document { padding: 20px 20px; background-color: #f5f5f5; diff --git a/src/bin/serve.rs b/src/bin/serve.rs index 0f557b850..29a7fb3a2 100644 --- a/src/bin/serve.rs +++ b/src/bin/serve.rs @@ -54,16 +54,11 @@ async fn main() -> anyhow::Result<()> { let index = Index::new(&env)?; - // Retrieve the database the file stem (w/o the extension) + // Retrieve the database the file stem (w/o the extension), + // the disk file size and the number of documents in the database. let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string(); - - // Retrieve the disk file size let db_size = File::open(opt.database.join("data.mdb"))?.metadata()?.len() as usize; - - // Precompute the number of documents in the database. - let rtxn = env.read_txn().unwrap(); - let docs_count = index.documents.len(&rtxn)?; - drop(rtxn); + let docs_count = env.read_txn().and_then(|r| index.documents.len(&r))?; // We run and wait on the HTTP server diff --git a/templates/index.html b/templates/index.html index e45569462..c9241a371 100644 --- a/templates/index.html +++ b/templates/index.html @@ -21,7 +21,7 @@
+
From d31da26a51daeea45ae5a8bad203bcb1d38330e9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sat, 11 Jul 2020 23:51:32 +0200 Subject: [PATCH 0094/1889] Avoid cloning RoraringBitmaps when unecessary --- src/lib.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 42f63de51..9d1240a1a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -228,15 +228,22 @@ impl Index { for positions in positions { let before = Instant::now(); + // Precompute the potentially missing unions + positions.iter().enumerate().for_each(|(word, pos)| { + union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos)); + }); + + // Retrieve the unions along with the popularity of it. let mut to_intersect: Vec<_> = positions.iter() .enumerate() .map(|(word, pos)| { - let docids = union_cache.entry((word, *pos)).or_insert_with(|| unions_word_pos(word, *pos)); - // FIXME don't clone here - (docids.len(), docids.clone()) + let docids = union_cache.get(&(word, *pos)).unwrap(); + (docids.len(), docids) }) .collect(); + // Sort the unions by popuarity to help reduce + // the number of documents as soon as possible. to_intersect.sort_unstable_by_key(|(l, _)| *l); let elapsed_retrieving = before.elapsed(); From 2c62eeea3c4a11e00d0b9d64f32fde77a4027010 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 12 Jul 2020 00:16:41 +0200 Subject: [PATCH 0095/1889] Rename the project milli --- Cargo.lock | 52 +++++++++++++++++++++++----------------------- Cargo.toml | 2 +- README.md | 2 +- benches/search.rs | 2 +- src/bin/indexer.rs | 2 +- src/bin/search.rs | 2 +- src/bin/serve.rs | 2 +- src/bin/stats.rs | 2 +- 8 files changed, 33 insertions(+), 33 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 02a09ca2d..94aea9e2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -908,7 +908,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" [[package]] -name = "mega-mini-indexer" +name = "memchr" +version = "2.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" + +[[package]] +name = "memmap" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc", + "winapi 0.3.8", +] + +[[package]] +name = "memoffset" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8" +dependencies = [ + "autocfg 1.0.0", +] + +[[package]] +name = "milli" version = "0.1.0" dependencies = [ "anyhow", @@ -941,31 +966,6 @@ dependencies = [ "warp", ] -[[package]] -name = "memchr" -version = "2.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" - -[[package]] -name = "memmap" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi 0.3.8", -] - -[[package]] -name = "memoffset" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8" -dependencies = [ - "autocfg 1.0.0", -] - [[package]] name = "mime" version = "0.2.6" diff --git 
a/Cargo.toml b/Cargo.toml index 41e1af998..5d62757e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "mega-mini-indexer" +name = "milli" version = "0.1.0" authors = ["Kerollmops "] edition = "2018" diff --git a/README.md b/README.md index a12f1bc7f..2f556b529 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# mega-mini-indexer +# milli A prototype of concurrent indexing, only contains postings ids ## Introduction diff --git a/benches/search.rs b/benches/search.rs index 3db9b3db1..922c9ec66 100644 --- a/benches/search.rs +++ b/benches/search.rs @@ -1,7 +1,7 @@ use std::time::Duration; use heed::EnvOpenOptions; -use mega_mini_indexer::Index; +use milli::Index; use criterion::{criterion_group, criterion_main, BenchmarkId}; fn bench_search(c: &mut criterion::Criterion) { diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index dffa75255..a9191b618 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -17,7 +17,7 @@ use roaring::RoaringBitmap; use slice_group_by::StrGroupBy; use structopt::StructOpt; -use mega_mini_indexer::{FastMap4, SmallVec32, Index, DocumentId, Position}; +use milli::{FastMap4, SmallVec32, Index, DocumentId, Position}; const LMDB_MAX_KEY_LENGTH: usize = 512; const ONE_MILLION: usize = 1_000_000; diff --git a/src/bin/search.rs b/src/bin/search.rs index 22015860a..5f5d458f4 100644 --- a/src/bin/search.rs +++ b/src/bin/search.rs @@ -5,7 +5,7 @@ use std::time::Instant; use heed::EnvOpenOptions; use structopt::StructOpt; -use mega_mini_indexer::{Index, BEU32}; +use milli::{Index, BEU32}; #[cfg(target_os = "linux")] #[global_allocator] diff --git a/src/bin/serve.rs b/src/bin/serve.rs index 29a7fb3a2..387c26d1b 100644 --- a/src/bin/serve.rs +++ b/src/bin/serve.rs @@ -10,7 +10,7 @@ use serde::Deserialize; use structopt::StructOpt; use warp::{Filter, http::Response}; -use mega_mini_indexer::{BEU32, Index}; +use milli::{BEU32, Index}; #[cfg(target_os = "linux")] #[global_allocator] diff --git a/src/bin/stats.rs b/src/bin/stats.rs index 91632b437..561afa65f 100644 --- a/src/bin/stats.rs +++ b/src/bin/stats.rs @@ -6,7 +6,7 @@ use cow_utils::CowUtils; use roaring::RoaringBitmap; use slice_group_by::StrGroupBy; -use mega_mini_indexer::{FastMap4, DocumentId, SmallString32}; +use milli::{FastMap4, DocumentId, SmallString32}; const MAX_POSITION: usize = 1000; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; From 12358476dad22909d30bc63919cc34a84eb567e7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 12 Jul 2020 10:55:09 +0200 Subject: [PATCH 0096/1889] Use the log crate instead of stderr --- Cargo.lock | 125 ++++++++++++++++++++++++++++++++++++++---- Cargo.toml | 4 ++ src/best_proximity.rs | 7 ++- src/bin/indexer.rs | 27 ++++----- src/bin/search.rs | 5 +- src/bin/serve.rs | 2 +- src/lib.rs | 19 ++++--- 7 files changed, 150 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94aea9e2a..f0ebc2ff6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,12 +79,12 @@ dependencies = [ [[package]] name = "atty" -version = "0.2.14" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +checksum = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" dependencies = [ - "hermit-abi", "libc", + "termion", "winapi 0.3.8", ] @@ -164,7 +164,7 @@ version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"31accafdb70df7871592c058eca3985b71104e15ac32f64706022c58867da931" dependencies = [ - "lazy_static", + "lazy_static 1.4.0", "memchr", "regex-automata", "serde", @@ -228,6 +228,17 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +[[package]] +name = "chrono" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c74d84029116787153e02106bf53e66828452a4b325cc8652b788b5967c0a0b6" +dependencies = [ + "num-integer", + "num-traits", + "time", +] + [[package]] name = "clap" version = "2.33.1" @@ -281,7 +292,7 @@ dependencies = [ "criterion-plot", "csv", "itertools", - "lazy_static", + "lazy_static 1.4.0", "num-traits", "oorandom", "plotters", @@ -324,7 +335,7 @@ dependencies = [ "autocfg 1.0.0", "cfg-if", "crossbeam-utils", - "lazy_static", + "lazy_static 1.4.0", "maybe-uninit", "memoffset", "scopeguard", @@ -348,7 +359,7 @@ checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8" dependencies = [ "autocfg 1.0.0", "cfg-if", - "lazy_static", + "lazy_static 1.4.0", ] [[package]] @@ -845,6 +856,12 @@ dependencies = [ "winapi-build", ] +[[package]] +name = "lazy_static" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76f033c7ad61445c5b347c7382dd1237847eb1bce590fe50365dcb33d546be73" + [[package]] name = "lazy_static" version = "1.4.0" @@ -951,6 +968,7 @@ dependencies = [ "itertools", "jemallocator", "levenshtein_automata", + "log 0.4.8", "memmap", "once_cell", "oxidized-mtbl", @@ -960,6 +978,7 @@ dependencies = [ "slice-group-by", "smallstr", "smallvec", + "stderrlog", "structopt", "tempfile", "tokio", @@ -1115,6 +1134,16 @@ dependencies = [ "version_check 0.9.2", ] +[[package]] +name = "num-integer" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d59457e662d541ba17869cf51cf177c0b5f0cbf476c66bdc90bf1edac4f875b" +dependencies = [ + "autocfg 1.0.0", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.12" @@ -1134,6 +1163,12 @@ dependencies = [ "libc", ] +[[package]] +name = "numtoa" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8f8bdf33df195859076e54ab11ee78a1b208382d3a26ec40d142ffc1ecc49ef" + [[package]] name = "once_cell" version = "1.4.0" @@ -1505,7 +1540,7 @@ dependencies = [ "crossbeam-deque", "crossbeam-queue", "crossbeam-utils", - "lazy_static", + "lazy_static 1.4.0", "num_cpus", ] @@ -1524,6 +1559,15 @@ version = "0.1.56" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" +[[package]] +name = "redox_termios" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" +dependencies = [ + "redox_syscall", +] + [[package]] name = "regex" version = "1.3.9" @@ -1738,6 +1782,19 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "stderrlog" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e5ee9b90a5452c570a0b0ac1c99ae9498db7e56e33d74366de7f2a7add7f25" +dependencies = [ + "atty", + "chrono", + "log 0.4.8", + "termcolor", + "thread_local", +] + [[package]] name = "structopt" version = "0.3.14" @@ -1745,7 +1802,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef" dependencies = [ "clap", - "lazy_static", + "lazy_static 1.4.0", "structopt-derive", ] @@ -1810,6 +1867,27 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "termcolor" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "termion" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c22cec9d8978d906be5ac94bceb5a010d885c626c4c8855721a4dbd20e3ac905" +dependencies = [ + "libc", + "numtoa", + "redox_syscall", + "redox_termios", +] + [[package]] name = "textwrap" version = "0.11.0" @@ -1819,6 +1897,16 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "thread_local" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1697c4b57aeeb7a536b647165a2825faddffb1d3bad386d507709bd51a90bb14" +dependencies = [ + "lazy_static 0.2.11", + "unreachable", +] + [[package]] name = "time" version = "0.1.43" @@ -1849,7 +1937,7 @@ dependencies = [ "fnv", "futures-core", "iovec", - "lazy_static", + "lazy_static 1.4.0", "libc", "memchr", "mio", @@ -2010,6 +2098,15 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" +[[package]] +name = "unreachable" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" +dependencies = [ + "void", +] + [[package]] name = "url" version = "2.1.1" @@ -2045,6 +2142,12 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +[[package]] +name = "void" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" + [[package]] name = "walkdir" version = "2.3.1" @@ -2115,7 +2218,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ded84f06e0ed21499f6184df0e0cb3494727b0c5da89534e0fcc55c51d812101" dependencies = [ "bumpalo", - "lazy_static", + "lazy_static 1.4.0", "log 0.4.8", "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 5d62757e4..4a991436f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,10 @@ smallvec = "1.4.0" structopt = { version = "0.3.14", default-features = false } tempfile = "3.1.0" +# logging +log = "0.4.8" +stderrlog = "0.4.3" + # best proximity indexmap = "1.4.0" diff --git a/src/best_proximity.rs b/src/best_proximity.rs index cb6302a16..6dda4d1fb 100644 --- a/src/best_proximity.rs +++ b/src/best_proximity.rs @@ -1,6 +1,7 @@ use std::cmp; use std::time::Instant; +use log::debug; use crate::iter_shortest_paths::astar_bag; const ONE_ATTRIBUTE: u32 = 1000; @@ -153,18 +154,18 @@ impl BestProximity { }, ); - eprintln!("BestProximity::next() took {:.02?}", before.elapsed()); + debug!("BestProximity::next() took {:.02?}", before.elapsed()); match result { Some((paths, proximity)) => { self.best_proximity = proximity + 1; // We retrieve the last path that we convert into a Vec let paths: Vec<_> = paths.map(|p| p.iter().filter_map(Node::position).collect()).collect(); - eprintln!("result: {} {:?}", proximity, paths); + debug!("result: {} {:?}", proximity, paths); 
Some((proximity, paths)) }, None => { - eprintln!("result: {:?}", None as Option<()>); + debug!("result: {:?}", None as Option<()>); self.best_proximity += 1; None }, diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index a9191b618..6f490fae7 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -11,6 +11,7 @@ use cow_utils::CowUtils; use fst::{Streamer, IntoStreamer}; use heed::EnvOpenOptions; use heed::types::*; +use log::debug; use oxidized_mtbl::{Reader, ReaderOptions, Writer, Merger, MergerOptions}; use rayon::prelude::*; use roaring::RoaringBitmap; @@ -86,7 +87,7 @@ struct MtblKvStore(Option); impl MtblKvStore { fn from_indexed(mut indexed: Indexed) -> anyhow::Result { - eprintln!("Creating an MTBL store from an Indexed..."); + debug!("Creating an MTBL store from an Indexed..."); let outfile = tempfile::tempfile()?; let mut out = Writer::new(outfile, None)?; @@ -152,7 +153,7 @@ impl MtblKvStore { let out = out.into_inner()?; - eprintln!("MTBL store created!"); + debug!("MTBL store created!"); Ok(MtblKvStore(Some(out))) } @@ -198,7 +199,7 @@ impl MtblKvStore { fn from_many(stores: Vec, mut f: F) -> anyhow::Result<()> where F: FnMut(&[u8], &[u8]) -> anyhow::Result<()> { - eprintln!("Merging {} MTBL stores...", stores.len()); + debug!("Merging {} MTBL stores...", stores.len()); let before = Instant::now(); let mmaps: Vec<_> = stores.iter().flat_map(|m| { @@ -217,7 +218,7 @@ impl MtblKvStore { (f)(k, v)?; } - eprintln!("MTBL stores merged in {:.02?}!", before.elapsed()); + debug!("MTBL stores merged in {:.02?}!", before.elapsed()); Ok(()) } } @@ -256,7 +257,7 @@ fn index_csv( max_mem_usage: usize, ) -> anyhow::Result> { - eprintln!("{:?}: Indexing into an Indexed...", thread_index); + debug!("{:?}: Indexing into an Indexed...", thread_index); let mut stores = Vec::new(); @@ -281,7 +282,7 @@ fn index_csv( let document_id = DocumentId::try_from(document_id).context("generated id is too big")?; if document_id % (ONE_MILLION as u32) == 0 { - eprintln!("We have seen {}m documents so far.", document_id / ONE_MILLION as u32); + debug!("We have seen {}m documents so far.", document_id / ONE_MILLION as u32); } for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { @@ -310,21 +311,21 @@ fn index_csv( if documents.len() % 100_000 == 0 { let usage = mem_usage(&word_positions, &word_position_docids, &documents); if usage > max_mem_usage { - eprintln!("Whoops too much memory used ({}B).", usage); + debug!("Whoops too much memory used ({}B).", usage); let word_positions = mem::take(&mut word_positions); let word_position_docids = mem::take(&mut word_position_docids); let documents = mem::take(&mut documents); let indexed = Indexed::new(word_positions, word_position_docids, headers.clone(), documents)?; - eprintln!("{:?}: Indexed created!", thread_index); + debug!("{:?}: Indexed created!", thread_index); stores.push(MtblKvStore::from_indexed(indexed)?); } } } let indexed = Indexed::new(word_positions, word_position_docids, headers, documents)?; - eprintln!("{:?}: Indexed created!", thread_index); + debug!("{:?}: Indexed created!", thread_index); stores.push(MtblKvStore::from_indexed(indexed)?); Ok(stores) @@ -372,7 +373,7 @@ fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyh fn compute_words_attributes_docids(wtxn: &mut heed::RwTxn, index: &Index) -> anyhow::Result<()> { let before = Instant::now(); - eprintln!("Computing the attributes documents ids..."); + debug!("Computing the attributes documents ids..."); let fst = match index.fst(&wtxn)? 
{ Some(fst) => fst.map_data(|s| s.to_vec())?, @@ -408,7 +409,7 @@ fn compute_words_attributes_docids(wtxn: &mut heed::RwTxn, index: &Index) -> any } } - eprintln!("Computing the attributes documents ids took {:.02?}.", before.elapsed()); + debug!("Computing the attributes documents ids took {:.02?}.", before.elapsed()); Ok(()) } @@ -444,7 +445,7 @@ fn main() -> anyhow::Result<()> { let stores: Vec<_> = stores.into_iter().flatten().collect(); - eprintln!("We are writing into LMDB..."); + debug!("We are writing into LMDB..."); let mut wtxn = env.write_txn()?; MtblKvStore::from_many(stores, |k, v| writer(&mut wtxn, &index, k, v))?; @@ -452,7 +453,7 @@ fn main() -> anyhow::Result<()> { let count = index.documents.len(&wtxn)?; wtxn.commit()?; - eprintln!("Wrote {} documents into LMDB", count); + debug!("Wrote {} documents into LMDB", count); Ok(()) } diff --git a/src/bin/search.rs b/src/bin/search.rs index 5f5d458f4..92112826a 100644 --- a/src/bin/search.rs +++ b/src/bin/search.rs @@ -4,8 +4,9 @@ use std::path::PathBuf; use std::time::Instant; use heed::EnvOpenOptions; -use structopt::StructOpt; +use log::debug; use milli::{Index, BEU32}; +use structopt::StructOpt; #[cfg(target_os = "linux")] #[global_allocator] @@ -62,7 +63,7 @@ fn main() -> anyhow::Result<()> { } } - eprintln!("Took {:.02?} to find {} documents", before.elapsed(), documents_ids.len()); + debug!("Took {:.02?} to find {} documents", before.elapsed(), documents_ids.len()); } Ok(()) diff --git a/src/bin/serve.rs b/src/bin/serve.rs index 387c26d1b..9d4ae77d1 100644 --- a/src/bin/serve.rs +++ b/src/bin/serve.rs @@ -158,7 +158,7 @@ async fn main() -> anyhow::Result<()> { .or(query_route); let addr = SocketAddr::from_str(&opt.http_listen_addr).unwrap(); - eprintln!("listening on http://{}", addr); + println!("listening on http://{}", addr); warp::serve(routes).run(addr).await; Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 9d1240a1a..fad74976f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,7 @@ use fxhash::{FxHasher32, FxHasher64}; use heed::types::*; use heed::{PolyDatabase, Database}; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; +use log::debug; use once_cell::sync::Lazy; use roaring::RoaringBitmap; @@ -138,7 +139,7 @@ impl Index { } } - eprintln!("{} words for {:?} we have found positions {:?} in {:.02?}", + debug!("{} words for {:?} we have found positions {:?} in {:.02?}", count, word, union_positions, before.elapsed()); words.push(derived_words); positions.push(union_positions.iter().collect()); @@ -168,9 +169,9 @@ impl Index { words_attributes_docids.push(intersect_docids); } - eprintln!("The documents you must find for each attribute: {:?}", words_attributes_docids); + debug!("The documents you must find for each attribute: {:?}", words_attributes_docids); - eprintln!("Retrieving words positions took {:.02?}", before.elapsed()); + debug!("Retrieving words positions took {:.02?}", before.elapsed()); // Returns the union of the same position for all the derived words. 
let unions_word_pos = |word: usize, pos: u32| { @@ -259,10 +260,10 @@ impl Index { } }); - eprintln!("retrieving words took {:.02?} and took {:.02?} to intersect", + debug!("retrieving words took {:.02?} and took {:.02?} to intersect", elapsed_retrieving, before_intersect.elapsed()); - eprintln!("for proximity {:?} {:?} we took {:.02?} to find {} documents", + debug!("for proximity {:?} {:?} we took {:.02?} to find {} documents", proximity, positions, before.elapsed(), intersect_docids.as_ref().map_or(0, |rb| rb.len())); @@ -272,7 +273,7 @@ impl Index { // We found enough documents we can stop here if documents.iter().map(RoaringBitmap::len).sum::() + same_proximity_union.len() >= 20 { - eprintln!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); + debug!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); break; } } @@ -294,8 +295,8 @@ impl Index { } documents.retain(|rb| !rb.is_empty()); - eprintln!("documents: {:?}", documents); - eprintln!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); + debug!("documents: {:?}", documents); + debug!("proximity {} took a total of {:.02?}", proximity, same_prox_before.elapsed()); // We found enough documents we can stop here. if documents.iter().map(RoaringBitmap::len).sum::() >= 20 { @@ -303,7 +304,7 @@ impl Index { } } - eprintln!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::()); + debug!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::()); Ok(documents.iter().flatten().take(20).collect()) } } From f757df5dfd049c6e1b476de53907fb53234ee600 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 12 Jul 2020 11:04:35 +0200 Subject: [PATCH 0097/1889] Introduce the stderr logger to the project --- src/bin/indexer.rs | 10 ++++++++++ src/bin/search.rs | 10 ++++++++++ src/bin/serve.rs | 11 ++++++++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 6f490fae7..e7e3e0fdc 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -55,6 +55,10 @@ struct Opt { #[structopt(long, default_value = "4294967296")] max_memory_usage: usize, + /// Verbose mode (-v, -vv, -vvv, etc.) + #[structopt(short, long, parse(from_occurrences))] + verbose: usize, + /// CSV file to index, if unspecified the CSV is read from standard input. csv_file: Option, } @@ -417,6 +421,12 @@ fn compute_words_attributes_docids(wtxn: &mut heed::RwTxn, index: &Index) -> any fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); + stderrlog::new() + .verbosity(opt.verbose) + .show_level(false) + .timestamp(stderrlog::Timestamp::Off) + .init()?; + if let Some(jobs) = opt.jobs { rayon::ThreadPoolBuilder::new().num_threads(jobs).build_global()?; } diff --git a/src/bin/search.rs b/src/bin/search.rs index 92112826a..f7a939637 100644 --- a/src/bin/search.rs +++ b/src/bin/search.rs @@ -20,6 +20,10 @@ struct Opt { #[structopt(long = "db", parse(from_os_str))] database: PathBuf, + /// Verbose mode (-v, -vv, -vvv, etc.) + #[structopt(short, long, parse(from_occurrences))] + verbose: usize, + /// The query string to search for (doesn't support prefix search yet). 
query: Option, } @@ -27,6 +31,12 @@ struct Opt { fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); + stderrlog::new() + .verbosity(opt.verbose) + .show_level(false) + .timestamp(stderrlog::Timestamp::Off) + .init()?; + std::fs::create_dir_all(&opt.database)?; let env = EnvOpenOptions::new() .map_size(100 * 1024 * 1024 * 1024) // 100 GB diff --git a/src/bin/serve.rs b/src/bin/serve.rs index 9d4ae77d1..35a19e24a 100644 --- a/src/bin/serve.rs +++ b/src/bin/serve.rs @@ -29,6 +29,10 @@ struct Opt { #[structopt(long = "db-size", default_value = "107374182400")] // 100 GB database_size: usize, + /// Verbose mode (-v, -vv, -vvv, etc.) + #[structopt(short, long, parse(from_occurrences))] + verbose: usize, + /// The ip and port on which the database will listen for HTTP requests. #[structopt(short = "l", long, default_value = "127.0.0.1:9700")] http_listen_addr: String, @@ -46,6 +50,12 @@ struct IndexTemplate { async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); + stderrlog::new() + .verbosity(opt.verbose) + .show_level(false) + .timestamp(stderrlog::Timestamp::Off) + .init()?; + std::fs::create_dir_all(&opt.database)?; let env = EnvOpenOptions::new() .map_size(opt.database_size) @@ -158,7 +168,6 @@ async fn main() -> anyhow::Result<()> { .or(query_route); let addr = SocketAddr::from_str(&opt.http_listen_addr).unwrap(); - println!("listening on http://{}", addr); warp::serve(routes).run(addr).await; Ok(()) From 92c2b1dd2dddf412b62f722a572b8c633539d18d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 12 Jul 2020 11:06:45 +0200 Subject: [PATCH 0098/1889] Refine the help message of the binaries --- src/bin/indexer.rs | 2 +- src/bin/search.rs | 2 +- src/bin/serve.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index e7e3e0fdc..c89e6e7ea 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -36,7 +36,7 @@ pub fn simple_alphanumeric_tokens(string: &str) -> impl Iterator { } #[derive(Debug, StructOpt)] -#[structopt(name = "mm-indexer", about = "The indexer side of the MMI project.")] +#[structopt(name = "milli-indexer", about = "The indexer binary of the milli project.")] struct Opt { /// The database path where the database is located. /// It is created if it doesn't already exist. diff --git a/src/bin/search.rs b/src/bin/search.rs index f7a939637..86abe752c 100644 --- a/src/bin/search.rs +++ b/src/bin/search.rs @@ -13,7 +13,7 @@ use structopt::StructOpt; static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; #[derive(Debug, StructOpt)] -#[structopt(name = "mm-search", about = "The server side of the MMI project.")] +#[structopt(name = "milli-search", about = "A simple search binary for milli project.")] struct Opt { /// The database path where the database is located. /// It is created if it doesn't already exist. diff --git a/src/bin/serve.rs b/src/bin/serve.rs index 35a19e24a..2667ce668 100644 --- a/src/bin/serve.rs +++ b/src/bin/serve.rs @@ -17,7 +17,7 @@ use milli::{BEU32, Index}; static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; #[derive(Debug, StructOpt)] -#[structopt(name = "mmi", about = "The server side of the mmi project.")] +#[structopt(name = "milli", about = "The server binary of the milli project.")] struct Opt { /// The database path where the LMDB database is located. /// It is created if it doesn't already exist. 
From 54afec58a31ff563d4ba1a96538d81b8ef9407ec Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 12 Jul 2020 11:34:48 +0200 Subject: [PATCH 0099/1889] Add a fade in out animation when the server process --- public/script.js | 2 ++ public/style.css | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/public/script.js b/public/script.js index acb8cbf70..3e4d4f543 100644 --- a/public/script.js +++ b/public/script.js @@ -16,6 +16,7 @@ $('#search').on('input', function () { let numberOfDocuments = httpResults.data.length; count.innerHTML = `${numberOfDocuments}`; time.innerHTML = `${timeSpent}ms`; + time.classList.remove('fade-in-out'); for (element of httpResults.data) { const elem = document.createElement('li'); @@ -49,6 +50,7 @@ $('#search').on('input', function () { beforeSend: function () { if (request !== null) { request.abort(); + time.classList.add('fade-in-out'); } }, }); diff --git a/public/style.css b/public/style.css index d2bb0a4a7..b490b88ea 100644 --- a/public/style.css +++ b/public/style.css @@ -68,3 +68,13 @@ em { padding-left: 10px; color: rgba(0,0,0,.9); } + +@keyframes fadeInOut { + 0% { opacity: 1; } + 30% { opacity: 0.3; } + 100% { opacity: 1; } +} + +.fade-in-out { + animation: fadeInOut ease 1s infinite; +} From 6b14b20369fad16c220d3686f9686a888f965463 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 13 Jul 2020 17:50:16 +0200 Subject: [PATCH 0100/1889] Introduce a method to retrieve the number of attributes of the documents --- src/lib.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index fad74976f..61956891a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,6 +74,17 @@ impl Index { self.main.get::<_, Str, ByteSlice>(rtxn, "headers") } + pub fn number_of_attributes<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result> { + match self.headers(rtxn)? { + Some(headers) => { + let mut rdr = csv::Reader::from_reader(headers); + let headers = rdr.headers()?; + Ok(Some(headers.len())) + } + None => Ok(None), + } + } + pub fn put_fst>(&self, wtxn: &mut heed::RwTxn, fst: &fst::Set) -> anyhow::Result<()> { Ok(self.main.put::<_, Str, ByteSlice>(wtxn, "words-fst", fst.as_fst().as_bytes())?) } @@ -146,9 +157,9 @@ impl Index { } let mut words_attributes_docids = Vec::new(); - let number_attributes: u32 = 6; + let number_of_attributes = self.number_of_attributes(rtxn)?.map_or(0, |n| n as u32); - for i in 0..number_attributes { + for i in 0..number_of_attributes { let mut intersect_docids: Option = None; for derived_words in &words { let mut union_docids = RoaringBitmap::new(); From 576dd011a15c5c75a5b2237677fc769fd588662f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 13 Jul 2020 18:16:05 +0200 Subject: [PATCH 0101/1889] Compute the candidates but not by attribute --- src/lib.rs | 55 +++++++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 61956891a..c9ac19de1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -156,32 +156,31 @@ impl Index { positions.push(union_positions.iter().collect()); } - let mut words_attributes_docids = Vec::new(); + // We compute the docids candiate for these words (and derived words). + // We do a union between all the docids of each of the words and derived words, + // we got N unions (where N is the number of query words), we then intersect them. + // TODO we must store the words documents ids to avoid these unions. 
+ let mut candidates = RoaringBitmap::new(); let number_of_attributes = self.number_of_attributes(rtxn)?.map_or(0, |n| n as u32); - - for i in 0..number_of_attributes { - let mut intersect_docids: Option = None; - for derived_words in &words { - let mut union_docids = RoaringBitmap::new(); - for (word, _) in derived_words { - // generate the key with the attribute number. + for (i, derived_words) in words.iter().enumerate() { + let mut union_docids = RoaringBitmap::new(); + for (word, _) in derived_words { + for attr in 0..number_of_attributes { let mut key = word.to_vec(); - key.extend_from_slice(&i.to_be_bytes()); - + key.extend_from_slice(&attr.to_be_bytes()); if let Some(right) = self.word_attribute_docids.get(rtxn, &key)? { union_docids.union_with(&right); } } - match &mut intersect_docids { - Some(left) => left.intersect_with(&union_docids), - None => intersect_docids = Some(union_docids), - } } - words_attributes_docids.push(intersect_docids); + if i == 0 { + candidates = union_docids; + } else { + candidates.intersect_with(&union_docids); + } } - debug!("The documents you must find for each attribute: {:?}", words_attributes_docids); - + debug!("The candidates are {:?}", candidates); debug!("Retrieving words positions took {:.02?}", before.elapsed()); // Returns the union of the same position for all the derived words. @@ -202,7 +201,7 @@ impl Index { let mut union_cache = HashMap::new(); let mut intersect_cache = HashMap::new(); // Returns `true` if there is documents in common between the two words and positions given. - let mut contains_documents = |(lword, lpos), (rword, rpos), union_cache: &mut HashMap<_, _>, words_attributes_docids: &[_]| { + let mut contains_documents = |(lword, lpos), (rword, rpos), union_cache: &mut HashMap<_, _>, candidates: &RoaringBitmap| { let proximity = best_proximity::positions_proximity(lpos, rpos); if proximity == 0 { return false } @@ -217,13 +216,9 @@ impl Index { let lunion_docids = union_cache.get(&(lword, lpos)).unwrap(); let runion_docids = union_cache.get(&(rword, rpos)).unwrap(); - if proximity <= 7 { - let lattr = lpos / 1000; - if let Some(docids) = &words_attributes_docids[lattr as usize] { - if lunion_docids.is_disjoint(&docids) { return false } - if runion_docids.is_disjoint(&docids) { return false } - } - } + // We first check that the docids of these unions are part of the candidates. + if lunion_docids.is_disjoint(candidates) { return false } + if runion_docids.is_disjoint(candidates) { return false } !lunion_docids.is_disjoint(&runion_docids) }) @@ -231,7 +226,7 @@ impl Index { let mut documents = Vec::new(); let mut iter = BestProximity::new(positions); - while let Some((proximity, mut positions)) = iter.next(|l, r| contains_documents(l, r, &mut union_cache, &words_attributes_docids)) { + while let Some((proximity, mut positions)) = iter.next(|l, r| contains_documents(l, r, &mut union_cache, &candidates)) { positions.sort_unstable(); let same_prox_before = Instant::now(); @@ -289,12 +284,8 @@ impl Index { } } - // We achieve to find valid documents ids so we remove them from the candidate list. - for docids in &mut words_attributes_docids { - if let Some(docids) = docids { - docids.difference_with(&same_proximity_union); - } - } + // We achieve to find valid documents ids so we remove them from the candidates list. 
+ candidates.difference_with(&same_proximity_union); documents.push(same_proximity_union); From 3d144e62c45f46bfc977ba6d7ad983401e0ff36c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 13 Jul 2020 18:47:43 +0200 Subject: [PATCH 0102/1889] Search for best proximities in multiple attributes --- src/best_proximity.rs | 13 ++++++--- src/lib.rs | 66 +++++++++++++++++++++++++++++++++---------- 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/src/best_proximity.rs b/src/best_proximity.rs index 6dda4d1fb..6f822ee6d 100644 --- a/src/best_proximity.rs +++ b/src/best_proximity.rs @@ -23,7 +23,7 @@ pub fn positions_proximity(lhs: u32, rhs: u32) -> u32 { } // Returns the attribute and index parts. -fn extract_position(position: u32) -> (u32, u32) { +pub fn extract_position(position: u32) -> (u32, u32) { (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE) } @@ -66,7 +66,7 @@ impl Node { parent_position: *position, }; // We do not produce the nodes we have already seen in previous iterations loops. - if proximity > 7 || (node.is_complete(positions) && acc_proximity + proximity < best_proximity) { + if node.is_complete(positions) && acc_proximity + proximity < best_proximity { None } else { Some((node, proximity)) @@ -138,7 +138,7 @@ impl BestProximity { { let before = Instant::now(); - if self.best_proximity == self.positions.len() as u32 * (MAX_DISTANCE - 1) { + if self.best_proximity == self.positions.len() as u32 * MAX_DISTANCE { return None; } @@ -177,6 +177,11 @@ impl BestProximity { mod tests { use super::*; + fn sort(mut val: (u32, Vec)) -> (u32, Vec) { + val.1.sort_unstable(); + val + } + #[test] fn same_attribute() { let positions = vec![ @@ -190,7 +195,7 @@ mod tests { assert_eq!(iter.next(f), Some((1+2, vec![vec![0, 1, 3]]))); // 3 assert_eq!(iter.next(f), Some((2+2, vec![vec![2, 1, 3]]))); // 4 assert_eq!(iter.next(f), Some((3+2, vec![vec![3, 1, 3]]))); // 5 - assert_eq!(iter.next(f), Some((1+5, vec![vec![0, 1, 6], vec![4, 1, 3]]))); // 6 + assert_eq!(iter.next(f).map(sort), Some((1+5, vec![vec![0, 1, 6], vec![4, 1, 3]]))); // 6 assert_eq!(iter.next(f), Some((2+5, vec![vec![2, 1, 6]]))); // 7 assert_eq!(iter.next(f), Some((3+5, vec![vec![3, 1, 6]]))); // 8 assert_eq!(iter.next(f), Some((4+5, vec![vec![4, 1, 6]]))); // 9 diff --git a/src/lib.rs b/src/lib.rs index c9ac19de1..6af75e875 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -198,30 +198,66 @@ impl Index { union_docids }; + // Returns the union of the same attribute for all the derived words. + let unions_word_attr = |word: usize, attr: u32| { + let mut union_docids = RoaringBitmap::new(); + for (word, _) in &words[word] { + let mut key = word.clone(); + key.extend_from_slice(&attr.to_be_bytes()); + if let Some(right) = self.word_attribute_docids.get(rtxn, &key).unwrap() { + union_docids.union_with(&right); + } + } + union_docids + }; + let mut union_cache = HashMap::new(); let mut intersect_cache = HashMap::new(); + + let mut attribute_union_cache = HashMap::new(); + let mut attribute_intersect_cache = HashMap::new(); + // Returns `true` if there is documents in common between the two words and positions given. 
let mut contains_documents = |(lword, lpos), (rword, rpos), union_cache: &mut HashMap<_, _>, candidates: &RoaringBitmap| { - let proximity = best_proximity::positions_proximity(lpos, rpos); + if lpos == rpos { return false } - if proximity == 0 { return false } + let (lattr, _) = best_proximity::extract_position(lpos); + let (rattr, _) = best_proximity::extract_position(rpos); - // We retrieve or compute the intersection between the two given words and positions. - *intersect_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| { - // We retrieve or compute the unions for the two words and positions. - union_cache.entry((lword, lpos)).or_insert_with(|| unions_word_pos(lword, lpos)); - union_cache.entry((rword, rpos)).or_insert_with(|| unions_word_pos(rword, rpos)); + if lattr == rattr { + // We retrieve or compute the intersection between the two given words and positions. + *intersect_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| { + // We retrieve or compute the unions for the two words and positions. + union_cache.entry((lword, lpos)).or_insert_with(|| unions_word_pos(lword, lpos)); + union_cache.entry((rword, rpos)).or_insert_with(|| unions_word_pos(rword, rpos)); - // TODO is there a way to avoid this double gets? - let lunion_docids = union_cache.get(&(lword, lpos)).unwrap(); - let runion_docids = union_cache.get(&(rword, rpos)).unwrap(); + // TODO is there a way to avoid this double gets? + let lunion_docids = union_cache.get(&(lword, lpos)).unwrap(); + let runion_docids = union_cache.get(&(rword, rpos)).unwrap(); - // We first check that the docids of these unions are part of the candidates. - if lunion_docids.is_disjoint(candidates) { return false } - if runion_docids.is_disjoint(candidates) { return false } + // We first check that the docids of these unions are part of the candidates. + if lunion_docids.is_disjoint(candidates) { return false } + if runion_docids.is_disjoint(candidates) { return false } - !lunion_docids.is_disjoint(&runion_docids) - }) + !lunion_docids.is_disjoint(&runion_docids) + }) + } else { + *attribute_intersect_cache.entry(((lword, lattr), (rword, rattr))).or_insert_with(|| { + // We retrieve or compute the unions for the two words and positions. + attribute_union_cache.entry((lword, lattr)).or_insert_with(|| unions_word_attr(lword, lattr)); + attribute_union_cache.entry((rword, rattr)).or_insert_with(|| unions_word_attr(rword, rattr)); + + // TODO is there a way to avoid this double gets? + let lunion_docids = attribute_union_cache.get(&(lword, lattr)).unwrap(); + let runion_docids = attribute_union_cache.get(&(rword, rattr)).unwrap(); + + // We first check that the docids of these unions are part of the candidates. 
+ if lunion_docids.is_disjoint(candidates) { return false } + if runion_docids.is_disjoint(candidates) { return false } + + !lunion_docids.is_disjoint(&runion_docids) + }) + } }; let mut documents = Vec::new(); From aa92311d4eafee6720d9c416ccd397f04858007e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 13 Jul 2020 23:51:41 +0200 Subject: [PATCH 0103/1889] Add a dark theme to the dashboard --- public/bulma-prefers-dark.min.css | 1 + public/style.css | 21 ++++++++++++++++++++- src/bin/serve.rs | 8 ++++++++ templates/index.html | 1 + 4 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 public/bulma-prefers-dark.min.css diff --git a/public/bulma-prefers-dark.min.css b/public/bulma-prefers-dark.min.css new file mode 100644 index 000000000..7ebab0105 --- /dev/null +++ b/public/bulma-prefers-dark.min.css @@ -0,0 +1 @@ +@media (prefers-color-scheme:dark){html{background-color:#17181c}body{color:#b5b5b5}a{color:#5ea3e4}a:hover{color:#dbdbdb}code{background-color:#242424;color:#eb002f}hr{background-color:#242424}strong{color:#dbdbdb}pre{background-color:#242424;color:#b5b5b5}table th{color:#dbdbdb}.has-text-white-dark{color:#fff!important}a.has-text-white-dark:focus,a.has-text-white-dark:hover{color:#fff!important}.has-background-white-dark{background-color:#fff!important}.has-text-black-dark{color:#0a0a0a!important}a.has-text-black-dark:focus,a.has-text-black-dark:hover{color:#242424!important}.has-background-black-dark{background-color:#0a0a0a!important}.has-text-light-dark{color:#f5f5f5!important}a.has-text-light-dark:focus,a.has-text-light-dark:hover{color:#fff!important}.has-background-light-dark{background-color:#f5f5f5!important}.has-text-dark-dark{color:#363636!important}a.has-text-dark-dark:focus,a.has-text-dark-dark:hover{color:#4f4f4f!important}.has-background-dark-dark{background-color:#363636!important}.has-text-primary-dark{color:#00d1b2!important}a.has-text-primary-dark:focus,a.has-text-primary-dark:hover{color:#05ffda!important}.has-background-primary-dark{background-color:#00d1b2!important}.has-text-link-dark{color:#3273dc!important}a.has-text-link-dark:focus,a.has-text-link-dark:hover{color:#5e91e4!important}.has-background-link-dark{background-color:#3273dc!important}.has-text-info-dark{color:#209cee!important}a.has-text-info-dark:focus,a.has-text-info-dark:hover{color:#50b1f2!important}.has-background-info-dark{background-color:#209cee!important}.has-text-success-dark{color:#23d160!important}a.has-text-success-dark:focus,a.has-text-success-dark:hover{color:#48e07d!important}.has-background-success-dark{background-color:#23d160!important}.has-text-warning-dark{color:#ffdd57!important}a.has-text-warning-dark:focus,a.has-text-warning-dark:hover{color:#ffe88a!important}.has-background-warning-dark{background-color:#ffdd57!important}.has-text-danger-dark{color:#ff3860!important}a.has-text-danger-dark:focus,a.has-text-danger-dark:hover{color:#ff6b89!important}.has-background-danger-dark{background-color:#ff3860!important}.has-text-black-bis-dark{color:#121212!important}.has-background-black-bis-dark{background-color:#121212!important}.has-text-black-ter-dark{color:#242424!important}.has-background-black-ter-dark{background-color:#242424!important}.has-text-grey-darker-dark{color:#363636!important}.has-background-grey-darker-dark{background-color:#363636!important}.has-text-grey-dark-dark{color:#4a4a4a!important}.has-background-grey-dark-dark{background-color:#4a4a4a!important}.has-text-grey-dark{color:#7a7a7a!important}.has-background-grey-dark{background-color:#7a7
a7a!important}.has-text-grey-light-dark{color:#b5b5b5!important}.has-background-grey-light-dark{background-color:#b5b5b5!important}.has-text-grey-lighter-dark{color:#dbdbdb!important}.has-background-grey-lighter-dark{background-color:#dbdbdb!important}.has-text-white-ter-dark{color:#f5f5f5!important}.has-background-white-ter-dark{background-color:#f5f5f5!important}.has-text-white-bis-dark{color:#fafafa!important}.has-background-white-bis-dark{background-color:#fafafa!important}.box{background-color:#0a0a0a;box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1);color:#b5b5b5}a.box:focus,a.box:hover{box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px #5ea3e4}a.box:active{box-shadow:inset 0 1px 2px rgba(255,255,255,.2),0 0 0 1px #5ea3e4}.button{background-color:#0a0a0a;border-color:#363636;color:#dbdbdb}.button.is-hovered,.button:hover{border-color:#4a4a4a;color:#dbdbdb}.button.is-focused,.button:focus{border-color:#5ea3e4;color:#dbdbdb}.button.is-focused:not(:active),.button:focus:not(:active){box-shadow:0 0 0 .125em rgba(94,163,228,.25)}.button.is-active,.button:active{border-color:#b5b5b5;color:#dbdbdb}.button.is-text{color:#b5b5b5}.button.is-text.is-focused,.button.is-text.is-hovered,.button.is-text:focus,.button.is-text:hover{background-color:#242424;color:#dbdbdb}.button.is-text.is-active,.button.is-text:active{background-color:#171717;color:#dbdbdb}.button.is-white{background-color:#e6e6e6;border-color:transparent;color:#0a0a0a}.button.is-white.is-hovered,.button.is-white:hover{background-color:#dfdfdf;border-color:transparent;color:#0a0a0a}.button.is-white.is-focused,.button.is-white:focus{border-color:transparent;color:#0a0a0a}.button.is-white.is-focused:not(:active),.button.is-white:focus:not(:active){box-shadow:0 0 0 .125em rgba(230,230,230,.25)}.button.is-white.is-active,.button.is-white:active{background-color:#d9d9d9;border-color:transparent;color:#0a0a0a}.button.is-white[disabled],fieldset[disabled] .button.is-white{background-color:#e6e6e6;border-color:transparent;box-shadow:none}.button.is-white.is-inverted{background-color:#0a0a0a;color:#e6e6e6}.button.is-white.is-inverted:hover{background-color:#000}.button.is-white.is-inverted[disabled],fieldset[disabled] .button.is-white.is-inverted{background-color:#0a0a0a;border-color:transparent;box-shadow:none;color:#e6e6e6}.button.is-white.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-white.is-outlined{background-color:transparent;border-color:#e6e6e6;color:#e6e6e6}.button.is-white.is-outlined:focus,.button.is-white.is-outlined:hover{background-color:#e6e6e6;border-color:#e6e6e6;color:#0a0a0a}.button.is-white.is-outlined.is-loading::after{border-color:transparent transparent #e6e6e6 #e6e6e6!important}.button.is-white.is-outlined[disabled],fieldset[disabled] .button.is-white.is-outlined{background-color:transparent;border-color:#e6e6e6;box-shadow:none;color:#e6e6e6}.button.is-white.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-white.is-inverted.is-outlined:focus,.button.is-white.is-inverted.is-outlined:hover{background-color:#0a0a0a;color:#e6e6e6}.button.is-white.is-inverted.is-outlined[disabled],fieldset[disabled] 
.button.is-white.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black{background-color:#000;border-color:transparent;color:#fff}.button.is-black.is-hovered,.button.is-black:hover{background-color:#000;border-color:transparent;color:#fff}.button.is-black.is-focused,.button.is-black:focus{border-color:transparent;color:#fff}.button.is-black.is-focused:not(:active),.button.is-black:focus:not(:active){box-shadow:0 0 0 .125em rgba(0,0,0,.25)}.button.is-black.is-active,.button.is-black:active{background-color:#000;border-color:transparent;color:#fff}.button.is-black[disabled],fieldset[disabled] .button.is-black{background-color:#000;border-color:transparent;box-shadow:none}.button.is-black.is-inverted{background-color:#fff;color:#000}.button.is-black.is-inverted:hover{background-color:#f2f2f2}.button.is-black.is-inverted[disabled],fieldset[disabled] .button.is-black.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#000}.button.is-black.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-black.is-outlined{background-color:transparent;border-color:#000;color:#000}.button.is-black.is-outlined:focus,.button.is-black.is-outlined:hover{background-color:#000;border-color:#000;color:#fff}.button.is-black.is-outlined.is-loading::after{border-color:transparent transparent #000 #000!important}.button.is-black.is-outlined[disabled],fieldset[disabled] .button.is-black.is-outlined{background-color:transparent;border-color:#000;box-shadow:none;color:#000}.button.is-black.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-black.is-inverted.is-outlined:focus,.button.is-black.is-inverted.is-outlined:hover{background-color:#fff;color:#000}.button.is-black.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-black.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-light{background-color:#dbdbdb;border-color:transparent;color:#363636}.button.is-light.is-hovered,.button.is-light:hover{background-color:#d5d5d5;border-color:transparent;color:#363636}.button.is-light.is-focused,.button.is-light:focus{border-color:transparent;color:#363636}.button.is-light.is-focused:not(:active),.button.is-light:focus:not(:active){box-shadow:0 0 0 .125em rgba(219,219,219,.25)}.button.is-light.is-active,.button.is-light:active{background-color:#cfcfcf;border-color:transparent;color:#363636}.button.is-light[disabled],fieldset[disabled] .button.is-light{background-color:#dbdbdb;border-color:transparent;box-shadow:none}.button.is-light.is-inverted{background-color:#363636;color:#dbdbdb}.button.is-light.is-inverted:hover{background-color:#292929}.button.is-light.is-inverted[disabled],fieldset[disabled] .button.is-light.is-inverted{background-color:#363636;border-color:transparent;box-shadow:none;color:#dbdbdb}.button.is-light.is-loading::after{border-color:transparent transparent #363636 #363636!important}.button.is-light.is-outlined{background-color:transparent;border-color:#dbdbdb;color:#dbdbdb}.button.is-light.is-outlined:focus,.button.is-light.is-outlined:hover{background-color:#dbdbdb;border-color:#dbdbdb;color:#363636}.button.is-light.is-outlined.is-loading::after{border-color:transparent transparent #dbdbdb #dbdbdb!important}.button.is-light.is-outlined[disabled],fieldset[disabled] 
.button.is-light.is-outlined{background-color:transparent;border-color:#dbdbdb;box-shadow:none;color:#dbdbdb}.button.is-light.is-inverted.is-outlined{background-color:transparent;border-color:#363636;color:#363636}.button.is-light.is-inverted.is-outlined:focus,.button.is-light.is-inverted.is-outlined:hover{background-color:#363636;color:#dbdbdb}.button.is-light.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-light.is-inverted.is-outlined{background-color:transparent;border-color:#363636;box-shadow:none;color:#363636}.button.is-dark{background-color:#1c1c1c;border-color:transparent;color:#f5f5f5}.button.is-dark.is-hovered,.button.is-dark:hover{background-color:#161616;border-color:transparent;color:#f5f5f5}.button.is-dark.is-focused,.button.is-dark:focus{border-color:transparent;color:#f5f5f5}.button.is-dark.is-focused:not(:active),.button.is-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(28,28,28,.25)}.button.is-dark.is-active,.button.is-dark:active{background-color:#0f0f0f;border-color:transparent;color:#f5f5f5}.button.is-dark[disabled],fieldset[disabled] .button.is-dark{background-color:#1c1c1c;border-color:transparent;box-shadow:none}.button.is-dark.is-inverted{background-color:#f5f5f5;color:#1c1c1c}.button.is-dark.is-inverted:hover{background-color:#e8e8e8}.button.is-dark.is-inverted[disabled],fieldset[disabled] .button.is-dark.is-inverted{background-color:#f5f5f5;border-color:transparent;box-shadow:none;color:#1c1c1c}.button.is-dark.is-loading::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-dark.is-outlined{background-color:transparent;border-color:#1c1c1c;color:#1c1c1c}.button.is-dark.is-outlined:focus,.button.is-dark.is-outlined:hover{background-color:#1c1c1c;border-color:#1c1c1c;color:#f5f5f5}.button.is-dark.is-outlined.is-loading::after{border-color:transparent transparent #1c1c1c #1c1c1c!important}.button.is-dark.is-outlined[disabled],fieldset[disabled] .button.is-dark.is-outlined{background-color:transparent;border-color:#1c1c1c;box-shadow:none;color:#1c1c1c}.button.is-dark.is-inverted.is-outlined{background-color:transparent;border-color:#f5f5f5;color:#f5f5f5}.button.is-dark.is-inverted.is-outlined:focus,.button.is-dark.is-inverted.is-outlined:hover{background-color:#f5f5f5;color:#1c1c1c}.button.is-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-dark.is-inverted.is-outlined{background-color:transparent;border-color:#f5f5f5;box-shadow:none;color:#f5f5f5}.button.is-primary{background-color:#009e86;border-color:transparent;color:#fff}.button.is-primary.is-hovered,.button.is-primary:hover{background-color:#00917c;border-color:transparent;color:#fff}.button.is-primary.is-focused,.button.is-primary:focus{border-color:transparent;color:#fff}.button.is-primary.is-focused:not(:active),.button.is-primary:focus:not(:active){box-shadow:0 0 0 .125em rgba(0,158,134,.25)}.button.is-primary.is-active,.button.is-primary:active{background-color:#008571;border-color:transparent;color:#fff}.button.is-primary[disabled],fieldset[disabled] .button.is-primary{background-color:#009e86;border-color:transparent;box-shadow:none}.button.is-primary.is-inverted{background-color:#fff;color:#009e86}.button.is-primary.is-inverted:hover{background-color:#f2f2f2}.button.is-primary.is-inverted[disabled],fieldset[disabled] .button.is-primary.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#009e86}.button.is-primary.is-loading::after{border-color:transparent transparent #fff 
#fff!important}.button.is-primary.is-outlined{background-color:transparent;border-color:#009e86;color:#009e86}.button.is-primary.is-outlined:focus,.button.is-primary.is-outlined:hover{background-color:#009e86;border-color:#009e86;color:#fff}.button.is-primary.is-outlined.is-loading::after{border-color:transparent transparent #009e86 #009e86!important}.button.is-primary.is-outlined[disabled],fieldset[disabled] .button.is-primary.is-outlined{background-color:transparent;border-color:#009e86;box-shadow:none;color:#009e86}.button.is-primary.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-primary.is-inverted.is-outlined:focus,.button.is-primary.is-inverted.is-outlined:hover{background-color:#fff;color:#009e86}.button.is-primary.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-primary.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-link{background-color:#205bbc;border-color:transparent;color:#fff}.button.is-link.is-hovered,.button.is-link:hover{background-color:#1e56b1;border-color:transparent;color:#fff}.button.is-link.is-focused,.button.is-link:focus{border-color:transparent;color:#fff}.button.is-link.is-focused:not(:active),.button.is-link:focus:not(:active){box-shadow:0 0 0 .125em rgba(32,91,188,.25)}.button.is-link.is-active,.button.is-link:active{background-color:#1c51a6;border-color:transparent;color:#fff}.button.is-link[disabled],fieldset[disabled] .button.is-link{background-color:#205bbc;border-color:transparent;box-shadow:none}.button.is-link.is-inverted{background-color:#fff;color:#205bbc}.button.is-link.is-inverted:hover{background-color:#f2f2f2}.button.is-link.is-inverted[disabled],fieldset[disabled] .button.is-link.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#205bbc}.button.is-link.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-link.is-outlined{background-color:transparent;border-color:#205bbc;color:#205bbc}.button.is-link.is-outlined:focus,.button.is-link.is-outlined:hover{background-color:#205bbc;border-color:#205bbc;color:#fff}.button.is-link.is-outlined.is-loading::after{border-color:transparent transparent #205bbc #205bbc!important}.button.is-link.is-outlined[disabled],fieldset[disabled] .button.is-link.is-outlined{background-color:transparent;border-color:#205bbc;box-shadow:none;color:#205bbc}.button.is-link.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-link.is-inverted.is-outlined:focus,.button.is-link.is-inverted.is-outlined:hover{background-color:#fff;color:#205bbc}.button.is-link.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-link.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-info{background-color:#0f81cc;border-color:transparent;color:#fff}.button.is-info.is-hovered,.button.is-info:hover{background-color:#0e79c0;border-color:transparent;color:#fff}.button.is-info.is-focused,.button.is-info:focus{border-color:transparent;color:#fff}.button.is-info.is-focused:not(:active),.button.is-info:focus:not(:active){box-shadow:0 0 0 .125em rgba(15,129,204,.25)}.button.is-info.is-active,.button.is-info:active{background-color:#0e72b4;border-color:transparent;color:#fff}.button.is-info[disabled],fieldset[disabled] 
.button.is-info{background-color:#0f81cc;border-color:transparent;box-shadow:none}.button.is-info.is-inverted{background-color:#fff;color:#0f81cc}.button.is-info.is-inverted:hover{background-color:#f2f2f2}.button.is-info.is-inverted[disabled],fieldset[disabled] .button.is-info.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#0f81cc}.button.is-info.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-info.is-outlined{background-color:transparent;border-color:#0f81cc;color:#0f81cc}.button.is-info.is-outlined:focus,.button.is-info.is-outlined:hover{background-color:#0f81cc;border-color:#0f81cc;color:#fff}.button.is-info.is-outlined.is-loading::after{border-color:transparent transparent #0f81cc #0f81cc!important}.button.is-info.is-outlined[disabled],fieldset[disabled] .button.is-info.is-outlined{background-color:transparent;border-color:#0f81cc;box-shadow:none;color:#0f81cc}.button.is-info.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-info.is-inverted.is-outlined:focus,.button.is-info.is-inverted.is-outlined:hover{background-color:#fff;color:#0f81cc}.button.is-info.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-info.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-success{background-color:#1ca64c;border-color:transparent;color:#fff}.button.is-success.is-hovered,.button.is-success:hover{background-color:#1a9b47;border-color:transparent;color:#fff}.button.is-success.is-focused,.button.is-success:focus{border-color:transparent;color:#fff}.button.is-success.is-focused:not(:active),.button.is-success:focus:not(:active){box-shadow:0 0 0 .125em rgba(28,166,76,.25)}.button.is-success.is-active,.button.is-success:active{background-color:#189042;border-color:transparent;color:#fff}.button.is-success[disabled],fieldset[disabled] .button.is-success{background-color:#1ca64c;border-color:transparent;box-shadow:none}.button.is-success.is-inverted{background-color:#fff;color:#1ca64c}.button.is-success.is-inverted:hover{background-color:#f2f2f2}.button.is-success.is-inverted[disabled],fieldset[disabled] .button.is-success.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#1ca64c}.button.is-success.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-success.is-outlined{background-color:transparent;border-color:#1ca64c;color:#1ca64c}.button.is-success.is-outlined:focus,.button.is-success.is-outlined:hover{background-color:#1ca64c;border-color:#1ca64c;color:#fff}.button.is-success.is-outlined.is-loading::after{border-color:transparent transparent #1ca64c #1ca64c!important}.button.is-success.is-outlined[disabled],fieldset[disabled] .button.is-success.is-outlined{background-color:transparent;border-color:#1ca64c;box-shadow:none;color:#1ca64c}.button.is-success.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-success.is-inverted.is-outlined:focus,.button.is-success.is-inverted.is-outlined:hover{background-color:#fff;color:#1ca64c}.button.is-success.is-inverted.is-outlined[disabled],fieldset[disabled] 
.button.is-success.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-warning{background-color:#ffd324;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-hovered,.button.is-warning:hover{background-color:#ffd117;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-focused,.button.is-warning:focus{border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-focused:not(:active),.button.is-warning:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,211,36,.25)}.button.is-warning.is-active,.button.is-warning:active{background-color:#ffce0a;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning[disabled],fieldset[disabled] .button.is-warning{background-color:#ffd324;border-color:transparent;box-shadow:none}.button.is-warning.is-inverted{background-color:rgba(0,0,0,.7);color:#ffd324}.button.is-warning.is-inverted:hover{background-color:rgba(0,0,0,.7)}.button.is-warning.is-inverted[disabled],fieldset[disabled] .button.is-warning.is-inverted{background-color:rgba(0,0,0,.7);border-color:transparent;box-shadow:none;color:#ffd324}.button.is-warning.is-loading::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-warning.is-outlined{background-color:transparent;border-color:#ffd324;color:#ffd324}.button.is-warning.is-outlined:focus,.button.is-warning.is-outlined:hover{background-color:#ffd324;border-color:#ffd324;color:rgba(0,0,0,.7)}.button.is-warning.is-outlined.is-loading::after{border-color:transparent transparent #ffd324 #ffd324!important}.button.is-warning.is-outlined[disabled],fieldset[disabled] .button.is-warning.is-outlined{background-color:transparent;border-color:#ffd324;box-shadow:none;color:#ffd324}.button.is-warning.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);color:rgba(0,0,0,.7)}.button.is-warning.is-inverted.is-outlined:focus,.button.is-warning.is-inverted.is-outlined:hover{background-color:rgba(0,0,0,.7);color:#ffd324}.button.is-warning.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-warning.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);box-shadow:none;color:rgba(0,0,0,.7)}.button.is-danger{background-color:#ff0537;border-color:transparent;color:#fff}.button.is-danger.is-hovered,.button.is-danger:hover{background-color:#f70031;border-color:transparent;color:#fff}.button.is-danger.is-focused,.button.is-danger:focus{border-color:transparent;color:#fff}.button.is-danger.is-focused:not(:active),.button.is-danger:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,5,55,.25)}.button.is-danger.is-active,.button.is-danger:active{background-color:#eb002f;border-color:transparent;color:#fff}.button.is-danger[disabled],fieldset[disabled] .button.is-danger{background-color:#ff0537;border-color:transparent;box-shadow:none}.button.is-danger.is-inverted{background-color:#fff;color:#ff0537}.button.is-danger.is-inverted:hover{background-color:#f2f2f2}.button.is-danger.is-inverted[disabled],fieldset[disabled] .button.is-danger.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#ff0537}.button.is-danger.is-loading::after{border-color:transparent transparent #fff 
#fff!important}.button.is-danger.is-outlined{background-color:transparent;border-color:#ff0537;color:#ff0537}.button.is-danger.is-outlined:focus,.button.is-danger.is-outlined:hover{background-color:#ff0537;border-color:#ff0537;color:#fff}.button.is-danger.is-outlined.is-loading::after{border-color:transparent transparent #ff0537 #ff0537!important}.button.is-danger.is-outlined[disabled],fieldset[disabled] .button.is-danger.is-outlined{background-color:transparent;border-color:#ff0537;box-shadow:none;color:#ff0537}.button.is-danger.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-danger.is-inverted.is-outlined:focus,.button.is-danger.is-inverted.is-outlined:hover{background-color:#fff;color:#ff0537}.button.is-danger.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-danger.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-white-dark{background-color:#fff;border-color:transparent;color:#0a0a0a}.button.is-white-dark.is-hovered,.button.is-white-dark:hover{background-color:#f9f9f9;border-color:transparent;color:#0a0a0a}.button.is-white-dark.is-focused,.button.is-white-dark:focus{border-color:transparent;color:#0a0a0a}.button.is-white-dark.is-focused:not(:active),.button.is-white-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.button.is-white-dark.is-active,.button.is-white-dark:active{background-color:#f2f2f2;border-color:transparent;color:#0a0a0a}.button.is-white-dark[disabled],fieldset[disabled] .button.is-white-dark{background-color:#fff;border-color:transparent;box-shadow:none}.button.is-white-dark.is-inverted{background-color:#0a0a0a;color:#fff}.button.is-white-dark.is-inverted:hover{background-color:#000}.button.is-white-dark.is-inverted[disabled],fieldset[disabled] .button.is-white-dark.is-inverted{background-color:#0a0a0a;border-color:transparent;box-shadow:none;color:#fff}.button.is-white-dark.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-white-dark.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-white-dark.is-outlined:focus,.button.is-white-dark.is-outlined:hover{background-color:#fff;border-color:#fff;color:#0a0a0a}.button.is-white-dark.is-outlined.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-white-dark.is-outlined[disabled],fieldset[disabled] .button.is-white-dark.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-white-dark.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-white-dark.is-inverted.is-outlined:focus,.button.is-white-dark.is-inverted.is-outlined:hover{background-color:#0a0a0a;color:#fff}.button.is-white-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-white-dark.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black-dark{background-color:#0a0a0a;border-color:transparent;color:#fff}.button.is-black-dark.is-hovered,.button.is-black-dark:hover{background-color:#040404;border-color:transparent;color:#fff}.button.is-black-dark.is-focused,.button.is-black-dark:focus{border-color:transparent;color:#fff}.button.is-black-dark.is-focused:not(:active),.button.is-black-dark:focus:not(:active){box-shadow:0 0 0 .125em 
rgba(10,10,10,.25)}.button.is-black-dark.is-active,.button.is-black-dark:active{background-color:#000;border-color:transparent;color:#fff}.button.is-black-dark[disabled],fieldset[disabled] .button.is-black-dark{background-color:#0a0a0a;border-color:transparent;box-shadow:none}.button.is-black-dark.is-inverted{background-color:#fff;color:#0a0a0a}.button.is-black-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-black-dark.is-inverted[disabled],fieldset[disabled] .button.is-black-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#0a0a0a}.button.is-black-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-black-dark.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-black-dark.is-outlined:focus,.button.is-black-dark.is-outlined:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.button.is-black-dark.is-outlined.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-black-dark.is-outlined[disabled],fieldset[disabled] .button.is-black-dark.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-black-dark.is-inverted.is-outlined:focus,.button.is-black-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#0a0a0a}.button.is-black-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-black-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-light-dark{background-color:#f5f5f5;border-color:transparent;color:#363636}.button.is-light-dark.is-hovered,.button.is-light-dark:hover{background-color:#eee;border-color:transparent;color:#363636}.button.is-light-dark.is-focused,.button.is-light-dark:focus{border-color:transparent;color:#363636}.button.is-light-dark.is-focused:not(:active),.button.is-light-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.button.is-light-dark.is-active,.button.is-light-dark:active{background-color:#e8e8e8;border-color:transparent;color:#363636}.button.is-light-dark[disabled],fieldset[disabled] .button.is-light-dark{background-color:#f5f5f5;border-color:transparent;box-shadow:none}.button.is-light-dark.is-inverted{background-color:#363636;color:#f5f5f5}.button.is-light-dark.is-inverted:hover{background-color:#292929}.button.is-light-dark.is-inverted[disabled],fieldset[disabled] .button.is-light-dark.is-inverted{background-color:#363636;border-color:transparent;box-shadow:none;color:#f5f5f5}.button.is-light-dark.is-loading::after{border-color:transparent transparent #363636 #363636!important}.button.is-light-dark.is-outlined{background-color:transparent;border-color:#f5f5f5;color:#f5f5f5}.button.is-light-dark.is-outlined:focus,.button.is-light-dark.is-outlined:hover{background-color:#f5f5f5;border-color:#f5f5f5;color:#363636}.button.is-light-dark.is-outlined.is-loading::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-light-dark.is-outlined[disabled],fieldset[disabled] 
.button.is-light-dark.is-outlined{background-color:transparent;border-color:#f5f5f5;box-shadow:none;color:#f5f5f5}.button.is-light-dark.is-inverted.is-outlined{background-color:transparent;border-color:#363636;color:#363636}.button.is-light-dark.is-inverted.is-outlined:focus,.button.is-light-dark.is-inverted.is-outlined:hover{background-color:#363636;color:#f5f5f5}.button.is-light-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-light-dark.is-inverted.is-outlined{background-color:transparent;border-color:#363636;box-shadow:none;color:#363636}.button.is-dark-dark{background-color:#363636;border-color:transparent;color:#f5f5f5}.button.is-dark-dark.is-hovered,.button.is-dark-dark:hover{background-color:#2f2f2f;border-color:transparent;color:#f5f5f5}.button.is-dark-dark.is-focused,.button.is-dark-dark:focus{border-color:transparent;color:#f5f5f5}.button.is-dark-dark.is-focused:not(:active),.button.is-dark-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.button.is-dark-dark.is-active,.button.is-dark-dark:active{background-color:#292929;border-color:transparent;color:#f5f5f5}.button.is-dark-dark[disabled],fieldset[disabled] .button.is-dark-dark{background-color:#363636;border-color:transparent;box-shadow:none}.button.is-dark-dark.is-inverted{background-color:#f5f5f5;color:#363636}.button.is-dark-dark.is-inverted:hover{background-color:#e8e8e8}.button.is-dark-dark.is-inverted[disabled],fieldset[disabled] .button.is-dark-dark.is-inverted{background-color:#f5f5f5;border-color:transparent;box-shadow:none;color:#363636}.button.is-dark-dark.is-loading::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-dark-dark.is-outlined{background-color:transparent;border-color:#363636;color:#363636}.button.is-dark-dark.is-outlined:focus,.button.is-dark-dark.is-outlined:hover{background-color:#363636;border-color:#363636;color:#f5f5f5}.button.is-dark-dark.is-outlined.is-loading::after{border-color:transparent transparent #363636 #363636!important}.button.is-dark-dark.is-outlined[disabled],fieldset[disabled] .button.is-dark-dark.is-outlined{background-color:transparent;border-color:#363636;box-shadow:none;color:#363636}.button.is-dark-dark.is-inverted.is-outlined{background-color:transparent;border-color:#f5f5f5;color:#f5f5f5}.button.is-dark-dark.is-inverted.is-outlined:focus,.button.is-dark-dark.is-inverted.is-outlined:hover{background-color:#f5f5f5;color:#363636}.button.is-dark-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-dark-dark.is-inverted.is-outlined{background-color:transparent;border-color:#f5f5f5;box-shadow:none;color:#f5f5f5}.button.is-primary-dark{background-color:#00d1b2;border-color:transparent;color:#fff}.button.is-primary-dark.is-hovered,.button.is-primary-dark:hover{background-color:#00c4a7;border-color:transparent;color:#fff}.button.is-primary-dark.is-focused,.button.is-primary-dark:focus{border-color:transparent;color:#fff}.button.is-primary-dark.is-focused:not(:active),.button.is-primary-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.button.is-primary-dark.is-active,.button.is-primary-dark:active{background-color:#00b89c;border-color:transparent;color:#fff}.button.is-primary-dark[disabled],fieldset[disabled] 
.button.is-primary-dark{background-color:#00d1b2;border-color:transparent;box-shadow:none}.button.is-primary-dark.is-inverted{background-color:#fff;color:#00d1b2}.button.is-primary-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-primary-dark.is-inverted[disabled],fieldset[disabled] .button.is-primary-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#00d1b2}.button.is-primary-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-primary-dark.is-outlined{background-color:transparent;border-color:#00d1b2;color:#00d1b2}.button.is-primary-dark.is-outlined:focus,.button.is-primary-dark.is-outlined:hover{background-color:#00d1b2;border-color:#00d1b2;color:#fff}.button.is-primary-dark.is-outlined.is-loading::after{border-color:transparent transparent #00d1b2 #00d1b2!important}.button.is-primary-dark.is-outlined[disabled],fieldset[disabled] .button.is-primary-dark.is-outlined{background-color:transparent;border-color:#00d1b2;box-shadow:none;color:#00d1b2}.button.is-primary-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-primary-dark.is-inverted.is-outlined:focus,.button.is-primary-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#00d1b2}.button.is-primary-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-primary-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-link-dark{background-color:#3273dc;border-color:transparent;color:#fff}.button.is-link-dark.is-hovered,.button.is-link-dark:hover{background-color:#276cda;border-color:transparent;color:#fff}.button.is-link-dark.is-focused,.button.is-link-dark:focus{border-color:transparent;color:#fff}.button.is-link-dark.is-focused:not(:active),.button.is-link-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.button.is-link-dark.is-active,.button.is-link-dark:active{background-color:#2366d1;border-color:transparent;color:#fff}.button.is-link-dark[disabled],fieldset[disabled] .button.is-link-dark{background-color:#3273dc;border-color:transparent;box-shadow:none}.button.is-link-dark.is-inverted{background-color:#fff;color:#3273dc}.button.is-link-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-link-dark.is-inverted[disabled],fieldset[disabled] .button.is-link-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#3273dc}.button.is-link-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-link-dark.is-outlined{background-color:transparent;border-color:#3273dc;color:#3273dc}.button.is-link-dark.is-outlined:focus,.button.is-link-dark.is-outlined:hover{background-color:#3273dc;border-color:#3273dc;color:#fff}.button.is-link-dark.is-outlined.is-loading::after{border-color:transparent transparent #3273dc #3273dc!important}.button.is-link-dark.is-outlined[disabled],fieldset[disabled] .button.is-link-dark.is-outlined{background-color:transparent;border-color:#3273dc;box-shadow:none;color:#3273dc}.button.is-link-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-link-dark.is-inverted.is-outlined:focus,.button.is-link-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#3273dc}.button.is-link-dark.is-inverted.is-outlined[disabled],fieldset[disabled] 
.button.is-link-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-info-dark{background-color:#209cee;border-color:transparent;color:#fff}.button.is-info-dark.is-hovered,.button.is-info-dark:hover{background-color:#1496ed;border-color:transparent;color:#fff}.button.is-info-dark.is-focused,.button.is-info-dark:focus{border-color:transparent;color:#fff}.button.is-info-dark.is-focused:not(:active),.button.is-info-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(32,156,238,.25)}.button.is-info-dark.is-active,.button.is-info-dark:active{background-color:#118fe4;border-color:transparent;color:#fff}.button.is-info-dark[disabled],fieldset[disabled] .button.is-info-dark{background-color:#209cee;border-color:transparent;box-shadow:none}.button.is-info-dark.is-inverted{background-color:#fff;color:#209cee}.button.is-info-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-info-dark.is-inverted[disabled],fieldset[disabled] .button.is-info-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#209cee}.button.is-info-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-info-dark.is-outlined{background-color:transparent;border-color:#209cee;color:#209cee}.button.is-info-dark.is-outlined:focus,.button.is-info-dark.is-outlined:hover{background-color:#209cee;border-color:#209cee;color:#fff}.button.is-info-dark.is-outlined.is-loading::after{border-color:transparent transparent #209cee #209cee!important}.button.is-info-dark.is-outlined[disabled],fieldset[disabled] .button.is-info-dark.is-outlined{background-color:transparent;border-color:#209cee;box-shadow:none;color:#209cee}.button.is-info-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-info-dark.is-inverted.is-outlined:focus,.button.is-info-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#209cee}.button.is-info-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-info-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-success-dark{background-color:#23d160;border-color:transparent;color:#fff}.button.is-success-dark.is-hovered,.button.is-success-dark:hover{background-color:#22c65b;border-color:transparent;color:#fff}.button.is-success-dark.is-focused,.button.is-success-dark:focus{border-color:transparent;color:#fff}.button.is-success-dark.is-focused:not(:active),.button.is-success-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(35,209,96,.25)}.button.is-success-dark.is-active,.button.is-success-dark:active{background-color:#20bc56;border-color:transparent;color:#fff}.button.is-success-dark[disabled],fieldset[disabled] .button.is-success-dark{background-color:#23d160;border-color:transparent;box-shadow:none}.button.is-success-dark.is-inverted{background-color:#fff;color:#23d160}.button.is-success-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-success-dark.is-inverted[disabled],fieldset[disabled] .button.is-success-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#23d160}.button.is-success-dark.is-loading::after{border-color:transparent transparent #fff 
#fff!important}.button.is-success-dark.is-outlined{background-color:transparent;border-color:#23d160;color:#23d160}.button.is-success-dark.is-outlined:focus,.button.is-success-dark.is-outlined:hover{background-color:#23d160;border-color:#23d160;color:#fff}.button.is-success-dark.is-outlined.is-loading::after{border-color:transparent transparent #23d160 #23d160!important}.button.is-success-dark.is-outlined[disabled],fieldset[disabled] .button.is-success-dark.is-outlined{background-color:transparent;border-color:#23d160;box-shadow:none;color:#23d160}.button.is-success-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-success-dark.is-inverted.is-outlined:focus,.button.is-success-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#23d160}.button.is-success-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-success-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-warning-dark{background-color:#ffdd57;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning-dark.is-hovered,.button.is-warning-dark:hover{background-color:#ffdb4a;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning-dark.is-focused,.button.is-warning-dark:focus{border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning-dark.is-focused:not(:active),.button.is-warning-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.button.is-warning-dark.is-active,.button.is-warning-dark:active{background-color:#ffd83d;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning-dark[disabled],fieldset[disabled] .button.is-warning-dark{background-color:#ffdd57;border-color:transparent;box-shadow:none}.button.is-warning-dark.is-inverted{background-color:rgba(0,0,0,.7);color:#ffdd57}.button.is-warning-dark.is-inverted:hover{background-color:rgba(0,0,0,.7)}.button.is-warning-dark.is-inverted[disabled],fieldset[disabled] .button.is-warning-dark.is-inverted{background-color:rgba(0,0,0,.7);border-color:transparent;box-shadow:none;color:#ffdd57}.button.is-warning-dark.is-loading::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-warning-dark.is-outlined{background-color:transparent;border-color:#ffdd57;color:#ffdd57}.button.is-warning-dark.is-outlined:focus,.button.is-warning-dark.is-outlined:hover{background-color:#ffdd57;border-color:#ffdd57;color:rgba(0,0,0,.7)}.button.is-warning-dark.is-outlined.is-loading::after{border-color:transparent transparent #ffdd57 #ffdd57!important}.button.is-warning-dark.is-outlined[disabled],fieldset[disabled] .button.is-warning-dark.is-outlined{background-color:transparent;border-color:#ffdd57;box-shadow:none;color:#ffdd57}.button.is-warning-dark.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);color:rgba(0,0,0,.7)}.button.is-warning-dark.is-inverted.is-outlined:focus,.button.is-warning-dark.is-inverted.is-outlined:hover{background-color:rgba(0,0,0,.7);color:#ffdd57}.button.is-warning-dark.is-inverted.is-outlined[disabled],fieldset[disabled] 
.button.is-warning-dark.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);box-shadow:none;color:rgba(0,0,0,.7)}.button.is-danger-dark{background-color:#ff3860;border-color:transparent;color:#fff}.button.is-danger-dark.is-hovered,.button.is-danger-dark:hover{background-color:#ff2b56;border-color:transparent;color:#fff}.button.is-danger-dark.is-focused,.button.is-danger-dark:focus{border-color:transparent;color:#fff}.button.is-danger-dark.is-focused:not(:active),.button.is-danger-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,56,96,.25)}.button.is-danger-dark.is-active,.button.is-danger-dark:active{background-color:#ff1f4b;border-color:transparent;color:#fff}.button.is-danger-dark[disabled],fieldset[disabled] .button.is-danger-dark{background-color:#ff3860;border-color:transparent;box-shadow:none}.button.is-danger-dark.is-inverted{background-color:#fff;color:#ff3860}.button.is-danger-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-danger-dark.is-inverted[disabled],fieldset[disabled] .button.is-danger-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#ff3860}.button.is-danger-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-danger-dark.is-outlined{background-color:transparent;border-color:#ff3860;color:#ff3860}.button.is-danger-dark.is-outlined:focus,.button.is-danger-dark.is-outlined:hover{background-color:#ff3860;border-color:#ff3860;color:#fff}.button.is-danger-dark.is-outlined.is-loading::after{border-color:transparent transparent #ff3860 #ff3860!important}.button.is-danger-dark.is-outlined[disabled],fieldset[disabled] .button.is-danger-dark.is-outlined{background-color:transparent;border-color:#ff3860;box-shadow:none;color:#ff3860}.button.is-danger-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-danger-dark.is-inverted.is-outlined:focus,.button.is-danger-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#ff3860}.button.is-danger-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-danger-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button[disabled],fieldset[disabled] .button{background-color:#0a0a0a;border-color:#363636}.button.is-static{background-color:#f5f5f5;border-color:#363636;color:#7a7a7a}.content h1,.content h2,.content h3,.content h4,.content h5,.content h6{color:#dbdbdb}.content blockquote{background-color:#242424;border-left:5px solid #363636}.content table td,.content table th{border:1px solid #363636}.content table th{color:#dbdbdb}.content table thead td,.content table thead th{color:#dbdbdb}.content table tfoot td,.content table tfoot th{color:#dbdbdb}.input,.textarea{background-color:#0a0a0a;border-color:#363636;color:#dbdbdb;box-shadow:inset 0 1px 2px rgba(255,255,255,.1)}.input::-moz-placeholder,.textarea::-moz-placeholder{color:rgba(219,219,219,.3)}.input::-webkit-input-placeholder,.textarea::-webkit-input-placeholder{color:rgba(219,219,219,.3)}.input:-moz-placeholder,.textarea:-moz-placeholder{color:rgba(219,219,219,.3)}.input:-ms-input-placeholder,.textarea:-ms-input-placeholder{color:rgba(219,219,219,.3)}.input.is-hovered,.input:hover,.textarea.is-hovered,.textarea:hover{border-color:#4a4a4a}.input.is-active,.input.is-focused,.input:active,.input:focus,.textarea.is-active,.textarea.is-focused,.textarea:active,.textarea:focus{border-color:#5ea3e4;box-shadow:0 0 0 .125em 
rgba(94,163,228,.25)}.input[disabled],.textarea[disabled],fieldset[disabled] .input,fieldset[disabled] .textarea{background-color:#242424;border-color:#242424;color:#b5b5b5}.input[disabled]::-moz-placeholder,.textarea[disabled]::-moz-placeholder,fieldset[disabled] .input::-moz-placeholder,fieldset[disabled] .textarea::-moz-placeholder{color:rgba(181,181,181,.3)}.input[disabled]::-webkit-input-placeholder,.textarea[disabled]::-webkit-input-placeholder,fieldset[disabled] .input::-webkit-input-placeholder,fieldset[disabled] .textarea::-webkit-input-placeholder{color:rgba(181,181,181,.3)}.input[disabled]:-moz-placeholder,.textarea[disabled]:-moz-placeholder,fieldset[disabled] .input:-moz-placeholder,fieldset[disabled] .textarea:-moz-placeholder{color:rgba(181,181,181,.3)}.input[disabled]:-ms-input-placeholder,.textarea[disabled]:-ms-input-placeholder,fieldset[disabled] .input:-ms-input-placeholder,fieldset[disabled] .textarea:-ms-input-placeholder{color:rgba(181,181,181,.3)}.input.is-white,.textarea.is-white{border-color:#e6e6e6}.input.is-white.is-active,.input.is-white.is-focused,.input.is-white:active,.input.is-white:focus,.textarea.is-white.is-active,.textarea.is-white.is-focused,.textarea.is-white:active,.textarea.is-white:focus{box-shadow:0 0 0 .125em rgba(230,230,230,.25)}.input.is-black,.textarea.is-black{border-color:#000}.input.is-black.is-active,.input.is-black.is-focused,.input.is-black:active,.input.is-black:focus,.textarea.is-black.is-active,.textarea.is-black.is-focused,.textarea.is-black:active,.textarea.is-black:focus{box-shadow:0 0 0 .125em rgba(0,0,0,.25)}.input.is-light,.textarea.is-light{border-color:#dbdbdb}.input.is-light.is-active,.input.is-light.is-focused,.input.is-light:active,.input.is-light:focus,.textarea.is-light.is-active,.textarea.is-light.is-focused,.textarea.is-light:active,.textarea.is-light:focus{box-shadow:0 0 0 .125em rgba(219,219,219,.25)}.input.is-dark,.textarea.is-dark{border-color:#1c1c1c}.input.is-dark.is-active,.input.is-dark.is-focused,.input.is-dark:active,.input.is-dark:focus,.textarea.is-dark.is-active,.textarea.is-dark.is-focused,.textarea.is-dark:active,.textarea.is-dark:focus{box-shadow:0 0 0 .125em rgba(28,28,28,.25)}.input.is-primary,.textarea.is-primary{border-color:#009e86}.input.is-primary.is-active,.input.is-primary.is-focused,.input.is-primary:active,.input.is-primary:focus,.textarea.is-primary.is-active,.textarea.is-primary.is-focused,.textarea.is-primary:active,.textarea.is-primary:focus{box-shadow:0 0 0 .125em rgba(0,158,134,.25)}.input.is-link,.textarea.is-link{border-color:#205bbc}.input.is-link.is-active,.input.is-link.is-focused,.input.is-link:active,.input.is-link:focus,.textarea.is-link.is-active,.textarea.is-link.is-focused,.textarea.is-link:active,.textarea.is-link:focus{box-shadow:0 0 0 .125em rgba(32,91,188,.25)}.input.is-info,.textarea.is-info{border-color:#0f81cc}.input.is-info.is-active,.input.is-info.is-focused,.input.is-info:active,.input.is-info:focus,.textarea.is-info.is-active,.textarea.is-info.is-focused,.textarea.is-info:active,.textarea.is-info:focus{box-shadow:0 0 0 .125em rgba(15,129,204,.25)}.input.is-success,.textarea.is-success{border-color:#1ca64c}.input.is-success.is-active,.input.is-success.is-focused,.input.is-success:active,.input.is-success:focus,.textarea.is-success.is-active,.textarea.is-success.is-focused,.textarea.is-success:active,.textarea.is-success:focus{box-shadow:0 0 0 .125em 
rgba(28,166,76,.25)}.input.is-warning,.textarea.is-warning{border-color:#ffd324}.input.is-warning.is-active,.input.is-warning.is-focused,.input.is-warning:active,.input.is-warning:focus,.textarea.is-warning.is-active,.textarea.is-warning.is-focused,.textarea.is-warning:active,.textarea.is-warning:focus{box-shadow:0 0 0 .125em rgba(255,211,36,.25)}.input.is-danger,.textarea.is-danger{border-color:#ff0537}.input.is-danger.is-active,.input.is-danger.is-focused,.input.is-danger:active,.input.is-danger:focus,.textarea.is-danger.is-active,.textarea.is-danger.is-focused,.textarea.is-danger:active,.textarea.is-danger:focus{box-shadow:0 0 0 .125em rgba(255,5,55,.25)}.input.is-white-dark,.textarea.is-white-dark{border-color:#fff}.input.is-white-dark.is-active,.input.is-white-dark.is-focused,.input.is-white-dark:active,.input.is-white-dark:focus,.textarea.is-white-dark.is-active,.textarea.is-white-dark.is-focused,.textarea.is-white-dark:active,.textarea.is-white-dark:focus{box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.input.is-black-dark,.textarea.is-black-dark{border-color:#0a0a0a}.input.is-black-dark.is-active,.input.is-black-dark.is-focused,.input.is-black-dark:active,.input.is-black-dark:focus,.textarea.is-black-dark.is-active,.textarea.is-black-dark.is-focused,.textarea.is-black-dark:active,.textarea.is-black-dark:focus{box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.input.is-light-dark,.textarea.is-light-dark{border-color:#f5f5f5}.input.is-light-dark.is-active,.input.is-light-dark.is-focused,.input.is-light-dark:active,.input.is-light-dark:focus,.textarea.is-light-dark.is-active,.textarea.is-light-dark.is-focused,.textarea.is-light-dark:active,.textarea.is-light-dark:focus{box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.input.is-dark-dark,.textarea.is-dark-dark{border-color:#363636}.input.is-dark-dark.is-active,.input.is-dark-dark.is-focused,.input.is-dark-dark:active,.input.is-dark-dark:focus,.textarea.is-dark-dark.is-active,.textarea.is-dark-dark.is-focused,.textarea.is-dark-dark:active,.textarea.is-dark-dark:focus{box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.input.is-primary-dark,.textarea.is-primary-dark{border-color:#00d1b2}.input.is-primary-dark.is-active,.input.is-primary-dark.is-focused,.input.is-primary-dark:active,.input.is-primary-dark:focus,.textarea.is-primary-dark.is-active,.textarea.is-primary-dark.is-focused,.textarea.is-primary-dark:active,.textarea.is-primary-dark:focus{box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.input.is-link-dark,.textarea.is-link-dark{border-color:#3273dc}.input.is-link-dark.is-active,.input.is-link-dark.is-focused,.input.is-link-dark:active,.input.is-link-dark:focus,.textarea.is-link-dark.is-active,.textarea.is-link-dark.is-focused,.textarea.is-link-dark:active,.textarea.is-link-dark:focus{box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.input.is-info-dark,.textarea.is-info-dark{border-color:#209cee}.input.is-info-dark.is-active,.input.is-info-dark.is-focused,.input.is-info-dark:active,.input.is-info-dark:focus,.textarea.is-info-dark.is-active,.textarea.is-info-dark.is-focused,.textarea.is-info-dark:active,.textarea.is-info-dark:focus{box-shadow:0 0 0 .125em rgba(32,156,238,.25)}.input.is-success-dark,.textarea.is-success-dark{border-color:#23d160}.input.is-success-dark.is-active,.input.is-success-dark.is-focused,.input.is-success-dark:active,.input.is-success-dark:focus,.textarea.is-success-dark.is-active,.textarea.is-success-dark.is-focused,.textarea.is-success-dark:active,.textarea.is-success-dark:focus{box-shadow:0 0 0 .125em 
rgba(35,209,96,.25)}.input.is-warning-dark,.textarea.is-warning-dark{border-color:#ffdd57}.input.is-warning-dark.is-active,.input.is-warning-dark.is-focused,.input.is-warning-dark:active,.input.is-warning-dark:focus,.textarea.is-warning-dark.is-active,.textarea.is-warning-dark.is-focused,.textarea.is-warning-dark:active,.textarea.is-warning-dark:focus{box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.input.is-danger-dark,.textarea.is-danger-dark{border-color:#ff3860}.input.is-danger-dark.is-active,.input.is-danger-dark.is-focused,.input.is-danger-dark:active,.input.is-danger-dark:focus,.textarea.is-danger-dark.is-active,.textarea.is-danger-dark.is-focused,.textarea.is-danger-dark:active,.textarea.is-danger-dark:focus{box-shadow:0 0 0 .125em rgba(255,56,96,.25)}.checkbox:hover,.radio:hover{color:#dbdbdb}.checkbox[disabled],.radio[disabled],fieldset[disabled] .checkbox,fieldset[disabled] .radio{color:#b5b5b5}.select:not(.is-multiple):not(.is-loading)::after{border-color:#5ea3e4}.select select[disabled]:hover,fieldset[disabled] .select select:hover{border-color:#242424}.select:not(.is-multiple):not(.is-loading):hover::after{border-color:#dbdbdb}.select.is-white:not(:hover)::after{border-color:#e6e6e6}.select.is-white select{border-color:#e6e6e6}.select.is-white select.is-hovered,.select.is-white select:hover{border-color:#d9d9d9}.select.is-white select.is-active,.select.is-white select.is-focused,.select.is-white select:active,.select.is-white select:focus{box-shadow:0 0 0 .125em rgba(230,230,230,.25)}.select.is-black:not(:hover)::after{border-color:#000}.select.is-black select{border-color:#000}.select.is-black select.is-hovered,.select.is-black select:hover{border-color:#000}.select.is-black select.is-active,.select.is-black select.is-focused,.select.is-black select:active,.select.is-black select:focus{box-shadow:0 0 0 .125em rgba(0,0,0,.25)}.select.is-light:not(:hover)::after{border-color:#dbdbdb}.select.is-light select{border-color:#dbdbdb}.select.is-light select.is-hovered,.select.is-light select:hover{border-color:#cfcfcf}.select.is-light select.is-active,.select.is-light select.is-focused,.select.is-light select:active,.select.is-light select:focus{box-shadow:0 0 0 .125em rgba(219,219,219,.25)}.select.is-dark:not(:hover)::after{border-color:#1c1c1c}.select.is-dark select{border-color:#1c1c1c}.select.is-dark select.is-hovered,.select.is-dark select:hover{border-color:#0f0f0f}.select.is-dark select.is-active,.select.is-dark select.is-focused,.select.is-dark select:active,.select.is-dark select:focus{box-shadow:0 0 0 .125em rgba(28,28,28,.25)}.select.is-primary:not(:hover)::after{border-color:#009e86}.select.is-primary select{border-color:#009e86}.select.is-primary select.is-hovered,.select.is-primary select:hover{border-color:#008571}.select.is-primary select.is-active,.select.is-primary select.is-focused,.select.is-primary select:active,.select.is-primary select:focus{box-shadow:0 0 0 .125em rgba(0,158,134,.25)}.select.is-link:not(:hover)::after{border-color:#205bbc}.select.is-link select{border-color:#205bbc}.select.is-link select.is-hovered,.select.is-link select:hover{border-color:#1c51a6}.select.is-link select.is-active,.select.is-link select.is-focused,.select.is-link select:active,.select.is-link select:focus{box-shadow:0 0 0 .125em rgba(32,91,188,.25)}.select.is-info:not(:hover)::after{border-color:#0f81cc}.select.is-info select{border-color:#0f81cc}.select.is-info select.is-hovered,.select.is-info select:hover{border-color:#0e72b4}.select.is-info select.is-active,.select.is-info 
select.is-focused,.select.is-info select:active,.select.is-info select:focus{box-shadow:0 0 0 .125em rgba(15,129,204,.25)}.select.is-success:not(:hover)::after{border-color:#1ca64c}.select.is-success select{border-color:#1ca64c}.select.is-success select.is-hovered,.select.is-success select:hover{border-color:#189042}.select.is-success select.is-active,.select.is-success select.is-focused,.select.is-success select:active,.select.is-success select:focus{box-shadow:0 0 0 .125em rgba(28,166,76,.25)}.select.is-warning:not(:hover)::after{border-color:#ffd324}.select.is-warning select{border-color:#ffd324}.select.is-warning select.is-hovered,.select.is-warning select:hover{border-color:#ffce0a}.select.is-warning select.is-active,.select.is-warning select.is-focused,.select.is-warning select:active,.select.is-warning select:focus{box-shadow:0 0 0 .125em rgba(255,211,36,.25)}.select.is-danger:not(:hover)::after{border-color:#ff0537}.select.is-danger select{border-color:#ff0537}.select.is-danger select.is-hovered,.select.is-danger select:hover{border-color:#eb002f}.select.is-danger select.is-active,.select.is-danger select.is-focused,.select.is-danger select:active,.select.is-danger select:focus{box-shadow:0 0 0 .125em rgba(255,5,55,.25)}.select.is-white-dark:not(:hover)::after{border-color:#fff}.select.is-white-dark select{border-color:#fff}.select.is-white-dark select.is-hovered,.select.is-white-dark select:hover{border-color:#f2f2f2}.select.is-white-dark select.is-active,.select.is-white-dark select.is-focused,.select.is-white-dark select:active,.select.is-white-dark select:focus{box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.select.is-black-dark:not(:hover)::after{border-color:#0a0a0a}.select.is-black-dark select{border-color:#0a0a0a}.select.is-black-dark select.is-hovered,.select.is-black-dark select:hover{border-color:#000}.select.is-black-dark select.is-active,.select.is-black-dark select.is-focused,.select.is-black-dark select:active,.select.is-black-dark select:focus{box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.select.is-light-dark:not(:hover)::after{border-color:#f5f5f5}.select.is-light-dark select{border-color:#f5f5f5}.select.is-light-dark select.is-hovered,.select.is-light-dark select:hover{border-color:#e8e8e8}.select.is-light-dark select.is-active,.select.is-light-dark select.is-focused,.select.is-light-dark select:active,.select.is-light-dark select:focus{box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.select.is-dark-dark:not(:hover)::after{border-color:#363636}.select.is-dark-dark select{border-color:#363636}.select.is-dark-dark select.is-hovered,.select.is-dark-dark select:hover{border-color:#292929}.select.is-dark-dark select.is-active,.select.is-dark-dark select.is-focused,.select.is-dark-dark select:active,.select.is-dark-dark select:focus{box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.select.is-primary-dark:not(:hover)::after{border-color:#00d1b2}.select.is-primary-dark select{border-color:#00d1b2}.select.is-primary-dark select.is-hovered,.select.is-primary-dark select:hover{border-color:#00b89c}.select.is-primary-dark select.is-active,.select.is-primary-dark select.is-focused,.select.is-primary-dark select:active,.select.is-primary-dark select:focus{box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.select.is-link-dark:not(:hover)::after{border-color:#3273dc}.select.is-link-dark select{border-color:#3273dc}.select.is-link-dark select.is-hovered,.select.is-link-dark select:hover{border-color:#2366d1}.select.is-link-dark select.is-active,.select.is-link-dark 
select.is-focused,.select.is-link-dark select:active,.select.is-link-dark select:focus{box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.select.is-info-dark:not(:hover)::after{border-color:#209cee}.select.is-info-dark select{border-color:#209cee}.select.is-info-dark select.is-hovered,.select.is-info-dark select:hover{border-color:#118fe4}.select.is-info-dark select.is-active,.select.is-info-dark select.is-focused,.select.is-info-dark select:active,.select.is-info-dark select:focus{box-shadow:0 0 0 .125em rgba(32,156,238,.25)}.select.is-success-dark:not(:hover)::after{border-color:#23d160}.select.is-success-dark select{border-color:#23d160}.select.is-success-dark select.is-hovered,.select.is-success-dark select:hover{border-color:#20bc56}.select.is-success-dark select.is-active,.select.is-success-dark select.is-focused,.select.is-success-dark select:active,.select.is-success-dark select:focus{box-shadow:0 0 0 .125em rgba(35,209,96,.25)}.select.is-warning-dark:not(:hover)::after{border-color:#ffdd57}.select.is-warning-dark select{border-color:#ffdd57}.select.is-warning-dark select.is-hovered,.select.is-warning-dark select:hover{border-color:#ffd83d}.select.is-warning-dark select.is-active,.select.is-warning-dark select.is-focused,.select.is-warning-dark select:active,.select.is-warning-dark select:focus{box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.select.is-danger-dark:not(:hover)::after{border-color:#ff3860}.select.is-danger-dark select{border-color:#ff3860}.select.is-danger-dark select.is-hovered,.select.is-danger-dark select:hover{border-color:#ff1f4b}.select.is-danger-dark select.is-active,.select.is-danger-dark select.is-focused,.select.is-danger-dark select:active,.select.is-danger-dark select:focus{box-shadow:0 0 0 .125em rgba(255,56,96,.25)}.select.is-disabled::after{border-color:#b5b5b5}.file.is-white .file-cta{background-color:#e6e6e6;color:#0a0a0a}.file.is-white.is-hovered .file-cta,.file.is-white:hover .file-cta{background-color:#dfdfdf;color:#0a0a0a}.file.is-white.is-focused .file-cta,.file.is-white:focus .file-cta{box-shadow:0 0 .5em rgba(230,230,230,.25);color:#0a0a0a}.file.is-white.is-active .file-cta,.file.is-white:active .file-cta{background-color:#d9d9d9;color:#0a0a0a}.file.is-black .file-cta{background-color:#000;color:#fff}.file.is-black.is-hovered .file-cta,.file.is-black:hover .file-cta{background-color:#000;color:#fff}.file.is-black.is-focused .file-cta,.file.is-black:focus .file-cta{box-shadow:0 0 .5em rgba(0,0,0,.25);color:#fff}.file.is-black.is-active .file-cta,.file.is-black:active .file-cta{background-color:#000;color:#fff}.file.is-light .file-cta{background-color:#dbdbdb;color:#363636}.file.is-light.is-hovered .file-cta,.file.is-light:hover .file-cta{background-color:#d5d5d5;color:#363636}.file.is-light.is-focused .file-cta,.file.is-light:focus .file-cta{box-shadow:0 0 .5em rgba(219,219,219,.25);color:#363636}.file.is-light.is-active .file-cta,.file.is-light:active .file-cta{background-color:#cfcfcf;color:#363636}.file.is-dark .file-cta{background-color:#1c1c1c;color:#f5f5f5}.file.is-dark.is-hovered .file-cta,.file.is-dark:hover .file-cta{background-color:#161616;color:#f5f5f5}.file.is-dark.is-focused .file-cta,.file.is-dark:focus .file-cta{box-shadow:0 0 .5em rgba(28,28,28,.25);color:#f5f5f5}.file.is-dark.is-active .file-cta,.file.is-dark:active .file-cta{background-color:#0f0f0f;color:#f5f5f5}.file.is-primary .file-cta{background-color:#009e86;color:#fff}.file.is-primary.is-hovered .file-cta,.file.is-primary:hover 
.file-cta{background-color:#00917c;color:#fff}.file.is-primary.is-focused .file-cta,.file.is-primary:focus .file-cta{box-shadow:0 0 .5em rgba(0,158,134,.25);color:#fff}.file.is-primary.is-active .file-cta,.file.is-primary:active .file-cta{background-color:#008571;color:#fff}.file.is-link .file-cta{background-color:#205bbc;color:#fff}.file.is-link.is-hovered .file-cta,.file.is-link:hover .file-cta{background-color:#1e56b1;color:#fff}.file.is-link.is-focused .file-cta,.file.is-link:focus .file-cta{box-shadow:0 0 .5em rgba(32,91,188,.25);color:#fff}.file.is-link.is-active .file-cta,.file.is-link:active .file-cta{background-color:#1c51a6;color:#fff}.file.is-info .file-cta{background-color:#0f81cc;color:#fff}.file.is-info.is-hovered .file-cta,.file.is-info:hover .file-cta{background-color:#0e79c0;color:#fff}.file.is-info.is-focused .file-cta,.file.is-info:focus .file-cta{box-shadow:0 0 .5em rgba(15,129,204,.25);color:#fff}.file.is-info.is-active .file-cta,.file.is-info:active .file-cta{background-color:#0e72b4;color:#fff}.file.is-success .file-cta{background-color:#1ca64c;color:#fff}.file.is-success.is-hovered .file-cta,.file.is-success:hover .file-cta{background-color:#1a9b47;color:#fff}.file.is-success.is-focused .file-cta,.file.is-success:focus .file-cta{box-shadow:0 0 .5em rgba(28,166,76,.25);color:#fff}.file.is-success.is-active .file-cta,.file.is-success:active .file-cta{background-color:#189042;color:#fff}.file.is-warning .file-cta{background-color:#ffd324;color:rgba(0,0,0,.7)}.file.is-warning.is-hovered .file-cta,.file.is-warning:hover .file-cta{background-color:#ffd117;color:rgba(0,0,0,.7)}.file.is-warning.is-focused .file-cta,.file.is-warning:focus .file-cta{box-shadow:0 0 .5em rgba(255,211,36,.25);color:rgba(0,0,0,.7)}.file.is-warning.is-active .file-cta,.file.is-warning:active .file-cta{background-color:#ffce0a;color:rgba(0,0,0,.7)}.file.is-danger .file-cta{background-color:#ff0537;color:#fff}.file.is-danger.is-hovered .file-cta,.file.is-danger:hover .file-cta{background-color:#f70031;color:#fff}.file.is-danger.is-focused .file-cta,.file.is-danger:focus .file-cta{box-shadow:0 0 .5em rgba(255,5,55,.25);color:#fff}.file.is-danger.is-active .file-cta,.file.is-danger:active .file-cta{background-color:#eb002f;color:#fff}.file.is-white-dark .file-cta{background-color:#fff;color:#0a0a0a}.file.is-white-dark.is-hovered .file-cta,.file.is-white-dark:hover .file-cta{background-color:#f9f9f9;color:#0a0a0a}.file.is-white-dark.is-focused .file-cta,.file.is-white-dark:focus .file-cta{box-shadow:0 0 .5em rgba(255,255,255,.25);color:#0a0a0a}.file.is-white-dark.is-active .file-cta,.file.is-white-dark:active .file-cta{background-color:#f2f2f2;color:#0a0a0a}.file.is-black-dark .file-cta{background-color:#0a0a0a;color:#fff}.file.is-black-dark.is-hovered .file-cta,.file.is-black-dark:hover .file-cta{background-color:#040404;color:#fff}.file.is-black-dark.is-focused .file-cta,.file.is-black-dark:focus .file-cta{box-shadow:0 0 .5em rgba(10,10,10,.25);color:#fff}.file.is-black-dark.is-active .file-cta,.file.is-black-dark:active .file-cta{background-color:#000;color:#fff}.file.is-light-dark .file-cta{background-color:#f5f5f5;color:#363636}.file.is-light-dark.is-hovered .file-cta,.file.is-light-dark:hover .file-cta{background-color:#eee;color:#363636}.file.is-light-dark.is-focused .file-cta,.file.is-light-dark:focus .file-cta{box-shadow:0 0 .5em rgba(245,245,245,.25);color:#363636}.file.is-light-dark.is-active .file-cta,.file.is-light-dark:active 
.file-cta{background-color:#e8e8e8;color:#363636}.file.is-dark-dark .file-cta{background-color:#363636;color:#f5f5f5}.file.is-dark-dark.is-hovered .file-cta,.file.is-dark-dark:hover .file-cta{background-color:#2f2f2f;color:#f5f5f5}.file.is-dark-dark.is-focused .file-cta,.file.is-dark-dark:focus .file-cta{box-shadow:0 0 .5em rgba(54,54,54,.25);color:#f5f5f5}.file.is-dark-dark.is-active .file-cta,.file.is-dark-dark:active .file-cta{background-color:#292929;color:#f5f5f5}.file.is-primary-dark .file-cta{background-color:#00d1b2;color:#fff}.file.is-primary-dark.is-hovered .file-cta,.file.is-primary-dark:hover .file-cta{background-color:#00c4a7;color:#fff}.file.is-primary-dark.is-focused .file-cta,.file.is-primary-dark:focus .file-cta{box-shadow:0 0 .5em rgba(0,209,178,.25);color:#fff}.file.is-primary-dark.is-active .file-cta,.file.is-primary-dark:active .file-cta{background-color:#00b89c;color:#fff}.file.is-link-dark .file-cta{background-color:#3273dc;color:#fff}.file.is-link-dark.is-hovered .file-cta,.file.is-link-dark:hover .file-cta{background-color:#276cda;color:#fff}.file.is-link-dark.is-focused .file-cta,.file.is-link-dark:focus .file-cta{box-shadow:0 0 .5em rgba(50,115,220,.25);color:#fff}.file.is-link-dark.is-active .file-cta,.file.is-link-dark:active .file-cta{background-color:#2366d1;color:#fff}.file.is-info-dark .file-cta{background-color:#209cee;color:#fff}.file.is-info-dark.is-hovered .file-cta,.file.is-info-dark:hover .file-cta{background-color:#1496ed;color:#fff}.file.is-info-dark.is-focused .file-cta,.file.is-info-dark:focus .file-cta{box-shadow:0 0 .5em rgba(32,156,238,.25);color:#fff}.file.is-info-dark.is-active .file-cta,.file.is-info-dark:active .file-cta{background-color:#118fe4;color:#fff}.file.is-success-dark .file-cta{background-color:#23d160;color:#fff}.file.is-success-dark.is-hovered .file-cta,.file.is-success-dark:hover .file-cta{background-color:#22c65b;color:#fff}.file.is-success-dark.is-focused .file-cta,.file.is-success-dark:focus .file-cta{box-shadow:0 0 .5em rgba(35,209,96,.25);color:#fff}.file.is-success-dark.is-active .file-cta,.file.is-success-dark:active .file-cta{background-color:#20bc56;color:#fff}.file.is-warning-dark .file-cta{background-color:#ffdd57;color:rgba(0,0,0,.7)}.file.is-warning-dark.is-hovered .file-cta,.file.is-warning-dark:hover .file-cta{background-color:#ffdb4a;color:rgba(0,0,0,.7)}.file.is-warning-dark.is-focused .file-cta,.file.is-warning-dark:focus .file-cta{box-shadow:0 0 .5em rgba(255,221,87,.25);color:rgba(0,0,0,.7)}.file.is-warning-dark.is-active .file-cta,.file.is-warning-dark:active .file-cta{background-color:#ffd83d;color:rgba(0,0,0,.7)}.file.is-danger-dark .file-cta{background-color:#ff3860;color:#fff}.file.is-danger-dark.is-hovered .file-cta,.file.is-danger-dark:hover .file-cta{background-color:#ff2b56;color:#fff}.file.is-danger-dark.is-focused .file-cta,.file.is-danger-dark:focus .file-cta{box-shadow:0 0 .5em rgba(255,56,96,.25);color:#fff}.file.is-danger-dark.is-active .file-cta,.file.is-danger-dark:active .file-cta{background-color:#ff1f4b;color:#fff}.file-label:hover .file-cta{background-color:#1d1d1d;color:#dbdbdb}.file-label:hover .file-name{border-color:#2f2f2f}.file-label:active .file-cta{background-color:#171717;color:#dbdbdb}.file-label:active 
.file-name{border-color:#292929}.file-cta,.file-name{border-color:#363636}.file-cta{background-color:#242424;color:#b5b5b5}.file-name{border-color:#363636}.label{color:#dbdbdb}.help.is-white{color:#e6e6e6}.help.is-black{color:#000}.help.is-light{color:#dbdbdb}.help.is-dark{color:#1c1c1c}.help.is-primary{color:#009e86}.help.is-link{color:#205bbc}.help.is-info{color:#0f81cc}.help.is-success{color:#1ca64c}.help.is-warning{color:#ffd324}.help.is-danger{color:#ff0537}.help.is-white-dark{color:#fff}.help.is-black-dark{color:#0a0a0a}.help.is-light-dark{color:#f5f5f5}.help.is-dark-dark{color:#363636}.help.is-primary-dark{color:#00d1b2}.help.is-link-dark{color:#3273dc}.help.is-info-dark{color:#209cee}.help.is-success-dark{color:#23d160}.help.is-warning-dark{color:#ffdd57}.help.is-danger-dark{color:#ff3860}.control.has-icons-left .icon,.control.has-icons-right .icon{color:#363636}.notification{background-color:#242424}.notification code,.notification pre{background:#0a0a0a}.notification.is-white{background-color:#e6e6e6;color:#0a0a0a}.notification.is-black{background-color:#000;color:#fff}.notification.is-light{background-color:#dbdbdb;color:#363636}.notification.is-dark{background-color:#1c1c1c;color:#f5f5f5}.notification.is-primary{background-color:#009e86;color:#fff}.notification.is-link{background-color:#205bbc;color:#fff}.notification.is-info{background-color:#0f81cc;color:#fff}.notification.is-success{background-color:#1ca64c;color:#fff}.notification.is-warning{background-color:#ffd324;color:rgba(0,0,0,.7)}.notification.is-danger{background-color:#ff0537;color:#fff}.notification.is-white-dark{background-color:#fff;color:#0a0a0a}.notification.is-black-dark{background-color:#0a0a0a;color:#fff}.notification.is-light-dark{background-color:#f5f5f5;color:#363636}.notification.is-dark-dark{background-color:#363636;color:#f5f5f5}.notification.is-primary-dark{background-color:#00d1b2;color:#fff}.notification.is-link-dark{background-color:#3273dc;color:#fff}.notification.is-info-dark{background-color:#209cee;color:#fff}.notification.is-success-dark{background-color:#23d160;color:#fff}.notification.is-warning-dark{background-color:#ffdd57;color:rgba(0,0,0,.7)}.notification.is-danger-dark{background-color:#ff3860;color:#fff}.progress::-webkit-progress-bar{background-color:#363636}.progress::-webkit-progress-value{background-color:#b5b5b5}.progress::-moz-progress-bar{background-color:#b5b5b5}.progress::-ms-fill{background-color:#b5b5b5}.progress:indeterminate{background-color:#363636;background-image:linear-gradient(to right,#4a4a4a 30%,#363636 30%)}.progress.is-white::-webkit-progress-value{background-color:#e6e6e6}.progress.is-white::-moz-progress-bar{background-color:#e6e6e6}.progress.is-white::-ms-fill{background-color:#e6e6e6}.progress.is-white:indeterminate{background-image:linear-gradient(to right,#e6e6e6 30%,#363636 30%)}.progress.is-black::-webkit-progress-value{background-color:#000}.progress.is-black::-moz-progress-bar{background-color:#000}.progress.is-black::-ms-fill{background-color:#000}.progress.is-black:indeterminate{background-image:linear-gradient(to right,#000 30%,#363636 30%)}.progress.is-light::-webkit-progress-value{background-color:#dbdbdb}.progress.is-light::-moz-progress-bar{background-color:#dbdbdb}.progress.is-light::-ms-fill{background-color:#dbdbdb}.progress.is-light:indeterminate{background-image:linear-gradient(to right,#dbdbdb 30%,#363636 
30%)}.progress.is-dark::-webkit-progress-value{background-color:#1c1c1c}.progress.is-dark::-moz-progress-bar{background-color:#1c1c1c}.progress.is-dark::-ms-fill{background-color:#1c1c1c}.progress.is-dark:indeterminate{background-image:linear-gradient(to right,#1c1c1c 30%,#363636 30%)}.progress.is-primary::-webkit-progress-value{background-color:#009e86}.progress.is-primary::-moz-progress-bar{background-color:#009e86}.progress.is-primary::-ms-fill{background-color:#009e86}.progress.is-primary:indeterminate{background-image:linear-gradient(to right,#009e86 30%,#363636 30%)}.progress.is-link::-webkit-progress-value{background-color:#205bbc}.progress.is-link::-moz-progress-bar{background-color:#205bbc}.progress.is-link::-ms-fill{background-color:#205bbc}.progress.is-link:indeterminate{background-image:linear-gradient(to right,#205bbc 30%,#363636 30%)}.progress.is-info::-webkit-progress-value{background-color:#0f81cc}.progress.is-info::-moz-progress-bar{background-color:#0f81cc}.progress.is-info::-ms-fill{background-color:#0f81cc}.progress.is-info:indeterminate{background-image:linear-gradient(to right,#0f81cc 30%,#363636 30%)}.progress.is-success::-webkit-progress-value{background-color:#1ca64c}.progress.is-success::-moz-progress-bar{background-color:#1ca64c}.progress.is-success::-ms-fill{background-color:#1ca64c}.progress.is-success:indeterminate{background-image:linear-gradient(to right,#1ca64c 30%,#363636 30%)}.progress.is-warning::-webkit-progress-value{background-color:#ffd324}.progress.is-warning::-moz-progress-bar{background-color:#ffd324}.progress.is-warning::-ms-fill{background-color:#ffd324}.progress.is-warning:indeterminate{background-image:linear-gradient(to right,#ffd324 30%,#363636 30%)}.progress.is-danger::-webkit-progress-value{background-color:#ff0537}.progress.is-danger::-moz-progress-bar{background-color:#ff0537}.progress.is-danger::-ms-fill{background-color:#ff0537}.progress.is-danger:indeterminate{background-image:linear-gradient(to right,#ff0537 30%,#363636 30%)}.progress.is-white-dark::-webkit-progress-value{background-color:#fff}.progress.is-white-dark::-moz-progress-bar{background-color:#fff}.progress.is-white-dark::-ms-fill{background-color:#fff}.progress.is-white-dark:indeterminate{background-image:linear-gradient(to right,#fff 30%,#363636 30%)}.progress.is-black-dark::-webkit-progress-value{background-color:#0a0a0a}.progress.is-black-dark::-moz-progress-bar{background-color:#0a0a0a}.progress.is-black-dark::-ms-fill{background-color:#0a0a0a}.progress.is-black-dark:indeterminate{background-image:linear-gradient(to right,#0a0a0a 30%,#363636 30%)}.progress.is-light-dark::-webkit-progress-value{background-color:#f5f5f5}.progress.is-light-dark::-moz-progress-bar{background-color:#f5f5f5}.progress.is-light-dark::-ms-fill{background-color:#f5f5f5}.progress.is-light-dark:indeterminate{background-image:linear-gradient(to right,#f5f5f5 30%,#363636 30%)}.progress.is-dark-dark::-webkit-progress-value{background-color:#363636}.progress.is-dark-dark::-moz-progress-bar{background-color:#363636}.progress.is-dark-dark::-ms-fill{background-color:#363636}.progress.is-dark-dark:indeterminate{background-image:linear-gradient(to right,#363636 30%,#363636 30%)}.progress.is-primary-dark::-webkit-progress-value{background-color:#00d1b2}.progress.is-primary-dark::-moz-progress-bar{background-color:#00d1b2}.progress.is-primary-dark::-ms-fill{background-color:#00d1b2}.progress.is-primary-dark:indeterminate{background-image:linear-gradient(to right,#00d1b2 30%,#363636 
30%)}.progress.is-link-dark::-webkit-progress-value{background-color:#3273dc}.progress.is-link-dark::-moz-progress-bar{background-color:#3273dc}.progress.is-link-dark::-ms-fill{background-color:#3273dc}.progress.is-link-dark:indeterminate{background-image:linear-gradient(to right,#3273dc 30%,#363636 30%)}.progress.is-info-dark::-webkit-progress-value{background-color:#209cee}.progress.is-info-dark::-moz-progress-bar{background-color:#209cee}.progress.is-info-dark::-ms-fill{background-color:#209cee}.progress.is-info-dark:indeterminate{background-image:linear-gradient(to right,#209cee 30%,#363636 30%)}.progress.is-success-dark::-webkit-progress-value{background-color:#23d160}.progress.is-success-dark::-moz-progress-bar{background-color:#23d160}.progress.is-success-dark::-ms-fill{background-color:#23d160}.progress.is-success-dark:indeterminate{background-image:linear-gradient(to right,#23d160 30%,#363636 30%)}.progress.is-warning-dark::-webkit-progress-value{background-color:#ffdd57}.progress.is-warning-dark::-moz-progress-bar{background-color:#ffdd57}.progress.is-warning-dark::-ms-fill{background-color:#ffdd57}.progress.is-warning-dark:indeterminate{background-image:linear-gradient(to right,#ffdd57 30%,#363636 30%)}.progress.is-danger-dark::-webkit-progress-value{background-color:#ff3860}.progress.is-danger-dark::-moz-progress-bar{background-color:#ff3860}.progress.is-danger-dark::-ms-fill{background-color:#ff3860}.progress.is-danger-dark:indeterminate{background-image:linear-gradient(to right,#ff3860 30%,#363636 30%)}.table{background-color:#0a0a0a;color:#dbdbdb}.table td,.table th{border:1px solid #363636}.table td.is-white,.table th.is-white{background-color:#e6e6e6;border-color:#e6e6e6;color:#0a0a0a}.table td.is-black,.table th.is-black{background-color:#000;border-color:#000;color:#fff}.table td.is-light,.table th.is-light{background-color:#dbdbdb;border-color:#dbdbdb;color:#363636}.table td.is-dark,.table th.is-dark{background-color:#1c1c1c;border-color:#1c1c1c;color:#f5f5f5}.table td.is-primary,.table th.is-primary{background-color:#009e86;border-color:#009e86;color:#fff}.table td.is-link,.table th.is-link{background-color:#205bbc;border-color:#205bbc;color:#fff}.table td.is-info,.table th.is-info{background-color:#0f81cc;border-color:#0f81cc;color:#fff}.table td.is-success,.table th.is-success{background-color:#1ca64c;border-color:#1ca64c;color:#fff}.table td.is-warning,.table th.is-warning{background-color:#ffd324;border-color:#ffd324;color:rgba(0,0,0,.7)}.table td.is-danger,.table th.is-danger{background-color:#ff0537;border-color:#ff0537;color:#fff}.table td.is-white-dark,.table th.is-white-dark{background-color:#fff;border-color:#fff;color:#0a0a0a}.table td.is-black-dark,.table th.is-black-dark{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.table td.is-light-dark,.table th.is-light-dark{background-color:#f5f5f5;border-color:#f5f5f5;color:#363636}.table td.is-dark-dark,.table th.is-dark-dark{background-color:#363636;border-color:#363636;color:#f5f5f5}.table td.is-primary-dark,.table th.is-primary-dark{background-color:#00d1b2;border-color:#00d1b2;color:#fff}.table td.is-link-dark,.table th.is-link-dark{background-color:#3273dc;border-color:#3273dc;color:#fff}.table td.is-info-dark,.table th.is-info-dark{background-color:#209cee;border-color:#209cee;color:#fff}.table td.is-success-dark,.table th.is-success-dark{background-color:#23d160;border-color:#23d160;color:#fff}.table td.is-warning-dark,.table 
th.is-warning-dark{background-color:#ffdd57;border-color:#ffdd57;color:rgba(0,0,0,.7)}.table td.is-danger-dark,.table th.is-danger-dark{background-color:#ff3860;border-color:#ff3860;color:#fff}.table td.is-selected,.table th.is-selected{background-color:#009e86;color:#e6e6e6}.table th{color:#dbdbdb}.table tr.is-selected{background-color:#009e86;color:#e6e6e6}.table tr.is-selected td,.table tr.is-selected th{border-color:#e6e6e6}.table thead td,.table thead th{color:#dbdbdb}.table tfoot td,.table tfoot th{color:#dbdbdb}.table.is-hoverable tbody tr:not(.is-selected):hover{background-color:#121212}.table.is-hoverable.is-striped tbody tr:not(.is-selected):hover{background-color:#121212}.table.is-hoverable.is-striped tbody tr:not(.is-selected):hover:nth-child(even){background-color:#242424}.table.is-striped tbody tr:not(.is-selected):nth-child(even){background-color:#121212}.tag:not(body){background-color:#242424;color:#b5b5b5}.tag:not(body).is-white{background-color:#e6e6e6;color:#0a0a0a}.tag:not(body).is-black{background-color:#000;color:#fff}.tag:not(body).is-light{background-color:#dbdbdb;color:#363636}.tag:not(body).is-dark{background-color:#1c1c1c;color:#f5f5f5}.tag:not(body).is-primary{background-color:#009e86;color:#fff}.tag:not(body).is-link{background-color:#205bbc;color:#fff}.tag:not(body).is-info{background-color:#0f81cc;color:#fff}.tag:not(body).is-success{background-color:#1ca64c;color:#fff}.tag:not(body).is-warning{background-color:#ffd324;color:rgba(0,0,0,.7)}.tag:not(body).is-danger{background-color:#ff0537;color:#fff}.tag:not(body).is-white-dark{background-color:#fff;color:#0a0a0a}.tag:not(body).is-black-dark{background-color:#0a0a0a;color:#fff}.tag:not(body).is-light-dark{background-color:#f5f5f5;color:#363636}.tag:not(body).is-dark-dark{background-color:#363636;color:#f5f5f5}.tag:not(body).is-primary-dark{background-color:#00d1b2;color:#fff}.tag:not(body).is-link-dark{background-color:#3273dc;color:#fff}.tag:not(body).is-info-dark{background-color:#209cee;color:#fff}.tag:not(body).is-success-dark{background-color:#23d160;color:#fff}.tag:not(body).is-warning-dark{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tag:not(body).is-danger-dark{background-color:#ff3860;color:#fff}.tag:not(body).is-delete:focus,.tag:not(body).is-delete:hover{background-color:#171717}.tag:not(body).is-delete:active{background-color:#0a0a0a}.title{color:#dbdbdb}.subtitle{color:#b5b5b5}.subtitle strong{color:#dbdbdb}.number{background-color:#242424}.breadcrumb a{color:#5ea3e4}.breadcrumb a:hover{color:#dbdbdb}.breadcrumb li.is-active a{color:#dbdbdb}.breadcrumb li+li::before{color:#4a4a4a}.card{background-color:#0a0a0a;box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1);color:#b5b5b5}.card-header{box-shadow:0 1px 2px rgba(255,255,255,.1)}.card-header-title{color:#dbdbdb}.card-footer{border-top:1px solid #363636}.card-footer-item:not(:last-child){border-right:1px solid #363636}.dropdown-content{background-color:#0a0a0a;box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1)}.dropdown-item{color:#b5b5b5}a.dropdown-item:hover,button.dropdown-item:hover{background-color:#242424;color:#fff}a.dropdown-item.is-active,button.dropdown-item.is-active{background-color:#5ea3e4;color:#fff}.dropdown-divider{background-color:#363636}.list{background-color:#0a0a0a;box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1)}.list-item:not(a){color:#b5b5b5}.list-item:not(:last-child){border-bottom:1px solid 
#363636}.list-item.is-active{background-color:#5ea3e4;color:#fff}a.list-item{background-color:#242424}.media .media{border-top:1px solid rgba(54,54,54,.5)}.media+.media{border-top:1px solid rgba(54,54,54,.5)}.menu-list a{color:#b5b5b5}.menu-list a:hover{background-color:#242424;color:#dbdbdb}.menu-list a.is-active{background-color:#5ea3e4;color:#fff}.menu-list li ul{border-left:1px solid #363636}.message{background-color:#242424}.message.is-white{background-color:#242424}.message.is-white .message-header{background-color:#fff;color:#0a0a0a}.message.is-white .message-body{border-color:#fff;color:#b5b5b5}.message.is-black{background-color:#242424}.message.is-black .message-header{background-color:#0a0a0a;color:#fff}.message.is-black .message-body{border-color:#0a0a0a;color:#b5b5b5}.message.is-light{background-color:#242424}.message.is-light .message-header{background-color:#f5f5f5;color:#363636}.message.is-light .message-body{border-color:#f5f5f5;color:#b5b5b5}.message.is-dark{background-color:#242424}.message.is-dark .message-header{background-color:#363636;color:#f5f5f5}.message.is-dark .message-body{border-color:#363636;color:#b5b5b5}.message.is-primary{background-color:#242424}.message.is-primary .message-header{background-color:#00d1b2;color:#fff}.message.is-primary .message-body{border-color:#00d1b2;color:#b5b5b5}.message.is-link{background-color:#242424}.message.is-link .message-header{background-color:#3273dc;color:#fff}.message.is-link .message-body{border-color:#3273dc;color:#b5b5b5}.message.is-info{background-color:#242424}.message.is-info .message-header{background-color:#209cee;color:#fff}.message.is-info .message-body{border-color:#209cee;color:#b5b5b5}.message.is-success{background-color:#242424}.message.is-success .message-header{background-color:#23d160;color:#fff}.message.is-success .message-body{border-color:#23d160;color:#b5b5b5}.message.is-warning{background-color:#242424}.message.is-warning .message-header{background-color:#ffdd57;color:rgba(0,0,0,.7)}.message.is-warning .message-body{border-color:#ffdd57;color:#b5b5b5}.message.is-danger{background-color:#242424}.message.is-danger .message-header{background-color:#ff3860;color:#fff}.message.is-danger .message-body{border-color:#ff3860;color:#b5b5b5}.message.is-white-dark{background-color:#242424}.message.is-white-dark .message-header{background-color:#fff;color:#0a0a0a}.message.is-white-dark .message-body{border-color:#fff;color:#b5b5b5}.message.is-black-dark{background-color:#242424}.message.is-black-dark .message-header{background-color:#0a0a0a;color:#fff}.message.is-black-dark .message-body{border-color:#0a0a0a;color:#b5b5b5}.message.is-light-dark{background-color:#242424}.message.is-light-dark .message-header{background-color:#f5f5f5;color:#363636}.message.is-light-dark .message-body{border-color:#f5f5f5;color:#b5b5b5}.message.is-dark-dark{background-color:#242424}.message.is-dark-dark .message-header{background-color:#363636;color:#f5f5f5}.message.is-dark-dark .message-body{border-color:#363636;color:#b5b5b5}.message.is-primary-dark{background-color:#242424}.message.is-primary-dark .message-header{background-color:#00d1b2;color:#fff}.message.is-primary-dark .message-body{border-color:#00d1b2;color:#b5b5b5}.message.is-link-dark{background-color:#242424}.message.is-link-dark .message-header{background-color:#3273dc;color:#fff}.message.is-link-dark .message-body{border-color:#3273dc;color:#b5b5b5}.message.is-info-dark{background-color:#242424}.message.is-info-dark 
.message-header{background-color:#209cee;color:#fff}.message.is-info-dark .message-body{border-color:#209cee;color:#b5b5b5}.message.is-success-dark{background-color:#242424}.message.is-success-dark .message-header{background-color:#23d160;color:#fff}.message.is-success-dark .message-body{border-color:#23d160;color:#b5b5b5}.message.is-warning-dark{background-color:#242424}.message.is-warning-dark .message-header{background-color:#ffdd57;color:rgba(0,0,0,.7)}.message.is-warning-dark .message-body{border-color:#ffdd57;color:#b5b5b5}.message.is-danger-dark{background-color:#242424}.message.is-danger-dark .message-header{background-color:#ff3860;color:#fff}.message.is-danger-dark .message-body{border-color:#ff3860;color:#b5b5b5}.message-header{background-color:#b5b5b5;color:#fff}.message-body{border-color:#363636;color:#b5b5b5}.message-body code,.message-body pre{background-color:#0a0a0a}.modal-background{background-color:rgba(255,255,255,.86)}.modal-card-foot,.modal-card-head{background-color:#242424}.modal-card-head{border-bottom:1px solid #363636}.modal-card-title{color:#dbdbdb}.modal-card-foot{border-top:1px solid #363636}.modal-card-body{-webkit-overflow-scrolling:touch;background-color:#fff}.navbar{background-color:#17181c}.navbar.is-white{background-color:#e6e6e6;color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link,.navbar.is-white .navbar-brand>.navbar-item{color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link.is-active,.navbar.is-white .navbar-brand .navbar-link:hover,.navbar.is-white .navbar-brand>a.navbar-item.is-active,.navbar.is-white .navbar-brand>a.navbar-item:hover{background-color:#d9d9d9;color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link::after{border-color:#0a0a0a}.navbar.is-white .navbar-burger{color:#0a0a0a}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-white .navbar-end .navbar-link,.navbar.is-white .navbar-end>.navbar-item,.navbar.is-white .navbar-start .navbar-link,.navbar.is-white .navbar-start>.navbar-item{color:#0a0a0a}.navbar.is-white .navbar-end .navbar-link.is-active,.navbar.is-white .navbar-end .navbar-link:hover,.navbar.is-white .navbar-end>a.navbar-item.is-active,.navbar.is-white .navbar-end>a.navbar-item:hover,.navbar.is-white .navbar-start .navbar-link.is-active,.navbar.is-white .navbar-start .navbar-link:hover,.navbar.is-white .navbar-start>a.navbar-item.is-active,.navbar.is-white .navbar-start>a.navbar-item:hover{background-color:#d9d9d9;color:#0a0a0a}.navbar.is-white .navbar-end .navbar-link::after,.navbar.is-white .navbar-start .navbar-link::after{border-color:#0a0a0a}.navbar.is-white .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-white .navbar-item.has-dropdown:hover .navbar-link{background-color:#d9d9d9;color:#0a0a0a}.navbar.is-white .navbar-dropdown a.navbar-item.is-active{background-color:#e6e6e6;color:#0a0a0a}}@media (prefers-color-scheme:dark){.navbar.is-black{background-color:#000;color:#fff}.navbar.is-black .navbar-brand .navbar-link,.navbar.is-black .navbar-brand>.navbar-item{color:#fff}.navbar.is-black .navbar-brand .navbar-link.is-active,.navbar.is-black .navbar-brand .navbar-link:hover,.navbar.is-black .navbar-brand>a.navbar-item.is-active,.navbar.is-black .navbar-brand>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-black .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-black .navbar-end .navbar-link,.navbar.is-black 
.navbar-end>.navbar-item,.navbar.is-black .navbar-start .navbar-link,.navbar.is-black .navbar-start>.navbar-item{color:#fff}.navbar.is-black .navbar-end .navbar-link.is-active,.navbar.is-black .navbar-end .navbar-link:hover,.navbar.is-black .navbar-end>a.navbar-item.is-active,.navbar.is-black .navbar-end>a.navbar-item:hover,.navbar.is-black .navbar-start .navbar-link.is-active,.navbar.is-black .navbar-start .navbar-link:hover,.navbar.is-black .navbar-start>a.navbar-item.is-active,.navbar.is-black .navbar-start>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black .navbar-end .navbar-link::after,.navbar.is-black .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-black .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-black .navbar-item.has-dropdown:hover .navbar-link{background-color:#000;color:#fff}.navbar.is-black .navbar-dropdown a.navbar-item.is-active{background-color:#000;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-light{background-color:#dbdbdb;color:#363636}.navbar.is-light .navbar-brand .navbar-link,.navbar.is-light .navbar-brand>.navbar-item{color:#363636}.navbar.is-light .navbar-brand .navbar-link.is-active,.navbar.is-light .navbar-brand .navbar-link:hover,.navbar.is-light .navbar-brand>a.navbar-item.is-active,.navbar.is-light .navbar-brand>a.navbar-item:hover{background-color:#cfcfcf;color:#363636}.navbar.is-light .navbar-brand .navbar-link::after{border-color:#363636}.navbar.is-light .navbar-burger{color:#363636}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-light .navbar-end .navbar-link,.navbar.is-light .navbar-end>.navbar-item,.navbar.is-light .navbar-start .navbar-link,.navbar.is-light .navbar-start>.navbar-item{color:#363636}.navbar.is-light .navbar-end .navbar-link.is-active,.navbar.is-light .navbar-end .navbar-link:hover,.navbar.is-light .navbar-end>a.navbar-item.is-active,.navbar.is-light .navbar-end>a.navbar-item:hover,.navbar.is-light .navbar-start .navbar-link.is-active,.navbar.is-light .navbar-start .navbar-link:hover,.navbar.is-light .navbar-start>a.navbar-item.is-active,.navbar.is-light .navbar-start>a.navbar-item:hover{background-color:#cfcfcf;color:#363636}.navbar.is-light .navbar-end .navbar-link::after,.navbar.is-light .navbar-start .navbar-link::after{border-color:#363636}.navbar.is-light .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-light .navbar-item.has-dropdown:hover .navbar-link{background-color:#cfcfcf;color:#363636}.navbar.is-light .navbar-dropdown a.navbar-item.is-active{background-color:#dbdbdb;color:#363636}}@media (prefers-color-scheme:dark){.navbar.is-dark{background-color:#1c1c1c;color:#f5f5f5}.navbar.is-dark .navbar-brand .navbar-link,.navbar.is-dark .navbar-brand>.navbar-item{color:#f5f5f5}.navbar.is-dark .navbar-brand .navbar-link.is-active,.navbar.is-dark .navbar-brand .navbar-link:hover,.navbar.is-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-dark .navbar-brand>a.navbar-item:hover{background-color:#0f0f0f;color:#f5f5f5}.navbar.is-dark .navbar-brand .navbar-link::after{border-color:#f5f5f5}.navbar.is-dark .navbar-burger{color:#f5f5f5}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-dark .navbar-end .navbar-link,.navbar.is-dark .navbar-end>.navbar-item,.navbar.is-dark .navbar-start .navbar-link,.navbar.is-dark .navbar-start>.navbar-item{color:#f5f5f5}.navbar.is-dark .navbar-end .navbar-link.is-active,.navbar.is-dark .navbar-end .navbar-link:hover,.navbar.is-dark 
.navbar-end>a.navbar-item.is-active,.navbar.is-dark .navbar-end>a.navbar-item:hover,.navbar.is-dark .navbar-start .navbar-link.is-active,.navbar.is-dark .navbar-start .navbar-link:hover,.navbar.is-dark .navbar-start>a.navbar-item.is-active,.navbar.is-dark .navbar-start>a.navbar-item:hover{background-color:#0f0f0f;color:#f5f5f5}.navbar.is-dark .navbar-end .navbar-link::after,.navbar.is-dark .navbar-start .navbar-link::after{border-color:#f5f5f5}.navbar.is-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#0f0f0f;color:#f5f5f5}.navbar.is-dark .navbar-dropdown a.navbar-item.is-active{background-color:#1c1c1c;color:#f5f5f5}}@media (prefers-color-scheme:dark){.navbar.is-primary{background-color:#009e86;color:#fff}.navbar.is-primary .navbar-brand .navbar-link,.navbar.is-primary .navbar-brand>.navbar-item{color:#fff}.navbar.is-primary .navbar-brand .navbar-link.is-active,.navbar.is-primary .navbar-brand .navbar-link:hover,.navbar.is-primary .navbar-brand>a.navbar-item.is-active,.navbar.is-primary .navbar-brand>a.navbar-item:hover{background-color:#008571;color:#fff}.navbar.is-primary .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-primary .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-primary .navbar-end .navbar-link,.navbar.is-primary .navbar-end>.navbar-item,.navbar.is-primary .navbar-start .navbar-link,.navbar.is-primary .navbar-start>.navbar-item{color:#fff}.navbar.is-primary .navbar-end .navbar-link.is-active,.navbar.is-primary .navbar-end .navbar-link:hover,.navbar.is-primary .navbar-end>a.navbar-item.is-active,.navbar.is-primary .navbar-end>a.navbar-item:hover,.navbar.is-primary .navbar-start .navbar-link.is-active,.navbar.is-primary .navbar-start .navbar-link:hover,.navbar.is-primary .navbar-start>a.navbar-item.is-active,.navbar.is-primary .navbar-start>a.navbar-item:hover{background-color:#008571;color:#fff}.navbar.is-primary .navbar-end .navbar-link::after,.navbar.is-primary .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-primary .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-primary .navbar-item.has-dropdown:hover .navbar-link{background-color:#008571;color:#fff}.navbar.is-primary .navbar-dropdown a.navbar-item.is-active{background-color:#009e86;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-link{background-color:#205bbc;color:#fff}.navbar.is-link .navbar-brand .navbar-link,.navbar.is-link .navbar-brand>.navbar-item{color:#fff}.navbar.is-link .navbar-brand .navbar-link.is-active,.navbar.is-link .navbar-brand .navbar-link:hover,.navbar.is-link .navbar-brand>a.navbar-item.is-active,.navbar.is-link .navbar-brand>a.navbar-item:hover{background-color:#1c51a6;color:#fff}.navbar.is-link .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-link .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-link .navbar-end .navbar-link,.navbar.is-link .navbar-end>.navbar-item,.navbar.is-link .navbar-start .navbar-link,.navbar.is-link .navbar-start>.navbar-item{color:#fff}.navbar.is-link .navbar-end .navbar-link.is-active,.navbar.is-link .navbar-end .navbar-link:hover,.navbar.is-link .navbar-end>a.navbar-item.is-active,.navbar.is-link .navbar-end>a.navbar-item:hover,.navbar.is-link .navbar-start .navbar-link.is-active,.navbar.is-link .navbar-start .navbar-link:hover,.navbar.is-link .navbar-start>a.navbar-item.is-active,.navbar.is-link 
.navbar-start>a.navbar-item:hover{background-color:#1c51a6;color:#fff}.navbar.is-link .navbar-end .navbar-link::after,.navbar.is-link .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-link .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-link .navbar-item.has-dropdown:hover .navbar-link{background-color:#1c51a6;color:#fff}.navbar.is-link .navbar-dropdown a.navbar-item.is-active{background-color:#205bbc;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-info{background-color:#0f81cc;color:#fff}.navbar.is-info .navbar-brand .navbar-link,.navbar.is-info .navbar-brand>.navbar-item{color:#fff}.navbar.is-info .navbar-brand .navbar-link.is-active,.navbar.is-info .navbar-brand .navbar-link:hover,.navbar.is-info .navbar-brand>a.navbar-item.is-active,.navbar.is-info .navbar-brand>a.navbar-item:hover{background-color:#0e72b4;color:#fff}.navbar.is-info .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-info .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-info .navbar-end .navbar-link,.navbar.is-info .navbar-end>.navbar-item,.navbar.is-info .navbar-start .navbar-link,.navbar.is-info .navbar-start>.navbar-item{color:#fff}.navbar.is-info .navbar-end .navbar-link.is-active,.navbar.is-info .navbar-end .navbar-link:hover,.navbar.is-info .navbar-end>a.navbar-item.is-active,.navbar.is-info .navbar-end>a.navbar-item:hover,.navbar.is-info .navbar-start .navbar-link.is-active,.navbar.is-info .navbar-start .navbar-link:hover,.navbar.is-info .navbar-start>a.navbar-item.is-active,.navbar.is-info .navbar-start>a.navbar-item:hover{background-color:#0e72b4;color:#fff}.navbar.is-info .navbar-end .navbar-link::after,.navbar.is-info .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-info .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-info .navbar-item.has-dropdown:hover .navbar-link{background-color:#0e72b4;color:#fff}.navbar.is-info .navbar-dropdown a.navbar-item.is-active{background-color:#0f81cc;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-success{background-color:#1ca64c;color:#fff}.navbar.is-success .navbar-brand .navbar-link,.navbar.is-success .navbar-brand>.navbar-item{color:#fff}.navbar.is-success .navbar-brand .navbar-link.is-active,.navbar.is-success .navbar-brand .navbar-link:hover,.navbar.is-success .navbar-brand>a.navbar-item.is-active,.navbar.is-success .navbar-brand>a.navbar-item:hover{background-color:#189042;color:#fff}.navbar.is-success .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-success .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-success .navbar-end .navbar-link,.navbar.is-success .navbar-end>.navbar-item,.navbar.is-success .navbar-start .navbar-link,.navbar.is-success .navbar-start>.navbar-item{color:#fff}.navbar.is-success .navbar-end .navbar-link.is-active,.navbar.is-success .navbar-end .navbar-link:hover,.navbar.is-success .navbar-end>a.navbar-item.is-active,.navbar.is-success .navbar-end>a.navbar-item:hover,.navbar.is-success .navbar-start .navbar-link.is-active,.navbar.is-success .navbar-start .navbar-link:hover,.navbar.is-success .navbar-start>a.navbar-item.is-active,.navbar.is-success .navbar-start>a.navbar-item:hover{background-color:#189042;color:#fff}.navbar.is-success .navbar-end .navbar-link::after,.navbar.is-success .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-success .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-success 
.navbar-item.has-dropdown:hover .navbar-link{background-color:#189042;color:#fff}.navbar.is-success .navbar-dropdown a.navbar-item.is-active{background-color:#1ca64c;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-warning{background-color:#ffd324;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link,.navbar.is-warning .navbar-brand>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link.is-active,.navbar.is-warning .navbar-brand .navbar-link:hover,.navbar.is-warning .navbar-brand>a.navbar-item.is-active,.navbar.is-warning .navbar-brand>a.navbar-item:hover{background-color:#ffce0a;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-burger{color:rgba(0,0,0,.7)}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-warning .navbar-end .navbar-link,.navbar.is-warning .navbar-end>.navbar-item,.navbar.is-warning .navbar-start .navbar-link,.navbar.is-warning .navbar-start>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-end .navbar-link.is-active,.navbar.is-warning .navbar-end .navbar-link:hover,.navbar.is-warning .navbar-end>a.navbar-item.is-active,.navbar.is-warning .navbar-end>a.navbar-item:hover,.navbar.is-warning .navbar-start .navbar-link.is-active,.navbar.is-warning .navbar-start .navbar-link:hover,.navbar.is-warning .navbar-start>a.navbar-item.is-active,.navbar.is-warning .navbar-start>a.navbar-item:hover{background-color:#ffce0a;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-end .navbar-link::after,.navbar.is-warning .navbar-start .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-warning .navbar-item.has-dropdown:hover .navbar-link{background-color:#ffce0a;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-dropdown a.navbar-item.is-active{background-color:#ffd324;color:rgba(0,0,0,.7)}}@media (prefers-color-scheme:dark){.navbar.is-danger{background-color:#ff0537;color:#fff}.navbar.is-danger .navbar-brand .navbar-link,.navbar.is-danger .navbar-brand>.navbar-item{color:#fff}.navbar.is-danger .navbar-brand .navbar-link.is-active,.navbar.is-danger .navbar-brand .navbar-link:hover,.navbar.is-danger .navbar-brand>a.navbar-item.is-active,.navbar.is-danger .navbar-brand>a.navbar-item:hover{background-color:#eb002f;color:#fff}.navbar.is-danger .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-danger .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-danger .navbar-end .navbar-link,.navbar.is-danger .navbar-end>.navbar-item,.navbar.is-danger .navbar-start .navbar-link,.navbar.is-danger .navbar-start>.navbar-item{color:#fff}.navbar.is-danger .navbar-end .navbar-link.is-active,.navbar.is-danger .navbar-end .navbar-link:hover,.navbar.is-danger .navbar-end>a.navbar-item.is-active,.navbar.is-danger .navbar-end>a.navbar-item:hover,.navbar.is-danger .navbar-start .navbar-link.is-active,.navbar.is-danger .navbar-start .navbar-link:hover,.navbar.is-danger .navbar-start>a.navbar-item.is-active,.navbar.is-danger .navbar-start>a.navbar-item:hover{background-color:#eb002f;color:#fff}.navbar.is-danger .navbar-end .navbar-link::after,.navbar.is-danger .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-danger .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-danger .navbar-item.has-dropdown:hover .navbar-link{background-color:#eb002f;color:#fff}.navbar.is-danger .navbar-dropdown 
a.navbar-item.is-active{background-color:#ff0537;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-white-dark{background-color:#fff;color:#0a0a0a}.navbar.is-white-dark .navbar-brand .navbar-link,.navbar.is-white-dark .navbar-brand>.navbar-item{color:#0a0a0a}.navbar.is-white-dark .navbar-brand .navbar-link.is-active,.navbar.is-white-dark .navbar-brand .navbar-link:hover,.navbar.is-white-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-white-dark .navbar-brand>a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white-dark .navbar-brand .navbar-link::after{border-color:#0a0a0a}.navbar.is-white-dark .navbar-burger{color:#0a0a0a}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-white-dark .navbar-end .navbar-link,.navbar.is-white-dark .navbar-end>.navbar-item,.navbar.is-white-dark .navbar-start .navbar-link,.navbar.is-white-dark .navbar-start>.navbar-item{color:#0a0a0a}.navbar.is-white-dark .navbar-end .navbar-link.is-active,.navbar.is-white-dark .navbar-end .navbar-link:hover,.navbar.is-white-dark .navbar-end>a.navbar-item.is-active,.navbar.is-white-dark .navbar-end>a.navbar-item:hover,.navbar.is-white-dark .navbar-start .navbar-link.is-active,.navbar.is-white-dark .navbar-start .navbar-link:hover,.navbar.is-white-dark .navbar-start>a.navbar-item.is-active,.navbar.is-white-dark .navbar-start>a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white-dark .navbar-end .navbar-link::after,.navbar.is-white-dark .navbar-start .navbar-link::after{border-color:#0a0a0a}.navbar.is-white-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-white-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white-dark .navbar-dropdown a.navbar-item.is-active{background-color:#fff;color:#0a0a0a}}@media (prefers-color-scheme:dark){.navbar.is-black-dark{background-color:#0a0a0a;color:#fff}.navbar.is-black-dark .navbar-brand .navbar-link,.navbar.is-black-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-black-dark .navbar-brand .navbar-link.is-active,.navbar.is-black-dark .navbar-brand .navbar-link:hover,.navbar.is-black-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-black-dark .navbar-brand>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-black-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-black-dark .navbar-end .navbar-link,.navbar.is-black-dark .navbar-end>.navbar-item,.navbar.is-black-dark .navbar-start .navbar-link,.navbar.is-black-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-black-dark .navbar-end .navbar-link.is-active,.navbar.is-black-dark .navbar-end .navbar-link:hover,.navbar.is-black-dark .navbar-end>a.navbar-item.is-active,.navbar.is-black-dark .navbar-end>a.navbar-item:hover,.navbar.is-black-dark .navbar-start .navbar-link.is-active,.navbar.is-black-dark .navbar-start .navbar-link:hover,.navbar.is-black-dark .navbar-start>a.navbar-item.is-active,.navbar.is-black-dark .navbar-start>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black-dark .navbar-end .navbar-link::after,.navbar.is-black-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-black-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-black-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#000;color:#fff}.navbar.is-black-dark .navbar-dropdown 
a.navbar-item.is-active{background-color:#0a0a0a;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-light-dark{background-color:#f5f5f5;color:#363636}.navbar.is-light-dark .navbar-brand .navbar-link,.navbar.is-light-dark .navbar-brand>.navbar-item{color:#363636}.navbar.is-light-dark .navbar-brand .navbar-link.is-active,.navbar.is-light-dark .navbar-brand .navbar-link:hover,.navbar.is-light-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-light-dark .navbar-brand>a.navbar-item:hover{background-color:#e8e8e8;color:#363636}.navbar.is-light-dark .navbar-brand .navbar-link::after{border-color:#363636}.navbar.is-light-dark .navbar-burger{color:#363636}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-light-dark .navbar-end .navbar-link,.navbar.is-light-dark .navbar-end>.navbar-item,.navbar.is-light-dark .navbar-start .navbar-link,.navbar.is-light-dark .navbar-start>.navbar-item{color:#363636}.navbar.is-light-dark .navbar-end .navbar-link.is-active,.navbar.is-light-dark .navbar-end .navbar-link:hover,.navbar.is-light-dark .navbar-end>a.navbar-item.is-active,.navbar.is-light-dark .navbar-end>a.navbar-item:hover,.navbar.is-light-dark .navbar-start .navbar-link.is-active,.navbar.is-light-dark .navbar-start .navbar-link:hover,.navbar.is-light-dark .navbar-start>a.navbar-item.is-active,.navbar.is-light-dark .navbar-start>a.navbar-item:hover{background-color:#e8e8e8;color:#363636}.navbar.is-light-dark .navbar-end .navbar-link::after,.navbar.is-light-dark .navbar-start .navbar-link::after{border-color:#363636}.navbar.is-light-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-light-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#e8e8e8;color:#363636}.navbar.is-light-dark .navbar-dropdown a.navbar-item.is-active{background-color:#f5f5f5;color:#363636}}@media (prefers-color-scheme:dark){.navbar.is-dark-dark{background-color:#363636;color:#f5f5f5}.navbar.is-dark-dark .navbar-brand .navbar-link,.navbar.is-dark-dark .navbar-brand>.navbar-item{color:#f5f5f5}.navbar.is-dark-dark .navbar-brand .navbar-link.is-active,.navbar.is-dark-dark .navbar-brand .navbar-link:hover,.navbar.is-dark-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-dark-dark .navbar-brand>a.navbar-item:hover{background-color:#292929;color:#f5f5f5}.navbar.is-dark-dark .navbar-brand .navbar-link::after{border-color:#f5f5f5}.navbar.is-dark-dark .navbar-burger{color:#f5f5f5}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-dark-dark .navbar-end .navbar-link,.navbar.is-dark-dark .navbar-end>.navbar-item,.navbar.is-dark-dark .navbar-start .navbar-link,.navbar.is-dark-dark .navbar-start>.navbar-item{color:#f5f5f5}.navbar.is-dark-dark .navbar-end .navbar-link.is-active,.navbar.is-dark-dark .navbar-end .navbar-link:hover,.navbar.is-dark-dark .navbar-end>a.navbar-item.is-active,.navbar.is-dark-dark .navbar-end>a.navbar-item:hover,.navbar.is-dark-dark .navbar-start .navbar-link.is-active,.navbar.is-dark-dark .navbar-start .navbar-link:hover,.navbar.is-dark-dark .navbar-start>a.navbar-item.is-active,.navbar.is-dark-dark .navbar-start>a.navbar-item:hover{background-color:#292929;color:#f5f5f5}.navbar.is-dark-dark .navbar-end .navbar-link::after,.navbar.is-dark-dark .navbar-start .navbar-link::after{border-color:#f5f5f5}.navbar.is-dark-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-dark-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#292929;color:#f5f5f5}.navbar.is-dark-dark .navbar-dropdown 
a.navbar-item.is-active{background-color:#363636;color:#f5f5f5}}@media (prefers-color-scheme:dark){.navbar.is-primary-dark{background-color:#00d1b2;color:#fff}.navbar.is-primary-dark .navbar-brand .navbar-link,.navbar.is-primary-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-primary-dark .navbar-brand .navbar-link.is-active,.navbar.is-primary-dark .navbar-brand .navbar-link:hover,.navbar.is-primary-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-primary-dark .navbar-brand>a.navbar-item:hover{background-color:#00b89c;color:#fff}.navbar.is-primary-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-primary-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-primary-dark .navbar-end .navbar-link,.navbar.is-primary-dark .navbar-end>.navbar-item,.navbar.is-primary-dark .navbar-start .navbar-link,.navbar.is-primary-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-primary-dark .navbar-end .navbar-link.is-active,.navbar.is-primary-dark .navbar-end .navbar-link:hover,.navbar.is-primary-dark .navbar-end>a.navbar-item.is-active,.navbar.is-primary-dark .navbar-end>a.navbar-item:hover,.navbar.is-primary-dark .navbar-start .navbar-link.is-active,.navbar.is-primary-dark .navbar-start .navbar-link:hover,.navbar.is-primary-dark .navbar-start>a.navbar-item.is-active,.navbar.is-primary-dark .navbar-start>a.navbar-item:hover{background-color:#00b89c;color:#fff}.navbar.is-primary-dark .navbar-end .navbar-link::after,.navbar.is-primary-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-primary-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-primary-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#00b89c;color:#fff}.navbar.is-primary-dark .navbar-dropdown a.navbar-item.is-active{background-color:#00d1b2;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-link-dark{background-color:#3273dc;color:#fff}.navbar.is-link-dark .navbar-brand .navbar-link,.navbar.is-link-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-link-dark .navbar-brand .navbar-link.is-active,.navbar.is-link-dark .navbar-brand .navbar-link:hover,.navbar.is-link-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-link-dark .navbar-brand>a.navbar-item:hover{background-color:#2366d1;color:#fff}.navbar.is-link-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-link-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-link-dark .navbar-end .navbar-link,.navbar.is-link-dark .navbar-end>.navbar-item,.navbar.is-link-dark .navbar-start .navbar-link,.navbar.is-link-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-link-dark .navbar-end .navbar-link.is-active,.navbar.is-link-dark .navbar-end .navbar-link:hover,.navbar.is-link-dark .navbar-end>a.navbar-item.is-active,.navbar.is-link-dark .navbar-end>a.navbar-item:hover,.navbar.is-link-dark .navbar-start .navbar-link.is-active,.navbar.is-link-dark .navbar-start .navbar-link:hover,.navbar.is-link-dark .navbar-start>a.navbar-item.is-active,.navbar.is-link-dark .navbar-start>a.navbar-item:hover{background-color:#2366d1;color:#fff}.navbar.is-link-dark .navbar-end .navbar-link::after,.navbar.is-link-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-link-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-link-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#2366d1;color:#fff}.navbar.is-link-dark .navbar-dropdown 
a.navbar-item.is-active{background-color:#3273dc;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-info-dark{background-color:#209cee;color:#fff}.navbar.is-info-dark .navbar-brand .navbar-link,.navbar.is-info-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-info-dark .navbar-brand .navbar-link.is-active,.navbar.is-info-dark .navbar-brand .navbar-link:hover,.navbar.is-info-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-info-dark .navbar-brand>a.navbar-item:hover{background-color:#118fe4;color:#fff}.navbar.is-info-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-info-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-info-dark .navbar-end .navbar-link,.navbar.is-info-dark .navbar-end>.navbar-item,.navbar.is-info-dark .navbar-start .navbar-link,.navbar.is-info-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-info-dark .navbar-end .navbar-link.is-active,.navbar.is-info-dark .navbar-end .navbar-link:hover,.navbar.is-info-dark .navbar-end>a.navbar-item.is-active,.navbar.is-info-dark .navbar-end>a.navbar-item:hover,.navbar.is-info-dark .navbar-start .navbar-link.is-active,.navbar.is-info-dark .navbar-start .navbar-link:hover,.navbar.is-info-dark .navbar-start>a.navbar-item.is-active,.navbar.is-info-dark .navbar-start>a.navbar-item:hover{background-color:#118fe4;color:#fff}.navbar.is-info-dark .navbar-end .navbar-link::after,.navbar.is-info-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-info-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-info-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#118fe4;color:#fff}.navbar.is-info-dark .navbar-dropdown a.navbar-item.is-active{background-color:#209cee;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-success-dark{background-color:#23d160;color:#fff}.navbar.is-success-dark .navbar-brand .navbar-link,.navbar.is-success-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-success-dark .navbar-brand .navbar-link.is-active,.navbar.is-success-dark .navbar-brand .navbar-link:hover,.navbar.is-success-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-success-dark .navbar-brand>a.navbar-item:hover{background-color:#20bc56;color:#fff}.navbar.is-success-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-success-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-success-dark .navbar-end .navbar-link,.navbar.is-success-dark .navbar-end>.navbar-item,.navbar.is-success-dark .navbar-start .navbar-link,.navbar.is-success-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-success-dark .navbar-end .navbar-link.is-active,.navbar.is-success-dark .navbar-end .navbar-link:hover,.navbar.is-success-dark .navbar-end>a.navbar-item.is-active,.navbar.is-success-dark .navbar-end>a.navbar-item:hover,.navbar.is-success-dark .navbar-start .navbar-link.is-active,.navbar.is-success-dark .navbar-start .navbar-link:hover,.navbar.is-success-dark .navbar-start>a.navbar-item.is-active,.navbar.is-success-dark .navbar-start>a.navbar-item:hover{background-color:#20bc56;color:#fff}.navbar.is-success-dark .navbar-end .navbar-link::after,.navbar.is-success-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-success-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-success-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#20bc56;color:#fff}.navbar.is-success-dark .navbar-dropdown 
a.navbar-item.is-active{background-color:#23d160;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-warning-dark{background-color:#ffdd57;color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-brand .navbar-link,.navbar.is-warning-dark .navbar-brand>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-brand .navbar-link.is-active,.navbar.is-warning-dark .navbar-brand .navbar-link:hover,.navbar.is-warning-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-warning-dark .navbar-brand>a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-brand .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-burger{color:rgba(0,0,0,.7)}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-warning-dark .navbar-end .navbar-link,.navbar.is-warning-dark .navbar-end>.navbar-item,.navbar.is-warning-dark .navbar-start .navbar-link,.navbar.is-warning-dark .navbar-start>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-end .navbar-link.is-active,.navbar.is-warning-dark .navbar-end .navbar-link:hover,.navbar.is-warning-dark .navbar-end>a.navbar-item.is-active,.navbar.is-warning-dark .navbar-end>a.navbar-item:hover,.navbar.is-warning-dark .navbar-start .navbar-link.is-active,.navbar.is-warning-dark .navbar-start .navbar-link:hover,.navbar.is-warning-dark .navbar-start>a.navbar-item.is-active,.navbar.is-warning-dark .navbar-start>a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-end .navbar-link::after,.navbar.is-warning-dark .navbar-start .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-warning-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-dropdown a.navbar-item.is-active{background-color:#ffdd57;color:rgba(0,0,0,.7)}}@media (prefers-color-scheme:dark){.navbar.is-danger-dark{background-color:#ff3860;color:#fff}.navbar.is-danger-dark .navbar-brand .navbar-link,.navbar.is-danger-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-danger-dark .navbar-brand .navbar-link.is-active,.navbar.is-danger-dark .navbar-brand .navbar-link:hover,.navbar.is-danger-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-danger-dark .navbar-brand>a.navbar-item:hover{background-color:#ff1f4b;color:#fff}.navbar.is-danger-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-danger-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-danger-dark .navbar-end .navbar-link,.navbar.is-danger-dark .navbar-end>.navbar-item,.navbar.is-danger-dark .navbar-start .navbar-link,.navbar.is-danger-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-danger-dark .navbar-end .navbar-link.is-active,.navbar.is-danger-dark .navbar-end .navbar-link:hover,.navbar.is-danger-dark .navbar-end>a.navbar-item.is-active,.navbar.is-danger-dark .navbar-end>a.navbar-item:hover,.navbar.is-danger-dark .navbar-start .navbar-link.is-active,.navbar.is-danger-dark .navbar-start .navbar-link:hover,.navbar.is-danger-dark .navbar-start>a.navbar-item.is-active,.navbar.is-danger-dark .navbar-start>a.navbar-item:hover{background-color:#ff1f4b;color:#fff}.navbar.is-danger-dark .navbar-end .navbar-link::after,.navbar.is-danger-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-danger-dark .navbar-item.has-dropdown.is-active 
.navbar-link,.navbar.is-danger-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#ff1f4b;color:#fff}.navbar.is-danger-dark .navbar-dropdown a.navbar-item.is-active{background-color:#ff3860;color:#fff}}@media (prefers-color-scheme:dark){.navbar.has-shadow{box-shadow:0 2px 0 0 #242424}.navbar.is-fixed-bottom.has-shadow{box-shadow:0 -2px 0 0 #242424}.navbar-burger{color:#b5b5b5}.navbar-item,.navbar-link{color:#b5b5b5}.navbar-link.is-active,.navbar-link:hover,a.navbar-item.is-active,a.navbar-item:hover{background-color:#121212;color:#5ea3e4}.navbar-item:hover{border-bottom-color:#5ea3e4}.navbar-item.is-active{border-bottom-color:#5ea3e4;color:#5ea3e4}.navbar-link:not(.is-arrowless)::after{border-color:#5ea3e4}.navbar-divider{background-color:#242424}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.navbar-menu{background-color:#17181c;box-shadow:0 8px 16px rgba(255,255,255,.1)}.navbar.is-fixed-bottom-touch.has-shadow{box-shadow:0 -2px 3px rgba(255,255,255,.1)}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-transparent .navbar-dropdown a.navbar-item:hover{background-color:#242424;color:#fff}.navbar.is-transparent .navbar-dropdown a.navbar-item.is-active{background-color:#242424;color:#5ea3e4}.navbar-item.has-dropdown-up .navbar-dropdown{border-bottom:2px solid #363636;box-shadow:0 -8px 8px rgba(255,255,255,.1)}.navbar-dropdown{background-color:#0a0a0a;border-top:2px solid #363636;box-shadow:0 8px 8px rgba(255,255,255,.1)}.navbar-dropdown a.navbar-item:hover{background-color:#242424;color:#fff}.navbar-dropdown a.navbar-item.is-active{background-color:#242424;color:#5ea3e4}.navbar-dropdown.is-boxed,.navbar.is-spaced .navbar-dropdown{box-shadow:0 8px 8px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1)}.navbar.is-fixed-bottom-desktop.has-shadow{box-shadow:0 -2px 3px rgba(255,255,255,.1)}.navbar-link.is-active,a.navbar-item.is-active{color:#fff}.navbar-item.has-dropdown.is-active .navbar-link,.navbar-item.has-dropdown:hover .navbar-link{background-color:#121212}}@media (prefers-color-scheme:dark){.pagination-link,.pagination-next,.pagination-previous{border-color:#363636;color:#dbdbdb}.pagination-link:hover,.pagination-next:hover,.pagination-previous:hover{border-color:#4a4a4a;color:#dbdbdb}.pagination-link:focus,.pagination-next:focus,.pagination-previous:focus{border-color:#5ea3e4}.pagination-link:active,.pagination-next:active,.pagination-previous:active{box-shadow:inset 0 1px 2px rgba(255,255,255,.2)}.pagination-link[disabled],.pagination-next[disabled],.pagination-previous[disabled]{background-color:#363636;border-color:#363636;color:#7a7a7a}.pagination-link.is-current{background-color:#5ea3e4;border-color:#5ea3e4;color:#fff}.pagination-ellipsis{color:#4a4a4a}.panel-block,.panel-heading,.panel-tabs{border-bottom:1px solid #363636;border-left:1px solid #363636;border-right:1px solid #363636}.panel-block:first-child,.panel-heading:first-child,.panel-tabs:first-child{border-top:1px solid #363636}.panel-heading{background-color:#242424;color:#dbdbdb}.panel-tabs a{border-bottom:1px solid #363636}.panel-tabs a.is-active{border-bottom-color:#b5b5b5;color:#dbdbdb}.panel-list a{color:#b5b5b5}.panel-list a:hover{color:#5ea3e4}.panel-block{color:#dbdbdb}.panel-block.is-active{border-left-color:#5ea3e4;color:#dbdbdb}.panel-block.is-active .panel-icon{color:#5ea3e4}a.panel-block:hover,label.panel-block:hover{background-color:#242424}.tabs a{border-bottom-color:#363636;color:#b5b5b5}.tabs 
a:hover{border-bottom-color:#dbdbdb;color:#dbdbdb}.tabs li.is-active a{border-bottom-color:#5ea3e4;color:#5ea3e4}.tabs ul{border-bottom-color:#363636}.tabs.is-boxed a:hover{background-color:#242424;border-bottom-color:#363636}.tabs.is-boxed li.is-active a{background-color:#0a0a0a;border-color:#363636}.tabs.is-toggle a{border-color:#363636}.tabs.is-toggle a:hover{background-color:#242424;border-color:#4a4a4a}.tabs.is-toggle li.is-active a{background-color:#5ea3e4;border-color:#5ea3e4;color:#fff}.hero.is-white,.hero.is-white-dark{background-color:#e6e6e6;color:#0a0a0a}.hero.is-white a:not(.button):not(.dropdown-item):not(.tag),.hero.is-white strong,.hero.is-white-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-white-dark strong{color:inherit}.hero.is-white .title,.hero.is-white-dark .title{color:#0a0a0a}.hero.is-white .subtitle,.hero.is-white-dark .subtitle{color:rgba(10,10,10,.9)}.hero.is-white .subtitle a:not(.button),.hero.is-white .subtitle strong,.hero.is-white-dark .subtitle a:not(.button),.hero.is-white-dark .subtitle strong{color:#0a0a0a}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-white .navbar-menu,.hero.is-white-dark .navbar-menu{background-color:#e6e6e6}}@media (prefers-color-scheme:dark){.hero.is-white .navbar-item,.hero.is-white .navbar-link,.hero.is-white-dark .navbar-item,.hero.is-white-dark .navbar-link{color:rgba(10,10,10,.7)}.hero.is-white .navbar-link.is-active,.hero.is-white .navbar-link:hover,.hero.is-white a.navbar-item.is-active,.hero.is-white a.navbar-item:hover,.hero.is-white-dark .navbar-link.is-active,.hero.is-white-dark .navbar-link:hover,.hero.is-white-dark a.navbar-item.is-active,.hero.is-white-dark a.navbar-item:hover{background-color:#d9d9d9;color:#0a0a0a}.hero.is-white .tabs a,.hero.is-white-dark .tabs a{color:#0a0a0a;opacity:.9}.hero.is-white .tabs a:hover,.hero.is-white-dark .tabs a:hover{opacity:1}.hero.is-white .tabs li.is-active a,.hero.is-white-dark .tabs li.is-active a{opacity:1}.hero.is-white .tabs.is-boxed a,.hero.is-white .tabs.is-toggle a,.hero.is-white-dark .tabs.is-boxed a,.hero.is-white-dark .tabs.is-toggle a{color:#0a0a0a}.hero.is-white .tabs.is-boxed a:hover,.hero.is-white .tabs.is-toggle a:hover,.hero.is-white-dark .tabs.is-boxed a:hover,.hero.is-white-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-white .tabs.is-boxed li.is-active a,.hero.is-white .tabs.is-boxed li.is-active a:hover,.hero.is-white .tabs.is-toggle li.is-active a,.hero.is-white .tabs.is-toggle li.is-active a:hover,.hero.is-white-dark .tabs.is-boxed li.is-active a,.hero.is-white-dark .tabs.is-boxed li.is-active a:hover,.hero.is-white-dark .tabs.is-toggle li.is-active a,.hero.is-white-dark .tabs.is-toggle li.is-active a:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#e6e6e6}.hero.is-white-dark.is-bold,.hero.is-white.is-bold{background-image:linear-gradient(141deg,#d1c7c9 0,#e6e6e6 71%,#f3f2f2 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-white-dark.is-bold .navbar-menu,.hero.is-white.is-bold .navbar-menu{background-image:linear-gradient(141deg,#d1c7c9 0,#e6e6e6 71%,#f3f2f2 100%)}}@media (prefers-color-scheme:dark){.hero.is-black,.hero.is-black-dark{background-color:#000;color:#fff}.hero.is-black a:not(.button):not(.dropdown-item):not(.tag),.hero.is-black strong,.hero.is-black-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-black-dark strong{color:inherit}.hero.is-black .title,.hero.is-black-dark .title{color:#fff}.hero.is-black 
.subtitle,.hero.is-black-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-black .subtitle a:not(.button),.hero.is-black .subtitle strong,.hero.is-black-dark .subtitle a:not(.button),.hero.is-black-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-black .navbar-menu,.hero.is-black-dark .navbar-menu{background-color:#000}}@media (prefers-color-scheme:dark){.hero.is-black .navbar-item,.hero.is-black .navbar-link,.hero.is-black-dark .navbar-item,.hero.is-black-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-black .navbar-link.is-active,.hero.is-black .navbar-link:hover,.hero.is-black a.navbar-item.is-active,.hero.is-black a.navbar-item:hover,.hero.is-black-dark .navbar-link.is-active,.hero.is-black-dark .navbar-link:hover,.hero.is-black-dark a.navbar-item.is-active,.hero.is-black-dark a.navbar-item:hover{background-color:#000;color:#fff}.hero.is-black .tabs a,.hero.is-black-dark .tabs a{color:#fff;opacity:.9}.hero.is-black .tabs a:hover,.hero.is-black-dark .tabs a:hover{opacity:1}.hero.is-black .tabs li.is-active a,.hero.is-black-dark .tabs li.is-active a{opacity:1}.hero.is-black .tabs.is-boxed a,.hero.is-black .tabs.is-toggle a,.hero.is-black-dark .tabs.is-boxed a,.hero.is-black-dark .tabs.is-toggle a{color:#fff}.hero.is-black .tabs.is-boxed a:hover,.hero.is-black .tabs.is-toggle a:hover,.hero.is-black-dark .tabs.is-boxed a:hover,.hero.is-black-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-black .tabs.is-boxed li.is-active a,.hero.is-black .tabs.is-boxed li.is-active a:hover,.hero.is-black .tabs.is-toggle li.is-active a,.hero.is-black .tabs.is-toggle li.is-active a:hover,.hero.is-black-dark .tabs.is-boxed li.is-active a,.hero.is-black-dark .tabs.is-boxed li.is-active a:hover,.hero.is-black-dark .tabs.is-toggle li.is-active a,.hero.is-black-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#000}.hero.is-black-dark.is-bold,.hero.is-black.is-bold{background-image:linear-gradient(141deg,#000 0,#000 71%,#0d0d0d 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-black-dark.is-bold .navbar-menu,.hero.is-black.is-bold .navbar-menu{background-image:linear-gradient(141deg,#000 0,#000 71%,#0d0d0d 100%)}}@media (prefers-color-scheme:dark){.hero.is-light,.hero.is-light-dark{background-color:#dbdbdb;color:#363636}.hero.is-light a:not(.button):not(.dropdown-item):not(.tag),.hero.is-light strong,.hero.is-light-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-light-dark strong{color:inherit}.hero.is-light .title,.hero.is-light-dark .title{color:#363636}.hero.is-light .subtitle,.hero.is-light-dark .subtitle{color:rgba(54,54,54,.9)}.hero.is-light .subtitle a:not(.button),.hero.is-light .subtitle strong,.hero.is-light-dark .subtitle a:not(.button),.hero.is-light-dark .subtitle strong{color:#363636}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-light .navbar-menu,.hero.is-light-dark .navbar-menu{background-color:#dbdbdb}}@media (prefers-color-scheme:dark){.hero.is-light .navbar-item,.hero.is-light .navbar-link,.hero.is-light-dark .navbar-item,.hero.is-light-dark .navbar-link{color:rgba(54,54,54,.7)}.hero.is-light .navbar-link.is-active,.hero.is-light .navbar-link:hover,.hero.is-light a.navbar-item.is-active,.hero.is-light a.navbar-item:hover,.hero.is-light-dark .navbar-link.is-active,.hero.is-light-dark .navbar-link:hover,.hero.is-light-dark a.navbar-item.is-active,.hero.is-light-dark 
a.navbar-item:hover{background-color:#cfcfcf;color:#363636}.hero.is-light .tabs a,.hero.is-light-dark .tabs a{color:#363636;opacity:.9}.hero.is-light .tabs a:hover,.hero.is-light-dark .tabs a:hover{opacity:1}.hero.is-light .tabs li.is-active a,.hero.is-light-dark .tabs li.is-active a{opacity:1}.hero.is-light .tabs.is-boxed a,.hero.is-light .tabs.is-toggle a,.hero.is-light-dark .tabs.is-boxed a,.hero.is-light-dark .tabs.is-toggle a{color:#363636}.hero.is-light .tabs.is-boxed a:hover,.hero.is-light .tabs.is-toggle a:hover,.hero.is-light-dark .tabs.is-boxed a:hover,.hero.is-light-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-light .tabs.is-boxed li.is-active a,.hero.is-light .tabs.is-boxed li.is-active a:hover,.hero.is-light .tabs.is-toggle li.is-active a,.hero.is-light .tabs.is-toggle li.is-active a:hover,.hero.is-light-dark .tabs.is-boxed li.is-active a,.hero.is-light-dark .tabs.is-boxed li.is-active a:hover,.hero.is-light-dark .tabs.is-toggle li.is-active a,.hero.is-light-dark .tabs.is-toggle li.is-active a:hover{background-color:#363636;border-color:#363636;color:#dbdbdb}.hero.is-light-dark.is-bold,.hero.is-light.is-bold{background-image:linear-gradient(141deg,#c8bcbe 0,#dbdbdb 71%,#e9e7e7 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-light-dark.is-bold .navbar-menu,.hero.is-light.is-bold .navbar-menu{background-image:linear-gradient(141deg,#c8bcbe 0,#dbdbdb 71%,#e9e7e7 100%)}}@media (prefers-color-scheme:dark){.hero.is-dark,.hero.is-dark-dark{background-color:#1c1c1c;color:#f5f5f5}.hero.is-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-dark strong,.hero.is-dark-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-dark-dark strong{color:inherit}.hero.is-dark .title,.hero.is-dark-dark .title{color:#f5f5f5}.hero.is-dark .subtitle,.hero.is-dark-dark .subtitle{color:rgba(245,245,245,.9)}.hero.is-dark .subtitle a:not(.button),.hero.is-dark .subtitle strong,.hero.is-dark-dark .subtitle a:not(.button),.hero.is-dark-dark .subtitle strong{color:#f5f5f5}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-dark .navbar-menu,.hero.is-dark-dark .navbar-menu{background-color:#1c1c1c}}@media (prefers-color-scheme:dark){.hero.is-dark .navbar-item,.hero.is-dark .navbar-link,.hero.is-dark-dark .navbar-item,.hero.is-dark-dark .navbar-link{color:rgba(245,245,245,.7)}.hero.is-dark .navbar-link.is-active,.hero.is-dark .navbar-link:hover,.hero.is-dark a.navbar-item.is-active,.hero.is-dark a.navbar-item:hover,.hero.is-dark-dark .navbar-link.is-active,.hero.is-dark-dark .navbar-link:hover,.hero.is-dark-dark a.navbar-item.is-active,.hero.is-dark-dark a.navbar-item:hover{background-color:#0f0f0f;color:#f5f5f5}.hero.is-dark .tabs a,.hero.is-dark-dark .tabs a{color:#f5f5f5;opacity:.9}.hero.is-dark .tabs a:hover,.hero.is-dark-dark .tabs a:hover{opacity:1}.hero.is-dark .tabs li.is-active a,.hero.is-dark-dark .tabs li.is-active a{opacity:1}.hero.is-dark .tabs.is-boxed a,.hero.is-dark .tabs.is-toggle a,.hero.is-dark-dark .tabs.is-boxed a,.hero.is-dark-dark .tabs.is-toggle a{color:#f5f5f5}.hero.is-dark .tabs.is-boxed a:hover,.hero.is-dark .tabs.is-toggle a:hover,.hero.is-dark-dark .tabs.is-boxed a:hover,.hero.is-dark-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-dark .tabs.is-boxed li.is-active a,.hero.is-dark .tabs.is-boxed li.is-active a:hover,.hero.is-dark .tabs.is-toggle li.is-active a,.hero.is-dark .tabs.is-toggle li.is-active a:hover,.hero.is-dark-dark .tabs.is-boxed li.is-active 
a,.hero.is-dark-dark .tabs.is-boxed li.is-active a:hover,.hero.is-dark-dark .tabs.is-toggle li.is-active a,.hero.is-dark-dark .tabs.is-toggle li.is-active a:hover{background-color:#f5f5f5;border-color:#f5f5f5;color:#1c1c1c}.hero.is-dark-dark.is-bold,.hero.is-dark.is-bold{background-image:linear-gradient(141deg,#030202 0,#1c1c1c 71%,#2b2727 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-dark-dark.is-bold .navbar-menu,.hero.is-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#030202 0,#1c1c1c 71%,#2b2727 100%)}}@media (prefers-color-scheme:dark){.hero.is-primary,.hero.is-primary-dark{background-color:#009e86;color:#fff}.hero.is-primary a:not(.button):not(.dropdown-item):not(.tag),.hero.is-primary strong,.hero.is-primary-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-primary-dark strong{color:inherit}.hero.is-primary .title,.hero.is-primary-dark .title{color:#fff}.hero.is-primary .subtitle,.hero.is-primary-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-primary .subtitle a:not(.button),.hero.is-primary .subtitle strong,.hero.is-primary-dark .subtitle a:not(.button),.hero.is-primary-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-primary .navbar-menu,.hero.is-primary-dark .navbar-menu{background-color:#009e86}}@media (prefers-color-scheme:dark){.hero.is-primary .navbar-item,.hero.is-primary .navbar-link,.hero.is-primary-dark .navbar-item,.hero.is-primary-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-primary .navbar-link.is-active,.hero.is-primary .navbar-link:hover,.hero.is-primary a.navbar-item.is-active,.hero.is-primary a.navbar-item:hover,.hero.is-primary-dark .navbar-link.is-active,.hero.is-primary-dark .navbar-link:hover,.hero.is-primary-dark a.navbar-item.is-active,.hero.is-primary-dark a.navbar-item:hover{background-color:#008571;color:#fff}.hero.is-primary .tabs a,.hero.is-primary-dark .tabs a{color:#fff;opacity:.9}.hero.is-primary .tabs a:hover,.hero.is-primary-dark .tabs a:hover{opacity:1}.hero.is-primary .tabs li.is-active a,.hero.is-primary-dark .tabs li.is-active a{opacity:1}.hero.is-primary .tabs.is-boxed a,.hero.is-primary .tabs.is-toggle a,.hero.is-primary-dark .tabs.is-boxed a,.hero.is-primary-dark .tabs.is-toggle a{color:#fff}.hero.is-primary .tabs.is-boxed a:hover,.hero.is-primary .tabs.is-toggle a:hover,.hero.is-primary-dark .tabs.is-boxed a:hover,.hero.is-primary-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-primary .tabs.is-boxed li.is-active a,.hero.is-primary .tabs.is-boxed li.is-active a:hover,.hero.is-primary .tabs.is-toggle li.is-active a,.hero.is-primary .tabs.is-toggle li.is-active a:hover,.hero.is-primary-dark .tabs.is-boxed li.is-active a,.hero.is-primary-dark .tabs.is-boxed li.is-active a:hover,.hero.is-primary-dark .tabs.is-toggle li.is-active a,.hero.is-primary-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#009e86}.hero.is-primary-dark.is-bold,.hero.is-primary.is-bold{background-image:linear-gradient(141deg,#006b49 0,#009e86 71%,#00b5b8 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-primary-dark.is-bold .navbar-menu,.hero.is-primary.is-bold .navbar-menu{background-image:linear-gradient(141deg,#006b49 0,#009e86 71%,#00b5b8 100%)}}@media (prefers-color-scheme:dark){.hero.is-link,.hero.is-link-dark{background-color:#205bbc;color:#fff}.hero.is-link a:not(.button):not(.dropdown-item):not(.tag),.hero.is-link 
strong,.hero.is-link-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-link-dark strong{color:inherit}.hero.is-link .title,.hero.is-link-dark .title{color:#fff}.hero.is-link .subtitle,.hero.is-link-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-link .subtitle a:not(.button),.hero.is-link .subtitle strong,.hero.is-link-dark .subtitle a:not(.button),.hero.is-link-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-link .navbar-menu,.hero.is-link-dark .navbar-menu{background-color:#205bbc}}@media (prefers-color-scheme:dark){.hero.is-link .navbar-item,.hero.is-link .navbar-link,.hero.is-link-dark .navbar-item,.hero.is-link-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-link .navbar-link.is-active,.hero.is-link .navbar-link:hover,.hero.is-link a.navbar-item.is-active,.hero.is-link a.navbar-item:hover,.hero.is-link-dark .navbar-link.is-active,.hero.is-link-dark .navbar-link:hover,.hero.is-link-dark a.navbar-item.is-active,.hero.is-link-dark a.navbar-item:hover{background-color:#1c51a6;color:#fff}.hero.is-link .tabs a,.hero.is-link-dark .tabs a{color:#fff;opacity:.9}.hero.is-link .tabs a:hover,.hero.is-link-dark .tabs a:hover{opacity:1}.hero.is-link .tabs li.is-active a,.hero.is-link-dark .tabs li.is-active a{opacity:1}.hero.is-link .tabs.is-boxed a,.hero.is-link .tabs.is-toggle a,.hero.is-link-dark .tabs.is-boxed a,.hero.is-link-dark .tabs.is-toggle a{color:#fff}.hero.is-link .tabs.is-boxed a:hover,.hero.is-link .tabs.is-toggle a:hover,.hero.is-link-dark .tabs.is-boxed a:hover,.hero.is-link-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-link .tabs.is-boxed li.is-active a,.hero.is-link .tabs.is-boxed li.is-active a:hover,.hero.is-link .tabs.is-toggle li.is-active a,.hero.is-link .tabs.is-toggle li.is-active a:hover,.hero.is-link-dark .tabs.is-boxed li.is-active a,.hero.is-link-dark .tabs.is-boxed li.is-active a:hover,.hero.is-link-dark .tabs.is-toggle li.is-active a,.hero.is-link-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#205bbc}.hero.is-link-dark.is-bold,.hero.is-link.is-bold{background-image:linear-gradient(141deg,#105b98 0,#205bbc 71%,#1d46d7 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-link-dark.is-bold .navbar-menu,.hero.is-link.is-bold .navbar-menu{background-image:linear-gradient(141deg,#105b98 0,#205bbc 71%,#1d46d7 100%)}}@media (prefers-color-scheme:dark){.hero.is-info,.hero.is-info-dark{background-color:#0f81cc;color:#fff}.hero.is-info a:not(.button):not(.dropdown-item):not(.tag),.hero.is-info strong,.hero.is-info-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-info-dark strong{color:inherit}.hero.is-info .title,.hero.is-info-dark .title{color:#fff}.hero.is-info .subtitle,.hero.is-info-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-info .subtitle a:not(.button),.hero.is-info .subtitle strong,.hero.is-info-dark .subtitle a:not(.button),.hero.is-info-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-info .navbar-menu,.hero.is-info-dark .navbar-menu{background-color:#0f81cc}}@media (prefers-color-scheme:dark){.hero.is-info .navbar-item,.hero.is-info .navbar-link,.hero.is-info-dark .navbar-item,.hero.is-info-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-info .navbar-link.is-active,.hero.is-info .navbar-link:hover,.hero.is-info a.navbar-item.is-active,.hero.is-info a.navbar-item:hover,.hero.is-info-dark 
.navbar-link.is-active,.hero.is-info-dark .navbar-link:hover,.hero.is-info-dark a.navbar-item.is-active,.hero.is-info-dark a.navbar-item:hover{background-color:#0e72b4;color:#fff}.hero.is-info .tabs a,.hero.is-info-dark .tabs a{color:#fff;opacity:.9}.hero.is-info .tabs a:hover,.hero.is-info-dark .tabs a:hover{opacity:1}.hero.is-info .tabs li.is-active a,.hero.is-info-dark .tabs li.is-active a{opacity:1}.hero.is-info .tabs.is-boxed a,.hero.is-info .tabs.is-toggle a,.hero.is-info-dark .tabs.is-boxed a,.hero.is-info-dark .tabs.is-toggle a{color:#fff}.hero.is-info .tabs.is-boxed a:hover,.hero.is-info .tabs.is-toggle a:hover,.hero.is-info-dark .tabs.is-boxed a:hover,.hero.is-info-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-info .tabs.is-boxed li.is-active a,.hero.is-info .tabs.is-boxed li.is-active a:hover,.hero.is-info .tabs.is-toggle li.is-active a,.hero.is-info .tabs.is-toggle li.is-active a:hover,.hero.is-info-dark .tabs.is-boxed li.is-active a,.hero.is-info-dark .tabs.is-boxed li.is-active a:hover,.hero.is-info-dark .tabs.is-toggle li.is-active a,.hero.is-info-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#0f81cc}.hero.is-info-dark.is-bold,.hero.is-info.is-bold{background-image:linear-gradient(141deg,#037fa5 0,#0f81cc 71%,#0b6cea 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-info-dark.is-bold .navbar-menu,.hero.is-info.is-bold .navbar-menu{background-image:linear-gradient(141deg,#037fa5 0,#0f81cc 71%,#0b6cea 100%)}}@media (prefers-color-scheme:dark){.hero.is-success,.hero.is-success-dark{background-color:#1ca64c;color:#fff}.hero.is-success a:not(.button):not(.dropdown-item):not(.tag),.hero.is-success strong,.hero.is-success-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-success-dark strong{color:inherit}.hero.is-success .title,.hero.is-success-dark .title{color:#fff}.hero.is-success .subtitle,.hero.is-success-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-success .subtitle a:not(.button),.hero.is-success .subtitle strong,.hero.is-success-dark .subtitle a:not(.button),.hero.is-success-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-success .navbar-menu,.hero.is-success-dark .navbar-menu{background-color:#1ca64c}}@media (prefers-color-scheme:dark){.hero.is-success .navbar-item,.hero.is-success .navbar-link,.hero.is-success-dark .navbar-item,.hero.is-success-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-success .navbar-link.is-active,.hero.is-success .navbar-link:hover,.hero.is-success a.navbar-item.is-active,.hero.is-success a.navbar-item:hover,.hero.is-success-dark .navbar-link.is-active,.hero.is-success-dark .navbar-link:hover,.hero.is-success-dark a.navbar-item.is-active,.hero.is-success-dark a.navbar-item:hover{background-color:#189042;color:#fff}.hero.is-success .tabs a,.hero.is-success-dark .tabs a{color:#fff;opacity:.9}.hero.is-success .tabs a:hover,.hero.is-success-dark .tabs a:hover{opacity:1}.hero.is-success .tabs li.is-active a,.hero.is-success-dark .tabs li.is-active a{opacity:1}.hero.is-success .tabs.is-boxed a,.hero.is-success .tabs.is-toggle a,.hero.is-success-dark .tabs.is-boxed a,.hero.is-success-dark .tabs.is-toggle a{color:#fff}.hero.is-success .tabs.is-boxed a:hover,.hero.is-success .tabs.is-toggle a:hover,.hero.is-success-dark .tabs.is-boxed a:hover,.hero.is-success-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-success .tabs.is-boxed li.is-active 
a,.hero.is-success .tabs.is-boxed li.is-active a:hover,.hero.is-success .tabs.is-toggle li.is-active a,.hero.is-success .tabs.is-toggle li.is-active a:hover,.hero.is-success-dark .tabs.is-boxed li.is-active a,.hero.is-success-dark .tabs.is-boxed li.is-active a:hover,.hero.is-success-dark .tabs.is-toggle li.is-active a,.hero.is-success-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#1ca64c}.hero.is-success-dark.is-bold,.hero.is-success.is-bold{background-image:linear-gradient(141deg,#0e8123 0,#1ca64c 71%,#1ac170 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-success-dark.is-bold .navbar-menu,.hero.is-success.is-bold .navbar-menu{background-image:linear-gradient(141deg,#0e8123 0,#1ca64c 71%,#1ac170 100%)}}@media (prefers-color-scheme:dark){.hero.is-warning,.hero.is-warning-dark{background-color:#ffd324;color:rgba(0,0,0,.7)}.hero.is-warning a:not(.button):not(.dropdown-item):not(.tag),.hero.is-warning strong,.hero.is-warning-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-warning-dark strong{color:inherit}.hero.is-warning .title,.hero.is-warning-dark .title{color:rgba(0,0,0,.7)}.hero.is-warning .subtitle,.hero.is-warning-dark .subtitle{color:rgba(0,0,0,.9)}.hero.is-warning .subtitle a:not(.button),.hero.is-warning .subtitle strong,.hero.is-warning-dark .subtitle a:not(.button),.hero.is-warning-dark .subtitle strong{color:rgba(0,0,0,.7)}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-warning .navbar-menu,.hero.is-warning-dark .navbar-menu{background-color:#ffd324}}@media (prefers-color-scheme:dark){.hero.is-warning .navbar-item,.hero.is-warning .navbar-link,.hero.is-warning-dark .navbar-item,.hero.is-warning-dark .navbar-link{color:rgba(0,0,0,.7)}.hero.is-warning .navbar-link.is-active,.hero.is-warning .navbar-link:hover,.hero.is-warning a.navbar-item.is-active,.hero.is-warning a.navbar-item:hover,.hero.is-warning-dark .navbar-link.is-active,.hero.is-warning-dark .navbar-link:hover,.hero.is-warning-dark a.navbar-item.is-active,.hero.is-warning-dark a.navbar-item:hover{background-color:#ffce0a;color:rgba(0,0,0,.7)}.hero.is-warning .tabs a,.hero.is-warning-dark .tabs a{color:rgba(0,0,0,.7);opacity:.9}.hero.is-warning .tabs a:hover,.hero.is-warning-dark .tabs a:hover{opacity:1}.hero.is-warning .tabs li.is-active a,.hero.is-warning-dark .tabs li.is-active a{opacity:1}.hero.is-warning .tabs.is-boxed a,.hero.is-warning .tabs.is-toggle a,.hero.is-warning-dark .tabs.is-boxed a,.hero.is-warning-dark .tabs.is-toggle a{color:rgba(0,0,0,.7)}.hero.is-warning .tabs.is-boxed a:hover,.hero.is-warning .tabs.is-toggle a:hover,.hero.is-warning-dark .tabs.is-boxed a:hover,.hero.is-warning-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-warning .tabs.is-boxed li.is-active a,.hero.is-warning .tabs.is-boxed li.is-active a:hover,.hero.is-warning .tabs.is-toggle li.is-active a,.hero.is-warning .tabs.is-toggle li.is-active a:hover,.hero.is-warning-dark .tabs.is-boxed li.is-active a,.hero.is-warning-dark .tabs.is-boxed li.is-active a:hover,.hero.is-warning-dark .tabs.is-toggle li.is-active a,.hero.is-warning-dark .tabs.is-toggle li.is-active a:hover{background-color:rgba(0,0,0,.7);border-color:rgba(0,0,0,.7);color:#ffd324}.hero.is-warning-dark.is-bold,.hero.is-warning.is-bold{background-image:linear-gradient(141deg,#f09800 0,#ffd324 71%,#fff93d 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-warning-dark.is-bold 
.navbar-menu,.hero.is-warning.is-bold .navbar-menu{background-image:linear-gradient(141deg,#f09800 0,#ffd324 71%,#fff93d 100%)}}@media (prefers-color-scheme:dark){.hero.is-danger,.hero.is-danger-dark{background-color:#ff0537;color:#fff}.hero.is-danger a:not(.button):not(.dropdown-item):not(.tag),.hero.is-danger strong,.hero.is-danger-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-danger-dark strong{color:inherit}.hero.is-danger .title,.hero.is-danger-dark .title{color:#fff}.hero.is-danger .subtitle,.hero.is-danger-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-danger .subtitle a:not(.button),.hero.is-danger .subtitle strong,.hero.is-danger-dark .subtitle a:not(.button),.hero.is-danger-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-danger .navbar-menu,.hero.is-danger-dark .navbar-menu{background-color:#ff0537}}@media (prefers-color-scheme:dark){.hero.is-danger .navbar-item,.hero.is-danger .navbar-link,.hero.is-danger-dark .navbar-item,.hero.is-danger-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-danger .navbar-link.is-active,.hero.is-danger .navbar-link:hover,.hero.is-danger a.navbar-item.is-active,.hero.is-danger a.navbar-item:hover,.hero.is-danger-dark .navbar-link.is-active,.hero.is-danger-dark .navbar-link:hover,.hero.is-danger-dark a.navbar-item.is-active,.hero.is-danger-dark a.navbar-item:hover{background-color:#eb002f;color:#fff}.hero.is-danger .tabs a,.hero.is-danger-dark .tabs a{color:#fff;opacity:.9}.hero.is-danger .tabs a:hover,.hero.is-danger-dark .tabs a:hover{opacity:1}.hero.is-danger .tabs li.is-active a,.hero.is-danger-dark .tabs li.is-active a{opacity:1}.hero.is-danger .tabs.is-boxed a,.hero.is-danger .tabs.is-toggle a,.hero.is-danger-dark .tabs.is-boxed a,.hero.is-danger-dark .tabs.is-toggle a{color:#fff}.hero.is-danger .tabs.is-boxed a:hover,.hero.is-danger .tabs.is-toggle a:hover,.hero.is-danger-dark .tabs.is-boxed a:hover,.hero.is-danger-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-danger .tabs.is-boxed li.is-active a,.hero.is-danger .tabs.is-boxed li.is-active a:hover,.hero.is-danger .tabs.is-toggle li.is-active a,.hero.is-danger .tabs.is-toggle li.is-active a:hover,.hero.is-danger-dark .tabs.is-boxed li.is-active a,.hero.is-danger-dark .tabs.is-boxed li.is-active a:hover,.hero.is-danger-dark .tabs.is-toggle li.is-active a,.hero.is-danger-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#ff0537}.hero.is-danger-dark.is-bold,.hero.is-danger.is-bold{background-image:linear-gradient(141deg,#d1004d 0,#ff0537 71%,#ff1f26 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-danger-dark.is-bold .navbar-menu,.hero.is-danger.is-bold .navbar-menu{background-image:linear-gradient(141deg,#d1004d 0,#ff0537 71%,#ff1f26 100%)}}@media (prefers-color-scheme:dark){.hero.is-white-dark{background-color:#fff;color:#0a0a0a}.hero.is-white-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-white-dark strong{color:inherit}.hero.is-white-dark .title{color:#0a0a0a}.hero.is-white-dark .subtitle{color:rgba(10,10,10,.9)}.hero.is-white-dark .subtitle a:not(.button),.hero.is-white-dark .subtitle strong{color:#0a0a0a}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-white-dark .navbar-menu{background-color:#fff}}@media (prefers-color-scheme:dark){.hero.is-white-dark .navbar-item,.hero.is-white-dark .navbar-link{color:rgba(10,10,10,.7)}.hero.is-white-dark 
.navbar-link.is-active,.hero.is-white-dark .navbar-link:hover,.hero.is-white-dark a.navbar-item.is-active,.hero.is-white-dark a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.hero.is-white-dark .tabs a{color:#0a0a0a;opacity:.9}.hero.is-white-dark .tabs a:hover{opacity:1}.hero.is-white-dark .tabs li.is-active a{opacity:1}.hero.is-white-dark .tabs.is-boxed a,.hero.is-white-dark .tabs.is-toggle a{color:#0a0a0a}.hero.is-white-dark .tabs.is-boxed a:hover,.hero.is-white-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-white-dark .tabs.is-boxed li.is-active a,.hero.is-white-dark .tabs.is-boxed li.is-active a:hover,.hero.is-white-dark .tabs.is-toggle li.is-active a,.hero.is-white-dark .tabs.is-toggle li.is-active a:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.hero.is-white-dark.is-bold{background-image:linear-gradient(141deg,#e6e6e6 0,#fff 71%,#fff 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-white-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#e6e6e6 0,#fff 71%,#fff 100%)}}@media (prefers-color-scheme:dark){.hero.is-black-dark{background-color:#0a0a0a;color:#fff}.hero.is-black-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-black-dark strong{color:inherit}.hero.is-black-dark .title{color:#fff}.hero.is-black-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-black-dark .subtitle a:not(.button),.hero.is-black-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-black-dark .navbar-menu{background-color:#0a0a0a}}@media (prefers-color-scheme:dark){.hero.is-black-dark .navbar-item,.hero.is-black-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-black-dark .navbar-link.is-active,.hero.is-black-dark .navbar-link:hover,.hero.is-black-dark a.navbar-item.is-active,.hero.is-black-dark a.navbar-item:hover{background-color:#000;color:#fff}.hero.is-black-dark .tabs a{color:#fff;opacity:.9}.hero.is-black-dark .tabs a:hover{opacity:1}.hero.is-black-dark .tabs li.is-active a{opacity:1}.hero.is-black-dark .tabs.is-boxed a,.hero.is-black-dark .tabs.is-toggle a{color:#fff}.hero.is-black-dark .tabs.is-boxed a:hover,.hero.is-black-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-black-dark .tabs.is-boxed li.is-active a,.hero.is-black-dark .tabs.is-boxed li.is-active a:hover,.hero.is-black-dark .tabs.is-toggle li.is-active a,.hero.is-black-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#0a0a0a}.hero.is-black-dark.is-bold{background-image:linear-gradient(141deg,#000 0,#0a0a0a 71%,#181616 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-black-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#000 0,#0a0a0a 71%,#181616 100%)}}@media (prefers-color-scheme:dark){.hero.is-light-dark{background-color:#f5f5f5;color:#363636}.hero.is-light-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-light-dark strong{color:inherit}.hero.is-light-dark .title{color:#363636}.hero.is-light-dark .subtitle{color:rgba(54,54,54,.9)}.hero.is-light-dark .subtitle a:not(.button),.hero.is-light-dark .subtitle strong{color:#363636}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-light-dark .navbar-menu{background-color:#f5f5f5}}@media (prefers-color-scheme:dark){.hero.is-light-dark .navbar-item,.hero.is-light-dark .navbar-link{color:rgba(54,54,54,.7)}.hero.is-light-dark .navbar-link.is-active,.hero.is-light-dark 
.navbar-link:hover,.hero.is-light-dark a.navbar-item.is-active,.hero.is-light-dark a.navbar-item:hover{background-color:#e8e8e8;color:#363636}.hero.is-light-dark .tabs a{color:#363636;opacity:.9}.hero.is-light-dark .tabs a:hover{opacity:1}.hero.is-light-dark .tabs li.is-active a{opacity:1}.hero.is-light-dark .tabs.is-boxed a,.hero.is-light-dark .tabs.is-toggle a{color:#363636}.hero.is-light-dark .tabs.is-boxed a:hover,.hero.is-light-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-light-dark .tabs.is-boxed li.is-active a,.hero.is-light-dark .tabs.is-boxed li.is-active a:hover,.hero.is-light-dark .tabs.is-toggle li.is-active a,.hero.is-light-dark .tabs.is-toggle li.is-active a:hover{background-color:#363636;border-color:#363636;color:#f5f5f5}.hero.is-light-dark.is-bold{background-image:linear-gradient(141deg,#dfd8d9 0,#f5f5f5 71%,#fff 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-light-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#dfd8d9 0,#f5f5f5 71%,#fff 100%)}}@media (prefers-color-scheme:dark){.hero.is-dark-dark{background-color:#363636;color:#f5f5f5}.hero.is-dark-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-dark-dark strong{color:inherit}.hero.is-dark-dark .title{color:#f5f5f5}.hero.is-dark-dark .subtitle{color:rgba(245,245,245,.9)}.hero.is-dark-dark .subtitle a:not(.button),.hero.is-dark-dark .subtitle strong{color:#f5f5f5}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-dark-dark .navbar-menu{background-color:#363636}}@media (prefers-color-scheme:dark){.hero.is-dark-dark .navbar-item,.hero.is-dark-dark .navbar-link{color:rgba(245,245,245,.7)}.hero.is-dark-dark .navbar-link.is-active,.hero.is-dark-dark .navbar-link:hover,.hero.is-dark-dark a.navbar-item.is-active,.hero.is-dark-dark a.navbar-item:hover{background-color:#292929;color:#f5f5f5}.hero.is-dark-dark .tabs a{color:#f5f5f5;opacity:.9}.hero.is-dark-dark .tabs a:hover{opacity:1}.hero.is-dark-dark .tabs li.is-active a{opacity:1}.hero.is-dark-dark .tabs.is-boxed a,.hero.is-dark-dark .tabs.is-toggle a{color:#f5f5f5}.hero.is-dark-dark .tabs.is-boxed a:hover,.hero.is-dark-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-dark-dark .tabs.is-boxed li.is-active a,.hero.is-dark-dark .tabs.is-boxed li.is-active a:hover,.hero.is-dark-dark .tabs.is-toggle li.is-active a,.hero.is-dark-dark .tabs.is-toggle li.is-active a:hover{background-color:#f5f5f5;border-color:#f5f5f5;color:#363636}.hero.is-dark-dark.is-bold{background-image:linear-gradient(141deg,#1f191a 0,#363636 71%,#46403f 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-dark-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#1f191a 0,#363636 71%,#46403f 100%)}}@media (prefers-color-scheme:dark){.hero.is-primary-dark{background-color:#00d1b2;color:#fff}.hero.is-primary-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-primary-dark strong{color:inherit}.hero.is-primary-dark .title{color:#fff}.hero.is-primary-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-primary-dark .subtitle a:not(.button),.hero.is-primary-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-primary-dark .navbar-menu{background-color:#00d1b2}}@media (prefers-color-scheme:dark){.hero.is-primary-dark .navbar-item,.hero.is-primary-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-primary-dark .navbar-link.is-active,.hero.is-primary-dark 
.navbar-link:hover,.hero.is-primary-dark a.navbar-item.is-active,.hero.is-primary-dark a.navbar-item:hover{background-color:#00b89c;color:#fff}.hero.is-primary-dark .tabs a{color:#fff;opacity:.9}.hero.is-primary-dark .tabs a:hover{opacity:1}.hero.is-primary-dark .tabs li.is-active a{opacity:1}.hero.is-primary-dark .tabs.is-boxed a,.hero.is-primary-dark .tabs.is-toggle a{color:#fff}.hero.is-primary-dark .tabs.is-boxed a:hover,.hero.is-primary-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-primary-dark .tabs.is-boxed li.is-active a,.hero.is-primary-dark .tabs.is-boxed li.is-active a:hover,.hero.is-primary-dark .tabs.is-toggle li.is-active a,.hero.is-primary-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#00d1b2}.hero.is-primary-dark.is-bold{background-image:linear-gradient(141deg,#009e6c 0,#00d1b2 71%,#00e7eb 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-primary-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#009e6c 0,#00d1b2 71%,#00e7eb 100%)}}@media (prefers-color-scheme:dark){.hero.is-link-dark{background-color:#3273dc;color:#fff}.hero.is-link-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-link-dark strong{color:inherit}.hero.is-link-dark .title{color:#fff}.hero.is-link-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-link-dark .subtitle a:not(.button),.hero.is-link-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-link-dark .navbar-menu{background-color:#3273dc}}@media (prefers-color-scheme:dark){.hero.is-link-dark .navbar-item,.hero.is-link-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-link-dark .navbar-link.is-active,.hero.is-link-dark .navbar-link:hover,.hero.is-link-dark a.navbar-item.is-active,.hero.is-link-dark a.navbar-item:hover{background-color:#2366d1;color:#fff}.hero.is-link-dark .tabs a{color:#fff;opacity:.9}.hero.is-link-dark .tabs a:hover{opacity:1}.hero.is-link-dark .tabs li.is-active a{opacity:1}.hero.is-link-dark .tabs.is-boxed a,.hero.is-link-dark .tabs.is-toggle a{color:#fff}.hero.is-link-dark .tabs.is-boxed a:hover,.hero.is-link-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-link-dark .tabs.is-boxed li.is-active a,.hero.is-link-dark .tabs.is-boxed li.is-active a:hover,.hero.is-link-dark .tabs.is-toggle li.is-active a,.hero.is-link-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#3273dc}.hero.is-link-dark.is-bold{background-image:linear-gradient(141deg,#1577c6 0,#3273dc 71%,#4366e5 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-link-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#1577c6 0,#3273dc 71%,#4366e5 100%)}}@media (prefers-color-scheme:dark){.hero.is-info-dark{background-color:#209cee;color:#fff}.hero.is-info-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-info-dark strong{color:inherit}.hero.is-info-dark .title{color:#fff}.hero.is-info-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-info-dark .subtitle a:not(.button),.hero.is-info-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-info-dark .navbar-menu{background-color:#209cee}}@media (prefers-color-scheme:dark){.hero.is-info-dark .navbar-item,.hero.is-info-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-info-dark .navbar-link.is-active,.hero.is-info-dark .navbar-link:hover,.hero.is-info-dark 
a.navbar-item.is-active,.hero.is-info-dark a.navbar-item:hover{background-color:#118fe4;color:#fff}.hero.is-info-dark .tabs a{color:#fff;opacity:.9}.hero.is-info-dark .tabs a:hover{opacity:1}.hero.is-info-dark .tabs li.is-active a{opacity:1}.hero.is-info-dark .tabs.is-boxed a,.hero.is-info-dark .tabs.is-toggle a{color:#fff}.hero.is-info-dark .tabs.is-boxed a:hover,.hero.is-info-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-info-dark .tabs.is-boxed li.is-active a,.hero.is-info-dark .tabs.is-boxed li.is-active a:hover,.hero.is-info-dark .tabs.is-toggle li.is-active a,.hero.is-info-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#209cee}.hero.is-info-dark.is-bold{background-image:linear-gradient(141deg,#04a6d7 0,#209cee 71%,#3287f5 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-info-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#04a6d7 0,#209cee 71%,#3287f5 100%)}}@media (prefers-color-scheme:dark){.hero.is-success-dark{background-color:#23d160;color:#fff}.hero.is-success-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-success-dark strong{color:inherit}.hero.is-success-dark .title{color:#fff}.hero.is-success-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-success-dark .subtitle a:not(.button),.hero.is-success-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-success-dark .navbar-menu{background-color:#23d160}}@media (prefers-color-scheme:dark){.hero.is-success-dark .navbar-item,.hero.is-success-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-success-dark .navbar-link.is-active,.hero.is-success-dark .navbar-link:hover,.hero.is-success-dark a.navbar-item.is-active,.hero.is-success-dark a.navbar-item:hover{background-color:#20bc56;color:#fff}.hero.is-success-dark .tabs a{color:#fff;opacity:.9}.hero.is-success-dark .tabs a:hover{opacity:1}.hero.is-success-dark .tabs li.is-active a{opacity:1}.hero.is-success-dark .tabs.is-boxed a,.hero.is-success-dark .tabs.is-toggle a{color:#fff}.hero.is-success-dark .tabs.is-boxed a:hover,.hero.is-success-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-success-dark .tabs.is-boxed li.is-active a,.hero.is-success-dark .tabs.is-boxed li.is-active a:hover,.hero.is-success-dark .tabs.is-toggle li.is-active a,.hero.is-success-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#23d160}.hero.is-success-dark.is-bold{background-image:linear-gradient(141deg,#12af2f 0,#23d160 71%,#2ce28a 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-success-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#12af2f 0,#23d160 71%,#2ce28a 100%)}}@media (prefers-color-scheme:dark){.hero.is-warning-dark{background-color:#ffdd57;color:rgba(0,0,0,.7)}.hero.is-warning-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-warning-dark strong{color:inherit}.hero.is-warning-dark .title{color:rgba(0,0,0,.7)}.hero.is-warning-dark .subtitle{color:rgba(0,0,0,.9)}.hero.is-warning-dark .subtitle a:not(.button),.hero.is-warning-dark .subtitle strong{color:rgba(0,0,0,.7)}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-warning-dark .navbar-menu{background-color:#ffdd57}}@media (prefers-color-scheme:dark){.hero.is-warning-dark .navbar-item,.hero.is-warning-dark .navbar-link{color:rgba(0,0,0,.7)}.hero.is-warning-dark .navbar-link.is-active,.hero.is-warning-dark 
.navbar-link:hover,.hero.is-warning-dark a.navbar-item.is-active,.hero.is-warning-dark a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.hero.is-warning-dark .tabs a{color:rgba(0,0,0,.7);opacity:.9}.hero.is-warning-dark .tabs a:hover{opacity:1}.hero.is-warning-dark .tabs li.is-active a{opacity:1}.hero.is-warning-dark .tabs.is-boxed a,.hero.is-warning-dark .tabs.is-toggle a{color:rgba(0,0,0,.7)}.hero.is-warning-dark .tabs.is-boxed a:hover,.hero.is-warning-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-warning-dark .tabs.is-boxed li.is-active a,.hero.is-warning-dark .tabs.is-boxed li.is-active a:hover,.hero.is-warning-dark .tabs.is-toggle li.is-active a,.hero.is-warning-dark .tabs.is-toggle li.is-active a:hover{background-color:rgba(0,0,0,.7);border-color:rgba(0,0,0,.7);color:#ffdd57}.hero.is-warning-dark.is-bold{background-image:linear-gradient(141deg,#ffaf24 0,#ffdd57 71%,#fffa70 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-warning-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#ffaf24 0,#ffdd57 71%,#fffa70 100%)}}@media (prefers-color-scheme:dark){.hero.is-danger-dark{background-color:#ff3860;color:#fff}.hero.is-danger-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-danger-dark strong{color:inherit}.hero.is-danger-dark .title{color:#fff}.hero.is-danger-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-danger-dark .subtitle a:not(.button),.hero.is-danger-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-danger-dark .navbar-menu{background-color:#ff3860}}@media (prefers-color-scheme:dark){.hero.is-danger-dark .navbar-item,.hero.is-danger-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-danger-dark .navbar-link.is-active,.hero.is-danger-dark .navbar-link:hover,.hero.is-danger-dark a.navbar-item.is-active,.hero.is-danger-dark a.navbar-item:hover{background-color:#ff1f4b;color:#fff}.hero.is-danger-dark .tabs a{color:#fff;opacity:.9}.hero.is-danger-dark .tabs a:hover{opacity:1}.hero.is-danger-dark .tabs li.is-active a{opacity:1}.hero.is-danger-dark .tabs.is-boxed a,.hero.is-danger-dark .tabs.is-toggle a{color:#fff}.hero.is-danger-dark .tabs.is-boxed a:hover,.hero.is-danger-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-danger-dark .tabs.is-boxed li.is-active a,.hero.is-danger-dark .tabs.is-boxed li.is-active a:hover,.hero.is-danger-dark .tabs.is-toggle li.is-active a,.hero.is-danger-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#ff3860}.hero.is-danger-dark.is-bold{background-image:linear-gradient(141deg,#ff0561 0,#ff3860 71%,#ff5257 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-danger-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#ff0561 0,#ff3860 71%,#ff5257 100%)}}@media (prefers-color-scheme:dark){.footer{background-color:#121212}} diff --git a/public/style.css b/public/style.css index b490b88ea..edbae8d82 100644 --- a/public/style.css +++ b/public/style.css @@ -58,7 +58,7 @@ em { padding-right: 10px; box-sizing: border-box; text-transform: uppercase; - color: rgba(0,0,0,.7); + opacity: 0.7; } .content { @@ -78,3 +78,22 @@ em { .fade-in-out { animation: fadeInOut ease 1s infinite; } + +@media (prefers-color-scheme:dark) { + .hero.is-light { + background-color: #242424; + color: inherit; + } + + .hero.is-light .title { + color: inherit; + } + + .document { + background-color: #242424; + } + + 
+  .content {
+    color: #dbdbdb;
+  }
+}

diff --git a/src/bin/serve.rs b/src/bin/serve.rs
index 2667ce668..6be5438ae 100644
--- a/src/bin/serve.rs
+++ b/src/bin/serve.rs
@@ -90,6 +90,13 @@ async fn main() -> anyhow::Result<()> {
             .body(include_str!("../../public/bulma.min.css"))
         );

+    let dash_bulma_dark_route = warp::filters::method::get()
+        .and(warp::path!("bulma-prefers-dark.min.css"))
+        .map(|| Response::builder()
+            .header("content-type", "text/css; charset=utf-8")
+            .body(include_str!("../../public/bulma-prefers-dark.min.css"))
+        );
+
     let dash_style_route = warp::filters::method::get()
         .and(warp::path!("style.css"))
         .map(|| Response::builder()
@@ -160,6 +167,7 @@ async fn main() -> anyhow::Result<()> {
     let routes = dash_html_route
         .or(dash_bulma_route)
+        .or(dash_bulma_dark_route)
         .or(dash_style_route)
         .or(dash_jquery_route)
         .or(dash_papaparse_route)
diff --git a/templates/index.html b/templates/index.html
index c9241a371..f4fb6894d 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -4,6 +4,7 @@
+    <link rel="stylesheet" href="/bulma-prefers-dark.min.css">

From dd385ad05b9116330096f4de008a23dbad0ddef3 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 14 Jul 2020 11:03:21 +0200
Subject: [PATCH 0104/1889] Customize the mark tag css

---
 public/style.css | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/public/style.css b/public/style.css
index edbae8d82..feecea4be 100644
--- a/public/style.css
+++ b/public/style.css
@@ -1,9 +1,3 @@
-em {
-  color: hsl(204, 86%, 25%);
-  font-style: inherit;
-  background-color: hsl(204, 86%, 88%);
-}
-
 #results {
   max-width: 900px;
   margin: 20px auto 0 auto;
@@ -69,6 +63,11 @@ em {
   color: rgba(0,0,0,.9);
 }

+.content mark {
+  background-color: hsl(204, 86%, 88%);
+  color: hsl(204, 86%, 25%);
+}
+
 @keyframes fadeInOut {
   0% { opacity: 1; }
   30% { opacity: 0.3; }
@@ -96,4 +95,9 @@ em {
   .content {
     color: #dbdbdb;
   }
+
+  .content mark {
+    background-color: hsl(0, 0%, 35%);
+    color: hsl(0,0%,90.2%);
+  }
 }

From 085c3766552b98f0876ae9f08d50da97bebbaa40 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 14 Jul 2020 11:27:46 +0200
Subject: [PATCH 0105/1889] Use the regex crate to highlight "hello"

---
 Cargo.lock       | 24 +++++++++++++++++++++++-
 Cargo.toml       |  3 +++
 src/bin/serve.rs | 19 ++++++++++++++++++-
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f0ebc2ff6..554a7dfea 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6,6 +6,15 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"

+[[package]]
+name = "aho-corasick"
+version = "0.7.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "anyhow"
 version = "1.0.31"
@@ -973,6 +982,7 @@ dependencies = [
 "once_cell",
 "oxidized-mtbl",
 "rayon",
+ "regex",
 "roaring",
 "serde",
 "slice-group-by",
@@ -1574,7 +1584,10 @@ version = "1.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
 dependencies = [
+ "aho-corasick",
+ "memchr",
 "regex-syntax",
+ "thread_local 1.0.1",
 ]
@@ -1792,7 +1805,7 @@ dependencies = [
 "chrono",
 "log 0.4.8",
 "termcolor",
- "thread_local",
+ "thread_local 0.3.4",
 ]
@@ -1907,6 +1920,15 @@ dependencies = [
 "unreachable",
 ]

+[[package]]
+name = "thread_local"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
+dependencies = [
+ "lazy_static 1.4.0",
+]
+
 [[package]]
 name = "time"
 version = "0.1.43"
diff --git a/Cargo.toml b/Cargo.toml
index 4a991436f..4dbbbd776 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,6 +27,9 @@ smallvec = "1.4.0"
 structopt = { version = "0.3.14", default-features = false }
 tempfile = "3.1.0"

+# to highlight the documents
+regex = "1.3.9"
+
 # logging
 log = "0.4.8"
 stderrlog = "0.4.3"
diff --git a/src/bin/serve.rs b/src/bin/serve.rs
index 6be5438ae..f8e84555d 100644
--- a/src/bin/serve.rs
+++ b/src/bin/serve.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::fs::File;
 use std::net::SocketAddr;
 use std::path::PathBuf;
@@ -6,6 +7,7 @@ use std::time::Instant;

 use askama_warp::Template;
 use heed::EnvOpenOptions;
+use regex::Regex;
 use serde::Deserialize;
 use structopt::StructOpt;
 use warp::{Filter, http::Response};
@@ -29,6 +31,10 @@ struct Opt {
     #[structopt(long = "db-size", default_value = "107374182400")] // 100 GB
     database_size: usize,

+    /// Disable document highlighting on the dashboard.
+    #[structopt(long)]
+    disable_highlighting: bool,
+
     /// Verbose mode (-v, -vv, -vvv, etc.)
     #[structopt(short, long, parse(from_occurrences))]
     verbose: usize,
@@ -138,6 +144,7 @@ async fn main() -> anyhow::Result<()> {
     }

     let env_cloned = env.clone();
+    let disable_highlighting = opt.disable_highlighting;
     let query_route = warp::filters::method::post()
         .and(warp::path!("query"))
         .and(warp::body::json())
@@ -152,10 +159,20 @@ async fn main() -> anyhow::Result<()> {
                 // We write the headers
                 body.extend_from_slice(headers);

+                let re = Regex::new(r"(?i)(hello)").unwrap();
+
                 for id in documents_ids {
                     let content = index.documents.get(&rtxn, &BEU32::new(id)).unwrap();
                     let content = content.expect(&format!("could not find document {}", id));
-                    body.extend_from_slice(&content);
+                    let content = std::str::from_utf8(content).unwrap();
+
+                    let content = if disable_highlighting {
+                        Cow::from(content)
+                    } else {
+                        re.replace_all(content, "<mark>$1</mark>")
+                    };
+
+                    body.extend_from_slice(content.as_bytes());
                 }
             }
From 9ade00e27b070b7e1e117a3d479928bc51ddb6dd Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 14 Jul 2020 11:51:21 +0200
Subject: [PATCH 0106/1889] Highlight all the matching words

---
 src/bin/search.rs |  2 +-
 src/bin/serve.rs  | 11 +++++++++--
 src/lib.rs        | 13 ++++++++-----
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/bin/search.rs b/src/bin/search.rs
index 86abe752c..c3fd7cd66 100644
--- a/src/bin/search.rs
+++ b/src/bin/search.rs
@@ -58,7 +58,7 @@ fn main() -> anyhow::Result<()> {
         let before = Instant::now();

         let query = result?;
-        let documents_ids = index.search(&rtxn, &query)?;
+        let (_, documents_ids) = index.search(&rtxn, &query)?;
         let headers = match index.headers(&rtxn)? {
             Some(headers) => headers,
             None => return Ok(()),
diff --git a/src/bin/serve.rs b/src/bin/serve.rs
index f8e84555d..f9e18b315 100644
--- a/src/bin/serve.rs
+++ b/src/bin/serve.rs
@@ -152,14 +152,21 @@ async fn main() -> anyhow::Result<()> {
             let before_search = Instant::now();
             let rtxn = env_cloned.read_txn().unwrap();

-            let documents_ids = index.search(&rtxn, &query.query).unwrap();
+            let (words, documents_ids) = index.search(&rtxn, &query.query).unwrap();
             let mut body = Vec::new();
             if let Some(headers) = index.headers(&rtxn).unwrap() {
                 // We write the headers
                 body.extend_from_slice(headers);

-                let re = Regex::new(r"(?i)(hello)").unwrap();
+                let mut regex = format!(r"(?i)\b(");
+                let number_of_words = words.len();
+                words.into_iter().enumerate().for_each(|(i, w)| {
+                    regex.push_str(&w);
+                    if i != number_of_words - 1 { regex.push('|') }
+                });
+                regex.push_str(r")\b");
+                let re = Regex::new(&regex).unwrap();

                 for id in documents_ids {
                     let content = index.documents.get(&rtxn, &BEU32::new(id)).unwrap();
diff --git a/src/lib.rs b/src/lib.rs
index 6af75e875..9384155b7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,7 +4,7 @@ mod iter_shortest_paths;
 mod query_tokens;

 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{HashSet, HashMap};
 use std::hash::BuildHasherDefault;
 use std::time::Instant;

@@ -96,10 +96,10 @@ impl Index {
         }
     }

-    pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<Vec<u32>> {
+    pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<(HashSet<String>, Vec<u32>)> {
         let fst = match self.fst(rtxn)? {
             Some(fst) => fst,
-            None => return Ok(vec![]),
+            None => return Ok(Default::default()),
         };

         let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2);
@@ -342,7 +342,10 @@ impl Index {
             }
         }

-        debug!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::<u64>());
-        Ok(documents.iter().flatten().take(20).collect())
+        debug!("{} final candidates", documents.iter().map(RoaringBitmap::len).sum::<u64>());
+        let words = words.into_iter().flatten().map(|(w, _)| String::from_utf8(w).unwrap()).collect();
+        let documents = documents.iter().flatten().take(20).collect();
+
+        Ok((words, documents))
     }
 }
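Patch 0106 above assembles the alternation by hand with push_str and a trailing-pipe check. The same construction can be written more directly with join; note that regex::escape is my addition (the patch inserts the matched words unescaped), guarding against words that contain regex metacharacters:

    use std::collections::HashSet;
    use regex::Regex;

    fn words_regex(words: &HashSet<String>) -> Regex {
        let escaped: Vec<String> = words.iter().map(|w| regex::escape(w)).collect();
        // e.g. (?i)\b(hello|world)\b
        let pattern = format!(r"(?i)\b({})\b", escaped.join("|"));
        Regex::new(&pattern).unwrap()
    }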
From ee305c92843b5d10409334e0f63f858088212b20 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 15 Jul 2020 23:51:12 +0200
Subject: [PATCH 0107/1889] Replace the title by the milli logo

---
 public/logo-black.svg |  6 ++++++
 public/logo-white.svg |  6 ++++++
 public/style.css      | 16 ++++++++++++++++
 src/bin/serve.rs      | 16 ++++++++++++++++
 templates/index.html  | 11 ++++++++---
 5 files changed, 52 insertions(+), 3 deletions(-)
 create mode 100644 public/logo-black.svg
 create mode 100644 public/logo-white.svg

diff --git a/public/logo-black.svg b/public/logo-black.svg
new file mode 100644
index 000000000..2a3fb1d89
--- /dev/null
+++ b/public/logo-black.svg
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/public/logo-white.svg b/public/logo-white.svg
new file mode 100644
index 000000000..58bfd5738
--- /dev/null
+++ b/public/logo-white.svg
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/public/style.css b/public/style.css
index feecea4be..970b71ec5 100644
--- a/public/style.css
+++ b/public/style.css
@@ -4,6 +4,14 @@
   padding: 0;
 }

+#logo-white {
+  display: none;
+}
+
+#logo-black {
+  display: inherit;
+}
+
 .notification {
   display: flex;
   justify-content: center;
@@ -79,6 +87,14 @@
 }

 @media (prefers-color-scheme:dark) {
+  #logo-white {
+    display: inherit;
+  }
+
+  #logo-black {
+    display: none;
+  }
+
   .hero.is-light {
     background-color: #242424;
     color: inherit;
diff --git a/src/bin/serve.rs b/src/bin/serve.rs
index f9e18b315..21470f84e 100644
--- a/src/bin/serve.rs
+++ b/src/bin/serve.rs
@@ -138,6 +138,20 @@ async fn main() -> anyhow::Result<()> {
             .body(include_str!("../../public/script.js"))
         );

+    let dash_logo_white_route = warp::filters::method::get()
+        .and(warp::path!("logo-white.svg"))
+        .map(|| Response::builder()
+            .header("content-type", "image/svg+xml")
+            .body(include_str!("../../public/logo-white.svg"))
+        );
+
+    let dash_logo_black_route = warp::filters::method::get()
+        .and(warp::path!("logo-black.svg"))
+        .map(|| Response::builder()
+            .header("content-type", "image/svg+xml")
+            .body(include_str!("../../public/logo-black.svg"))
+        );
+
     #[derive(Deserialize)]
     struct QueryBody {
         query: String,
@@ -197,6 +211,8 @@ async fn main() -> anyhow::Result<()> {
         .or(dash_papaparse_route)
         .or(dash_filesize_route)
         .or(dash_script_route)
+        .or(dash_logo_white_route)
+        .or(dash_logo_black_route)
         .or(query_route);

     let addr = SocketAddr::from_str(&opt.http_listen_addr).unwrap();
diff --git a/templates/index.html b/templates/index.html
index f4fb6894d..4aae7cf5f 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -16,9 +16,14 @@
-          <div class="content">
-            <h1 class="title">Welcome to milli</h1>
-          </div>
+          <a href="/">
+            <div id="logo">
+              <img id="logo-white" src="/logo-white.svg" alt="milli logo in white">
+              <img id="logo-black" src="/logo-black.svg" alt="milli logo in black">
+            </div>
+          </a>
diff --git a/templates/updates.html b/templates/updates.html
new file mode 100644
index 000000000..ec83af4e0
--- /dev/null
+++ b/templates/updates.html
@@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html>
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <link rel="stylesheet" href="/bulma.min.css">
+    <link rel="stylesheet" href="/bulma-prefers-dark.min.css">
+    <link rel="stylesheet" href="/style.css">
+    <script type="text/javascript" src="/jquery-3.4.1.min.js"></script>
+    <script type="text/javascript" src="/updates-script.js"></script>
+    <title>{{ db_name }} | Updates</title>
+  </head>
+
+  <body>
+    <section class="hero is-light">
+      <div class="hero-body">
+        <div class="container">
+          <a href="/">
+            <div id="logo">
+              <img id="logo-white" src="/logo-white.svg" alt="milli logo in white">
+              <img id="logo-black" src="/logo-black.svg" alt="milli logo in black">
+            </div>
+          </a>
+        </div>
+      </div>
+    </section>
+
+    <section>
+      <div class="container">
+        <ol id="results" class="document-list">
+          {% for update in updates %}
+          <li class="document">
+            <ol>
+              <li class="field">
+                <div class="attribute">text</div>
+                <div class="content">{{ update }}</div>
+              </li>
+            </ol>
+          </li>
+          {% endfor %}
+        </ol>
+      </div>
+    </section>
+
+  </body>
+</html>
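Like index.html, the updates.html file created above is an Askama template: {{ db_name }} and the updates loop are filled in from a Rust struct that derives Template. A minimal sketch of that mechanism using an inline template source (askama 0.10 assumed; this struct is illustrative, the real one lives in src/subcommand/serve.rs):

    use askama::Template;

    #[derive(Template)]
    #[template(source = "{% for update in updates %}<li>{{ update }}</li>{% endfor %}", ext = "html")]
    struct UpdatesList {
        updates: Vec<String>,
    }

    fn main() {
        let list = UpdatesList { updates: vec!["update 0 processed".into()] };
        // Renders the loop body once per element.
        assert_eq!(list.render().unwrap(), "<li>update 0 processed</li>");
    }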
From 35c9a3c55887db0c72adc0c54f945b1499d5df67 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 20 Oct 2020 11:19:34 +0200
Subject: [PATCH 0244/1889] Broadcast the update infos to every ws client

---
 Cargo.lock              | 33 ---------------------------------
 Cargo.toml              |  1 -
 src/subcommand/serve.rs | 33 ++++++++++++++++++++++-----------
 3 files changed, 22 insertions(+), 45 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index daeb32d1c..dd5335894 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -77,17 +77,6 @@ dependencies = [
 "warp",
 ]

-[[package]]
-name = "async-channel"
-version = "1.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59740d83946db6a5af71ae25ddf9562c2b176b2ca42cf99a455f09f4a220d6b9"
-dependencies = [
- "concurrent-queue",
- "event-listener",
- "futures-core",
-]
-
 [[package]]
 name = "atty"
 version = "0.2.11"
@@ -206,12 +195,6 @@ version = "0.5.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38"

-[[package]]
-name = "cache-padded"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "631ae5198c9be5e753e5cc215e1bd73c2b466a3565173db433f52bb9d3e66dba"
-
 [[package]]
 name = "cast"
 version = "0.2.3"
@@ -274,15 +257,6 @@ dependencies = [
 "bitflags",
 ]

-[[package]]
-name = "concurrent-queue"
-version = "1.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30ed07550be01594c6026cff2a1d7fe9c8f683caa798e12b68694ac9e88286a3"
-dependencies = [
- "cache-padded",
-]
-
 [[package]]
 name = "const_fn"
 version = "0.4.2"
@@ -446,12 +420,6 @@ version = "1.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"

-[[package]]
-name = "event-listener"
-version = "2.5.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f7531096570974c3a9dcf9e4b8e1cede1ec26cf5046219fb3b9d897503b9be59"
-
 [[package]]
 name = "fake-simd"
 version = "0.1.2"
@@ -1033,7 +1001,6 @@ dependencies = [
 "anyhow",
 "askama",
 "askama_warp",
- "async-channel",
 "bstr",
 "byteorder",
 "bytes",
diff --git a/Cargo.toml b/Cargo.toml
index d1e3d8343..4589341d4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,7 +6,6 @@ edition = "2018"

 [dependencies]
 anyhow = "1.0.28"
-async-channel = "1.5.1"
bstr = "0.2.13"
 byteorder = "1.3.4"
 crossbeam-channel = "0.5.0"
diff --git a/src/subcommand/serve.rs b/src/subcommand/serve.rs
index 93f37b5cf..2662cdd56 100644
--- a/src/subcommand/serve.rs
+++ b/src/subcommand/serve.rs
@@ -8,13 +8,14 @@ use std::time::Duration;
 use std::time::Instant;

 use askama_warp::Template;
-use futures::FutureExt;
-use futures::StreamExt;
+use futures::{FutureExt, StreamExt};
+use futures::stream;
 use heed::EnvOpenOptions;
 use serde::Deserialize;
 use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
+use tokio::sync::broadcast;
 use warp::filters::ws::Message;
 use warp::{Filter, http::Response};
@@ -110,15 +111,15 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
     let update_store_path = opt.database.join("updates.mdb");
     create_dir_all(&update_store_path)?;

-    let (update_status_sender, update_status_receiver) = async_channel::unbounded();
+    let (update_status_sender, _) = broadcast::channel(100);
     let update_status_sender_cloned = update_status_sender.clone();
     let update_store = UpdateStore::open(
         update_store_options,
         update_store_path,
         move |uid, meta: String, _content| {
-            let _ = update_status_sender_cloned.try_send(format!("processing update {}", uid));
+            let _ = update_status_sender_cloned.send(format!("processing update {}", uid));
             std::thread::sleep(Duration::from_secs(3));
-            let _ = update_status_sender_cloned.try_send(format!("update {} processed", uid));
+            let _ = update_status_sender_cloned.send(format!("update {} processed", uid));
             Ok(meta)
         })?;
@@ -288,7 +289,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
     async fn buf_stream(
         update_store: Arc<UpdateStore<String>>,
-        update_status_sender: async_channel::Sender<String>,
+        update_status_sender: broadcast::Sender<String>,
         mut stream: impl futures::Stream<Item=Result<impl bytes::Buf, warp::Error>> + Unpin,
     ) -> Result<impl warp::Reply, warp::Rejection>
     {
@@ -305,29 +306,39 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
         let meta = String::from("I am the metadata");
         let uid = update_store.register_update(&meta, &mmap[..]).unwrap();
-        update_status_sender.try_send(format!("update {} pending", uid)).unwrap();
+        update_status_sender.send(format!("update {} pending", uid)).unwrap();
         eprintln!("Registering update {}", uid);

         Ok(warp::reply())
     }

     let update_store_cloned = update_store.clone();
+    let update_status_sender_cloned = update_status_sender.clone();
     let indexing_route = warp::filters::method::post()
         .and(warp::path!("documents"))
         .and(warp::body::stream())
         .and_then(move |stream| {
-            buf_stream(update_store_cloned.clone(), update_status_sender.clone(), stream)
+            buf_stream(update_store_cloned.clone(), update_status_sender_cloned.clone(), stream)
         });

     let update_ws_route = warp::ws()
         .and(warp::path!("updates" / "ws"))
         .map(move |ws: warp::ws::Ws| {
             // And then our closure will be called when it completes...
-            let update_status_receiver_cloned = update_status_receiver.clone();
+            let update_status_receiver = update_status_sender.subscribe();
             ws.on_upgrade(|websocket| {
                 // Just echo all updates messages...
-                update_status_receiver_cloned
-                    .map(|msg| Ok(Message::text(msg)))
+                update_status_receiver
+                    .into_stream()
+                    .flat_map(|result| {
+                        match result {
+                            Ok(msg) => stream::iter(Some(Ok(Message::text(msg)))),
+                            Err(e) => {
+                                eprintln!("channel error: {:?}", e);
+                                stream::iter(None)
+                            },
+                        }
+                    })
                     .forward(websocket)
                     .map(|result| {
                         if let Err(e) = result {
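The switch from async-channel to tokio's broadcast channel is what lets several websocket clients observe the same stream of events: each subscribe() call yields an independent receiver and every send() reaches all of them. A minimal sketch of the primitive (tokio 0.2 with the sync feature assumed):

    use tokio::sync::broadcast;

    #[tokio::main]
    async fn main() {
        // Capacity 100, as in the patch: a receiver that falls more than
        // 100 messages behind gets a Lagged error instead of the backlog.
        let (sender, _) = broadcast::channel::<String>(100);

        let mut first = sender.subscribe();
        let mut second = sender.subscribe();

        sender.send("update 0 pending".to_string()).unwrap();

        // Both subscribers see the same message.
        assert_eq!(first.recv().await.unwrap(), "update 0 pending");
        assert_eq!(second.recv().await.unwrap(), "update 0 pending");
    }

That lagging behavior is also why the websocket forwarding loop above logs and skips channel errors rather than tearing the connection down.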
From 03ca1ff634c853b757cc13c0397bcdadb4546d3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 20 Oct 2020 12:09:38 +0200
Subject: [PATCH 0245/1889] Make the updates page interactive

---
 public/updates-script.js | 26 ++++++++++++++++---
 src/subcommand/serve.rs  | 56 ++++++++++++++++++++++++++++------------
 templates/updates.html   | 49 +++++++++++++++++++++++++++++------
 3 files changed, 102 insertions(+), 29 deletions(-)

diff --git a/public/updates-script.js b/public/updates-script.js
index f2f68e920..68830c767 100644
--- a/public/updates-script.js
+++ b/public/updates-script.js
@@ -1,13 +1,19 @@
 $(window).on('load', function () {
-    let url = 'ws://' + window.location.hostname + ':' + window.location.port + '/updates/ws';
+    let wsProtocol = "ws";
+    if (window.location.protocol === 'https:') {
+        wsProtocol = 'wss';
+    }
+
+    let url = wsProtocol + '://' + window.location.hostname + ':' + window.location.port + '/updates/ws';
     var socket = new WebSocket(url);

     socket.onmessage = function (event) {
-        console.log(event.data);
+        let status = JSON.parse(event.data);

-        if (event.data.endsWith("processed")) {
+        if (status.type == 'Pending') {
             const elem = document.createElement('li');
             elem.classList.add("document");
+            elem.setAttribute("id", 'update-' + status.update_id);

             const ol = document.createElement('ol');
             const field = document.createElement('li');
@@ -19,7 +25,7 @@ $(window).on('load', function () {

             const content = document.createElement('div');
             content.classList.add("content");
-            content.innerHTML = event.data;
+            content.innerHTML = 'Pending ' + status.update_id;

             field.appendChild(attribute);
             field.appendChild(content);
@@ -29,6 +35,18 @@ $(window).on('load', function () {

             prependChild(results, elem);
         }
+
+        if (status.type == "Processing") {
+            const id = 'update-' + status.update_id;
+            const content = $(`#${id} .content`);
+            content.html('Processing ' + status.update_id);
+        }
+
+        if (status.type == "Processed") {
+            const id = 'update-' + status.update_id;
+            const content = $(`#${id} .content`);
+            content.html('Processed ' + status.update_id);
+        }
     }
 });
diff --git a/src/subcommand/serve.rs b/src/subcommand/serve.rs
index 2662cdd56..64c9edf20 100644
--- a/src/subcommand/serve.rs
+++ b/src/subcommand/serve.rs
@@ -11,7 +11,7 @@ use askama_warp::Template;
 use futures::{FutureExt, StreamExt};
 use futures::stream;
 use heed::EnvOpenOptions;
-use serde::Deserialize;
+use serde::{Serialize, Deserialize};
 use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
@@ -84,9 +84,19 @@ struct IndexTemplate {

 #[derive(Template)]
 #[template(path = "updates.html")]
-struct UpdatesTemplate {
+struct UpdatesTemplate<M: Serialize> {
     db_name: String,
-    updates: Vec<String>,
+    db_size: usize,
+    docs_count: usize,
+    updates: Vec<UpdateStatus<M>>,
+}
+
+#[derive(Debug, Clone, Serialize)]
+#[serde(tag = "type")]
+enum UpdateStatus<M> {
+    Pending { update_id: u64, meta: M },
+    Processing { update_id: u64, meta: M },
+    Processed { update_id: u64, meta: M },
 }

 pub fn run(opt: Opt) -> anyhow::Result<()> {
@@ -116,10 +126,14 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
     let update_store = UpdateStore::open(
         update_store_options,
         update_store_path,
meta: String, _content| {
-            let _ = update_status_sender_cloned.send(format!("processing update {}", uid));
+        move |update_id, meta: String, _content| {
+            let processing = UpdateStatus::Processing { update_id, meta: meta.clone() };
+            let _ = update_status_sender_cloned.send(processing);
+
             std::thread::sleep(Duration::from_secs(3));
-            let _ = update_status_sender_cloned.send(format!("update {} processed", uid));
+
+            let processed = UpdateStatus::Processed { update_id, meta: meta.clone() };
+            let _ = update_status_sender_cloned.send(processed);
+
             Ok(meta)
         })?;

@@ -149,19 +163,24 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
             let mut updates = update_store.iter_metas(|processed, pending| {
                 let mut updates = Vec::new();
                 for result in processed {
-                    let (id, _) = result?;
-                    updates.push(format!("update {} processed", id.get()));
+                    let (uid, meta) = result?;
+                    updates.push(UpdateStatus::Processed { update_id: uid.get(), meta });
                 }
                 for result in pending {
-                    let (id, _) = result?;
-                    updates.push(format!("update {} pending", id.get()));
+                    let (uid, meta) = result?;
+                    updates.push(UpdateStatus::Pending { update_id: uid.get(), meta });
                 }
                 Ok(updates)
             }).unwrap();

             if header.contains("text/html") {
                 updates.reverse();
-                let template = UpdatesTemplate { db_name: db_name.clone(), updates };
+                let template = UpdatesTemplate {
+                    db_name: db_name.clone(),
+                    db_size,
+                    docs_count,
+                    updates,
+                };
                 Box::new(template) as Box<dyn warp::Reply>
             } else {
                 Box::new(warp::reply::json(&updates))
@@ -289,7 +308,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {

     async fn buf_stream(
         update_store: Arc<UpdateStore<String, String>>,
-        update_status_sender: broadcast::Sender<String>,
+        update_status_sender: broadcast::Sender<UpdateStatus<String>>,
         mut stream: impl futures::Stream<Item=Result<impl bytes::Buf, warp::Error>> + Unpin,
     ) -> Result<impl warp::Reply, warp::Rejection>
     {
@@ -305,9 +324,9 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
         let mmap = unsafe { memmap::Mmap::map(&file).unwrap() };

         let meta = String::from("I am the metadata");
-        let uid = update_store.register_update(&meta, &mmap[..]).unwrap();
-        update_status_sender.send(format!("update {} pending", uid)).unwrap();
-        eprintln!("Registering update {}", uid);
+        let update_id = update_store.register_update(&meta, &mmap[..]).unwrap();
+        update_status_sender.send(UpdateStatus::Pending { update_id, meta }).unwrap();
+        eprintln!("update {} registered", update_id);
         Ok(warp::reply())
     }

@@ -331,8 +350,11 @@ pub fn run(opt: Opt) -> anyhow::Result<()> {
                 update_status_receiver
                     .into_stream()
                     .flat_map(|result| {
-                        match result{
-                            Ok(msg) => stream::iter(Some(Ok(Message::text(msg)))),
+                        match result {
+                            Ok(status) => {
+                                let msg = serde_json::to_string(&status).unwrap();
+                                stream::iter(Some(Ok(Message::text(msg))))
+                            },
                             Err(e) => {
                                 eprintln!("channel error: {:?}", e);
                                 stream::iter(None)
diff --git a/templates/updates.html b/templates/updates.html
index ec83af4e0..55c8088c3 100644
--- a/templates/updates.html
+++ b/templates/updates.html
@@ -11,7 +11,7 @@
-[markup lost in extraction: one replaced line in the page head]
+[markup lost in extraction: its replacement]
@@ -21,6 +21,27 @@
 [markup lost in extraction: context around the milli logo images]
+[markup lost in extraction: roughly twenty added lines, presumably the header markup that exposes the new db_size and docs_count values to the page]
@@ -29,13 +50,25 @@
             {% for update in updates %}
-            <li class="document">
-                <ol>
-                    <li class="field">
-                        <div class="attribute">text</div>
-                        <div class="content">{{ update }}</div>
-                    </li>
-                </ol>
-            </li>
+            {% match update %}
+            {% when UpdateStatus::Pending with { update_id, meta } %}
+            <li class="document" id="update-{{ update_id }}">
+                <ol>
+                    <li class="field">
+                        <div class="attribute">text</div>
+                        <div class="content">Pending {{ update_id }}</div>
+                    </li>
+                </ol>
+            </li>
+            {% when UpdateStatus::Processed with { update_id, meta } %}
+            <li class="document" id="update-{{ update_id }}">
+                <ol>
+                    <li class="field">
+                        <div class="attribute">text</div>
+                        <div class="content">Processed {{ update_id }}</div>
+                    </li>
+                </ol>
+            </li>
+            {% else %}
+            {% endmatch %}
             {% endfor %}
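A note on the plumbing the last two patches rely on: every websocket client now gets a private receiver by calling subscribe() on the tokio broadcast sender, and the statuses cross the wire as internally tagged JSON thanks to the #[serde(tag = "type")] attribute. A minimal, self-contained sketch of that behaviour (assuming tokio with the sync and macros features plus serde and serde_json as dependencies; this is not the actual serve.rs wiring):

    use serde::Serialize;
    use tokio::sync::broadcast;

    #[derive(Debug, Clone, Serialize)]
    #[serde(tag = "type")]
    enum UpdateStatus<M> {
        Pending { update_id: u64, meta: M },
        Processed { update_id: u64, meta: M },
    }

    #[tokio::main]
    async fn main() {
        // Unlike the async_channel used before, a broadcast channel clones every
        // message to all live receivers; each subscribe() call creates a new one.
        let (sender, _) = broadcast::channel(100);
        let mut receiver = sender.subscribe();

        let meta = String::from("meta");
        sender.send(UpdateStatus::Pending { update_id: 7, meta }).unwrap();

        // The inline "type" tag is exactly what the page script matches on.
        let status = receiver.recv().await.unwrap();
        println!("{}", serde_json::to_string(&status).unwrap());
        // -> {"type":"Pending","update_id":7,"meta":"meta"}
    }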
From 3a934b7020238d1a6015cf2e128cb48378c0d275 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 20 Oct 2020 12:19:48 +0200
Subject: [PATCH 0246/1889] Split the update attributes on the updates front page

---
 public/updates-script.js | 47 ++++++++++++++++++++++++++++----------
 templates/updates.html   | 22 ++++++++++++-------
 2 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/public/updates-script.js b/public/updates-script.js
index 68830c767..57a28835e 100644
--- a/public/updates-script.js
+++ b/public/updates-script.js
@@ -19,16 +19,29 @@ $(window).on('load', function () {
             const field = document.createElement('li');
             field.classList.add("field");

-            const attribute = document.createElement('div');
-            attribute.classList.add("attribute");
-            attribute.innerHTML = "TEXT";
+            const attributeUpdateId = document.createElement('div');
+            attributeUpdateId.classList.add("attribute");
+            attributeUpdateId.innerHTML = "update id";

-            const content = document.createElement('div');
-            content.classList.add("content");
-            content.innerHTML = 'Pending ' + status.update_id;
+            const contentUpdateId = document.createElement('div');
+            contentUpdateId.classList.add("updateId");
+            contentUpdateId.classList.add("content");
+            contentUpdateId.innerHTML = status.update_id;

-            field.appendChild(attribute);
-            field.appendChild(content);
+            field.appendChild(attributeUpdateId);
+            field.appendChild(contentUpdateId);
+
+            const attributeUpdateStatus = document.createElement('div');
+            attributeUpdateStatus.classList.add("attribute");
+            attributeUpdateStatus.innerHTML = "update status";
+
+            const contentUpdateStatus = document.createElement('div');
+            contentUpdateStatus.classList.add("updateStatus");
+            contentUpdateStatus.classList.add("content");
+            contentUpdateStatus.innerHTML = 'pending';
+
+            field.appendChild(attributeUpdateStatus);
+            field.appendChild(contentUpdateStatus);

             ol.appendChild(field);
             elem.appendChild(ol);
@@ -38,14 +51,14 @@ $(window).on('load', function () {

         if (status.type == "Processing") {
             const id = 'update-' + status.update_id;
-            const content = $(`#${id} .content`);
-            content.html('Processing ' + status.update_id);
+            const content = $(`#${id} .updateStatus.content`);
+            content.html('processing');
         }

         if (status.type == "Processed") {
             const id = 'update-' + status.update_id;
-            const content = $(`#${id} .content`);
-            content.html('Processed ' + status.update_id);
+            const content = $(`#${id} .updateStatus.content`);
+            content.html('processed');
         }
     }
 });
@@ -53,3 +66,13 @@ $(window).on('load', function () {
 function prependChild(parent, newFirstChild) {
     parent.insertBefore(newFirstChild, parent.firstChild)
 }
+
+// Make the number of documents a little bit prettier
+$('#docs-count').text(function(index, text) {
+    return parseInt(text).toLocaleString()
+});
+
+// Make the database size a little bit easier to read
+$('#db-size').text(function(index, text) {
+    return filesize(parseInt(text))
+});
diff --git a/templates/updates.html b/templates/updates.html
index 55c8088c3..909df222e 100644
--- a/templates/updates.html
+++ b/templates/updates.html
@@ -7,6 +7,8 @@

+[markup lost in extraction: two added script includes; the page script above relies on jQuery and a filesize helper]
+
 {{ db_name }} | Updates

@@ -15,12 +17,14 @@
 [markup lost in extraction: the remaining hunks rework the header around the milli logos and restructure each update entry so the new "update id" and "update status" cells match the script above]
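For reference, the messages this script pattern-matches on are just the serde-serialized UpdateStatus values from the previous patch. With the placeholder metadata that serve.rs registers, they look like this (update id 4 is an arbitrary example):

    {"type":"Pending","update_id":4,"meta":"I am the metadata"}
    {"type":"Processing","update_id":4,"meta":"I am the metadata"}
    {"type":"Processed","update_id":4,"meta":"I am the metadata"}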
From cb5e57e2dd7cbca398b671d68e04e1fc6302798f Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Thu, 7 Jan 2021 10:17:27 +0100
Subject: [PATCH 0429/1889] FacetCondition can be created from array of facets

---
 src/search/facet/facet_condition.rs | 76 +++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/src/search/facet/facet_condition.rs b/src/search/facet/facet_condition.rs
index 92331ee7f..ce4a22119 100644
--- a/src/search/facet/facet_condition.rs
+++ b/src/search/facet/facet_condition.rs
@@ -3,6 +3,7 @@ use std::fmt::Debug;
 use std::ops::Bound::{self, Included, Excluded};
 use std::str::FromStr;

+use either::Either;
 use heed::types::{ByteSlice, DecodeIgnore};
 use log::debug;
 use num_traits::Bounded;
@@ -141,6 +142,81 @@ where T: FromStr,
 }

 impl FacetCondition {
+    pub fn from_array<I, J, A, B>(
+        rtxn: &heed::RoTxn,
+        index: &Index,
+        array: I,
+    ) -> anyhow::Result<Option<FacetCondition>>
+    where I: IntoIterator<Item=Either<J, B>>,
+          J: IntoIterator<Item=A>,
+          A: AsRef<str>,
+          B: AsRef<str>,
+    {
+        fn facet_condition(
+            fields_ids_map: &FieldsIdsMap,
+            faceted_fields: &HashMap<FieldId, FacetType>,
+            key: &str,
+            value: &str,
+        ) -> anyhow::Result<FacetCondition>
+        {
+            let fid = fields_ids_map.id(key).unwrap();
+            let ftype = faceted_fields.get(&fid).copied().unwrap();
+            let (neg, value) = match value.strip_prefix('-') {
+                Some(value) => (true, value),
+                None => (false, value),
+            };
+
+            let operator = match ftype {
+                FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)),
+                FacetType::Float => OperatorF64(fid, FacetNumberOperator::Equal(value.parse()?)),
+                FacetType::Integer => OperatorI64(fid, FacetNumberOperator::Equal(value.parse()?)),
+            };
+
+            if neg { Ok(operator.negate()) } else { Ok(operator) }
+        }
+
+        let fields_ids_map = index.fields_ids_map(rtxn)?;
+        let faceted_fields = index.faceted_fields(rtxn)?;
+        let mut ands = None;
+
+        for either in array {
+            match either {
+                Either::Left(array) => {
+                    let mut ors = None;
+                    for rule in array {
+                        let mut iter = rule.as_ref().splitn(2, ':');
+                        let key = iter.next().unwrap();
+                        let value = iter.next().unwrap();
+                        let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
+                        ors = match ors.take() {
+                            Some(ors) => Some(Or(Box::new(ors), Box::new(condition))),
+                            None => Some(condition),
+                        };
+                    }
+
+                    if let Some(rule) = ors {
+                        ands = match ands.take() {
+                            Some(ands) => Some(And(Box::new(ands), Box::new(rule))),
+                            None => Some(rule),
+                        };
+                    }
+                },
+                Either::Right(rule) => {
+                    let mut iter = rule.as_ref().splitn(2, ':');
+                    let key = iter.next().unwrap();
+                    let value = iter.next().unwrap();
+                    let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?;
+                    ands = match ands.take() {
+                        Some(ands) => Some(And(Box::new(ands), Box::new(condition))),
+                        None => Some(condition),
+                    };
+                }
+            }
+        }
+
+        Ok(ands)
+    }
+
     pub fn from_str(
         rtxn: &heed::RoTxn,
         index: &Index,

From afa86d8a45f20e1e4f1a88d7401ade0f4b6358e8 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 21 Dec 2020 19:36:14 +0100
Subject: [PATCH 0430/1889] Add a simple test to the FacetCondition from_array
 method

---
 src/search/facet/facet_condition.rs | 31 +++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/search/facet/facet_condition.rs b/src/search/facet/facet_condition.rs
index ce4a22119..762134759 100644
--- a/src/search/facet/facet_condition.rs
+++ b/src/search/facet/facet_condition.rs
@@ -717,4 +717,35 @@ mod tests {
         );
         assert_eq!(condition, expected);
     }
+
+    #[test]
+    fn from_array() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+
options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order + builder.set_faceted_fields(hashmap!{ + "channel".into() => "string".into(), + "timestamp".into() => "integer".into(), + }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_array( + &rtxn, &index, + vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])], + ).unwrap().unwrap(); + let expected = FacetCondition::from_str( + &rtxn, &index, + "channel = gotaga AND (timestamp = 44 OR channel != ponce)", + ).unwrap(); + assert_eq!(condition, expected); + } } From 33945a311573020081563eb45e410bd986eba5c5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 7 Jan 2021 10:15:31 +0100 Subject: [PATCH 0431/1889] Introduce a new facet filters query field --- http-ui/Cargo.lock | 627 ++++++++++++++++++----------------- http-ui/Cargo.toml | 1 + http-ui/public/script.js | 47 ++- http-ui/src/main.rs | 51 ++- http-ui/templates/index.html | 2 +- 5 files changed, 407 insertions(+), 321 deletions(-) diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index 04f33336f..1ef0ccfbd 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -23,21 +23,15 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.38" +version = "1.0.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1" - -[[package]] -name = "arrayvec" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" +checksum = "bf8dcb5b4bbaa28653b647d8c77bd4ed40183b48882e130c1f1ffb73de069fd7" [[package]] name = "askama" -version = "0.10.5" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d298738b6e47e1034e560e5afe63aa488fea34e25ec11b855a76f0d7b8e73134" +checksum = "70a6e7ebd44d0047fd48206c83c5cd3214acc7b9d87f001da170145c47ef7d12" dependencies = [ "askama_derive", "askama_escape", @@ -48,12 +42,13 @@ dependencies = [ [[package]] name = "askama_derive" -version = "0.10.5" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522" +checksum = "e1d7169690c4f56343dcd821ab834972a22570a2662a19a84fd7775d5e1c3881" dependencies = [ "askama_shared", "proc-macro2", + "quote", "syn", ] @@ -65,9 +60,9 @@ checksum = "90c108c1a94380c89d2215d0ac54ce09796823cca0fd91b299cfff3b33e346fb" [[package]] name = "askama_shared" -version = "0.11.1" +version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2582b77e0f3c506ec4838a25fa8a5f97b9bed72bb6d3d272ea1c031d8bd373bc" +checksum = "62fc272363345c8cdc030e4c259d9d028237f8b057dc9bb327772a257bde6bb5" dependencies = [ "askama_escape", "humansize", @@ -102,6 +97,12 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "autocfg" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" + [[package]] name = 
"autocfg" version = "1.0.1" @@ -114,12 +115,6 @@ version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" -[[package]] -name = "base64" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" - [[package]] name = "bincode" version = "1.3.1" @@ -136,18 +131,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" -[[package]] -name = "bitvec" -version = "0.19.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7ba35e9565969edb811639dbebfe34edc0368e472c5018474c8eb2543397f81" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] - [[package]] name = "block-buffer" version = "0.7.3" @@ -217,9 +200,9 @@ dependencies = [ [[package]] name = "byteorder" -version = "1.4.2" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b" +checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" [[package]] name = "bytes" @@ -227,17 +210,11 @@ version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38" -[[package]] -name = "bytes" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" - [[package]] name = "cc" -version = "1.0.66" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" +checksum = "ed67cbde08356238e75fc4656be4749481eeffb09e19f320a25237d5221c985d" dependencies = [ "jobserver", ] @@ -298,10 +275,19 @@ dependencies = [ ] [[package]] -name = "const_fn" -version = "0.4.5" +name = "cloudabi" +version = "0.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6" +checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +dependencies = [ + "bitflags", +] + +[[package]] +name = "const_fn" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c478836e029dcef17fb47c89023448c64f781a046e0300e257ad8225ae59afab" [[package]] name = "cow-utils" @@ -331,7 +317,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.1", + "crossbeam-utils 0.8.0", ] [[package]] @@ -342,18 +328,18 @@ checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.1", + "crossbeam-utils 0.8.0", ] [[package]] name = "crossbeam-epoch" -version = "0.9.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d" +checksum = "ec0f606a85340376eef0d6d8fec399e6d4a544d648386c6645eb6d0653b27d9f" dependencies = [ "cfg-if 1.0.0", "const_fn", - "crossbeam-utils 0.8.1", + "crossbeam-utils 0.8.0", "lazy_static", "memoffset", "scopeguard", @@ -380,20 +366,21 @@ 
dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d" +checksum = "ec91540d98355f690a86367e566ecad2e9e579f230230eb7c21398372be73ea5" dependencies = [ - "autocfg", + "autocfg 1.0.1", "cfg-if 1.0.0", + "const_fn", "lazy_static", ] [[package]] name = "csv" -version = "1.1.5" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d58633299b24b515ac72a3f869f8b91306a3cec616a602843a383acd6f9e97" +checksum = "fc4666154fd004af3fd6f1da2e81a96fd5a81927fe8ddb6ecc79e2aa6e138b54" dependencies = [ "bstr", "csv-core", @@ -437,9 +424,9 @@ dependencies = [ [[package]] name = "dtoa" -version = "0.4.7" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d7ed2934d741c6b37e33e3832298e8850b53fd2d2bea03873375596c7cea4e" +checksum = "134951f4028bdadb9b84baf4232681efbf277da25144b9b0ad65df75946c422b" [[package]] name = "either" @@ -493,6 +480,12 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d79238883cf0307100b90aba4a755d8051a3182305dfe7f649a1e9dc0517006f" +[[package]] +name = "fuchsia-cprng" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" + [[package]] name = "fuchsia-zircon" version = "0.3.3" @@ -509,17 +502,11 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" -[[package]] -name = "funty" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" - [[package]] name = "futures" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150" +checksum = "95314d38584ffbfda215621d723e0a3906f032e03ae5551e650058dac83d4797" dependencies = [ "futures-channel", "futures-core", @@ -532,9 +519,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" +checksum = "0448174b01148032eed37ac4aed28963aaaa8cfa93569a08e5b479bbc6c2c151" dependencies = [ "futures-core", "futures-sink", @@ -542,15 +529,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" +checksum = "18eaa56102984bed2c88ea39026cff3ce3b4c7f508ca970cedf2450ea10d4e46" [[package]] name = "futures-executor" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9" +checksum = "f5f8e0c9258abaea85e78ebdda17ef9666d390e987f006be6080dfe354b708cb" dependencies = [ "futures-core", "futures-task", @@ -559,15 +546,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" +checksum = "6e1798854a4727ff944a7b12aa999f58ce7aa81db80d2dfaaf2ba06f065ddd2b" [[package]] name = "futures-macro" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" +checksum = "e36fccf3fc58563b4a14d265027c627c3b665d7fed489427e88e7cc929559efe" dependencies = [ "proc-macro-hack", "proc-macro2", @@ -577,24 +564,24 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" +checksum = "0e3ca3f17d6e8804ae5d3df7a7d35b2b3a6fe89dac84b31872720fc3060a0b11" [[package]] name = "futures-task" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" +checksum = "96d502af37186c4fef99453df03e374683f8a1eec9dcc1e66b3b82dc8278ce3c" dependencies = [ "once_cell", ] [[package]] name = "futures-util" -version = "0.3.12" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" +checksum = "abcb44342f62e6f3e8ac427b8aa815f724fd705dfad060b18ac7866c15bb8e34" dependencies = [ "futures-channel", "futures-core", @@ -603,7 +590,7 @@ dependencies = [ "futures-sink", "futures-task", "memchr", - "pin-project-lite 0.2.4", + "pin-project 1.0.1", "pin-utils", "proc-macro-hack", "proc-macro-nested", @@ -640,26 +627,15 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.1.16" +version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +checksum = "fc587bc0ec293155d5bfa6b9891ec18a1e330c234f896ea47fbada4cadbe47e6" dependencies = [ - "cfg-if 1.0.0", + "cfg-if 0.1.10", "libc", "wasi 0.9.0+wasi-snapshot-preview1", ] -[[package]] -name = "getrandom" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.10.1+wasi-snapshot-preview1", -] - [[package]] name = "glob" version = "0.3.0" @@ -686,7 +662,7 @@ version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e4728fd124914ad25e99e3d15a9361a879f6620f63cb56bbb08f95abb97a535" dependencies = [ - "bytes 0.5.6", + "bytes", "fnv", "futures-core", "futures-sink", @@ -707,7 +683,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" dependencies = [ "ahash", - "autocfg", + "autocfg 1.0.1", ] [[package]] @@ -718,13 +694,13 @@ checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" [[package]] name = "headers" -version = "0.3.3" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62689dc57c7456e69712607ffcbd0aa1dfcccf9af73727e9b25bc1825375cac3" +checksum = "ed18eb2459bf1a09ad2d6b1547840c3e5e62882fa09b9a6a20b1de8e3228848f" dependencies = [ - "base64 0.13.0", + "base64", "bitflags", - "bytes 1.0.1", + "bytes", "headers-core", "http", "mime", @@ -743,18 +719,18 @@ dependencies = [ [[package]] name = "heck" -version = "0.3.2" 
+version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" +checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" dependencies = [ "unicode-segmentation", ] [[package]] name = "heed" -version = "0.10.6" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afcc6c911acaadad3ebe9f1ef1707d80bd71c92037566f47b6238a03b60adf1a" +checksum = "2eaba3b0edee6a9cd551f24caca2027922b03259f7203a15f0b86af4c1348fcc" dependencies = [ "byteorder", "heed-traits", @@ -763,7 +739,6 @@ dependencies = [ "lmdb-rkv-sys", "once_cell", "page_size", - "serde", "synchronoise", "url", "zerocopy", @@ -790,20 +765,20 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.1.18" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c" +checksum = "5aca5565f760fb5b220e499d72710ed156fdb74e631659e99377d9ebfbd13ae8" dependencies = [ "libc", ] [[package]] name = "http" -version = "0.2.3" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7245cd7449cc792608c3c8a9eaf69bd4eabbabf802713748fd739c98b82f0747" +checksum = "28d569972648b2c512421b5f2a405ad6ac9666547189d0c5477a3f200f3e02f9" dependencies = [ - "bytes 1.0.1", + "bytes", "fnv", "itoa", ] @@ -814,7 +789,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13d5ff830006f7646652e057693569bfe0d51760c0085a071769d142a205111b" dependencies = [ - "bytes 0.5.6", + "bytes", "http", ] @@ -826,7 +801,8 @@ dependencies = [ "askama", "askama_warp", "byte-unit", - "bytes 0.5.6", + "bytes", + "either", "flate2", "fst", "futures", @@ -877,7 +853,7 @@ version = "0.13.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ad767baac13b44d4529fcf58ba2cd0995e36e7b435bc5b039de6f47e880dbf" dependencies = [ - "bytes 0.5.6", + "bytes", "futures-channel", "futures-core", "futures-util", @@ -887,7 +863,7 @@ dependencies = [ "httparse", "httpdate", "itoa", - "pin-project 1.0.4", + "pin-project 1.0.1", "socket2", "tokio", "tower-service", @@ -908,11 +884,11 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.6.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b" +checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" dependencies = [ - "autocfg", + "autocfg 1.0.1", "hashbrown 0.9.1", ] @@ -922,7 +898,7 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19a8a95243d5a0398cae618ec29477c6e3cb631152be5c19481f80bc71559754" dependencies = [ - "bytes 0.5.6", + "bytes", ] [[package]] @@ -945,9 +921,9 @@ dependencies = [ [[package]] name = "itoa" -version = "0.4.7" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" +checksum = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6" [[package]] name = "jemalloc-sys" @@ -1019,30 +995,17 @@ dependencies = [ "fst", ] -[[package]] -name = "lexical-core" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616" -dependencies = [ - "arrayvec", - 
"bitflags", - "cfg-if 0.1.10", - "ryu", - "static_assertions", -] - [[package]] name = "libc" -version = "0.2.82" +version = "0.2.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" +checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" [[package]] name = "linked-hash-map" -version = "0.5.4" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" +checksum = "8dd5a6d5999d9907cda8ed67bbd137d3af8085216c2ac62de5be860bd41f304a" [[package]] name = "lmdb-rkv-sys" @@ -1057,9 +1020,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.13" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf3805d4480bb5b86070dcfeb9e2cb2ebc148adb753c5cca5f884d1d65a42b2" +checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" dependencies = [ "cfg-if 0.1.10", ] @@ -1110,11 +1073,11 @@ dependencies = [ [[package]] name = "memoffset" -version = "0.6.1" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87" +checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" dependencies = [ - "autocfg", + "autocfg 1.0.1", ] [[package]] @@ -1186,14 +1149,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f2d26ec3309788e423cfbf68ad1800f061638098d76a83681af979dc4eda19d" dependencies = [ "adler", - "autocfg", + "autocfg 1.0.1", ] [[package]] name = "mio" -version = "0.6.23" +version = "0.6.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" +checksum = "fce347092656428bc8eaf6201042cb551b8d67855af7374542a92a0fbfcac430" dependencies = [ "cfg-if 0.1.10", "fuchsia-zircon", @@ -1202,7 +1165,7 @@ dependencies = [ "kernel32-sys", "libc", "log", - "miow 0.2.2", + "miow 0.2.1", "net2", "slab", "winapi 0.2.8", @@ -1216,7 +1179,7 @@ checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" dependencies = [ "log", "mio", - "miow 0.3.6", + "miow 0.3.5", "winapi 0.3.9", ] @@ -1233,9 +1196,9 @@ dependencies = [ [[package]] name = "miow" -version = "0.2.2" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" +checksum = "8c1f2f3b1cf331de6896aabf6e9d55dca90356cc9960cca7eaaf408a355ae919" dependencies = [ "kernel32-sys", "net2", @@ -1245,9 +1208,9 @@ dependencies = [ [[package]] name = "miow" -version = "0.3.6" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a33c1b55807fbed163481b5ba66db4b2fa6cde694a5027be10fb724206c5897" +checksum = "07b88fb9795d4d36d62a012dfbf49a8f5cf12751f36d31a9dbe66d528e58979e" dependencies = [ "socket2", "winapi 0.3.9", @@ -1255,9 +1218,9 @@ dependencies = [ [[package]] name = "multipart" -version = "0.17.1" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d050aeedc89243f5347c3e237e3e13dc76fbe4ae3742a57b94dc14f69acf76d4" +checksum = "8209c33c951f07387a8497841122fc6f712165e3f9bda3e6be4645b58188f676" dependencies = [ "buf_redux", "httparse", @@ -1265,7 +1228,7 @@ dependencies = [ "mime", "mime_guess", "quick-error", - "rand 0.7.3", + "rand 0.6.5", 
"safemem", "tempfile", "twoway", @@ -1276,14 +1239,14 @@ name = "near-proximity" version = "0.1.0" source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=6608205#66082058537f6fe7709adc4690048d62f3c0e9b7" dependencies = [ - "tinyvec", + "tinyvec 1.0.1", ] [[package]] name = "net2" -version = "0.2.37" +version = "0.2.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" +checksum = "3ebc3ec692ed7c9a255596c67808dee269f64655d8baf7b4f0638e51ba1d6853" dependencies = [ "cfg-if 0.1.10", "libc", @@ -1292,24 +1255,22 @@ dependencies = [ [[package]] name = "nix" -version = "0.19.1" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" +checksum = "85db2feff6bf70ebc3a4793191517d5f0331100a2f10f9bf93b5e5214f32b7b7" dependencies = [ "bitflags", "cc", - "cfg-if 1.0.0", + "cfg-if 0.1.10", "libc", ] [[package]] name = "nom" -version = "6.0.1" +version = "5.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88034cfd6b4a0d54dd14f4a507eceee36c0b70e5a02236c4e4df571102be17f0" +checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" dependencies = [ - "bitvec", - "lexical-core", "memchr", "version_check", ] @@ -1320,7 +1281,7 @@ version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" dependencies = [ - "autocfg", + "autocfg 1.0.1", "num-traits", ] @@ -1330,7 +1291,7 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" dependencies = [ - "autocfg", + "autocfg 1.0.1", ] [[package]] @@ -1369,9 +1330,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "ordered-float" -version = "2.0.1" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dacdec97876ef3ede8c50efc429220641a0b11ba0048b4b0c357bccbc47c5204" +checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" dependencies = [ "num-traits", ] @@ -1492,11 +1453,11 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.0.4" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95b70b68509f17aa2857863b6fa00bf21fc93674c7a8893de2f469f6aa7ca2f2" +checksum = "ee41d838744f60d959d7074e3afb6b35c7456d0f61cad38a24e35e6553f73841" dependencies = [ - "pin-project-internal 1.0.4", + "pin-project-internal 1.0.1", ] [[package]] @@ -1512,9 +1473,9 @@ dependencies = [ [[package]] name = "pin-project-internal" -version = "1.0.4" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caa25a6393f22ce819b0f50e0be89287292fda8d425be38ee0ca14c4931d9e71" +checksum = "81a4ffa594b66bff340084d4081df649a7dc049ac8d7fc458d8e628bfbbb2f86" dependencies = [ "proc-macro2", "quote", @@ -1527,12 +1488,6 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b" -[[package]] -name = "pin-project-lite" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827" - [[package]] name = "pin-utils" version = "0.1.0" @@ 
-1583,9 +1538,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro-nested" -version = "0.1.7" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" +checksum = "eba180dafb9038b050a4c280019bbedf9f2467b61e5d892dcad585bb57aadc5a" [[package]] name = "proc-macro2" @@ -1604,18 +1559,31 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] name = "quote" -version = "1.0.8" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" +checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" dependencies = [ "proc-macro2", ] [[package]] -name = "radium" -version = "0.5.3" +name = "rand" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" +checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" +dependencies = [ + "autocfg 0.1.7", + "libc", + "rand_chacha 0.1.1", + "rand_core 0.4.2", + "rand_hc 0.1.0", + "rand_isaac", + "rand_jitter", + "rand_os", + "rand_pcg 0.1.2", + "rand_xorshift", + "winapi 0.3.9", +] [[package]] name = "rand" @@ -1623,24 +1591,22 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ - "getrandom 0.1.16", + "getrandom", "libc", "rand_chacha 0.2.2", "rand_core 0.5.1", "rand_hc 0.2.0", - "rand_pcg", + "rand_pcg 0.2.1", ] [[package]] -name = "rand" -version = "0.8.2" +name = "rand_chacha" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18519b42a40024d661e1714153e9ad0c3de27cd495760ceb09710920f1098b1e" +checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" dependencies = [ - "libc", - "rand_chacha 0.3.0", - "rand_core 0.6.1", - "rand_hc 0.3.0", + "autocfg 0.1.7", + "rand_core 0.3.1", ] [[package]] @@ -1654,31 +1620,36 @@ dependencies = [ ] [[package]] -name = "rand_chacha" -version = "0.3.0" +name = "rand_core" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d" +checksum = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" dependencies = [ - "ppv-lite86", - "rand_core 0.6.1", + "rand_core 0.4.2", ] +[[package]] +name = "rand_core" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" + [[package]] name = "rand_core" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" dependencies = [ - "getrandom 0.1.16", + "getrandom", ] [[package]] -name = "rand_core" -version = "0.6.1" +name = "rand_hc" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c026d7df8b298d90ccbbc5190bd04d85e159eaf5576caeacf8741da93ccbd2e5" +checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" dependencies = [ - "getrandom 0.2.2", + "rand_core 0.3.1", ] [[package]] @@ -1691,12 +1662,47 @@ dependencies = [ ] [[package]] -name = "rand_hc" -version = "0.3.0" 
+name = "rand_isaac" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" +checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" dependencies = [ - "rand_core 0.6.1", + "rand_core 0.3.1", +] + +[[package]] +name = "rand_jitter" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" +dependencies = [ + "libc", + "rand_core 0.4.2", + "winapi 0.3.9", +] + +[[package]] +name = "rand_os" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" +dependencies = [ + "cloudabi", + "fuchsia-cprng", + "libc", + "rand_core 0.4.2", + "rdrand", + "winapi 0.3.9", +] + +[[package]] +name = "rand_pcg" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" +dependencies = [ + "autocfg 0.1.7", + "rand_core 0.4.2", ] [[package]] @@ -1708,13 +1714,22 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "rand_xorshift" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" +dependencies = [ + "rand_core 0.3.1", +] + [[package]] name = "rayon" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" dependencies = [ - "autocfg", + "autocfg 1.0.1", "crossbeam-deque", "either", "rayon-core", @@ -1728,25 +1743,31 @@ checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.1", + "crossbeam-utils 0.8.0", "lazy_static", "num_cpus", ] [[package]] -name = "redox_syscall" -version = "0.2.4" +name = "rdrand" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ec8ca9416c5ea37062b502703cd7fcb207736bc294f6e0cf367ac6fc234570" +checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" dependencies = [ - "bitflags", + "rand_core 0.3.1", ] [[package]] -name = "regex" -version = "1.4.3" +name = "redox_syscall" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a" +checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" + +[[package]] +name = "regex" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c" dependencies = [ "aho-corasick", "memchr", @@ -1765,9 +1786,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.22" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" +checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189" [[package]] name = "remove_dir_all" @@ -1786,9 +1807,9 @@ checksum = "21215c1b9d8f7832b433255bd9eea3e2779aa55b21b2f8e13aad62c74749b237" [[package]] name = "roaring" -version = "0.6.3" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f12bdbc3b9b2fd12148ee9f97f9e36438f1e84d3ce47fec0ad6b4bfbb62b3a35" +checksum = "99a260b0fb7df2095948f4a1d37afe5d1a08a2ccc7380f418cec049dc9560077" dependencies = [ "byteorder", ] @@ -1819,18 +1840,18 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "serde" -version = "1.0.120" +version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "166b2349061381baf54a58e4b13c89369feb0ef2eaa57198899e2312aac30aab" +checksum = "b88fa983de7720629c9387e9f517353ed404164b1e482c970a90c1a4aaf7dc1a" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.120" +version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca2a8cb5805ce9e3b95435e3765b7b553cecc762d938d409434338386cb5775" +checksum = "cbd1ae72adb44aab48f325a02444a5fc079349a8d804c1fc922aed3f7454c74e" dependencies = [ "proc-macro2", "quote", @@ -1839,9 +1860,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.61" +version = "1.0.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" +checksum = "dcac07dbffa1c65e7f816ab9eba78eb142c6d44410f4eeba1e26e4f5dfa56b95" dependencies = [ "indexmap", "itoa", @@ -1888,9 +1909,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.3.0" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6" +checksum = "ce32ea0c6c56d5eacaeb814fbed9960547021d3edd010ded1425f180536b20ab" dependencies = [ "libc", ] @@ -1925,38 +1946,33 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.6.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" +checksum = "fbee7696b84bbf3d89a1c2eccff0850e3047ed46bfcd2e92c29a2d074d57e252" [[package]] name = "snap" -version = "1.0.3" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98d3306e84bf86710d6cd8b4c9c3b721d5454cc91a603180f8f8cd06cfd317b4" +checksum = "da73c8f77aebc0e40c300b93f0a5f1bece7a248a36eee287d4e095f35c7b7d6e" [[package]] name = "socket2" -version = "0.3.19" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" +checksum = "b1fa70dc5c8104ec096f4fe7ede7a221d35ae13dcd19ba1ad9a81d2cab9a1c44" dependencies = [ - "cfg-if 1.0.0", + "cfg-if 0.1.10", "libc", + "redox_syscall", "winapi 0.3.9", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "stderrlog" -version = "0.5.1" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a53e2eff3e94a019afa6265e8ee04cb05b9d33fe9f5078b14e4e391d155a38" +checksum = "b02f316286ae558d83acc93dd81eaba096e746987a7961d4a9ae026842bae67f" dependencies = [ "atty", "chrono", @@ -1967,9 +1983,9 @@ dependencies = [ [[package]] name = "structopt" -version = "0.3.21" +version = "0.3.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5277acd7ee46e63e5168a80734c9f6ee81b1367a7d8772a2d765df2a3705d28c" +checksum = 
"126d630294ec449fae0b16f964e35bf3c74f940da9dca17ee9b905f7b3112eb8" dependencies = [ "clap", "lazy_static", @@ -1978,9 +1994,9 @@ dependencies = [ [[package]] name = "structopt-derive" -version = "0.4.14" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90" +checksum = "65e51c492f9e23a220534971ff5afc14037289de430e3c83f9daf6a1b6ae91e8" dependencies = [ "heck", "proc-macro-error", @@ -1991,9 +2007,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.58" +version = "1.0.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" +checksum = "cc371affeffc477f42a221a1e4297aedcea33d47d19b61455588bd9d8f6b19ac" dependencies = [ "proc-macro2", "quote", @@ -2021,21 +2037,15 @@ dependencies = [ "unicode-xid", ] -[[package]] -name = "tap" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36474e732d1affd3a6ed582781b3683df3d0563714c59c39591e8ff707cf078e" - [[package]] name = "tempfile" -version = "3.2.0" +version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" +checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" dependencies = [ - "cfg-if 1.0.0", + "cfg-if 0.1.10", "libc", - "rand 0.8.2", + "rand 0.7.3", "redox_syscall", "remove_dir_all", "winapi 0.3.9", @@ -2053,9 +2063,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.2" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +checksum = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f" dependencies = [ "winapi-util", ] @@ -2081,19 +2091,26 @@ dependencies = [ [[package]] name = "time" -version = "0.1.43" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" dependencies = [ "libc", + "wasi 0.10.0+wasi-snapshot-preview1", "winapi 0.3.9", ] [[package]] name = "tinyvec" -version = "1.1.0" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf8dbc19eb42fba10e8feaaec282fb50e2c14b2726d6301dbfeed0f73306a6f" +checksum = "238ce071d267c5710f9d31451efec16c5ee22de34df17cc05e56cbc92e967117" + +[[package]] +name = "tinyvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b78a366903f506d2ad52ca8dc552102ffdd3e937ba8a227f024dc1d1eae28575" dependencies = [ "tinyvec_macros", ] @@ -2106,11 +2123,11 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "0.2.24" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "099837d3464c16a808060bb3f02263b412f6fafcb5d01c533d309985fbeebe48" +checksum = "5d34ca54d84bf2b5b4d7d31e901a8464f7b60ac145a284fba25ceb801f2ddccd" dependencies = [ - "bytes 0.5.6", + "bytes", "fnv", "futures-core", "iovec", @@ -2121,7 +2138,7 @@ dependencies = [ "mio-named-pipes", "mio-uds", "num_cpus", - "pin-project-lite 0.1.11", + "pin-project-lite", "signal-hook-registry", "slab", "tokio-macros", @@ -2130,9 +2147,9 @@ dependencies = [ [[package]] name 
= "tokio-macros" -version = "0.2.6" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" +checksum = "f0c3acc6aa564495a0f2e1d59fab677cd7f81a19994cfc7f3ad0e64301560389" dependencies = [ "proc-macro2", "quote", @@ -2158,19 +2175,19 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be8242891f2b6cbef26a2d7e8605133c2c554cd35b3e4948ea892d6d68436499" dependencies = [ - "bytes 0.5.6", + "bytes", "futures-core", "futures-sink", "log", - "pin-project-lite 0.1.11", + "pin-project-lite", "tokio", ] [[package]] name = "toml" -version = "0.5.8" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +checksum = "75cf45bb0bef80604d001caaec0d09da99611b3c0fd39d3080468875cdb65645" dependencies = [ "serde", ] @@ -2183,13 +2200,13 @@ checksum = "e987b6bf443f4b5b3b6f38704195592cca41c5bb7aedd3c3693c7081f8289860" [[package]] name = "tracing" -version = "0.1.22" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f47026cdc4080c07e49b37087de021820269d996f581aac150ef9e5583eefe3" +checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27" dependencies = [ - "cfg-if 1.0.0", + "cfg-if 0.1.10", "log", - "pin-project-lite 0.2.4", + "pin-project-lite", "tracing-core", ] @@ -2224,9 +2241,9 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0308d80d86700c5878b9ef6321f020f29b1bb9d5ff3cab25e75e23f3a492a23" dependencies = [ - "base64 0.12.3", + "base64", "byteorder", - "bytes 0.5.6", + "bytes", "http", "httparse", "input_buffer", @@ -2278,18 +2295,18 @@ dependencies = [ [[package]] name = "unicode-normalization" -version = "0.1.16" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a13e63ab62dbe32aeee58d1c5408d35c36c392bba5d9d3142287219721afe606" +checksum = "6fb19cf769fa8c6a80a162df694621ebeb4dafb606470b2b2fce0be40a98a977" dependencies = [ - "tinyvec", + "tinyvec 0.3.4", ] [[package]] name = "unicode-segmentation" -version = "1.7.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0d2e7be6ae3a5fa87eed5fb451aff96f2573d2694942e40543ae0bbe19c796" +checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" [[package]] name = "unicode-width" @@ -2335,11 +2352,11 @@ checksum = "9071ac216321a4470a69fb2b28cfc68dcd1a39acd877c8be8e014df6772d8efa" [[package]] name = "uuid" -version = "0.8.2" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +checksum = "9fde2f6a4bea1d6e007c4ad38c6839fa71cbb63b6dbf5b595aa38dc9b1093c11" dependencies = [ - "getrandom 0.2.2", + "rand 0.7.3", ] [[package]] @@ -2364,7 +2381,7 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f41be6df54c97904af01aa23e613d4521eed7ab23537cede692d4058f6449407" dependencies = [ - "bytes 0.5.6", + "bytes", "futures", "headers", "http", @@ -2394,9 +2411,9 @@ checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" [[package]] name = "wasi" -version = "0.10.1+wasi-snapshot-preview1" +version = "0.10.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"93c6c3420963c5c64bca373b25e77acb562081b9bb4dd5bb864187742186cea9" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" [[package]] name = "whatlang" @@ -2460,12 +2477,6 @@ dependencies = [ "winapi-build", ] -[[package]] -name = "wyz" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" - [[package]] name = "zerocopy" version = "0.3.0" @@ -2489,18 +2500,18 @@ dependencies = [ [[package]] name = "zstd" -version = "0.5.4+zstd.1.4.7" +version = "0.5.3+zstd.1.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69996ebdb1ba8b1517f61387a883857818a66c8a295f487b1ffd8fd9d2c82910" +checksum = "01b32eaf771efa709e8308605bbf9319bf485dc1503179ec0469b611937c0cd8" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "2.0.6+zstd.1.4.7" +version = "2.0.5+zstd.1.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98aa931fb69ecee256d44589d19754e61851ae4769bf963b385119b1cc37a49e" +checksum = "1cfb642e0d27f64729a639c52db457e0ae906e7bc6f5fe8f5c453230400f1055" dependencies = [ "libc", "zstd-sys", @@ -2508,9 +2519,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "1.4.18+zstd.1.4.7" +version = "1.4.17+zstd.1.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1e6e8778706838f43f771d80d37787cb2fe06dafe89dd3aebaf6721b9eaec81" +checksum = "b89249644df056b522696b1bb9e7c18c87e8ffa3e2f0dc3b0155875d6498f01b" dependencies = [ "cc", "glob", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 5fcdf9caf..92932cb00 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -22,6 +22,7 @@ tempfile = "3.1.0" askama = "0.10.1" askama_warp = "0.10.0" bytes = "0.5.6" +either = "1.6.1" flate2 = "1.0.19" futures = "0.3.6" serde = { version = "1.0", features = ["derive"] } diff --git a/http-ui/public/script.js b/http-ui/public/script.js index 6a76fe859..f935cc07d 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -2,9 +2,10 @@ var request = null; var timeoutID = null; var selected_facets = {}; -$('#query, #facet').on('input', function () { +$('#query, #filters').on('input', function () { var query = $('#query').val(); - var facet = $('#facet').val(); + var filters = $('#filters').val(); + var facet_filters = selectedFacetsToArray(selected_facets); var timeoutMs = 100; if (timeoutID !== null) { @@ -17,7 +18,10 @@ $('#query, #facet').on('input', function () { url: "query", contentType: 'application/json', data: JSON.stringify({ - 'query': query, 'facetCondition': facet, "facetDistribution": true + 'query': query, + 'filters': filters, + 'facetFilters': facet_filters, + "facetDistribution": true, }), contentType: 'application/json', success: function (data, textStatus, request) { @@ -41,7 +45,20 @@ $('#query, #facet').on('input', function () { // Create the select element let select = $(``); - for (value of data.facets[facet_name]) { + let selected_values = selected_facets[facet_name] || []; + // Create the previously selected facets (mark them as selected) + for (value of selected_values) { + let option = $('') + .text(value) + .attr('selected', "selected") + .attr('value', value) + .attr('title', value); + select.append(option); + } + + // Create the newly discovered facets + let diff = diffArray(data.facets[facet_name], selected_values); + for (value of diff) { let option = $('') .text(value) .attr('value', value) @@ -53,7 +70,6 @@ $('#query, 
#facet').on('input', function () { $('#facets').append(div); } - for (element of data.documents) { const elem = document.createElement('li'); elem.classList.add("document"); @@ -87,8 +103,8 @@ $('#query, #facet').on('input', function () { $('#facets select').on('change', function(e) { let facet_name = $(this).attr('data-facet-name'); selected_facets[facet_name] = $(this).val(); + $('#query').trigger('input'); }); - }, beforeSend: function () { if (request !== null) { @@ -100,6 +116,25 @@ $('#query, #facet').on('input', function () { }, timeoutMs); }); +function diffArray(arr1, arr2) { + return arr1.concat(arr2).filter(function (val) { + if (!(arr1.includes(val) && arr2.includes(val))) + return val; + }); +} + +function selectedFacetsToArray(facets_obj) { + var array = []; + for (const facet_name in facets_obj) { + var subarray = []; + for (const facet_value of facets_obj[facet_name]) { + subarray.push(`${facet_name}:${facet_value}`); + } + array.push(subarray); + } + return array; +} + // Make the number of document a little bit prettier $('#docs-count').text(function(index, text) { return parseInt(text).toLocaleString() diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 46f81ab5e..6e4f42a4e 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -11,6 +11,7 @@ use std::{mem, io}; use askama_warp::Template; use byte_unit::Byte; +use either::Either; use flate2::read::GzDecoder; use futures::stream; use futures::{FutureExt, StreamExt}; @@ -620,12 +621,29 @@ async fn main() -> anyhow::Result<()> { .body(include_str!("../public/logo-black.svg")) ); + #[derive(Debug, Deserialize)] + #[serde(untagged)] + enum UntaggedEither { + Left(L), + Right(R), + } + + impl From> for Either { + fn from(value: UntaggedEither) -> Either { + match value { + UntaggedEither::Left(left) => Either::Left(left), + UntaggedEither::Right(right) => Either::Right(right), + } + } + } + #[derive(Debug, Deserialize)] #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] struct QueryBody { query: Option, - facet_condition: Option, + filters: Option, + facet_filters: Option, String>>>, facet_distribution: Option, } @@ -651,11 +669,32 @@ async fn main() -> anyhow::Result<()> { if let Some(query) = query.query { search.query(query); } - if let Some(condition) = query.facet_condition { - if !condition.trim().is_empty() { - let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap(); - search.facet_condition(condition); - } + + let filters = match query.filters { + Some(condition) if !condition.trim().is_empty() => { + Some(FacetCondition::from_str(&rtxn, &index, &condition).unwrap()) + }, + _otherwise => None, + }; + + let facet_filters = match query.facet_filters { + Some(array) => { + let eithers = array.into_iter().map(Into::into); + FacetCondition::from_array(&rtxn, &index, eithers).unwrap() + }, + _otherwise => None, + }; + + let condition = match (filters, facet_filters) { + (Some(filters), Some(facet_filters)) => { + Some(FacetCondition::And(Box::new(filters), Box::new(facet_filters))) + }, + (Some(condition), None) | (None, Some(condition)) => Some(condition), + _otherwise => None, + }; + + if let Some(condition) = condition { + search.facet_condition(condition); } let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap(); diff --git a/http-ui/templates/index.html b/http-ui/templates/index.html index 114652e7b..adb202b3a 100644 --- a/http-ui/templates/index.html +++ b/http-ui/templates/index.html @@ -56,7 +56,7 @@
[input line lost in extraction: the hunk renames the filter box from id="facet" to id="filters", matching the new $('#query, #filters') selector in script.js]
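
The facetFilters parameter wired in above accepts, for each entry, either a bare key:value string or an array of such strings (an OR group, per the from_array logic that chains array members with Or and entries with And), which is why the route first deserializes into an untagged enum and only then converts to either::Either. The following is a minimal, self-contained sketch of that serde pattern, not the exact http-ui code; it assumes the either, serde (with derive), and serde_json crates, and the sample facet values are illustrative:

    use either::Either;
    use serde::Deserialize;

    // With #[serde(untagged)] serde tries the variants in declaration order,
    // so a JSON array binds to Left(Vec<String>) and a bare JSON string
    // falls through to Right(String).
    #[derive(Debug, Deserialize)]
    #[serde(untagged)]
    enum UntaggedEither<L, R> {
        Left(L),
        Right(R),
    }

    impl<L, R> From<UntaggedEither<L, R>> for Either<L, R> {
        fn from(value: UntaggedEither<L, R>) -> Either<L, R> {
            match value {
                UntaggedEither::Left(left) => Either::Left(left),
                UntaggedEither::Right(right) => Either::Right(right),
            }
        }
    }

    fn main() -> serde_json::Result<()> {
        type Entry = UntaggedEither<Vec<String>, String>;
        // An array entry expresses an OR group, a string entry a single clause.
        let group: Entry = serde_json::from_str(r#"["genre:drama", "genre:comedy"]"#)?;
        let single: Entry = serde_json::from_str(r#""director:Spike Lee""#)?;
        println!("{:?} / {:?}", Either::from(group), Either::from(single));
        Ok(())
    }

The From impl mirrors the one the patch adds to http-ui/src/main.rs, where the converted eithers are handed straight to FacetCondition::from_array.
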
From d893e83622018d15633e32190a93ad6fe6152a32 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 6 Jan 2021 15:10:30 +0100 Subject: [PATCH 0432/1889] Speed-up facet aggregation by using a FacetIter --- src/search/facet/facet_distribution.rs | 137 ++++++++++++++++--------- src/search/facet/mod.rs | 39 ++++++- src/search/mod.rs | 8 +- 3 files changed, 124 insertions(+), 60 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 2ee297fa2..8256d1234 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -9,7 +9,7 @@ use serde_json::Value; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; -use crate::search::facet::FacetRange; +use crate::search::facet::{FacetIter, FacetRange}; use crate::{Index, FieldId}; pub struct FacetDistribution<'a> { @@ -41,61 +41,99 @@ impl<'a> FacetDistribution<'a> { } fn facet_values(&self, field_id: FieldId, facet_type: FacetType) -> heed::Result> { - if let Some(candidates) = self.candidates.as_ref().filter(|c| c.len() <= 1000) { - let mut key_buffer = vec![field_id]; - match facet_type { - FacetType::Float => { - let mut facet_values = HashSet::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? - .remap_key_type::(); - for result in iter { - let ((_, _, value), ()) = result?; - facet_values.insert(OrderedFloat(value)); + if let Some(candidates) = self.candidates.as_ref() { + if candidates.len() <= 1000 { + let mut key_buffer = vec![field_id]; + match facet_type { + FacetType::Float => { + let mut facet_values = HashSet::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + for result in iter { + let ((_, _, value), ()) = result?; + facet_values.insert(OrderedFloat(value)); + } } - } - Ok(facet_values.into_iter().map(|f| Value::from(*f)).collect()) - }, - FacetType::Integer => { - let mut facet_values = HashSet::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? - .remap_key_type::(); - for result in iter { - let ((_, _, value), ()) = result?; - facet_values.insert(value); + Ok(facet_values.into_iter().map(|f| Value::from(*f)).collect()) + }, + FacetType::Integer => { + let mut facet_values = HashSet::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + for result in iter { + let ((_, _, value), ()) = result?; + facet_values.insert(value); + } } - } - Ok(facet_values.into_iter().map(Value::from).collect()) - }, - FacetType::String => { - let mut facet_values = HashSet::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? 
- .remap_key_type::(); - for result in iter { - let ((_, _, value), ()) = result?; - facet_values.insert(value); + Ok(facet_values.into_iter().map(Value::from).collect()) + }, + FacetType::String => { + let mut facet_values = HashSet::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + for result in iter { + let ((_, _, value), ()) = result?; + facet_values.insert(value); + } } + Ok(facet_values.into_iter().map(Value::from).collect()) + }, + } + } else { + let iter = match facet_type { + FacetType::String => { + let db = self.index.facet_field_id_value_docids; + let iter = db + .prefix_iter(self.rtxn, &[field_id])? + .remap_key_type::() + .map(|r| r.map(|((_, v), docids)| (Value::from(v), docids))); + Box::new(iter) as Box::> + }, + FacetType::Integer => { + let iter = FacetIter::::new_non_reducing( + self.rtxn, self.index, field_id, candidates.clone(), + )?; + Box::new(iter.map(|r| r.map(|(v, docids)| (Value::from(v), docids)))) + }, + FacetType::Float => { + let iter = FacetIter::::new_non_reducing( + self.rtxn, self.index, field_id, candidates.clone(), + )?; + Box::new(iter.map(|r| r.map(|(v, docids)| (Value::from(v), docids)))) + }, + }; + + let mut facet_values = Vec::new(); + for result in iter { + let (value, docids) = result?; + if self.candidates.as_ref().map_or(true, |c| docids.is_disjoint(c)) { + facet_values.push(value); } - Ok(facet_values.into_iter().map(Value::from).collect()) - }, + if facet_values.len() == self.max_values_by_facet { + break; + } + } + + Ok(facet_values) } } else { let db = self.index.facet_field_id_value_docids; let iter = match facet_type { FacetType::String => { let iter = db - .prefix_iter(&self.rtxn, &[field_id])? + .prefix_iter(self.rtxn, &[field_id])? .remap_key_type::() .map(|r| r.map(|((_, v), docids)| (Value::from(v), docids))); Box::new(iter) as Box::> @@ -119,11 +157,8 @@ impl<'a> FacetDistribution<'a> { let mut facet_values = Vec::new(); for result in iter { let (value, docids) = result?; - match &self.candidates { - Some(candidates) => if !docids.is_disjoint(candidates) { - facet_values.push(value); - }, - None => facet_values.push(value), + if self.candidates.as_ref().map_or(true, |c| docids.is_disjoint(c)) { + facet_values.push(value); } if facet_values.len() == self.max_values_by_facet { break; diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index 70b5b4658..e5b06185f 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -147,6 +147,7 @@ pub struct FacetIter<'t, T: 't, KC> { db: Database, field_id: FieldId, level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t, T, KC>>)>, + must_reduce: bool, } impl<'t, T, KC> FacetIter<'t, T, KC> @@ -155,7 +156,10 @@ where KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, T: PartialOrd + Copy + Bounded, { - pub fn new( + /// Create a `FacetIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. 
+ pub fn new_reducing( rtxn: &'t heed::RoTxn, index: &'t Index, field_id: FieldId, @@ -165,10 +169,14 @@ where let db = index.facet_field_id_value_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Left(highest_iter))] }) + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) } - pub fn new_reverse( + /// Create a `FacetIter` that will iterate on the different facet entries in reverse + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. + pub fn new_reverse_reducing( rtxn: &'t heed::RoTxn, index: &'t Index, field_id: FieldId, @@ -178,7 +186,26 @@ where let db = index.facet_field_id_value_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Right(highest_iter))] }) + let level_iters = vec![(documents_ids, Right(highest_iter))]; + Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) + } + + /// Create a `FacetIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will not reduce the given documents ids + /// while iterating on the different facet levels, possibly returning multiple times + /// a document id associated with multiple facet values. + pub fn new_non_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> + { + let db = index.facet_field_id_value_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) } fn highest_level(rtxn: &'t heed::RoTxn, db: Database, fid: FieldId) -> heed::Result> { @@ -216,7 +243,9 @@ where docids.intersect_with(&documents_ids); if !docids.is_empty() { - documents_ids.difference_with(&docids); + if self.must_reduce { + documents_ids.difference_with(&docids); + } if level == 0 { debug!("found {:?} at {:?}", docids, left); diff --git a/src/search/mod.rs b/src/search/mod.rs index 05999caed..459b301a6 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -189,9 +189,9 @@ impl<'a> Search<'a> { } } else { let facet_fn = if ascending { - FacetIter::::new + FacetIter::::new_reducing } else { - FacetIter::::new_reverse + FacetIter::::new_reverse_reducing }; let mut limit_tmp = limit; let mut output = Vec::new(); @@ -226,9 +226,9 @@ impl<'a> Search<'a> { } } else { let facet_fn = if ascending { - FacetIter::::new + FacetIter::::new_reducing } else { - FacetIter::::new_reverse + FacetIter::::new_reverse_reducing }; let mut limit_tmp = limit; let mut output = Vec::new(); From 51a37de885c0d0e7fd82273153eae58c96be966d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 13 Jan 2021 11:30:27 +0100 Subject: [PATCH 0433/1889] Introduce the FacetValue enum type --- Cargo.lock | 1 + Cargo.toml | 2 +- src/facet/mod.rs | 2 ++ 3 files changed, 4 insertions(+), 1 
deletion(-) diff --git a/Cargo.lock b/Cargo.lock index d98b7fd8e..1296a517f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -869,6 +869,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dacdec97876ef3ede8c50efc429220641a0b11ba0048b4b0c357bccbc47c5204" dependencies = [ "num-traits", + "serde", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 52b25cfde..2e4ea13a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", num-traits = "0.2.14" obkv = "0.1.0" once_cell = "1.4.0" -ordered-float = "2.0.0" +ordered-float = { version = "2.0.0", features = ["serde"] } rayon = "1.3.1" regex = "1.4.2" ringtail = "0.3.0" diff --git a/src/facet/mod.rs b/src/facet/mod.rs index 9ec99f2d3..274d2588d 100644 --- a/src/facet/mod.rs +++ b/src/facet/mod.rs @@ -1,4 +1,6 @@ mod facet_type; +mod facet_value; pub mod value_encoding; pub use self::facet_type::FacetType; +pub use self::facet_value::FacetValue; From 4b9e81fc89c826827bf2a2042bd6201fdfeb04d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 13 Jan 2021 11:59:16 +0100 Subject: [PATCH 0434/1889] Order the facet values lexicographically --- src/facet/facet_value.rs | 40 ++++++++++++++++++++ src/search/facet/facet_distribution.rs | 52 +++++++++++++------------- 2 files changed, 65 insertions(+), 27 deletions(-) create mode 100644 src/facet/facet_value.rs diff --git a/src/facet/facet_value.rs b/src/facet/facet_value.rs new file mode 100644 index 000000000..823eddcee --- /dev/null +++ b/src/facet/facet_value.rs @@ -0,0 +1,40 @@ +use ordered_float::OrderedFloat; +use serde::{Serialize, Deserialize}; + +#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +#[derive(Serialize, Deserialize)] +pub enum FacetValue { + String(String), + Float(OrderedFloat), + Integer(i64), +} + +impl From for FacetValue { + fn from(string: String) -> FacetValue { + FacetValue::String(string) + } +} + +impl From<&str> for FacetValue { + fn from(string: &str) -> FacetValue { + FacetValue::String(string.to_owned()) + } +} + +impl From for FacetValue { + fn from(float: f64) -> FacetValue { + FacetValue::Float(OrderedFloat(float)) + } +} + +impl From> for FacetValue { + fn from(float: OrderedFloat) -> FacetValue { + FacetValue::Float(float) + } +} + +impl From for FacetValue { + fn from(integer: i64) -> FacetValue { + FacetValue::Integer(integer) + } +} diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 8256d1234..244169fc2 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -1,12 +1,10 @@ -use std::collections::{HashSet, HashMap}; -use std::{cmp, fmt}; +use std::collections::{HashSet, BTreeSet, BTreeMap}; use std::ops::Bound::Unbounded; +use std::{cmp, fmt}; -use ordered_float::OrderedFloat; use roaring::RoaringBitmap; -use serde_json::Value; -use crate::facet::FacetType; +use crate::facet::{FacetType, FacetValue}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::search::facet::{FacetIter, FacetRange}; @@ -40,13 +38,13 @@ impl<'a> FacetDistribution<'a> { self } - fn facet_values(&self, field_id: FieldId, facet_type: FacetType) -> heed::Result> { + fn facet_values(&self, field_id: FieldId, facet_type: FacetType) -> heed::Result> { if let Some(candidates) = 
self.candidates.as_ref() { if candidates.len() <= 1000 { let mut key_buffer = vec![field_id]; match facet_type { FacetType::Float => { - let mut facet_values = HashSet::new(); + let mut facet_values = BTreeSet::new(); for docid in candidates { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); @@ -55,13 +53,13 @@ impl<'a> FacetDistribution<'a> { .remap_key_type::(); for result in iter { let ((_, _, value), ()) = result?; - facet_values.insert(OrderedFloat(value)); + facet_values.insert(FacetValue::from(value)); } } - Ok(facet_values.into_iter().map(|f| Value::from(*f)).collect()) + Ok(facet_values) }, FacetType::Integer => { - let mut facet_values = HashSet::new(); + let mut facet_values = BTreeSet::new(); for docid in candidates { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); @@ -70,13 +68,13 @@ impl<'a> FacetDistribution<'a> { .remap_key_type::(); for result in iter { let ((_, _, value), ()) = result?; - facet_values.insert(value); + facet_values.insert(FacetValue::from(value)); } } - Ok(facet_values.into_iter().map(Value::from).collect()) + Ok(facet_values) }, FacetType::String => { - let mut facet_values = HashSet::new(); + let mut facet_values = BTreeSet::new(); for docid in candidates { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); @@ -85,10 +83,10 @@ impl<'a> FacetDistribution<'a> { .remap_key_type::(); for result in iter { let ((_, _, value), ()) = result?; - facet_values.insert(value); + facet_values.insert(FacetValue::from(value)); } } - Ok(facet_values.into_iter().map(Value::from).collect()) + Ok(facet_values) }, } } else { @@ -98,28 +96,28 @@ impl<'a> FacetDistribution<'a> { let iter = db .prefix_iter(self.rtxn, &[field_id])? .remap_key_type::() - .map(|r| r.map(|((_, v), docids)| (Value::from(v), docids))); + .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); Box::new(iter) as Box::> }, FacetType::Integer => { let iter = FacetIter::::new_non_reducing( self.rtxn, self.index, field_id, candidates.clone(), )?; - Box::new(iter.map(|r| r.map(|(v, docids)| (Value::from(v), docids)))) + Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))) }, FacetType::Float => { let iter = FacetIter::::new_non_reducing( self.rtxn, self.index, field_id, candidates.clone(), )?; - Box::new(iter.map(|r| r.map(|(v, docids)| (Value::from(v), docids)))) + Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))) }, }; - let mut facet_values = Vec::new(); + let mut facet_values = BTreeSet::new(); for result in iter { let (value, docids) = result?; if self.candidates.as_ref().map_or(true, |c| docids.is_disjoint(c)) { - facet_values.push(value); + facet_values.insert(value); } if facet_values.len() == self.max_values_by_facet { break; @@ -135,7 +133,7 @@ impl<'a> FacetDistribution<'a> { let iter = db .prefix_iter(self.rtxn, &[field_id])? 
.remap_key_type::() - .map(|r| r.map(|((_, v), docids)| (Value::from(v), docids))); + .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); Box::new(iter) as Box::> }, FacetType::Integer => { @@ -143,22 +141,22 @@ impl<'a> FacetDistribution<'a> { let range = FacetRange::::new( self.rtxn, db, field_id, 0, Unbounded, Unbounded, )?; - Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (Value::from(v), docids)))) + Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) }, FacetType::Float => { let db = db.remap_key_type::(); let range = FacetRange::::new( self.rtxn, db, field_id, 0, Unbounded, Unbounded, )?; - Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (Value::from(v), docids)))) + Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) }, }; - let mut facet_values = Vec::new(); + let mut facet_values = BTreeSet::new(); for result in iter { let (value, docids) = result?; if self.candidates.as_ref().map_or(true, |c| docids.is_disjoint(c)) { - facet_values.push(value); + facet_values.insert(value); } if facet_values.len() == self.max_values_by_facet { break; @@ -169,7 +167,7 @@ impl<'a> FacetDistribution<'a> { } } - pub fn execute(&self) -> heed::Result>> { + pub fn execute(&self) -> heed::Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let faceted_fields = self.index.faceted_fields(self.rtxn)?; let fields_ids: Vec<_> = match &self.facets { @@ -182,7 +180,7 @@ impl<'a> FacetDistribution<'a> { None => faceted_fields.iter().map(|(id, t)| (*id, *t)).collect(), }; - let mut facets_values = HashMap::new(); + let mut facets_values = BTreeMap::new(); for (fid, ftype) in fields_ids { let facet_name = fields_ids_map.name(fid).unwrap(); let values = self.facet_values(fid, ftype)?; From 7be275b6925d36e3a49dfaf8dcb98ab7a789af15 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 26 Jan 2021 14:14:37 +0100 Subject: [PATCH 0435/1889] Add the count to the facet distribution --- http-ui/Cargo.lock | 1 + http-ui/public/script.js | 2 +- http-ui/src/main.rs | 5 ++-- src/facet/facet_value.rs | 22 +++++++++++++-- src/search/facet/facet_distribution.rs | 38 ++++++++++++++------------ 5 files changed, 46 insertions(+), 22 deletions(-) diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index 1ef0ccfbd..cb5103003 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -1335,6 +1335,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" dependencies = [ "num-traits", + "serde", ] [[package]] diff --git a/http-ui/public/script.js b/http-ui/public/script.js index f935cc07d..9887f06ce 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -57,7 +57,7 @@ $('#query, #filters').on('input', function () { } // Create the newly discovered facets - let diff = diffArray(data.facets[facet_name], selected_values); + let diff = diffArray(Object.keys(data.facets[facet_name]), selected_values); for (value of diff) { let option = $('') .text(value) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 6e4f42a4e..54f87c3e6 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,4 +1,4 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fmt::Display; use std::fs::{File, create_dir_all}; use std::net::SocketAddr; @@ -29,6 +29,7 @@ use warp::filters::ws::Message; use warp::{Filter, http::Response}; use meilisearch_tokenizer::{Analyzer, 
AnalyzerConfig}; +use milli::facet::FacetValue; use milli::update::UpdateIndexingStep::*; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; @@ -652,7 +653,7 @@ async fn main() -> anyhow::Result<()> { struct Answer { documents: Vec>, number_of_candidates: u64, - facets: HashMap>, + facets: BTreeMap>, } let disable_highlighting = opt.disable_highlighting; diff --git a/src/facet/facet_value.rs b/src/facet/facet_value.rs index 823eddcee..9630c6634 100644 --- a/src/facet/facet_value.rs +++ b/src/facet/facet_value.rs @@ -1,8 +1,7 @@ use ordered_float::OrderedFloat; -use serde::{Serialize, Deserialize}; +use serde::{Serialize, Serializer}; #[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] -#[derive(Serialize, Deserialize)] pub enum FacetValue { String(String), Float(OrderedFloat), @@ -38,3 +37,22 @@ impl From for FacetValue { FacetValue::Integer(integer) } } + +impl Serialize for FacetValue { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + FacetValue::String(string) => serializer.serialize_str(string), + FacetValue::Float(float) => { + let string = float.to_string(); + serializer.serialize_str(&string) + }, + FacetValue::Integer(integer) => { + let string = integer.to_string(); + serializer.serialize_str(&string) + }, + } + } +} diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 244169fc2..55b529308 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -1,4 +1,4 @@ -use std::collections::{HashSet, BTreeSet, BTreeMap}; +use std::collections::{HashSet, BTreeMap}; use std::ops::Bound::Unbounded; use std::{cmp, fmt}; @@ -38,13 +38,18 @@ impl<'a> FacetDistribution<'a> { self } - fn facet_values(&self, field_id: FieldId, facet_type: FacetType) -> heed::Result> { + fn facet_values( + &self, + field_id: FieldId, + facet_type: FacetType, + ) -> heed::Result> + { if let Some(candidates) = self.candidates.as_ref() { if candidates.len() <= 1000 { let mut key_buffer = vec![field_id]; match facet_type { FacetType::Float => { - let mut facet_values = BTreeSet::new(); + let mut facet_values = BTreeMap::new(); for docid in candidates { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); @@ -53,13 +58,13 @@ impl<'a> FacetDistribution<'a> { .remap_key_type::(); for result in iter { let ((_, _, value), ()) = result?; - facet_values.insert(FacetValue::from(value)); + *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; } } Ok(facet_values) }, FacetType::Integer => { - let mut facet_values = BTreeSet::new(); + let mut facet_values = BTreeMap::new(); for docid in candidates { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); @@ -68,13 +73,13 @@ impl<'a> FacetDistribution<'a> { .remap_key_type::(); for result in iter { let ((_, _, value), ()) = result?; - facet_values.insert(FacetValue::from(value)); + *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; } } Ok(facet_values) }, FacetType::String => { - let mut facet_values = BTreeSet::new(); + let mut facet_values = BTreeMap::new(); for docid in candidates { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); @@ -83,7 +88,7 @@ impl<'a> FacetDistribution<'a> { .remap_key_type::(); for result in iter { let ((_, _, value), ()) = result?; - facet_values.insert(FacetValue::from(value)); + 
*facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; } } Ok(facet_values) @@ -113,11 +118,12 @@ impl<'a> FacetDistribution<'a> { }, }; - let mut facet_values = BTreeSet::new(); + let mut facet_values = BTreeMap::new(); for result in iter { - let (value, docids) = result?; - if self.candidates.as_ref().map_or(true, |c| docids.is_disjoint(c)) { - facet_values.insert(value); + let (value, mut docids) = result?; + docids.intersect_with(candidates); + if !docids.is_empty() { + facet_values.insert(value, docids.len()); } if facet_values.len() == self.max_values_by_facet { break; @@ -152,12 +158,10 @@ impl<'a> FacetDistribution<'a> { }, }; - let mut facet_values = BTreeSet::new(); + let mut facet_values = BTreeMap::new(); for result in iter { let (value, docids) = result?; - if self.candidates.as_ref().map_or(true, |c| docids.is_disjoint(c)) { - facet_values.insert(value); - } + facet_values.insert(value, docids.len()); if facet_values.len() == self.max_values_by_facet { break; } @@ -167,7 +171,7 @@ impl<'a> FacetDistribution<'a> { } } - pub fn execute(&self) -> heed::Result>> { + pub fn execute(&self) -> heed::Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let faceted_fields = self.index.faceted_fields(self.rtxn)?; let fields_ids: Vec<_> = match &self.facets { From b0c31500fc0e6bce9ad8ffa86aea4f6b31902e56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 20 Jan 2021 13:28:34 +0100 Subject: [PATCH 0436/1889] Simplify the front page --- http-ui/public/script.js | 65 ++++++++++++------------------------ http-ui/public/style.css | 27 +++++++++++++++ http-ui/templates/index.html | 20 ++++------- 3 files changed, 56 insertions(+), 56 deletions(-) diff --git a/http-ui/public/script.js b/http-ui/public/script.js index 9887f06ce..4a16e8fc3 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -1,11 +1,9 @@ var request = null; var timeoutID = null; -var selected_facets = {}; $('#query, #filters').on('input', function () { var query = $('#query').val(); var filters = $('#filters').val(); - var facet_filters = selectedFacetsToArray(selected_facets); var timeoutMs = 100; if (timeoutID !== null) { @@ -20,12 +18,11 @@ $('#query, #filters').on('input', function () { data: JSON.stringify({ 'query': query, 'filters': filters, - 'facetFilters': facet_filters, "facetDistribution": true, }), contentType: 'application/json', success: function (data, textStatus, request) { - documents.innerHTML = ''; + results.innerHTML = ''; facets.innerHTML = ''; let timeSpent = request.getResponseHeader('Time-Ms'); @@ -35,39 +32,15 @@ $('#query, #filters').on('input', function () { time.classList.remove('fade-in-out'); for (facet_name in data.facets) { - // Append an header to the list of facets - let upperCaseName = facet_name.charAt(0).toUpperCase() + facet_name.slice(1); - $("
<h2>
").text(upperCaseName).appendTo($('#facets')); - - // Create a div for a bulma select - const header = document.createElement('div'); - let div = $("
").addClass('select is-multiple'); - - // Create the select element - let select = $(``); - let selected_values = selected_facets[facet_name] || []; - // Create the previously selected facets (mark them as selected) - for (value of selected_values) { - let option = $('') - .text(value) - .attr('selected', "selected") - .attr('value', value) - .attr('title', value); - select.append(option); + for (value in data.facets[facet_name]) { + const elem = document.createElement('span'); + const count = data.facets[facet_name][value]; + elem.classList.add("tag"); + elem.setAttribute('data-name', facet_name); + elem.setAttribute('data-value', value); + elem.innerHTML = `${facet_name}:${value} (${count})`; + facets.appendChild(elem); } - - // Create the newly discovered facets - let diff = diffArray(Object.keys(data.facets[facet_name]), selected_values); - for (value of diff) { - let option = $('') - .text(value) - .attr('value', value) - .attr('title', value); - select.append(option); - } - - div.append(select); - $('#facets').append(div); } for (element of data.documents) { @@ -95,15 +68,21 @@ $('#query, #filters').on('input', function () { } elem.appendChild(ol); - documents.appendChild(elem); + results.appendChild(elem); } - // When we click on a facet value we change the global values - // to make sure that we don't loose the selection between requests. - $('#facets select').on('change', function(e) { - let facet_name = $(this).attr('data-facet-name'); - selected_facets[facet_name] = $(this).val(); - $('#query').trigger('input'); + // When we click on a tag we append the facet value + // at the end of the facet query. + $('#facets .tag').on('click', function () { + let name = $(this).attr("data-name"); + let value = $(this).attr("data-value"); + + let facet_query = $('#filters').val().trim(); + if (facet_query === "") { + $('#filters').val(`${name} = "${value}"`).trigger('input'); + } else { + $('#filters').val(`${facet_query} AND ${name} = "${value}"`).trigger('input'); + } }); }, beforeSend: function () { diff --git a/http-ui/public/style.css b/http-ui/public/style.css index c7ce75537..1de348082 100644 --- a/http-ui/public/style.css +++ b/http-ui/public/style.css @@ -1,5 +1,24 @@ #results { + max-width: 900px; margin: 20px auto 0 auto; + padding: 0; +} + +#facets .tag { + margin-right: 1em; + margin-bottom: 1em; +} + +#facets { + max-width: 900px; + margin: 20px auto 0 auto; + padding: 0; + max-height: 16em; + overflow: scroll; +} + +#facets .tag:hover { + cursor: pointer; } #logo-white { @@ -61,6 +80,14 @@ opacity: 0.7; } +.content { + max-width: 65%; + flex: 0 0 65%; + box-sizing: border-box; + padding-left: 10px; + color: rgba(0,0,0,.9); +} + .content mark { background-color: hsl(204, 86%, 88%); color: hsl(204, 86%, 25%); diff --git a/http-ui/templates/index.html b/http-ui/templates/index.html index adb202b3a..83b1a3e49 100644 --- a/http-ui/templates/index.html +++ b/http-ui/templates/index.html @@ -84,20 +84,14 @@
[16 markup lines lost in extraction: per the hunk header, 20 template lines become 14, dropping the per-facet <select> controls in favor of the plain #results list and #facets tag container that the new script.js and style.css target]
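
With the select boxes gone, clicking a tag now just grows the plain-text filter: the script above appends a `name = "value"` clause joined by AND, and the server keeps parsing the result with FacetCondition::from_str. A rough Rust rendering of that string-building step, to make the quoting explicit; this is a sketch with invented field names, not code from the repository:

    /// Append one facet clause to an existing filter expression, the way the
    /// front page does when a tag is clicked. `{:?}` supplies the surrounding
    /// double quotes; values with exotic characters may need engine-specific
    /// escaping beyond this.
    fn append_facet_to_filters(filters: &str, name: &str, value: &str) -> String {
        let clause = format!("{} = {:?}", name, value);
        if filters.trim().is_empty() {
            clause
        } else {
            format!("{} AND {}", filters.trim(), clause)
        }
    }

    fn main() {
        let mut filters = String::new();
        filters = append_facet_to_filters(&filters, "director", "Spike Lee");
        filters = append_facet_to_filters(&filters, "genre", "drama");
        assert_eq!(filters, r#"director = "Spike Lee" AND genre = "drama""#);
    }
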
From 916dd3b7c5435c3b6408136e8f1399a2d63ba7ee Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 26 Jan 2021 14:12:16 +0100 Subject: [PATCH 0437/1889] Use the faceted_fields_ids method to fetch the ids --- src/search/facet/facet_condition.rs | 2 +- src/search/facet/facet_distribution.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search/facet/facet_condition.rs b/src/search/facet/facet_condition.rs index 762134759..19542a3ab 100644 --- a/src/search/facet/facet_condition.rs +++ b/src/search/facet/facet_condition.rs @@ -176,7 +176,7 @@ impl FacetCondition { } let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; + let faceted_fields = index.faceted_fields_ids(rtxn)?; let mut ands = None; for either in array { diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 55b529308..06c84bf17 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -173,7 +173,7 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> heed::Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let faceted_fields = self.index.faceted_fields(self.rtxn)?; + let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; let fields_ids: Vec<_> = match &self.facets { Some(names) => { names.iter().filter_map(|n| { From 61dbcfa44ad777d44fdca205815b8c6ee067ccad Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 26 Jan 2021 14:38:43 +0100 Subject: [PATCH 0438/1889] Bump the roaring to 0.6.4 --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- http-ui/Cargo.lock | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1296a517f..5f0d68ac4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1212,9 +1212,9 @@ checksum = "21215c1b9d8f7832b433255bd9eea3e2779aa55b21b2f8e13aad62c74749b237" [[package]] name = "roaring" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f12bdbc3b9b2fd12148ee9f97f9e36438f1e84d3ce47fec0ad6b4bfbb62b3a35" +checksum = "4d60b41c8f25d07cecab125cb46ebbf234fc055effc61ca2392a3ef4f9422304" dependencies = [ "byteorder", ] diff --git a/Cargo.toml b/Cargo.toml index 2e4ea13a8..0acfb494b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,7 +31,7 @@ ordered-float = { version = "2.0.0", features = ["serde"] } rayon = "1.3.1" regex = "1.4.2" ringtail = "0.3.0" -roaring = "0.6.1" +roaring = "0.6.4" serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0.59", features = ["preserve_order"] } slice-group-by = "0.2.6" diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index cb5103003..06ed2cf1a 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -1808,9 +1808,9 @@ checksum = "21215c1b9d8f7832b433255bd9eea3e2779aa55b21b2f8e13aad62c74749b237" [[package]] name = "roaring" -version = "0.6.1" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99a260b0fb7df2095948f4a1d37afe5d1a08a2ccc7380f418cec049dc9560077" +checksum = "4d60b41c8f25d07cecab125cb46ebbf234fc055effc61ca2392a3ef4f9422304" dependencies = [ "byteorder", ] From 70e9b1e9363f618fb898efa3df1f8c304b14073d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 20 Jan 2021 17:11:36 +0100 Subject: [PATCH 0439/1889] Introduce a flag to the search subcommand to display the facet distribution --- src/subcommand/search.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/src/subcommand/search.rs b/src/subcommand/search.rs index d816c47fe..178ef9941 100644 --- a/src/subcommand/search.rs +++ b/src/subcommand/search.rs @@ -29,6 +29,10 @@ pub struct Opt { /// The query string to search for (doesn't support prefix search yet). query: Option, + + /// Compute and print the facet distribution of all the faceted fields. + #[structopt(long)] + print_facet_distribution: bool, } pub fn run(opt: Opt) -> anyhow::Result<()> { @@ -71,6 +75,12 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { let _ = writeln!(&mut stdout); } + if opt.print_facet_distribution { + let facets = index.facets(&rtxn).candidates(result.candidates).execute()?; + serde_json::to_writer(&mut stdout, &facets)?; + let _ = writeln!(&mut stdout); + } + debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len()); } From 433ac8c38a814f96e4aa3a7799192c128afbb076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 14:11:10 +0100 Subject: [PATCH 0440/1889] Remove the ordered-float serde feature --- Cargo.lock | 1 - Cargo.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5f0d68ac4..ba477336e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -869,7 +869,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dacdec97876ef3ede8c50efc429220641a0b11ba0048b4b0c357bccbc47c5204" dependencies = [ "num-traits", - "serde", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 0acfb494b..d9ad19e78 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,7 @@ near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", num-traits = "0.2.14" obkv = "0.1.0" once_cell = "1.4.0" -ordered-float = { version = "2.0.0", features = ["serde"] } +ordered-float = "2.0.0" rayon = "1.3.1" regex = "1.4.2" ringtail = "0.3.0" From 65b821b192105da64daacae5d8dcf7812598b2ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 14:15:33 +0100 Subject: [PATCH 0441/1889] Rename the Index facets method into facets_distribution --- http-ui/Cargo.lock | 1 - http-ui/src/main.rs | 2 +- src/index.rs | 2 +- src/subcommand/search.rs | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index 06ed2cf1a..29e04d714 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -1335,7 +1335,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" dependencies = [ "num-traits", - "serde", ] [[package]] diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 54f87c3e6..5c61d3e75 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -702,7 +702,7 @@ async fn main() -> anyhow::Result<()> { let number_of_candidates = candidates.len(); let facets = if query.facet_distribution == Some(true) { - Some(index.facets(&rtxn).candidates(candidates).execute().unwrap()) + Some(index.facets_distribution(&rtxn).candidates(candidates).execute().unwrap()) } else { None }; diff --git a/src/index.rs b/src/index.rs index 6020e332c..c0dd22986 100644 --- a/src/index.rs +++ b/src/index.rs @@ -351,7 +351,7 @@ impl Index { Ok(self.documents_ids(rtxn).map(|docids| docids.len() as usize)?) 
} - pub fn facets<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { + pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { FacetDistribution::new(rtxn, self) } diff --git a/src/subcommand/search.rs b/src/subcommand/search.rs index 178ef9941..0a150209e 100644 --- a/src/subcommand/search.rs +++ b/src/subcommand/search.rs @@ -76,7 +76,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { } if opt.print_facet_distribution { - let facets = index.facets(&rtxn).candidates(result.candidates).execute()?; + let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; serde_json::to_writer(&mut stdout, &facets)?; let _ = writeln!(&mut stdout); } From 60480a1e2ff3c641a11ba50aeaa326c45c15f367 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 14:25:53 +0100 Subject: [PATCH 0442/1889] Rework the FacetCondition from_array constructor --- src/search/facet/facet_condition.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/search/facet/facet_condition.rs b/src/search/facet/facet_condition.rs index 19542a3ab..8e7d16a83 100644 --- a/src/search/facet/facet_condition.rs +++ b/src/search/facet/facet_condition.rs @@ -3,6 +3,7 @@ use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded}; use std::str::FromStr; +use anyhow::Context; use either::Either; use heed::types::{ByteSlice, DecodeIgnore}; use log::debug; @@ -154,16 +155,20 @@ impl FacetCondition { { fn facet_condition( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashMap, key: &str, value: &str, ) -> anyhow::Result { - let fid = fields_ids_map.id(key).unwrap(); - let ftype = faceted_fields.get(&fid).copied().unwrap(); - let (neg, value) = match value.strip_prefix('-') { - Some(value) => (true, value), - None => (false, value), + let fid = fields_ids_map.id(key).with_context(|| { + format!("{:?} must isn't part of the fields ids map", key) + })?; + let ftype = faceted_fields.get(key).copied().with_context(|| { + format!("{:?} must isn't a faceted field", key) + })?; + let (neg, value) = match value.trim().strip_prefix('-') { + Some(value) => (true, value.trim()), + None => (false, value.trim()), }; let operator = match ftype { @@ -176,7 +181,7 @@ impl FacetCondition { } let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields_ids(rtxn)?; + let faceted_fields = index.faceted_fields(rtxn)?; let mut ands = None; for either in array { @@ -185,8 +190,8 @@ impl FacetCondition { let mut ors = None; for rule in array { let mut iter = rule.as_ref().splitn(2, ':'); - let key = iter.next().unwrap(); - let value = iter.next().unwrap(); + let key = iter.next().context("missing facet condition key")?; + let value = iter.next().context("missing facet condition value")?; let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?; ors = match ors.take() { Some(ors) => Some(Or(Box::new(ors), Box::new(condition))), @@ -203,8 +208,8 @@ impl FacetCondition { }, Either::Right(rule) => { let mut iter = rule.as_ref().splitn(2, ':'); - let key = iter.next().unwrap(); - let value = iter.next().unwrap(); + let key = iter.next().context("missing facet condition key")?; + let value = iter.next().context("missing facet condition value")?; let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?; ands = match ands.take() { Some(ands) => Some(And(Box::new(ands), Box::new(condition))), From 
d91d321129620b5ddf6c97acdbe5adb65a66a2bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 14:32:30 +0100 Subject: [PATCH 0443/1889] Introduce some constants to the FacetDistribution struct and settings --- src/search/facet/facet_distribution.rs | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 06c84bf17..8bab2cb62 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -10,6 +10,18 @@ use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Cod use crate::search::facet::{FacetIter, FacetRange}; use crate::{Index, FieldId}; +/// The default number of values by facets that will +/// be fetched from the key-value store. +const DEFAULT_VALUES_BY_FACET: usize = 100; + +/// The hard limit in the number of values by facets that will be fetched from +/// the key-value store. Searching for more values could slow down the engine. +const MAX_VALUES_BY_FACET: usize = 1000; + +/// Threshold on the number of candidates that will make +/// the system to choose between one algorithm or another. +const CANDIDATES_THRESHOLD: u64 = 1000; + pub struct FacetDistribution<'a> { facets: Option>, candidates: Option, @@ -20,7 +32,13 @@ pub struct FacetDistribution<'a> { impl<'a> FacetDistribution<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { - FacetDistribution { facets: None, candidates: None, max_values_by_facet: 100, rtxn, index } + FacetDistribution { + facets: None, + candidates: None, + max_values_by_facet: DEFAULT_VALUES_BY_FACET, + rtxn, + index, + } } pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { @@ -34,7 +52,7 @@ impl<'a> FacetDistribution<'a> { } pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self { - self.max_values_by_facet = cmp::min(max, 1000); + self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET); self } @@ -45,7 +63,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result> { if let Some(candidates) = self.candidates.as_ref() { - if candidates.len() <= 1000 { + if candidates.len() <= CANDIDATES_THRESHOLD { let mut key_buffer = vec![field_id]; match facet_type { FacetType::Float => { From b52d500fbca32847ebefb557623c611caac9d1d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 14:36:49 +0100 Subject: [PATCH 0444/1889] Reorder the FacetType enum branching in the facet_value method --- src/search/facet/facet_distribution.rs | 50 +++++++++++++------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 8bab2cb62..490a18229 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -66,6 +66,21 @@ impl<'a> FacetDistribution<'a> { if candidates.len() <= CANDIDATES_THRESHOLD { let mut key_buffer = vec![field_id]; match facet_type { + FacetType::String => { + let mut facet_values = BTreeMap::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? 
+ .remap_key_type::(); + for result in iter { + let ((_, _, value), ()) = result?; + *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; + } + } + Ok(facet_values) + }, FacetType::Float => { let mut facet_values = BTreeMap::new(); for docid in candidates { @@ -96,21 +111,6 @@ impl<'a> FacetDistribution<'a> { } Ok(facet_values) }, - FacetType::String => { - let mut facet_values = BTreeMap::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? - .remap_key_type::(); - for result in iter { - let ((_, _, value), ()) = result?; - *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; - } - } - Ok(facet_values) - }, } } else { let iter = match facet_type { @@ -122,14 +122,14 @@ impl<'a> FacetDistribution<'a> { .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); Box::new(iter) as Box::> }, - FacetType::Integer => { - let iter = FacetIter::::new_non_reducing( + FacetType::Float => { + let iter = FacetIter::::new_non_reducing( self.rtxn, self.index, field_id, candidates.clone(), )?; Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))) }, - FacetType::Float => { - let iter = FacetIter::::new_non_reducing( + FacetType::Integer => { + let iter = FacetIter::::new_non_reducing( self.rtxn, self.index, field_id, candidates.clone(), )?; Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))) @@ -160,16 +160,16 @@ impl<'a> FacetDistribution<'a> { .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); Box::new(iter) as Box::> }, - FacetType::Integer => { - let db = db.remap_key_type::(); - let range = FacetRange::::new( + FacetType::Float => { + let db = db.remap_key_type::(); + let range = FacetRange::::new( self.rtxn, db, field_id, 0, Unbounded, Unbounded, )?; Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) }, - FacetType::Float => { - let db = db.remap_key_type::(); - let range = FacetRange::::new( + FacetType::Integer => { + let db = db.remap_key_type::(); + let range = FacetRange::::new( self.rtxn, db, field_id, 0, Unbounded, Unbounded, )?; Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) From 2e00740515a9972088ad0c156a672d9cf4c4bae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 14:41:36 +0100 Subject: [PATCH 0445/1889] Make sure that we don't iterate throught all string facet values --- src/search/facet/facet_distribution.rs | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 490a18229..970631218 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -63,17 +63,18 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result> { if let Some(candidates) = self.candidates.as_ref() { - if candidates.len() <= CANDIDATES_THRESHOLD { + if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String { let mut key_buffer = vec![field_id]; match facet_type { FacetType::String => { let mut facet_values = BTreeMap::new(); - for docid in candidates { + for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); let iter = self.index.field_id_docid_facet_values .prefix_iter(self.rtxn, &key_buffer)? 
.remap_key_type::(); + for result in iter { let ((_, _, value), ()) = result?; *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; @@ -89,6 +90,7 @@ impl<'a> FacetDistribution<'a> { let iter = self.index.field_id_docid_facet_values .prefix_iter(self.rtxn, &key_buffer)? .remap_key_type::(); + for result in iter { let ((_, _, value), ()) = result?; *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; @@ -104,6 +106,7 @@ impl<'a> FacetDistribution<'a> { let iter = self.index.field_id_docid_facet_values .prefix_iter(self.rtxn, &key_buffer)? .remap_key_type::(); + for result in iter { let ((_, _, value), ()) = result?; *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; @@ -114,19 +117,13 @@ impl<'a> FacetDistribution<'a> { } } else { let iter = match facet_type { - FacetType::String => { - let db = self.index.facet_field_id_value_docids; - let iter = db - .prefix_iter(self.rtxn, &[field_id])? - .remap_key_type::() - .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); - Box::new(iter) as Box::> - }, + FacetType::String => unreachable!(), FacetType::Float => { let iter = FacetIter::::new_non_reducing( self.rtxn, self.index, field_id, candidates.clone(), )?; - Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))) + let iter = iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))); + Box::new(iter) as Box::> }, FacetType::Integer => { let iter = FacetIter::::new_non_reducing( From 9c8a654079f82091f3fa1d086a43a58ff28c45f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 14:44:16 +0100 Subject: [PATCH 0446/1889] Add comments to help read the facet_values branchings --- src/search/facet/facet_distribution.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 970631218..10fe1a3c3 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -63,7 +63,11 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result> { if let Some(candidates) = self.candidates.as_ref() { + // Classic search, candidates were specified, we must return + // facet values only related to those candidates. if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String { + // There is a small amount of candidates OR we ask for facet string values so we + // decide to iterate over the facet values of each one of them, one by one. let mut key_buffer = vec![field_id]; match facet_type { FacetType::String => { @@ -116,6 +120,8 @@ impl<'a> FacetDistribution<'a> { }, } } else { + // There is too much documents, we use the facet levels to move throught + // the facet values, to find the candidates and values associated. let iter = match facet_type { FacetType::String => unreachable!(), FacetType::Float => { @@ -148,6 +154,8 @@ impl<'a> FacetDistribution<'a> { Ok(facet_values) } } else { + // Placeholder search, a.k.a. no candidates were specified. We iterate throught the + // facet values one by one and iterate on the facet level 0 for numbers. 
let db = self.index.facet_field_id_value_docids; let iter = match facet_type { FacetType::String => { From 11309ee99c6ad8d6d708a076847db6dfff615eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 14:53:50 +0100 Subject: [PATCH 0447/1889] Rework the FacetDistribution execute method to use the faceted_fields struct --- src/search/facet/facet_distribution.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 10fe1a3c3..01f7b7205 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -196,22 +196,20 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> heed::Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; + let faceted_fields = self.index.faceted_fields(self.rtxn)?; let fields_ids: Vec<_> = match &self.facets { - Some(names) => { - names.iter().filter_map(|n| { - let id = fields_ids_map.id(n)?; - faceted_fields.get(&id).cloned().map(|t| (id, t)) - }).collect() - }, - None => faceted_fields.iter().map(|(id, t)| (*id, *t)).collect(), + Some(names) => names + .iter() + .filter_map(|n| faceted_fields.get(n).map(|t| (n.to_string(), *t))) + .collect(), + None => faceted_fields.into_iter().collect(), }; let mut facets_values = BTreeMap::new(); - for (fid, ftype) in fields_ids { - let facet_name = fields_ids_map.name(fid).unwrap(); + for (name, ftype) in fields_ids { + let fid = fields_ids_map.id(&name).unwrap(); let values = self.facet_values(fid, ftype)?; - facets_values.insert(facet_name.to_string(), values); + facets_values.insert(name, values); } Ok(facets_values) From a3e3bebed7d2f07872185b32d57d5eb5bdfecc7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 18:29:54 +0100 Subject: [PATCH 0448/1889] Rework the FacetDistribution execute method to use the faceted_fields struct --- src/search/facet/facet_distribution.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 01f7b7205..fc3c72853 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -2,6 +2,7 @@ use std::collections::{HashSet, BTreeMap}; use std::ops::Bound::Unbounded; use std::{cmp, fmt}; +use anyhow::Context; use roaring::RoaringBitmap; use crate::facet::{FacetType, FacetValue}; @@ -194,7 +195,7 @@ impl<'a> FacetDistribution<'a> { } } - pub fn execute(&self) -> heed::Result>> { + pub fn execute(&self) -> anyhow::Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let faceted_fields = self.index.faceted_fields(self.rtxn)?; let fields_ids: Vec<_> = match &self.facets { @@ -207,7 +208,9 @@ impl<'a> FacetDistribution<'a> { let mut facets_values = BTreeMap::new(); for (name, ftype) in fields_ids { - let fid = fields_ids_map.id(&name).unwrap(); + let fid = fields_ids_map.id(&name).with_context(|| { + format!("missing field name {:?} from the fields id map", name) + })?; let values = self.facet_values(fid, ftype)?; facets_values.insert(name, values); } From b41bf586580f1cce26b30768f7c26bd7bb02f8a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 16:13:49 +0100 Subject: [PATCH 0449/1889] Split the FacetDistribution facet_values method into three --- src/search/facet/facet_distribution.rs | 278 
++++++++++++++----------- 1 file changed, 154 insertions(+), 124 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index fc3c72853..4a650b9e6 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -57,6 +57,155 @@ impl<'a> FacetDistribution<'a> { self } + /// There is a small amount of candidates OR we ask for facet string values so we + /// decide to iterate over the facet values of each one of them, one by one. + fn facet_values_from_documents( + &self, + field_id: FieldId, + facet_type: FacetType, + candidates: &RoaringBitmap, + ) -> heed::Result> + { + let mut key_buffer = vec![field_id]; + match facet_type { + FacetType::String => { + let mut facet_values = BTreeMap::new(); + for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, value), ()) = result?; + *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; + } + } + Ok(facet_values) + }, + FacetType::Float => { + let mut facet_values = BTreeMap::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, value), ()) = result?; + *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; + } + } + Ok(facet_values) + }, + FacetType::Integer => { + let mut facet_values = BTreeMap::new(); + for docid in candidates { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = self.index.field_id_docid_facet_values + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, value), ()) = result?; + *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; + } + } + Ok(facet_values) + }, + } + } + + /// There is too much documents, we use the facet levels to move throught + /// the facet values, to find the candidates and values associated. + fn facet_values_from_facet_levels( + &self, + field_id: FieldId, + facet_type: FacetType, + candidates: &RoaringBitmap, + ) -> heed::Result> + { + let iter = match facet_type { + FacetType::String => unreachable!(), + FacetType::Float => { + let iter = FacetIter::::new_non_reducing( + self.rtxn, self.index, field_id, candidates.clone(), + )?; + let iter = iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))); + Box::new(iter) as Box::> + }, + FacetType::Integer => { + let iter = FacetIter::::new_non_reducing( + self.rtxn, self.index, field_id, candidates.clone(), + )?; + Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))) + }, + }; + + let mut facet_values = BTreeMap::new(); + for result in iter { + let (value, mut docids) = result?; + docids.intersect_with(candidates); + if !docids.is_empty() { + facet_values.insert(value, docids.len()); + } + if facet_values.len() == self.max_values_by_facet { + break; + } + } + + Ok(facet_values) + } + + /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the + /// facet values one by one and iterate on the facet level 0 for numbers. 
+ fn facet_values_from_raw_facet_database( + &self, + field_id: FieldId, + facet_type: FacetType, + ) -> heed::Result> + { + let db = self.index.facet_field_id_value_docids; + let level = 0; + let iter = match facet_type { + FacetType::String => { + let iter = db + .prefix_iter(self.rtxn, &[field_id])? + .remap_key_type::() + .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); + Box::new(iter) as Box::> + }, + FacetType::Float => { + let db = db.remap_key_type::(); + let range = FacetRange::::new( + self.rtxn, db, field_id, level, Unbounded, Unbounded, + )?; + Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) + }, + FacetType::Integer => { + let db = db.remap_key_type::(); + let range = FacetRange::::new( + self.rtxn, db, field_id, level, Unbounded, Unbounded, + )?; + Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) + }, + }; + + let mut facet_values = BTreeMap::new(); + for result in iter { + let (value, docids) = result?; + facet_values.insert(value, docids.len()); + if facet_values.len() == self.max_values_by_facet { + break; + } + } + + Ok(facet_values) + } + fn facet_values( &self, field_id: FieldId, @@ -64,134 +213,15 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result> { if let Some(candidates) = self.candidates.as_ref() { - // Classic search, candidates were specified, we must return - // facet values only related to those candidates. + // Classic search, candidates were specified, we must return facet values only related + // to those candidates. We also enter here for facet strings for performance reasons. if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String { - // There is a small amount of candidates OR we ask for facet string values so we - // decide to iterate over the facet values of each one of them, one by one. - let mut key_buffer = vec![field_id]; - match facet_type { - FacetType::String => { - let mut facet_values = BTreeMap::new(); - for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? - .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; - } - } - Ok(facet_values) - }, - FacetType::Float => { - let mut facet_values = BTreeMap::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? - .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; - } - } - Ok(facet_values) - }, - FacetType::Integer => { - let mut facet_values = BTreeMap::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? 
- .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; - } - } - Ok(facet_values) - }, - } + self.facet_values_from_documents(field_id, facet_type, candidates) } else { - // There is too much documents, we use the facet levels to move throught - // the facet values, to find the candidates and values associated. - let iter = match facet_type { - FacetType::String => unreachable!(), - FacetType::Float => { - let iter = FacetIter::::new_non_reducing( - self.rtxn, self.index, field_id, candidates.clone(), - )?; - let iter = iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))); - Box::new(iter) as Box::> - }, - FacetType::Integer => { - let iter = FacetIter::::new_non_reducing( - self.rtxn, self.index, field_id, candidates.clone(), - )?; - Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))) - }, - }; - - let mut facet_values = BTreeMap::new(); - for result in iter { - let (value, mut docids) = result?; - docids.intersect_with(candidates); - if !docids.is_empty() { - facet_values.insert(value, docids.len()); - } - if facet_values.len() == self.max_values_by_facet { - break; - } - } - - Ok(facet_values) + self.facet_values_from_facet_levels(field_id, facet_type, candidates) } } else { - // Placeholder search, a.k.a. no candidates were specified. We iterate throught the - // facet values one by one and iterate on the facet level 0 for numbers. - let db = self.index.facet_field_id_value_docids; - let iter = match facet_type { - FacetType::String => { - let iter = db - .prefix_iter(self.rtxn, &[field_id])? - .remap_key_type::() - .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); - Box::new(iter) as Box::> - }, - FacetType::Float => { - let db = db.remap_key_type::(); - let range = FacetRange::::new( - self.rtxn, db, field_id, 0, Unbounded, Unbounded, - )?; - Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) - }, - FacetType::Integer => { - let db = db.remap_key_type::(); - let range = FacetRange::::new( - self.rtxn, db, field_id, 0, Unbounded, Unbounded, - )?; - Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) - }, - }; - - let mut facet_values = BTreeMap::new(); - for result in iter { - let (value, docids) = result?; - facet_values.insert(value, docids.len()); - if facet_values.len() == self.max_values_by_facet { - break; - } - } - - Ok(facet_values) + self.facet_values_from_raw_facet_database(field_id, facet_type) } } From b6e91291fbf7227f087e5e041f68f31108f3d124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 16:31:57 +0100 Subject: [PATCH 0450/1889] Add a comment to explain Serialize on FacetValue is implemented by hand --- src/facet/facet_value.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/facet/facet_value.rs b/src/facet/facet_value.rs index 9630c6634..f311ca3dd 100644 --- a/src/facet/facet_value.rs +++ b/src/facet/facet_value.rs @@ -38,6 +38,8 @@ impl From for FacetValue { } } +/// We implement Serialize ourselves because we need to always serialize it as a string, +/// JSON object keys must be strings not numbers. 
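
The comment above is worth a concrete illustration: a derived `Serialize` would emit the numeric variants as JSON numbers, which cannot serve as JSON object keys, so the impl routes every variant through a string. A minimal sketch of the idea, using a hypothetical stand-in enum rather than the real `FacetValue`:

```rust
use serde::{Serialize, Serializer};

// Hypothetical stand-in for the real FacetValue enum.
enum Value {
    String(String),
    Number(i64),
}

impl Serialize for Value {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        match self {
            Value::String(s) => serializer.serialize_str(s),
            // `collect_str` serializes the `Display` form of the number,
            // so it always comes out as a JSON string, usable as a key.
            Value::Number(n) => serializer.collect_str(n),
        }
    }
}

fn main() {
    let values = vec![Value::String("blue".into()), Value::Number(42)];
    // Prints ["blue","42"]: both variants serialize as strings.
    println!("{}", serde_json::to_string(&values).unwrap());
}
```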
impl Serialize for FacetValue { fn serialize(&self, serializer: S) -> Result where From f5f4438b4351c02cc5414e5e88807657431c4a4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Jan 2021 18:31:09 +0100 Subject: [PATCH 0451/1889] Remove the duplicated code inside the facet_values_from_documents method --- src/search/facet/facet_distribution.rs | 80 ++++++++++++-------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/src/search/facet/facet_distribution.rs b/src/search/facet/facet_distribution.rs index 4a650b9e6..afa4f2a5a 100644 --- a/src/search/facet/facet_distribution.rs +++ b/src/search/facet/facet_distribution.rs @@ -3,13 +3,14 @@ use std::ops::Bound::Unbounded; use std::{cmp, fmt}; use anyhow::Context; +use heed::BytesDecode; use roaring::RoaringBitmap; use crate::facet::{FacetType, FacetValue}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::search::facet::{FacetIter, FacetRange}; -use crate::{Index, FieldId}; +use crate::{Index, FieldId, DocumentId}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -66,55 +67,46 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, ) -> heed::Result> { - let mut key_buffer = vec![field_id]; + fn fetch_facet_values<'t, KC, K: 't>( + index: &Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + candidates: &RoaringBitmap, + ) -> heed::Result> + where + KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, + K: Into, + { + let mut facet_values = BTreeMap::new(); + let mut key_buffer = vec![field_id]; + + for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { + key_buffer.truncate(1); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = index.field_id_docid_facet_values + .prefix_iter(rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, value), ()) = result?; + *facet_values.entry(value.into()).or_insert(0) += 1; + } + } + + Ok(facet_values) + } + + let index = self.index; + let rtxn = self.rtxn; match facet_type { FacetType::String => { - let mut facet_values = BTreeMap::new(); - for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? - .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; - } - } - Ok(facet_values) + fetch_facet_values::(index, rtxn, field_id, candidates) }, FacetType::Float => { - let mut facet_values = BTreeMap::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? - .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; - } - } - Ok(facet_values) + fetch_facet_values::(index, rtxn, field_id, candidates) }, FacetType::Integer => { - let mut facet_values = BTreeMap::new(); - for docid in candidates { - key_buffer.truncate(1); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = self.index.field_id_docid_facet_values - .prefix_iter(self.rtxn, &key_buffer)? 
- .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *facet_values.entry(FacetValue::from(value)).or_insert(0) += 1; - } - } - Ok(facet_values) + fetch_facet_values::(index, rtxn, field_id, candidates) }, } } From 14ae01a6c9a80fa43d7c8cf53f53801cae1949ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 1 Feb 2021 18:10:57 +0100 Subject: [PATCH 0452/1889] Fix some typos in error messages --- src/search/facet/facet_condition.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/search/facet/facet_condition.rs b/src/search/facet/facet_condition.rs index 8e7d16a83..578344d37 100644 --- a/src/search/facet/facet_condition.rs +++ b/src/search/facet/facet_condition.rs @@ -161,10 +161,10 @@ impl FacetCondition { ) -> anyhow::Result { let fid = fields_ids_map.id(key).with_context(|| { - format!("{:?} must isn't part of the fields ids map", key) + format!("{:?} isn't present in the fields ids map", key) })?; let ftype = faceted_fields.get(key).copied().with_context(|| { - format!("{:?} must isn't a faceted field", key) + format!("{:?} isn't a faceted field", key) })?; let (neg, value) = match value.trim().strip_prefix('-') { Some(value) => (true, value.trim()), From 91d8198d17e6801a24a7b18a24c16651001aec7e Mon Sep 17 00:00:00 2001 From: mpostma Date: Wed, 30 Dec 2020 18:43:50 +0100 Subject: [PATCH 0453/1889] return documents number on addition --- src/update/index_documents/mod.rs | 15 ++++++++++++--- src/update/mod.rs | 2 +- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 90bc5ef3d..2c5d34092 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -12,8 +12,9 @@ use grenad::{Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; use heed::types::ByteSlice; use log::{debug, info, error}; use memmap::Mmap; -use rayon::prelude::*; use rayon::ThreadPool; +use rayon::prelude::*; +use serde::{Serialize, Deserialize}; use crate::index::Index; use crate::update::{Facets, UpdateIndexingStep}; @@ -32,6 +33,11 @@ mod merge_function; mod store; mod transform; +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct DocumentAdditionResult { + nb_documents: usize, +} + #[derive(Debug, Copy, Clone)] pub enum WriteMethod { Append, @@ -253,7 +259,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.autogenerate_docids = false; } - pub fn execute(self, reader: R, progress_callback: F) -> anyhow::Result<()> + pub fn execute(self, reader: R, progress_callback: F) -> anyhow::Result where R: io::Read, F: Fn(UpdateIndexingStep) + Sync, @@ -279,9 +285,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?, }; + let nb_documents = output.documents_count; + info!("Update transformed in {:.02?}", before_transform.elapsed()); - self.execute_raw(output, progress_callback) + self.execute_raw(output, progress_callback)?; + Ok(DocumentAdditionResult { nb_documents }) } pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()> diff --git a/src/update/mod.rs b/src/update/mod.rs index 407d9f498..2cd532c83 100644 --- a/src/update/mod.rs +++ b/src/update/mod.rs @@ -10,7 +10,7 @@ mod update_step; pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; -pub use 
self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
+pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat, DocumentAdditionResult};
 pub use self::facets::Facets;
 pub use self::settings::Settings;
 pub use self::update_builder::UpdateBuilder;

From d487791b0312d965ac822d5868d5274e88ba7368 Mon Sep 17 00:00:00 2001
From: mpostma
Date: Tue, 22 Dec 2020 18:17:35 +0100
Subject: [PATCH 0454/1889] derive serde for method and format

This is nicer when working with the UpdateMeta struct
---
 src/update/index_documents/mod.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs
index 2c5d34092..7fab93343 100644
--- a/src/update/index_documents/mod.rs
+++ b/src/update/index_documents/mod.rs
@@ -181,7 +181,7 @@ pub fn write_into_lmdb_database(
     Ok(())
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
 #[non_exhaustive]
 pub enum IndexDocumentsMethod {
     /// Replace the previous document with the new one,
@@ -193,7 +193,7 @@ pub enum IndexDocumentsMethod {
     UpdateDocuments,
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
 #[non_exhaustive]
 pub enum UpdateFormat {
     /// The given update is a real **comma seperated** CSV with headers on the first line.

From 3b604326874691518a2165f9e3d18dd790d6d982 Mon Sep 17 00:00:00 2001
From: mpostma
Date: Tue, 22 Dec 2020 16:21:07 +0100
Subject: [PATCH 0455/1889] Use update_id in UpdateBuilder

Add the `update_id` to the updates. The rationale is the following:
- It allows for better traceability of the update events, thus improved
  debugging and logging.
- The engine is now aware of what it has already processed, and can
  return it if asked. It may not make sense now, but in the future, the
  update store may not work the same way, and this information about the
  state of the engine will be desirable (distributed environment).
---
 http-ui/src/main.rs                 |  6 +-
 src/search/facet/facet_condition.rs | 12 ++--
 src/update/clear_documents.rs       | 10 +++-
 src/update/delete_documents.rs      |  5 +-
 src/update/facets.rs                |  8 ++-
 src/update/index_documents/mod.rs   | 87 ++++++++++++++++-------------
 src/update/settings.rs              | 78 ++++++++++++++------------
 src/update/update_builder.rs        | 20 +++----
 8 files changed, 127 insertions(+), 99 deletions(-)

diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 5c61d3e75..53c2cb460 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -322,7 +322,7 @@ async fn main() -> anyhow::Result<()> {
         // the type hint is necessary: https://github.com/rust-lang/rust/issues/32600
         move |update_id, meta, content:&_| {
             // We prepare the update by using the update builder.
- let mut update_builder = UpdateBuilder::new(); + let mut update_builder = UpdateBuilder::new(update_id); if let Some(max_nb_chunks) = indexer_opt_cloned.max_nb_chunks { update_builder.max_nb_chunks(max_nb_chunks); } @@ -363,7 +363,7 @@ async fn main() -> anyhow::Result<()> { otherwise => panic!("invalid encoding format {:?}", otherwise), }; - let result = builder.execute(reader, |indexing_step| { + let result = builder.execute(reader, |indexing_step, update_id| { let (current, total) = match indexing_step { TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), @@ -430,7 +430,7 @@ async fn main() -> anyhow::Result<()> { } } - let result = builder.execute(|indexing_step| { + let result = builder.execute(|indexing_step, update_id| { let (current, total) = match indexing_step { TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), diff --git a/src/search/facet/facet_condition.rs b/src/search/facet/facet_condition.rs index 578344d37..2f0444dce 100644 --- a/src/search/facet/facet_condition.rs +++ b/src/search/facet/facet_condition.rs @@ -625,9 +625,9 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. @@ -654,9 +654,9 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() }); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. @@ -682,13 +682,13 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into(), "timestamp".into() => "integer".into(), }); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. 
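
The tests above now pass two-argument closures because `execute` forwards the update id to the progress callback. A condensed, self-contained sketch of the whole pattern: the id enters through the top-level builder once, flows into each sub-builder, and internal code wraps the callback back into a one-argument closure. Everything the real builders need (transactions, the index, tuning options) is omitted here.

```rust
// Condensed sketch of threading an update id through builders so that
// progress callbacks can attribute every step to a specific update.

#[derive(Debug, Clone, Copy)]
enum UpdateIndexingStep {
    TransformDocuments { seen: usize },
    WriteDocuments { written: usize },
}

// Internal code keeps its one-argument shape.
fn run_steps<F: Fn(UpdateIndexingStep)>(progress: F) {
    progress(UpdateIndexingStep::TransformDocuments { seen: 3 });
    progress(UpdateIndexingStep::WriteDocuments { written: 3 });
}

struct UpdateBuilder {
    update_id: u64,
}

impl UpdateBuilder {
    fn new(update_id: u64) -> UpdateBuilder {
        UpdateBuilder { update_id }
    }

    fn index_documents(&self) -> IndexDocuments {
        IndexDocuments { update_id: self.update_id }
    }
}

struct IndexDocuments {
    update_id: u64,
}

impl IndexDocuments {
    // The public callback receives the update id with every step; it is
    // wrapped into a one-argument closure, the same trick as
    // `let progress_callback = |step| progress_callback(step, update_id);`
    // in the patch.
    fn execute<F: Fn(UpdateIndexingStep, u64) + Sync>(&self, progress_callback: F) {
        let update_id = self.update_id;
        let progress_callback = |step| progress_callback(step, update_id);
        run_steps(progress_callback);
    }
}

fn main() {
    let builder = UpdateBuilder::new(42);
    builder.index_documents().execute(|step, update_id| {
        println!("update {}: {:?}", update_id, step);
    });
}
```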
diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index ac359ba0d..a84596901 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -4,11 +4,17 @@ use crate::{ExternalDocumentsIds, Index}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, + _update_id: u64, } impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> ClearDocuments<'t, 'u, 'i> { - ClearDocuments { wtxn, index } + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64 + ) -> ClearDocuments<'t, 'u, 'i> { + + ClearDocuments { wtxn, index, _update_id: update_id } } pub fn execute(self) -> anyhow::Result { diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index 6dc4b1cfa..2b67535c9 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -12,12 +12,14 @@ pub struct DeleteDocuments<'t, 'u, 'i> { index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, documents_ids: RoaringBitmap, + update_id: u64, } impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, + update_id: u64, ) -> anyhow::Result> { let external_documents_ids = index @@ -29,6 +31,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index, external_documents_ids, documents_ids: RoaringBitmap::new(), + update_id, }) } @@ -64,7 +67,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We can execute a ClearDocuments operation when the number of documents // to delete is exactly the number of documents in the database. if current_documents_ids_len == self.documents_ids.len() { - return ClearDocuments::new(self.wtxn, self.index).execute(); + return ClearDocuments::new(self.wtxn, self.index, self.update_id).execute(); } let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; diff --git a/src/update/facets.rs b/src/update/facets.rs index a268cbeaf..522a4d350 100644 --- a/src/update/facets.rs +++ b/src/update/facets.rs @@ -24,10 +24,15 @@ pub struct Facets<'t, 'u, 'i> { pub(crate) chunk_fusing_shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, + _update_id: u64, } impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> Facets<'t, 'u, 'i> { Facets { wtxn, index, @@ -36,6 +41,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { chunk_fusing_shrink_size: None, level_group_size: NonZeroUsize::new(4).unwrap(), min_level_size: NonZeroUsize::new(5).unwrap(), + _update_id: update_id, } } diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 7fab93343..e38c640a0 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -220,10 +220,15 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, + update_id: u64, } impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> IndexDocuments<'t, 'u, 'i, 'a> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> IndexDocuments<'t, 'u, 'i, 'a> { IndexDocuments { wtxn, index, @@ -240,6 +245,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { update_method: 
IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, autogenerate_docids: true, + update_id, } } @@ -262,9 +268,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { pub fn execute(self, reader: R, progress_callback: F) -> anyhow::Result where R: io::Read, - F: Fn(UpdateIndexingStep) + Sync, + F: Fn(UpdateIndexingStep, u64) + Sync, { let before_transform = Instant::now(); + let update_id = self.update_id; + let progress_callback = |step| progress_callback(step, update_id); let transform = Transform { rtxn: &self.wtxn, @@ -321,6 +329,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_level: self.chunk_compression_level, chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, thread_pool: self.thread_pool, + update_id: self.update_id, }; let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; debug!("documents to delete {:?}", replaced_documents_ids); @@ -616,7 +625,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); } - let mut builder = Facets::new(self.wtxn, self.index); + let mut builder = Facets::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; @@ -651,9 +660,9 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -665,9 +674,9 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = &b"id,name\n1,updated kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -679,9 +688,9 @@ mod tests { // Third we send 3 documents again to replace the existing ones. let mut wtxn = index.write_txn().unwrap(); let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -702,10 +711,10 @@ mod tests { // change the index method to merge documents. let mut wtxn = index.write_txn().unwrap(); let content = &b"id,name\n1,kevin\n1,kevina\n1,benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is only 1 document now. 
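
These tests exercise the two indexing methods side by side: `ReplaceDocuments` drops the previous version of a document with the same id, while `UpdateDocuments` merges the new fields into it. A self-contained model of that difference, with the document store reduced to a plain map:

```rust
use std::collections::BTreeMap;

type Document = BTreeMap<String, String>;

// Simplified model of the two indexing methods: the real store is an
// LMDB database, here it is just a map from document id to document.

fn replace(store: &mut BTreeMap<u32, Document>, id: u32, new: Document) {
    store.insert(id, new); // previous fields are lost
}

fn update(store: &mut BTreeMap<u32, Document>, id: u32, new: Document) {
    store.entry(id).or_default().extend(new); // previous fields are kept
}

fn doc(pairs: &[(&str, &str)]) -> Document {
    pairs.iter().map(|(k, v)| (k.to_string(), v.to_string())).collect()
}

fn main() {
    let mut store = BTreeMap::new();
    replace(&mut store, 1, doc(&[("id", "1"), ("name", "benoit")]));

    // Like the test: sending `id,age` with UpdateDocuments merges into
    // the existing document instead of erasing the `name` field.
    update(&mut store, 1, doc(&[("id", "1"), ("age", "25")]));
    assert_eq!(store[&1]["name"], "benoit");
    assert_eq!(store[&1]["age"], "25");

    // With ReplaceDocuments the same payload would have dropped `name`.
    replace(&mut store, 1, doc(&[("id", "1"), ("age", "25")]));
    assert!(store[&1].get("name").is_none());
}
```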
@@ -729,10 +738,10 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. let mut wtxn = index.write_txn().unwrap(); let content = &b"id,age\n1,25\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 1 document. @@ -765,10 +774,10 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.disable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); - assert!(builder.execute(content, |_| ()).is_err()); + assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -792,10 +801,10 @@ mod tests { { "name": "kevin" }, { "name": "benoit" } ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.disable_autogenerate_docids(); builder.update_format(UpdateFormat::Json); - assert!(builder.execute(content, |_| ()).is_err()); + assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -815,9 +824,9 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -833,9 +842,9 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = format!("id,name\n{},updated kevin", kevin_uuid); - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.update_format(UpdateFormat::Csv); - builder.execute(content.as_bytes(), |_| ()).unwrap(); + builder.execute(content.as_bytes(), |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -868,9 +877,9 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -882,9 +891,9 @@ mod tests { // Second we send 1 document without specifying the id. 
let mut wtxn = index.write_txn().unwrap(); let content = &b"name\nnew kevin"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 4 documents now. @@ -904,9 +913,9 @@ mod tests { // First we send 0 documents and only headers. let mut wtxn = index.write_txn().unwrap(); let content = &b"id,name\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is no documents. @@ -930,9 +939,9 @@ mod tests { { "name": "kevina", "id": 21 }, { "name": "benoit" } ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -952,9 +961,9 @@ mod tests { // First we send 0 documents. let mut wtxn = index.write_txn().unwrap(); let content = &b"[]"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is no documents. @@ -978,9 +987,9 @@ mod tests { { "name": "kevina", "id": 21 } { "name": "benoit" } "#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::JsonStream); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -1001,18 +1010,18 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. let content = &b"id,name\nbrume bleue,kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - assert!(builder.execute(content, |_| ()).is_err()); + assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. let content = &b"id,name\n32,kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 1 document now. 
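
The test above depends on document id validation: an id containing a space must make the whole addition fail. The exact rule is not shown in this patch, so the sketch below assumes a common one, ids restricted to ASCII alphanumerics, hyphens and underscores, which is enough to reproduce the observed behaviour:

```rust
// Sketch of document id validation. The rule is an assumption here:
// ids are taken to be non-empty and limited to ASCII alphanumerics,
// hyphens and underscores, so "brume bleue" is rejected while "32"
// is accepted, as in the tests above.

fn validate_document_id(id: &str) -> Result<(), String> {
    let valid = !id.is_empty()
        && id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_');
    if valid {
        Ok(())
    } else {
        Err(format!("invalid document id: {:?}", id))
    }
}

fn main() {
    assert!(validate_document_id("32").is_ok());
    assert!(validate_document_id("brume bleue").is_err());
}
```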
@@ -1036,9 +1045,9 @@ mod tests { { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 1 documents now. diff --git a/src/update/settings.rs b/src/update/settings.rs index dbe951761..17a9da1eb 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -23,6 +23,7 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + update_id: u64, // If a struct field is set to `None` it means that it hasn't been set by the user, // however if it is `Some(None)` it means that the user forced a reset of the setting. @@ -33,7 +34,11 @@ pub struct Settings<'a, 't, 'u, 'i> { } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Settings<'a, 't, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> Settings<'a, 't, 'u, 'i> { Settings { wtxn, index, @@ -49,6 +54,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { displayed_fields: None, faceted_fields: None, criteria: None, + update_id, } } @@ -86,9 +92,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> where - F: Fn(UpdateIndexingStep) + Sync, + F: Fn(UpdateIndexingStep, u64) + Sync { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let update_id = self.update_id; + let cb = |step| cb(step, update_id); // if the settings are set before any document update, we don't need to do anything, and // will set the primary key during the first document addition. if self.index.number_of_documents(&self.wtxn)? == 0 { @@ -118,11 +126,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fields_ids_map.clone())?; // We clear the full database (words-fst, documents ids and documents content). - ClearDocuments::new(self.wtxn, self.index).execute()?; + ClearDocuments::new(self.wtxn, self.index, self.update_id).execute()?; // We index the generated `TransformOutput` which must contain // all the documents with fields in the newly defined searchable order. - let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index); + let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index, self.update_id); indexing_builder.log_every_n = self.log_every_n; indexing_builder.max_nb_chunks = self.max_nb_chunks; indexing_builder.max_memory = self.max_memory; @@ -239,7 +247,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { pub fn execute(mut self, progress_callback: F) -> anyhow::Result<()> where - F: Fn(UpdateIndexingStep) + Sync + F: Fn(UpdateIndexingStep, u64) + Sync { let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; self.update_displayed()?; @@ -276,16 +284,16 @@ mod tests { // First we send 3 documents with ids from 1 to 3. 
let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 1); builder.set_searchable_fields(vec!["name".into()]); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the searchable field is correctly set to "name" only. @@ -305,9 +313,9 @@ mod tests { // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 2); builder.reset_searchable_fields(); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the searchable field have been reset and documents are found now. @@ -331,18 +339,18 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // In the same transaction we change the displayed fields to be only the "age". // We also change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 1); builder.set_displayed_fields(vec!["age".into()]); builder.set_searchable_fields(vec!["name".into()]); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -353,9 +361,9 @@ mod tests { // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 2); builder.reset_searchable_fields(); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields always contains only the "age" field. @@ -375,9 +383,9 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -397,14 +405,14 @@ mod tests { // First we send 3 documents with ids from 1 to 3. 
let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); // In the same transaction we change the displayed fields to be only the age. - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["age".into()]); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to only the "age" field. @@ -415,9 +423,9 @@ mod tests { // We reset the fields ids to become `None`, the default value. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.reset_displayed_fields(); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -436,15 +444,15 @@ mod tests { // Set the faceted fields to be the age. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_faceted_fields(hashmap!{ "age".into() => "integer".into() }); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); // Then index some documents. let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set. @@ -459,9 +467,9 @@ mod tests { // Index a little more documents with new and current facets values. 
let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); builder.update_format(UpdateFormat::Csv); - builder.execute(content, |_| ()).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -480,14 +488,14 @@ mod tests { // Set all the settings except searchable let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); builder.set_faceted_fields(hashmap!{ "age".into() => "integer".into(), "toto".into() => "integer".into(), }); builder.set_criteria(vec!["asc(toto)".to_string()]); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // check the output @@ -500,9 +508,9 @@ mod tests { // We set toto and age as searchable to force reordering of the fields let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 1); builder.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/src/update/update_builder.rs b/src/update/update_builder.rs index 43e3c28ed..8d6eb034d 100644 --- a/src/update/update_builder.rs +++ b/src/update/update_builder.rs @@ -13,10 +13,11 @@ pub struct UpdateBuilder<'a> { pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) update_id: u64, } impl<'a> UpdateBuilder<'a> { - pub fn new() -> UpdateBuilder<'a> { + pub fn new(update_id: u64) -> UpdateBuilder<'a> { UpdateBuilder { log_every_n: None, max_nb_chunks: None, @@ -26,6 +27,7 @@ impl<'a> UpdateBuilder<'a> { chunk_compression_level: None, chunk_fusing_shrink_size: None, thread_pool: None, + update_id, } } @@ -67,7 +69,7 @@ impl<'a> UpdateBuilder<'a> { index: &'i Index, ) -> ClearDocuments<'t, 'u, 'i> { - ClearDocuments::new(wtxn, index) + ClearDocuments::new(wtxn, index, self.update_id) } pub fn delete_documents<'t, 'u, 'i>( @@ -76,7 +78,7 @@ impl<'a> UpdateBuilder<'a> { index: &'i Index, ) -> anyhow::Result> { - DeleteDocuments::new(wtxn, index) + DeleteDocuments::new(wtxn, index, self.update_id) } pub fn index_documents<'t, 'u, 'i>( @@ -85,7 +87,7 @@ impl<'a> UpdateBuilder<'a> { index: &'i Index, ) -> IndexDocuments<'t, 'u, 'i, 'a> { - let mut builder = IndexDocuments::new(wtxn, index); + let mut builder = IndexDocuments::new(wtxn, index, self.update_id); builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; @@ -105,7 +107,7 @@ impl<'a> UpdateBuilder<'a> { index: &'i Index, ) -> Settings<'a, 't, 'u, 'i> { - let mut builder = Settings::new(wtxn, index); + let mut builder = Settings::new(wtxn, index, self.update_id); builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; @@ -125,7 +127,7 @@ impl<'a> UpdateBuilder<'a> { index: &'i Index, ) -> Facets<'t, 'u, 'i> { - let mut builder = Facets::new(wtxn, index); + let mut builder = Facets::new(wtxn, index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = 
self.chunk_compression_level; @@ -134,9 +136,3 @@ impl<'a> UpdateBuilder<'a> { builder } } - -impl Default for UpdateBuilder<'_> { - fn default() -> Self { - Self::new() - } -} From 8f43698a6015a534f21fc5176fb4ae812a39e701 Mon Sep 17 00:00:00 2001 From: mpostma Date: Mon, 1 Feb 2021 14:38:04 +0100 Subject: [PATCH 0456/1889] fix httpui --- http-ui/src/main.rs | 2 +- src/search/facet/facet_condition.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 53c2cb460..a29eb8895 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -382,7 +382,7 @@ async fn main() -> anyhow::Result<()> { }); match result { - Ok(()) => wtxn.commit().map_err(Into::into), + Ok(_) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) } }, diff --git a/src/search/facet/facet_condition.rs b/src/search/facet/facet_condition.rs index 2f0444dce..42c2327a9 100644 --- a/src/search/facet/facet_condition.rs +++ b/src/search/facet/facet_condition.rs @@ -732,13 +732,13 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into(), "timestamp".into() => "integer".into(), }); - builder.execute(|_| ()).unwrap(); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. From e8639517dadc72828218579a1eed3da16c024490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 12 Feb 2021 16:15:09 +0100 Subject: [PATCH 0457/1889] Change the project to become a workspace with milli as a default-member --- .gitignore | 3 - Cargo.lock | 998 ++++++- Cargo.toml | 70 +- http-ui/Cargo.lock | 2530 ----------------- http-ui/Cargo.toml | 2 +- milli/Cargo.toml | 67 + {benches => milli/benches}/search.rs | 0 {src => milli/src}/criterion.rs | 0 {src => milli/src}/external_documents_ids.rs | 0 {src => milli/src}/facet/facet_type.rs | 0 {src => milli/src}/facet/facet_value.rs | 0 {src => milli/src}/facet/mod.rs | 0 {src => milli/src}/facet/value_encoding.rs | 0 {src => milli/src}/fields_ids_map.rs | 0 .../src}/heed_codec/beu32_str_codec.rs | 0 .../heed_codec/bo_roaring_bitmap_codec.rs | 0 .../heed_codec/cbo_roaring_bitmap_codec.rs | 0 .../facet/facet_level_value_f64_codec.rs | 0 .../facet/facet_level_value_i64_codec.rs | 0 .../facet/facet_value_string_codec.rs | 0 .../facet/field_doc_id_facet_f64_codec.rs | 0 .../facet/field_doc_id_facet_i64_codec.rs | 0 .../facet/field_doc_id_facet_string_codec.rs | 0 {src => milli/src}/heed_codec/facet/mod.rs | 0 {src => milli/src}/heed_codec/mod.rs | 0 {src => milli/src}/heed_codec/obkv_codec.rs | 0 .../src}/heed_codec/roaring_bitmap_codec.rs | 0 .../src}/heed_codec/str_str_u8_codec.rs | 0 {src => milli/src}/index.rs | 0 {src => milli/src}/lib.rs | 0 {src => milli/src}/main.rs | 0 {src => milli/src}/mdfs.rs | 0 {src => milli/src}/proximity.rs | 0 {src => milli/src}/query_tokens.rs | 0 .../src}/search/facet/facet_condition.rs | 0 .../src}/search/facet/facet_distribution.rs | 0 {src => milli/src}/search/facet/grammar.pest | 0 {src => milli/src}/search/facet/mod.rs | 0 {src => milli/src}/search/facet/parser.rs | 0 {src => milli/src}/search/mod.rs | 0 {src => milli/src}/subcommand/infos.rs | 0 {src => milli/src}/subcommand/mod.rs | 0 
{src => milli/src}/subcommand/search.rs | 0 .../src}/update/available_documents_ids.rs | 0 {src => milli/src}/update/clear_documents.rs | 0 {src => milli/src}/update/delete_documents.rs | 0 {src => milli/src}/update/facets.rs | 0 .../update/index_documents/merge_function.rs | 0 .../src}/update/index_documents/mod.rs | 0 .../src}/update/index_documents/store.rs | 0 .../src}/update/index_documents/transform.rs | 0 {src => milli/src}/update/mod.rs | 0 {src => milli/src}/update/settings.rs | 0 {src => milli/src}/update/update_builder.rs | 0 {src => milli/src}/update/update_step.rs | 0 {src => milli/src}/update_store.rs | 0 56 files changed, 1053 insertions(+), 2617 deletions(-) delete mode 100644 http-ui/Cargo.lock create mode 100644 milli/Cargo.toml rename {benches => milli/benches}/search.rs (100%) rename {src => milli/src}/criterion.rs (100%) rename {src => milli/src}/external_documents_ids.rs (100%) rename {src => milli/src}/facet/facet_type.rs (100%) rename {src => milli/src}/facet/facet_value.rs (100%) rename {src => milli/src}/facet/mod.rs (100%) rename {src => milli/src}/facet/value_encoding.rs (100%) rename {src => milli/src}/fields_ids_map.rs (100%) rename {src => milli/src}/heed_codec/beu32_str_codec.rs (100%) rename {src => milli/src}/heed_codec/bo_roaring_bitmap_codec.rs (100%) rename {src => milli/src}/heed_codec/cbo_roaring_bitmap_codec.rs (100%) rename {src => milli/src}/heed_codec/facet/facet_level_value_f64_codec.rs (100%) rename {src => milli/src}/heed_codec/facet/facet_level_value_i64_codec.rs (100%) rename {src => milli/src}/heed_codec/facet/facet_value_string_codec.rs (100%) rename {src => milli/src}/heed_codec/facet/field_doc_id_facet_f64_codec.rs (100%) rename {src => milli/src}/heed_codec/facet/field_doc_id_facet_i64_codec.rs (100%) rename {src => milli/src}/heed_codec/facet/field_doc_id_facet_string_codec.rs (100%) rename {src => milli/src}/heed_codec/facet/mod.rs (100%) rename {src => milli/src}/heed_codec/mod.rs (100%) rename {src => milli/src}/heed_codec/obkv_codec.rs (100%) rename {src => milli/src}/heed_codec/roaring_bitmap_codec.rs (100%) rename {src => milli/src}/heed_codec/str_str_u8_codec.rs (100%) rename {src => milli/src}/index.rs (100%) rename {src => milli/src}/lib.rs (100%) rename {src => milli/src}/main.rs (100%) rename {src => milli/src}/mdfs.rs (100%) rename {src => milli/src}/proximity.rs (100%) rename {src => milli/src}/query_tokens.rs (100%) rename {src => milli/src}/search/facet/facet_condition.rs (100%) rename {src => milli/src}/search/facet/facet_distribution.rs (100%) rename {src => milli/src}/search/facet/grammar.pest (100%) rename {src => milli/src}/search/facet/mod.rs (100%) rename {src => milli/src}/search/facet/parser.rs (100%) rename {src => milli/src}/search/mod.rs (100%) rename {src => milli/src}/subcommand/infos.rs (100%) rename {src => milli/src}/subcommand/mod.rs (100%) rename {src => milli/src}/subcommand/search.rs (100%) rename {src => milli/src}/update/available_documents_ids.rs (100%) rename {src => milli/src}/update/clear_documents.rs (100%) rename {src => milli/src}/update/delete_documents.rs (100%) rename {src => milli/src}/update/facets.rs (100%) rename {src => milli/src}/update/index_documents/merge_function.rs (100%) rename {src => milli/src}/update/index_documents/mod.rs (100%) rename {src => milli/src}/update/index_documents/store.rs (100%) rename {src => milli/src}/update/index_documents/transform.rs (100%) rename {src => milli/src}/update/mod.rs (100%) rename {src => milli/src}/update/settings.rs (100%) rename {src => 
milli/src}/update/update_builder.rs (100%) rename {src => milli/src}/update/update_step.rs (100%) rename {src => milli/src}/update_store.rs (100%) diff --git a/.gitignore b/.gitignore index e672f93a2..107b5bb36 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,6 @@ /target /Cargo.lock -# the sub target folder -http-ui/target - # datasets *.csv *.mmdb diff --git a/Cargo.lock b/Cargo.lock index ba477336e..d4a57f446 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -27,6 +27,70 @@ version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1" +[[package]] +name = "arrayvec" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" + +[[package]] +name = "askama" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d298738b6e47e1034e560e5afe63aa488fea34e25ec11b855a76f0d7b8e73134" +dependencies = [ + "askama_derive", + "askama_escape", + "askama_shared", + "mime", + "mime_guess", +] + +[[package]] +name = "askama_derive" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522" +dependencies = [ + "askama_shared", + "proc-macro2", + "syn", +] + +[[package]] +name = "askama_escape" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90c108c1a94380c89d2215d0ac54ce09796823cca0fd91b299cfff3b33e346fb" + +[[package]] +name = "askama_shared" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2582b77e0f3c506ec4838a25fa8a5f97b9bed72bb6d3d272ea1c031d8bd373bc" +dependencies = [ + "askama_escape", + "humansize", + "nom", + "num-traits", + "percent-encoding", + "proc-macro2", + "quote", + "serde", + "syn", + "toml", +] + +[[package]] +name = "askama_warp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96f410ab17fa08f70b5fda07ce1112418642c914864961630808979343ea226" +dependencies = [ + "askama", + "warp", +] + [[package]] name = "atty" version = "0.2.14" @@ -35,7 +99,7 @@ checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ "hermit-abi", "libc", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -44,6 +108,18 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +[[package]] +name = "base64" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" + +[[package]] +name = "base64" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" + [[package]] name = "bincode" version = "1.3.1" @@ -60,6 +136,18 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "bitvec" +version = "0.19.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7ba35e9565969edb811639dbebfe34edc0368e472c5018474c8eb2543397f81" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + [[package]] name = "block-buffer" version = 
"0.7.3" @@ -69,7 +157,16 @@ dependencies = [ "block-padding", "byte-tools", "byteorder", - "generic-array", + "generic-array 0.12.3", +] + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array 0.14.4", ] [[package]] @@ -93,6 +190,16 @@ dependencies = [ "serde", ] +[[package]] +name = "buf_redux" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" +dependencies = [ + "memchr", + "safemem", +] + [[package]] name = "bumpalo" version = "3.4.0" @@ -120,6 +227,18 @@ version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b" +[[package]] +name = "bytes" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38" + +[[package]] +name = "bytes" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" + [[package]] name = "cast" version = "0.2.3" @@ -178,7 +297,7 @@ dependencies = [ "num-integer", "num-traits", "time", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -205,6 +324,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" +[[package]] +name = "cpuid-bool" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" + [[package]] name = "crc32fast" version = "1.2.1" @@ -349,9 +474,24 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" dependencies = [ - "generic-array", + "generic-array 0.12.3", ] +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array 0.14.4", +] + +[[package]] +name = "dtoa" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d7ed2934d741c6b37e33e3832298e8850b53fd2d2bea03873375596c7cea4e" + [[package]] name = "either" version = "1.6.1" @@ -376,6 +516,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "form_urlencoded" version = "1.0.0" @@ -398,6 +544,123 @@ version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d79238883cf0307100b90aba4a755d8051a3182305dfe7f649a1e9dc0517006f" +[[package]] +name = "fuchsia-zircon" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" +dependencies = [ + "bitflags", + "fuchsia-zircon-sys", +] + +[[package]] +name = "fuchsia-zircon-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" + +[[package]] +name = "funty" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" + +[[package]] +name = "futures" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" + +[[package]] +name = "futures-executor" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" + +[[package]] +name = "futures-macro" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" +dependencies = [ + "proc-macro-hack", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" + +[[package]] +name = "futures-task" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" +dependencies = [ + "once_cell", +] + +[[package]] +name = "futures-util" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite 0.2.4", + "pin-utils", + "proc-macro-hack", + "proc-macro-nested", + "slab", +] + [[package]] name = "fxhash" version = "0.2.1" @@ -416,6 +679,16 @@ dependencies = [ "typenum", ] +[[package]] +name = "generic-array" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.1.16" @@ -458,6 +731,26 @@ dependencies = [ "zstd", ] +[[package]] +name = "h2" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e4728fd124914ad25e99e3d15a9361a879f6620f63cb56bbb08f95abb97a535" +dependencies = [ + "bytes 0.5.6", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", + 
"tracing-futures", +] + [[package]] name = "half" version = "1.7.1" @@ -480,6 +773,31 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" +[[package]] +name = "headers" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62689dc57c7456e69712607ffcbd0aa1dfcccf9af73727e9b25bc1825375cac3" +dependencies = [ + "base64 0.13.0", + "bitflags", + "bytes 1.0.1", + "headers-core", + "http", + "mime", + "sha-1 0.8.2", + "time", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http", +] + [[package]] name = "heck" version = "0.3.2" @@ -502,6 +820,7 @@ dependencies = [ "lmdb-rkv-sys", "once_cell", "page_size", + "serde", "synchronoise", "url", "zerocopy", @@ -535,12 +854,105 @@ dependencies = [ "libc", ] +[[package]] +name = "http" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7245cd7449cc792608c3c8a9eaf69bd4eabbabf802713748fd739c98b82f0747" +dependencies = [ + "bytes 1.0.1", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13d5ff830006f7646652e057693569bfe0d51760c0085a071769d142a205111b" +dependencies = [ + "bytes 0.5.6", + "http", +] + +[[package]] +name = "http-ui" +version = "0.1.0" +dependencies = [ + "anyhow", + "askama", + "askama_warp", + "byte-unit", + "bytes 0.5.6", + "either", + "flate2", + "fst", + "futures", + "grenad", + "heed", + "log", + "meilisearch-tokenizer", + "memmap", + "milli", + "once_cell", + "rayon", + "serde", + "serde_json", + "stderrlog", + "structopt", + "tempfile", + "tokio", + "warp", +] + +[[package]] +name = "httparse" +version = "1.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "615caabe2c3160b313d52ccc905335f4ed5f10881dd63dc5699d47e90be85691" + +[[package]] +name = "httpdate" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" + [[package]] name = "human_format" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0" +[[package]] +name = "humansize" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6cab2627acfc432780848602f3f558f7e9dd427352224b0d9324025796d2a5e" + +[[package]] +name = "hyper" +version = "0.13.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a6f157065790a3ed2f88679250419b5cdd96e714a0d65f7797fd337186e96bb" +dependencies = [ + "bytes 0.5.6", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project 1.0.5", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + [[package]] name = "idna" version = "0.2.0" @@ -562,6 +974,24 @@ dependencies = [ "hashbrown 0.9.1", ] +[[package]] +name = "input_buffer" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19a8a95243d5a0398cae618ec29477c6e3cb631152be5c19481f80bc71559754" +dependencies = [ + "bytes 0.5.6", +] + +[[package]] +name = "iovec" +version = 
"0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" +dependencies = [ + "libc", +] + [[package]] name = "itertools" version = "0.9.0" @@ -631,6 +1061,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -646,6 +1086,19 @@ dependencies = [ "fst", ] +[[package]] +name = "lexical-core" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616" +dependencies = [ + "arrayvec", + "bitflags", + "cfg-if 0.1.10", + "ryu", + "static_assertions", +] + [[package]] name = "libc" version = "0.2.82" @@ -719,7 +1172,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" dependencies = [ "libc", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -779,6 +1232,22 @@ dependencies = [ "uuid", ] +[[package]] +name = "mime" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" + +[[package]] +name = "mime_guess" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "miniz_oxide" version = "0.4.3" @@ -789,6 +1258,88 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mio" +version = "0.6.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" +dependencies = [ + "cfg-if 0.1.10", + "fuchsia-zircon", + "fuchsia-zircon-sys", + "iovec", + "kernel32-sys", + "libc", + "log", + "miow 0.2.2", + "net2", + "slab", + "winapi 0.2.8", +] + +[[package]] +name = "mio-named-pipes" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" +dependencies = [ + "log", + "mio", + "miow 0.3.6", + "winapi 0.3.9", +] + +[[package]] +name = "mio-uds" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" +dependencies = [ + "iovec", + "libc", + "mio", +] + +[[package]] +name = "miow" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" +dependencies = [ + "kernel32-sys", + "net2", + "winapi 0.2.8", + "ws2_32-sys", +] + +[[package]] +name = "miow" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a33c1b55807fbed163481b5ba66db4b2fa6cde694a5027be10fb724206c5897" +dependencies = [ + "socket2", + "winapi 0.3.9", +] + +[[package]] +name = "multipart" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050aeedc89243f5347c3e237e3e13dc76fbe4ae3742a57b94dc14f69acf76d4" +dependencies = [ + "buf_redux", + "httparse", + "log", + "mime", + "mime_guess", + "quick-error", + "rand 
0.7.3", + "safemem", + "tempfile", + "twoway", +] + [[package]] name = "near-proximity" version = "0.1.0" @@ -797,6 +1348,17 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "net2" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" +dependencies = [ + "cfg-if 0.1.10", + "libc", + "winapi 0.3.9", +] + [[package]] name = "nix" version = "0.19.1" @@ -809,6 +1371,18 @@ dependencies = [ "libc", ] +[[package]] +name = "nom" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab6f70b46d6325aa300f1c7bb3d470127dfc27806d8ea6bf294ee0ce643ce2b1" +dependencies = [ + "bitvec", + "lexical-core", + "memchr", + "version_check", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -862,6 +1436,12 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + [[package]] name = "ordered-float" version = "2.0.1" @@ -878,7 +1458,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" dependencies = [ "libc", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -935,7 +1515,7 @@ checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" dependencies = [ "maplit", "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "sha-1", + "sha-1 0.8.2", ] [[package]] @@ -976,6 +1556,64 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15" +dependencies = [ + "pin-project-internal 0.4.27", +] + +[[package]] +name = "pin-project" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96fa8ebb90271c4477f144354485b8068bd8f6b78b428b01ba892ca26caf0b63" +dependencies = [ + "pin-project-internal 1.0.5", +] + +[[package]] +name = "pin-project-internal" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65ad2ae56b6abe3a1ee25f15ee605bacadb9a764edaba9c2bf4103800d4a1895" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-internal" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758669ae3558c6f74bd2a18b41f7ac0b5a195aea6639d6a9b5e5d1ad5ba24c0b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b" + +[[package]] +name = "pin-project-lite" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + [[package]] name = "pkg-config" version = "0.3.19" @@ -1024,6 +1662,18 @@ dependencies = [ 
"version_check", ] +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + +[[package]] +name = "proc-macro-nested" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" + [[package]] name = "proc-macro2" version = "1.0.24" @@ -1033,6 +1683,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + [[package]] name = "quote" version = "1.0.8" @@ -1042,6 +1698,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radium" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" + [[package]] name = "rand" version = "0.7.3" @@ -1200,7 +1862,7 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" dependencies = [ - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1233,6 +1895,12 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" +[[package]] +name = "safemem" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" + [[package]] name = "same-file" version = "1.0.6" @@ -1242,6 +1910,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scoped-tls" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" + [[package]] name = "scopeguard" version = "1.1.0" @@ -1305,16 +1979,50 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ec5d77e2d4c73717816afac02670d5c4f534ea95ed430442cad02e7a6e32c97" +dependencies = [ + "dtoa", + "itoa", + "serde", + "url", +] + [[package]] name = "sha-1" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" dependencies = [ - "block-buffer", - "digest", + "block-buffer 0.7.3", + "digest 0.8.1", "fake-simd", - "opaque-debug", + "opaque-debug 0.2.3", +] + +[[package]] +name = "sha-1" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4b312c3731e3fe78a185e6b9b911a7aa715b8e31cce117975219aab2acf285d" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpuid-bool", + "digest 0.9.0", + "opaque-debug 0.3.0", +] + +[[package]] +name = "signal-hook-registry" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6" +dependencies = [ + "libc", ] [[package]] @@ -1323,6 +2031,12 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" +[[package]] +name = "slab" +version = "0.4.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" + [[package]] name = "slice-group-by" version = "0.2.6" @@ -1351,6 +2065,23 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "98d3306e84bf86710d6cd8b4c9c3b721d5454cc91a603180f8f8cd06cfd317b4" +[[package]] +name = "socket2" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "stderrlog" version = "0.5.1" @@ -1420,6 +2151,12 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "tap" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36474e732d1affd3a6ed582781b3683df3d0563714c59c39591e8ff707cf078e" + [[package]] name = "tempfile" version = "3.2.0" @@ -1431,7 +2168,7 @@ dependencies = [ "rand 0.8.2", "redox_syscall", "remove_dir_all", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1441,7 +2178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" dependencies = [ "libc", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1479,7 +2216,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" dependencies = [ "libc", - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1507,6 +2244,148 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" +[[package]] +name = "tokio" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6703a273949a90131b290be1fe7b039d0fc884aa1935860dfcbe056f28cd8092" +dependencies = [ + "bytes 0.5.6", + "fnv", + "futures-core", + "iovec", + "lazy_static", + "libc", + "memchr", + "mio", + "mio-named-pipes", + "mio-uds", + "num_cpus", + "pin-project-lite 0.1.11", + "signal-hook-registry", + "slab", + "tokio-macros", + "winapi 0.3.9", +] + +[[package]] +name = "tokio-macros" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d9e878ad426ca286e4dcae09cbd4e1973a7f8987d97570e2469703dd7f5720c" +dependencies = [ + "futures-util", + "log", + "pin-project 0.4.27", + "tokio", + "tungstenite", +] + +[[package]] +name = "tokio-util" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8242891f2b6cbef26a2d7e8605133c2c554cd35b3e4948ea892d6d68436499" +dependencies = [ + "bytes 0.5.6", + "futures-core", + "futures-sink", + "log", + "pin-project-lite 0.1.11", + "tokio", +] + +[[package]] +name = "toml" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +dependencies = [ + 
"serde", +] + +[[package]] +name = "tower-service" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" + +[[package]] +name = "tracing" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d40a22fd029e33300d8d89a5cc8ffce18bb7c587662f54629e94c9de5487f3" +dependencies = [ + "cfg-if 1.0.0", + "log", + "pin-project-lite 0.2.4", + "tracing-core", +] + +[[package]] +name = "tracing-core" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "tracing-futures" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c" +dependencies = [ + "pin-project 0.4.27", + "tracing", +] + +[[package]] +name = "try-lock" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" + +[[package]] +name = "tungstenite" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0308d80d86700c5878b9ef6321f020f29b1bb9d5ff3cab25e75e23f3a492a23" +dependencies = [ + "base64 0.12.3", + "byteorder", + "bytes 0.5.6", + "http", + "httparse", + "input_buffer", + "log", + "rand 0.7.3", + "sha-1 0.9.3", + "url", + "utf-8", +] + +[[package]] +name = "twoway" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" +dependencies = [ + "memchr", +] + [[package]] name = "typenum" version = "1.12.0" @@ -1519,6 +2398,15 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.4" @@ -1567,6 +2455,18 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9232eb53352b4442e40d7900465dfc534e8cb2dc8f18656fcb2ac16112b5593" + +[[package]] +name = "utf-8" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" + [[package]] name = "utf8-width" version = "0.1.4" @@ -1595,10 +2495,48 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d" dependencies = [ "same-file", - "winapi", + "winapi 0.3.9", "winapi-util", ] +[[package]] +name = "want" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +dependencies = [ + "log", + "try-lock", +] + +[[package]] +name = "warp" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f41be6df54c97904af01aa23e613d4521eed7ab23537cede692d4058f6449407" +dependencies = [ + "bytes 
0.5.6", + "futures", + "headers", + "http", + "hyper", + "log", + "mime", + "mime_guess", + "multipart", + "pin-project 0.4.27", + "scoped-tls", + "serde", + "serde_json", + "serde_urlencoded", + "tokio", + "tokio-tungstenite", + "tower-service", + "tracing", + "tracing-futures", + "urlencoding", +] + [[package]] name = "wasi" version = "0.9.0+wasi-snapshot-preview1" @@ -1684,6 +2622,12 @@ dependencies = [ "hashbrown 0.7.2", ] +[[package]] +name = "winapi" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + [[package]] name = "winapi" version = "0.3.9" @@ -1694,6 +2638,12 @@ dependencies = [ "winapi-x86_64-pc-windows-gnu", ] +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" @@ -1706,7 +2656,7 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" dependencies = [ - "winapi", + "winapi 0.3.9", ] [[package]] @@ -1715,6 +2665,22 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "ws2_32-sys" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + +[[package]] +name = "wyz" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" + [[package]] name = "zerocopy" version = "0.3.0" diff --git a/Cargo.toml b/Cargo.toml index d9ad19e78..d04cae871 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,70 +1,6 @@ -[package] -name = "milli" -version = "0.1.0" -authors = ["Kerollmops "] -edition = "2018" - -[dependencies] -anyhow = "1.0.28" -bstr = "0.2.13" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -byteorder = "1.3.4" -crossbeam-channel = "0.5.0" -csv = "1.1.3" -either = "1.6.1" -flate2 = "1.0.17" -fst = "0.4.5" -fxhash = "0.2.1" -grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } -heed = { version = "0.10.5", default-features = false, features = ["lmdb", "sync-read-txn"] } -human_format = "1.0.3" -jemallocator = "0.3.2" -levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } -linked-hash-map = "0.5.3" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } -memmap = "0.7.0" -near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } -num-traits = "0.2.14" -obkv = "0.1.0" -once_cell = "1.4.0" -ordered-float = "2.0.0" -rayon = "1.3.1" -regex = "1.4.2" -ringtail = "0.3.0" -roaring = "0.6.4" -serde = { version = "1.0", features = ["derive"] } -serde_json = { version = "1.0.59", features = ["preserve_order"] } -slice-group-by = "0.2.6" -smallstr = { version = "0.2.0", features = ["serde"] } -smallvec = "1.4.0" -structopt = { version = "0.3.14", default-features = false, features = ["wrap_help"] } -tempfile = "3.1.0" -uuid = { version = "0.8.1", features = ["v4"] } - -# facet filter parser -pest = { git = 
"https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } -pest_derive = "2.1.0" - -# documents words self-join -itertools = "0.9.0" - -# logging -log = "0.4.11" -stderrlog = "0.5.0" - -[dev-dependencies] -criterion = "0.3.3" -maplit = "1.0.2" - -[build-dependencies] -fst = "0.4.5" - -[features] -default = [] - -[[bench]] -name = "search" -harness = false +[workspace] +members = ["milli", "http-ui"] +default-members = ["milli"] [profile.release] debug = true diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock deleted file mode 100644 index 29e04d714..000000000 --- a/http-ui/Cargo.lock +++ /dev/null @@ -1,2530 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -[[package]] -name = "adler" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" - -[[package]] -name = "ahash" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" - -[[package]] -name = "aho-corasick" -version = "0.7.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" -dependencies = [ - "memchr", -] - -[[package]] -name = "anyhow" -version = "1.0.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf8dcb5b4bbaa28653b647d8c77bd4ed40183b48882e130c1f1ffb73de069fd7" - -[[package]] -name = "askama" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a6e7ebd44d0047fd48206c83c5cd3214acc7b9d87f001da170145c47ef7d12" -dependencies = [ - "askama_derive", - "askama_escape", - "askama_shared", - "mime", - "mime_guess", -] - -[[package]] -name = "askama_derive" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1d7169690c4f56343dcd821ab834972a22570a2662a19a84fd7775d5e1c3881" -dependencies = [ - "askama_shared", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "askama_escape" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90c108c1a94380c89d2215d0ac54ce09796823cca0fd91b299cfff3b33e346fb" - -[[package]] -name = "askama_shared" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62fc272363345c8cdc030e4c259d9d028237f8b057dc9bb327772a257bde6bb5" -dependencies = [ - "askama_escape", - "humansize", - "nom", - "num-traits", - "percent-encoding", - "proc-macro2", - "quote", - "serde", - "syn", - "toml", -] - -[[package]] -name = "askama_warp" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c96f410ab17fa08f70b5fda07ce1112418642c914864961630808979343ea226" -dependencies = [ - "askama", - "warp", -] - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "autocfg" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" - -[[package]] -name = "autocfg" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" - -[[package]] -name = "base64" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" - -[[package]] -name = "bincode" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30d3a39baa26f9651f17b375061f3233dde33424a8b72b0dbe93a68a0bc896d" -dependencies = [ - "byteorder", - "serde", -] - -[[package]] -name = "bitflags" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" - -[[package]] -name = "block-buffer" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" -dependencies = [ - "block-padding", - "byte-tools", - "byteorder", - "generic-array 0.12.3", -] - -[[package]] -name = "block-buffer" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" -dependencies = [ - "generic-array 0.14.4", -] - -[[package]] -name = "block-padding" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" -dependencies = [ - "byte-tools", -] - -[[package]] -name = "bstr" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "473fc6b38233f9af7baa94fb5852dca389e3d95b8e21c8e3719301462c5d9faf" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata", - "serde", -] - -[[package]] -name = "buf_redux" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" -dependencies = [ - "memchr", - "safemem", -] - -[[package]] -name = "byte-tools" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" - -[[package]] -name = "byte-unit" -version = "4.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c8758c32833faaae35b24a73d332e62d0528e89076ae841c63940e37008b153" -dependencies = [ - "utf8-width", -] - -[[package]] -name = "byteorder" -version = "1.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" - -[[package]] -name = "bytes" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38" - -[[package]] -name = "cc" -version = "1.0.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed67cbde08356238e75fc4656be4749481eeffb09e19f320a25237d5221c985d" -dependencies = [ - "jobserver", -] - -[[package]] -name = "cedarwood" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d" -dependencies = [ - "smallvec", -] - -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - -[[package]] -name = "cfg-if" 
-version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "character_converter" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" -dependencies = [ - "bincode", -] - -[[package]] -name = "chrono" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" -dependencies = [ - "libc", - "num-integer", - "num-traits", - "time", - "winapi 0.3.9", -] - -[[package]] -name = "clap" -version = "2.33.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" -dependencies = [ - "bitflags", - "term_size", - "textwrap", - "unicode-width", -] - -[[package]] -name = "cloudabi" -version = "0.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" -dependencies = [ - "bitflags", -] - -[[package]] -name = "const_fn" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c478836e029dcef17fb47c89023448c64f781a046e0300e257ad8225ae59afab" - -[[package]] -name = "cow-utils" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" - -[[package]] -name = "cpuid-bool" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" - -[[package]] -name = "crc32fast" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.0", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-epoch", - "crossbeam-utils 0.8.0", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0f606a85340376eef0d6d8fec399e6d4a544d648386c6645eb6d0653b27d9f" -dependencies = [ - "cfg-if 1.0.0", - "const_fn", - "crossbeam-utils 0.8.0", - "lazy_static", - "memoffset", - "scopeguard", -] - -[[package]] -name = "crossbeam-queue" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" -dependencies = [ - "crossbeam-utils 0.6.6", -] - -[[package]] -name = "crossbeam-utils" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6" -dependencies = [ - "cfg-if 0.1.10", - "lazy_static", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec91540d98355f690a86367e566ecad2e9e579f230230eb7c21398372be73ea5" -dependencies = [ - "autocfg 1.0.1", - "cfg-if 1.0.0", - "const_fn", - "lazy_static", -] - -[[package]] -name = "csv" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc4666154fd004af3fd6f1da2e81a96fd5a81927fe8ddb6ecc79e2aa6e138b54" -dependencies = [ - "bstr", - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" -dependencies = [ - "memchr", -] - -[[package]] -name = "deunicode" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80115a2dfde04491e181c2440a39e4be26e52d9ca4e92bed213f65b94e0b8db1" - -[[package]] -name = "digest" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" -dependencies = [ - "generic-array 0.12.3", -] - -[[package]] -name = "digest" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" -dependencies = [ - "generic-array 0.14.4", -] - -[[package]] -name = "dtoa" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134951f4028bdadb9b84baf4232681efbf277da25144b9b0ad65df75946c422b" - -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - -[[package]] -name = "fake-simd" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" - -[[package]] -name = "flate2" -version = "1.0.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7411863d55df97a419aa64cb4d2f167103ea9d767e2c54a1868b7ac3f6b47129" -dependencies = [ - "cfg-if 1.0.0", - "crc32fast", - "libc", - "miniz_oxide", -] - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "form_urlencoded" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece68d15c92e84fa4f19d3780f1294e5ca82a78a6d515f1efaabcc144688be00" -dependencies = [ - "matches", - "percent-encoding", -] - -[[package]] -name = "fs_extra" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" - -[[package]] -name = "fst" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d79238883cf0307100b90aba4a755d8051a3182305dfe7f649a1e9dc0517006f" - -[[package]] -name = "fuchsia-cprng" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" - -[[package]] -name = "fuchsia-zircon" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" -dependencies = [ - 
"bitflags", - "fuchsia-zircon-sys", -] - -[[package]] -name = "fuchsia-zircon-sys" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" - -[[package]] -name = "futures" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95314d38584ffbfda215621d723e0a3906f032e03ae5551e650058dac83d4797" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0448174b01148032eed37ac4aed28963aaaa8cfa93569a08e5b479bbc6c2c151" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18eaa56102984bed2c88ea39026cff3ce3b4c7f508ca970cedf2450ea10d4e46" - -[[package]] -name = "futures-executor" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5f8e0c9258abaea85e78ebdda17ef9666d390e987f006be6080dfe354b708cb" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e1798854a4727ff944a7b12aa999f58ce7aa81db80d2dfaaf2ba06f065ddd2b" - -[[package]] -name = "futures-macro" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e36fccf3fc58563b4a14d265027c627c3b665d7fed489427e88e7cc929559efe" -dependencies = [ - "proc-macro-hack", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "futures-sink" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e3ca3f17d6e8804ae5d3df7a7d35b2b3a6fe89dac84b31872720fc3060a0b11" - -[[package]] -name = "futures-task" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96d502af37186c4fef99453df03e374683f8a1eec9dcc1e66b3b82dc8278ce3c" -dependencies = [ - "once_cell", -] - -[[package]] -name = "futures-util" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abcb44342f62e6f3e8ac427b8aa815f724fd705dfad060b18ac7866c15bb8e34" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project 1.0.1", - "pin-utils", - "proc-macro-hack", - "proc-macro-nested", - "slab", -] - -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] -name = "generic-array" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec" -dependencies = [ - "typenum", -] - -[[package]] -name = "generic-array" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.1.15" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "fc587bc0ec293155d5bfa6b9891ec18a1e330c234f896ea47fbada4cadbe47e6" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - -[[package]] -name = "glob" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" - -[[package]] -name = "grenad" -version = "0.1.0" -source = "git+https://github.com/Kerollmops/grenad.git?rev=3adcb26#3adcb267dcbc590c7da10eb5f887a254865b3dbe" -dependencies = [ - "byteorder", - "flate2", - "log", - "nix", - "snap", - "tempfile", - "zstd", -] - -[[package]] -name = "h2" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e4728fd124914ad25e99e3d15a9361a879f6620f63cb56bbb08f95abb97a535" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", - "tracing-futures", -] - -[[package]] -name = "hashbrown" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" -dependencies = [ - "ahash", - "autocfg 1.0.1", -] - -[[package]] -name = "hashbrown" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" - -[[package]] -name = "headers" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed18eb2459bf1a09ad2d6b1547840c3e5e62882fa09b9a6a20b1de8e3228848f" -dependencies = [ - "base64", - "bitflags", - "bytes", - "headers-core", - "http", - "mime", - "sha-1 0.8.2", - "time", -] - -[[package]] -name = "headers-core" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" -dependencies = [ - "http", -] - -[[package]] -name = "heck" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" -dependencies = [ - "unicode-segmentation", -] - -[[package]] -name = "heed" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eaba3b0edee6a9cd551f24caca2027922b03259f7203a15f0b86af4c1348fcc" -dependencies = [ - "byteorder", - "heed-traits", - "heed-types", - "libc", - "lmdb-rkv-sys", - "once_cell", - "page_size", - "synchronoise", - "url", - "zerocopy", -] - -[[package]] -name = "heed-traits" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b328f6260a7e51bdb0ca6b68e6ea27ee3d11fba5dee930896ee7ff6ad5fc072c" - -[[package]] -name = "heed-types" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e628efb08beaee58355f80dc4adba79d644940ea9eef60175ea17dc218aab405" -dependencies = [ - "bincode", - "heed-traits", - "serde", - "serde_json", - "zerocopy", -] - -[[package]] -name = "hermit-abi" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aca5565f760fb5b220e499d72710ed156fdb74e631659e99377d9ebfbd13ae8" -dependencies = [ - "libc", -] - -[[package]] -name = "http" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d569972648b2c512421b5f2a405ad6ac9666547189d0c5477a3f200f3e02f9" -dependencies = [ - 
"bytes", - "fnv", - "itoa", -] - -[[package]] -name = "http-body" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13d5ff830006f7646652e057693569bfe0d51760c0085a071769d142a205111b" -dependencies = [ - "bytes", - "http", -] - -[[package]] -name = "http-ui" -version = "0.1.0" -dependencies = [ - "anyhow", - "askama", - "askama_warp", - "byte-unit", - "bytes", - "either", - "flate2", - "fst", - "futures", - "grenad", - "heed", - "log", - "meilisearch-tokenizer", - "memmap", - "milli", - "once_cell", - "rayon", - "serde", - "serde_json", - "stderrlog", - "structopt", - "tempfile", - "tokio", - "warp", -] - -[[package]] -name = "httparse" -version = "1.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd179ae861f0c2e53da70d892f5f3029f9594be0c41dc5269cd371691b1dc2f9" - -[[package]] -name = "httpdate" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" - -[[package]] -name = "human_format" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0" - -[[package]] -name = "humansize" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6cab2627acfc432780848602f3f558f7e9dd427352224b0d9324025796d2a5e" - -[[package]] -name = "hyper" -version = "0.13.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ad767baac13b44d4529fcf58ba2cd0995e36e7b435bc5b039de6f47e880dbf" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "httparse", - "httpdate", - "itoa", - "pin-project 1.0.1", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "idna" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" -dependencies = [ - "matches", - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "indexmap" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" -dependencies = [ - "autocfg 1.0.1", - "hashbrown 0.9.1", -] - -[[package]] -name = "input_buffer" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19a8a95243d5a0398cae618ec29477c6e3cb631152be5c19481f80bc71559754" -dependencies = [ - "bytes", -] - -[[package]] -name = "iovec" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" -dependencies = [ - "libc", -] - -[[package]] -name = "itertools" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6" - -[[package]] -name = "jemalloc-sys" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" -dependencies = [ - "cc", - 
"fs_extra", - "libc", -] - -[[package]] -name = "jemallocator" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" -dependencies = [ - "jemalloc-sys", - "libc", -] - -[[package]] -name = "jieba-rs" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a" -dependencies = [ - "cedarwood", - "fxhash", - "hashbrown 0.9.1", - "lazy_static", - "phf", - "phf_codegen", - "regex", -] - -[[package]] -name = "jobserver" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c71313ebb9439f74b00d9d2dcec36440beaf57a6aa0623068441dd7cd81a7f2" -dependencies = [ - "libc", -] - -[[package]] -name = "kernel32-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "levenshtein_automata" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f44db4199cdb049b494a92d105acbfa43c25b3925e33803923ba9580b7bc9e1a" -dependencies = [ - "fst", -] - -[[package]] -name = "libc" -version = "0.2.80" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" - -[[package]] -name = "linked-hash-map" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8dd5a6d5999d9907cda8ed67bbd137d3af8085216c2ac62de5be860bd41f304a" - -[[package]] -name = "lmdb-rkv-sys" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b27470ac25167b3afdfb6af8fcd3bc1be67de50ffbdaf4073378cfded6ae24a5" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "log" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fabed175da42fed1fa0746b0ea71f412aa9d35e76e95e59b192c64b9dc2bf8b" -dependencies = [ - "cfg-if 0.1.10", -] - -[[package]] -name = "maplit" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" - -[[package]] -name = "matches" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" - -[[package]] -name = "meilisearch-tokenizer" -version = "0.1.1" -source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#147b6154b1b34cb8f5da2df6a416b7da191bc850" -dependencies = [ - "character_converter", - "cow-utils", - "deunicode", - "fst", - "jieba-rs", - "once_cell", - "slice-group-by", - "unicode-segmentation", - "whatlang", -] - -[[package]] -name = "memchr" -version = "2.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" - -[[package]] -name = "memmap" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "memoffset" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" -dependencies = [ - "autocfg 1.0.1", -] - -[[package]] -name = "milli" -version = "0.1.0" -dependencies = [ - "anyhow", - "bstr", - "byte-unit", - "byteorder", - "crossbeam-channel", - "csv", - "either", - "flate2", - "fst", - "fxhash", - "grenad", - "heed", - "human_format", - "itertools", - "jemallocator", - "levenshtein_automata", - "linked-hash-map", - "log", - "meilisearch-tokenizer", - "memmap", - "near-proximity", - "num-traits", - "obkv", - "once_cell", - "ordered-float", - "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", - "pest_derive", - "rayon", - "regex", - "ringtail", - "roaring", - "serde", - "serde_json", - "slice-group-by", - "smallstr", - "smallvec", - "stderrlog", - "structopt", - "tempfile", - "uuid", -] - -[[package]] -name = "mime" -version = "0.3.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" - -[[package]] -name = "mime_guess" -version = "2.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" -dependencies = [ - "mime", - "unicase", -] - -[[package]] -name = "miniz_oxide" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2d26ec3309788e423cfbf68ad1800f061638098d76a83681af979dc4eda19d" -dependencies = [ - "adler", - "autocfg 1.0.1", -] - -[[package]] -name = "mio" -version = "0.6.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fce347092656428bc8eaf6201042cb551b8d67855af7374542a92a0fbfcac430" -dependencies = [ - "cfg-if 0.1.10", - "fuchsia-zircon", - "fuchsia-zircon-sys", - "iovec", - "kernel32-sys", - "libc", - "log", - "miow 0.2.1", - "net2", - "slab", - "winapi 0.2.8", -] - -[[package]] -name = "mio-named-pipes" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" -dependencies = [ - "log", - "mio", - "miow 0.3.5", - "winapi 0.3.9", -] - -[[package]] -name = "mio-uds" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" -dependencies = [ - "iovec", - "libc", - "mio", -] - -[[package]] -name = "miow" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c1f2f3b1cf331de6896aabf6e9d55dca90356cc9960cca7eaaf408a355ae919" -dependencies = [ - "kernel32-sys", - "net2", - "winapi 0.2.8", - "ws2_32-sys", -] - -[[package]] -name = "miow" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07b88fb9795d4d36d62a012dfbf49a8f5cf12751f36d31a9dbe66d528e58979e" -dependencies = [ - "socket2", - "winapi 0.3.9", -] - -[[package]] -name = "multipart" -version = "0.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8209c33c951f07387a8497841122fc6f712165e3f9bda3e6be4645b58188f676" -dependencies = [ - "buf_redux", - "httparse", - "log", - "mime", - "mime_guess", - "quick-error", - "rand 0.6.5", - "safemem", - 
"tempfile", - "twoway", -] - -[[package]] -name = "near-proximity" -version = "0.1.0" -source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=6608205#66082058537f6fe7709adc4690048d62f3c0e9b7" -dependencies = [ - "tinyvec 1.0.1", -] - -[[package]] -name = "net2" -version = "0.2.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ebc3ec692ed7c9a255596c67808dee269f64655d8baf7b4f0638e51ba1d6853" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "nix" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85db2feff6bf70ebc3a4793191517d5f0331100a2f10f9bf93b5e5214f32b7b7" -dependencies = [ - "bitflags", - "cc", - "cfg-if 0.1.10", - "libc", -] - -[[package]] -name = "nom" -version = "5.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffb4262d26ed83a1c0a33a38fe2bb15797329c85770da05e6b828ddb782627af" -dependencies = [ - "memchr", - "version_check", -] - -[[package]] -name = "num-integer" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" -dependencies = [ - "autocfg 1.0.1", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" -dependencies = [ - "autocfg 1.0.1", -] - -[[package]] -name = "num_cpus" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "obkv" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8" - -[[package]] -name = "once_cell" -version = "1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0" - -[[package]] -name = "opaque-debug" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" - -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - -[[package]] -name = "ordered-float" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" -dependencies = [ - "num-traits", -] - -[[package]] -name = "page_size" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "percent-encoding" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" - -[[package]] -name = "pest" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" -dependencies = [ - "ucd-trie", -] - -[[package]] -name = "pest" -version = "2.1.3" -source = 
"git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" -dependencies = [ - "ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" -dependencies = [ - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "pest_generator", -] - -[[package]] -name = "pest_generator" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" -dependencies = [ - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "pest_meta", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pest_meta" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" -dependencies = [ - "maplit", - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "sha-1 0.8.2", -] - -[[package]] -name = "phf" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_codegen" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" -dependencies = [ - "phf_shared", - "rand 0.7.3", -] - -[[package]] -name = "phf_shared" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project" -version = "0.4.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15" -dependencies = [ - "pin-project-internal 0.4.27", -] - -[[package]] -name = "pin-project" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee41d838744f60d959d7074e3afb6b35c7456d0f61cad38a24e35e6553f73841" -dependencies = [ - "pin-project-internal 1.0.1", -] - -[[package]] -name = "pin-project-internal" -version = "0.4.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65ad2ae56b6abe3a1ee25f15ee605bacadb9a764edaba9c2bf4103800d4a1895" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-internal" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81a4ffa594b66bff340084d4081df649a7dc049ac8d7fc458d8e628bfbbb2f86" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "pin-project-lite" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkg-config" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" - -[[package]] -name = "ppv-lite86" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eba180dafb9038b050a4c280019bbedf9f2467b61e5d892dcad585bb57aadc5a" - -[[package]] -name = "proc-macro2" -version = "1.0.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" -dependencies = [ - "unicode-xid", -] - -[[package]] -name = "quick-error" -version = "1.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" - -[[package]] -name = "quote" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" -dependencies = [ - "autocfg 0.1.7", - "libc", - "rand_chacha 0.1.1", - "rand_core 0.4.2", - "rand_hc 0.1.0", - "rand_isaac", - "rand_jitter", - "rand_os", - "rand_pcg 0.1.2", - "rand_xorshift", - "winapi 0.3.9", -] - -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom", - "libc", - "rand_chacha 0.2.2", - "rand_core 0.5.1", - "rand_hc 0.2.0", - "rand_pcg 0.2.1", -] - -[[package]] -name = "rand_chacha" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef" -dependencies = [ - "autocfg 0.1.7", - "rand_core 0.3.1", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", -] - -[[package]] -name = "rand_core" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b" -dependencies = [ - "rand_core 0.4.2", -] - -[[package]] -name = "rand_core" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_hc" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rand_isaac" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rand_jitter" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b" -dependencies = [ - "libc", - "rand_core 0.4.2", - "winapi 0.3.9", -] - -[[package]] -name = "rand_os" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071" -dependencies = [ - "cloudabi", - "fuchsia-cprng", - "libc", - "rand_core 0.4.2", - "rdrand", - "winapi 0.3.9", -] - -[[package]] -name = "rand_pcg" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44" -dependencies = [ - "autocfg 0.1.7", - "rand_core 0.4.2", -] - -[[package]] -name = "rand_pcg" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rand_xorshift" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "rayon" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" -dependencies = [ - "autocfg 1.0.1", - "crossbeam-deque", - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-utils 0.8.0", - "lazy_static", - "num_cpus", -] - -[[package]] -name = "rdrand" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2" -dependencies = [ - "rand_core 0.3.1", -] - -[[package]] -name = "redox_syscall" -version = "0.1.57" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" - -[[package]] -name = "regex" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", - "thread_local", -] - -[[package]] -name = "regex-automata" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" -dependencies = [ - "byteorder", -] - -[[package]] -name = "regex-syntax" -version = "0.6.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189" - -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "ringtail" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21215c1b9d8f7832b433255bd9eea3e2779aa55b21b2f8e13aad62c74749b237" - -[[package]] -name = "roaring" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d60b41c8f25d07cecab125cb46ebbf234fc055effc61ca2392a3ef4f9422304" -dependencies = [ - "byteorder", -] - -[[package]] -name = "ryu" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" - -[[package]] -name = "safemem" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" - -[[package]] -name = "scoped-tls" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "serde" -version = "1.0.117" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b88fa983de7720629c9387e9f517353ed404164b1e482c970a90c1a4aaf7dc1a" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.117" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbd1ae72adb44aab48f325a02444a5fc079349a8d804c1fc922aed3f7454c74e" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "serde_json" -version = "1.0.59" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcac07dbffa1c65e7f816ab9eba78eb142c6d44410f4eeba1e26e4f5dfa56b95" -dependencies = [ - "indexmap", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "serde_urlencoded" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ec5d77e2d4c73717816afac02670d5c4f534ea95ed430442cad02e7a6e32c97" -dependencies = [ - "dtoa", - "itoa", - "serde", - "url", -] - -[[package]] -name = "sha-1" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" 
-dependencies = [ - "block-buffer 0.7.3", - "digest 0.8.1", - "fake-simd", - "opaque-debug 0.2.3", -] - -[[package]] -name = "sha-1" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce3cdf1b5e620a498ee6f2a171885ac7e22f0e12089ec4b3d22b84921792507c" -dependencies = [ - "block-buffer 0.9.0", - "cfg-if 1.0.0", - "cpuid-bool", - "digest 0.9.0", - "opaque-debug 0.3.0", -] - -[[package]] -name = "signal-hook-registry" -version = "1.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce32ea0c6c56d5eacaeb814fbed9960547021d3edd010ded1425f180536b20ab" -dependencies = [ - "libc", -] - -[[package]] -name = "siphasher" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" - -[[package]] -name = "slab" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" - -[[package]] -name = "slice-group-by" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb" - -[[package]] -name = "smallstr" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e922794d168678729ffc7e07182721a14219c65814e66e91b839a272fe5ae4f" -dependencies = [ - "serde", - "smallvec", -] - -[[package]] -name = "smallvec" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbee7696b84bbf3d89a1c2eccff0850e3047ed46bfcd2e92c29a2d074d57e252" - -[[package]] -name = "snap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da73c8f77aebc0e40c300b93f0a5f1bece7a248a36eee287d4e095f35c7b7d6e" - -[[package]] -name = "socket2" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1fa70dc5c8104ec096f4fe7ede7a221d35ae13dcd19ba1ad9a81d2cab9a1c44" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "redox_syscall", - "winapi 0.3.9", -] - -[[package]] -name = "stderrlog" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b02f316286ae558d83acc93dd81eaba096e746987a7961d4a9ae026842bae67f" -dependencies = [ - "atty", - "chrono", - "log", - "termcolor", - "thread_local", -] - -[[package]] -name = "structopt" -version = "0.3.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "126d630294ec449fae0b16f964e35bf3c74f940da9dca17ee9b905f7b3112eb8" -dependencies = [ - "clap", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e51c492f9e23a220534971ff5afc14037289de430e3c83f9daf6a1b6ae91e8" -dependencies = [ - "heck", - "proc-macro-error", - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "syn" -version = "1.0.48" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc371affeffc477f42a221a1e4297aedcea33d47d19b61455588bd9d8f6b19ac" -dependencies = [ - "proc-macro2", - "quote", - "unicode-xid", -] - -[[package]] -name = "synchronoise" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d717ed0efc9d39ab3b642a096bc369a3e02a38a51c41845d7fe31bdad1d6eaeb" -dependencies = [ - "crossbeam-queue", -] - -[[package]] -name = 
"synstructure" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" -dependencies = [ - "proc-macro2", - "quote", - "syn", - "unicode-xid", -] - -[[package]] -name = "tempfile" -version = "3.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "rand 0.7.3", - "redox_syscall", - "remove_dir_all", - "winapi 0.3.9", -] - -[[package]] -name = "term_size" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "termcolor" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "term_size", - "unicode-width", -] - -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "time" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi 0.3.9", -] - -[[package]] -name = "tinyvec" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "238ce071d267c5710f9d31451efec16c5ee22de34df17cc05e56cbc92e967117" - -[[package]] -name = "tinyvec" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b78a366903f506d2ad52ca8dc552102ffdd3e937ba8a227f024dc1d1eae28575" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" - -[[package]] -name = "tokio" -version = "0.2.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d34ca54d84bf2b5b4d7d31e901a8464f7b60ac145a284fba25ceb801f2ddccd" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "iovec", - "lazy_static", - "libc", - "memchr", - "mio", - "mio-named-pipes", - "mio-uds", - "num_cpus", - "pin-project-lite", - "signal-hook-registry", - "slab", - "tokio-macros", - "winapi 0.3.9", -] - -[[package]] -name = "tokio-macros" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0c3acc6aa564495a0f2e1d59fab677cd7f81a19994cfc7f3ad0e64301560389" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tokio-tungstenite" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d9e878ad426ca286e4dcae09cbd4e1973a7f8987d97570e2469703dd7f5720c" -dependencies = [ - "futures-util", - "log", - "pin-project 0.4.27", - "tokio", - "tungstenite", -] - -[[package]] -name = "tokio-util" -version = 
"0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be8242891f2b6cbef26a2d7e8605133c2c554cd35b3e4948ea892d6d68436499" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "log", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "toml" -version = "0.5.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75cf45bb0bef80604d001caaec0d09da99611b3c0fd39d3080468875cdb65645" -dependencies = [ - "serde", -] - -[[package]] -name = "tower-service" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e987b6bf443f4b5b3b6f38704195592cca41c5bb7aedd3c3693c7081f8289860" - -[[package]] -name = "tracing" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0987850db3733619253fe60e17cb59b82d37c7e6c0236bb81e4d6b87c879f27" -dependencies = [ - "cfg-if 0.1.10", - "log", - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "tracing-futures" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c" -dependencies = [ - "pin-project 0.4.27", - "tracing", -] - -[[package]] -name = "try-lock" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" - -[[package]] -name = "tungstenite" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0308d80d86700c5878b9ef6321f020f29b1bb9d5ff3cab25e75e23f3a492a23" -dependencies = [ - "base64", - "byteorder", - "bytes", - "http", - "httparse", - "input_buffer", - "log", - "rand 0.7.3", - "sha-1 0.9.2", - "url", - "utf-8", -] - -[[package]] -name = "twoway" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" -dependencies = [ - "memchr", -] - -[[package]] -name = "typenum" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" - -[[package]] -name = "ucd-trie" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" - -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] - -[[package]] -name = "unicode-bidi" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" -dependencies = [ - "matches", -] - -[[package]] -name = "unicode-normalization" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb19cf769fa8c6a80a162df694621ebeb4dafb606470b2b2fce0be40a98a977" -dependencies = [ - "tinyvec 0.3.4", -] - -[[package]] -name = "unicode-segmentation" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" - -[[package]] -name = "unicode-width" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" - -[[package]] -name = "unicode-xid" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" - -[[package]] -name = "url" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" -dependencies = [ - "form_urlencoded", - "idna", - "matches", - "percent-encoding", -] - -[[package]] -name = "urlencoding" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9232eb53352b4442e40d7900465dfc534e8cb2dc8f18656fcb2ac16112b5593" - -[[package]] -name = "utf-8" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" - -[[package]] -name = "utf8-width" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9071ac216321a4470a69fb2b28cfc68dcd1a39acd877c8be8e014df6772d8efa" - -[[package]] -name = "uuid" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fde2f6a4bea1d6e007c4ad38c6839fa71cbb63b6dbf5b595aa38dc9b1093c11" -dependencies = [ - "rand 0.7.3", -] - -[[package]] -name = "version_check" -version = "0.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" - -[[package]] -name = "want" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" -dependencies = [ - "log", - "try-lock", -] - -[[package]] -name = "warp" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f41be6df54c97904af01aa23e613d4521eed7ab23537cede692d4058f6449407" -dependencies = [ - "bytes", - "futures", - "headers", - "http", - "hyper", - "log", - "mime", - "mime_guess", - "multipart", - "pin-project 0.4.27", - "scoped-tls", - "serde", - "serde_json", - "serde_urlencoded", - "tokio", - "tokio-tungstenite", - "tower-service", - "tracing", - "tracing-futures", - "urlencoding", -] - -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - -[[package]] -name = "whatlang" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075" -dependencies = [ - "hashbrown 0.7.2", -] - -[[package]] -name = "winapi" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-build" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "ws2_32-sys" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "zerocopy" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" -dependencies = [ - "byteorder", - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" -dependencies = [ - "proc-macro2", - "syn", - "synstructure", -] - -[[package]] -name = "zstd" -version = "0.5.3+zstd.1.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b32eaf771efa709e8308605bbf9319bf485dc1503179ec0469b611937c0cd8" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "2.0.5+zstd.1.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cfb642e0d27f64729a639c52db457e0ae906e7bc6f5fe8f5c453230400f1055" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "1.4.17+zstd.1.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b89249644df056b522696b1bb9e7c18c87e8ffa3e2f0dc3b0155875d6498f01b" -dependencies = [ - "cc", - "glob", - "itertools", - "libc", -] diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 92932cb00..26adbfd56 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -12,7 +12,7 @@ grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = "0.10.5" meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } memmap = "0.7.0" -milli = { path = ".." 
} +milli = { path = "../milli" } once_cell = "1.4.1" rayon = "1.5.0" structopt = { version = "0.3.14", default-features = false, features = ["wrap_help"] } diff --git a/milli/Cargo.toml b/milli/Cargo.toml new file mode 100644 index 000000000..b2db8222a --- /dev/null +++ b/milli/Cargo.toml @@ -0,0 +1,67 @@ +[package] +name = "milli" +version = "0.1.0" +authors = ["Kerollmops "] +edition = "2018" + +[dependencies] +anyhow = "1.0.28" +bstr = "0.2.13" +byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } +byteorder = "1.3.4" +crossbeam-channel = "0.5.0" +csv = "1.1.3" +either = "1.6.1" +flate2 = "1.0.17" +fst = "0.4.5" +fxhash = "0.2.1" +grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } +heed = { version = "0.10.5", default-features = false, features = ["lmdb", "sync-read-txn"] } +human_format = "1.0.3" +jemallocator = "0.3.2" +levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } +linked-hash-map = "0.5.3" +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } +memmap = "0.7.0" +near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } +num-traits = "0.2.14" +obkv = "0.1.0" +once_cell = "1.4.0" +ordered-float = "2.0.0" +rayon = "1.3.1" +regex = "1.4.2" +ringtail = "0.3.0" +roaring = "0.6.4" +serde = { version = "1.0", features = ["derive"] } +serde_json = { version = "1.0.59", features = ["preserve_order"] } +slice-group-by = "0.2.6" +smallstr = { version = "0.2.0", features = ["serde"] } +smallvec = "1.4.0" +structopt = { version = "0.3.14", default-features = false, features = ["wrap_help"] } +tempfile = "3.1.0" +uuid = { version = "0.8.1", features = ["v4"] } + +# facet filter parser +pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } +pest_derive = "2.1.0" + +# documents words self-join +itertools = "0.9.0" + +# logging +log = "0.4.11" +stderrlog = "0.5.0" + +[dev-dependencies] +criterion = "0.3.3" +maplit = "1.0.2" + +[build-dependencies] +fst = "0.4.5" + +[features] +default = [] + +[[bench]] +name = "search" +harness = false diff --git a/benches/search.rs b/milli/benches/search.rs similarity index 100% rename from benches/search.rs rename to milli/benches/search.rs diff --git a/src/criterion.rs b/milli/src/criterion.rs similarity index 100% rename from src/criterion.rs rename to milli/src/criterion.rs diff --git a/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs similarity index 100% rename from src/external_documents_ids.rs rename to milli/src/external_documents_ids.rs diff --git a/src/facet/facet_type.rs b/milli/src/facet/facet_type.rs similarity index 100% rename from src/facet/facet_type.rs rename to milli/src/facet/facet_type.rs diff --git a/src/facet/facet_value.rs b/milli/src/facet/facet_value.rs similarity index 100% rename from src/facet/facet_value.rs rename to milli/src/facet/facet_value.rs diff --git a/src/facet/mod.rs b/milli/src/facet/mod.rs similarity index 100% rename from src/facet/mod.rs rename to milli/src/facet/mod.rs diff --git a/src/facet/value_encoding.rs b/milli/src/facet/value_encoding.rs similarity index 100% rename from src/facet/value_encoding.rs rename to milli/src/facet/value_encoding.rs diff --git a/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs similarity index 100% rename from src/fields_ids_map.rs rename to milli/src/fields_ids_map.rs diff --git a/src/heed_codec/beu32_str_codec.rs 
b/milli/src/heed_codec/beu32_str_codec.rs similarity index 100% rename from src/heed_codec/beu32_str_codec.rs rename to milli/src/heed_codec/beu32_str_codec.rs diff --git a/src/heed_codec/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/bo_roaring_bitmap_codec.rs similarity index 100% rename from src/heed_codec/bo_roaring_bitmap_codec.rs rename to milli/src/heed_codec/bo_roaring_bitmap_codec.rs diff --git a/src/heed_codec/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/cbo_roaring_bitmap_codec.rs similarity index 100% rename from src/heed_codec/cbo_roaring_bitmap_codec.rs rename to milli/src/heed_codec/cbo_roaring_bitmap_codec.rs diff --git a/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs similarity index 100% rename from src/heed_codec/facet/facet_level_value_f64_codec.rs rename to milli/src/heed_codec/facet/facet_level_value_f64_codec.rs diff --git a/src/heed_codec/facet/facet_level_value_i64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_i64_codec.rs similarity index 100% rename from src/heed_codec/facet/facet_level_value_i64_codec.rs rename to milli/src/heed_codec/facet/facet_level_value_i64_codec.rs diff --git a/src/heed_codec/facet/facet_value_string_codec.rs b/milli/src/heed_codec/facet/facet_value_string_codec.rs similarity index 100% rename from src/heed_codec/facet/facet_value_string_codec.rs rename to milli/src/heed_codec/facet/facet_value_string_codec.rs diff --git a/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs similarity index 100% rename from src/heed_codec/facet/field_doc_id_facet_f64_codec.rs rename to milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs diff --git a/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs similarity index 100% rename from src/heed_codec/facet/field_doc_id_facet_i64_codec.rs rename to milli/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs diff --git a/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs similarity index 100% rename from src/heed_codec/facet/field_doc_id_facet_string_codec.rs rename to milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs diff --git a/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs similarity index 100% rename from src/heed_codec/facet/mod.rs rename to milli/src/heed_codec/facet/mod.rs diff --git a/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs similarity index 100% rename from src/heed_codec/mod.rs rename to milli/src/heed_codec/mod.rs diff --git a/src/heed_codec/obkv_codec.rs b/milli/src/heed_codec/obkv_codec.rs similarity index 100% rename from src/heed_codec/obkv_codec.rs rename to milli/src/heed_codec/obkv_codec.rs diff --git a/src/heed_codec/roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap_codec.rs similarity index 100% rename from src/heed_codec/roaring_bitmap_codec.rs rename to milli/src/heed_codec/roaring_bitmap_codec.rs diff --git a/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs similarity index 100% rename from src/heed_codec/str_str_u8_codec.rs rename to milli/src/heed_codec/str_str_u8_codec.rs diff --git a/src/index.rs b/milli/src/index.rs similarity index 100% rename from src/index.rs rename to milli/src/index.rs diff --git a/src/lib.rs b/milli/src/lib.rs similarity index 100% rename from src/lib.rs rename to milli/src/lib.rs diff --git 
a/src/main.rs b/milli/src/main.rs similarity index 100% rename from src/main.rs rename to milli/src/main.rs diff --git a/src/mdfs.rs b/milli/src/mdfs.rs similarity index 100% rename from src/mdfs.rs rename to milli/src/mdfs.rs diff --git a/src/proximity.rs b/milli/src/proximity.rs similarity index 100% rename from src/proximity.rs rename to milli/src/proximity.rs diff --git a/src/query_tokens.rs b/milli/src/query_tokens.rs similarity index 100% rename from src/query_tokens.rs rename to milli/src/query_tokens.rs diff --git a/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs similarity index 100% rename from src/search/facet/facet_condition.rs rename to milli/src/search/facet/facet_condition.rs diff --git a/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs similarity index 100% rename from src/search/facet/facet_distribution.rs rename to milli/src/search/facet/facet_distribution.rs diff --git a/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest similarity index 100% rename from src/search/facet/grammar.pest rename to milli/src/search/facet/grammar.pest diff --git a/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs similarity index 100% rename from src/search/facet/mod.rs rename to milli/src/search/facet/mod.rs diff --git a/src/search/facet/parser.rs b/milli/src/search/facet/parser.rs similarity index 100% rename from src/search/facet/parser.rs rename to milli/src/search/facet/parser.rs diff --git a/src/search/mod.rs b/milli/src/search/mod.rs similarity index 100% rename from src/search/mod.rs rename to milli/src/search/mod.rs diff --git a/src/subcommand/infos.rs b/milli/src/subcommand/infos.rs similarity index 100% rename from src/subcommand/infos.rs rename to milli/src/subcommand/infos.rs diff --git a/src/subcommand/mod.rs b/milli/src/subcommand/mod.rs similarity index 100% rename from src/subcommand/mod.rs rename to milli/src/subcommand/mod.rs diff --git a/src/subcommand/search.rs b/milli/src/subcommand/search.rs similarity index 100% rename from src/subcommand/search.rs rename to milli/src/subcommand/search.rs diff --git a/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs similarity index 100% rename from src/update/available_documents_ids.rs rename to milli/src/update/available_documents_ids.rs diff --git a/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs similarity index 100% rename from src/update/clear_documents.rs rename to milli/src/update/clear_documents.rs diff --git a/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs similarity index 100% rename from src/update/delete_documents.rs rename to milli/src/update/delete_documents.rs diff --git a/src/update/facets.rs b/milli/src/update/facets.rs similarity index 100% rename from src/update/facets.rs rename to milli/src/update/facets.rs diff --git a/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs similarity index 100% rename from src/update/index_documents/merge_function.rs rename to milli/src/update/index_documents/merge_function.rs diff --git a/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs similarity index 100% rename from src/update/index_documents/mod.rs rename to milli/src/update/index_documents/mod.rs diff --git a/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs similarity index 100% rename from src/update/index_documents/store.rs rename to 
milli/src/update/index_documents/store.rs diff --git a/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs similarity index 100% rename from src/update/index_documents/transform.rs rename to milli/src/update/index_documents/transform.rs diff --git a/src/update/mod.rs b/milli/src/update/mod.rs similarity index 100% rename from src/update/mod.rs rename to milli/src/update/mod.rs diff --git a/src/update/settings.rs b/milli/src/update/settings.rs similarity index 100% rename from src/update/settings.rs rename to milli/src/update/settings.rs diff --git a/src/update/update_builder.rs b/milli/src/update/update_builder.rs similarity index 100% rename from src/update/update_builder.rs rename to milli/src/update/update_builder.rs diff --git a/src/update/update_step.rs b/milli/src/update/update_step.rs similarity index 100% rename from src/update/update_step.rs rename to milli/src/update/update_step.rs diff --git a/src/update_store.rs b/milli/src/update_store.rs similarity index 100% rename from src/update_store.rs rename to milli/src/update_store.rs From d8f3421608954af6d1de25d79a9ff60e013b267d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 14 Feb 2021 18:32:46 +0100 Subject: [PATCH 0458/1889] Update the dependencies and remove the unused ones --- Cargo.lock | 216 +++++++++++++++++++++-------------------------- milli/Cargo.toml | 4 - 2 files changed, 98 insertions(+), 122 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4a57f446..5e5acf63f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,9 +180,9 @@ dependencies = [ [[package]] name = "bstr" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "473fc6b38233f9af7baa94fb5852dca389e3d95b8e21c8e3719301462c5d9faf" +checksum = "a40b47ad93e1a5404e6c18dec46b628214fee441c70f4ab5d6942142cc268a3d" dependencies = [ "lazy_static", "memchr", @@ -202,9 +202,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.4.0" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820" +checksum = "099e596ef14349721d9016f6b80dd3419ea1bf289ab9b44df8e4dfd3a005d5d9" [[package]] name = "byte-tools" @@ -341,16 +341,16 @@ dependencies = [ [[package]] name = "criterion" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70daa7ceec6cf143990669a04c7df13391d55fb27bd4079d252fca774ba244d8" +checksum = "ab327ed7354547cc2ef43cbe20ef68b988e70b4b593cbd66a2a61733123a3d23" dependencies = [ "atty", "cast", "clap", "criterion-plot", "csv", - "itertools", + "itertools 0.10.0", "lazy_static", "num-traits", "oorandom", @@ -372,7 +372,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e022feadec601fba1649cfa83586381a4ad31c6bf3a9ab7d408118b05dd9889d" dependencies = [ "cast", - "itertools", + "itertools 0.9.0", ] [[package]] @@ -506,9 +506,9 @@ checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" [[package]] name = "flate2" -version = "1.0.19" +version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7411863d55df97a419aa64cb4d2f167103ea9d767e2c54a1868b7ac3f6b47129" +checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" dependencies = [ "cfg-if 1.0.0", "crc32fast", @@ -532,12 +532,6 @@ dependencies = [ "percent-encoding", ] -[[package]] -name = "fs_extra" -version = "1.2.0" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" - [[package]] name = "fst" version = "0.4.5" @@ -562,9 +556,9 @@ checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" [[package]] name = "funty" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" +checksum = "1847abb9cb65d566acd5942e94aea9c8f547ad02c98e1649326fc0e8910b8b1e" [[package]] name = "futures" @@ -708,7 +702,7 @@ checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8" dependencies = [ "cfg-if 1.0.0", "libc", - "wasi 0.10.1+wasi-snapshot-preview1", + "wasi 0.10.2+wasi-snapshot-preview1", ] [[package]] @@ -955,9 +949,9 @@ dependencies = [ [[package]] name = "idna" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9" +checksum = "de910d521f7cc3135c4de8db1cb910e0b5ed1dc6f57c381cd07e8e661ce10094" dependencies = [ "matches", "unicode-bidi", @@ -1001,33 +995,21 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" -[[package]] -name = "jemalloc-sys" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" -dependencies = [ - "cc", - "fs_extra", - "libc", -] - -[[package]] -name = "jemallocator" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" -dependencies = [ - "jemalloc-sys", - "libc", -] - [[package]] name = "jieba-rs" version = "0.6.2" @@ -1054,9 +1036,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.46" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf3d7383929f7c9c7c2d0fa596f325832df98c3704f2c60553080f7127a58175" +checksum = "5cfb73131c35423a367daf8cbd24100af0d077668c8c2943f0e7dd775fef0f65" dependencies = [ "wasm-bindgen", ] @@ -1088,22 +1070,22 @@ dependencies = [ [[package]] name = "lexical-core" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db65c6da02e61f55dae90a0ae427b2a5f6b3e8db09f58d10efab23af92592616" +checksum = "21f866863575d0e1d654fbeeabdc927292fdf862873dc3c96c6f753357e13374" dependencies = [ "arrayvec", "bitflags", - "cfg-if 0.1.10", + "cfg-if 1.0.0", "ryu", "static_assertions", ] [[package]] name = "libc" -version = "0.2.82" +version = "0.2.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89203f3fba0a3795506acaad8ebce3c80c0af93f994d5a1d7a0b1eeb23271929" +checksum = "b7282d924be3275cec7f6756ff4121987bc6481325397dde6ba3e7802b1a8b1c" [[package]] name = "linked-hash-map" @@ -1124,11 +1106,11 @@ dependencies = [ [[package]] name = "log" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fcf3805d4480bb5b86070dcfeb9e2cb2ebc148adb753c5cca5f884d1d65a42b2" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", ] [[package]] @@ -1146,7 +1128,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "meilisearch-tokenizer" version = "0.1.1" -source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#147b6154b1b34cb8f5da2df6a416b7da191bc850" +source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#31ba3ff4a15501f12b7d37ac64ddce7c35a9757c" dependencies = [ "character_converter", "cow-utils", @@ -1202,15 +1184,13 @@ dependencies = [ "grenad", "heed", "human_format", - "itertools", - "jemallocator", + "itertools 0.9.0", "levenshtein_automata", "linked-hash-map", "log", "maplit", "meilisearch-tokenizer", "memmap", - "near-proximity", "num-traits", "obkv", "once_cell", @@ -1219,11 +1199,9 @@ dependencies = [ "pest_derive", "rayon", "regex", - "ringtail", "roaring", "serde", "serde_json", - "slice-group-by", "smallstr", "smallvec", "stderrlog", @@ -1340,14 +1318,6 @@ dependencies = [ "twoway", ] -[[package]] -name = "near-proximity" -version = "0.1.0" -source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=6608205#66082058537f6fe7709adc4690048d62f3c0e9b7" -dependencies = [ - "tinyvec", -] - [[package]] name = "net2" version = "0.2.37" @@ -1444,9 +1414,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "ordered-float" -version = "2.0.1" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dacdec97876ef3ede8c50efc429220641a0b11ba0048b4b0c357bccbc47c5204" +checksum = "766f840da25490628d8e63e529cd21c014f6600c6b8517add12a6fa6167a6218" dependencies = [ "num-traits", ] @@ -1622,16 +1592,32 @@ checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" [[package]] name = "plotters" -version = "0.2.15" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d1685fbe7beba33de0330629da9d955ac75bd54f33d7b79f9a895590124f6bb" +checksum = "45ca0ae5f169d0917a7c7f5a9c1a3d3d9598f18f529dd2b8373ed988efea307a" dependencies = [ - "js-sys", "num-traits", + "plotters-backend", + "plotters-svg", "wasm-bindgen", "web-sys", ] +[[package]] +name = "plotters-backend" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b07fffcddc1cb3a1de753caa4e4df03b79922ba43cf882acc1bdd7e8df9f4590" + +[[package]] +name = "plotters-svg" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b38a02e23bd9604b842a812063aec4ef702b57989c37b655254bb61c471ad211" +dependencies = [ + "plotters-backend", +] + [[package]] name = "ppv-lite86" version = "0.2.10" @@ -1691,9 +1677,9 @@ checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" [[package]] name = "quote" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" dependencies = [ "proc-macro2", ] @@ -1720,13 +1706,13 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18519b42a40024d661e1714153e9ad0c3de27cd495760ceb09710920f1098b1e" +checksum = 
"0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" dependencies = [ "libc", "rand_chacha 0.3.0", - "rand_core 0.6.1", + "rand_core 0.6.2", "rand_hc 0.3.0", ] @@ -1747,7 +1733,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d" dependencies = [ "ppv-lite86", - "rand_core 0.6.1", + "rand_core 0.6.2", ] [[package]] @@ -1761,9 +1747,9 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c026d7df8b298d90ccbbc5190bd04d85e159eaf5576caeacf8741da93ccbd2e5" +checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" dependencies = [ "getrandom 0.2.2", ] @@ -1783,7 +1769,7 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" dependencies = [ - "rand_core 0.6.1", + "rand_core 0.6.2", ] [[package]] @@ -1865,12 +1851,6 @@ dependencies = [ "winapi 0.3.9", ] -[[package]] -name = "ringtail" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21215c1b9d8f7832b433255bd9eea3e2779aa55b21b2f8e13aad62c74749b237" - [[package]] name = "roaring" version = "0.6.4" @@ -1939,9 +1919,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.120" +version = "1.0.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "166b2349061381baf54a58e4b13c89369feb0ef2eaa57198899e2312aac30aab" +checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" dependencies = [ "serde_derive", ] @@ -1958,9 +1938,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.120" +version = "1.0.123" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca2a8cb5805ce9e3b95435e3765b7b553cecc762d938d409434338386cb5775" +checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" dependencies = [ "proc-macro2", "quote", @@ -1969,9 +1949,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.61" +version = "1.0.62" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" +checksum = "ea1c6153794552ea7cf7cf63b1231a25de00ec90db326ba6264440fa08e31486" dependencies = [ "indexmap", "itoa", @@ -2061,9 +2041,9 @@ checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" [[package]] name = "snap" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98d3306e84bf86710d6cd8b4c9c3b721d5454cc91a603180f8f8cd06cfd317b4" +checksum = "dc725476a1398f0480d56cd0ad381f6f32acf2642704456f8f59a35df464b59a" [[package]] name = "socket2" @@ -2121,9 +2101,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.58" +version = "1.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc60a3d73ea6594cd712d830cc1f0390fd71542d8c8cd24e70cc54cdfd5e05d5" +checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081" dependencies = [ "proc-macro2", "quote", @@ -2153,9 +2133,9 @@ dependencies = [ [[package]] name = "tap" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"36474e732d1affd3a6ed582781b3683df3d0563714c59c39591e8ff707cf078e" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tempfile" @@ -2165,7 +2145,7 @@ checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", "libc", - "rand 0.8.2", + "rand 0.8.3", "redox_syscall", "remove_dir_all", "winapi 0.3.9", @@ -2231,9 +2211,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf8dbc19eb42fba10e8feaaec282fb50e2c14b2726d6301dbfeed0f73306a6f" +checksum = "317cca572a0e89c3ce0ca1f1bdc9369547fe318a683418e42ac8f59d14701023" dependencies = [ "tinyvec_macros", ] @@ -2418,9 +2398,9 @@ dependencies = [ [[package]] name = "unicode-normalization" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a13e63ab62dbe32aeee58d1c5408d35c36c392bba5d9d3142287219721afe606" +checksum = "07fbfce1c8a97d547e8b5334978438d9d6ec8c20e38f56d4a4374d181493eaef" dependencies = [ "tinyvec", ] @@ -2545,15 +2525,15 @@ checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" [[package]] name = "wasi" -version = "0.10.1+wasi-snapshot-preview1" +version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93c6c3420963c5c64bca373b25e77acb562081b9bb4dd5bb864187742186cea9" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasm-bindgen" -version = "0.2.69" +version = "0.2.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cd364751395ca0f68cafb17666eee36b63077fb5ecd972bbcd74c90c4bf736e" +checksum = "55c0f7123de74f0dab9b7d00fd614e7b19349cd1e2f5252bbe9b1754b59433be" dependencies = [ "cfg-if 1.0.0", "wasm-bindgen-macro", @@ -2561,9 +2541,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.69" +version = "0.2.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1114f89ab1f4106e5b55e688b828c0ab0ea593a1ea7c094b141b14cbaaec2d62" +checksum = "7bc45447f0d4573f3d65720f636bbcc3dd6ce920ed704670118650bcd47764c7" dependencies = [ "bumpalo", "lazy_static", @@ -2576,9 +2556,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.69" +version = "0.2.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6ac8995ead1f084a8dea1e65f194d0973800c7f571f6edd70adf06ecf77084" +checksum = "3b8853882eef39593ad4174dd26fc9865a64e84026d223f63bb2c42affcbba2c" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2586,9 +2566,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.69" +version = "0.2.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5a48c72f299d80557c7c62e37e7225369ecc0c963964059509fbafe917c7549" +checksum = "4133b5e7f2a531fa413b3a1695e925038a05a71cf67e87dafa295cb645a01385" dependencies = [ "proc-macro2", "quote", @@ -2599,15 +2579,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.69" +version = "0.2.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e7811dd7f9398f14cc76efd356f98f03aa30419dea46aa810d71e819fc97158" +checksum = "dd4945e4943ae02d15c13962b38a5b1e81eadd4b71214eee75af64a4d6a4fd64" [[package]] name = "web-sys" -version = "0.3.46" +version = "0.3.47" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "222b1ef9334f92a21d3fb53dc3fd80f30836959a90f9274a626d7e06315ba3c3" +checksum = "c40dc691fc48003eba817c38da7113c15698142da971298003cac3ef175680b3" dependencies = [ "js-sys", "wasm-bindgen", @@ -2729,6 +2709,6 @@ checksum = "a1e6e8778706838f43f771d80d37787cb2fe06dafe89dd3aebaf6721b9eaec81" dependencies = [ "cc", "glob", - "itertools", + "itertools 0.9.0", "libc", ] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b2db8222a..f92980589 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,23 +18,19 @@ fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = { version = "0.10.5", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" -jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.3" meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } memmap = "0.7.0" -near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } num-traits = "0.2.14" obkv = "0.1.0" once_cell = "1.4.0" ordered-float = "2.0.0" rayon = "1.3.1" regex = "1.4.2" -ringtail = "0.3.0" roaring = "0.6.4" serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0.59", features = ["preserve_order"] } -slice-group-by = "0.2.6" smallstr = { version = "0.2.0", features = ["serde"] } smallvec = "1.4.0" structopt = { version = "0.3.14", default-features = false, features = ["wrap_help"] } From fecf3d6fc1df7d6077d723ed1d62142a0baf214d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 14 Feb 2021 18:55:15 +0100 Subject: [PATCH 0459/1889] Move the command lines helpers into different crates --- Cargo.lock | 66 +++++++++++++++++-- Cargo.toml | 2 +- http-ui/Cargo.toml | 3 + infos/Cargo.toml | 17 +++++ .../subcommand/infos.rs => infos/src/main.rs | 37 +++++++---- milli/Cargo.toml | 3 - milli/src/lib.rs | 3 +- milli/src/main.rs | 22 ------- milli/src/subcommand/mod.rs | 2 - search/Cargo.toml | 16 +++++ .../search.rs => search/src/main.rs | 19 +++++- 11 files changed, 142 insertions(+), 48 deletions(-) create mode 100644 infos/Cargo.toml rename milli/src/subcommand/infos.rs => infos/src/main.rs (97%) delete mode 100644 milli/src/main.rs delete mode 100644 milli/src/subcommand/mod.rs create mode 100644 search/Cargo.toml rename milli/src/subcommand/search.rs => search/src/main.rs (87%) diff --git a/Cargo.lock b/Cargo.lock index 5e5acf63f..a6a178510 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -532,6 +532,12 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs_extra" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + [[package]] name = "fst" version = "0.4.5" @@ -556,9 +562,9 @@ checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" [[package]] name = "funty" -version = "1.2.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1847abb9cb65d566acd5942e94aea9c8f547ad02c98e1649326fc0e8910b8b1e" +checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" [[package]] name = "futures" @@ -881,6 +887,7 @@ dependencies = [ "either", "flate2", "fst", + "funty", "futures", "grenad", "heed", @@ -968,6 +975,22 @@ dependencies = [ "hashbrown 0.9.1", ] +[[package]] +name = "infos" +version = "0.1.0" 
+dependencies = [ + "anyhow", + "byte-unit", + "csv", + "heed", + "jemallocator", + "milli", + "roaring", + "serde_json", + "stderrlog", + "structopt", +] + [[package]] name = "input_buffer" version = "0.3.1" @@ -1010,6 +1033,27 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" +[[package]] +name = "jemalloc-sys" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" +dependencies = [ + "cc", + "fs_extra", + "libc", +] + +[[package]] +name = "jemallocator" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" +dependencies = [ + "jemalloc-sys", + "libc", +] + [[package]] name = "jieba-rs" version = "0.6.2" @@ -1172,7 +1216,6 @@ version = "0.1.0" dependencies = [ "anyhow", "bstr", - "byte-unit", "byteorder", "criterion", "crossbeam-channel", @@ -1204,8 +1247,6 @@ dependencies = [ "serde_json", "smallstr", "smallvec", - "stderrlog", - "structopt", "tempfile", "uuid", ] @@ -1902,6 +1943,21 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "search" +version = "0.1.0" +dependencies = [ + "anyhow", + "byte-unit", + "heed", + "jemallocator", + "log", + "milli", + "serde_json", + "stderrlog", + "structopt", +] + [[package]] name = "semver" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index d04cae871..16a5ab8d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["milli", "http-ui"] +members = ["milli", "http-ui", "infos", "search"] default-members = ["milli"] [profile.release] diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 26adbfd56..1326abfc5 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -34,3 +34,6 @@ warp = "0.2.2" log = "0.4.11" stderrlog = "0.5.0" fst = "0.4.5" + +# Temporary fix for bitvec, remove once fixed. 
(https://github.com/bitvecto-rs/bitvec/issues/105) +funty = "=1.1.0" diff --git a/infos/Cargo.toml b/infos/Cargo.toml new file mode 100644 index 000000000..14d52a573 --- /dev/null +++ b/infos/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "infos" +version = "0.1.0" +authors = ["Clément Renault "] +edition = "2018" + +[dependencies] +anyhow = "1.0.28" +byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } +csv = "1.1.3" +heed = "0.10.5" +jemallocator = "0.3.2" +milli = { path = "../milli" } +roaring = "0.6.4" +serde_json = "1.0.59" +stderrlog = "0.5.0" +structopt = { version = "0.3.14", default-features = false } diff --git a/milli/src/subcommand/infos.rs b/infos/src/main.rs similarity index 97% rename from milli/src/subcommand/infos.rs rename to infos/src/main.rs index 51d0492ce..e874385e6 100644 --- a/milli/src/subcommand/infos.rs +++ b/infos/src/main.rs @@ -4,12 +4,16 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; -use crate::Index; use heed::EnvOpenOptions; +use milli::Index; use structopt::StructOpt; use Command::*; +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + const MAIN_DB_NAME: &str = "main"; const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; @@ -153,7 +157,18 @@ enum Command { PatchToNewExternalIds, } -pub fn run(opt: Opt) -> anyhow::Result<()> { +fn main() -> Result<(), ()> { + let opt = Opt::from_args(); + match run(opt) { + Ok(()) => Ok(()), + Err(e) => { + eprintln!("{}", e); + Err(()) + }, + } +} + +fn run(opt: Opt) -> anyhow::Result<()> { stderrlog::new() .verbosity(opt.verbose) .show_level(false) @@ -204,7 +219,7 @@ fn patch_to_new_external_ids(index: &Index, wtxn: &mut heed::RwTxn) -> anyhow::R let documents_ids = documents_ids.to_owned(); index.main.put::<_, ByteSlice, ByteSlice>( wtxn, - crate::index::HARD_EXTERNAL_DOCUMENTS_IDS_KEY.as_bytes(), + milli::index::HARD_EXTERNAL_DOCUMENTS_IDS_KEY.as_bytes(), &documents_ids, )?; index.main.delete::<_, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)?; @@ -242,7 +257,7 @@ fn facet_values_iter<'txn, DC: 'txn, T>( rtxn: &'txn heed::RoTxn, db: heed::Database, field_id: u8, - facet_type: crate::facet::FacetType, + facet_type: milli::facet::FacetType, string_fn: impl Fn(&str) -> T + 'txn, float_fn: impl Fn(u8, f64, f64) -> T + 'txn, integer_fn: impl Fn(u8, i64, i64) -> T + 'txn, @@ -250,8 +265,8 @@ fn facet_values_iter<'txn, DC: 'txn, T>( where DC: heed::BytesDecode<'txn>, { - use crate::facet::FacetType; - use crate::heed_codec::facet::{ + use milli::facet::FacetType; + use milli::heed_codec::facet::{ FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, }; @@ -504,7 +519,7 @@ fn export_words_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _}; - use crate::obkv_to_json; + use milli::obkv_to_json; let stdout = io::stdout(); let mut out = BufWriter::new(stdout); @@ -548,7 +563,7 @@ fn total_docid_word_positions_size(index: &Index, rtxn: &heed::RoTxn) -> anyhow: fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use heed::types::DecodeIgnore; - use crate::{DocumentId, BEU32StrCodec}; + use milli::{DocumentId, BEU32StrCodec}; let mut words_counts = Vec::new(); let mut count = 0; @@ -587,7 +602,7 @@ fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow:: 
fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use heed::types::DecodeIgnore; - use crate::BoRoaringBitmapCodec; + use milli::BoRoaringBitmapCodec; let mut values_length = Vec::new(); let mut count = 0; @@ -639,7 +654,7 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu use heed::types::ByteSlice; use heed::{Error, BytesDecode}; use roaring::RoaringBitmap; - use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; + use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>( db: heed::PolyDatabase, @@ -720,7 +735,7 @@ fn word_pair_proximities_docids( ) -> anyhow::Result<()> { use heed::types::ByteSlice; - use crate::RoaringBitmapCodec; + use milli::RoaringBitmapCodec; let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f92980589..7b6d3b7b9 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -7,7 +7,6 @@ edition = "2018" [dependencies] anyhow = "1.0.28" bstr = "0.2.13" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } byteorder = "1.3.4" crossbeam-channel = "0.5.0" csv = "1.1.3" @@ -33,7 +32,6 @@ serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0.59", features = ["preserve_order"] } smallstr = { version = "0.2.0", features = ["serde"] } smallvec = "1.4.0" -structopt = { version = "0.3.14", default-features = false, features = ["wrap_help"] } tempfile = "3.1.0" uuid = { version = "0.8.1", features = ["v4"] } @@ -46,7 +44,6 @@ itertools = "0.9.0" # logging log = "0.4.11" -stderrlog = "0.5.0" [dev-dependencies] criterion = "0.3.3" diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 09a66ea65..7a9afde2d 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -3,15 +3,14 @@ mod criterion; mod external_documents_ids; mod fields_ids_map; -mod index; mod mdfs; mod query_tokens; mod search; mod update_store; pub mod facet; pub mod heed_codec; +pub mod index; pub mod proximity; -pub mod subcommand; pub mod update; use std::borrow::Cow; diff --git a/milli/src/main.rs b/milli/src/main.rs deleted file mode 100644 index acc8733b3..000000000 --- a/milli/src/main.rs +++ /dev/null @@ -1,22 +0,0 @@ -use structopt::StructOpt; - -use milli::subcommand::infos::{self, Opt as InfosOpt}; -use milli::subcommand::search::{self, Opt as SearchOpt}; - -#[cfg(target_os = "linux")] -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - -#[derive(Debug, StructOpt)] -#[structopt(name = "milli", about = "The milli project.")] -enum Command { - Infos(InfosOpt), - Search(SearchOpt), -} - -fn main() -> anyhow::Result<()> { - match Command::from_args() { - Command::Infos(opt) => infos::run(opt), - Command::Search(opt) => search::run(opt), - } -} diff --git a/milli/src/subcommand/mod.rs b/milli/src/subcommand/mod.rs deleted file mode 100644 index 8e2223a5e..000000000 --- a/milli/src/subcommand/mod.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub mod infos; -pub mod search; diff --git a/search/Cargo.toml b/search/Cargo.toml new file mode 100644 index 000000000..947deb70d --- /dev/null +++ b/search/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "search" +version = "0.1.0" +authors = ["Clément Renault "] +edition = "2018" + +[dependencies] +anyhow = "1.0.28" +byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } +heed = "0.10.5" +jemallocator = 
"0.3.2" +log = "0.4.11" +milli = { path = "../milli" } +serde_json = "1.0.59" +stderrlog = "0.5.0" +structopt = { version = "0.3.14", default-features = false } diff --git a/milli/src/subcommand/search.rs b/search/src/main.rs similarity index 87% rename from milli/src/subcommand/search.rs rename to search/src/main.rs index 0a150209e..d2e727417 100644 --- a/milli/src/subcommand/search.rs +++ b/search/src/main.rs @@ -8,7 +8,11 @@ use heed::EnvOpenOptions; use log::debug; use structopt::StructOpt; -use crate::{Index, obkv_to_json}; +use milli::{Index, obkv_to_json}; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; #[derive(Debug, StructOpt)] /// A simple search helper binary for the milli project. @@ -35,7 +39,18 @@ pub struct Opt { print_facet_distribution: bool, } -pub fn run(opt: Opt) -> anyhow::Result<()> { +fn main() -> Result<(), ()> { + let opt = Opt::from_args(); + match run(opt) { + Ok(()) => Ok(()), + Err(e) => { + eprintln!("{}", e); + Err(()) + }, + } +} + +fn run(opt: Opt) -> anyhow::Result<()> { stderrlog::new() .verbosity(opt.verbose) .show_level(false) From b3776598d88cf5cac1e4922b3788efb05d3f37fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 13 Feb 2021 14:04:23 +0100 Subject: [PATCH 0460/1889] Add a test to check deletion of documents with number as primary key --- milli/src/update/delete_documents.rs | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 2b67535c9..bd134891d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -248,3 +248,39 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { Ok(self.documents_ids.len() as usize) } } + +#[cfg(test)] +mod tests { + use heed::EnvOpenOptions; + + use crate::update::{IndexDocuments, UpdateFormat}; + use super::*; + + #[test] + fn delete_documents_with_numbers_as_primary_key() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents with an id for only one of them. + let mut wtxn = index.write_txn().unwrap(); + let content = &br#"[ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + + // delete those documents, ids are synchronous therefore 0, 1, and 2. 
+ let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + builder.delete_document(0); + builder.delete_document(1); + builder.delete_document(2); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + } +} From 69acdd437e3c560620397d1363d699b33370c0b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 13 Feb 2021 13:57:53 +0100 Subject: [PATCH 0461/1889] Deserialize documents ids into JSON Values on deletion --- milli/src/update/delete_documents.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index bd134891d..932589dd7 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,6 +1,8 @@ +use anyhow::anyhow; use fst::IntoStreamer; use heed::types::ByteSlice; use roaring::RoaringBitmap; +use serde_json::Value; use crate::facet::FacetType; use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; @@ -95,7 +97,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; if let Some((_key, obkv)) = iter.next().transpose()? { if let Some(content) = obkv.get(id_field) { - let external_id: SmallString32 = serde_json::from_slice(content).unwrap(); + let external_id = match serde_json::from_slice(content).unwrap() { + Value::String(string) => SmallString32::from(string.as_str()), + Value::Number(number) => SmallString32::from(number.to_string()), + _ => return Err(anyhow!("documents ids must be either strings or numbers")), + }; external_ids.push(external_id); } iter.del_current()?; From 89ce4e74fefb8894dbc122bb67fe7da69fc2b30f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 13 Feb 2021 14:16:27 +0100 Subject: [PATCH 0462/1889] Do not change the primary key type when we serialize documents --- milli/src/update/index_documents/transform.rs | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index d53b83361..68888aad9 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -178,16 +178,10 @@ impl Transform<'_, '_> { serde_json::to_writer(&mut json_buffer, value)?; writer.insert(field_id, &json_buffer)?; } - else if field_id == primary_key_id { - // We validate the document id [a-zA-Z0-9\-_]. - let external_id = match validate_document_id(&external_id) { - Some(valid) => valid, - None => return Err(anyhow!("invalid document id: {:?}", external_id)), - }; - // We serialize the document id. - serde_json::to_writer(&mut json_buffer, &external_id)?; - writer.insert(field_id, &json_buffer)?; + // We validate the document id [a-zA-Z0-9\-_]. 
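// [Editor's sketch] `validate_document_id` is defined elsewhere in
// transform.rs; this is a plausible reconstruction of its contract given the
// comment above, not the patch's actual body:
fn validate_document_id(document_id: &str) -> Option<&str> {
    // Accept only non-empty ids made of alphanumerics, dashes and underscores.
    let valid = !document_id.is_empty() && document_id
        .chars()
        .all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'));
    if valid { Some(document_id) } else { None }
}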
+            if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
+                return Err(anyhow!("invalid document id: {:?}", external_id));
             }
         }
 
From b3a21d5a5003689fbb5e549695fe8df1b7fcb067 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 3 Feb 2021 10:30:33 +0100
Subject: [PATCH 0463/1889] Introduce the getters and setters for the words prefixes FST

---
 infos/src/main.rs                    |  1 +
 milli/src/index.rs                   | 24 +++++++++++++++++++++++-
 milli/src/update/clear_documents.rs  |  1 +
 milli/src/update/delete_documents.rs |  1 +
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/infos/src/main.rs b/infos/src/main.rs
index e874385e6..916b5ba50 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -311,6 +311,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
         env: _env,
         main,
         word_docids,
+        word_prefix_docids,
         docid_word_positions,
         word_pair_proximity_docids,
         facet_field_id_value_docids,
diff --git a/milli/src/index.rs b/milli/src/index.rs
index c0dd22986..5763f78ee 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -27,6 +27,7 @@ pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
 pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
 pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
 pub const WORDS_FST_KEY: &str = "words-fst";
+pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 
 #[derive(Clone)]
 pub struct Index {
@@ -36,6 +37,8 @@ pub struct Index {
     pub main: PolyDatabase,
     /// A word and all the documents ids containing the word.
     pub word_docids: Database<Str, RoaringBitmapCodec>,
+    /// A prefix of word and all the documents ids containing this prefix.
+    pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
     /// Maps a word and a document id (u32) to all the positions where the given word appears.
     pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
@@ -50,11 +53,12 @@ pub struct Index {
 
 impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
-        options.max_dbs(7);
+        options.max_dbs(8);
         let env = options.open(path)?;
         let main = env.create_poly_database(Some("main"))?;
         let word_docids = env.create_database(Some("word-docids"))?;
+        let word_prefix_docids = env.create_database(Some("word-prefix-docids"))?;
         let docid_word_positions = env.create_database(Some("docid-word-positions"))?;
         let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
         let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
@@ -65,6 +69,7 @@ impl Index {
             env,
             main,
             word_docids,
+            word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
             facet_field_id_value_docids,
@@ -328,6 +333,23 @@ impl Index {
         }
     }
 
+    /* words prefixes fst */
+
+    /// Writes the FST which is the words prefixes dictionary of the engine.
+    pub fn put_words_prefixes_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>
) -> heed::Result<()> {
+        self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes())
+    }
+
+    /// Returns the FST which is the words prefixes dictionary of the engine.
+    pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<fst::Set<Cow<'t, [u8]>>> {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_PREFIXES_FST_KEY)? {
+            Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
+            None => Ok(fst::Set::default().map_data(Cow::Owned)?),
+        }
+    }
+
+    /* documents */
+
     /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing.
     pub fn documents<'t>(
         &self,
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index a84596901..6f0d457b7 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -22,6 +22,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             env: _env,
             main: _main,
             word_docids,
+            word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
             facet_field_id_value_docids,
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 932589dd7..2efed359f 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -79,6 +79,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             env: _env,
             main: _main,
             word_docids,
+            word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
             facet_field_id_value_docids,

From 5e7b26791b7b8492c410fa9e938b019dd7b6585f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 3 Feb 2021 10:35:19 +0100
Subject: [PATCH 0464/1889] Take the words-prefixes into account while computing the biggest values

---
 infos/src/main.rs | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/infos/src/main.rs b/infos/src/main.rs
index 916b5ba50..305bfd0d5 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -321,6 +321,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
 
     let main_name = "main";
     let word_docids_name = "word_docids";
+    let word_prefix_docids_name = "word_prefix_docids";
     let docid_word_positions_name = "docid_word_positions";
     let word_pair_proximity_docids_name = "word_pair_proximity_docids";
     let facet_field_id_value_docids_name = "facet_field_id_value_docids";
     let documents_name = "documents";
 
     let mut heap = BinaryHeap::with_capacity(limit + 1);
 
     if limit > 0 {
+        // Fetch the words FST
         let words_fst = index.words_fst(rtxn)?;
-        heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name)));
+        let length = words_fst.as_fst().as_bytes().len();
+        heap.push(Reverse((length, format!("words-fst"), main_name)));
         if heap.len() > limit { heap.pop(); }
 
+        // Fetch the word prefix FST
+        let words_prefixes_fst = index.words_prefixes_fst(rtxn)?;
+        let length = words_prefixes_fst.as_fst().as_bytes().len();
+        heap.push(Reverse((length, format!("words-prefixes-fst"), main_name)));
         if heap.len() > limit { heap.pop(); }
 
         if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? {
             heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name)));
             if heap.len() > limit { heap.pop(); }
         }
 
         for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
             let (word, value) = result?;
             heap.push(Reverse((value.len(), word.to_string(), word_docids_name)));
             if heap.len() > limit { heap.pop(); }
         }
 
+        for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)?
{ + let (word, value) = result?; + heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); + if heap.len() > limit { heap.pop(); } + } + for result in docid_word_positions.remap_data_type::().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); From ee5a60e1c5f3b4f438485a27885932e8870fec7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 3 Feb 2021 10:36:07 +0100 Subject: [PATCH 0465/1889] Clear the words prefixes when clearing an index --- milli/src/update/clear_documents.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6f0d457b7..d20263d38 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -36,6 +36,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; + self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; @@ -46,6 +47,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // Clear the other databases. word_docids.clear(self.wtxn)?; + word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; From f365de636fe6dbeeee74ecf1ae4dad6b6149f5b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 11:12:38 +0100 Subject: [PATCH 0466/1889] Compute and write the word-prefix-docids database --- milli/src/lib.rs | 2 +- milli/src/update/delete_documents.rs | 3 + milli/src/update/facets.rs | 3 +- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/mod.rs | 4 +- milli/src/update/words_prefixes.rs | 161 ++++++++++++++++++++++++ 6 files changed, 171 insertions(+), 4 deletions(-) create mode 100644 milli/src/update/words_prefixes.rs diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 7a9afde2d..66d134f4e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -33,8 +33,8 @@ pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; -pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; +pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 2efed359f..1e0064f22 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -158,6 +158,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } + // FIXME we must recompute the words prefixes docids. + todo!("recompute words prefixes docids"); + // We construct an FST set that contains the words to delete from the words FST. 
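// [Editor's aside] The words FST surgery performed below, and again in the
// later cleanup patches, is an fst set difference; here is a minimal
// self-contained sketch of the technique with toy data (fst crate):
use fst::{IntoStreamer, Set, SetBuilder};

fn fst_difference_example() -> Result<(), Box<dyn std::error::Error>> {
    // `from_iter` requires lexicographically sorted input.
    let words = Set::from_iter(vec!["hell", "hello", "help"])?;
    let to_delete = Set::from_iter(vec!["hello"])?;

    // Stream every word that is not in the to-delete set into a new set.
    let mut builder = SetBuilder::memory();
    builder.extend_stream(words.op().add(&to_delete).difference().into_stream())?;
    let new_words = builder.into_set();

    assert_eq!(new_words.len(), 2); // "hell" and "help" remain
    Ok(())
}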
let words_to_delete = words.iter().filter_map(|(word, must_remove)| { if *must_remove { Some(word.as_ref()) } else { None } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 522a4d350..bac5f3c86 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -32,7 +32,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> Facets<'t, 'u, 'i> { + ) -> Facets<'t, 'u, 'i> + { Facets { wtxn, index, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e38c640a0..d53173b71 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -19,7 +19,7 @@ use serde::{Serialize, Deserialize}; use crate::index::Index; use crate::update::{Facets, UpdateIndexingStep}; use self::store::{Store, Readers}; -use self::merge_function::{ +pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, field_id_docid_facet_values_merge, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 2cd532c83..fcdcb33e9 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,12 +6,14 @@ mod index_documents; mod settings; mod update_builder; mod update_step; +mod words_prefixes; pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; -pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat, DocumentAdditionResult}; pub use self::facets::Facets; +pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat, DocumentAdditionResult}; pub use self::settings::Settings; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; +pub use self::words_prefixes::WordsPrefixes; diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs new file mode 100644 index 000000000..bb8d3a6f8 --- /dev/null +++ b/milli/src/update/words_prefixes.rs @@ -0,0 +1,161 @@ +use std::iter::FromIterator; +use std::str; + +use fst::Streamer; +use grenad::CompressionType; +use heed::types::ByteSlice; + +use crate::{Index, SmallString32}; +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_sorter, create_writer, writer_into_reader}; +use crate::update::index_documents::{word_docids_merge, write_into_lmdb_database}; + +pub struct WordsPrefixes<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, + threshold: f64, + max_prefix_length: usize, + _update_id: u64, +} + +impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> WordsPrefixes<'t, 'u, 'i> + { + WordsPrefixes { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + threshold: 0.01, // 1% + max_prefix_length: 4, + _update_id: update_id, + } + } + + /// Set the ratio of concerned words required to make a prefix be part of the words prefixes + /// database. 
If a word prefix is supposed to match more than this number of words in the
+    /// dictionary, this prefix is added to the words prefixes datastructures.
+    ///
+    /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
+    /// to these bounds otherwise.
+    pub fn threshold(&mut self, value: f64) -> &mut Self {
+        self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
+        self
+    }
+
+    /// Set the maximum length of prefixes in bytes.
+    ///
+    /// Default value is `4` bytes. This value must be between 1 and 25 and will be clamped
+    /// to these bounds otherwise.
+    pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
+        self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
+        self
+    }
+
+    pub fn execute(self) -> anyhow::Result<()> {
+        // Clear the words prefixes datastructures.
+        self.index.word_prefix_docids.clear(self.wtxn)?;
+
+        let words_fst = self.index.words_fst(&self.wtxn)?;
+        let number_of_words = words_fst.len();
+        let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
+
+        // It is forbidden to keep a mutable reference into the database
+        // and write into it at the same time, therefore we write into another file.
+        let mut docids_sorter = create_sorter(
+            word_docids_merge,
+            self.chunk_compression_type,
+            self.chunk_compression_level,
+            self.chunk_fusing_shrink_size,
+            self.max_nb_chunks,
+            self.max_memory,
+        );
+
+        let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
+        for n in 1..=self.max_prefix_length {
+
+            let mut current_prefix = SmallString32::new();
+            let mut current_prefix_count = 0;
+            let mut builder = fst::SetBuilder::memory();
+
+            let mut stream = words_fst.stream();
+            while let Some(bytes) = stream.next() {
+                // We try to get the first n bytes out of this string but we only want
+                // to split at valid character bounds. If we try to split in the middle of
+                // a character we ignore this word and go to the next one.
+                let word = str::from_utf8(bytes)?;
+                let prefix = match word.get(..n) {
+                    Some(prefix) => prefix,
+                    None => continue,
+                };
+
+                // This is the first iteration of the loop,
+                // or the current word doesn't start with the current prefix.
+                if current_prefix_count == 0 || prefix != current_prefix.as_str() {
+                    current_prefix = SmallString32::from(prefix);
+                    current_prefix_count = 0;
+                }
+
+                current_prefix_count += 1;
+
+                // There are enough words corresponding to this prefix to add it to the cache.
+                if current_prefix_count == min_number_of_words {
+                    builder.insert(prefix)?;
+                }
+            }
+
+            // We construct the final set for prefixes of size n.
+            prefix_fsts.push(builder.into_set());
+        }
+
+        // We merge all of the previously computed prefixes into one final set.
+        let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
+        let mut builder = fst::SetBuilder::memory();
+        builder.extend_stream(op.r#union())?;
+        let prefix_fst = builder.into_set();
+
+        // We iterate over all the prefixes and retrieve the corresponding docids.
+        let mut prefix_stream = prefix_fst.stream();
+        while let Some(bytes) = prefix_stream.next() {
+            let prefix = str::from_utf8(bytes)?;
+            let db = self.index.word_docids.remap_data_type::<ByteSlice>();
+            for result in db.prefix_iter(self.wtxn, prefix)? {
+                let (_word, data) = result?;
+                docids_sorter.insert(prefix, data)?;
+            }
+        }
+
+        // Set the words prefixes FST in the database.
+        self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
+
+        // We write the sorter into a reader to be able to read it back.
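// [Editor's aside] A quick worked example of the threshold arithmetic used by
// `execute` above, with the default 1% ratio:
fn threshold_example() {
    let number_of_words = 2_000_000;
    let threshold = 0.01;
    let min_number_of_words = (number_of_words as f64 * threshold) as usize;
    // A prefix must cover at least 20_000 of the 2_000_000 dictionary words
    // before it is inserted into the prefix FST.
    assert_eq!(min_number_of_words, 20_000);
}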
+ let mut docids_writer = tempfile::tempfile().and_then(|file| { + create_writer(self.chunk_compression_type, self.chunk_compression_level, file) + })?; + docids_sorter.write_into(&mut docids_writer)?; + let docids_reader = writer_into_reader(docids_writer, self.chunk_fusing_shrink_size)?; + + // We finally write the word prefix docids into the LMDB database. + write_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_docids.as_polymorph(), + docids_reader, + word_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} From 9b03b0a1b2e63c2ea434fec26b8741c9a800043c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 10:28:15 +0100 Subject: [PATCH 0467/1889] Introduce the word prefix pair proximity docids database --- infos/src/main.rs | 10 ++++++++++ milli/src/index.rs | 6 +++++- milli/src/update/clear_documents.rs | 2 ++ milli/src/update/delete_documents.rs | 2 ++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 305bfd0d5..fcfab8bc5 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -18,6 +18,7 @@ const MAIN_DB_NAME: &str = "main"; const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; +const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const DOCUMENTS_DB_NAME: &str = "documents"; const USERS_IDS_DOCUMENTS_IDS: &[u8] = b"users-ids-documents-ids"; @@ -314,6 +315,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + word_prefix_pair_proximity_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, @@ -323,6 +325,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_docids_name = "word_docids"; let word_prefix_docids_name = "word_prefix_docids"; let docid_word_positions_name = "docid_word_positions"; + let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -373,6 +376,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_prefix_pair_proximity_docids.remap_data_type::().iter(rtxn)? { + let ((word, prefix, prox), value) = result?; + let key = format!("{} {} {}", word, prefix, prox); + heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { diff --git a/milli/src/index.rs b/milli/src/index.rs index 5763f78ee..12ad86b22 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -43,6 +43,8 @@ pub struct Index { pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, + /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. + pub word_prefix_pair_proximity_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. 
pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. @@ -53,7 +55,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(8); + options.max_dbs(9); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -61,6 +63,7 @@ impl Index { let word_prefix_docids = env.create_database(Some("word-prefix-docids"))?; let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; + let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -72,6 +75,7 @@ impl Index { word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + word_prefix_pair_proximity_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d20263d38..1523a95b2 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + word_prefix_pair_proximity_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -50,6 +51,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; + word_prefix_pair_proximity_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 1e0064f22..27686960d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -82,6 +82,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + word_prefix_pair_proximity_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -160,6 +161,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // FIXME we must recompute the words prefixes docids. todo!("recompute words prefixes docids"); + todo!("recompute words prefixes pairs proximity docids"); // We construct an FST set that contains the words to delete from the words FST. 
let words_to_delete = words.iter().filter_map(|(word, must_remove)| { From b5b89990ebc5367748e5072ad640e984db55ee5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 11:04:27 +0100 Subject: [PATCH 0468/1889] Compute and write the word prefix pair proximities database --- milli/src/update/words_prefixes.rs | 74 ++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs index bb8d3a6f8..b020ed28b 100644 --- a/milli/src/update/words_prefixes.rs +++ b/milli/src/update/words_prefixes.rs @@ -1,14 +1,17 @@ use std::iter::FromIterator; use std::str; -use fst::Streamer; +use fst::automaton::Str; +use fst::{Automaton, Streamer, IntoStreamer}; use grenad::CompressionType; +use heed::BytesEncode; use heed::types::ByteSlice; -use crate::{Index, SmallString32}; +use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_sorter, create_writer, writer_into_reader}; -use crate::update::index_documents::{word_docids_merge, write_into_lmdb_database}; +use crate::update::index_documents::{create_sorter, create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge}; +use crate::{Index, SmallString32}; pub struct WordsPrefixes<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -67,6 +70,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { // Clear the words prefixes datastructures. self.index.word_prefix_docids.clear(self.wtxn)?; + self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; let words_fst = self.index.words_fst(&self.wtxn)?; let number_of_words = words_fst.len(); @@ -74,7 +78,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. - let mut docids_sorter = create_sorter( + let mut prefix_docids_sorter = create_sorter( word_docids_merge, self.chunk_compression_type, self.chunk_compression_level, @@ -133,7 +137,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { let db = self.index.word_docids.remap_data_type::(); for result in db.prefix_iter(self.wtxn, prefix)? { let (_word, data) = result?; - docids_sorter.insert(prefix, data)?; + prefix_docids_sorter.insert(prefix, data)?; } } @@ -141,21 +145,71 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; // We write the sorter into a reader to be able to read it back. - let mut docids_writer = tempfile::tempfile().and_then(|file| { + let mut prefix_docids_writer = tempfile::tempfile().and_then(|file| { create_writer(self.chunk_compression_type, self.chunk_compression_level, file) })?; - docids_sorter.write_into(&mut docids_writer)?; - let docids_reader = writer_into_reader(docids_writer, self.chunk_fusing_shrink_size)?; + prefix_docids_sorter.write_into(&mut prefix_docids_writer)?; + let prefix_docids_reader = writer_into_reader( + prefix_docids_writer, + self.chunk_fusing_shrink_size, + )?; // We finally write the word prefix docids into the LMDB database. write_into_lmdb_database( self.wtxn, *self.index.word_prefix_docids.as_polymorph(), - docids_reader, + prefix_docids_reader, word_docids_merge, WriteMethod::Append, )?; + // We compute the word prefix pair proximity database. 
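// [Editor's aside] The loop below relies on the fst automaton machinery; in
// its canonical form, a `Str` automaton combined with `starts_with` selects
// the keys of a set sharing a given prefix (sketch with toy data):
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Set};

fn prefix_search_example() -> Result<(), Box<dyn std::error::Error>> {
    let set = Set::from_iter(vec!["hell", "hello", "help", "sell"])?;

    // Match every key of the set that starts with "hel".
    let matches = set.search(Str::new("hel").starts_with()).into_stream().into_strs()?;

    assert_eq!(matches, vec!["hell", "hello", "help"]);
    Ok(())
}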
+ + // Here we create a sorter akin to the previous one. + let mut word_prefix_pair_proximity_docids_sorter = create_sorter( + words_pairs_proximities_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert all the word pairs corresponding to the word-prefix pairs + // where the prefixes appears in the prefix FST previously constructed. + let db = self.index.word_pair_proximity_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? { + let ((word1, word2, prox), data) = result?; + let automaton = Str::new(word2).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let pair = (word1, prefix, prox); + let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); + word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; + } + } + + // FIXME we should create a sorter_into_lmdb_database function + // We write the sorter into a reader to be able to read it back. + let mut word_prefix_pair_prox_docids_writer = tempfile::tempfile().and_then(|file| { + create_writer(self.chunk_compression_type, self.chunk_compression_level, file) + })?; + word_prefix_pair_proximity_docids_sorter.write_into(&mut word_prefix_pair_prox_docids_writer)?; + let word_prefix_pair_docids_reader = writer_into_reader( + word_prefix_pair_prox_docids_writer, + self.chunk_fusing_shrink_size, + )?; + + // We finally write the word prefix pair proximity docids into the LMDB database. + write_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + word_prefix_pair_docids_reader, + words_pairs_proximities_docids_merge, + WriteMethod::Append, + )?; + Ok(()) } } From 62eee9c69e4b5fdda6a8626119f489e0caf75933 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 11:09:42 +0100 Subject: [PATCH 0469/1889] Introduce the sorter_into_lmdb_database helper function --- milli/src/update/index_documents/mod.rs | 104 +++++++++++++++++------- milli/src/update/words_prefixes.rs | 31 ++----- 2 files changed, 78 insertions(+), 57 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d53173b71..b6fde7ef4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -8,7 +8,7 @@ use std::time::Instant; use anyhow::Context; use bstr::ByteSlice as _; -use grenad::{Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; +use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; use heed::types::ByteSlice; use log::{debug, info, error}; use memmap::Mmap; @@ -102,39 +102,19 @@ pub fn merge_into_lmdb_database( sources: Vec>, merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> { +) -> anyhow::Result<()> +{ debug!("Merging {} MTBL stores...", sources.len()); let before = Instant::now(); let merger = merge_readers(sources, merge); - let mut in_iter = merger.into_merge_iter()?; - - match method { - WriteMethod::Append => { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = in_iter.next()? { - out_iter.append(k, v).with_context(|| { - format!("writing {:?} into LMDB", k.as_bstr()) - })?; - } - }, - WriteMethod::GetMergePut => { - while let Some((k, v)) = in_iter.next()? 
{
-                let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
-                match iter.next().transpose()? {
-                    Some((key, old_val)) if key == k => {
-                        let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
-                        let val = merge(k, &vals).expect("merge failed");
-                        iter.put_current(k, &val)?;
-                    },
-                    _ => {
-                        drop(iter);
-                        database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
-                    },
-                }
-            }
-        },
-    }
+    merger_iter_into_lmdb_database(
+        wtxn,
+        database,
+        merger.into_merge_iter()?,
+        merge,
+        method,
+    )?;
 
     debug!("MTBL stores merged in {:.02?}!", before.elapsed());
     Ok(())
@@ -146,7 +126,8 @@ pub fn write_into_lmdb_database(
     mut reader: Reader<FileFuse>,
     merge: MergeFn,
     method: WriteMethod,
-) -> anyhow::Result<()> {
+) -> anyhow::Result<()>
+{
     debug!("Writing MTBL stores...");
     let before = Instant::now();
 
@@ -181,6 +162,67 @@ pub fn write_into_lmdb_database(
     Ok(())
 }
 
+pub fn sorter_into_lmdb_database(
+    wtxn: &mut heed::RwTxn,
+    database: heed::PolyDatabase,
+    sorter: Sorter<MergeFn>,
+    merge: MergeFn,
+    method: WriteMethod,
+) -> anyhow::Result<()>
+{
+    debug!("Writing MTBL sorter...");
+    let before = Instant::now();
+
+    merger_iter_into_lmdb_database(
+        wtxn,
+        database,
+        sorter.into_iter()?,
+        merge,
+        method,
+    )?;
+
+    debug!("MTBL sorter written in {:.02?}!", before.elapsed());
+    Ok(())
+}
+
+fn merger_iter_into_lmdb_database<R: io::Read>(
+    wtxn: &mut heed::RwTxn,
+    database: heed::PolyDatabase,
+    mut sorter: MergerIter<R, MergeFn>,
+    merge: MergeFn,
+    method: WriteMethod,
+) -> anyhow::Result<()>
+{
+    match method {
+        WriteMethod::Append => {
+            let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
+            while let Some((k, v)) = sorter.next()? {
+                out_iter.append(k, v).with_context(|| {
+                    format!("writing {:?} into LMDB", k.as_bstr())
+                })?;
+            }
+        },
+        WriteMethod::GetMergePut => {
+            while let Some((k, v)) = sorter.next()? {
+                let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
+                match iter.next().transpose()? {
+                    Some((key, old_val)) if key == k => {
+                        let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
+                        let val = merge(k, &vals).expect("merge failed");
+                        iter.put_current(k, &val)?;
+                    },
+                    _ => {
+                        drop(iter);
+                        database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
+                    },
+                }
+            }
+        },
+    }
+
+    Ok(())
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
 #[non_exhaustive]
 pub enum IndexDocumentsMethod {
diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs
index b020ed28b..f7c898c89 100644
--- a/milli/src/update/words_prefixes.rs
+++ b/milli/src/update/words_prefixes.rs
@@ -9,7 +9,7 @@ use heed::types::ByteSlice;
 
 use crate::heed_codec::StrStrU8Codec;
 use crate::update::index_documents::WriteMethod;
-use crate::update::index_documents::{create_sorter, create_writer, writer_into_reader, write_into_lmdb_database};
+use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database};
 use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge};
 use crate::{Index, SmallString32};
 
@@ -144,21 +144,11 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
         // Set the words prefixes FST in the database.
-        // We write the sorter into a reader to be able to read it back.
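// [Editor's aside] Stripped of the LMDB types, the `WriteMethod::GetMergePut`
// branch introduced above is an upsert-with-merge; a minimal sketch where the
// "merge" is a plain concatenation:
use std::collections::HashMap;

fn get_merge_put(store: &mut HashMap<Vec<u8>, Vec<u8>>, key: Vec<u8>, value: Vec<u8>) {
    store
        .entry(key)
        .and_modify(|old| old.extend_from_slice(&value)) // merge old and new values
        .or_insert(value);
}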
-        let mut prefix_docids_writer = tempfile::tempfile().and_then(|file| {
-            create_writer(self.chunk_compression_type, self.chunk_compression_level, file)
-        })?;
-        prefix_docids_sorter.write_into(&mut prefix_docids_writer)?;
-        let prefix_docids_reader = writer_into_reader(
-            prefix_docids_writer,
-            self.chunk_fusing_shrink_size,
-        )?;
-
         // We finally write the word prefix docids into the LMDB database.
-        write_into_lmdb_database(
+        sorter_into_lmdb_database(
             self.wtxn,
             *self.index.word_prefix_docids.as_polymorph(),
-            prefix_docids_reader,
+            prefix_docids_sorter,
             word_docids_merge,
             WriteMethod::Append,
         )?;
@@ -190,22 +180,11 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
             }
         }
 
-        // FIXME we should create a sorter_into_lmdb_database function
-        // We write the sorter into a reader to be able to read it back.
-        let mut word_prefix_pair_prox_docids_writer = tempfile::tempfile().and_then(|file| {
-            create_writer(self.chunk_compression_type, self.chunk_compression_level, file)
-        })?;
-        word_prefix_pair_proximity_docids_sorter.write_into(&mut word_prefix_pair_prox_docids_writer)?;
-        let word_prefix_pair_docids_reader = writer_into_reader(
-            word_prefix_pair_prox_docids_writer,
-            self.chunk_fusing_shrink_size,
-        )?;
-
         // We finally write the word prefix pair proximity docids into the LMDB database.
-        write_into_lmdb_database(
+        sorter_into_lmdb_database(
             self.wtxn,
             *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
-            word_prefix_pair_docids_reader,
+            word_prefix_pair_proximity_docids_sorter,
             words_pairs_proximities_docids_merge,
             WriteMethod::Append,
         )?;

From ea37fd821d646c9bec5378b2c3e486dcb21400b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 17 Feb 2021 11:22:25 +0100
Subject: [PATCH 0470/1889] Clean up the words prefixes when deleting documents and words

---
 milli/src/update/delete_documents.rs | 45 +++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 27686960d..754f320a5 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -159,10 +159,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         }
     }
 
-    // FIXME we must recompute the words prefixes docids.
-    todo!("recompute words prefixes docids");
-    todo!("recompute words prefixes pairs proximity docids");
-
     // We construct an FST set that contains the words to delete from the words FST.
     let words_to_delete = words.iter().filter_map(|(word, must_remove)| {
         if *must_remove { Some(word.as_ref()) } else { None }
@@ -185,6 +181,47 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         // We write the new words FST into the main database.
         self.index.put_words_fst(self.wtxn, &new_words_fst)?;
 
+        // We iterate over the word prefix docids database and remove the deleted documents ids
+        // from every docids list. We register the empty prefixes in an fst Set for future deletion.
+        let mut prefixes_to_delete = fst::SetBuilder::memory();
+        let mut iter = word_prefix_docids.iter_mut(self.wtxn)?;
+        while let Some(result) = iter.next() {
+            let (prefix, mut docids) = result?;
+            let previous_len = docids.len();
+            docids.difference_with(&self.documents_ids);
+            if docids.is_empty() {
+                iter.del_current()?;
+                prefixes_to_delete.insert(prefix)?;
+            } else if docids.len() != previous_len {
+                iter.put_current(prefix, &docids)?;
+            }
+        }
+
+        drop(iter);
+
+        // We compute the new prefix FST and write it only if there is a change.
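// [Editor's aside] The cleanup below repeats one bitmap pattern: subtract the
// deleted documents ids from a postings list and detect emptiness. In
// isolation (roaring crate, toy ids):
use roaring::RoaringBitmap;

fn bitmap_cleanup_example() {
    let mut docids: RoaringBitmap = (0..100).collect();
    let deleted_documents: RoaringBitmap = (50..150).collect();

    let previous_len = docids.len();
    docids.difference_with(&deleted_documents); // keep only the living ids

    assert_eq!(docids.len(), 50);
    assert_ne!(docids.len(), previous_len); // the entry must be rewritten
}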
+ let prefixes_to_delete = prefixes_to_delete.into_set();
+ if !prefixes_to_delete.is_empty() {
+ let new_words_prefixes_fst = {
+ // We retrieve the current words prefixes FST from the database.
+ let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?;
+ let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference();
+
+ // We stream the new prefixes, which no longer contain the prefixes to delete.
+ let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory();
+ new_words_prefixes_fst_builder.extend_stream(difference.into_stream())?;
+
+ // We create a words FST set from the above builder.
+ new_words_prefixes_fst_builder.into_set()
+ };
+
+ // We write the new words prefixes FST into the main database.
+ self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?;
+ }
+
+ // FIXME we must recompute the words prefixes docids.
+ todo!("recompute words prefixes pairs proximity docids");
+
 // We delete the documents ids that are under the pairs of words,
 // it is faster and uses no memory to iterate over all the words pairs than
 // to compute the cartesian product of every word of the deleted documents.
From 616ed8f73c4d64f3d276c56d268b1ee3d9f47f30 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 10 Feb 2021 10:35:25 +0100
Subject: [PATCH 0471/1889] Clean up the word prefix pair proximities when deleting documents

---
 milli/src/update/delete_documents.rs | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 754f320a5..5430bb3af 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -219,8 +219,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?;
 }

- // FIXME we must recompute the words prefixes docids.
- todo!("recompute words prefixes pairs proximity docids");
+ // We delete the documents ids from the word prefix pair proximity database docids
+ // and remove the empty pairs too.
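+ // The pass below mirrors the word prefix docids cleanup above: remove the
+ // deleted documents ids from every bitmap, delete the entries that become
+ // empty, and only rewrite the bitmaps that actually changed.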
+ let db = word_prefix_pair_proximity_docids.remap_key_type::<ByteSlice>();
+ let mut iter = db.iter_mut(self.wtxn)?;
+ while let Some(result) = iter.next() {
+ let (key, mut docids) = result?;
+ let previous_len = docids.len();
+ docids.difference_with(&self.documents_ids);
+ if docids.is_empty() {
+ iter.del_current()?;
+ } else if docids.len() != previous_len {
+ iter.put_current(key, &docids)?;
+ }
+ }
+
+ drop(iter);

 // We delete the documents ids that are under the pairs of words,
From 87884859247da16493888f7793e2291f341786e6 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 10 Feb 2021 11:20:00 +0100
Subject: [PATCH 0472/1889] Take the prefix databases into account in the infos subcommand

---
 infos/src/main.rs | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/infos/src/main.rs b/infos/src/main.rs
index fcfab8bc5..e88188217 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -16,6 +16,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

 const MAIN_DB_NAME: &str = "main";
 const WORD_DOCIDS_DB_NAME: &str = "word-docids";
+const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids";
 const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions";
 const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
 const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
@@ -25,15 +26,19 @@ const USERS_IDS_DOCUMENTS_IDS: &[u8] = b"users-ids-documents-ids";
 const ALL_DATABASE_NAMES: &[&str] = &[
 MAIN_DB_NAME,
 WORD_DOCIDS_DB_NAME,
+ WORD_PREFIX_DOCIDS_DB_NAME,
 DOCID_WORD_POSITIONS_DB_NAME,
 WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME,
+ WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
 DOCUMENTS_DB_NAME,
 ];

 const POSTINGS_DATABASE_NAMES: &[&str] = &[
 WORD_DOCIDS_DB_NAME,
+ WORD_PREFIX_DOCIDS_DB_NAME,
 DOCID_WORD_POSITIONS_DB_NAME,
 WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME,
+ WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
 ];

 #[derive(Debug, StructOpt)]
@@ -653,9 +658,11 @@ fn size_of_database(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Re

 let database = match name {
 MAIN_DB_NAME => &index.main,
+ WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(),
 WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(),
 DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(),
 WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(),
+ WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(),
 DOCUMENTS_DB_NAME => index.documents.as_polymorph(),
 unknown => anyhow::bail!("unknown database {:?}", unknown),
 };
@@ -718,7 +725,7 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
 let sum = values_length.iter().map(|l| *l as u64).sum::<u64>();

 println!("The {} database stats on the lengths", name);
- println!("\tnumber of proximity pairs: {}", count);
+ println!("\tnumber of entries: {}", count);
 println!("\t25th percentile (first quartile): {}", twenty_five_percentile);
 println!("\t50th percentile (median): {}", fifty_percentile);
 println!("\t75th percentile (third quartile): {}", seventy_five_percentile);
@@ -740,6 +747,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
 let db = index.word_docids.as_polymorph();
 compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
 },
+ WORD_PREFIX_DOCIDS_DB_NAME => {
+ let db = index.word_prefix_docids.as_polymorph();
+ compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
+ },
 DOCID_WORD_POSITIONS_DB_NAME => {
 let db = index.docid_word_positions.as_polymorph();
 compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name)
@@ -748,6 +759,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu
 let db = index.word_pair_proximity_docids.as_polymorph();
 compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
 },
+ WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => {
+ let db = index.word_prefix_pair_proximity_docids.as_polymorph();
+ compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
+ },
 unknown => anyhow::bail!("unknown database {:?}", unknown),
 }
 }
From a4a48be923f454dfee45b2f921c026d1c5fadb0d Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 10 Feb 2021 11:53:13 +0100
Subject: [PATCH 0473/1889] Run the words prefixes update inside of the indexing documents update

---
 milli/src/update/index_documents/mod.rs | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index b6fde7ef4..d55f421dc 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -17,7 +17,7 @@ use rayon::prelude::*;
 use serde::{Serialize, Deserialize};

 use crate::index::Index;
-use crate::update::{Facets, UpdateIndexingStep};
+use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep};
 use self::store::{Store, Readers};
 pub use self::merge_function::{
 main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
@@ -259,6 +259,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
 pub(crate) thread_pool: Option<&'a ThreadPool>,
 facet_level_group_size: Option<NonZeroUsize>,
 facet_min_level_size: Option<NonZeroUsize>,
+ words_prefix_threshold: Option<f64>,
+ max_prefix_length: Option<usize>,
 update_method: IndexDocumentsMethod,
 update_format: UpdateFormat,
 autogenerate_docids: bool,
@@ -284,6 +286,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
 thread_pool: None,
 facet_level_group_size: None,
 facet_min_level_size: None,
+ words_prefix_threshold: None,
+ max_prefix_length: None,
 update_method: IndexDocumentsMethod::ReplaceDocuments,
 update_format: UpdateFormat::Json,
 autogenerate_docids: true,
@@ -667,6 +671,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
 });
 }

+ // Run the facets update operation.
 let mut builder = Facets::new(self.wtxn, self.index, self.update_id);
 builder.chunk_compression_type = self.chunk_compression_type;
 builder.chunk_compression_level = self.chunk_compression_level;
@@ -679,6 +684,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
 }
 builder.execute()?;

+ // Run the words prefixes update operation.
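+ // The prefixes update reuses the indexer chunk compression settings; the
+ // threshold and maximum prefix length are only overridden when they were
+ // explicitly configured on this update.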
+ let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.words_prefix_threshold { + builder.threshold(value); + } + if let Some(value) = self.max_prefix_length { + builder.max_prefix_length(value); + } + builder.execute()?; + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); From 7a0f86a04fb62f23d2821f0e06118ad65b3ef5ce Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 12:18:56 +0100 Subject: [PATCH 0474/1889] Introduce an infos command to extract the words prefixes fst --- infos/src/main.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index e88188217..ef23bf4ff 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -153,6 +153,12 @@ enum Command { /// you can install it using `cargo install fst-bin`. ExportWordsFst, + /// Outputs the words prefix FST to standard output. + /// + /// One can use the FST binary helper to dissect and analyze it, + /// you can install it using `cargo install fst-bin`. + ExportWordsPrefixFst, + /// Outputs the documents as JSON lines to the standard output. /// /// All of the fields are extracted, not just the displayed ones. @@ -207,6 +213,7 @@ fn run(opt: Opt) -> anyhow::Result<()> { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) }, ExportWordsFst => export_words_fst(&index, &rtxn), + ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportDocuments => export_documents(&index, &rtxn), PatchToNewExternalIds => { drop(rtxn); @@ -548,6 +555,16 @@ fn export_words_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { Ok(()) } +fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { + use std::io::Write as _; + + let mut stdout = io::stdout(); + let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; + stdout.write_all(words_prefixes_fst.as_fst().as_bytes())?; + + Ok(()) +} + fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _}; use milli::obkv_to_json; From 49aee6d02cb1ffb4d9df79af5b0ae9ebe82af55a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Feb 2021 12:19:10 +0100 Subject: [PATCH 0475/1889] Fix the database-stats infos subcommand --- infos/src/main.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index ef23bf4ff..e33c2820f 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -725,17 +725,14 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu } values_length.sort_unstable(); + let len = values_length.len(); - let median = values_length.len() / 2; - let quartile = values_length.len() / 4; - let percentile = values_length.len() / 100; - - let twenty_five_percentile = values_length.get(quartile).unwrap_or(&0); - let fifty_percentile = values_length.get(median).unwrap_or(&0); - let seventy_five_percentile = values_length.get(quartile * 3).unwrap_or(&0); - let ninety_percentile = values_length.get(percentile * 90).unwrap_or(&0); - let ninety_five_percentile = values_length.get(percentile * 95).unwrap_or(&0); - let ninety_nine_percentile = values_length.get(percentile * 99).unwrap_or(&0); + let 
twenty_five_percentile = values_length.get(len / 4).unwrap_or(&0);
+ let fifty_percentile = values_length.get(len / 2).unwrap_or(&0);
+ let seventy_five_percentile = values_length.get(len * 3 / 4).unwrap_or(&0);
+ let ninety_percentile = values_length.get(len * 90 / 100).unwrap_or(&0);
+ let ninety_five_percentile = values_length.get(len * 95 / 100).unwrap_or(&0);
+ let ninety_nine_percentile = values_length.get(len * 99 / 100).unwrap_or(&0);
 let minimum = values_length.first().unwrap_or(&0);
 let maximum = values_length.last().unwrap_or(&0);
 let count = values_length.len();
From aa4d9882d298eafa24c462b8395fd3869250089c Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 10 Feb 2021 12:28:46 +0100
Subject: [PATCH 0476/1889] Introduce the new words-prefixes-docids infos subcommand

---
 infos/src/main.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/infos/src/main.rs b/infos/src/main.rs
index e33c2820f..3f41b7d42 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -91,6 +91,16 @@ enum Command {
 words: Vec<String>,
 },

+ /// Outputs a CSV with the documents ids where the given words prefixes appear.
+ WordsPrefixesDocids {
+ /// Display the whole documents ids in details.
+ #[structopt(long)]
+ full_display: bool,
+
+ /// The prefixes to display the documents ids of.
+ prefixes: Vec<String>,
+ },
+
 /// Outputs a CSV with the documents ids along with the facet values where it appears.
 FacetValuesDocids {
 /// Display the whole documents ids in details.
@@ -198,6 +208,9 @@ fn run(opt: Opt) -> anyhow::Result<()> {
 MostCommonWords { limit } => most_common_words(&index, &rtxn, limit),
 BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit),
 WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words),
+ WordsPrefixesDocids { full_display, prefixes } => {
+ words_prefixes_docids(&index, &rtxn, !full_display, prefixes)
+ },
 FacetValuesDocids { full_display, field_name } => {
 facet_values_docids(&index, &rtxn, !full_display, field_name)
 },
@@ -464,6 +477,43 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>
 Ok(wtr.flush()?)
 }

+fn words_prefixes_docids(
+ index: &Index,
+ rtxn: &heed::RoTxn,
+ debug: bool,
+ prefixes: Vec<String>,
+) -> anyhow::Result<()>
+{
+ let stdout = io::stdout();
+ let mut wtr = csv::Writer::from_writer(stdout.lock());
+ wtr.write_record(&["prefix", "documents_ids"])?;
+
+ if prefixes.is_empty() {
+ for result in index.word_prefix_docids.iter(rtxn)? {
+ let (prefix, docids) = result?;
+ let docids = if debug {
+ format!("{:?}", docids)
+ } else {
+ format!("{:?}", docids.iter().collect::<Vec<_>>())
+ };
+ wtr.write_record(&[prefix, &docids])?;
+ }
+ } else {
+ for prefix in prefixes {
+ if let Some(docids) = index.word_prefix_docids.get(rtxn, &prefix)? {
+ let docids = if debug {
+ format!("{:?}", docids)
+ } else {
+ format!("{:?}", docids.iter().collect::<Vec<_>>())
+ };
+ wtr.write_record(&[prefix, docids])?;
+ }
+ }
+ }
+
+ Ok(wtr.flush()?)
+} + fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; From fcfb39c5de6ce70d59acb2ef28fdd1712136dacf Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 13:56:28 +0100 Subject: [PATCH 0477/1889] Move the RoaringBitmap related codecs into a module --- milli/src/heed_codec/mod.rs | 8 ++------ .../{ => roaring_bitmap}/bo_roaring_bitmap_codec.rs | 0 .../{ => roaring_bitmap}/cbo_roaring_bitmap_codec.rs | 0 milli/src/heed_codec/roaring_bitmap/mod.rs | 7 +++++++ .../{ => roaring_bitmap}/roaring_bitmap_codec.rs | 0 5 files changed, 9 insertions(+), 6 deletions(-) rename milli/src/heed_codec/{ => roaring_bitmap}/bo_roaring_bitmap_codec.rs (100%) rename milli/src/heed_codec/{ => roaring_bitmap}/cbo_roaring_bitmap_codec.rs (100%) create mode 100644 milli/src/heed_codec/roaring_bitmap/mod.rs rename milli/src/heed_codec/{ => roaring_bitmap}/roaring_bitmap_codec.rs (100%) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index e7b8cf256..ff8285357 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -1,14 +1,10 @@ mod beu32_str_codec; -mod bo_roaring_bitmap_codec; -mod cbo_roaring_bitmap_codec; mod obkv_codec; -mod roaring_bitmap_codec; +mod roaring_bitmap; mod str_str_u8_codec; pub mod facet; pub use self::beu32_str_codec::BEU32StrCodec; -pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; -pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; pub use self::obkv_codec::ObkvCodec; -pub use self::roaring_bitmap_codec::RoaringBitmapCodec; +pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/milli/src/heed_codec/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs similarity index 100% rename from milli/src/heed_codec/bo_roaring_bitmap_codec.rs rename to milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs diff --git a/milli/src/heed_codec/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs similarity index 100% rename from milli/src/heed_codec/cbo_roaring_bitmap_codec.rs rename to milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs diff --git a/milli/src/heed_codec/roaring_bitmap/mod.rs b/milli/src/heed_codec/roaring_bitmap/mod.rs new file mode 100644 index 000000000..866567637 --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap/mod.rs @@ -0,0 +1,7 @@ +mod bo_roaring_bitmap_codec; +mod cbo_roaring_bitmap_codec; +mod roaring_bitmap_codec; + +pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; +pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; +pub use self::roaring_bitmap_codec::RoaringBitmapCodec; diff --git a/milli/src/heed_codec/roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs similarity index 100% rename from milli/src/heed_codec/roaring_bitmap_codec.rs rename to milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs From 8d710c51301ffe48bb346c2eb2b8b745d3522c59 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 14:24:30 +0100 Subject: [PATCH 0478/1889] Introduce heed codecs to retrieve the length of roaring bitmaps --- milli/src/heed_codec/mod.rs | 2 + .../cbo_roaring_bitmap_codec.rs | 2 +- milli/src/heed_codec/roaring_bitmap/mod.rs | 2 +- .../bo_roaring_bitmap_len_codec.rs 
| 11 +++
 .../cbo_roaring_bitmap_len_codec.rs | 22 +++++
 .../heed_codec/roaring_bitmap_length/mod.rs | 7 ++
 .../roaring_bitmap_len_codec.rs | 83 +++++++++++++++++++
 milli/src/index.rs | 4 +-
 milli/src/lib.rs | 1 +
 9 files changed, 130 insertions(+), 4 deletions(-)
 create mode 100644 milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
 create mode 100644 milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs
 create mode 100644 milli/src/heed_codec/roaring_bitmap_length/mod.rs
 create mode 100644 milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs

diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index ff8285357..a070c66eb 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -1,10 +1,12 @@
 mod beu32_str_codec;
 mod obkv_codec;
 mod roaring_bitmap;
+mod roaring_bitmap_length;
 mod str_str_u8_codec;
 pub mod facet;

 pub use self::beu32_str_codec::BEU32StrCodec;
 pub use self::obkv_codec::ObkvCodec;
 pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
+pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec};
 pub use self::str_str_u8_codec::StrStrU8Codec;
diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
index 31eb949b3..8ccf831e3 100644
--- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
@@ -8,7 +8,7 @@ use roaring::RoaringBitmap;
 /// This is the limit where using a byteorder becomes less size efficient
 /// than using a direct roaring encoding, it is also the point where we are able
 /// to determine the encoding used only by using the array of bytes length.
-const THRESHOLD: usize = 7;
+pub const THRESHOLD: usize = 7;

 /// A conditional codec that either uses the RoaringBitmap
 /// or a lighter ByteOrder en/decoding method.
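To make the THRESHOLD rule above concrete, here is a minimal sketch of the conditional encoding it governs. The cbo_encode helper is hypothetical and only illustrates the idea, it is not the actual CboRoaringBitmapCodec; in particular it assumes little-endian integers, while the real codec is free to use another byte order.

    use roaring::RoaringBitmap;

    const THRESHOLD: usize = 7;

    // Up to THRESHOLD integers are stored as raw u32s (at most 28 bytes).
    // A bitmap with more integers always serializes, via roaring, to more
    // than THRESHOLD * 4 bytes, so the byte length alone tells a decoder
    // which of the two encodings was used.
    fn cbo_encode(bitmap: &RoaringBitmap) -> Vec<u8> {
        if bitmap.len() as usize <= THRESHOLD {
            let mut bytes = Vec::with_capacity(bitmap.len() as usize * 4);
            for integer in bitmap {
                bytes.extend_from_slice(&integer.to_le_bytes());
            }
            bytes
        } else {
            let mut bytes = Vec::with_capacity(bitmap.serialized_size());
            bitmap.serialize_into(&mut bytes).unwrap();
            bytes
        }
    }

    fn main() {
        let small: RoaringBitmap = (0u32..=6).collect(); // 7 integers
        let big: RoaringBitmap = (0u32..1_000).collect();
        assert_eq!(cbo_encode(&small).len(), 28); // raw u32s
        assert!(cbo_encode(&big).len() > THRESHOLD * 4); // roaring payload
    }

The decoder applies the mirror rule, which is exactly what the CboRoaringBitmapLenCodec introduced below relies on: a payload of THRESHOLD * size_of::<u32>() bytes or fewer is read as raw integers, anything longer as a serialized roaring bitmap.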
diff --git a/milli/src/heed_codec/roaring_bitmap/mod.rs b/milli/src/heed_codec/roaring_bitmap/mod.rs
index 866567637..6f8045c92 100644
--- a/milli/src/heed_codec/roaring_bitmap/mod.rs
+++ b/milli/src/heed_codec/roaring_bitmap/mod.rs
@@ -1,5 +1,5 @@
 mod bo_roaring_bitmap_codec;
-mod cbo_roaring_bitmap_codec;
+pub mod cbo_roaring_bitmap_codec;
 mod roaring_bitmap_codec;

 pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
diff --git a/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
new file mode 100644
index 000000000..e749680a0
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs
@@ -0,0 +1,11 @@
+use std::mem;
+
+pub struct BoRoaringBitmapLenCodec;
+
+impl heed::BytesDecode<'_> for BoRoaringBitmapLenCodec {
+ type DItem = u64;
+
+ fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
+ Some((bytes.len() / mem::size_of::<u32>()) as u64)
+ }
+}
diff --git a/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs
new file mode 100644
index 000000000..4f728f1cd
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs
@@ -0,0 +1,22 @@
+use std::mem;
+
+use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
+use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
+
+pub struct CboRoaringBitmapLenCodec;
+
+impl heed::BytesDecode<'_> for CboRoaringBitmapLenCodec {
+ type DItem = u64;
+
+ fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
+ if bytes.len() <= THRESHOLD * mem::size_of::<u32>() {
+ // If there is threshold or less than threshold integers that can fit into this array
+ // of bytes it means that we used the ByteOrder codec serializer.
+ BoRoaringBitmapLenCodec::bytes_decode(bytes)
+ } else {
+ // Otherwise, it means we used the classic RoaringBitmapCodec and
+ // that the header takes threshold integers.
+ RoaringBitmapLenCodec::bytes_decode(bytes)
+ }
+ }
+}
diff --git a/milli/src/heed_codec/roaring_bitmap_length/mod.rs b/milli/src/heed_codec/roaring_bitmap_length/mod.rs
new file mode 100644
index 000000000..e503c5c7a
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap_length/mod.rs
@@ -0,0 +1,7 @@
+mod bo_roaring_bitmap_len_codec;
+mod cbo_roaring_bitmap_len_codec;
+mod roaring_bitmap_len_codec;
+
+pub use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec;
+pub use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec;
+pub use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec;
diff --git a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs
new file mode 100644
index 000000000..042b5cf6b
--- /dev/null
+++ b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs
@@ -0,0 +1,83 @@
+use std::io::{self, Read, BufRead};
+use std::mem;
+
+use byteorder::{ReadBytesExt, LittleEndian};
+
+const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
+const SERIAL_COOKIE: u16 = 12347;
+
+pub struct RoaringBitmapLenCodec;
+
+impl RoaringBitmapLenCodec {
+ // FIXME should be exported in the RoaringBitmap crate
+ fn deserialize_from_slice(mut bytes: &[u8]) -> io::Result<u64> {
+ let (size, has_offsets) = {
+ let cookie = bytes.read_u32::<LittleEndian>()?;
+ if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
+ (bytes.read_u32::<LittleEndian>()? 
as usize, true)
+ } else if (cookie as u16) == SERIAL_COOKIE {
+ return Err(io::Error::new(
+ io::ErrorKind::Other,
+ "run containers are unsupported",
+ ));
+ } else {
+ return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
+ }
+ };
+
+ if size > u16::max_value() as usize + 1 {
+ return Err(io::Error::new(
+ io::ErrorKind::Other,
+ "size is greater than supported",
+ ));
+ }
+
+ let mut description_bytes = vec![0u8; size * 4];
+ bytes.read_exact(&mut description_bytes)?;
+ let description_bytes = &mut &description_bytes[..];
+
+ if has_offsets {
+ bytes.consume(size * 4);
+ }
+
+ let mut length = 0;
+ for _ in 0..size {
+ let _key = description_bytes.read_u16::<LittleEndian>()?;
+ let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
+ length += len;
+
+ if len <= 4096 {
+ bytes.consume(len as usize * mem::size_of::<u16>());
+ } else {
+ bytes.consume(1024 * mem::size_of::<u64>())
+ }
+ }
+
+ Ok(length)
+ }
+}
+
+impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
+ type DItem = u64;
+
+ fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
+ RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ use crate::heed_codec::RoaringBitmapCodec;
+ use heed::BytesEncode;
+ use roaring::RoaringBitmap;
+
+ #[test]
+ fn deserialize_roaring_bitmap_length() {
+ let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect();
+ let bytes = RoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
+ let len = RoaringBitmapLenCodec::deserialize_from_slice(&bytes).unwrap();
+ assert_eq!(bitmap.len(), len);
+ }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 12ad86b22..f8efa324e 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -12,8 +12,8 @@ use crate::fields_ids_map::FieldsIdsMap;
 use crate::{default_criteria, Criterion, Search, FacetDistribution};
 use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
 use crate::{
-    RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec,
-    BoRoaringBitmapCodec, CboRoaringBitmapCodec,
+    RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
+    StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec,
 };

 pub const CRITERIA_KEY: &str = "criteria";
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 66d134f4e..0fa966ee8 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -26,6 +26,7 @@ pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
 pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
+pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
 pub use self::index::Index;
 pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult};
 pub use self::update_store::UpdateStore;
From 2f561c77f528d3ba44c7b1f71575e86781c42a1b Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Thu, 18 Feb 2021 14:35:14 +0100
Subject: [PATCH 0479/1889] Introduce the word documents count method on the
 index

---
 milli/src/index.rs | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index f8efa324e..5c5fc9895 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -352,6 +352,17 @@ impl Index {
 }
 }

+ /* word documents count */
+
+ /// Returns the number of documents ids associated with the given word,
+ /// it is much faster than deserializing the bitmap and getting the length of it.
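+ /// Decoding only the length is possible because the RoaringBitmapLenCodec
+ /// introduced above reads the serialized container headers instead of
+ /// decoding every value of the bitmap.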
+ pub fn word_documents_count(&self, rtxn: &RoTxn, word: &str) -> anyhow::Result> { + self.word_docids + .remap_data_type::() + .get(rtxn, word) + .map_err(Into::into) + } + /* documents */ /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing. From c2ffcc4bd1e0e0b50496a7f48df9cdf2c2743d25 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 14:59:37 +0100 Subject: [PATCH 0480/1889] Return an heed error from the word_documents_count method --- milli/src/index.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 5c5fc9895..7b83d69fc 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -356,11 +356,8 @@ impl Index { /// Returns the number of documents ids associated with the given word, /// it is much faster than deserializing the bitmap and getting the length of it. - pub fn word_documents_count(&self, rtxn: &RoTxn, word: &str) -> anyhow::Result> { - self.word_docids - .remap_data_type::() - .get(rtxn, word) - .map_err(Into::into) + pub fn word_documents_count(&self, rtxn: &RoTxn, word: &str) -> heed::Result> { + self.word_docids.remap_data_type::().get(rtxn, word) } /* documents */ From 519b1cb5c9672f2d86ebd297f1903c2e23873602 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sat, 20 Feb 2021 21:15:44 +0100 Subject: [PATCH 0481/1889] Update dependencies --- Cargo.lock | 115 +++++++++++++++++++++++++++++---------------- http-ui/Cargo.toml | 30 ++++++------ infos/Cargo.toml | 14 +++--- milli/Cargo.toml | 42 ++++++++--------- search/Cargo.toml | 12 ++--- 5 files changed, 123 insertions(+), 90 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a6a178510..112b98690 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -202,9 +202,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.6.0" +version = "3.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "099e596ef14349721d9016f6b80dd3419ea1bf289ab9b44df8e4dfd3a005d5d9" +checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe" [[package]] name = "byte-tools" @@ -221,6 +221,12 @@ dependencies = [ "utf8-width", ] +[[package]] +name = "bytemuck" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bed57e2090563b83ba8f83366628ce535a7584c9afa4c9fc0612a03925c6df58" + [[package]] name = "byteorder" version = "1.4.2" @@ -250,9 +256,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.66" +version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" +checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" dependencies = [ "jobserver", ] @@ -312,12 +318,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "const_fn" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b9d6de7f49e22cf97ad17fc4036ece69300032f45f78f30b4a4482cdc3f4a6" - [[package]] name = "cow-utils" version = "0.1.2" @@ -382,7 +382,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.1", + "crossbeam-utils 0.8.2", ] [[package]] @@ -393,19 +393,19 @@ checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.1", + "crossbeam-utils 0.8.2", ] 
[[package]] name = "crossbeam-epoch" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1aaa739f95311c2c7887a76863f500026092fb1dce0161dab577e559ef3569d" +checksum = "d60ab4a8dba064f2fbb5aa270c28da5cf4bbd0e72dae1140a6b0353a779dbe00" dependencies = [ "cfg-if 1.0.0", - "const_fn", - "crossbeam-utils 0.8.1", + "crossbeam-utils 0.8.2", "lazy_static", + "loom", "memoffset", "scopeguard", ] @@ -431,13 +431,14 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d" +checksum = "bae8f328835f8f5a6ceb6a7842a7f2d0c03692adb5c889347235d59194731fe3" dependencies = [ "autocfg", "cfg-if 1.0.0", "lazy_static", + "loom", ] [[package]] @@ -524,9 +525,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece68d15c92e84fa4f19d3780f1294e5ca82a78a6d515f1efaabcc144688be00" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" dependencies = [ "matches", "percent-encoding", @@ -670,6 +671,19 @@ dependencies = [ "byteorder", ] +[[package]] +name = "generator" +version = "0.6.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cdc09201b2e8ca1b19290cf7e65de2246b8e91fb6874279722189c4de7b94dc" +dependencies = [ + "cc", + "libc", + "log", + "rustc_version", + "winapi 0.3.9", +] + [[package]] name = "generic-array" version = "0.12.3" @@ -956,9 +970,9 @@ dependencies = [ [[package]] name = "idna" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de910d521f7cc3135c4de8db1cb910e0b5ed1dc6f57c381cd07e8e661ce10094" +checksum = "89829a5d69c23d348314a7ac337fe39173b61149a9864deabd260983aed48c21" dependencies = [ "matches", "unicode-bidi", @@ -1157,6 +1171,17 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "loom" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d44c73b4636e497b4917eb21c33539efa3816741a2d3ff26c6316f1b529481a4" +dependencies = [ + "cfg-if 1.0.0", + "generator", + "scoped-tls", +] + [[package]] name = "maplit" version = "1.0.2" @@ -1227,7 +1252,7 @@ dependencies = [ "grenad", "heed", "human_format", - "itertools 0.9.0", + "itertools 0.10.0", "levenshtein_automata", "linked-hash-map", "log", @@ -1384,9 +1409,9 @@ dependencies = [ [[package]] name = "nom" -version = "6.1.0" +version = "6.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab6f70b46d6325aa300f1c7bb3d470127dfc27806d8ea6bf294ee0ce643ce2b1" +checksum = "3d521ee2250f619dd5e06515ba405858d249edc8fae9ddee2dba0695e57db01b" dependencies = [ "bitvec", "lexical-core", @@ -1842,16 +1867,16 @@ checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.1", + "crossbeam-utils 0.8.2", "lazy_static", "num_cpus", ] [[package]] name = "redox_syscall" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ec8ca9416c5ea37062b502703cd7fcb207736bc294f6e0cf367ac6fc234570" +checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9" dependencies = [ "bitflags", ] @@ 
-1893,12 +1918,20 @@ dependencies = [ ] [[package]] -name = "roaring" -version = "0.6.4" +name = "retain_mut" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d60b41c8f25d07cecab125cb46ebbf234fc055effc61ca2392a3ef4f9422304" +checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" + +[[package]] +name = "roaring" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6744a4a918e91359ad1d356a91e2e943a86d9fb9ae77f715d617032ea2af88f" dependencies = [ + "bytemuck", "byteorder", + "retain_mut", ] [[package]] @@ -2041,9 +2074,9 @@ dependencies = [ [[package]] name = "sha-1" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4b312c3731e3fe78a185e6b9b911a7aa715b8e31cce117975219aab2acf285d" +checksum = "dfebf75d25bd900fd1e7d11501efab59bc846dbc76196839663e6637bba9f25f" dependencies = [ "block-buffer 0.9.0", "cfg-if 1.0.0", @@ -2359,9 +2392,9 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d40a22fd029e33300d8d89a5cc8ffce18bb7c587662f54629e94c9de5487f3" +checksum = "f77d3842f76ca899ff2dbcf231c5c65813dea431301d6eb686279c15c4464f12" dependencies = [ "cfg-if 1.0.0", "log", @@ -2380,11 +2413,11 @@ dependencies = [ [[package]] name = "tracing-futures" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab7bb6f14721aa00656086e9335d363c5c8747bae02ebe32ea2c7dece5689b4c" +checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" dependencies = [ - "pin-project 0.4.27", + "pin-project 1.0.5", "tracing", ] @@ -2408,7 +2441,7 @@ dependencies = [ "input_buffer", "log", "rand 0.7.3", - "sha-1 0.9.3", + "sha-1 0.9.4", "url", "utf-8", ] @@ -2481,9 +2514,9 @@ checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" [[package]] name = "url" -version = "2.2.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" +checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" dependencies = [ "form_urlencoded", "idna", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 1326abfc5..9d8f79c08 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -6,34 +6,34 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] -anyhow = "1.0.28" +anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } -heed = "0.10.5" +heed = "0.10.6" meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } memmap = "0.7.0" milli = { path = "../milli" } -once_cell = "1.4.1" +once_cell = "1.5.2" rayon = "1.5.0" -structopt = { version = "0.3.14", default-features = false, features = ["wrap_help"] } -tempfile = "3.1.0" +structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] } +tempfile = "3.2.0" # http server -askama = "0.10.1" +askama = "0.10.5" askama_warp = "0.10.0" bytes = "0.5.6" either = "1.6.1" -flate2 = "1.0.19" -futures = "0.3.6" -serde = { version = "1.0", features = ["derive"] } -serde_json = { version = "1.0.59", features = ["preserve_order"] } -tokio = 
{ version = "0.2", features = ["full"] } -warp = "0.2.2" +flate2 = "1.0.20" +futures = "0.3.12" +serde = { version = "1.0.123", features = ["derive"] } +serde_json = { version = "1.0.62", features = ["preserve_order"] } +tokio = { version = "0.2.25", features = ["full"] } +warp = "0.2.5" # logging -log = "0.4.11" -stderrlog = "0.5.0" +log = "0.4.14" +stderrlog = "0.5.1" fst = "0.4.5" # Temporary fix for bitvec, remove once fixed. (https://github.com/bitvecto-rs/bitvec/issues/105) -funty = "=1.1.0" +funty = "=1.1" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 14d52a573..32dfed20a 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -5,13 +5,13 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] -anyhow = "1.0.28" +anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -csv = "1.1.3" -heed = "0.10.5" +csv = "1.1.5" +heed = "0.10.6" jemallocator = "0.3.2" milli = { path = "../milli" } -roaring = "0.6.4" -serde_json = "1.0.59" -stderrlog = "0.5.0" -structopt = { version = "0.3.14", default-features = false } +roaring = "0.6.5" +serde_json = "1.0.62" +stderrlog = "0.5.1" +structopt = { version = "0.3.21", default-features = false } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7b6d3b7b9..0c3052796 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -5,48 +5,48 @@ authors = ["Kerollmops "] edition = "2018" [dependencies] -anyhow = "1.0.28" -bstr = "0.2.13" -byteorder = "1.3.4" +anyhow = "1.0.38" +bstr = "0.2.15" +byteorder = "1.4.2" crossbeam-channel = "0.5.0" -csv = "1.1.3" +csv = "1.1.5" either = "1.6.1" -flate2 = "1.0.17" +flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } -heed = { version = "0.10.5", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { version = "0.10.6", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } -linked-hash-map = "0.5.3" +linked-hash-map = "0.5.4" meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } memmap = "0.7.0" num-traits = "0.2.14" -obkv = "0.1.0" -once_cell = "1.4.0" -ordered-float = "2.0.0" -rayon = "1.3.1" -regex = "1.4.2" -roaring = "0.6.4" -serde = { version = "1.0", features = ["derive"] } -serde_json = { version = "1.0.59", features = ["preserve_order"] } +obkv = "0.1.1" +once_cell = "1.5.2" +ordered-float = "2.1.1" +rayon = "1.5.0" +regex = "1.4.3" +roaring = "0.6.5" +serde = { version = "1.0.123", features = ["derive"] } +serde_json = { version = "1.0.62", features = ["preserve_order"] } smallstr = { version = "0.2.0", features = ["serde"] } -smallvec = "1.4.0" -tempfile = "3.1.0" -uuid = { version = "0.8.1", features = ["v4"] } +smallvec = "1.6.1" +tempfile = "3.2.0" +uuid = { version = "0.8.2", features = ["v4"] } # facet filter parser pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } pest_derive = "2.1.0" # documents words self-join -itertools = "0.9.0" +itertools = "0.10.0" # logging -log = "0.4.11" +log = "0.4.14" [dev-dependencies] -criterion = "0.3.3" +criterion = "0.3.4" maplit = "1.0.2" [build-dependencies] diff --git a/search/Cargo.toml b/search/Cargo.toml index 947deb70d..a2c79776a 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -5,12 +5,12 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] -anyhow = "1.0.28" +anyhow = 
"1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = "0.10.5" +heed = "0.10.6" jemallocator = "0.3.2" -log = "0.4.11" +log = "0.4.14" milli = { path = "../milli" } -serde_json = "1.0.59" -stderrlog = "0.5.0" -structopt = { version = "0.3.14", default-features = false } +serde_json = "1.0.62" +stderrlog = "0.5.1" +structopt = { version = "0.3.21", default-features = false } From c318373b88a9cc7ee4b451f1314d11f510db6158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 18 Feb 2021 18:33:25 +0100 Subject: [PATCH 0482/1889] Expose the WordsPrefixes update on the UpdateBuilder --- milli/src/update/update_builder.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 8d6eb034d..c966f72d2 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -2,7 +2,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; +use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets, WordsPrefixes}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -135,4 +135,19 @@ impl<'a> UpdateBuilder<'a> { builder } + + pub fn words_prefixes<'t, 'u, 'i>( + self, + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordsPrefixes<'t, 'u, 'i> + { + let mut builder = WordsPrefixes::new(wtxn, index, self.update_id); + + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + + builder + } } From c62d2f56d844bdfa6f2a8c8baa0f5c5240f284d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 18 Feb 2021 18:33:51 +0100 Subject: [PATCH 0483/1889] Expose an http route for the WordsPrefixes update --- http-ui/src/main.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index a29eb8895..2ce7f8bd1 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -225,6 +225,7 @@ enum UpdateMeta { ClearDocuments, Settings(Settings), Facets(Facets), + WordsPrefixes(WordsPrefixes), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -275,6 +276,14 @@ struct Facets { min_level_size: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +struct WordsPrefixes { + threshold: Option, + max_prefix_length: Option, +} + // Any value that is present is considered Some value, including null. fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> where T: Deserialize<'de>, @@ -467,6 +476,21 @@ async fn main() -> anyhow::Result<()> { Ok(()) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) } + }, + UpdateMeta::WordsPrefixes(settings) => { + // We must use the write transaction of the update here. 
+ let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned); + if let Some(value) = settings.threshold { + builder.threshold(value); + } + if let Some(value) = settings.max_prefix_length { + builder.max_prefix_length(value); + } + match builder.execute() { + Ok(()) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()) + } } }; @@ -884,6 +908,19 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); + let update_store_cloned = update_store.clone(); + let update_status_sender_cloned = update_status_sender.clone(); + let change_words_prefixes_route = warp::filters::method::post() + .and(warp::path!("words-prefixes")) + .and(warp::body::json()) + .map(move |settings: WordsPrefixes| { + let meta = UpdateMeta::WordsPrefixes(settings); + let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); + let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); + eprintln!("update {} registered", update_id); + warp::reply() + }); + let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); let abort_update_id_route = warp::filters::method::delete() @@ -958,6 +995,7 @@ async fn main() -> anyhow::Result<()> { .or(clearing_route) .or(change_settings_route) .or(change_facet_levels_route) + .or(change_words_prefixes_route) .or(update_ws_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; From e08b6b3ec7c4329df1d6d3dcc3f3815a35b85aca Mon Sep 17 00:00:00 2001 From: mpostma Date: Fri, 19 Feb 2021 09:54:31 +0100 Subject: [PATCH 0484/1889] add primary key to fields_id_map when not present --- milli/src/update/index_documents/transform.rs | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 68888aad9..b22cd14c6 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -516,7 +516,7 @@ fn compute_primary_key_pair( ) -> anyhow::Result<(FieldId, String)> { match primary_key { Some(primary_key) => { - let id = fields_ids_map.id(primary_key).expect("primary key must be present in the fields id map"); + let id = fields_ids_map.insert(primary_key).ok_or(anyhow!("Maximum number of fields exceeded"))?; Ok((id, primary_key.to_string())) } None => { @@ -572,17 +572,6 @@ mod test { use super::compute_primary_key_pair; use super::FieldsIdsMap; - #[test] - #[should_panic] - fn should_panic_primary_key_not_in_map() { - let mut fields_map = FieldsIdsMap::new(); - let _result = compute_primary_key_pair( - Some("toto"), - &mut fields_map, - None, - false); - } - #[test] fn should_return_primary_key_if_is_some() { let mut fields_map = FieldsIdsMap::new(); From 45330a5e479d921e150c2a92c3929e78e2694235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 16:19:27 +0100 Subject: [PATCH 0485/1889] Avoid creating a default empty database in the infos crate --- infos/src/main.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index 3f41b7d42..1ebf39969 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -200,6 +200,11 @@ fn run(opt: Opt) -> anyhow::Result<()> { let mut options = EnvOpenOptions::new(); options.map_size(opt.database_size.get_bytes() as usize); + // Return an error if the database does not exist. 
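+ // (without this early check, opening the index would silently create a
+ // brand new empty database at that path)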
+ if !opt.database.exists() { + anyhow::bail!("The database ({}) does not exist.", opt.database.display()); + } + // Open the LMDB database. let index = Index::new(options, opt.database)?; let rtxn = index.read_txn()?; From b59fe77ec75c098b2c3c3599acd5a9a59432013b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 16:19:52 +0100 Subject: [PATCH 0486/1889] Avoid creating a default empty database in the search crate --- search/src/main.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/search/src/main.rs b/search/src/main.rs index d2e727417..7e9443e5f 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -57,7 +57,11 @@ fn run(opt: Opt) -> anyhow::Result<()> { .timestamp(stderrlog::Timestamp::Off) .init()?; - std::fs::create_dir_all(&opt.database)?; + // Return an error if the database does not exist. + if !opt.database.exists() { + anyhow::bail!("The database ({}) does not exist.", opt.database.display()); + } + let mut options = EnvOpenOptions::new(); options.map_size(opt.database_size.get_bytes() as usize); From 78bede1ffbc11a2123ce9d1dc9e7a52e3290d022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 16:21:44 +0100 Subject: [PATCH 0487/1889] Fix error displaying of the workspace members --- infos/src/main.rs | 11 +---------- search/src/main.rs | 11 +---------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 1ebf39969..c627edad8 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -179,18 +179,9 @@ enum Command { PatchToNewExternalIds, } -fn main() -> Result<(), ()> { +fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); - match run(opt) { - Ok(()) => Ok(()), - Err(e) => { - eprintln!("{}", e); - Err(()) - }, - } -} -fn run(opt: Opt) -> anyhow::Result<()> { stderrlog::new() .verbosity(opt.verbose) .show_level(false) diff --git a/search/src/main.rs b/search/src/main.rs index 7e9443e5f..f7f95b730 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -39,18 +39,9 @@ pub struct Opt { print_facet_distribution: bool, } -fn main() -> Result<(), ()> { +fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); - match run(opt) { - Ok(()) => Ok(()), - Err(e) => { - eprintln!("{}", e); - Err(()) - }, - } -} -fn run(opt: Opt) -> anyhow::Result<()> { stderrlog::new() .verbosity(opt.verbose) .show_level(false) From 4884b324e63262dcfbfbf6ee2308678177cd547d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 16:24:12 +0100 Subject: [PATCH 0488/1889] Remove the useless external ids patch method in the infos crate --- infos/src/main.rs | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index c627edad8..92eeebf83 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -21,7 +21,6 @@ const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const DOCUMENTS_DB_NAME: &str = "documents"; -const USERS_IDS_DOCUMENTS_IDS: &[u8] = b"users-ids-documents-ids"; const ALL_DATABASE_NAMES: &[&str] = &[ MAIN_DB_NAME, @@ -173,10 +172,6 @@ enum Command { /// /// All of the fields are extracted, not just the displayed ones. ExportDocuments, - - /// A command that patches the old external ids - /// into the new external ids format. 
- PatchToNewExternalIds, } fn main() -> anyhow::Result<()> { @@ -224,32 +219,9 @@ fn main() -> anyhow::Result<()> { ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportDocuments => export_documents(&index, &rtxn), - PatchToNewExternalIds => { - drop(rtxn); - let mut wtxn = index.write_txn()?; - let result = patch_to_new_external_ids(&index, &mut wtxn); - wtxn.commit()?; - result - }, } } -fn patch_to_new_external_ids(index: &Index, wtxn: &mut heed::RwTxn) -> anyhow::Result<()> { - use heed::types::ByteSlice; - - if let Some(documents_ids) = index.main.get::<_, ByteSlice, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)? { - let documents_ids = documents_ids.to_owned(); - index.main.put::<_, ByteSlice, ByteSlice>( - wtxn, - milli::index::HARD_EXTERNAL_DOCUMENTS_IDS_KEY.as_bytes(), - &documents_ids, - )?; - index.main.delete::<_, ByteSlice>(wtxn, USERS_IDS_DOCUMENTS_IDS)?; - } - - Ok(()) -} - fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { use std::collections::BinaryHeap; use std::cmp::Reverse; From 1eb7ce5cdbad9274673b64ad8f051f7ce98bce02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 23 Feb 2021 21:08:52 +0100 Subject: [PATCH 0489/1889] Improve the export-documents infos command by accepting internal ids --- infos/src/main.rs | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 92eeebf83..91157aaad 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -171,7 +171,10 @@ enum Command { /// Outputs the documents as JSON lines to the standard output. /// /// All of the fields are extracted, not just the displayed ones. - ExportDocuments, + ExportDocuments { + /// If defined, only retrieve the documents that corresponds to these internal ids. + internal_documents_ids: Vec, + }, } fn main() -> anyhow::Result<()> { @@ -218,7 +221,9 @@ fn main() -> anyhow::Result<()> { }, ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), - ExportDocuments => export_documents(&index, &rtxn), + ExportDocuments { internal_documents_ids } => { + export_documents(&index, &rtxn, internal_documents_ids) + }, } } @@ -583,9 +588,9 @@ fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result< Ok(()) } -fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { +fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _}; - use milli::obkv_to_json; + use milli::{BEU32, obkv_to_json}; let stdout = io::stdout(); let mut out = BufWriter::new(stdout); @@ -593,8 +598,18 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(rtxn)?; let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); - for result in index.documents.iter(rtxn)? 
{ - let (_id, obkv) = result?; + let iter: Box> = if internal_ids.is_empty() { + Box::new(index.documents.iter(rtxn)?.map(|result| { + result.map(|(_id, obkv)| obkv) + })) + } else { + Box::new(internal_ids.into_iter().flat_map(|id| { + index.documents.get(rtxn, &BEU32::new(id)).transpose() + })) + }; + + for result in iter { + let obkv = result?; let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?; serde_json::to_writer(&mut out, &document)?; writeln!(&mut out)?; From 9423310816a4473b0f6d998954525ad85dee7fbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 21 Feb 2021 17:21:13 +0100 Subject: [PATCH 0490/1889] Introduce an helpers crate that export the database to stdout --- Cargo.lock | 12 +++++++ Cargo.toml | 2 +- helpers/Cargo.toml | 14 ++++++++ helpers/src/main.rs | 86 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 helpers/Cargo.toml create mode 100644 helpers/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 112b98690..679eed2c0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -859,6 +859,18 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "helpers" +version = "0.1.0" +dependencies = [ + "anyhow", + "byte-unit", + "heed", + "milli", + "stderrlog", + "structopt", +] + [[package]] name = "hermit-abi" version = "0.1.18" diff --git a/Cargo.toml b/Cargo.toml index 16a5ab8d5..a60c293e3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["milli", "http-ui", "infos", "search"] +members = ["milli", "http-ui", "infos", "helpers", "search"] default-members = ["milli"] [profile.release] diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml new file mode 100644 index 000000000..ea0e1ddc8 --- /dev/null +++ b/helpers/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "helpers" +version = "0.1.0" +authors = ["Clément Renault "] +edition = "2018" + +[dependencies] +anyhow = "1.0.38" +byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } +heed = "0.10.6" +jemallocator = "0.3.2" +milli = { path = "../milli" } +stderrlog = "0.5.1" +structopt = { version = "0.3.21", default-features = false } diff --git a/helpers/src/main.rs b/helpers/src/main.rs new file mode 100644 index 000000000..c916d0448 --- /dev/null +++ b/helpers/src/main.rs @@ -0,0 +1,86 @@ +use std::path::PathBuf; + +use byte_unit::Byte; +use heed::{Env, EnvOpenOptions, CompactionOption}; +use structopt::StructOpt; + +use Command::*; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +#[derive(Debug, StructOpt)] +/// Some helpers commands for milli. +pub struct Opt { + /// The database path where the database is located. + /// It is created if it doesn't already exist. + #[structopt(long = "db", parse(from_os_str))] + database: PathBuf, + + /// The maximum size the database can take on disk. It is recommended to specify + /// the whole disk space (value must be a multiple of a page size). + #[structopt(long = "db-size", default_value = "100 GiB")] + database_size: Byte, + + /// Verbose mode (-v, -vv, -vvv, etc.) + #[structopt(short, long, parse(from_occurrences))] + verbose: usize, + + #[structopt(subcommand)] + command: Command, +} + +#[derive(Debug, StructOpt)] +enum Command { + /// Outputs the main LMDB database to stdout. + CopyMainDatabase { + /// Wether to enable or not the compaction of the database. 
+ #[structopt(long, short = "c")] + enable_compaction: bool, + }, +} + +fn main() -> anyhow::Result<()> { + let opt = Opt::from_args(); + + stderrlog::new() + .verbosity(opt.verbose) + .show_level(false) + .timestamp(stderrlog::Timestamp::Off) + .init()?; + + let mut options = EnvOpenOptions::new(); + options.map_size(opt.database_size.get_bytes() as usize); + + // Return an error if the database does not exist. + if !opt.database.exists() { + anyhow::bail!("The database ({}) does not exist.", opt.database.display()); + } + + let env = options.open(opt.database)?; + + match opt.command { + CopyMainDatabase { enable_compaction } => { + use CompactionOption::*; + let compaction = if enable_compaction { Enabled } else { Disabled }; + copy_main_database_to_stdout(env, compaction) + }, + } +} + +#[cfg(target_family = "unix")] +fn copy_main_database_to_stdout(env: Env, compaction: CompactionOption) -> anyhow::Result<()> { + use std::os::unix::io::AsRawFd; + + let stdout = std::io::stdout().as_raw_fd(); + unsafe { env.copy_to_fd(stdout, compaction).map_err(Into::into) } +} + +#[cfg(target_family = "windows")] +fn copy_main_database_to_stdout(env: Env, compaction: CompactionOption) -> anyhow::Result<()> { + use std::os::windows::io::AsRawHandle; + + let stdout = std::io::stdout().as_raw_handle(); + unsafe { env.copy_to_fd(stdout, compaction).map_err(Into::into) } +} From 79a143b32f8004dfb9d0c7a120add8a0f9a07b2a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 12:03:31 +0100 Subject: [PATCH 0491/1889] Introduce the query tree data structure --- Cargo.lock | 2 + milli/Cargo.toml | 2 + milli/src/search/mod.rs | 28 ++ milli/src/search/query_tree.rs | 887 +++++++++++++++++++++++++++++++++ 4 files changed, 919 insertions(+) create mode 100644 milli/src/search/query_tree.rs diff --git a/Cargo.lock b/Cargo.lock index 679eed2c0..883d836b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1277,11 +1277,13 @@ dependencies = [ "ordered-float", "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", "pest_derive", + "rand 0.8.3", "rayon", "regex", "roaring", "serde", "serde_json", + "slice-group-by", "smallstr", "smallvec", "tempfile", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 0c3052796..9f378f14c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -30,6 +30,7 @@ regex = "1.4.3" roaring = "0.6.5" serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } +slice-group-by = "0.2.6" smallstr = { version = "0.2.0", features = ["serde"] } smallvec = "1.6.1" tempfile = "3.2.0" @@ -48,6 +49,7 @@ log = "0.4.14" [dev-dependencies] criterion = "0.3.4" maplit = "1.0.2" +rand = "0.8.3" [build-dependencies] fst = "0.4.5" diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 459b301a6..e5672982e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -29,6 +29,7 @@ static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); mod facet; +mod query_tree; pub struct Search<'a> { query: Option, @@ -391,3 +392,30 @@ pub struct SearchResult { // TODO those documents ids should be associated with their criteria scores. 
pub documents_ids: Vec, } + +pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { + let dfa = { + let lev = match max_typo { + 0 => &LEVDIST0, + 1 => &LEVDIST1, + _ => &LEVDIST2, + }; + + if is_prefix { + lev.build_prefix_dfa(&word) + } else { + lev.build_dfa(&word) + } + }; + + let mut derived_words = Vec::new(); + let mut stream = fst.search_with_state(&dfa).into_stream(); + + while let Some((word, state)) = stream.next() { + let word = std::str::from_utf8(word)?; + let distance = dfa.distance(state); + derived_words.push((word.to_string(), distance.to_u8())); + } + + Ok(derived_words) +} diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs new file mode 100644 index 000000000..f0133cde4 --- /dev/null +++ b/milli/src/search/query_tree.rs @@ -0,0 +1,887 @@ +#![allow(unused)] + +use std::borrow::Cow; +use std::collections::BTreeMap; +use std::{fmt, cmp, mem}; + +use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; +use roaring::RoaringBitmap; +use slice_group_by::GroupBy; + +use crate::Index; + +type IsOptionalWord = bool; +type IsPrefix = bool; + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum Operation { + And(Vec), + Consecutive(Vec), + Or(IsOptionalWord, Vec), + Query(Query), +} + +impl fmt::Debug for Operation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result { + match op { + Operation::And(children) => { + writeln!(f, "{:1$}AND", "", depth * 2)?; + children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + }, + Operation::Consecutive(children) => { + writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?; + children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + }, + Operation::Or(true, children) => { + writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; + children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + }, + Operation::Or(false, children) => { + writeln!(f, "{:1$}OR", "", depth * 2)?; + children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + }, + Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2), + } + } + + pprint_tree(f, self, 0) + } +} + +impl Operation { + fn tolerant(prefix: IsPrefix, s: &str) -> Operation { + Operation::Query(Query { prefix, kind: QueryKind::tolerant(2, s.to_string()) }) + } + + fn exact(prefix: IsPrefix, s: &str) -> Operation { + Operation::Query(Query { prefix, kind: QueryKind::exact(s.to_string()) }) + } + + fn phrase(words: Vec) -> Operation { + Operation::consecutive( + words.into_iter().map(|s| { + Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) }) + }).collect() + ) + } + + fn and(mut ops: Vec) -> Self { + if ops.len() == 1 { + ops.pop().unwrap() + } else { + Self::And(ops) + } + } + + pub fn or(word_branch: IsOptionalWord, mut ops: Vec) -> Self { + if ops.len() == 1 { + ops.pop().unwrap() + } else { + Self::Or(word_branch, ops) + } + } + + fn consecutive(mut ops: Vec) -> Self { + if ops.len() == 1 { + ops.pop().unwrap() + } else { + Self::Consecutive(ops) + } + } +} + +#[derive(Clone, Eq, PartialEq, Hash)] +pub struct Query { + pub prefix: IsPrefix, + pub kind: QueryKind, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum QueryKind { + Tolerant { typo: u8, word: String }, + Exact { original_typo: u8, word: String }, +} + +impl QueryKind { + fn exact(word: String) -> Self { + QueryKind::Exact { original_typo: 0, word } + } + + fn tolerant(typo: u8, word: String) -> Self { + QueryKind::Tolerant 
{ typo, word } + } + + pub fn typo(&self) -> u8 { + match self { + QueryKind::Tolerant { typo, .. } => *typo, + QueryKind::Exact { original_typo, .. } => *original_typo, + } + } + + pub fn word(&self) -> &str { + match self { + QueryKind::Tolerant { word, .. } => word, + QueryKind::Exact { word, .. } => word, + } + } +} + +impl fmt::Debug for Query { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Query { prefix, kind } = self; + let prefix = if *prefix { String::from("Prefix") } else { String::default() }; + match kind { + QueryKind::Exact { word, .. } => { + f.debug_struct(&(prefix + "Exact")).field("word", &word).finish() + }, + QueryKind::Tolerant { typo, word } => { + f.debug_struct(&(prefix + "Tolerant")).field("word", &word).field("max typo", &typo).finish() + }, + } + } +} + +trait Context { + fn word_docids(&self, word: &str) -> heed::Result>; + fn synonyms>(&self, words: &[S]) -> heed::Result>>>; +} + +/// The query tree builder is the interface to build a query tree. +pub struct QueryTreeBuilder<'a> { + rtxn: &'a heed::RoTxn<'a>, + index: &'a Index, +} + +impl<'a> Context for QueryTreeBuilder<'a> { + fn word_docids(&self, word: &str) -> heed::Result> { + self.index.word_docids.get(self.rtxn, word) + } + + fn synonyms>(&self, _words: &[S]) -> heed::Result>>> { + Ok(None) + } +} + +impl<'a> QueryTreeBuilder<'a> { + /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` + /// and an Index `index`. + pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self { + Self { rtxn, index } + } + + /// Build the query tree: + /// - if `optional_words` is set to `false` the query tree will be + /// generated forcing all query words to be present in each matching documents + /// (the criterion `words` will be ignored) + /// - if `authorize_typos` is set to `false` the query tree will be generated + /// forcing all query words to match documents without any typo + /// (the criterion `typo` will be ignored) + pub fn build( + &self, + optional_words: bool, + authorize_typos: bool, + query: TokenStream, + ) -> anyhow::Result> + { + let primitive_query = create_primitive_query(query); + if !primitive_query.is_empty() { + create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) + } else { + Ok(None) + } + } +} + +/// Split the word depending on the frequency of subwords in the database documents. +fn split_best_frequency<'a>(ctx: &impl Context, word: &'a str) -> heed::Result> { + let chars = word.char_indices().skip(1); + let mut best = None; + + for (i, _) in chars { + let (left, right) = word.split_at(i); + + let left_freq = ctx.word_docids(left)?.map(|docids| docids.len()).unwrap_or(0); + let right_freq = ctx.word_docids(right)?.map(|docids| docids.len()).unwrap_or(0); + + let min_freq = cmp::min(left_freq, right_freq); + if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { + best = Some((min_freq, left, right)); + } + } + + Ok(best.map(|(_, left, right)| Operation::Consecutive( + vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(left.to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(right.to_string()) }) + ] + ))) +} + +/// Return the `QueryKind` of a word depending on `authorize_typos` +/// and the provided word length. 
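+///
+/// A hypothetical, ignored doc-test (these items are crate-private) tracing
+/// the thresholds implemented just below:
+///
+/// ```ignore
+/// assert_eq!(typos("hey".to_string(), true), QueryKind::exact("hey".to_string()));
+/// assert_eq!(typos("friends".to_string(), true), QueryKind::tolerant(1, "friends".to_string()));
+/// assert_eq!(typos("butterflies".to_string(), true), QueryKind::tolerant(2, "butterflies".to_string()));
+/// assert_eq!(typos("friends".to_string(), false), QueryKind::exact("friends".to_string()));
+/// ```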
+fn typos(word: String, authorize_typos: bool) -> QueryKind { + if authorize_typos { + match word.len() { + 0..=4 => QueryKind::exact(word), + 5..=8 => QueryKind::tolerant(1, word), + _ => QueryKind::tolerant(2, word), + } + } else { + QueryKind::exact(word) + } +} + +/// Fetch synonyms from the `Context` for the provided word +/// and create the list of operations for the query tree +fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result>> { + let synonyms = ctx.synonyms(word)?; + + Ok(synonyms.map(|synonyms| { + synonyms.into_iter().map(|synonym| { + let words = synonym.into_iter().map(|word| { + Operation::Query(Query { prefix: false, kind: QueryKind::exact(word) }) + }).collect(); + Operation::and(words) + }).collect() + })) +} + +/// The query tree builder is the interface to build a query tree. +pub struct MatchingWords { + inner: BTreeMap +} + +impl MatchingWords { + /// List all words which can be considered as a match for the query tree. + pub fn from_query_tree(tree: &Operation, fst: &fst::Set>) -> Self { + Self { inner: fetch_words(tree, fst).into_iter().collect() } + } + + /// Return true if the word match. + pub fn is_match(&self, word: &str) -> bool { + fn first_char(s: &str) -> Option<&str> { + s.chars().next().map(|c| &s[..c.len_utf8()]) + } + + match first_char(word) { + Some(first) => { + let left = first.to_owned(); + let right = word.to_owned(); + self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word) + }, + None => false + } + } +} + +type FetchedWords = Vec<(String, IsPrefix)>; + +/// Lists all words which can be considered as a match for the query tree. +fn fetch_words(tree: &Operation, fst: &fst::Set>) -> FetchedWords { + fn resolve_branch(tree: &[Operation], fst: &fst::Set>) -> FetchedWords { + tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect() + } + + fn resolve_query(query: &Query, fst: &fst::Set>) -> FetchedWords { + match query.kind.clone() { + QueryKind::Exact { word, .. } => vec![(word, query.prefix)], + QueryKind::Tolerant { typo, word } => { + if let Ok(words) = super::word_typos(&word, query.prefix, typo, fst) { + words.into_iter().map(|(w, _)| (w, query.prefix)).collect() + } else { + vec![(word, query.prefix)] + } + } + } + } + + fn resolve_ops(tree: &Operation, fst: &fst::Set>) -> FetchedWords { + match tree { + Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { + resolve_branch(ops.as_slice(), fst) + }, + Operation::Query(ops) => { + resolve_query(ops, fst) + }, + } + } + + let mut words = resolve_ops(tree, fst); + words.sort_unstable(); + words.dedup(); + words +} + +/// Main function that creates the final query tree from the primitive query. +fn create_query_tree( + ctx: &impl Context, + optional_words: bool, + authorize_typos: bool, + query: PrimitiveQuery, +) -> anyhow::Result +{ + /// Matches on the `PrimitiveQueryPart` and create an operation from it. + fn resolve_primitive_part( + ctx: &impl Context, + authorize_typos: bool, + part: PrimitiveQueryPart, + ) -> anyhow::Result + { + match part { + // 1. try to split word in 2 + // 2. try to fetch synonyms + // 3. create an operation containing the word + // 4. wrap all in an OR operation + PrimitiveQueryPart::Word(word, prefix) => { + let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); + if let Some(child) = split_best_frequency(ctx, &word)? 
{ + children.push(child); + } + children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); + Ok(Operation::or(false, children)) + }, + // create a CONSECUTIVE operation wrapping all word in the phrase + PrimitiveQueryPart::Phrase(words) => { + Ok(Operation::phrase(words)) + }, + } + } + + /// Create all ngrams 1..=3 generating query tree branches. + fn ngrams( + ctx: &impl Context, + authorize_typos: bool, + query: &[PrimitiveQueryPart], + ) -> anyhow::Result + { + const MAX_NGRAM: usize = 3; + let mut op_children = Vec::new(); + + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) { + let mut or_op_children = Vec::new(); + + for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { + if let Some(group) = sub_query.get(..ngram) { + let mut and_op_children = Vec::new(); + let tail = &sub_query[ngram..]; + let is_last = tail.is_empty(); + + match group { + [part] => { + let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; + and_op_children.push(operation); + }, + words => { + let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); + let words: Vec<_> = words.iter().filter_map(| part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }).collect(); + let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); + let concat = words.concat(); + let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; + operations.push(Operation::Query(query)); + and_op_children.push(Operation::or(false, operations)); + } + } + + if !is_last { + let ngrams = ngrams(ctx, authorize_typos, tail)?; + and_op_children.push(ngrams); + } + or_op_children.push(Operation::and(and_op_children)); + } + } + op_children.push(Operation::or(false, or_op_children)); + } + + Ok(Operation::and(op_children)) + } + + /// Create a new branch removing the last non-phrase query parts. + fn optional_word( + ctx: &impl Context, + authorize_typos: bool, + query: PrimitiveQuery, + ) -> anyhow::Result + { + let word_count = query.iter().filter(|part| !part.is_phrase()).count(); + let mut operation_children = Vec::new(); + + for count in (1..=word_count).rev() { + let mut tmp_count = 0; + + // keep only the N firsts non-quoted words, where N = count + // quoted words are allways kept + let query: Vec<_> = query.iter().cloned().filter(|part| { + if !part.is_phrase() { + tmp_count += 1; + tmp_count <= count + } else { true } + }).collect(); + + let ngrams = ngrams(ctx, authorize_typos, query.as_slice())?; + operation_children.push(ngrams); + } + + Ok(Operation::or(true, operation_children)) + } + + if optional_words { + optional_word(ctx, authorize_typos, query) + } else { + ngrams(ctx, authorize_typos, query.as_slice()) + } +} + +type PrimitiveQuery = Vec; + +#[derive(Debug, Clone)] +enum PrimitiveQueryPart { + Phrase(Vec), + Word(String, IsPrefix), +} + +impl PrimitiveQueryPart { + fn is_phrase(&self) -> bool { + matches!(self, Self::Phrase(_)) + } + + fn is_prefix(&self) -> bool { + matches!(self, Self::Word(_, is_prefix) if *is_prefix) + } +} + +/// Create primitive query from tokenized query string, +/// the primitive query is an intermediate state to build the query tree. +fn create_primitive_query(query: TokenStream) -> PrimitiveQuery { + let mut primitive_query = Vec::new(); + let mut phrase = Vec::new(); + let mut quoted = false; + + let mut peekable = query.peekable(); + while let Some(token) = peekable.next() { + match token.kind { + TokenKind::Word => { + // 1. 
if the word is quoted we push it in a phrase-buffer waiting for the ending quote, + // 2. if the word is not the last token of the query we push it as a non-prefix word, + // 3. if the word is the last token of the query we push it as a prefix word. + if quoted { + phrase.push(token.word.to_string()); + } else if peekable.peek().is_some() { + primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); + } else { + primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); + } + }, + TokenKind::Separator(_) => { + let quote_count = token.word.chars().filter(|&s| s == '"').count(); + // swap quoted state if we encounter a double quote + if quote_count % 2 != 0 { + quoted = !quoted; + } + if !phrase.is_empty() && quote_count > 0 { + primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); + } + }, + _ => (), + } + } + + // If a quote is never closed, we consider all of the end of the query as a phrase. + if !phrase.is_empty() { + primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); + } + + primitive_query +} + +#[cfg(test)] +mod test { + use fst::Set; + use maplit::hashmap; + use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; + use rand::{Rng, SeedableRng, rngs::StdRng}; + + use super::*; + use std::collections::HashMap; + #[derive(Debug)] + struct TestContext { + synonyms: HashMap, Vec>>, + postings: HashMap, + } + + impl TestContext { + fn build( + &self, + optional_words: bool, + authorize_typos: bool, + query: TokenStream, + ) -> anyhow::Result> + { + let primitive_query = create_primitive_query(query); + if !primitive_query.is_empty() { + create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) + } else { + Ok(None) + } + } + } + + impl Context for TestContext { + fn word_docids(&self, word: &str) -> heed::Result> { + Ok(self.postings.get(word).cloned()) + } + + fn synonyms>(&self, words: &[S]) -> heed::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); + Ok(self.synonyms.get(&words).cloned()) + } + } + + impl Default for TestContext { + + fn default() -> TestContext { + let mut rng = StdRng::seed_from_u64(102); + let rng = &mut rng; + + fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { + let mut values = Vec::::with_capacity(len); + while values.len() != len { + values.push(rng.gen()); + } + values.sort_unstable(); + + RoaringBitmap::from_sorted_iter(values.into_iter()) + } + + TestContext { + synonyms: hashmap!{ + vec![String::from("hello")] => vec![ + vec![String::from("hi")], + vec![String::from("good"), String::from("morning")], + ], + vec![String::from("world")] => vec![ + vec![String::from("earth")], + vec![String::from("nature")], + ], + // new york city + vec![String::from("nyc")] => vec![ + vec![String::from("new"), String::from("york")], + vec![String::from("new"), String::from("york"), String::from("city")], + ], + vec![String::from("new"), String::from("york")] => vec![ + vec![String::from("nyc")], + vec![String::from("new"), String::from("york"), String::from("city")], + ], + vec![String::from("new"), String::from("york"), String::from("city")] => vec![ + vec![String::from("nyc")], + vec![String::from("new"), String::from("york")], + ], + }, + postings: hashmap!{ + String::from("hello") => random_postings(rng, 1500), + String::from("hi") => random_postings(rng, 4000), + String::from("word") => random_postings(rng, 2500), + String::from("split") => random_postings(rng, 400), + String::from("ngrams") => 
random_postings(rng, 1400), + String::from("world") => random_postings(rng, 15_000), + String::from("earth") => random_postings(rng, 8000), + String::from("2021") => random_postings(rng, 100), + String::from("2020") => random_postings(rng, 500), + String::from("is") => random_postings(rng, 50_000), + String::from("this") => random_postings(rng, 50_000), + String::from("good") => random_postings(rng, 1250), + String::from("morning") => random_postings(rng, 125), + }, + } + } + } + + #[test] + fn prefix() { + let query = "hey friends"; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "friends".to_string()) }), + ]), + Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), + ]); + + let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn no_prefix() { + let query = "hey friends "; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friends".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), + ]); + + let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn synonyms() { + let query = "hello world "; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hi".to_string()) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("morning".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "hello".to_string()) }), + ]), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("earth".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("nature".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), + ]), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }), + ]); + + let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn complex_synonyms() { + let query = "new york city "; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + 
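+ // The expected tree below combines the synonym definitions from
+ // `TestContext::default()` ("nyc" <-> "new york" <-> "new york city")
+ // with the 2-gram and 3-gram concatenations ("newyork", "yorkcity",
+ // "newyorkcity") produced by the `ngrams` function.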
let expected = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), + Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "yorkcity".to_string()) }), + ]), + ]), + Operation::And(vec![ + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "newyork".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), + ]), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "newyorkcity".to_string()) }), + ]), + ]); + + let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn ngrams() { + let query = "n grams "; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("n".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "grams".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }), + ]); + + let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn word_split() { + let query = "wordsplit fish "; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Or(false, vec![ + Operation::Consecutive(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("fish".to_string()) }) + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }), + ]); + + let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn phrase() { + let query = "\"hey friends\" \" \" \"wooop"; + let stop_words = 
&Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::And(vec![ + Operation::Consecutive(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), + ]); + + let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn optional_word() { + let query = "hey my friend "; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Or(true, vec![ + Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "myfriend".to_string()) }) + ]) + ]), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }), + ]), + Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + ]); + let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn no_typo() { + let query = "hey friends "; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }), + ]); + let query_tree = TestContext::default().build(false, false, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn fetching_words() { + let query = "wordsplit nyc world"; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let context = TestContext::default(); + let query_tree = context.build(false, true, tokens).unwrap().unwrap(); + + let expected = vec![ + ("city".to_string(), false), + ("earth".to_string(), false), + 
("nature".to_string(), false), + ("new".to_string(), false), + ("nyc".to_string(), false), + ("split".to_string(), false), + ("word".to_string(), false), + ("word".to_string(), true), + ("world".to_string(), true), + ("york".to_string(), false), + + ]; + + let mut keys = context.postings.keys().collect::>(); + keys.sort_unstable(); + let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap(); + + let words = fetch_words(&query_tree, &set); + + assert_eq!(expected, words); + } +} From 4f197492523d47dea6ac1eb510182e93ff08a0d7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 15:06:58 +0100 Subject: [PATCH 0492/1889] Introduce the word_documents_count method on the Context trait --- milli/src/search/query_tree.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index f0133cde4..7457098cb 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -145,6 +145,12 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; fn synonyms>(&self, words: &[S]) -> heed::Result>>>; + fn word_documents_count(&self, word: &str) -> heed::Result> { + match self.word_docids(word)? { + Some(rb) => Ok(Some(rb.len())), + None => Ok(None), + } + } } /// The query tree builder is the interface to build a query tree. @@ -158,6 +164,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.word_docids.get(self.rtxn, word) } + fn word_documents_count(&self, word: &str) -> heed::Result> { + self.index.word_documents_count(self.rtxn, word) + } + fn synonyms>(&self, _words: &[S]) -> heed::Result>>> { Ok(None) } @@ -201,8 +211,8 @@ fn split_best_frequency<'a>(ctx: &impl Context, word: &'a str) -> heed::Result old) { From 1dc857a4b29db1ce107c16ec4a4dde01f4d4d4b0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 22:18:36 +0100 Subject: [PATCH 0493/1889] Fix the query tree optional word generation with phrases --- milli/src/search/query_tree.rs | 96 +++++++++++++++++++++++++++------- 1 file changed, 76 insertions(+), 20 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 7457098cb..682f66f24 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -419,22 +419,26 @@ fn create_query_tree( query: PrimitiveQuery, ) -> anyhow::Result { - let word_count = query.iter().filter(|part| !part.is_phrase()).count(); + let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); let mut operation_children = Vec::new(); - for count in (1..=word_count).rev() { - let mut tmp_count = 0; + let start = number_phrases + (number_phrases == 0) as usize; + for len in start..=query.len() { + let mut word_count = len - number_phrases; + let query: Vec<_> = query.iter().filter_map(|p| { + if p.is_phrase() { + Some(p) + } else if word_count != 0 { + word_count -= 1; + Some(p) + } else { + None + } + }) + .cloned() + .collect(); - // keep only the N firsts non-quoted words, where N = count - // quoted words are allways kept - let query: Vec<_> = query.iter().cloned().filter(|part| { - if !part.is_phrase() { - tmp_count += 1; - tmp_count <= count - } else { true } - }).collect(); - - let ngrams = ngrams(ctx, authorize_typos, query.as_slice())?; + let ngrams = ngrams(ctx, authorize_typos, &query)?; operation_children.push(ngrams); } @@ -810,6 +814,14 @@ mod test { let tokens = result.tokens(); let expected = Operation::Or(true, vec![ + Operation::Query(Query { 
prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), + ]), Operation::Or(false, vec![ Operation::And(vec![ Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), @@ -827,14 +839,58 @@ mod test { ]), Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }), ]), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), - ]), + ]); + let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn optional_word_phrase() { + let query = "\"hey my\""; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Consecutive(vec![ Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + ]); + let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } + + #[test] + fn optional_word_multiple_phrases() { + let query = r#""hey" my good "friend""#; + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::Or(true, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), + ]), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), + ]), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "mygood".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), + ]), ]); let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); From 6008f528d05e59e61158c35c8841f44627593646 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 23 Feb 2021 15:50:33 +0100 Subject: [PATCH 0494/1889] Introduce the maximum_typo helper function --- milli/src/search/query_tree.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/milli/src/search/query_tree.rs 
b/milli/src/search/query_tree.rs index 682f66f24..1aaacbccb 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -514,6 +514,16 @@ fn create_primitive_query(query: TokenStream) -> PrimitiveQuery { primitive_query } +/// Returns the maximum number of typos that this Operation allows. +pub fn maximum_typo(operation: &Operation) -> usize { + use Operation::{Or, And, Query, Consecutive}; + match operation { + Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), + And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::(), + Query(q) => q.kind.typo() as usize, + } +} + #[cfg(test)] mod test { use fst::Set; From 6d135beb21ab9bfcc9cd5bce0ed52e381b0f5031 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 23 Feb 2021 15:53:24 +0100 Subject: [PATCH 0495/1889] Introduce the maximum_proximity helper function --- milli/src/search/query_tree.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 1aaacbccb..64babe053 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -524,6 +524,16 @@ pub fn maximum_typo(operation: &Operation) -> usize { } } +/// Returns the maximum proximity that this Operation allows. +pub fn maximum_proximity(operation: &Operation) -> usize { + use Operation::{Or, And, Query, Consecutive}; + match operation { + Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), + And(ops) => ops.len().saturating_sub(1) * 8, + Query(_) | Consecutive(_) => 0, + } +} + #[cfg(test)] mod test { use fst::Set; From a463ae821eea2af1aae46ed08792890bdb9d13f0 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 10:49:25 +0100 Subject: [PATCH 0496/1889] Add methods optional_words and authorize_typos on the query tree --- milli/src/search/query_tree.rs | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 64babe053..87245ce20 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -157,6 +157,8 @@ trait Context { pub struct QueryTreeBuilder<'a> { rtxn: &'a heed::RoTxn<'a>, index: &'a Index, + optional_words: bool, + authorize_typos: bool, } impl<'a> Context for QueryTreeBuilder<'a> { @@ -177,7 +179,25 @@ impl<'a> QueryTreeBuilder<'a> { /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` /// and an Index `index`. pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self { - Self { rtxn, index } + Self { rtxn, index, optional_words: true, authorize_typos: true } + } + + /// if `optional_words` is set to `false` the query tree will be + /// generated forcing all query words to be present in each matching documents + /// (the criterion `words` will be ignored). + /// default value if not called: `true` + pub fn optional_words(&mut self, optional_words: bool) -> &mut Self { + self.optional_words = optional_words; + self + } + + /// if `authorize_typos` is set to `false` the query tree will be generated + /// forcing all query words to match documents without any typo + /// (the criterion `typo` will be ignored). 
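+ /// For example, a hypothetical `builder.authorize_typos(false)` call
+ /// turns every query word into an exact, zero-typo query.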
+ /// default value if not called: `true` + pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self { + self.authorize_typos = authorize_typos; + self } /// Build the query tree: @@ -187,16 +207,10 @@ impl<'a> QueryTreeBuilder<'a> { /// - if `authorize_typos` is set to `false` the query tree will be generated /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) - pub fn build( - &self, - optional_words: bool, - authorize_typos: bool, - query: TokenStream, - ) -> anyhow::Result> - { + pub fn build(&self, query: TokenStream) -> anyhow::Result> { let primitive_query = create_primitive_query(query); if !primitive_query.is_empty() { - create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) + create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some) } else { Ok(None) } From 240b02e175580be382d5f241de1f0cfd6abfc48c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 2 Mar 2021 11:30:48 +0100 Subject: [PATCH 0497/1889] Remove unused Operation constructors --- milli/src/search/query_tree.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 87245ce20..00905db2e 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -50,14 +50,6 @@ impl fmt::Debug for Operation { } impl Operation { - fn tolerant(prefix: IsPrefix, s: &str) -> Operation { - Operation::Query(Query { prefix, kind: QueryKind::tolerant(2, s.to_string()) }) - } - - fn exact(prefix: IsPrefix, s: &str) -> Operation { - Operation::Query(Query { prefix, kind: QueryKind::exact(s.to_string()) }) - } - fn phrase(words: Vec) -> Operation { Operation::consecutive( words.into_iter().map(|s| { @@ -186,6 +178,7 @@ impl<'a> QueryTreeBuilder<'a> { /// generated forcing all query words to be present in each matching documents /// (the criterion `words` will be ignored). /// default value if not called: `true` + #[allow(unused)] pub fn optional_words(&mut self, optional_words: bool) -> &mut Self { self.optional_words = optional_words; self @@ -195,6 +188,7 @@ impl<'a> QueryTreeBuilder<'a> { /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored). 
/// default value if not called: `true` + #[allow(unused)] pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self { self.authorize_typos = authorize_typos; self @@ -550,13 +544,15 @@ pub fn maximum_proximity(operation: &Operation) -> usize { #[cfg(test)] mod test { + use std::collections::HashMap; + use fst::Set; use maplit::hashmap; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rand::{Rng, SeedableRng, rngs::StdRng}; use super::*; - use std::collections::HashMap; + #[derive(Debug)] struct TestContext { synonyms: HashMap, Vec>>, From 411a118148211e30d1ea8fc2a56d6b2ec05933e5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 13:57:36 +0100 Subject: [PATCH 0498/1889] Avoid testing on nightly to fix a crate issue --- .github/workflows/test.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c5565b0c3..e2487f707 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,7 +14,10 @@ jobs: rust: - stable - beta - - nightly + # We temporarily stop building on nightly just to fix this issue + # https://github.com/bheisler/TinyTemplate/pull/17 + # Reenable it when the fix has been merged. + # - nightly steps: - uses: actions/checkout@v2 From 73286dc8bf915d368eae75fe56082246ae9505dc Mon Sep 17 00:00:00 2001 From: many Date: Wed, 10 Feb 2021 11:35:17 +0100 Subject: [PATCH 0499/1889] Introduce the query tree data structure --- milli/src/search/query_tree.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 00905db2e..9b253350e 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,5 +1,3 @@ -#![allow(unused)] - use std::borrow::Cow; use std::collections::BTreeMap; use std::{fmt, cmp, mem}; From f0ddea821cd214254ce201d29410c25b61940f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 15:14:00 +0100 Subject: [PATCH 0500/1889] Introduce the Typo criterion --- milli/src/search/criteria/mod.rs | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 milli/src/search/criteria/mod.rs diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs new file mode 100644 index 000000000..4cc4512d7 --- /dev/null +++ b/milli/src/search/criteria/mod.rs @@ -0,0 +1,34 @@ +use crate::Index; + +use roaring::RoaringBitmap; + +use super::query_tree::Operation; + +pub mod typo; + +pub trait Criterion { + fn next(&mut self) -> anyhow::Result, RoaringBitmap)>>; +} + +/// Either a set of candidates that defines the candidates +/// that are allowed to be returned, +/// or the candidates that must never be returned. 
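+///
+/// Note that the `Default` implementation below is `Forbidden` with an
+/// empty bitmap: nothing is excluded yet, so every document id remains
+/// a potential candidate.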
+enum Candidates { + Allowed(RoaringBitmap), + Forbidden(RoaringBitmap) +} + +impl Candidates { + fn into_inner(self) -> RoaringBitmap { + match self { + Self::Allowed(inner) => inner, + Self::Forbidden(inner) => inner, + } + } +} + +impl Default for Candidates { + fn default() -> Self { + Self::Forbidden(RoaringBitmap::new()) + } +} From ad20d72a39641681a1b591bef0c8d8ebc7332a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 15:20:07 +0100 Subject: [PATCH 0501/1889] Introduce the Typo criterion --- milli/src/search/criteria/typo.rs | 253 ++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 milli/src/search/criteria/typo.rs diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs new file mode 100644 index 000000000..196da4a49 --- /dev/null +++ b/milli/src/search/criteria/typo.rs @@ -0,0 +1,253 @@ +use std::{borrow::Cow, mem::take}; + +use anyhow::bail; +use roaring::RoaringBitmap; + +use crate::Index; +use crate::search::query_tree::{Operation, Query, QueryKind}; +use crate::search::word_typos; +use super::{Candidates, Criterion}; + +// FIXME we must stop when the number of typos is equal to +// the maximum number of typos for this query tree. +const MAX_NUM_TYPOS: u8 = 8; + +pub struct Typo<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + words_fst: fst::Set>, + query_tree: Option, + number_typos: u8, + candidates: Candidates, + parent: Option>, +} + +impl<'t> Typo<'t> { + pub fn initial( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + query_tree: Option, + candidates: Option, + ) -> anyhow::Result where Self: Sized + { + Ok(Typo { + index, + rtxn, + words_fst: index.words_fst(rtxn)?, + query_tree, + number_typos: 0, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + parent: None, + }) + } + + pub fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + ) -> anyhow::Result where Self: Sized + { + Ok(Typo { + index, + rtxn, + words_fst: index.words_fst(rtxn)?, + query_tree: None, + number_typos: 0, + candidates: Candidates::default(), + parent: Some(parent), + }) + } +} + +impl<'t> Criterion for Typo<'t> { + fn next(&mut self) -> anyhow::Result, RoaringBitmap)>> { + use Candidates::{Allowed, Forbidden}; + while self.number_typos < MAX_NUM_TYPOS { + match (&mut self.query_tree, &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + self.query_tree = None; + self.candidates = Candidates::default(); + }, + (Some(query_tree), Allowed(candidates)) => { + let new_query_tree = alterate_query_tree(&self.words_fst, query_tree.clone(), self.number_typos)?; + let mut new_candidates = resolve_candidates(&self.index, &self.rtxn, &new_query_tree, self.number_typos)?; + new_candidates.intersect_with(&candidates); + candidates.difference_with(&new_candidates); + self.number_typos += 1; + + return Ok(Some((Some(new_query_tree), new_candidates))); + }, + (Some(query_tree), Forbidden(candidates)) => { + let new_query_tree = alterate_query_tree(&self.words_fst, query_tree.clone(), self.number_typos)?; + let mut new_candidates = resolve_candidates(&self.index, &self.rtxn, &new_query_tree, self.number_typos)?; + new_candidates.difference_with(&candidates); + candidates.union_with(&new_candidates); + self.number_typos += 1; + + return Ok(Some((Some(new_query_tree), new_candidates))); + }, + (None, Allowed(_)) => { + return Ok(Some((None, take(&mut self.candidates).into_inner()))); + }, + (None, Forbidden(_)) => { + match 
self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some((query_tree, candidates)) => { + self.query_tree = query_tree; + self.candidates = Candidates::Allowed(candidates); + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + + Ok(None) + } +} + +/// Modify the query tree by replacing every tolerant query by an Or operation +/// containing all of the corresponding exact words in the words FST. Each tolerant +/// query will only be replaced by exact query with up to `number_typos` maximum typos. +fn alterate_query_tree( + words_fst: &fst::Set>, + mut query_tree: Operation, + number_typos: u8, +) -> anyhow::Result +{ + fn recurse( + words_fst: &fst::Set>, + operation: &mut Operation, + number_typos: u8, + ) -> anyhow::Result<()> + { + use Operation::{And, Consecutive, Or}; + + match operation { + And(ops) | Consecutive(ops) | Or(_, ops) => { + ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos)) + }, + Operation::Query(q) => { + if let QueryKind::Tolerant { typo, word } = &q.kind { + let typo = *typo.min(&number_typos); + let words = word_typos(word, q.prefix, typo, words_fst)?; + + let queries = words.into_iter().map(|(word, _typo)| { + Operation::Query(Query { + prefix: false, + kind: QueryKind::Exact { original_typo: typo, word }, + }) + }).collect(); + + *operation = Operation::or(false, queries); + } + + Ok(()) + }, + } + } + + recurse(words_fst, &mut query_tree, number_typos)?; + Ok(query_tree) +} + +fn resolve_candidates( + index: &Index, + rtxn: &heed::RoTxn, + query_tree: &Operation, + number_typos: u8, +) -> anyhow::Result +{ + // FIXME add a cache + // FIXME keep the cache between typos iterations + // cache: HashMap<(&Operation, u8), RoaringBitmap>, + + fn resolve_operation( + index: &Index, + rtxn: &heed::RoTxn, + query_tree: &Operation, + number_typos: u8, + ) -> anyhow::Result + { + use Operation::{And, Consecutive, Or, Query}; + + match query_tree { + And(ops) => { + mdfs(index, rtxn, ops, number_typos) + }, + Consecutive(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for slice in ops.windows(2) { + match (&slice[0], &slice[1]) { + (Operation::Query(left), Operation::Query(right)) => { + let key_pair = &(left.kind.word(), right.kind.word(), 1); + match index.word_pair_proximity_docids.get(rtxn, key_pair)? 
{ + Some(pair_docids) => { + if first_loop { + candidates = pair_docids; + first_loop = false; + } else { + candidates.intersect_with(&pair_docids) + } + }, + None => return Ok(RoaringBitmap::new()), + } + + }, + _ => bail!("invalid consecutive query type"), + } + } + Ok(candidates) + }, + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(index, rtxn, op, number_typos)?; + candidates.union_with(&docids); + } + Ok(candidates) + }, + Query(q) => if q.kind.typo() == number_typos { + let word = q.kind.word(); + Ok(index.word_docids.get(rtxn, word)?.unwrap_or_default()) + } else { + Ok(RoaringBitmap::new()) + }, + } + } + + /// FIXME Make this function generic and mutualize it between Typo and proximity criterion + fn mdfs( + index: &Index, + rtxn: &heed::RoTxn, + branches: &[Operation], + mana: u8, + ) -> anyhow::Result + { + match branches.split_first() { + Some((head, [])) => resolve_operation(index, rtxn, head, mana), + Some((head, tail)) => { + let mut candidates = RoaringBitmap::new(); + + for m in 0..=mana { + let mut head_candidates = resolve_operation(index, rtxn, head, m)?; + if !head_candidates.is_empty() { + let tail_candidates = mdfs(index, rtxn, tail, mana - m)?; + head_candidates.intersect_with(&tail_candidates); + candidates.union_with(&head_candidates); + } + } + + Ok(candidates) + }, + None => Ok(RoaringBitmap::new()), + } + } + + resolve_operation(index, rtxn, query_tree, number_typos) +} From f091f370d055a723b2011a27170d3572d9b2dd2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 10:29:28 +0100 Subject: [PATCH 0502/1889] Use the Typo criteria in the search module --- milli/src/search/mod.rs | 232 +++++++++++++++++++++++----------------- 1 file changed, 136 insertions(+), 96 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index e5672982e..267f3556a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,10 +18,13 @@ use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec} use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::mdfs::Mdfs; use crate::query_tokens::{query_tokens, QueryToken}; -use crate::{Index, FieldId, DocumentId, Criterion}; +use crate::search::criteria::Criterion; +use crate::search::criteria::typo::Typo; +use crate::{Index, FieldId, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetIter}; +use self::query_tree::QueryTreeBuilder; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -30,6 +33,7 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); mod facet; mod query_tree; +mod criteria; pub struct Search<'a> { query: Option, @@ -258,15 +262,22 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> anyhow::Result { - let limit = self.limit; - let fst = self.index.words_fst(self.rtxn)?; - - // Construct the DFAs related to the query words. - let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { - Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), - _otherwise => None, + // We create the query tree by spliting the query into tokens. 
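+ // For instance, assuming the behaviour exercised by the `no_prefix`
+ // query tree test, a two-word query like `hey friends` typically
+ // becomes OR(AND("hey", "friends"), "heyfriends"): the words
+ // themselves plus their 2-gram concatenation.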
+ let before = Instant::now(); + let query_tree = match self.query.as_ref() { + Some(query) => { + let builder = QueryTreeBuilder::new(self.rtxn, self.index); + let stop_words = &Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + builder.build(false, true, tokens) + }, + None => None, }; + debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); + // We create the original candidates with the facet conditions results. let before = Instant::now(); let facet_candidates = match &self.facet_condition { @@ -276,100 +287,129 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); - let order_by_facet = { - let criteria = self.index.criteria(self.rtxn)?; - let result = criteria.into_iter().flat_map(|criterion| { - match criterion { - Criterion::Asc(fid) => Some((fid, true)), - Criterion::Desc(fid) => Some((fid, false)), - _ => None - } - }).next(); - match result { - Some((attr_name, is_ascending)) => { - let field_id_map = self.index.fields_ids_map(self.rtxn)?; - let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?; - let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; - let ftype = *faceted_fields.get(&fid) - .with_context(|| format!("{:?} not found in the faceted fields.", attr_name)) - .expect("corrupted data: "); - Some((fid, ftype, is_ascending)) - }, - None => None, + // We aretesting the typo criteria but there will be more of them soon. + let mut criteria = Typo::initial(self.index, self.rtxn, query_tree, facet_candidates)?; + + let mut offset = self.offset; + let mut limit = self.limit; + let mut documents_ids = Vec::new(); + while let Some((_qt, docids)) = criteria.next()? { + + let mut len = docids.len() as usize; + let mut docids = docids.into_iter(); + + if offset != 0 { + docids.by_ref().skip(offset).for_each(drop); + offset = offset.saturating_sub(len.min(offset)); + len = len.saturating_sub(len.min(offset)); } - }; - let before = Instant::now(); - let (candidates, derived_words) = match (facet_candidates, derived_words) { - (Some(mut facet_candidates), Some(derived_words)) => { - let words_candidates = Self::compute_candidates(&derived_words); - facet_candidates.intersect_with(&words_candidates); - (facet_candidates, derived_words) - }, - (None, Some(derived_words)) => { - (Self::compute_candidates(&derived_words), derived_words) - }, - (Some(facet_candidates), None) => { - // If the query is not set or results in no DFAs but - // there is some facet conditions we return a placeholder. - let documents_ids = match order_by_facet { - Some((fid, ftype, is_ascending)) => { - self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)? - }, - None => facet_candidates.iter().take(limit).collect(), - }; - return Ok(SearchResult { - documents_ids, - candidates: facet_candidates, - ..Default::default() - }) - }, - (None, None) => { - // If the query is not set or results in no DFAs we return a placeholder. - let all_docids = self.index.documents_ids(self.rtxn)?; - let documents_ids = match order_by_facet { - Some((fid, ftype, is_ascending)) => { - self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)? 
- }, - None => all_docids.iter().take(limit).collect(), - }; - return Ok(SearchResult { documents_ids, candidates: all_docids, ..Default::default() }) - }, - }; - - debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); - - // The mana depth first search is a revised DFS that explores - // solutions in the order of their proximities. - let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone()); - let mut documents = Vec::new(); - - // We execute the Mdfs iterator until we find enough documents. - while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 { - match mdfs.next().transpose()? { - Some((proximity, answer)) => { - debug!("answer with a proximity of {}: {:?}", proximity, answer); - documents.push(answer); - }, - None => break, + if len != 0 { + documents_ids.extend(docids.take(limit)); + limit = limit.saturating_sub(len.min(limit)); } + + if limit == 0 { break } } - let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); - let documents_ids = match order_by_facet { - Some((fid, ftype, order)) => { - let mut ordered_documents = Vec::new(); - for documents_ids in documents { - let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?; - ordered_documents.push(docids); - if ordered_documents.iter().map(Vec::len).sum::<usize>() >= limit { break } - } - ordered_documents.into_iter().flatten().take(limit).collect() - }, - None => documents.into_iter().flatten().take(limit).collect(), - }; - + let found_words = HashSet::new(); + let candidates = RoaringBitmap::new(); Ok(SearchResult { found_words, candidates, documents_ids }) + + // let order_by_facet = { + // let criteria = self.index.criteria(self.rtxn)?; + // let result = criteria.into_iter().flat_map(|criterion| { + // match criterion { + // Criterion::Asc(fid) => Some((fid, true)), + // Criterion::Desc(fid) => Some((fid, false)), + // _ => None + // } + // }).next(); + // match result { + // Some((attr_name, is_ascending)) => { + // let field_id_map = self.index.fields_ids_map(self.rtxn)?; + // let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?; + // let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; + // let ftype = *faceted_fields.get(&fid) + // .with_context(|| format!("{:?} not found in the faceted fields.", attr_name)) + // .expect("corrupted data: "); + // Some((fid, ftype, is_ascending)) + // }, + // None => None, + // } + // }; + + // let before = Instant::now(); + // let (candidates, derived_words) = match (facet_candidates, derived_words) { + // (Some(mut facet_candidates), Some(derived_words)) => { + // let words_candidates = Self::compute_candidates(&derived_words); + // facet_candidates.intersect_with(&words_candidates); + // (facet_candidates, derived_words) + // }, + // (None, Some(derived_words)) => { + // (Self::compute_candidates(&derived_words), derived_words) + // }, + // (Some(facet_candidates), None) => { + // // If the query is not set or results in no DFAs but + // // there are some facet conditions we return a placeholder. + // let documents_ids = match order_by_facet { + // Some((fid, ftype, is_ascending)) => { + // self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
+ // }, + // None => facet_candidates.iter().take(limit).collect(), + // }; + // return Ok(SearchResult { + // documents_ids, + // candidates: facet_candidates, + // ..Default::default() + // }) + // }, + // (None, None) => { + // // If the query is not set or results in no DFAs we return a placeholder. + // let all_docids = self.index.documents_ids(self.rtxn)?; + // let documents_ids = match order_by_facet { + // Some((fid, ftype, is_ascending)) => { + // self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)? + // }, + // None => all_docids.iter().take(limit).collect(), + // }; + // return Ok(SearchResult { documents_ids, candidates: all_docids, ..Default::default() }) + // }, + // }; + + // debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); + + // // The mana depth first search is a revised DFS that explores + // // solutions in the order of their proximities. + // let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone()); + // let mut documents = Vec::new(); + + // // We execute the Mdfs iterator until we find enough documents. + // while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 { + // match mdfs.next().transpose()? { + // Some((proximity, answer)) => { + // debug!("answer with a proximity of {}: {:?}", proximity, answer); + // documents.push(answer); + // }, + // None => break, + // } + // } + + // let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); + // let documents_ids = match order_by_facet { + // Some((fid, ftype, order)) => { + // let mut ordered_documents = Vec::new(); + // for documents_ids in documents { + // let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?; + // ordered_documents.push(docids); + // if ordered_documents.iter().map(Vec::len).sum::<usize>() >= limit { break } + // } + // ordered_documents.into_iter().flatten().take(limit).collect() + // }, + // None => documents.into_iter().flatten().take(limit).collect(), + // }; + + // Ok(SearchResult { found_words, candidates, documents_ids }) } } From 98e69e63d2ed67563d9b1cbe456db145039cadab Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 15:27:35 +0100 Subject: [PATCH 0503/1889] implement Context trait for criteria --- milli/src/search/criteria/mod.rs | 122 +++++++++++++++++++++++++++++- milli/src/search/criteria/typo.rs | 67 +++++++--------- milli/src/search/mod.rs | 3 +- 3 files changed, 152 insertions(+), 40 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 4cc4512d7..36569906e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,8 +1,11 @@ +use std::borrow::Cow; + use crate::Index; +use crate::search::word_typos; use roaring::RoaringBitmap; -use super::query_tree::Operation; +use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; @@ -32,3 +35,120 @@ impl Default for Candidates { Self::Forbidden(RoaringBitmap::new()) } } +pub trait Context { + fn query_docids(&self, query: &Query) -> anyhow::Result<Option<RoaringBitmap>>; + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result<Option<RoaringBitmap>>; + fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; +} +pub struct HeedContext<'t> { + rtxn: &'t heed::RoTxn<'t>, + index: &'t Index, + words_fst: fst::Set<Cow<'t, [u8]>>, +} + +impl<'a> Context for HeedContext<'a> { + fn query_docids(&self, query: &Query) -> anyhow::Result<Option<RoaringBitmap>> { + match (&query.kind, query.prefix) { + // TODO de-comment when ~ready + // (QueryKind::Exact { word, ..
}, true) if in_prefix_cache(&word) => { + // Ok(self.index.prefix_docids.get(self.rtxn, &word)?) + // }, + (QueryKind::Exact { word, .. }, true) => { + let words = word_typos(&word, true, 0, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Exact { word, .. }, false) => { + Ok(self.index.word_docids.get(self.rtxn, &word)?) + }, + (QueryKind::Tolerant { typo, word }, prefix) => { + let words = word_typos(&word, prefix, *typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + } + } + + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result> { + // TODO add prefix cache for Tolerant-Exact-true and Exact-Exact-true + match (&left.kind, &right.kind, right.prefix) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) => { + let words = word_typos(&right, true, 0, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let key = (left.as_str(), word.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) => { + let l_words = word_typos(&left, false, *typo, &self.words_fst)?; + let r_words = word_typos(&right, true, 0, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (left, _typo) in l_words { + for (right, _typo) in r_words.iter() { + let key = (left.as_str(), right.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + } + Ok(Some(docids)) + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, false) => { + let words = word_typos(&left, false, *typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let key = (word.as_str(), right.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }, prefix) => { + let words = word_typos(&right, prefix, *typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let key = (left.as_str(), word.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }, prefix) => { + let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; + let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (left, _typo) in l_words { + for (right, _typo) in r_words.iter() { + let key = (left.as_str(), right.as_str(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + } + Ok(Some(docids)) + }, + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. 
}, false) => { + let key = (left.as_str(), right.as_str(), distance); + Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?) + }, + } + } + + fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> { + &self.words_fst + } +} + +impl<'t> HeedContext<'t> { + pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result<Self> { + let words_fst = index.words_fst(rtxn)?; + + Ok(Self { + rtxn, + index, + words_fst, + }) + } +} diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 196da4a49..b3608c397 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -3,19 +3,16 @@ use std::{borrow::Cow, mem::take}; use anyhow::bail; use roaring::RoaringBitmap; -use crate::Index; use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::search::word_typos; -use super::{Candidates, Criterion}; +use super::{Candidates, Criterion, Context}; // FIXME we must stop when the number of typos is equal to // the maximum number of typos for this query tree. const MAX_NUM_TYPOS: u8 = 8; pub struct Typo<'t> { - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, - words_fst: fst::Set<Cow<'t, [u8]>>, + ctx: &'t dyn Context, query_tree: Option<Operation>, number_typos: u8, candidates: Candidates, @@ -24,16 +21,13 @@ impl<'t> Typo<'t> { pub fn initial( - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, + ctx: &'t dyn Context, query_tree: Option<Operation>, candidates: Option<RoaringBitmap>, ) -> anyhow::Result<Self> where Self: Sized { Ok(Typo { - index, - rtxn, - words_fst: index.words_fst(rtxn)?, + ctx, query_tree, number_typos: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), @@ -42,15 +36,12 @@ impl<'t> Typo<'t> { } pub fn new( - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, + ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>, ) -> anyhow::Result<Self> where Self: Sized { Ok(Typo { - index, - rtxn, - words_fst: index.words_fst(rtxn)?, + ctx, query_tree: None, number_typos: 0, candidates: Candidates::default(), @@ -69,8 +60,10 @@ impl<'t> Criterion for Typo<'t> { self.candidates = Candidates::default(); }, (Some(query_tree), Allowed(candidates)) => { - let new_query_tree = alterate_query_tree(&self.words_fst, query_tree.clone(), self.number_typos)?; - let mut new_candidates = resolve_candidates(&self.index, &self.rtxn, &new_query_tree, self.number_typos)?; + // TODO if number_typos >= 2 the generated query_tree will always be the same, + // generating a new one on each iteration is a waste of time. + let new_query_tree = alterate_query_tree(&self.ctx.words_fst(), query_tree.clone(), self.number_typos)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); self.number_typos += 1; @@ -78,8 +71,10 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some((Some(new_query_tree), new_candidates))); }, (Some(query_tree), Forbidden(candidates)) => { - let new_query_tree = alterate_query_tree(&self.words_fst, query_tree.clone(), self.number_typos)?; - let mut new_candidates = resolve_candidates(&self.index, &self.rtxn, &new_query_tree, self.number_typos)?; + // TODO if number_typos >= 2 the generated query_tree will always be the same, + // generating a new one on each iteration is a waste of time.
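The TODO above concerns `alterate_query_tree`, which rewrites every `Tolerant` leaf into the concrete words reachable within the current typo budget before candidates are resolved. The sketch below is a rough, dependency-free stand-in for that word-derivation step: milli's `word_typos` runs a Levenshtein automaton against the words FST, whereas this version scans a plain word list, so treat the names and shapes as assumptions rather than the crate's API:

```rust
// Toy stand-in for `word_typos`: list the dictionary words reachable
// from `word` within `max_typos` edits, with the edit count used.

fn levenshtein(a: &str, b: &str) -> usize {
    let (a, b): (Vec<char>, Vec<char>) = (a.chars().collect(), b.chars().collect());
    // Classic dynamic programming over one previous row.
    let mut prev: Vec<usize> = (0..=b.len()).collect();
    for (i, ca) in a.iter().enumerate() {
        let mut cur = vec![i + 1];
        for (j, cb) in b.iter().enumerate() {
            let substitution = prev[j] + usize::from(ca != cb);
            cur.push(substitution.min(prev[j + 1] + 1).min(cur[j] + 1));
        }
        prev = cur;
    }
    prev[b.len()]
}

fn word_typos(word: &str, max_typos: usize, dictionary: &[&str]) -> Vec<(String, usize)> {
    dictionary.iter()
        .map(|w| (w.to_string(), levenshtein(word, w)))
        .filter(|(_, typos)| *typos <= max_typos)
        .collect()
}

fn main() {
    let dictionary = ["hello", "hallo", "hell", "world"];
    // With a budget of one typo, "helo" reaches "hello" and "hell".
    for (word, typos) in word_typos("helo", 1, &dictionary) {
        println!("{word} ({typos} typo)");
    }
}
```

At a budget of zero a tolerant leaf only matches itself, which is why raising the budget by one per iteration naturally produces the zero-typo bucket first, then the one-typo bucket, and so on.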
+ let new_query_tree = alterate_query_tree(&self.ctx.words_fst(), query_tree.clone(), self.number_typos)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; @@ -132,6 +127,7 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos)) }, Operation::Query(q) => { + // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { let typo = *typo.min(&number_typos); let words = word_typos(word, q.prefix, typo, words_fst)?; @@ -155,9 +151,8 @@ fn alterate_query_tree( Ok(query_tree) } -fn resolve_candidates( - index: &Index, - rtxn: &heed::RoTxn, +fn resolve_candidates<'t>( + ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, ) -> anyhow::Result @@ -166,9 +161,8 @@ fn resolve_candidates( // FIXME keep the cache between typos iterations // cache: HashMap<(&Operation, u8), RoaringBitmap>, - fn resolve_operation( - index: &Index, - rtxn: &heed::RoTxn, + fn resolve_operation<'t>( + ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, ) -> anyhow::Result @@ -177,7 +171,7 @@ fn resolve_candidates( match query_tree { And(ops) => { - mdfs(index, rtxn, ops, number_typos) + mdfs(ctx, ops, number_typos) }, Consecutive(ops) => { let mut candidates = RoaringBitmap::new(); @@ -185,8 +179,7 @@ fn resolve_candidates( for slice in ops.windows(2) { match (&slice[0], &slice[1]) { (Operation::Query(left), Operation::Query(right)) => { - let key_pair = &(left.kind.word(), right.kind.word(), 1); - match index.word_pair_proximity_docids.get(rtxn, key_pair)? { + match ctx.query_pair_proximity_docids(left, right, 1)? { Some(pair_docids) => { if first_loop { candidates = pair_docids; @@ -207,14 +200,13 @@ fn resolve_candidates( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(index, rtxn, op, number_typos)?; + let docids = resolve_operation(ctx, op, number_typos)?; candidates.union_with(&docids); } Ok(candidates) }, Query(q) => if q.kind.typo() == number_typos { - let word = q.kind.word(); - Ok(index.word_docids.get(rtxn, word)?.unwrap_or_default()) + Ok(ctx.query_docids(q)?.unwrap_or_default()) } else { Ok(RoaringBitmap::new()) }, @@ -222,22 +214,21 @@ fn resolve_candidates( } /// FIXME Make this function generic and mutualize it between Typo and proximity criterion - fn mdfs( - index: &Index, - rtxn: &heed::RoTxn, + fn mdfs<'t>( + ctx: &'t dyn Context, branches: &[Operation], mana: u8, ) -> anyhow::Result { match branches.split_first() { - Some((head, [])) => resolve_operation(index, rtxn, head, mana), + Some((head, [])) => resolve_operation(ctx, head, mana), Some((head, tail)) => { let mut candidates = RoaringBitmap::new(); for m in 0..=mana { - let mut head_candidates = resolve_operation(index, rtxn, head, m)?; + let mut head_candidates = resolve_operation(ctx, head, m)?; if !head_candidates.is_empty() { - let tail_candidates = mdfs(index, rtxn, tail, mana - m)?; + let tail_candidates = mdfs(ctx, tail, mana - m)?; head_candidates.intersect_with(&tail_candidates); candidates.union_with(&head_candidates); } @@ -249,5 +240,5 @@ fn resolve_candidates( } } - resolve_operation(index, rtxn, query_tree, number_typos) + resolve_operation(ctx, query_tree, number_typos) } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 267f3556a..d061df2b9 100644 --- a/milli/src/search/mod.rs +++ 
b/milli/src/search/mod.rs @@ -288,7 +288,8 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); // We are testing the typo criterion but there will be more of them soon. - let mut criteria = Typo::initial(self.index, self.rtxn, query_tree, facet_candidates)?; + let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; + let mut criteria = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let mut offset = self.offset; let mut limit = self.limit; From 774a255f2e26272770b04c377d06780c375db611 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 15:49:44 +0100 Subject: [PATCH 0504/1889] use prefix cache in criteria --- milli/src/search/criteria/mod.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 36569906e..5c6561b62 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -44,15 +44,15 @@ pub struct HeedContext<'t> { rtxn: &'t heed::RoTxn<'t>, index: &'t Index, words_fst: fst::Set<Cow<'t, [u8]>>, + words_prefixes_fst: fst::Set<Cow<'t, [u8]>>, } impl<'a> Context for HeedContext<'a> { fn query_docids(&self, query: &Query) -> anyhow::Result<Option<RoaringBitmap>> { match (&query.kind, query.prefix) { - // TODO de-comment when ~ready - // (QueryKind::Exact { word, .. }, true) if in_prefix_cache(&word) => { - // Ok(self.index.prefix_docids.get(self.rtxn, &word)?) - // }, + (QueryKind::Exact { word, .. }, true) if self.in_prefix_cache(&word) => { + Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?) + }, (QueryKind::Exact { word, .. }, true) => { let words = word_typos(&word, true, 0, &self.words_fst)?; let mut docids = RoaringBitmap::new(); @@ -78,6 +78,19 @@ impl<'a> Context for HeedContext<'a> { fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result<Option<RoaringBitmap>> { // TODO add prefix cache for Tolerant-Exact-true and Exact-Exact-true match (&left.kind, &right.kind, right.prefix) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { + let key = (left.as_str(), right.as_str(), distance); + Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?) + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { + let words = word_typos(&left, false, *typo, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let key = (word.as_str(), right.as_str(), distance); + docids.union_with(&self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + Ok(Some(docids)) + }, + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, ..
}, true) => { let words = word_typos(&right, true, 0, &self.words_fst)?; let mut docids = RoaringBitmap::new(); @@ -144,11 +157,17 @@ impl<'a> Context for HeedContext<'a> { impl<'t> HeedContext<'t> { pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result { let words_fst = index.words_fst(rtxn)?; + let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; Ok(Self { rtxn, index, words_fst, + words_prefixes_fst, }) } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.words_prefixes_fst.contains(word) + } } From 907482c8acac6236cf3f5b28271b248f3d511843 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 16:21:21 +0100 Subject: [PATCH 0505/1889] clean docids fetchers --- milli/src/search/criteria/mod.rs | 86 +++++++++++-------------------- milli/src/search/criteria/typo.rs | 20 +++---- 2 files changed, 41 insertions(+), 65 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5c6561b62..9cb6547c9 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -36,8 +36,8 @@ impl Default for Candidates { } } pub trait Context { - fn query_docids(&self, query: &Query) -> anyhow::Result>; - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) ->anyhow::Result>; + fn query_docids(&self, query: &Query) -> anyhow::Result; + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) ->anyhow::Result; fn words_fst<'t>(&self) -> &'t fst::Set>; } pub struct HeedContext<'t> { @@ -48,10 +48,10 @@ pub struct HeedContext<'t> { } impl<'a> Context for HeedContext<'a> { - fn query_docids(&self, query: &Query) -> anyhow::Result> { + fn query_docids(&self, query: &Query) -> anyhow::Result { match (&query.kind, query.prefix) { (QueryKind::Exact { word, .. }, true) if self.in_prefix_cache(&word) => { - Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?) + Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?.unwrap_or_default()) }, (QueryKind::Exact { word, .. }, true) => { let words = word_typos(&word, true, 0, &self.words_fst)?; @@ -59,10 +59,10 @@ impl<'a> Context for HeedContext<'a> { for (word, _typo) in words { docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); } - Ok(Some(docids)) + Ok(docids) }, (QueryKind::Exact { word, .. }, false) => { - Ok(self.index.word_docids.get(self.rtxn, &word)?) + Ok(self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()) }, (QueryKind::Tolerant { typo, word }, prefix) => { let words = word_typos(&word, prefix, *typo, &self.words_fst)?; @@ -70,81 +70,46 @@ impl<'a> Context for HeedContext<'a> { for (word, _typo) in words { docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); } - Ok(Some(docids)) + Ok(docids) }, } } - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result> { - // TODO add prefix cache for Tolerant-Exact-true and Exact-Exact-true + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result { match (&left.kind, &right.kind, right.prefix) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { let key = (left.as_str(), right.as_str(), distance); - Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?) 
+ Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { - let words = word_typos(&left, false, *typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let key = (word.as_str(), right.as_str(), distance); - docids.union_with(&self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - Ok(Some(docids)) + let l_words = word_typos(&left, false, *typo, &self.words_fst)?; + self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) }, (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) => { - let words = word_typos(&right, true, 0, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let key = (left.as_str(), word.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - Ok(Some(docids)) + let r_words = word_typos(&right, true, 0, &self.words_fst)?; + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) => { let l_words = word_typos(&left, false, *typo, &self.words_fst)?; let r_words = word_typos(&right, true, 0, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (left, _typo) in l_words { - for (right, _typo) in r_words.iter() { - let key = (left.as_str(), right.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - } - Ok(Some(docids)) + self.all_word_pair_proximity_docids(&l_words, &r_words, distance) }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, false) => { - let words = word_typos(&left, false, *typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let key = (word.as_str(), right.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - Ok(Some(docids)) + let l_words = word_typos(&left, false, *typo, &self.words_fst)?; + self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) }, (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }, prefix) => { - let words = word_typos(&right, prefix, *typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let key = (left.as_str(), word.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - Ok(Some(docids)) + let r_words = word_typos(&right, prefix, *typo, &self.words_fst)?; + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) }, (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }, prefix) => { let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (left, _typo) in l_words { - for (right, _typo) in r_words.iter() { - let key = (left.as_str(), right.as_str(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); - } - } - Ok(Some(docids)) + self.all_word_pair_proximity_docids(&l_words, &r_words, distance) }, (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, false) => { let key = (left.as_str(), right.as_str(), distance); - Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?) + Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) }, } } @@ -170,4 +135,15 @@ impl<'t> HeedContext<'t> { fn in_prefix_cache(&self, word: &str) -> bool { self.words_prefixes_fst.contains(word) } + + fn all_word_pair_proximity_docids, U: AsRef>(&self, left_words: &[(T, u8)], right_words: &[(U, u8)], distance: u8) -> anyhow::Result { + let mut docids = RoaringBitmap::new(); + for (left, _l_typo) in left_words { + for (right, _r_typo) in right_words { + let key = (left.as_ref(), right.as_ref(), distance); + docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + } + } + Ok(docids) + } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index b3608c397..9834bbc21 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -180,17 +180,17 @@ fn resolve_candidates<'t>( match (&slice[0], &slice[1]) { (Operation::Query(left), Operation::Query(right)) => { match ctx.query_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => { - if first_loop { - candidates = pair_docids; - first_loop = false; - } else { - candidates.intersect_with(&pair_docids) - } + pair_docids if pair_docids.is_empty() => { + return Ok(RoaringBitmap::new()) + }, + pair_docids if first_loop => { + candidates = pair_docids; + first_loop = false; + }, + pair_docids => { + candidates.intersect_with(&pair_docids); }, - None => return Ok(RoaringBitmap::new()), } - }, _ => bail!("invalid consecutive query type"), } @@ -206,7 +206,7 @@ fn resolve_candidates<'t>( Ok(candidates) }, Query(q) => if q.kind.typo() == number_typos { - Ok(ctx.query_docids(q)?.unwrap_or_default()) + Ok(ctx.query_docids(q)?) 
} else { Ok(RoaringBitmap::new()) }, From 4128bdc859d394429e9261c457c2c743f79c3849 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 16:50:38 +0100 Subject: [PATCH 0506/1889] reduce match possibilities in docids fetchers --- milli/src/search/criteria/mod.rs | 93 +++++++++++++++++--------------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 9cb6547c9..d8bd21c01 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -49,26 +49,28 @@ pub struct HeedContext<'t> { impl<'a> Context for HeedContext<'a> { fn query_docids(&self, query: &Query) -> anyhow::Result { - match (&query.kind, query.prefix) { - (QueryKind::Exact { word, .. }, true) if self.in_prefix_cache(&word) => { - Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?.unwrap_or_default()) - }, - (QueryKind::Exact { word, .. }, true) => { - let words = word_typos(&word, true, 0, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); + match &query.kind { + QueryKind::Exact { word, .. } => { + if query.prefix && self.in_prefix_cache(&word) { + Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?.unwrap_or_default()) + } else if query.prefix { + let words = word_typos(&word, true, 0, &self.words_fst)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let current_docids = self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else { + Ok(self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()) } - Ok(docids) }, - (QueryKind::Exact { word, .. }, false) => { - Ok(self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()) - }, - (QueryKind::Tolerant { typo, word }, prefix) => { - let words = word_typos(&word, prefix, *typo, &self.words_fst)?; + QueryKind::Tolerant { typo, word } => { + let words = word_typos(&word, query.prefix, *typo, &self.words_fst)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { - docids.union_with(&self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()); + let current_docids = self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default(); + docids.union_with(¤t_docids); } Ok(docids) }, @@ -76,41 +78,47 @@ impl<'a> Context for HeedContext<'a> { } fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result { - match (&left.kind, &right.kind, right.prefix) { - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { - let key = (left.as_str(), right.as_str(), distance); - Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) + let prefix = right.prefix; + + match (&left.kind, &right.kind) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. 
}) => { + if prefix && self.in_prefix_cache(&right) { + let key = (left.as_str(), right.as_str(), distance); + Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) + } else if prefix { + let r_words = word_typos(&right, true, 0, &self.words_fst)?; + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) + } else { + let key = (left.as_str(), right.as_str(), distance); + Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) + } }, - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) if self.in_prefix_cache(&right) => { + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { let l_words = word_typos(&left, false, *typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) + if prefix && self.in_prefix_cache(&right) { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + let key = (left.as_ref(), right.as_ref(), distance); + let current_docids = self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else if prefix { + let r_words = word_typos(&right, true, 0, &self.words_fst)?; + self.all_word_pair_proximity_docids(&l_words, &r_words, distance) + } else { + self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) + } }, - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }, true) => { - let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) - }, - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, true) => { - let l_words = word_typos(&left, false, *typo, &self.words_fst)?; - let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, distance) - }, - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }, false) => { - let l_words = word_typos(&left, false, *typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) - }, - (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }, prefix) => { + (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_typos(&right, prefix, *typo, &self.words_fst)?; self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) }, - (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }, prefix) => { + (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; self.all_word_pair_proximity_docids(&l_words, &r_words, distance) }, - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. 
}, false) => { - let key = (left.as_str(), right.as_str(), distance); - Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) - }, } } @@ -141,7 +149,8 @@ impl<'t> HeedContext<'t> { for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { let key = (left.as_ref(), right.as_ref(), distance); - docids.union_with(&self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()); + let current_docids = self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); + docids.union_with(¤t_docids); } } Ok(docids) From 86bcecf840a978d3ecf775af27714c4c0362ad33 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 17:02:26 +0100 Subject: [PATCH 0507/1889] change variable's name from distance to proximity --- milli/src/search/criteria/mod.rs | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d8bd21c01..d21102e64 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -37,7 +37,7 @@ impl Default for Candidates { } pub trait Context { fn query_docids(&self, query: &Query) -> anyhow::Result; - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) ->anyhow::Result; + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, proximity: u8) ->anyhow::Result; fn words_fst<'t>(&self) -> &'t fst::Set>; } pub struct HeedContext<'t> { @@ -77,19 +77,19 @@ impl<'a> Context for HeedContext<'a> { } } - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, distance: u8) -> anyhow::Result { + fn query_pair_proximity_docids(&self, left: &Query, right: &Query, proximity: u8) -> anyhow::Result { let prefix = right.prefix; match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix && self.in_prefix_cache(&right) { - let key = (left.as_str(), right.as_str(), distance); + let key = (left.as_str(), right.as_str(), proximity); Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) } else if prefix { let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, proximity) } else { - let key = (left.as_str(), right.as_str(), distance); + let key = (left.as_str(), right.as_str(), proximity); Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) } }, @@ -98,26 +98,26 @@ impl<'a> Context for HeedContext<'a> { if prefix && self.in_prefix_cache(&right) { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let key = (left.as_ref(), right.as_ref(), distance); + let key = (left.as_ref(), right.as_ref(), proximity); let current_docids = self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); docids.union_with(¤t_docids); } Ok(docids) } else if prefix { let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, distance) + self.all_word_pair_proximity_docids(&l_words, &r_words, proximity) } else { - self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], distance) + self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], proximity) } }, (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_typos(&right, prefix, *typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, distance) + self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, proximity) }, (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, distance) + self.all_word_pair_proximity_docids(&l_words, &r_words, proximity) }, } } @@ -144,11 +144,16 @@ impl<'t> HeedContext<'t> { self.words_prefixes_fst.contains(word) } - fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(&self, left_words: &[(T, u8)], right_words: &[(U, u8)], distance: u8) -> anyhow::Result<RoaringBitmap> { + fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>( + &self, + left_words: &[(T, u8)], + right_words: &[(U, u8)], + proximity: u8 + ) -> anyhow::Result<RoaringBitmap> { let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { - let key = (left.as_ref(), right.as_ref(), distance); + let key = (left.as_ref(), right.as_ref(), proximity); let current_docids = self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); docids.union_with(&current_docids); } From 5344abc0082233906006a652723f6d03a34fb739 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 17:34:22 +0100 Subject: [PATCH 0508/1889] Introduce the CriterionResult return type --- milli/src/search/criteria/mod.rs | 12 +++++++++++- milli/src/search/criteria/typo.rs | 24 ++++++++++++++++++------ milli/src/search/mod.rs | 4 ++-- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d21102e64..45689bbe5 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -10,7 +10,17 @@ use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub trait Criterion { - fn next(&mut self) -> anyhow::Result<Option<(Option<Operation>, RoaringBitmap)>>; + fn next(&mut self) -> anyhow::Result<Option<CriterionResult>>; } + +/// The result of a call to the parent criterion. +pub struct CriterionResult { + /// The query tree that must be used by the child criteria to fetch candidates. + pub query_tree: Option<Operation>, + /// The candidates that this criterion is allowed to return subsets of. + pub candidates: RoaringBitmap, + /// Candidates that come from the current bucket of the initial criterion. + pub bucket_candidates: Option<RoaringBitmap>, } /// Either a set of candidates that defines the candidates diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 9834bbc21..781ea1ec8 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::search::word_typos; -use super::{Candidates, Criterion, Context, query_docids, query_pair_proximity_docids}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; // FIXME we must stop when the number of typos is equal to // the maximum number of typos for this query tree.
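To make the new return type concrete, here is a minimal model of how a chain of criteria is consumed, mirroring the offset/limit loop that the search module runs over `criteria.next()?`. It is a sketch under simplifying assumptions: sorted `Vec<u32>` buckets stand in for `RoaringBitmap`, and the `query_tree` and `bucket_candidates` fields are left out:

```rust
// Each call to `next` yields one bucket of candidates, best bucket
// first; the caller paginates across buckets with offset and limit.

struct CriterionResult {
    candidates: Vec<u32>,
}

trait Criterion {
    fn next(&mut self) -> Option<CriterionResult>;
}

// Yields pre-computed buckets one by one, the way Typo yields the
// zero-typo candidates first, then the one-typo candidates, and so on.
struct Buckets(std::vec::IntoIter<Vec<u32>>);

impl Criterion for Buckets {
    fn next(&mut self) -> Option<CriterionResult> {
        self.0.next().map(|candidates| CriterionResult { candidates })
    }
}

fn fetch(mut criterion: impl Criterion, mut offset: usize, mut limit: usize) -> Vec<u32> {
    let mut documents_ids = Vec::new();
    while let Some(CriterionResult { candidates }) = criterion.next() {
        let skipped = offset.min(candidates.len());
        offset -= skipped;
        for docid in candidates.into_iter().skip(skipped).take(limit) {
            documents_ids.push(docid);
            limit -= 1;
        }
        if limit == 0 { break }
    }
    documents_ids
}

fn main() {
    let buckets = Buckets(vec![vec![1, 2, 3], vec![4, 5], vec![6]].into_iter());
    // Skip the two best results, then take the next three across buckets.
    assert_eq!(fetch(buckets, 2, 3), vec![3, 4, 5]);
}
```

Because each bucket is already the best remaining set of candidates, pagination never sorts across buckets; it only skips and takes in order.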
@@ -51,7 +51,7 @@ impl<'t> Typo<'t> { } impl<'t> Criterion for Typo<'t> { - fn next(&mut self) -> anyhow::Result, RoaringBitmap)>> { + fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; while self.number_typos < MAX_NUM_TYPOS { match (&mut self.query_tree, &mut self.candidates) { @@ -68,7 +68,11 @@ impl<'t> Criterion for Typo<'t> { candidates.difference_with(&new_candidates); self.number_typos += 1; - return Ok(Some((Some(new_query_tree), new_candidates))); + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates: None, + })); }, (Some(query_tree), Forbidden(candidates)) => { // TODO if number_typos >= 2 the generated query_tree will allways be the same, @@ -79,16 +83,24 @@ impl<'t> Criterion for Typo<'t> { candidates.union_with(&new_candidates); self.number_typos += 1; - return Ok(Some((Some(new_query_tree), new_candidates))); + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates: None, + })); }, (None, Allowed(_)) => { - return Ok(Some((None, take(&mut self.candidates).into_inner()))); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: None, + })); }, (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { match parent.next()? { - Some((query_tree, candidates)) => { + Some(CriterionResult { query_tree, candidates, .. }) => { self.query_tree = query_tree; self.candidates = Candidates::Allowed(candidates); }, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d061df2b9..89abb01b4 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,7 +18,7 @@ use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec} use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::mdfs::Mdfs; use crate::query_tokens::{query_tokens, QueryToken}; -use crate::search::criteria::Criterion; +use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::typo::Typo; use crate::{Index, FieldId, DocumentId}; @@ -294,7 +294,7 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); - while let Some((_qt, docids)) = criteria.next()? { + while let Some(CriterionResult { candidates: docids, .. }) = criteria.next()? 
{ let mut len = docids.len() as usize; let mut docids = docids.into_iter(); From 229130ed259606d03ecdc22309d102173c040e48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 17:45:56 +0100 Subject: [PATCH 0509/1889] Correctly compute the bucket candidates for the Typo criterion --- milli/src/search/criteria/typo.rs | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 781ea1ec8..31b56d700 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -16,6 +16,7 @@ pub struct Typo<'t> { query_tree: Option, number_typos: u8, candidates: Candidates, + bucket_candidates: Option, parent: Option>, } @@ -31,6 +32,7 @@ impl<'t> Typo<'t> { query_tree, number_typos: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + bucket_candidates: None, parent: None, }) } @@ -45,6 +47,7 @@ impl<'t> Typo<'t> { query_tree: None, number_typos: 0, candidates: Candidates::default(), + bucket_candidates: None, parent: Some(parent), }) } @@ -68,10 +71,15 @@ impl<'t> Criterion for Typo<'t> { candidates.difference_with(&new_candidates); self.number_typos += 1; + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; + return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: new_candidates, - bucket_candidates: None, + bucket_candidates, })); }, (Some(query_tree), Forbidden(candidates)) => { @@ -83,26 +91,33 @@ impl<'t> Criterion for Typo<'t> { candidates.union_with(&new_candidates); self.number_typos += 1; + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; + return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: new_candidates, - bucket_candidates: None, + bucket_candidates, })); }, (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); return Ok(Some(CriterionResult { query_tree: None, - candidates: take(&mut self.candidates).into_inner(), - bucket_candidates: None, + candidates: candidates.clone(), + bucket_candidates: Some(candidates), })); }, (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { match parent.next()? { - Some(CriterionResult { query_tree, candidates, .. }) => { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates = bucket_candidates; }, None => return Ok(None), } From fea9ffc46a92a0f9c7f295dd2379317a2719c94a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 17 Feb 2021 17:50:46 +0100 Subject: [PATCH 0510/1889] Use the bucket candidates in the search module --- milli/src/search/mod.rs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 89abb01b4..d94bc8831 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -294,19 +294,24 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); - while let Some(CriterionResult { candidates: docids, .. }) = criteria.next()? { + let mut initial_candidates = RoaringBitmap::new(); + while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next()? 
{ - let mut len = docids.len() as usize; - let mut docids = docids.into_iter(); + let mut len = candidates.len() as usize; + let mut candidates = candidates.into_iter(); + + if let Some(docids) = bucket_candidates { + initial_candidates.union_with(&docids); + } if offset != 0 { - docids.by_ref().skip(offset).for_each(drop); + candidates.by_ref().skip(offset).for_each(drop); offset = offset.saturating_sub(len.min(offset)); len = len.saturating_sub(len.min(offset)); } if len != 0 { - documents_ids.extend(docids.take(limit)); + documents_ids.extend(candidates.take(limit)); limit = limit.saturating_sub(len.min(limit)); } @@ -314,8 +319,7 @@ impl<'a> Search<'a> { } let found_words = HashSet::new(); - let candidates = RoaringBitmap::new(); - Ok(SearchResult { found_words, candidates, documents_ids }) + Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids }) // let order_by_facet = { // let criteria = self.index.criteria(self.rtxn)?; From 9ccaea2afc7b147bd367fd283f1d87bf2c1710e1 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 18:16:21 +0100 Subject: [PATCH 0511/1889] simplify criterion context --- milli/src/search/criteria/mod.rs | 189 ++++++++++++++++-------------- milli/src/search/criteria/typo.rs | 6 +- 2 files changed, 106 insertions(+), 89 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 45689bbe5..d033f5707 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -46,9 +46,12 @@ impl Default for Candidates { } } pub trait Context { - fn query_docids(&self, query: &Query) -> anyhow::Result; - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, proximity: u8) ->anyhow::Result; + fn word_docids(&self, word: &str) -> heed::Result>; + fn word_prefix_docids(&self, word: &str) -> heed::Result>; + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; + fn in_prefix_cache(&self, word: &str) -> bool; } pub struct HeedContext<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -58,83 +61,31 @@ pub struct HeedContext<'t> { } impl<'a> Context for HeedContext<'a> { - fn query_docids(&self, query: &Query) -> anyhow::Result { - match &query.kind { - QueryKind::Exact { word, .. 
} => { - if query.prefix && self.in_prefix_cache(&word) { - Ok(self.index.word_prefix_docids.get(self.rtxn, &word)?.unwrap_or_default()) - } else if query.prefix { - let words = word_typos(&word, true, 0, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let current_docids = self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default(); - docids.union_with(¤t_docids); - } - Ok(docids) - } else { - Ok(self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default()) - } - }, - QueryKind::Tolerant { typo, word } => { - let words = word_typos(&word, query.prefix, *typo, &self.words_fst)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let current_docids = self.index.word_docids.get(self.rtxn, &word)?.unwrap_or_default(); - docids.union_with(¤t_docids); - } - Ok(docids) - }, - } + fn word_docids(&self, word: &str) -> heed::Result> { + self.index.word_docids.get(self.rtxn, &word) } - fn query_pair_proximity_docids(&self, left: &Query, right: &Query, proximity: u8) -> anyhow::Result { - let prefix = right.prefix; + fn word_prefix_docids(&self, word: &str) -> heed::Result> { + self.index.word_prefix_docids.get(self.rtxn, &word) + } - match (&left.kind, &right.kind) { - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { - if prefix && self.in_prefix_cache(&right) { - let key = (left.as_str(), right.as_str(), proximity); - Ok(self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) - } else if prefix { - let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, proximity) - } else { - let key = (left.as_str(), right.as_str(), proximity); - Ok(self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default()) - } - }, - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = word_typos(&left, false, *typo, &self.words_fst)?; - if prefix && self.in_prefix_cache(&right) { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - let key = (left.as_ref(), right.as_ref(), proximity); - let current_docids = self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); - docids.union_with(¤t_docids); - } - Ok(docids) - } else if prefix { - let r_words = word_typos(&right, true, 0, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, proximity) - } else { - self.all_word_pair_proximity_docids(&l_words, &[(right, 0)], proximity) - } - }, - (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_typos(&right, prefix, *typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&[(left, 0)], &r_words, proximity) - }, - (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { - let l_words = word_typos(&left, false, *l_typo, &self.words_fst)?; - let r_words = word_typos(&right, prefix, *r_typo, &self.words_fst)?; - self.all_word_pair_proximity_docids(&l_words, &r_words, proximity) - }, - } + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left, right, proximity); + self.index.word_pair_proximity_docids.get(self.rtxn, &key) + } + + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left, right, proximity); + self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) } fn words_fst<'t>(&self) -> &'t fst::Set> { &self.words_fst } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.words_prefixes_fst.contains(word) + } } impl<'t> HeedContext<'t> { @@ -149,25 +100,91 @@ impl<'t> HeedContext<'t> { words_prefixes_fst, }) } +} - fn in_prefix_cache(&self, word: &str) -> bool { - self.words_prefixes_fst.contains(word) +fn all_word_pair_proximity_docids, U: AsRef>( + ctx: &dyn Context, + left_words: &[(T, u8)], + right_words: &[(U, u8)], + proximity: u8 +) -> anyhow::Result { + let mut docids = RoaringBitmap::new(); + for (left, _l_typo) in left_words { + for (right, _r_typo) in right_words { + let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } } + Ok(docids) +} - fn all_word_pair_proximity_docids, U: AsRef>( - &self, - left_words: &[(T, u8)], - right_words: &[(U, u8)], - proximity: u8 - ) -> anyhow::Result { - let mut docids = RoaringBitmap::new(); - for (left, _l_typo) in left_words { - for (right, _r_typo) in right_words { - let key = (left.as_ref(), right.as_ref(), proximity); - let current_docids = self.index.word_pair_proximity_docids.get(self.rtxn, &key)?.unwrap_or_default(); +fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result { + match &query.kind { + QueryKind::Exact { word, .. } => { + if query.prefix && ctx.in_prefix_cache(&word) { + Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default()) + } else if query.prefix { + let words = word_typos(&word, true, 0, ctx.words_fst())?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else { + Ok(ctx.word_docids(&word)?.unwrap_or_default()) + } + }, + QueryKind::Tolerant { typo, word } => { + let words = word_typos(&word, query.prefix, *typo, ctx.words_fst())?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); docids.union_with(¤t_docids); } - } - Ok(docids) + Ok(docids) + }, + } +} + +fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, proximity: u8) -> anyhow::Result { + let prefix = right.prefix; + + match (&left.kind, &right.kind) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. 
}) => { + if prefix && ctx.in_prefix_cache(&right) { + Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + } else if prefix { + let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + } else { + Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + } + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { + let l_words = word_typos(&left, false, *typo, ctx.words_fst())?; + if prefix && ctx.in_prefix_cache(&right) { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else if prefix { + let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + } else { + all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) + } + }, + (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { + let r_words = word_typos(&right, prefix, *typo, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + }, + (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { + let l_words = word_typos(&left, false, *l_typo, ctx.words_fst())?; + let r_words = word_typos(&right, prefix, *r_typo, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + }, } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 31b56d700..a1b8e1f16 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::search::word_typos; -use super::{Candidates, Criterion, CriterionResult, Context}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; // FIXME we must stop when the number of typos is equal to // the maximum number of typos for this query tree. @@ -206,7 +206,7 @@ fn resolve_candidates<'t>( for slice in ops.windows(2) { match (&slice[0], &slice[1]) { (Operation::Query(left), Operation::Query(right)) => { - match ctx.query_pair_proximity_docids(left, right, 1)? { + match query_pair_proximity_docids(ctx, left, right, 1)? { pair_docids if pair_docids.is_empty() => { return Ok(RoaringBitmap::new()) }, @@ -233,7 +233,7 @@ fn resolve_candidates<'t>( Ok(candidates) }, Query(q) => if q.kind.typo() == number_typos { - Ok(ctx.query_docids(q)?) + Ok(query_docids(ctx, q)?) 
} else { Ok(RoaringBitmap::new()) }, From 67c71130dfcfbd7ce78089e49b14ce5b8ce2d13f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 11:00:31 +0100 Subject: [PATCH 0512/1889] Reduce the number of calls to alterate_query_tree --- milli/src/search/criteria/typo.rs | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a1b8e1f16..57db6e855 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -63,9 +63,16 @@ impl<'t> Criterion for Typo<'t> { self.candidates = Candidates::default(); }, (Some(query_tree), Allowed(candidates)) => { - // TODO if number_typos >= 2 the generated query_tree will allways be the same, - // generate a new one on each iteration is a waste of time. - let new_query_tree = alterate_query_tree(&self.ctx.words_fst(), query_tree.clone(), self.number_typos)?; + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos)? + } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos)?; + query_tree.clone() + } else { + query_tree.clone() + }; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); @@ -83,9 +90,16 @@ impl<'t> Criterion for Typo<'t> { })); }, (Some(query_tree), Forbidden(candidates)) => { - // TODO if number_typos >= 2 the generated query_tree will allways be the same, - // generate a new one on each iteration is a waste of time. - let new_query_tree = alterate_query_tree(&self.ctx.words_fst(), query_tree.clone(), self.number_typos)?; + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos)? 
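+                        // with 0 or 1 typo the derivations still change between
+                        // iterations, so the tree is rebuilt fresh each time; from
+                        // 2 typos on it is stable, and the branch below stores it
+                        // once so later iterations can simply clone it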
+ } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos)?; + query_tree.clone() + } else { + query_tree.clone() + }; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); From 4da6e1ea9c31ad10df47af0d15d9a67b248b09a8 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Feb 2021 14:40:59 +0100 Subject: [PATCH 0513/1889] add cache in typo criterion --- milli/src/search/criteria/typo.rs | 40 ++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 57db6e855..11c96a4d4 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,4 +1,4 @@ -use std::{borrow::Cow, mem::take}; +use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; use roaring::RoaringBitmap; @@ -18,6 +18,7 @@ pub struct Typo<'t> { candidates: Candidates, bucket_candidates: Option, parent: Option>, + cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Typo<'t> { @@ -34,6 +35,7 @@ impl<'t> Typo<'t> { candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: None, parent: None, + cache: HashMap::new(), }) } @@ -49,6 +51,7 @@ impl<'t> Typo<'t> { candidates: Candidates::default(), bucket_candidates: None, parent: Some(parent), + cache: HashMap::new(), }) } } @@ -73,7 +76,7 @@ impl<'t> Criterion for Typo<'t> { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.cache)?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); self.number_typos += 1; @@ -100,7 +103,7 @@ impl<'t> Criterion for Typo<'t> { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.cache)?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; @@ -196,6 +199,7 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { // FIXME add a cache @@ -206,13 +210,14 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { use Operation::{And, Consecutive, Or, Query}; match query_tree { And(ops) => { - mdfs(ctx, ops, number_typos) + mdfs(ctx, ops, number_typos, cache) }, Consecutive(ops) => { let mut candidates = RoaringBitmap::new(); @@ -241,7 +246,7 @@ fn resolve_candidates<'t>( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(ctx, op, number_typos)?; + let docids = resolve_operation(ctx, op, number_typos, cache)?; candidates.union_with(&docids); } Ok(candidates) @@ -259,17 +264,34 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, branches: &[Operation], mana: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { match branches.split_first() { - Some((head, [])) => resolve_operation(ctx, head, mana), + Some((head, [])) => { + if let Some(candidates) = 
cache.get(&(head.clone(), mana)) { + Ok(candidates.clone()) + } else { + let candidates = resolve_operation(ctx, head, mana, cache)?; + cache.insert((head.clone(), mana), candidates.clone()); + Ok(candidates) + } + }, Some((head, tail)) => { let mut candidates = RoaringBitmap::new(); for m in 0..=mana { - let mut head_candidates = resolve_operation(ctx, head, m)?; + let mut head_candidates = { + if let Some(candidates) = cache.get(&(head.clone(), m)) { + candidates.clone() + } else { + let candidates = resolve_operation(ctx, head, m, cache)?; + cache.insert((head.clone(), m), candidates.clone()); + candidates + } + }; if !head_candidates.is_empty() { - let tail_candidates = mdfs(ctx, tail, mana - m)?; + let tail_candidates = mdfs(ctx, tail, mana - m, cache)?; head_candidates.intersect_with(&tail_candidates); candidates.union_with(&head_candidates); } @@ -281,5 +303,5 @@ fn resolve_candidates<'t>( } } - resolve_operation(ctx, query_tree, number_typos) + resolve_operation(ctx, query_tree, number_typos, cache) } From 41fc51ebcf8d6f0d029cc4fdc8286b2f862718fc Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Feb 2021 15:08:56 +0100 Subject: [PATCH 0514/1889] optimize alterate_query_tree when number_typos is zero --- milli/src/search/criteria/typo.rs | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 11c96a4d4..6fd234d0b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -171,19 +171,27 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos)) }, Operation::Query(q) => { - // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { - let typo = *typo.min(&number_typos); - let words = word_typos(word, q.prefix, typo, words_fst)?; + // if no typo is allowed we don't call word_typos(..), + // and directly create an Exact query + if number_typos == 0 { + *operation = Operation::Query(Query { + prefix: q.prefix, + kind: QueryKind::Exact { original_typo: 0, word: word.clone() }, + }); + } else { + let typo = *typo.min(&number_typos); + let words = word_typos(word, q.prefix, typo, words_fst)?; - let queries = words.into_iter().map(|(word, _typo)| { - Operation::Query(Query { - prefix: false, - kind: QueryKind::Exact { original_typo: typo, word }, - }) - }).collect(); + let queries = words.into_iter().map(|(word, _typo)| { + Operation::Query(Query { + prefix: false, + kind: QueryKind::Exact { original_typo: typo, word }, + }) + }).collect(); - *operation = Operation::or(false, queries); + *operation = Operation::or(false, queries); + } } Ok(()) From 9e093d5ff31acd93cf98f037f4650dd92aece60c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Feb 2021 15:45:47 +0100 Subject: [PATCH 0515/1889] add cache on alterate_query_tree function --- milli/src/search/criteria/typo.rs | 48 ++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 6fd234d0b..0284d448d 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -18,7 +18,8 @@ pub struct Typo<'t> { candidates: Candidates, bucket_candidates: Option, parent: Option>, - cache: HashMap<(Operation, u8), RoaringBitmap>, + candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, + typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, } impl<'t> Typo<'t> { @@ -35,7 +36,8 @@ 
impl<'t> Typo<'t> { candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: None, parent: None, - cache: HashMap::new(), + candidates_cache: HashMap::new(), + typo_cache: HashMap::new(), }) } @@ -51,7 +53,8 @@ impl<'t> Typo<'t> { candidates: Candidates::default(), bucket_candidates: None, parent: Some(parent), - cache: HashMap::new(), + candidates_cache: HashMap::new(), + typo_cache: HashMap::new(), }) } } @@ -68,15 +71,15 @@ impl<'t> Criterion for Typo<'t> { (Some(query_tree), Allowed(candidates)) => { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; query_tree.clone() } else { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.cache)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); self.number_typos += 1; @@ -95,15 +98,15 @@ impl<'t> Criterion for Typo<'t> { (Some(query_tree), Forbidden(candidates)) => { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; query_tree.clone() } else { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.cache)?; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; @@ -156,19 +159,21 @@ fn alterate_query_tree( words_fst: &fst::Set>, mut query_tree: Operation, number_typos: u8, + typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, ) -> anyhow::Result { fn recurse( words_fst: &fst::Set>, operation: &mut Operation, number_typos: u8, + typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, ) -> anyhow::Result<()> { use Operation::{And, Consecutive, Or}; match operation { And(ops) | Consecutive(ops) | Or(_, ops) => { - ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos)) + ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache)) }, Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { @@ -181,9 +186,16 @@ fn alterate_query_tree( }); } else { let typo = *typo.min(&number_typos); - let words = word_typos(word, q.prefix, typo, words_fst)?; + let cache_key = (word.clone(), q.prefix, typo); + let words = if let Some(derivations) = typo_cache.get(&cache_key) { + derivations.clone() + } else { + let derivations = word_typos(word, q.prefix, typo, words_fst)?; + typo_cache.insert(cache_key, derivations.clone()); + derivations + }; - let queries = 
words.into_iter().map(|(word, _typo)| { + let queries = words.into_iter().map(|(word, typo)| { Operation::Query(Query { prefix: false, kind: QueryKind::Exact { original_typo: typo, word }, @@ -199,7 +211,7 @@ fn alterate_query_tree( } } - recurse(words_fst, &mut query_tree, number_typos)?; + recurse(words_fst, &mut query_tree, number_typos, typo_cache)?; Ok(query_tree) } @@ -277,11 +289,12 @@ fn resolve_candidates<'t>( { match branches.split_first() { Some((head, [])) => { - if let Some(candidates) = cache.get(&(head.clone(), mana)) { + let cache_key = (head.clone(), mana); + if let Some(candidates) = cache.get(&cache_key) { Ok(candidates.clone()) } else { let candidates = resolve_operation(ctx, head, mana, cache)?; - cache.insert((head.clone(), mana), candidates.clone()); + cache.insert(cache_key, candidates.clone()); Ok(candidates) } }, @@ -290,11 +303,12 @@ fn resolve_candidates<'t>( for m in 0..=mana { let mut head_candidates = { - if let Some(candidates) = cache.get(&(head.clone(), m)) { + let cache_key = (head.clone(), m); + if let Some(candidates) = cache.get(&cache_key) { candidates.clone() } else { let candidates = resolve_operation(ctx, head, m, cache)?; - cache.insert((head.clone(), m), candidates.clone()); + cache.insert(cache_key, candidates.clone()); candidates } }; From a273c46559fc699a157213a216f25d1c3aef34a3 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Feb 2021 16:31:10 +0100 Subject: [PATCH 0516/1889] clean warnings --- milli/src/search/criteria/mod.rs | 18 +- milli/src/search/criteria/typo.rs | 10 +- milli/src/search/mod.rs | 321 ++---------------------------- milli/src/search/query_tree.rs | 2 +- 4 files changed, 25 insertions(+), 326 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d033f5707..3673aef78 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use crate::Index; -use crate::search::word_typos; +use crate::search::word_derivations; use roaring::RoaringBitmap; @@ -124,7 +124,7 @@ fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result anyhow::Result { - let words = word_typos(&word, query.prefix, *typo, ctx.words_fst())?; + let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst())?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); @@ -155,14 +155,14 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p if prefix && ctx.in_prefix_cache(&right) { Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) } else if prefix { - let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } else { Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) } }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. 
}) => { - let l_words = word_typos(&left, false, *typo, ctx.words_fst())?; + let l_words = word_derivations(&left, false, *typo, ctx.words_fst())?; if prefix && ctx.in_prefix_cache(&right) { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { @@ -171,19 +171,19 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p } Ok(docids) } else if prefix { - let r_words = word_typos(&right, true, 0, ctx.words_fst())?; + let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } }, (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_typos(&right, prefix, *typo, ctx.words_fst())?; + let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) }, (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { - let l_words = word_typos(&left, false, *l_typo, ctx.words_fst())?; - let r_words = word_typos(&right, prefix, *r_typo, ctx.words_fst())?; + let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst())?; + let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst())?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) }, } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 0284d448d..a6f500bd5 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -4,7 +4,7 @@ use anyhow::bail; use roaring::RoaringBitmap; use crate::search::query_tree::{Operation, Query, QueryKind}; -use crate::search::word_typos; +use crate::search::word_derivations; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; // FIXME we must stop when the number of typos is equal to @@ -177,7 +177,7 @@ fn alterate_query_tree( }, Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { - // if no typo is allowed we don't call word_typos(..), + // if no typo is allowed we don't call word_derivations function, // and directly create an Exact query if number_typos == 0 { *operation = Operation::Query(Query { @@ -190,7 +190,7 @@ fn alterate_query_tree( let words = if let Some(derivations) = typo_cache.get(&cache_key) { derivations.clone() } else { - let derivations = word_typos(word, q.prefix, typo, words_fst)?; + let derivations = word_derivations(word, q.prefix, typo, words_fst)?; typo_cache.insert(cache_key, derivations.clone()); derivations }; @@ -222,10 +222,6 @@ fn resolve_candidates<'t>( cache: &mut HashMap<(Operation, u8), RoaringBitmap>, ) -> anyhow::Result { - // FIXME add a cache - // FIXME keep the cache between typos iterations - // cache: HashMap<(&Operation, u8), RoaringBitmap>, - fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d94bc8831..6046cc8d2 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,26 +1,18 @@ use std::borrow::Cow; -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt; use std::time::Instant; -use anyhow::{bail, Context}; use fst::{IntoStreamer, Streamer, Set}; -use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; use 
meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; -use ordered_float::OrderedFloat; use roaring::bitmap::RoaringBitmap; -use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; -use crate::mdfs::Mdfs; -use crate::query_tokens::{query_tokens, QueryToken}; use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::typo::Typo; -use crate::{Index, FieldId, DocumentId}; +use crate::{Index, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetIter}; @@ -69,198 +61,6 @@ impl<'a> Search<'a> { self } - /// Extracts the query words from the query string and returns the DFAs accordingly. - /// TODO introduce settings for the number of typos regarding the words lengths. - fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { - let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2); - - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let words: Vec<_> = query_tokens(tokens).collect(); - - let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let number_of_words = words.len(); - - words.into_iter().enumerate().map(|(i, word)| { - let (word, quoted) = match word { - QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3), - QueryToken::Quoted(token) => (token.text().to_string(), true), - }; - let is_last = i + 1 == number_of_words; - let is_prefix = is_last && !ends_with_whitespace && !quoted; - let lev = match word.len() { - 0..=4 => if quoted { lev0 } else { lev0 }, - 5..=8 => if quoted { lev0 } else { lev1 }, - _ => if quoted { lev0 } else { lev2 }, - }; - - let dfa = if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - }; - - (word, is_prefix, dfa) - }) - .collect() - } - - /// Fetch the words from the given FST related to the given DFAs along with - /// the associated documents ids. - fn fetch_words_docids( - &self, - fst: &fst::Set>, - dfas: Vec<(String, bool, DFA)>, - ) -> anyhow::Result, RoaringBitmap)>> - { - // A Vec storing all the derived words from the original query words, associated - // with the distance from the original word and the docids where the words appears. - let mut derived_words = Vec::<(HashMap::, RoaringBitmap)>::with_capacity(dfas.len()); - - for (_word, _is_prefix, dfa) in dfas { - - let mut acc_derived_words = HashMap::new(); - let mut unions_docids = RoaringBitmap::new(); - let mut stream = fst.search_with_state(&dfa).into_stream(); - while let Some((word, state)) = stream.next() { - - let word = std::str::from_utf8(word)?; - let docids = self.index.word_docids.get(self.rtxn, word)?.unwrap(); - let distance = dfa.distance(state); - unions_docids.union_with(&docids); - acc_derived_words.insert(word.to_string(), (distance.to_u8(), docids)); - } - derived_words.push((acc_derived_words, unions_docids)); - } - - Ok(derived_words) - } - - /// Returns the set of docids that contains all of the query words. - fn compute_candidates( - derived_words: &[(HashMap, RoaringBitmap)], - ) -> RoaringBitmap - { - // We sort the derived words by inverse popularity, this way intersections are faster. 
- let mut derived_words: Vec<_> = derived_words.iter().collect(); - derived_words.sort_unstable_by_key(|(_, docids)| docids.len()); - - // we do a union between all the docids of each of the derived words, - // we got N unions (the number of original query words), we then intersect them. - let mut candidates = RoaringBitmap::new(); - - for (i, (_, union_docids)) in derived_words.iter().enumerate() { - if i == 0 { - candidates = union_docids.clone(); - } else { - candidates.intersect_with(&union_docids); - } - } - - candidates - } - - fn facet_ordered( - &self, - field_id: FieldId, - facet_type: FacetType, - ascending: bool, - mut documents_ids: RoaringBitmap, - limit: usize, - ) -> anyhow::Result> - { - let mut output: Vec<_> = match facet_type { - FacetType::Float => { - if documents_ids.len() <= 1000 { - let db = self.index.field_id_docid_facet_values.remap_key_type::(); - let mut docids_values = Vec::with_capacity(documents_ids.len() as usize); - for docid in documents_ids.iter() { - let left = (field_id, docid, f64::MIN); - let right = (field_id, docid, f64::MAX); - let mut iter = db.range(self.rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, OrderedFloat(value))); - } - } - docids_values.sort_unstable_by_key(|(_, value)| *value); - let iter = docids_values.into_iter().map(|(id, _)| id); - if ascending { - iter.take(limit).collect() - } else { - iter.rev().take(limit).collect() - } - } else { - let facet_fn = if ascending { - FacetIter::::new_reducing - } else { - FacetIter::::new_reverse_reducing - }; - let mut limit_tmp = limit; - let mut output = Vec::new(); - for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? { - let (_val, docids) = result?; - limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); - output.push(docids); - if limit_tmp == 0 { break } - } - output.into_iter().flatten().take(limit).collect() - } - }, - FacetType::Integer => { - if documents_ids.len() <= 1000 { - let db = self.index.field_id_docid_facet_values.remap_key_type::(); - let mut docids_values = Vec::with_capacity(documents_ids.len() as usize); - for docid in documents_ids.iter() { - let left = (field_id, docid, i64::MIN); - let right = (field_id, docid, i64::MAX); - let mut iter = db.range(self.rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, value)); - } - } - docids_values.sort_unstable_by_key(|(_, value)| *value); - let iter = docids_values.into_iter().map(|(id, _)| id); - if ascending { - iter.take(limit).collect() - } else { - iter.rev().take(limit).collect() - } - } else { - let facet_fn = if ascending { - FacetIter::::new_reducing - } else { - FacetIter::::new_reverse_reducing - }; - let mut limit_tmp = limit; - let mut output = Vec::new(); - for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? { - let (_val, docids) = result?; - limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); - output.push(docids); - if limit_tmp == 0 { break } - } - output.into_iter().flatten().take(limit).collect() - } - }, - FacetType::String => bail!("criteria facet type must be a number"), - }; - - // if there isn't enough documents to return we try to complete that list - // with documents that are maybe not faceted under this field and therefore - // not returned by the previous facet iteration. 
- if output.len() < limit { - output.iter().for_each(|n| { documents_ids.remove(*n); }); - let remaining = documents_ids.iter().take(limit - output.len()); - output.extend(remaining); - } - - Ok(output) - } - pub fn execute(&self) -> anyhow::Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); @@ -320,101 +120,6 @@ impl<'a> Search<'a> { let found_words = HashSet::new(); Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids }) - - // let order_by_facet = { - // let criteria = self.index.criteria(self.rtxn)?; - // let result = criteria.into_iter().flat_map(|criterion| { - // match criterion { - // Criterion::Asc(fid) => Some((fid, true)), - // Criterion::Desc(fid) => Some((fid, false)), - // _ => None - // } - // }).next(); - // match result { - // Some((attr_name, is_ascending)) => { - // let field_id_map = self.index.fields_ids_map(self.rtxn)?; - // let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?; - // let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?; - // let ftype = *faceted_fields.get(&fid) - // .with_context(|| format!("{:?} not found in the faceted fields.", attr_name)) - // .expect("corrupted data: "); - // Some((fid, ftype, is_ascending)) - // }, - // None => None, - // } - // }; - - // let before = Instant::now(); - // let (candidates, derived_words) = match (facet_candidates, derived_words) { - // (Some(mut facet_candidates), Some(derived_words)) => { - // let words_candidates = Self::compute_candidates(&derived_words); - // facet_candidates.intersect_with(&words_candidates); - // (facet_candidates, derived_words) - // }, - // (None, Some(derived_words)) => { - // (Self::compute_candidates(&derived_words), derived_words) - // }, - // (Some(facet_candidates), None) => { - // // If the query is not set or results in no DFAs but - // // there is some facet conditions we return a placeholder. - // let documents_ids = match order_by_facet { - // Some((fid, ftype, is_ascending)) => { - // self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)? - // }, - // None => facet_candidates.iter().take(limit).collect(), - // }; - // return Ok(SearchResult { - // documents_ids, - // candidates: facet_candidates, - // ..Default::default() - // }) - // }, - // (None, None) => { - // // If the query is not set or results in no DFAs we return a placeholder. - // let all_docids = self.index.documents_ids(self.rtxn)?; - // let documents_ids = match order_by_facet { - // Some((fid, ftype, is_ascending)) => { - // self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)? - // }, - // None => all_docids.iter().take(limit).collect(), - // }; - // return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() }) - // }, - // }; - - // debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); - - // // The mana depth first search is a revised DFS that explore - // // solutions in the order of their proximities. - // let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone()); - // let mut documents = Vec::new(); - - // // We execute the Mdfs iterator until we find enough documents. - // while documents.iter().map(RoaringBitmap::len).sum::() < limit as u64 { - // match mdfs.next().transpose()? 
{ - // Some((proximity, answer)) => { - // debug!("answer with a proximity of {}: {:?}", proximity, answer); - // documents.push(answer); - // }, - // None => break, - // } - // } - - // let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); - // let documents_ids = match order_by_facet { - // Some((fid, ftype, order)) => { - // let mut ordered_documents = Vec::new(); - // for documents_ids in documents { - // let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?; - // ordered_documents.push(docids); - // if ordered_documents.iter().map(Vec::len).sum::() >= limit { break } - // } - // ordered_documents.into_iter().flatten().take(limit).collect() - // }, - // None => documents.into_iter().flatten().take(limit).collect(), - // }; - - // Ok(SearchResult { found_words, candidates, documents_ids }) } } @@ -438,19 +143,17 @@ pub struct SearchResult { pub documents_ids: Vec, } -pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { - let dfa = { - let lev = match max_typo { - 0 => &LEVDIST0, - 1 => &LEVDIST1, - _ => &LEVDIST2, - }; +pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { + let lev = match max_typo { + 0 => &LEVDIST0, + 1 => &LEVDIST1, + _ => &LEVDIST2, + }; - if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - } + let dfa = if is_prefix { + lev.build_prefix_dfa(&word) + } else { + lev.build_dfa(&word) }; let mut derived_words = Vec::new(); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9b253350e..02f6dc0c8 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -303,7 +303,7 @@ fn fetch_words(tree: &Operation, fst: &fst::Set>) -> FetchedWords { match query.kind.clone() { QueryKind::Exact { word, .. } => vec![(word, query.prefix)], QueryKind::Tolerant { typo, word } => { - if let Ok(words) = super::word_typos(&word, query.prefix, typo, fst) { + if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) { words.into_iter().map(|(w, _)| (w, query.prefix)).collect() } else { vec![(word, query.prefix)] From c5a32fd4fa90c2914b1296937af517d7cc2beffd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 23 Feb 2021 17:33:20 +0100 Subject: [PATCH 0517/1889] Fix the typo criterion --- milli/src/search/criteria/typo.rs | 115 ++++++++++++++++-------------- 1 file changed, 60 insertions(+), 55 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a6f500bd5..75f3f5666 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -3,17 +3,13 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; use roaring::RoaringBitmap; -use crate::search::query_tree::{Operation, Query, QueryKind}; +use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::word_derivations; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; -// FIXME we must stop when the number of typos is equal to -// the maximum number of typos for this query tree. 
-const MAX_NUM_TYPOS: u8 = 8; - pub struct Typo<'t> { ctx: &'t dyn Context, - query_tree: Option, + query_tree: Option<(usize, Operation)>, number_typos: u8, candidates: Candidates, bucket_candidates: Option, @@ -31,7 +27,7 @@ impl<'t> Typo<'t> { { Ok(Typo { ctx, - query_tree, + query_tree: query_tree.map(|op| (maximum_typo(&op), op)), number_typos: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: None, @@ -62,65 +58,75 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; - while self.number_typos < MAX_NUM_TYPOS { + loop { match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { self.query_tree = None; self.candidates = Candidates::default(); }, - (Some(query_tree), Allowed(candidates)) => { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? - } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; - query_tree.clone() + (Some((max_typos, query_tree)), Allowed(candidates)) => { + if self.number_typos as usize > *max_typos { + self.query_tree = None; + self.candidates = Candidates::default(); } else { - query_tree.clone() - }; + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? + } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; + query_tree.clone() + } else { + query_tree.clone() + }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; - new_candidates.intersect_with(&candidates); - candidates.difference_with(&new_candidates); - self.number_typos += 1; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; + new_candidates.intersect_with(&candidates); + candidates.difference_with(&new_candidates); + self.number_typos += 1; - let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), - }; + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: new_candidates, - bucket_candidates, - })); + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates, + })); + } }, - (Some(query_tree), Forbidden(candidates)) => { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? 
- } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; - query_tree.clone() + (Some((max_typos, query_tree)), Forbidden(candidates)) => { + if self.number_typos as usize > *max_typos { + self.query_tree = None; + self.candidates = Candidates::default(); } else { - query_tree.clone() - }; + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? + } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; + query_tree.clone() + } else { + query_tree.clone() + }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.number_typos += 1; + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; + new_candidates.difference_with(&candidates); + candidates.union_with(&new_candidates); + self.number_typos += 1; - let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), - }; + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: new_candidates, - bucket_candidates, - })); + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates, + })); + } }, (None, Allowed(_)) => { let candidates = take(&mut self.candidates).into_inner(); @@ -135,7 +141,8 @@ impl<'t> Criterion for Typo<'t> { Some(parent) => { match parent.next()? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; + self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); + self.number_typos = 0; self.candidates = Candidates::Allowed(candidates); self.bucket_candidates = bucket_candidates; }, @@ -147,8 +154,6 @@ impl<'t> Criterion for Typo<'t> { }, } } - - Ok(None) } } From fb7e6df790d840e276a24dbc0d1e855f007692e2 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 24 Feb 2021 10:25:22 +0100 Subject: [PATCH 0518/1889] add tests on typo criterion --- milli/src/search/criteria/mod.rs | 137 +++++++++++++++++++++++++++ milli/src/search/criteria/typo.rs | 150 ++++++++++++++++++++++++++++++ milli/src/search/query_tree.rs | 8 +- 3 files changed, 293 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3673aef78..77d92f6ea 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,6 +14,7 @@ pub trait Criterion { } /// The result of a call to the parent criterion. +#[derive(Debug, Clone, PartialEq)] pub struct CriterionResult { /// The query tree that must be used by the children criterion to fetch candidates. 
pub query_tree: Option, @@ -188,3 +189,139 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p }, } } + +#[cfg(test)] +pub mod test { + use maplit::hashmap; + use rand::{Rng, SeedableRng, rngs::StdRng}; + + use super::*; + use std::collections::HashMap; + + fn s(s: &str) -> String { s.to_string() } + pub struct TestContext<'t> { + words_fst: fst::Set>, + word_docids: HashMap, + word_prefix_docids: HashMap, + word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + } + + impl<'a> Context for TestContext<'a> { + fn word_docids(&self, word: &str) -> heed::Result> { + Ok(self.word_docids.get(&word.to_string()).cloned()) + } + + fn word_prefix_docids(&self, word: &str) -> heed::Result> { + Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) + } + + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left.to_string(), right.to_string(), proximity.into()); + Ok(self.word_pair_proximity_docids.get(&key).cloned()) + } + + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left.to_string(), right.to_string(), proximity.into()); + Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) + } + + fn words_fst<'t>(&self) -> &'t fst::Set> { + &self.words_fst + } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.word_prefix_docids.contains_key(&word.to_string()) + } + } + + impl<'a> Default for TestContext<'a> { + fn default() -> TestContext<'a> { + let mut rng = StdRng::seed_from_u64(102); + let rng = &mut rng; + + fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { + let mut values = Vec::::with_capacity(len); + while values.len() != len { + values.push(rng.gen()); + } + values.sort_unstable(); + + RoaringBitmap::from_sorted_iter(values.into_iter()) + } + + let word_docids = hashmap!{ + s("hello") => random_postings(rng, 1500), + s("hi") => random_postings(rng, 4000), + s("word") => random_postings(rng, 2500), + s("split") => random_postings(rng, 400), + s("ngrams") => random_postings(rng, 1400), + s("world") => random_postings(rng, 15_000), + s("earth") => random_postings(rng, 8000), + s("2021") => random_postings(rng, 100), + s("2020") => random_postings(rng, 500), + s("is") => random_postings(rng, 50_000), + s("this") => random_postings(rng, 50_000), + s("good") => random_postings(rng, 1250), + s("morning") => random_postings(rng, 125), + }; + + let word_prefix_docids = hashmap!{ + s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], + s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], + s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], + }; + + let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; + let hello_world_split = (hello_world.len() / 2) as usize; + let hello_world_1 = hello_world.iter().take(hello_world_split).collect(); + let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); + + let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; + let hello_word_split = (hello_word.len() / 2) as usize; + let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); + let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); + let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); + let word_pair_proximity_docids = hashmap!{ + (s("good"), 
s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")], + (s("hello"), s("world"), 1) => hello_world_1, + (s("hello"), s("world"), 4) => hello_world_2, + (s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")], + (s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], + (s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), + (s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], + (s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), + (s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")], + (s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")], + (s("hello"), s("word"), 4) => hello_word_4, + (s("hello"), s("word"), 6) => hello_word_6, + (s("hello"), s("word"), 7) => hello_word_7, + (s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")], + (s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], + (s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")], + (s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], + }; + + let word_prefix_pair_proximity_docids = hashmap!{ + (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(), + (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), + (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), + (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), + (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), + (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), + }; + + let mut keys = word_docids.keys().collect::>(); + keys.sort_unstable(); + let words_fst = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap(); + + TestContext { + words_fst, + word_docids, + word_prefix_docids, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + } + } + } +} diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 75f3f5666..d9a5f8aa6 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -328,3 +328,153 @@ fn resolve_candidates<'t>( resolve_operation(ctx, query_tree, number_typos, cache) } + +#[cfg(test)] +mod test { + + use super::*; + use super::super::test::TestContext; + + #[test] + fn initial_placeholder_no_facets() { + let context = TestContext::default(); + let query_tree = None; + let facet_candidates = None; + + let mut criteria = Typo::initial(&context, query_tree, facet_candidates).unwrap(); + + assert!(criteria.next().unwrap().is_none()); + } + + #[test] + fn initial_query_tree_no_facets() { + let 
context = TestContext::default(); + let query_tree = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), + ]) + ]); + + let facet_candidates = None; + + let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates).unwrap(); + + let candidates_1 = context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("world").unwrap().unwrap(); + let expected_1 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ])), + candidates: candidates_1.clone(), + bucket_candidates: Some(candidates_1), + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_1)); + + let candidates_2 = ( + context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap() + ) - context.word_docids("world").unwrap().unwrap(); + let expected_2 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ]), + ])), + candidates: candidates_2.clone(), + bucket_candidates: Some(candidates_2), + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_2)); + } + + #[test] + fn initial_placeholder_with_facets() { + let context = TestContext::default(); + let query_tree = None; + let facet_candidates = context.word_docids("earth").unwrap(); + + let mut criteria = Typo::initial(&context, query_tree, facet_candidates.clone()).unwrap(); + + let expected = CriterionResult { + query_tree: None, + candidates: facet_candidates.clone().unwrap(), + bucket_candidates: facet_candidates, + }; + + // first iteration, returns the facet candidates + assert_eq!(criteria.next().unwrap(), Some(expected)); + + // second iteration, returns None because there is no more things to do + assert!(criteria.next().unwrap().is_none()); + } + + #[test] + fn initial_query_tree_with_facets() { + let context = TestContext::default(); + let query_tree = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), + ]) + ]); + + let facet_candidates = context.word_docids("earth").unwrap().unwrap(); + + let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())).unwrap(); + + let candidates_1 = context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & 
context.word_docids("world").unwrap().unwrap(); + let expected_1 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ])), + candidates: &candidates_1 & &facet_candidates, + bucket_candidates: Some(candidates_1 & &facet_candidates), + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_1)); + + let candidates_2 = ( + context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap() + ) - context.word_docids("world").unwrap().unwrap(); + let expected_2 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ]), + ])), + candidates: &candidates_2 & &facet_candidates, + bucket_candidates: Some(candidates_2 & &facet_candidates), + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_2)); + } + +} diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 02f6dc0c8..715a4864e 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -94,11 +94,15 @@ pub enum QueryKind { } impl QueryKind { - fn exact(word: String) -> Self { + pub fn exact(word: String) -> Self { QueryKind::Exact { original_typo: 0, word } } - fn tolerant(typo: u8, word: String) -> Self { + pub fn exact_with_typo(original_typo: u8, word: String) -> Self { + QueryKind::Exact { original_typo, word } + } + + pub fn tolerant(typo: u8, word: String) -> Self { QueryKind::Tolerant { typo, word } } From 64688b3786561804ec25c0a69a402d64e5456863 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 14:59:40 +0100 Subject: [PATCH 0519/1889] fix query tree builder --- milli/src/search/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 6046cc8d2..1e3e8eefe 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -66,12 +66,12 @@ impl<'a> Search<'a> { let before = Instant::now(); let query_tree = match self.query.as_ref() { Some(query) => { - let builder = QueryTreeBuilder::new(self.rtxn, self.index); + let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); let stop_words = &Set::default(); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(false, true, tokens) + builder.optional_words(false).build(tokens) }, None => None, }; From d92ad5640ae188ace90517687fedea5041c48bc1 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 16:14:38 +0100 Subject: [PATCH 0520/1889] remove option on bucket_candidates --- milli/src/search/criteria/mod.rs | 2 +- milli/src/search/criteria/typo.rs | 30 +++++++++++++++--------------- milli/src/search/mod.rs | 4 +--- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/mod.rs 
b/milli/src/search/criteria/mod.rs index 77d92f6ea..e7549cae1 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -21,7 +21,7 @@ pub struct CriterionResult { /// The candidates that this criterion is allowed to return subsets of. pub candidates: RoaringBitmap, /// Candidates that comes from the current bucket of the initial criterion. - pub bucket_candidates: Option, + pub bucket_candidates: RoaringBitmap, } /// Either a set of candidates that defines the candidates diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index d9a5f8aa6..a9d3b5d96 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -12,7 +12,7 @@ pub struct Typo<'t> { query_tree: Option<(usize, Operation)>, number_typos: u8, candidates: Candidates, - bucket_candidates: Option, + bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, @@ -30,7 +30,7 @@ impl<'t> Typo<'t> { query_tree: query_tree.map(|op| (maximum_typo(&op), op)), number_typos: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), typo_cache: HashMap::new(), @@ -47,7 +47,7 @@ impl<'t> Typo<'t> { query_tree: None, number_typos: 0, candidates: Candidates::default(), - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), typo_cache: HashMap::new(), @@ -85,8 +85,8 @@ impl<'t> Criterion for Typo<'t> { self.number_typos += 1; let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), }; return Ok(Some(CriterionResult { @@ -117,8 +117,8 @@ impl<'t> Criterion for Typo<'t> { self.number_typos += 1; let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), }; return Ok(Some(CriterionResult { @@ -133,7 +133,7 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, (None, Forbidden(_)) => { @@ -373,7 +373,7 @@ mod test { ]), ])), candidates: candidates_1.clone(), - bucket_candidates: Some(candidates_1), + bucket_candidates: candidates_1, }; assert_eq!(criteria.next().unwrap(), Some(expected_1)); @@ -395,7 +395,7 @@ mod test { ]), ])), candidates: candidates_2.clone(), - bucket_candidates: Some(candidates_2), + bucket_candidates: candidates_2, }; assert_eq!(criteria.next().unwrap(), Some(expected_2)); @@ -405,13 +405,13 @@ mod test { fn initial_placeholder_with_facets() { let context = TestContext::default(); let query_tree = None; - let facet_candidates = context.word_docids("earth").unwrap(); + let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criteria = Typo::initial(&context, query_tree, facet_candidates.clone()).unwrap(); + let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())).unwrap(); let expected = CriterionResult { query_tree: None, - candidates: facet_candidates.clone().unwrap(), + candidates: facet_candidates.clone(), bucket_candidates: 
facet_candidates, }; @@ -449,7 +449,7 @@ mod test { ]), ])), candidates: &candidates_1 & &facet_candidates, - bucket_candidates: Some(candidates_1 & &facet_candidates), + bucket_candidates: candidates_1 & &facet_candidates, }; assert_eq!(criteria.next().unwrap(), Some(expected_1)); @@ -471,7 +471,7 @@ mod test { ]), ])), candidates: &candidates_2 & &facet_candidates, - bucket_candidates: Some(candidates_2 & &facet_candidates), + bucket_candidates: candidates_2 & &facet_candidates, }; assert_eq!(criteria.next().unwrap(), Some(expected_2)); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 1e3e8eefe..7a8bbdc09 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -100,9 +100,7 @@ impl<'a> Search<'a> { let mut len = candidates.len() as usize; let mut candidates = candidates.into_iter(); - if let Some(docids) = bucket_candidates { - initial_candidates.union_with(&docids); - } + initial_candidates.union_with(&bucket_candidates); if offset != 0 { candidates.by_ref().skip(offset).for_each(drop); From 2d068bd45b61da527c1bf12a866b58ba5d895222 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 17 Feb 2021 15:27:35 +0100 Subject: [PATCH 0521/1889] implement Context trait for criteria --- milli/src/search/criteria/typo.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a9d3b5d96..659292619 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -181,6 +181,7 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache)) }, Operation::Query(q) => { + // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, // and directly create an Exact query From 1e47f9b3ffa7f55071274d95d542bb183c7862cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 15:32:14 +0100 Subject: [PATCH 0522/1889] Introduce the Words criterion --- milli/src/search/criteria/mod.rs | 1 + milli/src/search/criteria/typo.rs | 4 +- milli/src/search/criteria/words.rs | 109 +++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 milli/src/search/criteria/words.rs diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index e7549cae1..5cc803dee 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -8,6 +8,7 @@ use roaring::RoaringBitmap; use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; +pub mod words; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 659292619..a62616f08 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -13,7 +13,7 @@ pub struct Typo<'t> { number_typos: u8, candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, } @@ -39,7 +39,7 @@ impl<'t> Typo<'t> { pub fn new( ctx: &'t dyn Context, - parent: Box, + parent: Box, ) -> anyhow::Result where Self: Sized { Ok(Typo { diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs new file mode 100644 index 000000000..ade370fda --- /dev/null +++ b/milli/src/search/criteria/words.rs @@ -0,0 +1,109 @@ +use 
std::collections::HashMap; +use std::mem::take; + +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use super::{Candidates, Criterion, CriterionResult, Context}; + +pub struct Words<'t> { + ctx: &'t dyn Context, + query_trees: Vec, + candidates: Candidates, + bucket_candidates: RoaringBitmap, + parent: Option>, + candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, +} + +impl<'t> Words<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> anyhow::Result where Self: Sized + { + Ok(Words { + ctx, + query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + bucket_candidates: RoaringBitmap::new(), + parent: None, + candidates_cache: HashMap::default(), + }) + } + + pub fn new( + ctx: &'t dyn Context, + parent: Box, + ) -> anyhow::Result where Self: Sized + { + Ok(Words { + ctx, + query_trees: Vec::default(), + candidates: Candidates::default(), + bucket_candidates: RoaringBitmap::new(), + parent: Some(parent), + candidates_cache: HashMap::default(), + }) + } +} + +impl<'t> Criterion for Words<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + + loop { + match (self.query_trees.pop(), &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + self.query_trees = Vec::new(); + self.candidates = Candidates::default(); + }, + (Some(qt), Allowed(candidates)) => { + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(qt), + candidates: candidates.clone(), + bucket_candidates, + })); + }, + (Some(_qt), Forbidden(candidates)) => { + todo!() + }, + (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: candidates.clone(), + bucket_candidates: candidates, + })); + }, + (None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? 
{ + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); + self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + } +} + +fn explode_query_tree(query_tree: Operation) -> Vec { + match query_tree { + Operation::Or(true, ops) => ops, + otherwise => vec![otherwise], + } +} From e174ccbd8e20a6b357430cd876b56ad7fd67420c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Feb 2021 17:22:43 +0100 Subject: [PATCH 0523/1889] Use the words criterion in the search module --- milli/src/search/criteria/words.rs | 2 +- milli/src/search/mod.rs | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index ade370fda..bf3aa8b12 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -70,7 +70,7 @@ impl<'t> Criterion for Words<'t> { bucket_candidates, })); }, - (Some(_qt), Forbidden(candidates)) => { + (Some(_qt), Forbidden(_candidates)) => { todo!() }, (None, Allowed(_)) => { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7a8bbdc09..2a726f635 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,8 +11,8 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::typo::Typo; -use crate::{Index, DocumentId}; +use crate::search::criteria::{typo::Typo, words::Words}; +use crate::{Index, FieldId, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetIter}; @@ -71,7 +71,7 @@ impl<'a> Search<'a> { let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.optional_words(false).build(tokens) + builder.build(tokens) }, None => None, }; @@ -89,7 +89,8 @@ impl<'a> Search<'a> { // We aretesting the typo criteria but there will be more of them soon. 
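The criteria introduced in these patches all follow the same shape: a criterion either drains the state it was seeded with (the initial constructor) or pulls its next bucket from a boxed parent (the new constructor), which is how Words::new(&criteria_ctx, Box::new(typo_criterion)) chains them below. A minimal, self-contained sketch of that chaining pattern, using plain Vec<u32> ids instead of RoaringBitmap and a made-up Halver rule instead of one of the real criteria:

struct Bucket {
    candidates: Vec<u32>,
}

trait Criterion {
    fn next(&mut self) -> Option<Bucket>;
}

// A leaf criterion that yields its seeded candidates exactly once.
struct Initial {
    candidates: Option<Vec<u32>>,
}

impl Criterion for Initial {
    fn next(&mut self) -> Option<Bucket> {
        self.candidates.take().map(|candidates| Bucket { candidates })
    }
}

// A chained criterion that refines whatever its parent produces.
struct Halver {
    parent: Box<dyn Criterion>,
}

impl Criterion for Halver {
    fn next(&mut self) -> Option<Bucket> {
        let bucket = self.parent.next()?;
        // Keep only even ids, standing in for a real ranking rule.
        let candidates = bucket.candidates.into_iter().filter(|id| id % 2 == 0).collect();
        Some(Bucket { candidates })
    }
}

fn main() {
    let initial = Initial { candidates: Some((0..8).collect()) };
    let mut criteria = Halver { parent: Box::new(initial) };
    while let Some(bucket) = criteria.next() {
        println!("bucket: {:?}", bucket.candidates);
    }
}

The real trait returns anyhow::Result<Option<CriterionResult>> rather than Option<Bucket>, so a criterion can keep iterating on its own state (typo count, word count, ...) several times before asking its parent for more candidates.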
let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; - let mut criteria = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; + let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; + let mut criteria = Words::new(&criteria_ctx, Box::new(typo_criterion))?; let mut offset = self.offset; let mut limit = self.limit; From ef381e17bbd4647ae9894f6d56f5db87fce2f861 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 11:20:42 +0100 Subject: [PATCH 0524/1889] Compute the candidates for each sub query tree --- milli/src/search/criteria/words.rs | 77 +++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index bf3aa8b12..93298b64e 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,10 +1,11 @@ use std::collections::HashMap; use std::mem::take; +use anyhow::bail; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use super::{Candidates, Criterion, CriterionResult, Context}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -64,9 +65,13 @@ impl<'t> Criterion for Words<'t> { None => candidates.clone(), }; + let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + found_candidates.intersect_with(&candidates); + candidates.difference_with(&found_candidates); + return Ok(Some(CriterionResult { query_tree: Some(qt), - candidates: candidates.clone(), + candidates: found_candidates, bucket_candidates, })); }, @@ -107,3 +112,71 @@ fn explode_query_tree(query_tree: Operation) -> Vec { otherwise => vec![otherwise], } } + +fn resolve_candidates<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, +) -> anyhow::Result +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + ) -> anyhow::Result + { + use Operation::{And, Consecutive, Or, Query}; + + match query_tree { + And(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for op in ops { + let docids = resolve_operation(ctx, op, cache)?; + if first_loop { + candidates = docids; + first_loop = false; + } else { + candidates.intersect_with(&docids); + } + } + Ok(candidates) + }, + Consecutive(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for slice in ops.windows(2) { + match (&slice[0], &slice[1]) { + (Operation::Query(left), Operation::Query(right)) => { + match query_pair_proximity_docids(ctx, left, right, 1)? 
{ + pair_docids if pair_docids.is_empty() => { + return Ok(RoaringBitmap::new()) + }, + pair_docids if first_loop => { + candidates = pair_docids; + first_loop = false; + }, + pair_docids => { + candidates.intersect_with(&pair_docids); + }, + } + }, + _ => bail!("invalid consecutive query type"), + } + } + Ok(candidates) + }, + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(ctx, op, cache)?; + candidates.union_with(&docids); + } + Ok(candidates) + }, + Query(q) => Ok(query_docids(ctx, q)?), + } + } + + resolve_operation(ctx, query_tree, cache) +} From 3415812b06291f5ee7504513f76dbcf411823c26 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 14:48:12 +0100 Subject: [PATCH 0525/1889] Imrpove the intersection speed in the words criterion --- milli/src/search/criteria/words.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 93298b64e..cf1668055 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -129,10 +129,15 @@ fn resolve_candidates<'t>( match query_tree { And(ops) => { + let mut ops = ops.iter().map(|op| { + resolve_operation(ctx, op, cache) + }).collect::>>()?; + + ops.sort_unstable_by_key(|cds| cds.len()); + let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for op in ops { - let docids = resolve_operation(ctx, op, cache)?; + for docids in ops { if first_loop { candidates = docids; first_loop = false; From b5b7ec0162bd5c3613f3281cdb29becb1c792758 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 24 Feb 2021 15:59:19 +0100 Subject: [PATCH 0526/1889] implement initial state for words criterion --- milli/src/search/criteria/words.rs | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index cf1668055..bd03ecf97 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -60,23 +60,36 @@ impl<'t> Criterion for Words<'t> { self.candidates = Candidates::default(); }, (Some(qt), Allowed(candidates)) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => candidates.clone(), - }; - let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => found_candidates.clone(), + }; + return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: found_candidates, bucket_candidates, })); }, - (Some(_qt), Forbidden(_candidates)) => { - todo!() + (Some(qt), Forbidden(candidates)) => { + let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + found_candidates.difference_with(&candidates); + candidates.union_with(&found_candidates); + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => found_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(qt), + candidates: found_candidates, + bucket_candidates, + })); }, (None, Allowed(_)) => { let candidates = take(&mut self.candidates).into_inner(); From 14f9f85c4b2d30ff79ab31562ddade10c571d849 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 19 Feb 2021 15:45:15 
+0100 Subject: [PATCH 0527/1889] Introduce the AscDesc criterion --- milli/src/search/criteria/asc_desc.rs | 272 ++++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 2 + milli/src/search/mod.rs | 15 +- 3 files changed, 285 insertions(+), 4 deletions(-) create mode 100644 milli/src/search/criteria/asc_desc.rs diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs new file mode 100644 index 000000000..bf75ada7e --- /dev/null +++ b/milli/src/search/criteria/asc_desc.rs @@ -0,0 +1,272 @@ +use std::mem::take; + +use anyhow::bail; +use itertools::Itertools; +use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec}; +use crate::search::facet::FacetIter; +use crate::search::query_tree::Operation; +use crate::{FieldId, Index}; +use super::{Candidates, Criterion, CriterionResult}; + +pub struct AscDesc<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + query_tree: Option, + candidates: Candidates, + bucket_candidates: Option, + faceted_candidates: RoaringBitmap, + parent: Option>, +} + +impl<'t> AscDesc<'t> { + pub fn initial_asc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result where Self: Sized + { + Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, true) + } + + pub fn initial_desc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result where Self: Sized + { + Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, false) + } + + pub fn asc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result where Self: Sized + { + Self::new(index, rtxn, parent, field_id, facet_type, true) + } + + pub fn desc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result where Self: Sized + { + Self::new(index, rtxn, parent, field_id, facet_type, false) + } + + fn initial( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + ) -> anyhow::Result where Self: Sized + { + Ok(AscDesc { + index, + rtxn, + field_id, + facet_type, + ascending, + query_tree, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, + bucket_candidates: None, + parent: None, + }) + } + + fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + ) -> anyhow::Result where Self: Sized + { + Ok(AscDesc { + index, + rtxn, + field_id, + facet_type, + ascending, + query_tree: None, + candidates: Candidates::default(), + faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, + bucket_candidates: None, + parent: Some(parent), + }) + } +} + +impl<'t> Criterion for AscDesc<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + + loop { + match (&mut self.query_tree, &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => 
{ + self.query_tree = None; + self.candidates = Candidates::default(); + }, + (Some(qt), Allowed(candidates)) => { + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(candidates.clone()), + }; + + let mut found_candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.facet_type, + self.ascending, + candidates.clone(), + )?; + + found_candidates.intersect_with(&candidates); + candidates.difference_with(&found_candidates); + + return Ok(Some(CriterionResult { + query_tree: Some(qt.clone()), + candidates: found_candidates, + bucket_candidates, + })); + }, + (Some(_qt), Forbidden(_candidates)) => { + todo!() + }, + (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + (None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { + self.query_tree = query_tree; + candidates.intersect_with(&self.faceted_candidates); + self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + } +} + +fn facet_ordered( + index: &Index, + rtxn: &heed::RoTxn, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + candidates: RoaringBitmap, +) -> anyhow::Result +{ + match facet_type { + FacetType::Float => { + if candidates.len() <= 1000 { + let db = index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, f64::MIN); + let right = (field_id, docid, f64::MAX); + let mut iter = db.range(rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? { + docids_values.push((docid, OrderedFloat(value))); + } + } + docids_values.sort_unstable_by_key(|(_, value)| *value); + let iter = docids_values.into_iter(); + let iter = if ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + match iter.group_by(|(_, v)| *v).into_iter().next() { + Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()), + None => Ok(RoaringBitmap::new()) + } + } else { + let facet_fn = if ascending { + FacetIter::::new_reducing + } else { + FacetIter::::new_reverse_reducing + }; + + let mut iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default()) + } + }, + FacetType::Integer => { + if candidates.len() <= 1000 { + let db = index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, i64::MIN); + let right = (field_id, docid, i64::MAX); + let mut iter = db.range(rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? 
{ + docids_values.push((docid, value)); + } + } + docids_values.sort_unstable_by_key(|(_, value)| *value); + let iter = docids_values.into_iter(); + let iter = if ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + match iter.group_by(|(_, v)| *v).into_iter().next() { + Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()), + None => Ok(RoaringBitmap::new()) + } + } else { + let facet_fn = if ascending { + FacetIter::::new_reducing + } else { + FacetIter::::new_reverse_reducing + }; + + let mut iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default()) + } + }, + FacetType::String => bail!("criteria facet type must be a number"), + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5cc803dee..34d06dce3 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -9,6 +9,7 @@ use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub mod words; +pub mod asc_desc; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; @@ -28,6 +29,7 @@ pub struct CriterionResult { /// Either a set of candidates that defines the candidates /// that are allowed to be returned, /// or the candidates that must never be returned. +#[derive(Debug)] enum Candidates { Allowed(RoaringBitmap), Forbidden(RoaringBitmap) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2a726f635..93cac34b6 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,11 +11,11 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words}; -use crate::{Index, FieldId, DocumentId}; +use crate::search::criteria::{typo::Typo, words::Words, asc_desc::AscDesc}; +use crate::{Index, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; -pub use self::facet::{FacetIter}; +pub use self::facet::FacetIter; use self::query_tree::QueryTreeBuilder; // Building these factories is not free. @@ -90,7 +90,14 @@ impl<'a> Search<'a> { // We aretesting the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; - let mut criteria = Words::new(&criteria_ctx, Box::new(typo_criterion))?; + let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; + + // We sort in descending order on a specific field *by hand*, don't do that at home. 
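The heart of this patch is facet_ordered above: when there are at most 1000 candidates it reads each document's facet value directly, sorts the pairs, and keeps only the documents sharing the best value; that group becomes the bucket and the rest stays behind for later iterations. A std-only sketch of that first-bucket selection, assuming the (docid, value) pairs were already fetched from field_id_docid_facet_values, and using a stable sort so the output below is deterministic:

// Return the first bucket: every document sharing the smallest
// (ascending) or largest (descending) facet value.
fn first_facet_bucket(mut values: Vec<(u32, i64)>, ascending: bool) -> Vec<u32> {
    values.sort_by_key(|(_, value)| *value);
    if !ascending {
        values.reverse();
    }
    let best = match values.first() {
        Some((_, value)) => *value,
        None => return Vec::new(),
    };
    values
        .iter()
        .take_while(|(_, value)| *value == best)
        .map(|(docid, _)| *docid)
        .collect()
}

fn main() {
    // Hypothetical (docid, released-timestamp) pairs.
    let values = vec![(1, 2010), (2, 1999), (3, 2010), (4, 1999)];
    assert_eq!(first_facet_bucket(values.clone(), true), vec![2, 4]);
    assert_eq!(first_facet_bucket(values, false), vec![3, 1]);
}

Above the 1000-candidate threshold the code switches to the level-based FacetIter instead, which walks the precomputed facet levels rather than touching every candidate document.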
+ let attr_name = "released-timestamp"; + let fid = self.index.fields_ids_map(self.rtxn)?.id(attr_name).unwrap(); + let ftype = *self.index.faceted_fields(self.rtxn)?.get(attr_name).unwrap(); + let desc_criterion = AscDesc::desc(self.index, self.rtxn, Box::new(words_criterion), fid, ftype)?; + let mut criteria = desc_criterion; let mut offset = self.offset; let mut limit = self.limit; From 3d731cc861a1799e0b14288443a2e7668d7bb83b Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 16:47:34 +0100 Subject: [PATCH 0528/1889] remove option on bucket_candidates --- milli/src/search/criteria/asc_desc.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index bf75ada7e..3d32bd845 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -21,7 +21,7 @@ pub struct AscDesc<'t> { ascending: bool, query_tree: Option, candidates: Candidates, - bucket_candidates: Option, + bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, parent: Option>, } @@ -92,7 +92,7 @@ impl<'t> AscDesc<'t> { query_tree, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: None, }) } @@ -115,7 +115,7 @@ impl<'t> AscDesc<'t> { query_tree: None, candidates: Candidates::default(), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: Some(parent), }) } @@ -133,8 +133,8 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), Allowed(candidates)) => { let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => candidates.clone(), }; let mut found_candidates = facet_ordered( @@ -163,7 +163,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, (None, Forbidden(_)) => { From 22b84fe543c5f0d34d80a39db22e4c564f410378 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 12:12:35 +0100 Subject: [PATCH 0529/1889] Use the words criterion in the search module --- milli/src/search/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 93cac34b6..51f81f540 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -66,12 +66,12 @@ impl<'a> Search<'a> { let before = Instant::now(); let query_tree = match self.query.as_ref() { Some(query) => { - let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); + let builder = QueryTreeBuilder::new(self.rtxn, self.index); let stop_words = &Set::default(); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(tokens) + builder.build(tokens)? 
}, None => None, }; From 9bc9b366450b76861156b082205a950f58cc7f66 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 22 Feb 2021 17:17:01 +0100 Subject: [PATCH 0530/1889] Introduce the Proximity criterion --- milli/src/search/criteria/mod.rs | 1 + milli/src/search/criteria/proximity.rs | 283 +++++++++++++++++++++++++ milli/src/search/mod.rs | 18 +- milli/src/search/query_tree.rs | 7 + 4 files changed, 301 insertions(+), 8 deletions(-) create mode 100644 milli/src/search/criteria/proximity.rs diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 34d06dce3..41cd6722c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -10,6 +10,7 @@ use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub mod words; pub mod asc_desc; +pub mod proximity; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs new file mode 100644 index 000000000..2a46cf5d0 --- /dev/null +++ b/milli/src/search/criteria/proximity.rs @@ -0,0 +1,283 @@ +use std::collections::HashMap; +use std::mem::take; + +use roaring::RoaringBitmap; + +use crate::search::query_tree::{maximum_proximity, Operation, Query}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; + +pub struct Proximity<'t> { + ctx: &'t dyn Context, + query_tree: Option<(usize, Operation)>, + proximity: u8, + candidates: Candidates, + bucket_candidates: Option, + parent: Option>, + candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, +} + +impl<'t> Proximity<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> anyhow::Result where Self: Sized + { + Ok(Proximity { + ctx, + query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), + proximity: 0, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + bucket_candidates: None, + parent: None, + candidates_cache: HashMap::new(), + }) + } + + pub fn new( + ctx: &'t dyn Context, + parent: Box, + ) -> anyhow::Result where Self: Sized + { + Ok(Proximity { + ctx, + query_tree: None, + proximity: 0, + candidates: Candidates::default(), + bucket_candidates: None, + parent: Some(parent), + candidates_cache: HashMap::new(), + }) + } +} + +impl<'t> Criterion for Proximity<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + loop { + match (&mut self.query_tree, &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + self.query_tree = None; + self.candidates = Candidates::default(); + }, + (Some((max_prox, query_tree)), Allowed(candidates)) => { + if self.proximity as usize > *max_prox { + self.query_tree = None; + self.candidates = Candidates::default(); + } else { + let mut new_candidates = resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + )?; + + new_candidates.intersect_with(&candidates); + candidates.difference_with(&new_candidates); + self.proximity += 1; + + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree.clone()), + candidates: new_candidates, + bucket_candidates, + })); + } + }, + (Some((max_prox, query_tree)), Forbidden(candidates)) => { + if self.proximity as usize > *max_prox { + self.query_tree = None; + self.candidates = 
Candidates::default(); + } else { + let mut new_candidates = resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + )?; + + new_candidates.difference_with(&candidates); + candidates.union_with(&new_candidates); + self.proximity += 1; + + let bucket_candidates = match self.parent { + Some(_) => self.bucket_candidates.take(), + None => Some(new_candidates.clone()), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree.clone()), + candidates: new_candidates, + bucket_candidates, + })); + } + }, + (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + (None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); + self.proximity = 0; + self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + } +} + +fn resolve_candidates<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, +) -> anyhow::Result +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result> + { + use Operation::{And, Consecutive, Or, Query}; + + let result = match query_tree { + And(ops) => mdfs(ctx, ops, proximity, cache)?, + Consecutive(ops) => if proximity == 0 { + mdfs(ctx, ops, 0, cache)? 
+ } else { + Default::default() + }, + Or(_, ops) => { + let mut output = Vec::new(); + for op in ops { + let result = resolve_operation(ctx, op, proximity, cache)?; + output.extend(result); + } + output + }, + Query(q) => if proximity == 0 { + let candidates = query_docids(ctx, q)?; + vec![(q.clone(), q.clone(), candidates)] + } else { + Default::default() + }, + }; + + Ok(result) + } + + fn mdfs_pair<'t>( + ctx: &'t dyn Context, + left: &Operation, + right: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result> + { + fn pair_combinations(mana: u8) -> impl Iterator { + (0..=mana).map(move |m| (mana - m, m)) + } + + let mut output = Vec::new(); + + for (pair_p, left_right_p) in pair_combinations(proximity) { + for (left_p, right_p) in pair_combinations(left_right_p) { + let left_key = (left.clone(), left_p); + if !cache.contains_key(&left_key) { + let candidates = resolve_operation(ctx, left, left_p, cache)?; + cache.insert(left_key.clone(), candidates); + } + + let right_key = (right.clone(), right_p); + if !cache.contains_key(&right_key) { + let candidates = resolve_operation(ctx, right, right_p, cache)?; + cache.insert(right_key.clone(), candidates); + } + + let lefts = cache.get(&left_key).unwrap(); + let rights = cache.get(&right_key).unwrap(); + + for (ll, lr, lcandidates) in lefts { + for (rl, rr, rcandidates) in rights { + let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1)?; + if lcandidates.len() < rcandidates.len() { + candidates.intersect_with(lcandidates); + candidates.intersect_with(rcandidates); + } else { + candidates.intersect_with(rcandidates); + candidates.intersect_with(lcandidates); + } + if !candidates.is_empty() { + output.push((ll.clone(), rr.clone(), candidates)); + } + } + } + } + } + + Ok(output) + } + + fn mdfs<'t>( + ctx: &'t dyn Context, + branches: &[Operation], + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result> + { + // Extract the first two elements but gives the tail + // that is just after the first element. + let next = branches.split_first().map(|(h1, t)| { + (h1, t.split_first().map(|(h2, _)| (h2, t))) + }); + + match next { + Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache), + Some((head1, Some((head2, tail)))) => { + let mut output = Vec::new(); + for p in 0..=proximity { + for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache)? { + if !head_candidates.is_empty() { + for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache)? { + candidates.intersect_with(&head_candidates); + if !candidates.is_empty() { + output.push((lhead.clone(), rtail, candidates)); + } + } + } + } + } + Ok(output) + }, + Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache), + None => return Ok(Default::default()), + } + } + + let mut candidates = RoaringBitmap::new(); + for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache)? 
{ + candidates.union_with(&cds); + } + Ok(candidates) +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 51f81f540..aced8bbd1 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,7 +11,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words, asc_desc::AscDesc}; +use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; use crate::{Index, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; @@ -87,17 +87,19 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); - // We aretesting the typo criteria but there will be more of them soon. + // We are testing the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; + let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?; + // let proximity_criterion = Proximity::initial(&criteria_ctx, query_tree, facet_candidates)?; + let mut criteria = proximity_criterion; - // We sort in descending order on a specific field *by hand*, don't do that at home. - let attr_name = "released-timestamp"; - let fid = self.index.fields_ids_map(self.rtxn)?.id(attr_name).unwrap(); - let ftype = *self.index.faceted_fields(self.rtxn)?.get(attr_name).unwrap(); - let desc_criterion = AscDesc::desc(self.index, self.rtxn, Box::new(words_criterion), fid, ftype)?; - let mut criteria = desc_criterion; + // // We sort in descending order on a specific field *by hand*, don't do that at home. 
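Most of the work in this patch happens in resolve_candidates above: the mdfs helpers split the proximity budget (the "mana") between the proximity of a pair of branches and the proximity still available inside those branches. pair_combinations, copied here verbatim from the patch, enumerates those splits; a short standalone demonstration:

fn pair_combinations(mana: u8) -> impl Iterator<Item = (u8, u8)> {
    (0..=mana).map(move |m| (mana - m, m))
}

fn main() {
    // A budget of 2 can go entirely to the pair, be shared, or go
    // entirely to the sub-branches.
    let splits: Vec<_> = pair_combinations(2).collect();
    assert_eq!(splits, vec![(2, 0), (1, 1), (0, 2)]);

    // mdfs_pair nests two of these loops, so it explores every
    // (pair, left, right) triple whose components sum to the budget.
    for (pair_p, left_right_p) in pair_combinations(2) {
        for (left_p, right_p) in pair_combinations(left_right_p) {
            assert_eq!(pair_p + left_p + right_p, 2);
        }
    }
}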
+ // let attr_name = "released-timestamp"; + // let fid = self.index.fields_ids_map(self.rtxn)?.id(attr_name).unwrap(); + // let ftype = *self.index.faceted_fields(self.rtxn)?.get(attr_name).unwrap(); + // let desc_criterion = AscDesc::desc(self.index, self.rtxn, Box::new(words_criterion), fid, ftype)?; let mut offset = self.offset; let mut limit = self.limit; diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 715a4864e..59f7802f3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -79,6 +79,13 @@ impl Operation { Self::Consecutive(ops) } } + + pub fn query(&self) -> Option<&Query> { + match self { + Operation::Query(query) => Some(query), + _ => None, + } + } } #[derive(Clone, Eq, PartialEq, Hash)] From ae4a237e580a38b930c8422297b275cb08819aac Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 15:36:57 +0100 Subject: [PATCH 0531/1889] Fix the maximum_proximity function --- milli/src/search/criteria/mod.rs | 8 +++++++- milli/src/search/query_tree.rs | 5 ++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 41cd6722c..52367ac5f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -153,8 +153,14 @@ fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result anyhow::Result { - let prefix = right.prefix; + if proximity >= 8 { + let mut candidates = query_docids(ctx, left)?; + let right_candidates = query_docids(ctx, right)?; + candidates.intersect_with(&right_candidates); + return Ok(candidates); + } + let prefix = right.prefix; match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix && ctx.in_prefix_cache(&right) { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 59f7802f3..47057ad10 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -546,7 +546,10 @@ pub fn maximum_proximity(operation: &Operation) -> usize { use Operation::{Or, And, Query, Consecutive}; match operation { Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), - And(ops) => ops.len().saturating_sub(1) * 8, + And(ops) => { + ops.iter().map(maximum_proximity).sum::() + + ops.len().saturating_sub(1) * 7 + }, Query(_) | Consecutive(_) => 0, } } From 4510bbcccac375409e18dc5170cc0238118bffb8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 15:37:37 +0100 Subject: [PATCH 0532/1889] Add a lot of debug --- milli/src/search/criteria/proximity.rs | 7 +++++++ milli/src/search/criteria/typo.rs | 3 +++ milli/src/search/criteria/words.rs | 4 +++- milli/src/search/mod.rs | 2 ++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 2a46cf5d0..aab46a6a2 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use std::mem::take; use roaring::RoaringBitmap; +use log::debug; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; @@ -55,6 +56,12 @@ impl<'t> Criterion for Proximity<'t> { fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { + debug!("Proximity at iteration {} (max {:?}) ({:?})", + self.proximity, + self.query_tree.as_ref().map(|(mp, _)| 
mp), + self.candidates, + ); + match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { self.query_tree = None; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a62616f08..a48b074cc 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -2,6 +2,7 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; use roaring::RoaringBitmap; +use log::debug; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::word_derivations; @@ -59,6 +60,8 @@ impl<'t> Criterion for Typo<'t> { fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { + debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); + match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { self.query_tree = None; diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index bd03ecf97..3b0ecd54a 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use std::mem::take; use anyhow::bail; +use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; @@ -52,8 +53,9 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; - loop { + debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); + match (self.query_trees.pop(), &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { self.query_trees = Vec::new(); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index aced8bbd1..f3d5af2da 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -107,6 +107,8 @@ impl<'a> Search<'a> { let mut initial_candidates = RoaringBitmap::new(); while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next()? 
{ + debug!("Number of candidates found {}", candidates.len()); + let mut len = candidates.len() as usize; let mut candidates = candidates.into_iter(); From 5af63c74e04251461dd022836a3e4f38ca3df52d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 24 Feb 2021 17:44:35 +0100 Subject: [PATCH 0533/1889] Speed-up the MatchingWords highlighting struct --- http-ui/src/main.rs | 18 ++--- milli/src/lib.rs | 2 +- milli/src/search/criteria/typo.rs | 2 +- milli/src/search/mod.rs | 51 +++++++++----- milli/src/search/query_tree.rs | 111 +++++++++++++----------------- 5 files changed, 91 insertions(+), 93 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 2ce7f8bd1..86f965368 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -32,7 +32,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use milli::facet::FacetValue; use milli::update::UpdateIndexingStep::*; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; -use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; +use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -132,7 +132,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { Self { analyzer } } - fn highlight_value(&self, value: Value, words_to_highlight: &HashSet) -> Value { + fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), @@ -142,7 +142,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed = self.analyzer.analyze(&old_string); for (word, token) in analyzed.reconstruct() { if token.is_word() { - let to_highlight = words_to_highlight.contains(token.text()); + let to_highlight = matching_words.matches(token.text()); if to_highlight { string.push_str("") } string.push_str(word); if to_highlight { string.push_str("") } @@ -154,12 +154,12 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { }, Value::Array(values) => { Value::Array(values.into_iter() - .map(|v| self.highlight_value(v, words_to_highlight)) + .map(|v| self.highlight_value(v, matching_words)) .collect()) }, Value::Object(object) => { Value::Object(object.into_iter() - .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight))) + .map(|(k, v)| (k, self.highlight_value(v, matching_words))) .collect()) }, } @@ -168,14 +168,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { fn highlight_record( &self, object: &mut Map, - words_to_highlight: &HashSet, + matching_words: &MatchingWords, attributes_to_highlight: &HashSet, ) { // TODO do we need to create a string for element that are not and needs to be highlight? 
for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { let old_value = mem::take(value); - *value = self.highlight_value(old_value, words_to_highlight); + *value = self.highlight_value(old_value, matching_words); } } } @@ -722,7 +722,7 @@ async fn main() -> anyhow::Result<()> { search.facet_condition(condition); } - let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap(); + let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); let number_of_candidates = candidates.len(); let facets = if query.facet_distribution == Some(true) { @@ -748,7 +748,7 @@ async fn main() -> anyhow::Result<()> { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { - highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight); + highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight); } documents.push(object); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 0fa966ee8..75d6f9fb3 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -28,7 +28,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; -pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult}; +pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a48b074cc..0b8111997 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,8 +1,8 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; use anyhow::bail; -use roaring::RoaringBitmap; use log::debug; +use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::word_derivations; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f3d5af2da..dbb504368 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,10 +1,9 @@ use std::borrow::Cow; -use std::collections::HashSet; use std::fmt; use std::time::Instant; use fst::{IntoStreamer, Streamer, Set}; -use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; +use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder}; use log::debug; use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; @@ -14,8 +13,9 @@ use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; use crate::{Index, DocumentId}; -pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::FacetIter; +pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; +pub use self::query_tree::MatchingWords; use self::query_tree::QueryTreeBuilder; // Building these factories is not free. 
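On the http-ui side, highlight_value is a recursive walk over a serde_json::Value that rewrites matching words in strings and recurses into arrays and objects. A condensed, runnable sketch of that walk, splitting on whitespace where the real code runs the meilisearch tokenizer, taking a plain closure where it takes MatchingWords, and using a <mark> tag purely as an illustration:

use serde_json::{json, Value};

fn highlight(value: Value, matches: &dyn Fn(&str) -> bool) -> Value {
    match value {
        // Strings are rewritten word by word.
        Value::String(old) => {
            let new = old
                .split_whitespace()
                .map(|word| {
                    if matches(word) { format!("<mark>{}</mark>", word) } else { word.to_string() }
                })
                .collect::<Vec<_>>()
                .join(" ");
            Value::String(new)
        }
        // Containers are traversed recursively.
        Value::Array(values) => {
            Value::Array(values.into_iter().map(|v| highlight(v, matches)).collect())
        }
        Value::Object(object) => {
            Value::Object(object.into_iter().map(|(k, v)| (k, highlight(v, matches))).collect())
        }
        // Numbers, booleans and nulls are returned untouched.
        otherwise => otherwise,
    }
}

fn main() {
    let doc = json!({ "title": "hello world", "tags": ["world tour"] });
    println!("{}", highlight(doc, &|word| word == "world"));
}

Swapping a HashSet lookup for MatchingWords::matches leaves this structure untouched; only the word-level predicate changes, which is exactly what the patch above does.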
@@ -87,6 +87,11 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); + let matching_words = match query_tree.as_ref() { + Some(query_tree) => MatchingWords::from_query_tree(&query_tree), + None => MatchingWords::default(), + }; + // We are testing the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; @@ -128,8 +133,7 @@ impl<'a> Search<'a> { if limit == 0 { break } } - let found_words = HashSet::new(); - Ok(SearchResult { found_words, candidates: initial_candidates, documents_ids }) + Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) } } @@ -147,26 +151,21 @@ impl fmt::Debug for Search<'_> { #[derive(Default)] pub struct SearchResult { - pub found_words: HashSet, + pub matching_words: MatchingWords, pub candidates: RoaringBitmap, // TODO those documents ids should be associated with their criteria scores. pub documents_ids: Vec, } -pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>) -> anyhow::Result> { - let lev = match max_typo { - 0 => &LEVDIST0, - 1 => &LEVDIST1, - _ => &LEVDIST2, - }; - - let dfa = if is_prefix { - lev.build_prefix_dfa(&word) - } else { - lev.build_dfa(&word) - }; - +pub fn word_derivations( + word: &str, + is_prefix: bool, + max_typo: u8, + fst: &fst::Set>, +) -> anyhow::Result> +{ let mut derived_words = Vec::new(); + let dfa = build_dfa(word, max_typo, is_prefix); let mut stream = fst.search_with_state(&dfa).into_stream(); while let Some((word, state)) = stream.next() { @@ -177,3 +176,17 @@ pub fn word_derivations(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Se Ok(derived_words) } + +pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { + let lev = match typos { + 0 => &LEVDIST0, + 1 => &LEVDIST1, + _ => &LEVDIST2, + }; + + if is_prefix { + lev.build_prefix_dfa(word) + } else { + lev.build_dfa(word) + } +} diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 47057ad10..114032eb8 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,12 +1,13 @@ -use std::borrow::Cow; -use std::collections::BTreeMap; +use std::collections::HashSet; use std::{fmt, cmp, mem}; +use levenshtein_automata::{DFA, Distance}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use roaring::RoaringBitmap; use slice_group_by::GroupBy; use crate::Index; +use super::build_dfa; type IsOptionalWord = bool; type IsPrefix = bool; @@ -113,6 +114,14 @@ impl QueryKind { QueryKind::Tolerant { typo, word } } + pub fn is_tolerant(&self) -> bool { + matches!(self, QueryKind::Tolerant { .. }) + } + + pub fn is_exact(&self) -> bool { + matches!(self, QueryKind::Exact { .. }) + } + pub fn typo(&self) -> u8 { match self { QueryKind::Tolerant { typo, .. } => *typo, @@ -275,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result + dfas: Vec<(DFA, u8)>, } impl MatchingWords { /// List all words which can be considered as a match for the query tree. - pub fn from_query_tree(tree: &Operation, fst: &fst::Set>) -> Self { - Self { inner: fetch_words(tree, fst).into_iter().collect() } + pub fn from_query_tree(tree: &Operation) -> Self { + Self { + dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect() + } } /// Return true if the word match. 
- pub fn is_match(&self, word: &str) -> bool { - fn first_char(s: &str) -> Option<&str> { - s.chars().next().map(|c| &s[..c.len_utf8()]) - } - - match first_char(word) { - Some(first) => { - let left = first.to_owned(); - let right = word.to_owned(); - self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word) - }, - None => false - } + pub fn matches(&self, word: &str) -> bool { + self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) { + Distance::Exact(t) => t <= *typo, + Distance::AtLeast(_) => false, + }) } } -type FetchedWords = Vec<(String, IsPrefix)>; - /// Lists all words which can be considered as a match for the query tree. -fn fetch_words(tree: &Operation, fst: &fst::Set>) -> FetchedWords { - fn resolve_branch(tree: &[Operation], fst: &fst::Set>) -> FetchedWords { - tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect() - } - - fn resolve_query(query: &Query, fst: &fst::Set>) -> FetchedWords { - match query.kind.clone() { - QueryKind::Exact { word, .. } => vec![(word, query.prefix)], - QueryKind::Tolerant { typo, word } => { - if let Ok(words) = super::word_derivations(&word, query.prefix, typo, fst) { - words.into_iter().map(|(w, _)| (w, query.prefix)).collect() - } else { - vec![(word, query.prefix)] - } - } - } - } - - fn resolve_ops(tree: &Operation, fst: &fst::Set>) -> FetchedWords { +fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { + fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { match tree { Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { - resolve_branch(ops.as_slice(), fst) + ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); }, - Operation::Query(ops) => { - resolve_query(ops, fst) + Operation::Query(Query { prefix, kind }) => { + let typo = if kind.is_exact() { 0 } else { kind.typo() }; + out.insert((kind.word(), typo, *prefix)); }, } } - let mut words = resolve_ops(tree, fst); - words.sort_unstable(); - words.dedup(); - words + let mut queries = HashSet::new(); + resolve_ops(tree, &mut queries); + queries } /// Main function that creates the final query tree from the primitive query. 
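What replaces the sorted word list is a set of automata: fetch_queries collects one (word, typo budget, prefix) triple per query leaf, and matches accepts a candidate word as soon as one automaton reaches it within its budget. A trimmed-down version of that idea, assuming the levenshtein_automata crate this code already depends on (the real code also shares the expensive LevenshteinAutomatonBuilders through Lazy statics instead of building one per word as done here):

use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};

struct Matcher {
    dfas: Vec<(DFA, u8)>,
}

impl Matcher {
    // One (word, allowed typos, is_prefix) triple per query leaf.
    fn new(words: &[(&str, u8, bool)]) -> Matcher {
        let dfas = words
            .iter()
            .map(|&(word, typos, is_prefix)| {
                let builder = LevenshteinAutomatonBuilder::new(typos, true);
                let dfa = if is_prefix {
                    builder.build_prefix_dfa(word)
                } else {
                    builder.build_dfa(word)
                };
                (dfa, typos)
            })
            .collect();
        Matcher { dfas }
    }

    // A word matches if any automaton accepts it within its typo budget.
    fn matches(&self, word: &str) -> bool {
        self.dfas.iter().any(|(dfa, typos)| match dfa.eval(word) {
            Distance::Exact(t) => t <= *typos,
            Distance::AtLeast(_) => false,
        })
    }
}

fn main() {
    let matcher = Matcher::new(&[("world", 1, false), ("new", 0, true)]);
    assert!(matcher.matches("word"));   // one typo away from "world"
    assert!(matcher.matches("newest")); // prefix match on "new"
    assert!(!matcher.matches("city"));
}

Evaluating a DFA is a single pass over the candidate word, which is one reason this speeds up highlighting compared to range-scanning a sorted list of derived words for every token of every displayed document.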
@@ -559,7 +544,7 @@ mod test { use std::collections::HashMap; use fst::Set; - use maplit::hashmap; + use maplit::{hashmap, hashset}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rand::{Rng, SeedableRng, rngs::StdRng}; @@ -970,26 +955,26 @@ mod test { let context = TestContext::default(); let query_tree = context.build(false, true, tokens).unwrap().unwrap(); - let expected = vec![ - ("city".to_string(), false), - ("earth".to_string(), false), - ("nature".to_string(), false), - ("new".to_string(), false), - ("nyc".to_string(), false), - ("split".to_string(), false), - ("word".to_string(), false), - ("word".to_string(), true), - ("world".to_string(), true), - ("york".to_string(), false), - - ]; + let expected = hashset!{ + ("word", 0, false), + ("nyc", 0, false), + ("wordsplit", 2, false), + ("wordsplitnycworld", 2, true), + ("nature", 0, false), + ("new", 0, false), + ("city", 0, false), + ("world", 1, true), + ("york", 0, false), + ("split", 0, false), + ("nycworld", 1, true), + ("earth", 0, false), + ("wordsplitnyc", 2, false), + }; let mut keys = context.postings.keys().collect::>(); keys.sort_unstable(); - let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap(); - - let words = fetch_words(&query_tree, &set); + let words = fetch_queries(&query_tree); assert_eq!(expected, words); } } From 7ac09d7b7c37e5cf5f43ab7a31bbd359755229ef Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 16:54:41 +0100 Subject: [PATCH 0534/1889] remove option of bucket_candidates --- milli/src/search/criteria/proximity.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index aab46a6a2..57c7007fc 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -12,7 +12,7 @@ pub struct Proximity<'t> { query_tree: Option<(usize, Operation)>, proximity: u8, candidates: Candidates, - bucket_candidates: Option, + bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, } @@ -29,7 +29,7 @@ impl<'t> Proximity<'t> { query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), proximity: 0, candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), }) @@ -45,7 +45,7 @@ impl<'t> Proximity<'t> { query_tree: None, proximity: 0, candidates: Candidates::default(), - bucket_candidates: None, + bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), }) @@ -84,8 +84,8 @@ impl<'t> Criterion for Proximity<'t> { self.proximity += 1; let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), }; return Ok(Some(CriterionResult { @@ -112,8 +112,8 @@ impl<'t> Criterion for Proximity<'t> { self.proximity += 1; let bucket_candidates = match self.parent { - Some(_) => self.bucket_candidates.take(), - None => Some(new_candidates.clone()), + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), }; return Ok(Some(CriterionResult { @@ -128,7 +128,7 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: Some(candidates), + 
bucket_candidates: candidates, })); }, (None, Forbidden(_)) => { From daf126a63838a9382e73f106f553e546b625a979 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Feb 2021 16:34:29 +0100 Subject: [PATCH 0535/1889] Introduce the final Fetcher criterion --- milli/src/search/criteria/fetcher.rs | 107 +++++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 90 +++++++++++++++++++++- milli/src/search/criteria/words.rs | 80 +------------------- milli/src/search/mod.rs | 7 +- 4 files changed, 201 insertions(+), 83 deletions(-) create mode 100644 milli/src/search/criteria/fetcher.rs diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs new file mode 100644 index 000000000..7706ee280 --- /dev/null +++ b/milli/src/search/criteria/fetcher.rs @@ -0,0 +1,107 @@ +use std::collections::HashMap; +use std::mem::take; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; + +pub struct Fetcher<'t> { + ctx: &'t dyn Context, + query_tree: Option, + candidates: Candidates, + parent: Option>, + should_get_documents_ids: bool, +} + +impl<'t> Fetcher<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Fetcher { + ctx, + query_tree, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + parent: None, + should_get_documents_ids: true, + } + } + + pub fn new( + ctx: &'t dyn Context, + parent: Box, + ) -> Self + { + Fetcher { + ctx, + query_tree: None, + candidates: Candidates::default(), + parent: Some(parent), + should_get_documents_ids: true, + } + } +} + +impl<'t> Criterion for Fetcher<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + loop { + debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", + self.should_get_documents_ids, self.candidates, + ); + + match &mut self.candidates { + Allowed(candidates) => if candidates.is_empty() { + self.candidates = Candidates::default(); + } else { + self.should_get_documents_ids = false; + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + Forbidden(_) => { + let should_get_documents_ids = take(&mut self.should_get_documents_ids); + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? 
{ + Some(result) => return Ok(Some(result)), + None => if should_get_documents_ids { + let candidates = match &self.query_tree { + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + None => self.ctx.documents_ids()?, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + } + }, + None => if should_get_documents_ids { + let candidates = match &self.query_tree { + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + None => self.ctx.documents_ids()?, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + } + return Ok(None); + }, + } + } + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 52367ac5f..1845e607a 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,16 +1,19 @@ +use std::collections::HashMap; use std::borrow::Cow; +use anyhow::bail; +use roaring::RoaringBitmap; + use crate::Index; use crate::search::word_derivations; -use roaring::RoaringBitmap; - use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub mod words; pub mod asc_desc; pub mod proximity; +pub mod fetcher; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; @@ -51,6 +54,7 @@ impl Default for Candidates { } } pub trait Context { + fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; @@ -66,6 +70,10 @@ pub struct HeedContext<'t> { } impl<'a> Context for HeedContext<'a> { + fn documents_ids(&self) -> heed::Result { + self.index.documents_ids(self.rtxn) + } + fn word_docids(&self, word: &str) -> heed::Result> { self.index.word_docids.get(self.rtxn, &word) } @@ -107,6 +115,80 @@ impl<'t> HeedContext<'t> { } } +pub fn resolve_query_tree<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, +) -> anyhow::Result +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + ) -> anyhow::Result + { + use Operation::{And, Consecutive, Or, Query}; + + match query_tree { + And(ops) => { + let mut ops = ops.iter().map(|op| { + resolve_operation(ctx, op, cache) + }).collect::>>()?; + + ops.sort_unstable_by_key(|cds| cds.len()); + + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for docids in ops { + if first_loop { + candidates = docids; + first_loop = false; + } else { + candidates.intersect_with(&docids); + } + } + Ok(candidates) + }, + Consecutive(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for slice in ops.windows(2) { + match (&slice[0], &slice[1]) { + (Operation::Query(left), Operation::Query(right)) => { + match query_pair_proximity_docids(ctx, left, right, 1)? 
{ + pair_docids if pair_docids.is_empty() => { + return Ok(RoaringBitmap::new()) + }, + pair_docids if first_loop => { + candidates = pair_docids; + first_loop = false; + }, + pair_docids => { + candidates.intersect_with(&pair_docids); + }, + } + }, + _ => bail!("invalid consecutive query type"), + } + } + Ok(candidates) + }, + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(ctx, op, cache)?; + candidates.union_with(&docids); + } + Ok(candidates) + }, + Query(q) => Ok(query_docids(ctx, q)?), + } + } + + resolve_operation(ctx, query_tree, cache) +} + + fn all_word_pair_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], @@ -218,6 +300,10 @@ pub mod test { } impl<'a> Context for TestContext<'a> { + fn documents_ids(&self) -> heed::Result { + Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) + } + fn word_docids(&self, word: &str) -> heed::Result> { Ok(self.word_docids.get(&word.to_string()).cloned()) } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 3b0ecd54a..0913d429d 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,12 +1,11 @@ use std::collections::HashMap; use std::mem::take; -use anyhow::bail; use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -62,7 +61,7 @@ impl<'t> Criterion for Words<'t> { self.candidates = Candidates::default(); }, (Some(qt), Allowed(candidates)) => { - let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -78,7 +77,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Forbidden(candidates)) => { - let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; found_candidates.difference_with(&candidates); candidates.union_with(&found_candidates); @@ -127,76 +126,3 @@ fn explode_query_tree(query_tree: Operation) -> Vec { otherwise => vec![otherwise], } } - -fn resolve_candidates<'t>( - ctx: &'t dyn Context, - query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, -) -> anyhow::Result -{ - fn resolve_operation<'t>( - ctx: &'t dyn Context, - query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - ) -> anyhow::Result - { - use Operation::{And, Consecutive, Or, Query}; - - match query_tree { - And(ops) => { - let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, cache) - }).collect::>>()?; - - ops.sort_unstable_by_key(|cds| cds.len()); - - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for docids in ops { - if first_loop { - candidates = docids; - first_loop = false; - } else { - candidates.intersect_with(&docids); - } - } - Ok(candidates) - }, - Consecutive(ops) => { - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for slice in ops.windows(2) { - match (&slice[0], &slice[1]) { - (Operation::Query(left), Operation::Query(right)) => { - 
match query_pair_proximity_docids(ctx, left, right, 1)? { - pair_docids if pair_docids.is_empty() => { - return Ok(RoaringBitmap::new()) - }, - pair_docids if first_loop => { - candidates = pair_docids; - first_loop = false; - }, - pair_docids => { - candidates.intersect_with(&pair_docids); - }, - } - }, - _ => bail!("invalid consecutive query type"), - } - } - Ok(candidates) - }, - Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, cache)?; - candidates.union_with(&docids); - } - Ok(candidates) - }, - Query(q) => Ok(query_docids(ctx, q)?), - } - } - - resolve_operation(ctx, query_tree, cache) -} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index dbb504368..84c6acf3e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -10,7 +10,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; +use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity, fetcher::Fetcher}; use crate::{Index, DocumentId}; pub use self::facet::FacetIter; @@ -92,13 +92,12 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; - // We are testing the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?; - // let proximity_criterion = Proximity::initial(&criteria_ctx, query_tree, facet_candidates)?; - let mut criteria = proximity_criterion; + let fetcher_criterion = Fetcher::new(&criteria_ctx, Box::new(proximity_criterion)); + let mut criteria = fetcher_criterion; // // We sort in descending order on a specific field *by hand*, don't do that at home. 
// let attr_name = "released-timestamp"; From b0e0c5eba0c83d57d7262036c076b0a829065de7 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 25 Feb 2021 16:59:09 +0100 Subject: [PATCH 0536/1889] remove option of bucket_candidates --- milli/src/search/criteria/fetcher.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 7706ee280..e21548e3f 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -63,7 +63,7 @@ impl<'t> Criterion for Fetcher<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, Forbidden(_) => { @@ -81,7 +81,7 @@ impl<'t> Criterion for Fetcher<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, } @@ -95,7 +95,7 @@ impl<'t> Criterion for Fetcher<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), - bucket_candidates: Some(candidates), + bucket_candidates: candidates, })); }, } From 36c1f93ceb969c69bb37b252e8369169670c1767 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Feb 2021 17:28:20 +0100 Subject: [PATCH 0537/1889] Do an union of the bucket candidates --- milli/src/search/criteria/asc_desc.rs | 2 +- milli/src/search/criteria/proximity.rs | 2 +- milli/src/search/criteria/typo.rs | 2 +- milli/src/search/criteria/words.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 3d32bd845..df80f3bb4 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -174,7 +174,7 @@ impl<'t> Criterion for AscDesc<'t> { self.query_tree = query_tree; candidates.intersect_with(&self.faceted_candidates); self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates = bucket_candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 57c7007fc..352567d1a 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -139,7 +139,7 @@ impl<'t> Criterion for Proximity<'t> { self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); self.proximity = 0; self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates = bucket_candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 0b8111997..b82ebbf5b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -147,7 +147,7 @@ impl<'t> Criterion for Typo<'t> { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates = bucket_candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 0913d429d..c8bb0abc4 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -107,7 +107,7 @@ impl<'t> Criterion for Words<'t> { 
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates = bucket_candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } From 025835c5b2af5c2622c41504c19ea3f94aab6680 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 1 Mar 2021 14:03:12 +0100 Subject: [PATCH 0538/1889] Fix the criteria to avoid always returning a placeholder --- milli/src/search/criteria/asc_desc.rs | 7 +++++-- milli/src/search/criteria/fetcher.rs | 18 ++++++++++++------ milli/src/search/criteria/proximity.rs | 7 +++++-- milli/src/search/criteria/typo.rs | 7 +++++-- milli/src/search/criteria/words.rs | 8 ++++++-- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index df80f3bb4..151b0a6a0 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -128,8 +128,11 @@ impl<'t> Criterion for AscDesc<'t> { loop { match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { - self.query_tree = None; - self.candidates = Candidates::default(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); }, (Some(qt), Allowed(candidates)) => { let bucket_candidates = match self.parent { diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index e21548e3f..f0cf16b90 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -54,20 +54,26 @@ impl<'t> Criterion for Fetcher<'t> { self.should_get_documents_ids, self.candidates, ); + let should_get_documents_ids = take(&mut self.should_get_documents_ids); match &mut self.candidates { - Allowed(candidates) => if candidates.is_empty() { - self.candidates = Candidates::default(); - } else { - self.should_get_documents_ids = false; + Allowed(candidates) => { let candidates = take(&mut self.candidates).into_inner(); + let candidates = match &self.query_tree { + Some(qt) if should_get_documents_ids => { + let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?; + docids.intersect_with(&candidates); + docids + }, + _ => candidates, + }; + return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), + query_tree: self.query_tree.take(), candidates: candidates.clone(), bucket_candidates: candidates, })); }, Forbidden(_) => { - let should_get_documents_ids = take(&mut self.should_get_documents_ids); match self.parent.as_mut() { Some(parent) => { match parent.next()? 
{ diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 352567d1a..553a191ec 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -64,8 +64,11 @@ impl<'t> Criterion for Proximity<'t> { match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { - self.query_tree = None; - self.candidates = Candidates::default(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take().map(|(_, qt)| qt), + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); }, (Some((max_prox, query_tree)), Allowed(candidates)) => { if self.proximity as usize > *max_prox { diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index b82ebbf5b..5c8592c5e 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -64,8 +64,11 @@ impl<'t> Criterion for Typo<'t> { match (&mut self.query_tree, &mut self.candidates) { (_, Allowed(candidates)) if candidates.is_empty() => { - self.query_tree = None; - self.candidates = Candidates::default(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take().map(|(_, qt)| qt), + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); }, (Some((max_typos, query_tree)), Allowed(candidates)) => { if self.number_typos as usize > *max_typos { diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index c8bb0abc4..bfb85579a 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -56,9 +56,13 @@ impl<'t> Criterion for Words<'t> { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); match (self.query_trees.pop(), &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { + (query_tree, Allowed(candidates)) if candidates.is_empty() => { self.query_trees = Vec::new(); - self.candidates = Candidates::default(); + return Ok(Some(CriterionResult { + query_tree, + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); }, (Some(qt), Allowed(candidates)) => { let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; From f118d7e067cb6550feaa87c38e3a259d96dd1ae0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 2 Mar 2021 11:58:32 +0100 Subject: [PATCH 0539/1889] build criteria from settings --- milli/src/search/criteria/fetcher.rs | 2 +- milli/src/search/criteria/mod.rs | 82 ++++++++++++++++++++++++---- milli/src/search/mod.rs | 9 +-- 3 files changed, 74 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index f0cf16b90..38fee20d3 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -56,7 +56,7 @@ impl<'t> Criterion for Fetcher<'t> { let should_get_documents_ids = take(&mut self.should_get_documents_ids); match &mut self.candidates { - Allowed(candidates) => { + Allowed(_) => { let candidates = take(&mut self.candidates).into_inner(); let candidates = match &self.query_tree { Some(qt) if should_get_documents_ids => { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1845e607a..49bacf209 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,13 +1,19 @@ use 
std::collections::HashMap; use std::borrow::Cow; -use anyhow::bail; +use anyhow::{bail, Context as _}; use roaring::RoaringBitmap; -use crate::Index; +use crate::facet::FacetType; use crate::search::word_derivations; +use crate::{Index, FieldId}; use super::query_tree::{Operation, Query, QueryKind}; +use self::typo::Typo; +use self::words::Words; +use self::asc_desc::AscDesc; +use self::proximity::Proximity; +use self::fetcher::Fetcher; pub mod typo; pub mod words; @@ -62,14 +68,14 @@ pub trait Context { fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; } -pub struct HeedContext<'t> { +pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, index: &'t Index, words_fst: fst::Set>, words_prefixes_fst: fst::Set>, } -impl<'a> Context for HeedContext<'a> { +impl<'a> Context for CriteriaBuilder<'a> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) } @@ -101,17 +107,71 @@ impl<'a> Context for HeedContext<'a> { } } -impl<'t> HeedContext<'t> { +impl<'t> CriteriaBuilder<'t> { pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result { let words_fst = index.words_fst(rtxn)?; let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; + Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) + } - Ok(Self { - rtxn, - index, - words_fst, - words_prefixes_fst, - }) + pub fn build( + &'t self, + mut query_tree: Option, + mut facet_candidates: Option, + ) -> anyhow::Result> + { + use crate::criterion::Criterion as Name; + + let fields_ids_map = self.index.fields_ids_map(&self.rtxn)?; + let faceted_fields = self.index.faceted_fields(&self.rtxn)?; + let field_id_facet_type = |field: &str| -> anyhow::Result<(FieldId, FacetType)> { + let id = fields_ids_map.id(field).with_context(|| { + format!("field {:?} isn't registered", field) + })?; + let facet_type = faceted_fields.get(field).with_context(|| { + format!("field {:?} isn't faceted", field) + })?; + Ok((id, *facet_type)) + }; + + let mut criterion = None as Option>; + for name in self.index.criteria(&self.rtxn)? { + criterion = Some(match criterion.take() { + Some(father) => match name { + Name::Typo => Box::new(Typo::new(self, father)?), + Name::Words => Box::new(Words::new(self, father)?), + Name::Proximity => Box::new(Proximity::new(self, father)?), + Name::Asc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::asc(&self.index, &self.rtxn, father, id, facet_type)?) + }, + Name::Desc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::desc(&self.index, &self.rtxn, father, id, facet_type)?) + }, + _otherwise => father, + }, + None => match name { + Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())?), + Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())?), + Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())?), + Name::Asc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) + }, + Name::Desc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) 
+ }, + _otherwise => continue, + }, + }); + } + + match criterion { + Some(criterion) => Ok(Fetcher::new(self, criterion)), + None => Ok(Fetcher::initial(self, query_tree, facet_candidates)), + } } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 84c6acf3e..48b0f71da 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -10,7 +10,6 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity, fetcher::Fetcher}; use crate::{Index, DocumentId}; pub use self::facet::FacetIter; @@ -92,12 +91,8 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; - let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; - let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; - let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; - let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?; - let fetcher_criterion = Fetcher::new(&criteria_ctx, Box::new(proximity_criterion)); - let mut criteria = fetcher_criterion; + let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; + let mut criteria = criteria_builder.build(query_tree, facet_candidates)?; // // We sort in descending order on a specific field *by hand*, don't do that at home. // let attr_name = "released-timestamp"; From 6bf6b404955c9087a905051e002244010289cb26 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 2 Mar 2021 11:02:09 +0100 Subject: [PATCH 0540/1889] Remove unused files --- milli/src/lib.rs | 2 - milli/src/mdfs.rs | 163 ---------------------------- milli/src/query_tokens.rs | 217 -------------------------------------- 3 files changed, 382 deletions(-) delete mode 100644 milli/src/mdfs.rs delete mode 100644 milli/src/query_tokens.rs diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 75d6f9fb3..d6a078a1f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -3,8 +3,6 @@ mod criterion; mod external_documents_ids; mod fields_ids_map; -mod mdfs; -mod query_tokens; mod search; mod update_store; pub mod facet; diff --git a/milli/src/mdfs.rs b/milli/src/mdfs.rs deleted file mode 100644 index 6beba3c69..000000000 --- a/milli/src/mdfs.rs +++ /dev/null @@ -1,163 +0,0 @@ -use std::collections::hash_map::Entry::{Occupied, Vacant}; -use std::collections::HashMap; -use std::mem; - -use roaring::RoaringBitmap; -use crate::Index; - -/// A mana depth first search implementation. -pub struct Mdfs<'a> { - index: &'a Index, - rtxn: &'a heed::RoTxn<'a>, - words: &'a [(HashMap, RoaringBitmap)], - union_cache: HashMap<(usize, u8), RoaringBitmap>, - candidates: RoaringBitmap, - mana: u32, - max_mana: u32, -} - -impl<'a> Mdfs<'a> { - pub fn new( - index: &'a Index, - rtxn: &'a heed::RoTxn, - words: &'a [(HashMap, RoaringBitmap)], - candidates: RoaringBitmap, - ) -> Mdfs<'a> - { - // Compute the number of pairs (windows) we have for this list of words. - let mana = words.len().saturating_sub(1) as u32; - let max_mana = mana * 8; - Mdfs { index, rtxn, words, union_cache: HashMap::new(), candidates, mana, max_mana } - } -} - -impl<'a> Iterator for Mdfs<'a> { - type Item = anyhow::Result<(u32, RoaringBitmap)>; - - fn next(&mut self) -> Option { - // If there is less or only one word therefore the only - // possible documents that we can return are the candidates. 
- if self.words.len() <= 1 { - if self.candidates.is_empty() { return None } - return Some(Ok((0, mem::take(&mut self.candidates)))); - } - - while self.mana <= self.max_mana { - let mut answer = RoaringBitmap::new(); - let result = mdfs_step( - &self.index, - &self.rtxn, - self.mana, - self.words, - &self.candidates, - &self.candidates, - &mut self.union_cache, - &mut answer, - ); - - match result { - Ok(()) => { - // We always increase the mana for the next loop. - let proximity = self.mana; - self.mana += 1; - - // If no documents were found we must not return and continue - // the search with more mana. - if !answer.is_empty() { - - // We remove the answered documents from the list of - // candidates to be sure we don't search for them again. - self.candidates.difference_with(&answer); - - // We return the answer. - return Some(Ok((proximity, answer))); - } - }, - Err(e) => return Some(Err(e)), - } - } - - None - } -} - -fn mdfs_step( - index: &Index, - rtxn: &heed::RoTxn, - mana: u32, - words: &[(HashMap, RoaringBitmap)], - candidates: &RoaringBitmap, - parent_docids: &RoaringBitmap, - union_cache: &mut HashMap<(usize, u8), RoaringBitmap>, - answer: &mut RoaringBitmap, -) -> anyhow::Result<()> -{ - use std::cmp::{min, max}; - - let (words1, words2) = (&words[0].0, &words[1].0); - let pairs = words_pair_combinations(words1, words2); - let tail = &words[1..]; - let nb_children = tail.len() as u32 - 1; - - // The minimum amount of mana that you must consume is at least 1 and the - // amount of mana that your children can consume. Because the last child must - // consume the remaining mana, it is mandatory that there not too much at the end. - let min_proximity = max(1, mana.saturating_sub(nb_children * 8)) as u8; - - // The maximum amount of mana that you can use is 8 or the remaining amount of - // mana minus your children, as you can't just consume all the mana, - // your children must have at least 1 mana. - let max_proximity = min(8, mana - nb_children) as u8; - - for proximity in min_proximity..=max_proximity { - let mut docids = match union_cache.entry((words.len(), proximity)) { - Occupied(entry) => entry.get().clone(), - Vacant(entry) => { - let mut docids = RoaringBitmap::new(); - if proximity == 8 { - docids = candidates.clone(); - } else { - for (w1, w2) in pairs.iter().cloned() { - let key = (w1, w2, proximity); - if let Some(di) = index.word_pair_proximity_docids.get(rtxn, &key)? { - docids.union_with(&di); - } - } - } - entry.insert(docids).clone() - } - }; - - // We must be sure that we only return docids that are present in the candidates. - docids.intersect_with(parent_docids); - - if !docids.is_empty() { - let mana = mana.checked_sub(proximity as u32).unwrap(); - if tail.len() < 2 { - // We are the last pair, we return without recuring as we don't have any child. 
- answer.union_with(&docids); - return Ok(()); - } else { - return mdfs_step(index, rtxn, mana, tail, candidates, &docids, union_cache, answer); - } - } - } - - Ok(()) -} - -fn words_pair_combinations<'h>( - w1: &'h HashMap, - w2: &'h HashMap, -) -> Vec<(&'h str, &'h str)> -{ - let mut pairs = Vec::new(); - for (w1, (_typos, docids1)) in w1 { - for (w2, (_typos, docids2)) in w2 { - if !docids1.is_disjoint(&docids2) { - pairs.push((w1.as_str(), w2.as_str())); - } - } - } - pairs -} diff --git a/milli/src/query_tokens.rs b/milli/src/query_tokens.rs deleted file mode 100644 index 258c90765..000000000 --- a/milli/src/query_tokens.rs +++ /dev/null @@ -1,217 +0,0 @@ -use meilisearch_tokenizer::{Token, TokenKind}; - -#[derive(Debug)] -enum State { - Free, - Quoted, -} - -impl State { - fn swap(&mut self) { - match self { - State::Quoted => *self = State::Free, - State::Free => *self = State::Quoted, - } - } -} - -#[derive(Debug, PartialEq, Eq)] -pub enum QueryToken<'a> { - Free(Token<'a>), - Quoted(Token<'a>), -} - -pub fn query_tokens<'a>(mut tokens: impl Iterator>) -> impl Iterator> { - let mut state = State::Free; - let f = move || { - loop { - let token = tokens.next()?; - match token.kind() { - _ if token.text().trim() == "\"" => state.swap(), - TokenKind::Word => { - let token = match state { - State::Quoted => QueryToken::Quoted(token), - State::Free => QueryToken::Free(token), - }; - return Some(token); - }, - _ => (), - } - } - }; - std::iter::from_fn(f) -} - -#[cfg(test)] -mod tests { - use super::*; - use QueryToken::{Quoted, Free}; - use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; - use fst::Set; - - macro_rules! assert_eq_query_token { - ($test:expr, Quoted($val:literal)) => { - match $test { - Quoted(val) => assert_eq!(val.text(), $val), - Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()), - } - }; - - ($test:expr, Free($val:literal)) => { - match $test { - Quoted(val) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, val.text()), - Free(val) => assert_eq!(val.text(), $val), - } - }; - } - - #[test] - fn empty() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = ""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert!(iter.next().is_none()); - - let query = " "; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert!(iter.next().is_none()); - } - - #[test] - fn one_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn one_pending_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn one_non_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - 
let query = "hello"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn quoted_directly_followed_by_free_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello\"world"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert_eq_query_token!(iter.next().unwrap(), Free("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn free_directly_followed_by_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello\"world\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn free_followed_by_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello \"world\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn multiple_spaces_separated_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello world "; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Free("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn multi_interleaved_quoted_free_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello \"world\" coucou \"monde\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert_eq_query_token!(iter.next().unwrap(), Free("coucou")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("monde")); - assert!(iter.next().is_none()); - } - - #[test] - fn multi_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello world\" coucou \"monde est beau\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert_eq_query_token!(iter.next().unwrap(), Free("coucou")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("monde")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("est")); - assert_eq_query_token!(iter.next().unwrap(), 
Quoted("beau")); - assert!(iter.next().is_none()); - } - - #[test] - fn chinese() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "汽车男生"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("汽车")); - assert_eq_query_token!(iter.next().unwrap(), Free("男生")); - assert!(iter.next().is_none()); - } -} From 246286f0ebede97517bc262fc9cf67448e221194 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 2 Mar 2021 11:14:10 +0100 Subject: [PATCH 0541/1889] take hard separator into account --- milli/src/update/index_documents/store.rs | 41 +++++++++++++++++++---- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 96d1098f9..05974d55e 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -13,7 +13,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; use log::{debug, info}; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; @@ -471,14 +471,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { }; let analyzed = self.analyzer.analyze(&content); - let tokens = analyzed - .tokens() - .filter(|t| t.is_word()) - .map(|t| t.text().to_string()); + let tokens = process_tokens(analyzed.tokens()); - for (pos, word) in tokens.enumerate().take(MAX_POSITION) { + for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position); } } } @@ -609,6 +606,36 @@ enum FacetValue { Integer(i64), } +/// take an iterator on tokens and compute their relative position depending on separator kinds +/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, +/// else we keep the standart proximity of 1 between words. 
+fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator)> { + tokens + .skip_while(|token| token.is_separator().is_some()) + .scan((0, None), |(offset, prev_kind), token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) + } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => { + *prev_kind = Some(token.kind); + } + _ => (), + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} + fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result> { use FacetValue::*; From cdaa96df6329670cf959ebb67c5e86618e1bfaea Mon Sep 17 00:00:00 2001 From: many Date: Tue, 2 Mar 2021 14:46:50 +0100 Subject: [PATCH 0542/1889] Optimize the proximity criterion --- milli/src/search/criteria/proximity.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 553a191ec..dc05787dd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -205,14 +205,16 @@ fn resolve_candidates<'t>( cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, ) -> anyhow::Result> { - fn pair_combinations(mana: u8) -> impl Iterator { - (0..=mana).map(move |m| (mana - m, m)) + fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { + (0..=mana.min(left_max)).map(move |m| (m, mana - m)) } + let pair_max_proximity = 7; + let mut output = Vec::new(); - for (pair_p, left_right_p) in pair_combinations(proximity) { - for (left_p, right_p) in pair_combinations(left_right_p) { + for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) { + for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) { + let left_key = (left.clone(), left_p); if !cache.contains_key(&left_key) { + let candidates = resolve_operation(ctx, left, left_p, cache)?; From 5c5e51095cdbfc4c37592b90a08e5433e67c2ac1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 11:43:42 +0100 Subject: [PATCH 0543/1889] Fix the Asc/Desc criteria to always return the QueryTree when available --- milli/src/search/criteria/asc_desc.rs | 97 ++++++++++++++------------- 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 151b0a6a0..9af9d53e6 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,17 +1,20 @@ +use std::collections::HashMap; use std::mem::take; use anyhow::bail; use itertools::Itertools; +use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index}; -use super::{Candidates, Criterion, CriterionResult}; +use super::{Criterion, CriterionResult}; pub struct AscDesc<'t> { index: &'t Index, @@ -20,7 +23,7 @@ pub struct AscDesc<'t> { facet_type: FacetType, ascending: bool, query_tree: Option, - 
candidates: Candidates, + candidates: RoaringBitmap, bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, parent: Option>, @@ -83,6 +86,19 @@ impl<'t> AscDesc<'t> { ascending: bool, ) -> anyhow::Result where Self: Sized { + let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; + let candidates = match &query_tree { + Some(qt) => { + let context = CriteriaBuilder::new(rtxn, index)?; + let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new())?; + if let Some(candidates) = candidates { + qt_candidates.intersect_with(&candidates); + } + qt_candidates + }, + None => candidates.unwrap_or(faceted_candidates.clone()), + }; + Ok(AscDesc { index, rtxn, @@ -90,8 +106,8 @@ impl<'t> AscDesc<'t> { facet_type, ascending, query_tree, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, + candidates, + faceted_candidates, bucket_candidates: RoaringBitmap::new(), parent: None, }) @@ -113,7 +129,7 @@ impl<'t> AscDesc<'t> { facet_type, ascending, query_tree: None, - candidates: Candidates::default(), + candidates: RoaringBitmap::new(), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), @@ -123,24 +139,43 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { fn next(&mut self) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; - loop { - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: take(&mut self.candidates).into_inner(), - bucket_candidates: take(&mut self.bucket_candidates), - })); + debug!("Facet {} iteration ({:?})", + if self.ascending { "Asc" } else { "Desc" }, self.candidates, + ); + + match &mut self.candidates { + candidates if candidates.is_empty() => { + let query_tree = self.query_tree.take(); + let candidates = take(&mut self.candidates); + let bucket_candidates = take(&mut self.bucket_candidates); + + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? 
{ + Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { + self.query_tree = query_tree; + candidates.intersect_with(&self.faceted_candidates); + self.candidates = candidates; + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => if query_tree.is_none() && bucket_candidates.is_empty() { + return Ok(None) + }, + } + + return Ok(Some(CriterionResult { query_tree, candidates, bucket_candidates })); }, - (Some(qt), Allowed(candidates)) => { + candidates => { let bucket_candidates = match self.parent { Some(_) => take(&mut self.bucket_candidates), None => candidates.clone(), }; - let mut found_candidates = facet_ordered( + let found_candidates = facet_ordered( self.index, self.rtxn, self.field_id, @@ -149,42 +184,14 @@ impl<'t> Criterion for AscDesc<'t> { candidates.clone(), )?; - found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); return Ok(Some(CriterionResult { - query_tree: Some(qt.clone()), + query_tree: self.query_tree.clone(), candidates: found_candidates, bucket_candidates, })); }, - (Some(_qt), Forbidden(_candidates)) => { - todo!() - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); - return Ok(Some(CriterionResult { - query_tree: None, - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - (None, Forbidden(_)) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next()? { - Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { - self.query_tree = query_tree; - candidates.intersect_with(&self.faceted_candidates); - self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } - }, - None => return Ok(None), - } - }, } } } From f376c6a7287a952599cfe203e08f397edc672163 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 10:48:16 +0100 Subject: [PATCH 0544/1889] Make sure we retrieve the docid word positions --- milli/src/update/index_documents/store.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 05974d55e..05767080a 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -274,13 +274,15 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; // We store document_id associated with all the words the record contains. - for (word, _) in words_positions.drain() { - self.insert_word_docid(&word, document_id)?; + for (word, _) in words_positions.iter() { + self.insert_word_docid(word, document_id)?; } self.documents_writer.insert(document_id.to_be_bytes(), record)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; + words_positions.clear(); + // We store document_id associated with all the field id and values. 
for (field, values) in facet_values.drain() { for value in values { From 07784c899043d24318f13edac84e805708946696 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 11:25:36 +0100 Subject: [PATCH 0545/1889] Tune the words prefixes threshold to 1/1000 instead --- Cargo.lock | 1 + infos/src/main.rs | 2 +- milli/src/update/words_prefixes.rs | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 883d836b7..b7f479d2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -866,6 +866,7 @@ dependencies = [ "anyhow", "byte-unit", "heed", + "jemallocator", "milli", "stderrlog", "structopt", diff --git a/infos/src/main.rs b/infos/src/main.rs index 91157aaad..0d2b7abb5 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -598,7 +598,7 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - let fields_ids_map = index.fields_ids_map(rtxn)?; let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); - let iter: Box> = if internal_ids.is_empty() { + let iter: Box> = if internal_ids.is_empty() { Box::new(index.documents.iter(rtxn)?.map(|result| { result.map(|(_id, obkv)| obkv) })) diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs index f7c898c89..70b82b217 100644 --- a/milli/src/update/words_prefixes.rs +++ b/milli/src/update/words_prefixes.rs @@ -41,7 +41,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { chunk_fusing_shrink_size: None, max_nb_chunks: None, max_memory: None, - threshold: 0.01, // 1% + threshold: 0.1 / 100.0, // 0.1% max_prefix_length: 4, _update_id: update_id, } From 1fc25148da6971b901f0502de255f3daafb8674c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 18:09:19 +0100 Subject: [PATCH 0546/1889] Remove useless where clauses for the criteria --- milli/src/search/criteria/asc_desc.rs | 12 ++++++------ milli/src/search/criteria/proximity.rs | 4 ++-- milli/src/search/criteria/typo.rs | 4 ++-- milli/src/search/criteria/words.rs | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 9af9d53e6..193e9c942 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -37,7 +37,7 @@ impl<'t> AscDesc<'t> { candidates: Option, field_id: FieldId, facet_type: FacetType, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, true) } @@ -49,7 +49,7 @@ impl<'t> AscDesc<'t> { candidates: Option, field_id: FieldId, facet_type: FacetType, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, false) } @@ -60,7 +60,7 @@ impl<'t> AscDesc<'t> { parent: Box, field_id: FieldId, facet_type: FacetType, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_id, facet_type, true) } @@ -71,7 +71,7 @@ impl<'t> AscDesc<'t> { parent: Box, field_id: FieldId, facet_type: FacetType, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_id, facet_type, false) } @@ -84,7 +84,7 @@ impl<'t> AscDesc<'t> { field_id: FieldId, facet_type: FacetType, ascending: bool, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; let candidates = match &query_tree { @@ -120,7 +120,7 @@ impl<'t> AscDesc<'t> { 
field_id: FieldId, facet_type: FacetType, ascending: bool, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(AscDesc { index, diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index dc05787dd..fe82523ca 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -22,7 +22,7 @@ impl<'t> Proximity<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Proximity { ctx, @@ -38,7 +38,7 @@ impl<'t> Proximity<'t> { pub fn new( ctx: &'t dyn Context, parent: Box, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Proximity { ctx, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 5c8592c5e..76c2fbc46 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -24,7 +24,7 @@ impl<'t> Typo<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Typo { ctx, @@ -41,7 +41,7 @@ impl<'t> Typo<'t> { pub fn new( ctx: &'t dyn Context, parent: Box, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Typo { ctx, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index bfb85579a..08cbeaab3 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -21,7 +21,7 @@ impl<'t> Words<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Words { ctx, @@ -36,7 +36,7 @@ impl<'t> Words<'t> { pub fn new( ctx: &'t dyn Context, parent: Box, - ) -> anyhow::Result where Self: Sized + ) -> anyhow::Result { Ok(Words { ctx, From 2cc4a467a6db6012225c090ad1a8350d2f72fba4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 18:16:13 +0100 Subject: [PATCH 0547/1889] Change the criterion output that cannot fail --- milli/src/search/criteria/mod.rs | 12 ++++++------ milli/src/search/criteria/proximity.rs | 16 ++++++---------- milli/src/search/criteria/typo.rs | 24 ++++++++++-------------- milli/src/search/criteria/words.rs | 16 ++++++---------- 4 files changed, 28 insertions(+), 40 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 49bacf209..0dcaa5a69 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -138,9 +138,9 @@ impl<'t> CriteriaBuilder<'t> { for name in self.index.criteria(&self.rtxn)? { criterion = Some(match criterion.take() { Some(father) => match name { - Name::Typo => Box::new(Typo::new(self, father)?), - Name::Words => Box::new(Words::new(self, father)?), - Name::Proximity => Box::new(Proximity::new(self, father)?), + Name::Typo => Box::new(Typo::new(self, father)), + Name::Words => Box::new(Words::new(self, father)), + Name::Proximity => Box::new(Proximity::new(self, father)), Name::Asc(field) => { let (id, facet_type) = field_id_facet_type(&field)?; Box::new(AscDesc::asc(&self.index, &self.rtxn, father, id, facet_type)?) 
@@ -152,9 +152,9 @@ impl<'t> CriteriaBuilder<'t> { _otherwise => father, }, None => match name { - Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())?), - Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())?), - Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())?), + Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), + Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), + Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), Name::Asc(field) => { let (id, facet_type) = field_id_facet_type(&field)?; Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index fe82523ca..b192902c1 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -22,9 +22,9 @@ impl<'t> Proximity<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result + ) -> Self { - Ok(Proximity { + Proximity { ctx, query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), proximity: 0, @@ -32,15 +32,11 @@ impl<'t> Proximity<'t> { bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), - }) + } } - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> anyhow::Result - { - Ok(Proximity { + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Proximity { ctx, query_tree: None, proximity: 0, @@ -48,7 +44,7 @@ impl<'t> Proximity<'t> { bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), - }) + } } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 76c2fbc46..e952bda55 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -24,9 +24,9 @@ impl<'t> Typo<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result + ) -> Self { - Ok(Typo { + Typo { ctx, query_tree: query_tree.map(|op| (maximum_typo(&op), op)), number_typos: 0, @@ -35,15 +35,11 @@ impl<'t> Typo<'t> { parent: None, candidates_cache: HashMap::new(), typo_cache: HashMap::new(), - }) + } } - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> anyhow::Result - { - Ok(Typo { + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Typo { ctx, query_tree: None, number_typos: 0, @@ -52,7 +48,7 @@ impl<'t> Typo<'t> { parent: Some(parent), candidates_cache: HashMap::new(), typo_cache: HashMap::new(), - }) + } } } @@ -348,7 +344,7 @@ mod test { let query_tree = None; let facet_candidates = None; - let mut criteria = Typo::initial(&context, query_tree, facet_candidates).unwrap(); + let mut criteria = Typo::initial(&context, query_tree, facet_candidates); assert!(criteria.next().unwrap().is_none()); } @@ -366,7 +362,7 @@ mod test { let facet_candidates = None; - let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates).unwrap(); + let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() @@ -414,7 +410,7 @@ mod test { let query_tree = None; let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criteria = Typo::initial(&context, query_tree, 
Some(facet_candidates.clone())).unwrap(); + let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); let expected = CriterionResult { query_tree: None, @@ -442,7 +438,7 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())).unwrap(); + let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 08cbeaab3..1827cd1ed 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -21,31 +21,27 @@ impl<'t> Words<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Option, - ) -> anyhow::Result + ) -> Self { - Ok(Words { + Words { ctx, query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::default(), - }) + } } - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> anyhow::Result - { - Ok(Words { + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Words { ctx, query_trees: Vec::default(), candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::default(), - }) + } } } From 9b6b35d9b7b18539c8b11adc30a12863858a84df Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 18:19:10 +0100 Subject: [PATCH 0548/1889] Clean up some comments --- milli/src/search/criteria/typo.rs | 1 - milli/src/search/mod.rs | 6 ------ 2 files changed, 7 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index e952bda55..a78ac3339 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -283,7 +283,6 @@ fn resolve_candidates<'t>( } } - /// FIXME Make this function generic and mutualize it between Typo and proximity criterion fn mdfs<'t>( ctx: &'t dyn Context, branches: &[Operation], diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 48b0f71da..8570cefaa 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -94,12 +94,6 @@ impl<'a> Search<'a> { let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; let mut criteria = criteria_builder.build(query_tree, facet_candidates)?; - // // We sort in descending order on a specific field *by hand*, don't do that at home. 
- // let attr_name = "released-timestamp"; - // let fid = self.index.fields_ids_map(self.rtxn)?.id(attr_name).unwrap(); - // let ftype = *self.index.faceted_fields(self.rtxn)?.get(attr_name).unwrap(); - // let desc_criterion = AscDesc::desc(self.index, self.rtxn, Box::new(words_criterion), fid, ftype)?; - let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); From e5bb96bc3b379b9b231224442e0770ac942389cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 6 Mar 2021 12:48:41 +0100 Subject: [PATCH 0549/1889] Fix the searchable settings test --- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 17a9da1eb..fd91d3468 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -283,7 +283,7 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = &b"id,name,age\n0,kevin,23\n1,kevina,21\n2,benoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); From 9e1eb252326aca7dea98ac1178ffa663f1548517 Mon Sep 17 00:00:00 2001 From: mpostma Date: Fri, 5 Mar 2021 19:54:46 +0100 Subject: [PATCH 0550/1889] implement display for criterion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update milli/src/criterion.rs Co-authored-by: Clément Renault --- milli/src/criterion.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index ead5b9da3..e9f5f87ca 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,4 +1,5 @@ use std::collections::HashMap; +use std::fmt; use anyhow::{Context, bail}; use regex::Regex; @@ -63,3 +64,20 @@ pub fn default_criteria() -> Vec { Criterion::Exactness, ] } + +impl fmt::Display for Criterion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use Criterion::*; + + match self { + Typo => f.write_str("typo"), + Words => f.write_str("words"), + Proximity => f.write_str("proximity"), + Attribute => f.write_str("attribute"), + WordsPosition => f.write_str("wordsPosition"), + Exactness => f.write_str("exactness"), + Asc(attr) => write!(f, "asc({:?})", attr), + Desc(attr) => write!(f, "desc({:?})", attr), + } + } +} From e3095be85c20f3c773633345bfc655fee5f25233 Mon Sep 17 00:00:00 2001 From: mpostma Date: Mon, 8 Mar 2021 12:09:09 +0100 Subject: [PATCH 0551/1889] Remove Debug use in Display impl --- milli/src/criterion.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index e9f5f87ca..5d8ba09ba 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -76,8 +76,8 @@ impl fmt::Display for Criterion { Attribute => f.write_str("attribute"), WordsPosition => f.write_str("wordsPosition"), Exactness => f.write_str("exactness"), - Asc(attr) => write!(f, "asc({:?})", attr), - Desc(attr) => write!(f, "desc({:?})", attr), + Asc(attr) => write!(f, "asc({})", attr), + Desc(attr) => write!(f, "desc({})", attr), } } } From a58d2b613786b54088aa01fd594ab2bf8e7aea77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 4 Mar 2021 10:13:34 +0100 Subject: [PATCH 0552/1889] Print the Asc/Desc criterion field name in the debug prints --- 
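Editor's note (not part of the original mail): this patch keeps the facet field's name inside the criterion so the Asc/Desc debug! lines can print it, and moves the name-to-id resolution into asc_desc.rs itself. That resolution leans on anyhow's Context trait to turn a missing entry into a descriptive error. A minimal self-contained sketch of the pattern, with an illustrative map that is not from milli:

use std::collections::HashMap;
use anyhow::Context as _;

fn resolve(fields: &HashMap<String, u8>, field: &str) -> anyhow::Result<u8> {
    // with_context builds the error message lazily, only on the None path.
    fields
        .get(field)
        .copied()
        .with_context(|| format!("field {:?} isn't registered", field))
}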
milli/src/search/criteria/asc_desc.rs | 60 ++++++++++++++++++--------- milli/src/search/criteria/mod.rs | 28 ++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 193e9c942..50bb6798b 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::mem::take; -use anyhow::bail; +use anyhow::{bail, Context as _}; use itertools::Itertools; use log::debug; use ordered_float::OrderedFloat; @@ -13,12 +13,13 @@ use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec} use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; -use crate::{FieldId, Index}; +use crate::{FieldsIdsMap, FieldId, Index}; use super::{Criterion, CriterionResult}; pub struct AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn<'t>, + field_name: String, field_id: FieldId, facet_type: FacetType, ascending: bool, @@ -35,11 +36,10 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, query_tree: Option, candidates: Option, - field_id: FieldId, - facet_type: FacetType, + field_name: String, ) -> anyhow::Result { - Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, true) + Self::initial(index, rtxn, query_tree, candidates, field_name, true) } pub fn initial_desc( @@ -47,33 +47,30 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, query_tree: Option, candidates: Option, - field_id: FieldId, - facet_type: FacetType, + field_name: String, ) -> anyhow::Result { - Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, false) + Self::initial(index, rtxn, query_tree, candidates, field_name, false) } pub fn asc( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_id: FieldId, - facet_type: FacetType, + field_name: String, ) -> anyhow::Result { - Self::new(index, rtxn, parent, field_id, facet_type, true) + Self::new(index, rtxn, parent, field_name, true) } pub fn desc( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_id: FieldId, - facet_type: FacetType, + field_name: String, ) -> anyhow::Result { - Self::new(index, rtxn, parent, field_id, facet_type, false) + Self::new(index, rtxn, parent, field_name, false) } fn initial( @@ -81,11 +78,14 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, query_tree: Option, candidates: Option, - field_id: FieldId, - facet_type: FacetType, + field_name: String, ascending: bool, ) -> anyhow::Result { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let faceted_fields = index.faceted_fields(rtxn)?; + let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; + let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; let candidates = match &query_tree { Some(qt) => { @@ -102,6 +102,7 @@ impl<'t> AscDesc<'t> { Ok(AscDesc { index, rtxn, + field_name, field_id, facet_type, ascending, @@ -117,14 +118,18 @@ impl<'t> AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_id: FieldId, - facet_type: FacetType, + field_name: String, ascending: bool, ) -> anyhow::Result { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let faceted_fields = index.faceted_fields(rtxn)?; + let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; + Ok(AscDesc { index, rtxn, + field_name, field_id, facet_type, ascending, @@ -140,8 +145,8 @@ impl<'t> 
AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { fn next(&mut self) -> anyhow::Result> { loop { - debug!("Facet {} iteration ({:?})", - if self.ascending { "Asc" } else { "Desc" }, self.candidates, + debug!("Facet {}({}) iteration ({:?})", + if self.ascending { "Asc" } else { "Desc" }, self.field_name, self.candidates, ); match &mut self.candidates { @@ -197,6 +202,21 @@ impl<'t> Criterion for AscDesc<'t> { } } +fn field_id_facet_type( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + field: &str, +) -> anyhow::Result<(FieldId, FacetType)> +{ + let id = fields_ids_map.id(field).with_context(|| { + format!("field {:?} isn't registered", field) + })?; + let facet_type = faceted_fields.get(field).with_context(|| { + format!("field {:?} isn't faceted", field) + })?; + Ok((id, *facet_type)) +} + fn facet_ordered( index: &Index, rtxn: &heed::RoTxn, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 0dcaa5a69..b1119e221 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -122,18 +122,6 @@ impl<'t> CriteriaBuilder<'t> { { use crate::criterion::Criterion as Name; - let fields_ids_map = self.index.fields_ids_map(&self.rtxn)?; - let faceted_fields = self.index.faceted_fields(&self.rtxn)?; - let field_id_facet_type = |field: &str| -> anyhow::Result<(FieldId, FacetType)> { - let id = fields_ids_map.id(field).with_context(|| { - format!("field {:?} isn't registered", field) - })?; - let facet_type = faceted_fields.get(field).with_context(|| { - format!("field {:?} isn't faceted", field) - })?; - Ok((id, *facet_type)) - }; - let mut criterion = None as Option>; for name in self.index.criteria(&self.rtxn)? { criterion = Some(match criterion.take() { @@ -141,14 +129,8 @@ impl<'t> CriteriaBuilder<'t> { Name::Typo => Box::new(Typo::new(self, father)), Name::Words => Box::new(Words::new(self, father)), Name::Proximity => Box::new(Proximity::new(self, father)), - Name::Asc(field) => { - let (id, facet_type) = field_id_facet_type(&field)?; - Box::new(AscDesc::asc(&self.index, &self.rtxn, father, id, facet_type)?) - }, - Name::Desc(field) => { - let (id, facet_type) = field_id_facet_type(&field)?; - Box::new(AscDesc::desc(&self.index, &self.rtxn, father, id, facet_type)?) - }, + Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), + Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), _otherwise => father, }, None => match name { @@ -156,12 +138,10 @@ impl<'t> CriteriaBuilder<'t> { Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), Name::Asc(field) => { - let (id, facet_type) = field_id_facet_type(&field)?; - Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) + Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) }, Name::Desc(field) => { - let (id, facet_type) = field_id_facet_type(&field)?; - Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) + Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) 
}, _otherwise => continue, }, From 3c76b3548d298a80a6cc249cbf67acc754997afe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 4 Mar 2021 11:00:18 +0100 Subject: [PATCH 0553/1889] Rework the Asc/Desc criteria to be facet iterator based --- milli/src/search/criteria/asc_desc.rs | 168 ++++++++++++++------------ milli/src/search/criteria/mod.rs | 5 +- 2 files changed, 92 insertions(+), 81 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 50bb6798b..9d675ab42 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -2,8 +2,10 @@ use std::collections::HashMap; use std::mem::take; use anyhow::{bail, Context as _}; +use heed::{BytesDecode, BytesEncode}; use itertools::Itertools; use log::debug; +use num_traits::Bounded; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; @@ -24,7 +26,7 @@ pub struct AscDesc<'t> { facet_type: FacetType, ascending: bool, query_tree: Option, - candidates: RoaringBitmap, + candidates: Box> + 't>, bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, parent: Option>, @@ -107,7 +109,7 @@ impl<'t> AscDesc<'t> { facet_type, ascending, query_tree, - candidates, + candidates: facet_ordered(index, rtxn, field_id, facet_type, ascending, candidates)?, faceted_candidates, bucket_candidates: RoaringBitmap::new(), parent: None, @@ -134,7 +136,7 @@ impl<'t> AscDesc<'t> { facet_type, ascending, query_tree: None, - candidates: RoaringBitmap::new(), + candidates: Box::new(std::iter::empty()), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), @@ -145,23 +147,28 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { fn next(&mut self) -> anyhow::Result> { loop { - debug!("Facet {}({}) iteration ({:?})", - if self.ascending { "Asc" } else { "Desc" }, self.field_name, self.candidates, + debug!("Facet {}({}) iteration", + if self.ascending { "Asc" } else { "Desc" }, self.field_name ); - match &mut self.candidates { - candidates if candidates.is_empty() => { + match self.candidates.next().transpose()? { + None => { let query_tree = self.query_tree.take(); - let candidates = take(&mut self.candidates); let bucket_candidates = take(&mut self.bucket_candidates); - match self.parent.as_mut() { Some(parent) => { match parent.next()? 
{ Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { self.query_tree = query_tree; candidates.intersect_with(&self.faceted_candidates); - self.candidates = candidates; + self.candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.facet_type, + self.ascending, + candidates, + )?; self.bucket_candidates = bucket_candidates; }, None => return Ok(None), @@ -172,28 +179,21 @@ impl<'t> Criterion for AscDesc<'t> { }, } - return Ok(Some(CriterionResult { query_tree, candidates, bucket_candidates })); + return Ok(Some(CriterionResult { + query_tree, + candidates: RoaringBitmap::new(), + bucket_candidates, + })); }, - candidates => { + Some(candidates) => { let bucket_candidates = match self.parent { Some(_) => take(&mut self.bucket_candidates), None => candidates.clone(), }; - let found_candidates = facet_ordered( - self.index, - self.rtxn, - self.field_id, - self.facet_type, - self.ascending, - candidates.clone(), - )?; - - candidates.difference_with(&found_candidates); - return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), - candidates: found_candidates, + candidates, bucket_candidates, })); }, @@ -217,86 +217,98 @@ fn field_id_facet_type( Ok((id, *facet_type)) } -fn facet_ordered( - index: &Index, - rtxn: &heed::RoTxn, +/// Returns an iterator over groups of the given candidates in ascending or descending order. +/// +/// It will either use an iterative or a recursive method on the whole facet database depending +/// on the number of candidates to rank. +fn facet_ordered<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, field_id: FieldId, facet_type: FacetType, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result +) -> anyhow::Result> + 't>> { match facet_type { FacetType::Float => { if candidates.len() <= 1000 { - let db = index.field_id_docid_facet_values.remap_key_type::(); - let mut docids_values = Vec::with_capacity(candidates.len() as usize); - for docid in candidates.iter() { - let left = (field_id, docid, f64::MIN); - let right = (field_id, docid, f64::MAX); - let mut iter = db.range(rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? 
{ - docids_values.push((docid, OrderedFloat(value))); - } - } - docids_values.sort_unstable_by_key(|(_, value)| *value); - let iter = docids_values.into_iter(); - let iter = if ascending { - Box::new(iter) as Box> - } else { - Box::new(iter.rev()) - }; - match iter.group_by(|(_, v)| *v).into_iter().next() { - Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()), - None => Ok(RoaringBitmap::new()) - } + let iter = iterative_facet_ordered_iter::>( + index, rtxn, field_id, ascending, candidates, + )?; + Ok(Box::new(iter.map(Ok)) as Box>) } else { let facet_fn = if ascending { FacetIter::::new_reducing } else { FacetIter::::new_reverse_reducing }; - - let mut iter = facet_fn(rtxn, index, field_id, candidates)?; - Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default()) + let iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } }, FacetType::Integer => { if candidates.len() <= 1000 { - let db = index.field_id_docid_facet_values.remap_key_type::(); - let mut docids_values = Vec::with_capacity(candidates.len() as usize); - for docid in candidates.iter() { - let left = (field_id, docid, i64::MIN); - let right = (field_id, docid, i64::MAX); - let mut iter = db.range(rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, value)); - } - } - docids_values.sort_unstable_by_key(|(_, value)| *value); - let iter = docids_values.into_iter(); - let iter = if ascending { - Box::new(iter) as Box> - } else { - Box::new(iter.rev()) - }; - match iter.group_by(|(_, v)| *v).into_iter().next() { - Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()), - None => Ok(RoaringBitmap::new()) - } + let iter = iterative_facet_ordered_iter::( + index, rtxn, field_id, ascending, candidates, + )?; + Ok(Box::new(iter.map(Ok)) as Box>) } else { let facet_fn = if ascending { FacetIter::::new_reducing } else { FacetIter::::new_reverse_reducing }; - - let mut iter = facet_fn(rtxn, index, field_id, candidates)?; - Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default()) + let iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } }, FacetType::String => bail!("criteria facet type must be a number"), } } + +/// Fetch the facet values of the candidates one by one and order the candidates by those values. +/// +/// This function is fast when the number of candidates to rank is small. +fn iterative_facet_ordered_iter<'t, KC, T, U>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + ascending: bool, + candidates: RoaringBitmap, +) -> anyhow::Result + 't> +where + KC: BytesDecode<'t, DItem = (FieldId, u32, T)>, + KC: for<'a> BytesEncode<'a, EItem = (FieldId, u32, T)>, + T: Bounded, + U: From + Ord + Clone + 't, +{ + let db = index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, T::min_value()); + let right = (field_id, docid, T::max_value()); + let mut iter = db.range(rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? 
{ + docids_values.push((docid, U::from(value))); + } + } + docids_values.sort_unstable_by_key(|(_, v)| v.clone()); + let iter = docids_values.into_iter(); + let iter = if ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + + // The itertools GroupBy iterator doesn't provide an owned version, we are therefore + // required to collect the result into an owned collection (a Vec). + // https://github.com/rust-itertools/itertools/issues/499 + let vec: Vec<_> = iter.group_by(|(_, v)| v.clone()) + .into_iter() + .map(|(_, ids)| ids.map(|(id, _)| id).collect()) + .collect(); + + Ok(vec.into_iter()) +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b1119e221..aadd0b31a 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,12 +1,11 @@ use std::collections::HashMap; use std::borrow::Cow; -use anyhow::{bail, Context as _}; +use anyhow::bail; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::search::word_derivations; -use crate::{Index, FieldId}; +use crate::Index; use super::query_tree::{Operation, Query, QueryKind}; use self::typo::Typo; From 636a9df177e7a58029fcccd806f4f59c6d26cb45 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 3 Mar 2021 13:38:20 +0100 Subject: [PATCH 0554/1889] Temporarily fix the tinytemplate doc hidden issue --- Cargo.lock | 5 +++-- milli/Cargo.toml | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b7f479d2e..4a5254b57 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1288,6 +1288,7 @@ dependencies = [ "smallstr", "smallvec", "tempfile", + "tinytemplate", "uuid", ] @@ -2305,9 +2306,9 @@ dependencies = [ [[package]] name = "tinytemplate" -version = "1.2.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2ada8616fad06a2d0c455adc530de4ef57605a8120cc65da9653e0e9623ca74" +checksum = "6d3dc76004a03cec1c5932bca4cdc2e39aaa798e3f82363dd94f9adf6098c12f" dependencies = [ "serde", "serde_json", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 9f378f14c..67b3a1155 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -46,6 +46,10 @@ itertools = "0.10.0" # logging log = "0.4.14" +# We temporarily depend on this crate just to fix this issue +# https://github.com/bheisler/TinyTemplate/pull/17 +tinytemplate = "=1.1.0" + [dev-dependencies] criterion = "0.3.4" maplit = "1.0.2" From ae47bb359498eee4c0c7cf8b464b315e9713235a Mon Sep 17 00:00:00 2001 From: many Date: Wed, 3 Mar 2021 15:41:09 +0100 Subject: [PATCH 0555/1889] Introduce plane_sweep function in proximity criterion --- milli/src/search/criteria/mod.rs | 12 +- milli/src/search/criteria/proximity.rs | 178 ++++++++++++++++++++++++- 2 files changed, 188 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index aadd0b31a..856e9af9d 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -5,7 +5,7 @@ use anyhow::bail; use roaring::RoaringBitmap; use crate::search::word_derivations; -use crate::Index; +use crate::{DocumentId, Index}; use super::query_tree::{Operation, Query, QueryKind}; use self::typo::Typo; @@ -66,6 +66,7 @@ pub trait Context { fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; + fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result>; } pub struct 
CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -104,6 +105,11 @@ impl<'a> Context for CriteriaBuilder<'a> { fn in_prefix_cache(&self, word: &str) -> bool { self.words_prefixes_fst.contains(word) } + + fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result> { + let key = (docid, word); + self.index.docid_word_positions.get(self.rtxn, &key) + } } impl<'t> CriteriaBuilder<'t> { @@ -368,6 +374,10 @@ pub mod test { fn in_prefix_cache(&self, word: &str) -> bool { self.word_prefix_docids.contains_key(&word.to_string()) } + + fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result> { + todo!() + } } impl<'a> Default for TestContext<'a> { diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b192902c1..cea50c034 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,9 +1,10 @@ -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::mem::take; use roaring::RoaringBitmap; use log::debug; +use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; @@ -289,3 +290,178 @@ fn resolve_candidates<'t>( } Ok(candidates) } + +fn resolve_plane_sweep_candidates<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + allowed_candidates: &RoaringBitmap, +) -> anyhow::Result> +{ + /// FIXME may be buggy with query like "new new york" + fn plane_sweep<'t>( + ctx: &'t dyn Context, + operations: &[Operation], + docid: DocumentId, + consecutive: bool, + ) -> anyhow::Result> { + fn compute_groups_proximity(groups: &Vec<(usize, (Position, u8, Position))>, consecutive: bool) -> Option<(Position, u8, Position)> { + // take the inner proximity of the first group as initial + let mut proximity = groups.first()?.1.1; + let left_most_pos = groups.first()?.1.0; + let right_most_pos = groups.last()?.1.2; + + for pair in groups.windows(2) { + if let [(i1, (_, _, rpos1)), (i2, (lpos2, prox2, _))] = pair { + // if a pair overlaps, meaning that they share at least one word, we return None + if rpos1 >= lpos2 { return None } + // if the groups are in the right order (query order) we subtract 1 from the proximity + // the proximity is clamped to 7 + let pair_proximity = if i1 < i2 { + (*lpos2 - *rpos1 - 1).min(7) + } else { + (*lpos2 - *rpos1).min(7) + }; + + proximity += pair_proximity as u8 + prox2; + } + } + + // if groups should be consecutives, we will only accept groups with a proximity of 0 + if !consecutive || proximity == 0 { + Some((left_most_pos, proximity, right_most_pos)) + } else { None } + } + + let groups_len = operations.len(); + let mut groups_positions = Vec::with_capacity(groups_len); + + for operation in operations { + let positions = resolve_operation(ctx, operation, docid)?; + groups_positions.push(positions.into_iter()); + } + + // Pop top elements of each list. + let mut current = Vec::with_capacity(groups_len); + for (i, positions) in groups_positions.iter_mut().enumerate() { + match positions.next() { + Some(p) => current.push((i, p)), + // if a group returns None, it means that the document does not contain all the words, + // so we return an empty result. + None => return Ok(Vec::new()), + } + } + + // Sort k elements by their positions. + current.sort_unstable_by_key(|(_, p)| *p); + + // Find leftmost and rightmost group and their positions. 
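// Editor's note (not part of this patch): the loop below runs a plane sweep
// over one sorted position list per query group: keep one cursor per group,
// repeatedly advance the leftmost cursor, and emit the interval [l, r] when
// advancing could only make it worse. A reduced sketch of the same invariant
// for two lists of plain u32 positions, assuming nothing from milli:
//
//     fn min_window(mut a: &[u32], mut b: &[u32]) -> Option<(u32, u32)> {
//         let mut best: Option<(u32, u32)> = None;
//         while let (Some(&x), Some(&y)) = (a.first(), b.first()) {
//             let (l, r) = if x <= y { (x, y) } else { (y, x) };
//             if best.map_or(true, |(bl, br)| r - l < br - bl) {
//                 best = Some((l, r));
//             }
//             // only advancing the leftmost head can shrink the window
//             if x <= y { a = &a[1..] } else { b = &b[1..] }
//         }
//         best
//     }
//
// The code below generalizes this to k groups, tracks which group owns each
// cursor, and scores every emitted interval with compute_groups_proximity.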
+ let mut leftmost = *current.first().unwrap(); + let mut rightmost = *current.last().unwrap(); + + let mut output = Vec::new(); + loop { + // Find the position p of the next element of the leftmost group's list. + // If the list is empty, break the loop. + let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p)); + + // Let q be the position of the second group of the interval. + let q = current[1]; + + let mut leftmost_index = 0; + + // If p > r, then the interval [l, r] is minimal and + // we insert it into the heap according to its size. + if p.map_or(true, |p| p.1 > rightmost.1) { + leftmost_index = current[0].0; + if let Some(group) = compute_groups_proximity(&current, consecutive) { + output.push(group); + } + } + + // TODO not sure about breaking here or when the p list is found empty. + let p = match p { + Some(p) => p, + None => break, + }; + + // Remove the leftmost group P in the interval, + // and pop the same group from its list. + current[leftmost_index] = p; + + if p.1 > rightmost.1 { + // if [l, r] is minimal, let r = p and l = q. + rightmost = p; + leftmost = q; + } else { + // Otherwise, let l = min{p,q}. + leftmost = if p.1 < q.1 { p } else { q }; + } + + // Then update the interval and order of groups_positions in the interval. + current.sort_unstable_by_key(|(_, p)| *p); + } + + // Sort the list according to the size and the positions. + output.sort_unstable(); + + Ok(output) + } + + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + docid: DocumentId, + ) -> anyhow::Result> { + use Operation::{And, Consecutive, Or}; + + match query_tree { + And(ops) => plane_sweep(ctx, ops, docid, false), + Consecutive(ops) => plane_sweep(ctx, ops, docid, true), + Or(_, ops) => { + let mut result = Vec::new(); + for op in ops { + result.extend(resolve_operation(ctx, op, docid)?) + } + + result.sort_unstable(); + Ok(result) + }, + Operation::Query(Query {prefix, kind}) => { + let fst = ctx.words_fst(); + let words = match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + word_derivations(word, true, 0, fst)? + } else { + vec![(word.to_string(), 0)] + } + }, + QueryKind::Tolerant { typo, word } => { + word_derivations(word, *prefix, *typo, fst)? + } + }; + + let mut result = Vec::new(); + for (word, _) in words { + if let Some(positions) = ctx.docid_word_positions(docid, &word)? 
{ + let iter = positions.iter().map(|p| (p, 0, p)); + result.extend(iter); + } + } + + result.sort_unstable(); + Ok(result) + } + } + } + + let mut candidates = BTreeMap::new(); + for docid in allowed_candidates { + let positions = resolve_operation(ctx, query_tree, docid)?; + let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); + let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7); + candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid); + } + + Ok(candidates) +} From 2606c92ef92b6a1f85c0a9656acad19b42222ac5 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 4 Mar 2021 16:07:07 +0100 Subject: [PATCH 0556/1889] use plane sweep in proximity criterion --- milli/src/search/criteria/proximity.rs | 45 ++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index cea50c034..55a468c22 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; use roaring::RoaringBitmap; @@ -16,6 +16,7 @@ pub struct Proximity<'t> { bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + plane_sweep_cache: Option>, } impl<'t> Proximity<'t> { @@ -33,6 +34,7 @@ impl<'t> Proximity<'t> { bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), + plane_sweep_cache: None, } } @@ -45,6 +47,7 @@ impl<'t> Proximity<'t> { bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), + plane_sweep_cache: None, } } } @@ -69,15 +72,42 @@ impl<'t> Criterion for Proximity<'t> { }, (Some((max_prox, query_tree)), Allowed(candidates)) => { if self.proximity as usize > *max_prox { + // reset state to (None, Forbidden(_)) self.query_tree = None; self.candidates = Candidates::default(); } else { - let mut new_candidates = resolve_candidates( - self.ctx, - &query_tree, - self.proximity, - &mut self.candidates_cache, - )?; + let mut new_candidates = if candidates.len() <= 1000 { + if let Some(cache) = self.plane_sweep_cache.as_mut() { + match cache.next() { + Some((p, candidates)) => { + self.proximity = p; + candidates + }, + None => { + // reset state to (None, Forbidden(_)) + self.query_tree = None; + self.candidates = Candidates::default(); + continue + }, + } + } else { + let cache = resolve_plane_sweep_candidates( + self.ctx, + query_tree, + candidates + )?; + self.plane_sweep_cache = Some(cache.into_iter()); + + continue + } + } else { // use set theory based algorithm + resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + )? 
+ }; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); @@ -140,6 +170,7 @@ impl<'t> Criterion for Proximity<'t> { self.proximity = 0; self.candidates = Candidates::Allowed(candidates); self.bucket_candidates.union_with(&bucket_candidates); + self.plane_sweep_cache = None; }, None => return Ok(None), } From 5fcaedb880db82df76196bfb4363dbacf961be3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 5 Mar 2021 11:02:24 +0100 Subject: [PATCH 0557/1889] Introduce a WordDerivationsCache struct --- milli/src/search/criteria/asc_desc.rs | 7 +-- milli/src/search/criteria/fetcher.rs | 11 ++-- milli/src/search/criteria/mod.rs | 57 ++++++++++++-------- milli/src/search/criteria/proximity.rs | 75 ++++++++++++++++---------- milli/src/search/criteria/typo.rs | 75 ++++++++++++++++---------- milli/src/search/criteria/words.rs | 9 ++-- milli/src/search/mod.rs | 35 +++++++----- 7 files changed, 169 insertions(+), 100 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 9d675ab42..0aff60d3d 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -15,6 +15,7 @@ use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec} use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; use crate::{FieldsIdsMap, FieldId, Index}; use super::{Criterion, CriterionResult}; @@ -92,7 +93,7 @@ impl<'t> AscDesc<'t> { let candidates = match &query_tree { Some(qt) => { let context = CriteriaBuilder::new(rtxn, index)?; - let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new())?; + let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), &mut WordDerivationsCache::new())?; if let Some(candidates) = candidates { qt_candidates.intersect_with(&candidates); } @@ -145,7 +146,7 @@ impl<'t> AscDesc<'t> { } impl<'t> Criterion for AscDesc<'t> { - fn next(&mut self) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { loop { debug!("Facet {}({}) iteration", if self.ascending { "Asc" } else { "Desc" }, self.field_name @@ -157,7 +158,7 @@ impl<'t> Criterion for AscDesc<'t> { let bucket_candidates = take(&mut self.bucket_candidates); match self.parent.as_mut() { Some(parent) => { - match parent.next()? { + match parent.next(wdcache)? 
{ Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { self.query_tree = query_tree; candidates.intersect_with(&self.faceted_candidates); diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 38fee20d3..99c49e53e 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -5,6 +5,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; pub struct Fetcher<'t> { @@ -47,7 +48,7 @@ impl<'t> Fetcher<'t> { } impl<'t> Criterion for Fetcher<'t> { - fn next(&mut self) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", @@ -60,7 +61,7 @@ impl<'t> Criterion for Fetcher<'t> { let candidates = take(&mut self.candidates).into_inner(); let candidates = match &self.query_tree { Some(qt) if should_get_documents_ids => { - let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?; + let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; docids.intersect_with(&candidates); docids }, @@ -76,11 +77,11 @@ impl<'t> Criterion for Fetcher<'t> { Forbidden(_) => { match self.parent.as_mut() { Some(parent) => { - match parent.next()? { + match parent.next(wdcache)? { Some(result) => return Ok(Some(result)), None => if should_get_documents_ids { let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?, None => self.ctx.documents_ids()?, }; @@ -94,7 +95,7 @@ impl<'t> Criterion for Fetcher<'t> { }, None => if should_get_documents_ids { let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?, None => self.ctx.documents_ids()?, }; diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 856e9af9d..d70942c1c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -4,8 +4,8 @@ use std::borrow::Cow; use anyhow::bail; use roaring::RoaringBitmap; -use crate::search::word_derivations; -use crate::{DocumentId, Index}; +use crate::search::{word_derivations, WordDerivationsCache}; +use crate::{Index, DocumentId}; use super::query_tree::{Operation, Query, QueryKind}; use self::typo::Typo; @@ -21,7 +21,7 @@ pub mod proximity; pub mod fetcher; pub trait Criterion { - fn next(&mut self) -> anyhow::Result>; + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; } /// The result of a call to the parent criterion. 
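// Editor's note (not part of this patch): the WordDerivationsCache struct
// itself is defined in milli/src/search/mod.rs and does not appear in these
// hunks. Judging only from the call sites -- word_derivations(word, prefix,
// max_typo, fst, wdcache) handing back borrowed (String, u8) pairs, and the
// typo_cache already keyed by (String, bool, u8) -- a plausible minimal shape
// is a memo table such as:
//
//     pub struct WordDerivationsCache(HashMap<(String, bool, u8), Vec<(String, u8)>>);
//
//     impl WordDerivationsCache {
//         pub fn new() -> Self { Self(HashMap::new()) }
//     }
//
// keyed by (word, prefix flag, typo budget) so each derivation set is
// computed from the words FST at most once per search instead of once per
// criterion that needs it.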
@@ -164,12 +164,14 @@ pub fn resolve_query_tree<'t>( ctx: &'t dyn Context, query_tree: &Operation, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { use Operation::{And, Consecutive, Or, Query}; @@ -177,7 +179,7 @@ pub fn resolve_query_tree<'t>( match query_tree { And(ops) => { let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, cache) + resolve_operation(ctx, op, cache, wdcache) }).collect::>>()?; ops.sort_unstable_by_key(|cds| cds.len()); @@ -200,7 +202,7 @@ pub fn resolve_query_tree<'t>( for slice in ops.windows(2) { match (&slice[0], &slice[1]) { (Operation::Query(left), Operation::Query(right)) => { - match query_pair_proximity_docids(ctx, left, right, 1)? { + match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? { pair_docids if pair_docids.is_empty() => { return Ok(RoaringBitmap::new()) }, @@ -221,16 +223,16 @@ pub fn resolve_query_tree<'t>( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(ctx, op, cache)?; + let docids = resolve_operation(ctx, op, cache, wdcache)?; candidates.union_with(&docids); } Ok(candidates) }, - Query(q) => Ok(query_docids(ctx, q)?), + Query(q) => Ok(query_docids(ctx, q, wdcache)?), } } - resolve_operation(ctx, query_tree, cache) + resolve_operation(ctx, query_tree, cache, wdcache) } @@ -239,7 +241,8 @@ fn all_word_pair_proximity_docids, U: AsRef>( left_words: &[(T, u8)], right_words: &[(U, u8)], proximity: u8 -) -> anyhow::Result { +) -> anyhow::Result +{ let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { @@ -250,13 +253,18 @@ fn all_word_pair_proximity_docids, U: AsRef>( Ok(docids) } -fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result { +fn query_docids( + ctx: &dyn Context, + query: &Query, + wdcache: &mut WordDerivationsCache, +) -> anyhow::Result +{ match &query.kind { QueryKind::Exact { word, .. 
} => { if query.prefix && ctx.in_prefix_cache(&word) { Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default()) } else if query.prefix { - let words = word_derivations(&word, true, 0, ctx.words_fst())?; + let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); @@ -268,7 +276,7 @@ fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result { - let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst())?; + let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); @@ -279,10 +287,17 @@ fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result anyhow::Result { +fn query_pair_proximity_docids( + ctx: &dyn Context, + left: &Query, + right: &Query, + proximity: u8, + wdcache: &mut WordDerivationsCache, +) -> anyhow::Result +{ if proximity >= 8 { - let mut candidates = query_docids(ctx, left)?; - let right_candidates = query_docids(ctx, right)?; + let mut candidates = query_docids(ctx, left, wdcache)?; + let right_candidates = query_docids(ctx, right, wdcache)?; candidates.intersect_with(&right_candidates); return Ok(candidates); } @@ -293,14 +308,14 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p if prefix && ctx.in_prefix_cache(&right) { Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) } else if prefix { - let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; + let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } else { Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) } }, (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = word_derivations(&left, false, *typo, ctx.words_fst())?; + let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix && ctx.in_prefix_cache(&right) { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { @@ -309,19 +324,19 @@ fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, p } Ok(docids) } else if prefix { - let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; + let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } }, (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst())?; + let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) }, (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { - let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst())?; - let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst())?; + let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); + let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) }, } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 55a468c22..8a4892e35 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; @@ -6,6 +7,7 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; +use crate::search::WordDerivationsCache; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Proximity<'t> { @@ -53,7 +55,7 @@ impl<'t> Proximity<'t> { } impl<'t> Criterion for Proximity<'t> { - fn next(&mut self) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Proximity at iteration {} (max {:?}) ({:?})", @@ -94,7 +96,8 @@ impl<'t> Criterion for Proximity<'t> { let cache = resolve_plane_sweep_candidates( self.ctx, query_tree, - candidates + candidates, + wdcache, )?; self.plane_sweep_cache = Some(cache.into_iter()); @@ -106,6 +109,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, + wdcache, )? }; @@ -135,6 +139,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, + wdcache, )?; new_candidates.difference_with(&candidates); @@ -164,7 +169,7 @@ impl<'t> Criterion for Proximity<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next()? { + match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); self.proximity = 0; @@ -188,6 +193,7 @@ fn resolve_candidates<'t>( query_tree: &Operation, proximity: u8, cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { fn resolve_operation<'t>( @@ -195,27 +201,28 @@ fn resolve_candidates<'t>( query_tree: &Operation, proximity: u8, cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { use Operation::{And, Consecutive, Or, Query}; let result = match query_tree { - And(ops) => mdfs(ctx, ops, proximity, cache)?, + And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, Consecutive(ops) => if proximity == 0 { - mdfs(ctx, ops, 0, cache)? + mdfs(ctx, ops, 0, cache, wdcache)? 
} else { Default::default() }, Or(_, ops) => { let mut output = Vec::new(); for op in ops { - let result = resolve_operation(ctx, op, proximity, cache)?; + let result = resolve_operation(ctx, op, proximity, cache, wdcache)?; output.extend(result); } output }, Query(q) => if proximity == 0 { - let candidates = query_docids(ctx, q)?; + let candidates = query_docids(ctx, q, wdcache)?; vec![(q.clone(), q.clone(), candidates)] } else { Default::default() @@ -231,6 +238,7 @@ fn resolve_candidates<'t>( right: &Operation, proximity: u8, cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { @@ -245,13 +253,13 @@ fn resolve_candidates<'t>( for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) { let left_key = (left.clone(), left_p); if !cache.contains_key(&left_key) { - let candidates = resolve_operation(ctx, left, left_p, cache)?; + let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?; cache.insert(left_key.clone(), candidates); } let right_key = (right.clone(), right_p); if !cache.contains_key(&right_key) { - let candidates = resolve_operation(ctx, right, right_p, cache)?; + let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?; cache.insert(right_key.clone(), candidates); } @@ -260,7 +268,7 @@ fn resolve_candidates<'t>( for (ll, lr, lcandidates) in lefts { for (rl, rr, rcandidates) in rights { - let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1)?; + let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; if lcandidates.len() < rcandidates.len() { candidates.intersect_with(lcandidates); candidates.intersect_with(rcandidates); @@ -284,6 +292,7 @@ fn resolve_candidates<'t>( branches: &[Operation], proximity: u8, cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { // Extract the first two elements but gives the tail @@ -293,13 +302,13 @@ fn resolve_candidates<'t>( }); match next { - Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache), + Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache, wdcache), Some((head1, Some((head2, tail)))) => { let mut output = Vec::new(); for p in 0..=proximity { - for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache)? { + for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache, wdcache)? { if !head_candidates.is_empty() { - for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache)? { + for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? { candidates.intersect_with(&head_candidates); if !candidates.is_empty() { output.push((lhead.clone(), rtail, candidates)); @@ -310,13 +319,13 @@ fn resolve_candidates<'t>( } Ok(output) }, - Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache), + Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache), None => return Ok(Default::default()), } } let mut candidates = RoaringBitmap::new(); - for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache)? { + for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? 
{ candidates.union_with(&cds); } Ok(candidates) @@ -326,6 +335,7 @@ fn resolve_plane_sweep_candidates<'t>( ctx: &'t dyn Context, query_tree: &Operation, allowed_candidates: &RoaringBitmap, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { /// FIXME may be buggy with query like "new new york" @@ -334,8 +344,14 @@ fn resolve_plane_sweep_candidates<'t>( operations: &[Operation], docid: DocumentId, consecutive: bool, - ) -> anyhow::Result> { - fn compute_groups_proximity(groups: &Vec<(usize, (Position, u8, Position))>, consecutive: bool) -> Option<(Position, u8, Position)> { + wdcache: &mut WordDerivationsCache, + ) -> anyhow::Result> + { + fn compute_groups_proximity( + groups: &[(usize, (Position, u8, Position))], + consecutive: bool, + ) -> Option<(Position, u8, Position)> + { // take the inner proximity of the first group as initial let mut proximity = groups.first()?.1.1; let left_most_pos = groups.first()?.1.0; @@ -360,14 +376,16 @@ fn resolve_plane_sweep_candidates<'t>( // if groups should be consecutives, we will only accept groups with a proximity of 0 if !consecutive || proximity == 0 { Some((left_most_pos, proximity, right_most_pos)) - } else { None } + } else { + None + } } let groups_len = operations.len(); let mut groups_positions = Vec::with_capacity(groups_len); for operation in operations { - let positions = resolve_operation(ctx, operation, docid)?; + let positions = resolve_operation(ctx, operation, docid, wdcache)?; groups_positions.push(positions.into_iter()); } @@ -442,16 +460,17 @@ fn resolve_plane_sweep_candidates<'t>( ctx: &'t dyn Context, query_tree: &Operation, docid: DocumentId, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { use Operation::{And, Consecutive, Or}; match query_tree { - And(ops) => plane_sweep(ctx, ops, docid, false), - Consecutive(ops) => plane_sweep(ctx, ops, docid, true), + And(ops) => plane_sweep(ctx, ops, docid, false, wdcache), + Consecutive(ops) => plane_sweep(ctx, ops, docid, true, wdcache), Or(_, ops) => { let mut result = Vec::new(); for op in ops { - result.extend(resolve_operation(ctx, op, docid)?) + result.extend(resolve_operation(ctx, op, docid, wdcache)?) } result.sort_unstable(); @@ -462,19 +481,19 @@ fn resolve_plane_sweep_candidates<'t>( let words = match kind { QueryKind::Exact { word, .. } => { if *prefix { - word_derivations(word, true, 0, fst)? + Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?) } else { - vec![(word.to_string(), 0)] + Cow::Owned(vec![(word.to_string(), 0)]) } }, QueryKind::Tolerant { typo, word } => { - word_derivations(word, *prefix, *typo, fst)? + Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?) } }; let mut result = Vec::new(); - for (word, _) in words { - if let Some(positions) = ctx.docid_word_positions(docid, &word)? { + for (word, _) in words.as_ref() { + if let Some(positions) = ctx.docid_word_positions(docid, word)? 
{ let iter = positions.iter().map(|p| (p, 0, p)); result.extend(iter); } @@ -488,7 +507,7 @@ fn resolve_plane_sweep_candidates<'t>( let mut candidates = BTreeMap::new(); for docid in allowed_candidates { - let positions = resolve_operation(ctx, query_tree, docid)?; + let positions = resolve_operation(ctx, query_tree, docid, wdcache)?; let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7); candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid); diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a78ac3339..870dcf642 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -5,7 +5,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; -use crate::search::word_derivations; +use crate::search::{word_derivations, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Typo<'t> { @@ -53,7 +53,7 @@ impl<'t> Typo<'t> { } impl<'t> Criterion for Typo<'t> { - fn next(&mut self) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); @@ -73,15 +73,21 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)?; query_tree.clone() } else { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; + let mut new_candidates = resolve_candidates( + self.ctx, + &new_query_tree, + self.number_typos, + &mut self.candidates_cache, + wdcache, + )?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); self.number_typos += 1; @@ -105,15 +111,21 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)? 
} else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)?; query_tree.clone() } else { query_tree.clone() }; - let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; + let mut new_candidates = resolve_candidates( + self.ctx, + &new_query_tree, + self.number_typos, + &mut self.candidates_cache, + wdcache, + )?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; @@ -141,7 +153,7 @@ impl<'t> Criterion for Typo<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next()? { + match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; @@ -167,6 +179,7 @@ fn alterate_query_tree( mut query_tree: Operation, number_typos: u8, typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { fn recurse( @@ -174,13 +187,14 @@ fn alterate_query_tree( operation: &mut Operation, number_typos: u8, typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result<()> { use Operation::{And, Consecutive, Or}; match operation { And(ops) | Consecutive(ops) | Or(_, ops) => { - ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache)) + ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache, wdcache)) }, Operation::Query(q) => { // TODO may be optimized when number_typos == 0 @@ -198,7 +212,7 @@ fn alterate_query_tree( let words = if let Some(derivations) = typo_cache.get(&cache_key) { derivations.clone() } else { - let derivations = word_derivations(word, q.prefix, typo, words_fst)?; + let derivations = word_derivations(word, q.prefix, typo, words_fst, wdcache)?.to_vec(); typo_cache.insert(cache_key, derivations.clone()); derivations }; @@ -219,7 +233,7 @@ fn alterate_query_tree( } } - recurse(words_fst, &mut query_tree, number_typos, typo_cache)?; + recurse(words_fst, &mut query_tree, number_typos, typo_cache, wdcache)?; Ok(query_tree) } @@ -228,6 +242,7 @@ fn resolve_candidates<'t>( query_tree: &Operation, number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { fn resolve_operation<'t>( @@ -235,13 +250,14 @@ fn resolve_candidates<'t>( query_tree: &Operation, number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { use Operation::{And, Consecutive, Or, Query}; match query_tree { And(ops) => { - mdfs(ctx, ops, number_typos, cache) + mdfs(ctx, ops, number_typos, cache, wdcache) }, Consecutive(ops) => { let mut candidates = RoaringBitmap::new(); @@ -249,7 +265,7 @@ fn resolve_candidates<'t>( for slice in ops.windows(2) { match (&slice[0], &slice[1]) { (Operation::Query(left), Operation::Query(right)) => { - match query_pair_proximity_docids(ctx, left, right, 1)? { + match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? 
{ pair_docids if pair_docids.is_empty() => { return Ok(RoaringBitmap::new()) }, @@ -270,13 +286,13 @@ fn resolve_candidates<'t>( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(ctx, op, number_typos, cache)?; + let docids = resolve_operation(ctx, op, number_typos, cache, wdcache)?; candidates.union_with(&docids); } Ok(candidates) }, Query(q) => if q.kind.typo() == number_typos { - Ok(query_docids(ctx, q)?) + Ok(query_docids(ctx, q, wdcache)?) } else { Ok(RoaringBitmap::new()) }, @@ -288,6 +304,7 @@ fn resolve_candidates<'t>( branches: &[Operation], mana: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { match branches.split_first() { @@ -296,7 +313,7 @@ fn resolve_candidates<'t>( if let Some(candidates) = cache.get(&cache_key) { Ok(candidates.clone()) } else { - let candidates = resolve_operation(ctx, head, mana, cache)?; + let candidates = resolve_operation(ctx, head, mana, cache, wdcache)?; cache.insert(cache_key, candidates.clone()); Ok(candidates) } @@ -310,13 +327,13 @@ fn resolve_candidates<'t>( if let Some(candidates) = cache.get(&cache_key) { candidates.clone() } else { - let candidates = resolve_operation(ctx, head, m, cache)?; + let candidates = resolve_operation(ctx, head, m, cache, wdcache)?; cache.insert(cache_key, candidates.clone()); candidates } }; if !head_candidates.is_empty() { - let tail_candidates = mdfs(ctx, tail, mana - m, cache)?; + let tail_candidates = mdfs(ctx, tail, mana - m, cache, wdcache)?; head_candidates.intersect_with(&tail_candidates); candidates.union_with(&head_candidates); } @@ -328,7 +345,7 @@ fn resolve_candidates<'t>( } } - resolve_operation(ctx, query_tree, number_typos, cache) + resolve_operation(ctx, query_tree, number_typos, cache, wdcache) } #[cfg(test)] @@ -343,9 +360,10 @@ mod test { let query_tree = None; let facet_candidates = None; + let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, query_tree, facet_candidates); - assert!(criteria.next().unwrap().is_none()); + assert!(criteria.next(&mut wdcache).unwrap().is_none()); } #[test] @@ -361,6 +379,7 @@ mod test { let facet_candidates = None; +let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); let candidates_1 = context.word_docids("split").unwrap().unwrap() @@ -378,7 +397,7 @@ mod test { bucket_candidates: candidates_1, }; - assert_eq!(criteria.next().unwrap(), Some(expected_1)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -400,7 +419,7 @@ mod test { bucket_candidates: candidates_2, }; - assert_eq!(criteria.next().unwrap(), Some(expected_2)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); } #[test] @@ -409,6 +428,7 @@ mod test { let query_tree = None; let facet_candidates = context.word_docids("earth").unwrap().unwrap(); +let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); let expected = CriterionResult { @@ -418,10 +438,10 @@ mod test { }; // first iteration, returns the facet candidates - assert_eq!(criteria.next().unwrap(), Some(expected)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected)); // second iteration, returns None because there is no more things to do - assert!(criteria.next().unwrap().is_none()); + assert!(criteria.next(&mut 
wdcache).unwrap().is_none()); } #[test] @@ -437,6 +457,7 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); +let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); let candidates_1 = context.word_docids("split").unwrap().unwrap() @@ -454,7 +475,7 @@ mod test { bucket_candidates: candidates_1 & &facet_candidates, }; - assert_eq!(criteria.next().unwrap(), Some(expected_1)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -476,7 +497,7 @@ mod test { bucket_candidates: candidates_2 & &facet_candidates, }; - assert_eq!(criteria.next().unwrap(), Some(expected_2)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); } } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 1827cd1ed..33296fd07 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -5,6 +5,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; pub struct Words<'t> { @@ -46,7 +47,7 @@ impl<'t> Words<'t> { } impl<'t> Criterion for Words<'t> { - fn next(&mut self) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); @@ -61,7 +62,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Allowed(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -77,7 +78,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Forbidden(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; found_candidates.difference_with(&candidates); candidates.union_with(&found_candidates); @@ -103,7 +104,7 @@ impl<'t> Criterion for Words<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next()? { + match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); self.candidates = Candidates::Allowed(candidates); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 8570cefaa..34b3ffec9 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,5 +1,7 @@ use std::borrow::Cow; +use std::collections::hash_map::{HashMap, Entry}; use std::fmt; +use std::str::Utf8Error; use std::time::Instant; use fst::{IntoStreamer, Streamer, Set}; @@ -97,8 +99,9 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); + let mut words_derivations_cache = WordDerivationsCache::new(); let mut initial_candidates = RoaringBitmap::new(); - while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next()? { + while let Some(CriterionResult { candidates, bucket_candidates, .. 
}) = criteria.next(&mut words_derivations_cache)? { debug!("Number of candidates found {}", candidates.len()); @@ -145,24 +148,32 @@ pub struct SearchResult { pub documents_ids: Vec, } -pub fn word_derivations( +pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>; + +pub fn word_derivations<'c>( word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set>, -) -> anyhow::Result> + cache: &'c mut WordDerivationsCache, +) -> Result<&'c [(String, u8)], Utf8Error> { - let mut derived_words = Vec::new(); - let dfa = build_dfa(word, max_typo, is_prefix); - let mut stream = fst.search_with_state(&dfa).into_stream(); + match cache.entry((word.to_string(), is_prefix, max_typo)) { + Entry::Occupied(entry) => Ok(entry.into_mut()), + Entry::Vacant(entry) => { + let mut derived_words = Vec::new(); + let dfa = build_dfa(word, max_typo, is_prefix); + let mut stream = fst.search_with_state(&dfa).into_stream(); - while let Some((word, state)) = stream.next() { - let word = std::str::from_utf8(word)?; - let distance = dfa.distance(state); - derived_words.push((word.to_string(), distance.to_u8())); + while let Some((word, state)) = stream.next() { + let word = std::str::from_utf8(word)?; + let distance = dfa.distance(state); + derived_words.push((word.to_string(), distance.to_u8())); + } + + Ok(entry.insert(derived_words)) + }, } - - Ok(derived_words) } pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { From 82a0f678fbde5d7e0360aab75116f173a5448d20 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 8 Mar 2021 16:12:03 +0100 Subject: [PATCH 0558/1889] Introduce a cache on the docid_word_positions database method --- milli/src/search/criteria/proximity.rs | 72 +++++++++++++++++++------- milli/src/search/criteria/typo.rs | 6 +-- 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 8a4892e35..82b7185a3 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap, btree_map}; +use std::collections::btree_map::{self, BTreeMap}; +use std::collections::hash_map::{HashMap, Entry}; use std::mem::take; use roaring::RoaringBitmap; @@ -331,19 +332,21 @@ fn resolve_candidates<'t>( Ok(candidates) } -fn resolve_plane_sweep_candidates<'t>( - ctx: &'t dyn Context, +fn resolve_plane_sweep_candidates( + ctx: &dyn Context, query_tree: &Operation, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { /// FIXME may be buggy with query like "new new york" - fn plane_sweep<'t>( - ctx: &'t dyn Context, - operations: &[Operation], + fn plane_sweep<'a>( + ctx: &dyn Context, + operations: &'a [Operation], docid: DocumentId, consecutive: bool, + rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, + dwpcache: &mut HashMap>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -385,7 +388,7 @@ fn resolve_plane_sweep_candidates<'t>( let mut groups_positions = Vec::with_capacity(groups_len); for operation in operations { - let positions = resolve_operation(ctx, operation, docid, wdcache)?; + let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?; groups_positions.push(positions.into_iter()); } @@ -456,25 +459,32 @@ fn resolve_plane_sweep_candidates<'t>( Ok(output) } - fn resolve_operation<'t>( - ctx: &'t dyn Context, - query_tree: &Operation, + fn resolve_operation<'a>( + ctx: &dyn Context, + 
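
The `word_derivations` rewrite above is a plain entry-or-compute memoization keyed on the `(word, is_prefix, max_typo)` triple, returning a borrow into the cache so callers never clone the derivation list. A minimal self-contained sketch of the same pattern; the `derive_words` stub is hypothetical and stands in for the real fst/DFA search:

```rust
use std::collections::hash_map::{Entry, HashMap};

// Same cache shape as in the patch: one entry per (word, is_prefix, max_typo).
type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>;

// Hypothetical stand-in for the Levenshtein DFA walk over the words fst.
fn derive_words(word: &str, _is_prefix: bool, _max_typo: u8) -> Vec<(String, u8)> {
    vec![(word.to_string(), 0)]
}

// Compute on a vacant entry, borrow on an occupied one, mirroring the
// `Entry::Occupied`/`Entry::Vacant` split in the patched `word_derivations`.
fn word_derivations_cached<'c>(
    word: &str,
    is_prefix: bool,
    max_typo: u8,
    cache: &'c mut WordDerivationsCache,
) -> &'c [(String, u8)] {
    match cache.entry((word.to_string(), is_prefix, max_typo)) {
        Entry::Occupied(entry) => entry.into_mut(),
        Entry::Vacant(entry) => entry.insert(derive_words(word, is_prefix, max_typo)),
    }
}

fn main() {
    let mut cache = WordDerivationsCache::new();
    let first = word_derivations_cached("hello", false, 1, &mut cache).to_vec();
    let second = word_derivations_cached("hello", false, 1, &mut cache).to_vec();
    assert_eq!(first, second); // the second call is answered from the cache
}
```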
query_tree: &'a Operation, docid: DocumentId, + rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, + dwpcache: &mut HashMap>, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> { + ) -> anyhow::Result> + { use Operation::{And, Consecutive, Or}; - match query_tree { - And(ops) => plane_sweep(ctx, ops, docid, false, wdcache), - Consecutive(ops) => plane_sweep(ctx, ops, docid, true, wdcache), + if let Some(result) = rocache.get(query_tree) { + return Ok(result.clone()); + } + + let result = match query_tree { + And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?, + Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?, Or(_, ops) => { let mut result = Vec::new(); for op in ops { - result.extend(resolve_operation(ctx, op, docid, wdcache)?) + result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?) } result.sort_unstable(); - Ok(result) + result }, Operation::Query(Query {prefix, kind}) => { let fst = ctx.words_fst(); @@ -493,21 +503,43 @@ fn resolve_plane_sweep_candidates<'t>( let mut result = Vec::new(); for (word, _) in words.as_ref() { - if let Some(positions) = ctx.docid_word_positions(docid, word)? { + let positions = match dwpcache.entry(word.to_string()) { + Entry::Occupied(entry) => entry.into_mut(), + Entry::Vacant(entry) => { + let positions = ctx.docid_word_positions(docid, word)?; + entry.insert(positions) + } + }; + + if let Some(positions) = positions { let iter = positions.iter().map(|p| (p, 0, p)); result.extend(iter); } } result.sort_unstable(); - Ok(result) + result } - } + }; + + rocache.insert(query_tree, result.clone()); + Ok(result) } + let mut word_positions_cache = HashMap::new(); + let mut resolve_operation_cache = HashMap::new(); let mut candidates = BTreeMap::new(); for docid in allowed_candidates { - let positions = resolve_operation(ctx, query_tree, docid, wdcache)?; + word_positions_cache.clear(); + resolve_operation_cache.clear(); + let positions = resolve_operation( + ctx, + query_tree, + docid, + &mut resolve_operation_cache, + &mut word_positions_cache, + wdcache, + )?; let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7); candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid); diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 870dcf642..e598637f1 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -379,7 +379,7 @@ mod test { let facet_candidates = None; -let mut wdcache = WordDerivationsCache::new(); + let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); let candidates_1 = context.word_docids("split").unwrap().unwrap() @@ -428,7 +428,7 @@ let mut wdcache = WordDerivationsCache::new(); let query_tree = None; let facet_candidates = context.word_docids("earth").unwrap().unwrap(); -let mut wdcache = WordDerivationsCache::new(); + let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); let expected = CriterionResult { @@ -457,7 +457,7 @@ let mut wdcache = WordDerivationsCache::new(); let facet_candidates = context.word_docids("earth").unwrap().unwrap(); -let mut wdcache = WordDerivationsCache::new(); + let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, 
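
Both maps introduced above (`resolve_operation_cache` and `word_positions_cache`) are per-document scratch space: they are allocated once outside the loop and `clear()`ed for each docid, so their backing storage is reused instead of being reallocated on every iteration. A rough sketch of that reuse pattern, with a hypothetical `fetch_positions` standing in for `ctx.docid_word_positions`:

```rust
use std::collections::HashMap;

// Hypothetical stand-in for the LMDB lookup `ctx.docid_word_positions(docid, word)`.
fn fetch_positions(_docid: u32, _word: &str) -> Vec<u32> {
    vec![1, 2, 3]
}

fn score_documents(docids: &[u32]) -> Vec<(u32, u64)> {
    // Allocated once; the map's capacity survives across iterations.
    let mut positions_cache: HashMap<String, Vec<u32>> = HashMap::new();
    let mut scores = Vec::with_capacity(docids.len());

    for &docid in docids {
        // Cached values are only valid for a single document, so flush them
        // here, mirroring the two `clear()` calls in the patch.
        positions_cache.clear();

        let positions = positions_cache
            .entry("word".to_string())
            .or_insert_with(|| fetch_positions(docid, "word"));
        scores.push((docid, positions.len() as u64));
    }

    scores
}

fn main() {
    assert_eq!(score_documents(&[1, 2, 3]).len(), 3);
}
```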
Some(query_tree), Some(facet_candidates.clone())); let candidates_1 = context.word_docids("split").unwrap().unwrap() From b18ec00a7ac3c227743be2ac455260ca71eed7f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 6 Mar 2021 11:28:22 +0100 Subject: [PATCH 0559/1889] Add a logging_timer macro to te criterion next methods --- Cargo.lock | 156 +++++++++++++++++-------- milli/Cargo.toml | 1 + milli/src/search/criteria/asc_desc.rs | 1 + milli/src/search/criteria/fetcher.rs | 1 + milli/src/search/criteria/proximity.rs | 1 + milli/src/search/criteria/typo.rs | 1 + milli/src/search/criteria/words.rs | 1 + 7 files changed, 113 insertions(+), 49 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4a5254b57..930ace50f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -53,8 +53,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522" dependencies = [ "askama_shared", - "proc-macro2", - "syn", + "proc-macro2 1.0.24", + "syn 1.0.60", ] [[package]] @@ -74,10 +74,10 @@ dependencies = [ "nom", "num-traits", "percent-encoding", - "proc-macro2", - "quote", + "proc-macro2 1.0.24", + "quote 1.0.9", "serde", - "syn", + "syn 1.0.60", "toml", ] @@ -622,9 +622,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" dependencies = [ "proc-macro-hack", - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", ] [[package]] @@ -1184,6 +1184,28 @@ dependencies = [ "cfg-if 1.0.0", ] +[[package]] +name = "logging_timer" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40d0c249955c17c2f8f86b5f501b16d2509ebbe775f7b1d1d2b1ba85ade2a793" +dependencies = [ + "log", + "logging_timer_proc_macros", +] + +[[package]] +name = "logging_timer_proc_macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "482c2c28e6bcfe7c4274f82f701774d755e6aa873edfd619460fcd0966e0eb07" +dependencies = [ + "log", + "proc-macro2 0.4.30", + "quote 0.6.13", + "syn 0.15.44", +] + [[package]] name = "loom" version = "0.4.0" @@ -1269,6 +1291,7 @@ dependencies = [ "levenshtein_automata", "linked-hash-map", "log", + "logging_timer", "maplit", "meilisearch-tokenizer", "memmap", @@ -1554,9 +1577,9 @@ checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" dependencies = [ "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "pest_meta", - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", ] [[package]] @@ -1632,9 +1655,9 @@ version = "0.4.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "65ad2ae56b6abe3a1ee25f15ee605bacadb9a764edaba9c2bf4103800d4a1895" dependencies = [ - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", ] [[package]] @@ -1643,9 +1666,9 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "758669ae3558c6f74bd2a18b41f7ac0b5a195aea6639d6a9b5e5d1ad5ba24c0b" dependencies = [ - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", ] [[package]] @@ -1713,9 +1736,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn", + 
"proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", "version_check", ] @@ -1725,8 +1748,8 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ - "proc-macro2", - "quote", + "proc-macro2 1.0.24", + "quote 1.0.9", "version_check", ] @@ -1742,13 +1765,22 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" +[[package]] +name = "proc-macro2" +version = "0.4.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" +dependencies = [ + "unicode-xid 0.1.0", +] + [[package]] name = "proc-macro2" version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" dependencies = [ - "unicode-xid", + "unicode-xid 0.2.1", ] [[package]] @@ -1757,13 +1789,22 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quote" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" +dependencies = [ + "proc-macro2 0.4.30", +] + [[package]] name = "quote" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" dependencies = [ - "proc-macro2", + "proc-macro2 1.0.24", ] [[package]] @@ -2047,9 +2088,9 @@ version = "1.0.123" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" dependencies = [ - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", ] [[package]] @@ -2199,9 +2240,20 @@ checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90" dependencies = [ "heck", "proc-macro-error", - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", +] + +[[package]] +name = "syn" +version = "0.15.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ca4b3b69a77cbe1ffc9e198781b7acb0c7365a883670e8f1c1bc66fba79a5c5" +dependencies = [ + "proc-macro2 0.4.30", + "quote 0.6.13", + "unicode-xid 0.1.0", ] [[package]] @@ -2210,9 +2262,9 @@ version = "1.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081" dependencies = [ - "proc-macro2", - "quote", - "unicode-xid", + "proc-macro2 1.0.24", + "quote 1.0.9", + "unicode-xid 0.2.1", ] [[package]] @@ -2230,10 +2282,10 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" dependencies = [ - "proc-macro2", - "quote", - "syn", - "unicode-xid", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", + "unicode-xid 0.2.1", ] [[package]] @@ -2359,9 +2411,9 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" dependencies = [ - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 
1.0.60", ] [[package]] @@ -2522,6 +2574,12 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" + [[package]] name = "unicode-xid" version = "0.2.1" @@ -2653,9 +2711,9 @@ dependencies = [ "bumpalo", "lazy_static", "log", - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", "wasm-bindgen-shared", ] @@ -2665,7 +2723,7 @@ version = "0.2.70" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b8853882eef39593ad4174dd26fc9865a64e84026d223f63bb2c42affcbba2c" dependencies = [ - "quote", + "quote 1.0.9", "wasm-bindgen-macro-support", ] @@ -2675,9 +2733,9 @@ version = "0.2.70" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4133b5e7f2a531fa413b3a1695e925038a05a71cf67e87dafa295cb645a01385" dependencies = [ - "proc-macro2", - "quote", - "syn", + "proc-macro2 1.0.24", + "quote 1.0.9", + "syn 1.0.60", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -2782,8 +2840,8 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" dependencies = [ - "proc-macro2", - "syn", + "proc-macro2 1.0.24", + "syn 1.0.60", "synstructure", ] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 67b3a1155..2eb40dc94 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -45,6 +45,7 @@ itertools = "0.10.0" # logging log = "0.4.14" +logging_timer = "1.0.0" # We temporarily depend on this crate just to fix this issue # https://github.com/bheisler/TinyTemplate/pull/17 diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 0aff60d3d..29fe26d7e 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -146,6 +146,7 @@ impl<'t> AscDesc<'t> { } impl<'t> Criterion for AscDesc<'t> { + #[logging_timer::time("AscDesc::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { loop { debug!("Facet {}({}) iteration", diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 99c49e53e..094efe75e 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -48,6 +48,7 @@ impl<'t> Fetcher<'t> { } impl<'t> Criterion for Fetcher<'t> { + #[logging_timer::time("Fetcher::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 82b7185a3..259c3a1cf 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -56,6 +56,7 @@ impl<'t> Proximity<'t> { } impl<'t> Criterion for Proximity<'t> { + #[logging_timer::time("Proximity::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index e598637f1..4cc0015da 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -53,6 +53,7 @@ impl<'t> Typo<'t> { } impl<'t> Criterion for Typo<'t> { + #[logging_timer::time("Typo::{}")] fn next(&mut 
self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 33296fd07..d94fd0c53 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -47,6 +47,7 @@ impl<'t> Words<'t> { } impl<'t> Criterion for Words<'t> { + #[logging_timer::time("Words::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { From d781a6164a36ed831270a87ab5bc1fe0d1a34e93 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 8 Mar 2021 16:27:52 +0100 Subject: [PATCH 0560/1889] Rewrite some code with idiomatic Rust --- milli/src/search/criteria/proximity.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 259c3a1cf..5b14f699c 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -357,9 +357,9 @@ fn resolve_plane_sweep_candidates( ) -> Option<(Position, u8, Position)> { // take the inner proximity of the first group as initial - let mut proximity = groups.first()?.1.1; - let left_most_pos = groups.first()?.1.0; - let right_most_pos = groups.last()?.1.2; + let (_, (_, mut proximity, _)) = groups.first()?; + let (_, (left_most_pos, _, _)) = groups.first()?; + let (_, (_, _, right_most_pos)) = groups.last()?; for pair in groups.windows(2) { if let [(i1, (_, _, rpos1)), (i2, (lpos2, prox2, _))] = pair { @@ -379,7 +379,7 @@ fn resolve_plane_sweep_candidates( // if groups should be consecutives, we will only accept groups with a proximity of 0 if !consecutive || proximity == 0 { - Some((left_most_pos, proximity, right_most_pos)) + Some((*left_most_pos, proximity, *right_most_pos)) } else { None } From bd63da0a0edad945644fbd874e22e22509b3e5ae Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 28 Dec 2020 16:46:16 +0100 Subject: [PATCH 0561/1889] Add missing databases to the infos subcommand --- infos/src/main.rs | 84 +++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 51 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 0d2b7abb5..9f16c7c0e 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -19,6 +19,8 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; +const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids"; +const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const DOCUMENTS_DB_NAME: &str = "documents"; @@ -28,6 +30,8 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PREFIX_DOCIDS_DB_NAME, DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, + FACET_FIELD_ID_VALUE_DOCIDS_NAME, + FIELD_ID_DOCID_FACET_VALUES_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -116,9 +120,6 @@ enum Command { field_name: String, }, - /// Outputs the total size of all the docid-word-positions keys and values. - TotalDocidWordPositionsSize, - /// Outputs the average number of *different* words by document. 
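
`#[logging_timer::time]` wraps the annotated function and reports its wall-clock duration through the `log` facade, which is why every `Criterion::next` above now gets timed with a one-line attribute. A hedged usage sketch; the `env_logger` backend, the debug log level, and the exact output format are assumptions, any `log` implementation works:

```rust
// Assumed Cargo.toml dependencies: logging_timer = "1.0.0", log = "0.4", env_logger = "0.8".
use std::thread::sleep;
use std::time::Duration;

// The "{}" in the pattern expands to the function name, as in the
// `#[logging_timer::time("Typo::{}")]` annotations added by the patch.
#[logging_timer::time("Example::{}")]
fn next_bucket(iteration: u32) -> Option<u32> {
    sleep(Duration::from_millis(10)); // pretend to do some criterion work
    if iteration < 3 { Some(iteration + 1) } else { None }
}

fn main() {
    // Enable debug-level logs so the timer output is visible.
    env_logger::Builder::new().filter_level(log::LevelFilter::Debug).init();

    let mut i = 0;
    while let Some(next) = next_bucket(i) {
        i = next; // each call logs a line with its duration
    }
}
```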
AverageNumberOfWordsByDoc, @@ -132,10 +133,10 @@ enum Command { database: String, }, - /// Outputs the size in bytes of the specified database. + /// Outputs the size in bytes of the specified databases names. SizeOfDatabase { #[structopt(possible_values = ALL_DATABASE_NAMES)] - database: String, + databases: Vec, }, /// Outputs a CSV with the proximities for the two specidied words and @@ -209,12 +210,11 @@ fn main() -> anyhow::Result<()> { facet_values_docids(&index, &rtxn, !full_display, field_name) }, FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), - TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { average_number_of_positions_by_word(&index, &rtxn) }, - SizeOfDatabase { database } => size_of_database(&index, &rtxn, &database), + SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases), DatabaseStats { database } => database_stats(&index, &rtxn, &database), WordPairProximitiesDocids { full_display, word1, word2 } => { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) @@ -620,28 +620,6 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - Ok(()) } -fn total_docid_word_positions_size(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { - use heed::types::ByteSlice; - - let mut total_key_size = 0; - let mut total_val_size = 0; - let mut count = 0; - - let iter = index.docid_word_positions.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)?; - for result in iter { - let (key, val) = result?; - total_key_size += key.len(); - total_val_size += val.len(); - count += 1; - } - - println!("number of keys: {}", count); - println!("total key size: {}", total_key_size); - println!("total value size: {}", total_val_size); - - Ok(()) -} - fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use heed::types::DecodeIgnore; use milli::{DocumentId, BEU32StrCodec}; @@ -703,33 +681,37 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any Ok(()) } -fn size_of_database(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> { +fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { use heed::types::ByteSlice; - let database = match name { - MAIN_DB_NAME => &index.main, - WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), - WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), - DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - DOCUMENTS_DB_NAME => index.documents.as_polymorph(), - unknown => anyhow::bail!("unknown database {:?}", unknown), - }; + for name in names { + let database = match name.as_str() { + MAIN_DB_NAME => &index.main, + WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), + WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), + DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), + WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(), + 
FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(), + DOCUMENTS_DB_NAME => index.documents.as_polymorph(), + unknown => anyhow::bail!("unknown database {:?}", unknown), + }; - let mut key_size: u64 = 0; - let mut val_size: u64 = 0; - for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? { - let (k, v) = result?; - key_size += k.len() as u64; - val_size += v.len() as u64; + let mut key_size: u64 = 0; + let mut val_size: u64 = 0; + for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? { + let (k, v) = result?; + key_size += k.len() as u64; + val_size += v.len() as u64; + } + + println!("The {} database weigh:", name); + println!("\ttotal key size: {} bytes", key_size); + println!("\ttotal val size: {} bytes", val_size); + println!("\ttotal size: {} bytes", key_size + val_size); } - println!("The {} database weigh:", name); - println!("\ttotal key size: {} bytes", key_size); - println!("\ttotal val size: {} bytes", val_size); - println!("\ttotal size: {} bytes", key_size + val_size); - Ok(()) } From 3d02b19fbd9a88f51c9358e583afcda9ba9e3eef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 5 Mar 2021 16:13:21 +0100 Subject: [PATCH 0562/1889] Introduce the docids-words-positions subcommand to the infos crate --- infos/src/main.rs | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index 9f16c7c0e..c966a0143 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -114,6 +114,16 @@ enum Command { field_name: String, }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. + DocidsWordsPositions { + /// Display the whole positions in detail. + #[structopt(long)] + full_display: bool, + + /// If defined, only retrieve the documents that corresponds to these internal ids. + internal_documents_ids: Vec, + }, + /// Outputs some facets statistics for the given facet name. FacetStats { /// The field name in the document. @@ -209,6 +219,9 @@ fn main() -> anyhow::Result<()> { FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) }, + DocidsWordsPositions { full_display, internal_documents_ids } => { + docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) + }, FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { @@ -525,6 +538,39 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam Ok(wtr.flush()?) } +fn docids_words_positions( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + internal_ids: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["document_id", "word", "positions"])?; + + let iter: Box> = if internal_ids.is_empty() { + Box::new(index.docid_word_positions.iter(rtxn)?) + } else { + let vec: heed::Result> = internal_ids.into_iter().map(|id| { + index.docid_word_positions.prefix_iter(rtxn, &(id, "")) + }).collect(); + Box::new(vec?.into_iter().flatten()) + }; + + for result in iter { + let ((id, word), positions) = result?; + let positions = if debug { + format!("{:?}", positions) + } else { + format!("{:?}", positions.iter().collect::>()) + }; + wtr.write_record(&[&id.to_string(), word, &positions])?; + } + + Ok(wtr.flush()?) 
+} + fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; From 18844d60b569268d7eef3d76a5533823a65e858d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 5 Mar 2021 16:37:18 +0100 Subject: [PATCH 0563/1889] Simplify the output of database sizes in the infos crate --- infos/src/main.rs | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index c966a0143..376679656 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -30,9 +30,9 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PREFIX_DOCIDS_DB_NAME, DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, FACET_FIELD_ID_VALUE_DOCIDS_NAME, FIELD_ID_DOCID_FACET_VALUES_NAME, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -145,6 +145,8 @@ enum Command { /// Outputs the size in bytes of the specified databases names. SizeOfDatabase { + /// The name of the database to measure the size of, if not specified it's equivalent + /// to specifying all the databases names. #[structopt(possible_values = ALL_DATABASE_NAMES)] databases: Vec, }, @@ -730,6 +732,12 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { use heed::types::ByteSlice; + let names = if names.is_empty() { + ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect() + } else { + names + }; + for name in names { let database = match name.as_str() { MAIN_DB_NAME => &index.main, @@ -753,9 +761,9 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a } println!("The {} database weigh:", name); - println!("\ttotal key size: {} bytes", key_size); - println!("\ttotal val size: {} bytes", val_size); - println!("\ttotal size: {} bytes", key_size + val_size); + println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); + println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); + println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); } Ok(()) @@ -810,9 +818,9 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu println!("\tminimum: {}", minimum); println!("\tmaximum: {}", maximum); println!("\taverage: {}", sum as f64 / count as f64); - println!("\ttotal key size: {} bytes", key_size); - println!("\ttotal val size: {} bytes", val_size); - println!("\ttotal size: {} bytes", key_size + val_size); + println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); + println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); + println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); Ok(()) } From f2043441020391cbc6804805702b6fed083cecf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 8 Mar 2021 18:54:06 +0100 Subject: [PATCH 0564/1889] Update the LICENSE file to match the year 2021 --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 17a0f0781..70d8ffeb7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2020 Clément Renault +Copyright (c) 2021 Clément Renault Permission is hereby granted, free of charge, to any person obtaining a copy of this 
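
The `Byte::from(size).get_appropriate_unit(true)` calls above come from the byte-unit crate: `true` asks for binary units (KiB, MiB, GiB) rather than decimal ones. A small sketch of the difference with the raw byte counts printed before this patch; the exact rendered string is an assumption:

```rust
use byte_unit::Byte;

fn main() {
    let key_size: u64 = 123_456_789;

    // What the infos crate printed before this patch:
    println!("total key size: {} bytes", key_size);

    // What it prints after; roughly "117.74 MiB" with binary units.
    println!("total key size: {}", Byte::from(key_size).get_appropriate_unit(true));
}
```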
software and associated documentation files (the "Software"), to deal From 2f9af6a707fbe741ca2d6a70ea66b385b3768d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 8 Mar 2021 18:56:22 +0100 Subject: [PATCH 0565/1889] Fix the README.md bash example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0194cb237..3b7d8e264 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ All of that on a 39$/month machine with 4cores. You can feed the engine with your CSV (comma-seperated, yes) data like this: ```bash -cat "name,age\nhello,32\nkiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv +echo "name,age\nhello,32\nkiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv ``` Here ids will be automatically generated as UUID v4 if they doesn't exist in some or every documents. From f51eb46c699c1a1b6d913b59d7d388c0c45cbf78 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 9 Mar 2021 10:24:27 +0100 Subject: [PATCH 0566/1889] Use the RoaringBitmapLenCodec to retrieve the count of documents --- milli/src/index.rs | 11 ++++++----- milli/src/update/clear_documents.rs | 2 +- milli/src/update/delete_documents.rs | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 7b83d69fc..c0a00080e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -118,6 +118,12 @@ impl Index { Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?.unwrap_or_default()) } + /// Returns the number of documents indexed in the database. + pub fn number_of_documents(&self, rtxn: &RoTxn) -> anyhow::Result { + let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, DOCUMENTS_IDS_KEY)?; + Ok(count.unwrap_or_default()) + } + /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. @@ -380,11 +386,6 @@ impl Index { Ok(documents) } - /// Returns the number of documents indexed in the database. - pub fn number_of_documents(&self, rtxn: &RoTxn) -> anyhow::Result { - Ok(self.documents_ids(rtxn).map(|docids| docids.len() as usize)?) - } - pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { FacetDistribution::new(rtxn, self) } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 1523a95b2..82e35d703 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -17,7 +17,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { ClearDocuments { wtxn, index, _update_id: update_id } } - pub fn execute(self) -> anyhow::Result { + pub fn execute(self) -> anyhow::Result { let Index { env: _env, main: _main, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 5430bb3af..d1007376a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -51,7 +51,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { Some(docid) } - pub fn execute(self) -> anyhow::Result { + pub fn execute(self) -> anyhow::Result { // We retrieve the current documents ids that are in the database.
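
For context on the `RoaringBitmapLenCodec` change just above: the removed `number_of_documents` deserialized the entire documents bitmap only to call `len()` on it, while the new codec reads the cardinality straight out of the serialized roaring header. A runnable illustration of the cost being avoided, using the roaring crate directly since the codec itself is milli-internal:

```rust
use roaring::RoaringBitmap;

fn main() {
    let mut docids = RoaringBitmap::new();
    for id in 0u32..1_000_000 {
        docids.insert(id);
    }

    // Serialize the bitmap as it would be stored under DOCUMENTS_IDS_KEY.
    let mut bytes = Vec::new();
    docids.serialize_into(&mut bytes).unwrap();

    // The old code path: materialize the whole bitmap, then count.
    let decoded = RoaringBitmap::deserialize_from(&bytes[..]).unwrap();
    assert_eq!(decoded.len(), 1_000_000);

    // `RoaringBitmapLenCodec` instead sums the per-container cardinalities
    // stored in the header and never rebuilds the bitmap itself.
}
```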
let mut documents_ids = self.index.documents_ids(self.wtxn)?; @@ -308,7 +308,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); - Ok(self.documents_ids.len() as usize) + Ok(self.documents_ids.len()) } } From 62a70c300d35f569186a05bfe7b91338113a5d9f Mon Sep 17 00:00:00 2001 From: many Date: Tue, 9 Mar 2021 12:04:52 +0100 Subject: [PATCH 0567/1889] Optimize words criterion --- milli/src/search/criteria/asc_desc.rs | 20 +++++++++--- milli/src/search/criteria/fetcher.rs | 42 +++++++++++++++++++------- milli/src/search/criteria/mod.rs | 17 ++++++----- milli/src/search/criteria/proximity.rs | 16 +++++++--- milli/src/search/criteria/typo.rs | 20 ++++++------ milli/src/search/criteria/words.rs | 37 ++++++++++------------- milli/src/search/mod.rs | 5 ++- 7 files changed, 95 insertions(+), 62 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 29fe26d7e..5b2ec32e8 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -160,9 +160,21 @@ impl<'t> Criterion for AscDesc<'t> { match self.parent.as_mut() { Some(parent) => { match parent.next(wdcache)? { - Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; - candidates.intersect_with(&self.faceted_candidates); + let candidates = match (&self.query_tree, candidates) { + (_, Some(mut candidates)) => { + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (None, None) => take(&mut self.faceted_candidates), + }; self.candidates = facet_ordered( self.index, self.rtxn, @@ -183,7 +195,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree, - candidates: RoaringBitmap::new(), + candidates: Some(RoaringBitmap::new()), bucket_candidates, })); }, @@ -195,7 +207,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), - candidates, + candidates: Some(candidates), bucket_candidates, })); }, diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 094efe75e..723b5a13a 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -8,12 +8,24 @@ use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; +/// The result of a call to the parent criterion. +#[derive(Debug, Clone, PartialEq)] +pub struct FetcherResult { + /// The query tree that must be used by the children criterion to fetch candidates. + pub query_tree: Option, + /// The candidates that this criterion is allowed to return subsets of. + pub candidates: RoaringBitmap, + /// Candidates that comes from the current bucket of the initial criterion. 
+ pub bucket_candidates: RoaringBitmap, +} + pub struct Fetcher<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Candidates, parent: Option>, should_get_documents_ids: bool, + wdcache: WordDerivationsCache, } impl<'t> Fetcher<'t> { @@ -29,6 +41,7 @@ impl<'t> Fetcher<'t> { candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), parent: None, should_get_documents_ids: true, + wdcache: WordDerivationsCache::new(), } } @@ -43,13 +56,12 @@ impl<'t> Fetcher<'t> { candidates: Candidates::default(), parent: Some(parent), should_get_documents_ids: true, + wdcache: WordDerivationsCache::new(), } } -} -impl<'t> Criterion for Fetcher<'t> { #[logging_timer::time("Fetcher::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + pub fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", @@ -62,14 +74,14 @@ impl<'t> Criterion for Fetcher<'t> { let candidates = take(&mut self.candidates).into_inner(); let candidates = match &self.query_tree { Some(qt) if should_get_documents_ids => { - let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; + let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?; docids.intersect_with(&candidates); docids }, _ => candidates, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.take(), candidates: candidates.clone(), bucket_candidates: candidates, @@ -78,15 +90,23 @@ impl<'t> Criterion for Fetcher<'t> { Forbidden(_) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { - Some(result) => return Ok(Some(result)), + match parent.next(&mut self.wdcache)? 
{ + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, + (None, None) => RoaringBitmap::new(), + }; + + return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates })) + }, None => if should_get_documents_ids { let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?, + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, None => self.ctx.documents_ids()?, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), bucket_candidates: candidates, @@ -96,11 +116,11 @@ impl<'t> Criterion for Fetcher<'t> { }, None => if should_get_documents_ids { let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?, + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, None => self.ctx.documents_ids()?, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), bucket_candidates: candidates, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d70942c1c..b2fd7803d 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,10 +14,10 @@ use self::asc_desc::AscDesc; use self::proximity::Proximity; use self::fetcher::Fetcher; -pub mod typo; -pub mod words; -pub mod asc_desc; -pub mod proximity; +mod typo; +mod words; +mod asc_desc; +mod proximity; pub mod fetcher; pub trait Criterion { @@ -28,11 +28,12 @@ pub trait Criterion { #[derive(Debug, Clone, PartialEq)] pub struct CriterionResult { /// The query tree that must be used by the children criterion to fetch candidates. - pub query_tree: Option, - /// The candidates that this criterion is allowed to return subsets of. - pub candidates: RoaringBitmap, + query_tree: Option, + /// The candidates that this criterion is allowed to return subsets of, + /// if None, it is up to the child to compute the candidates itself. + candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. 
- pub bucket_candidates: RoaringBitmap, + bucket_candidates: RoaringBitmap, } /// Either a set of candidates that defines the candidates diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 5b14f699c..cb4fd257b 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -9,7 +9,7 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::WordDerivationsCache; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; pub struct Proximity<'t> { ctx: &'t dyn Context, @@ -70,7 +70,7 @@ impl<'t> Criterion for Proximity<'t> { (_, Allowed(candidates)) if candidates.is_empty() => { return Ok(Some(CriterionResult { query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: take(&mut self.candidates).into_inner(), + candidates: Some(take(&mut self.candidates).into_inner()), bucket_candidates: take(&mut self.bucket_candidates), })); }, @@ -126,7 +126,7 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -155,7 +155,7 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -164,7 +164,7 @@ impl<'t> Criterion for Proximity<'t> { let candidates = take(&mut self.candidates).into_inner(); return Ok(Some(CriterionResult { query_tree: None, - candidates: candidates.clone(), + candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, @@ -173,6 +173,12 @@ impl<'t> Criterion for Proximity<'t> { Some(parent) => { match parent.next(wdcache)? 
{ Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (None, None) => RoaringBitmap::new(), + }; + self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); self.proximity = 0; self.candidates = Candidates::Allowed(candidates); diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 4cc0015da..8bead4661 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -63,7 +63,7 @@ impl<'t> Criterion for Typo<'t> { (_, Allowed(candidates)) if candidates.is_empty() => { return Ok(Some(CriterionResult { query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: take(&mut self.candidates).into_inner(), + candidates: Some(take(&mut self.candidates).into_inner()), bucket_candidates: take(&mut self.bucket_candidates), })); }, @@ -100,7 +100,7 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -138,7 +138,7 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -147,7 +147,7 @@ impl<'t> Criterion for Typo<'t> { let candidates = take(&mut self.candidates).into_inner(); return Ok(Some(CriterionResult { query_tree: None, - candidates: candidates.clone(), + candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, @@ -158,7 +158,7 @@ impl<'t> Criterion for Typo<'t> { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; - self.candidates = Candidates::Allowed(candidates); + self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), @@ -394,7 +394,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), ]), ])), - candidates: candidates_1.clone(), + candidates: Some(candidates_1.clone()), bucket_candidates: candidates_1, }; @@ -416,7 +416,7 @@ mod test { ]), ]), ])), - candidates: candidates_2.clone(), + candidates: Some(candidates_2.clone()), bucket_candidates: candidates_2, }; @@ -434,7 +434,7 @@ mod test { let expected = CriterionResult { query_tree: None, - candidates: facet_candidates.clone(), + candidates: Some(facet_candidates.clone()), bucket_candidates: facet_candidates, }; @@ -472,7 +472,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), ]), ])), - candidates: &candidates_1 & &facet_candidates, + candidates: Some(&candidates_1 & &facet_candidates), bucket_candidates: candidates_1 & &facet_candidates, }; @@ -494,7 +494,7 @@ mod test { ]), ]), ])), - candidates: &candidates_2 & &facet_candidates, + candidates: Some(&candidates_2 & &facet_candidates), bucket_candidates: candidates_2 & &facet_candidates, }; diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index d94fd0c53..8774eed7c 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -6,12 +6,12 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; use 
crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; +use super::{resolve_query_tree, Criterion, CriterionResult, Context}; pub struct Words<'t> { ctx: &'t dyn Context, query_trees: Vec, - candidates: Candidates, + candidates: Option, bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, @@ -27,7 +27,7 @@ impl<'t> Words<'t> { Words { ctx, query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + candidates, bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::default(), @@ -38,7 +38,7 @@ impl<'t> Words<'t> { Words { ctx, query_trees: Vec::default(), - candidates: Candidates::default(), + candidates: None, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::default(), @@ -49,20 +49,19 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); match (self.query_trees.pop(), &mut self.candidates) { - (query_tree, Allowed(candidates)) if candidates.is_empty() => { + (query_tree, Some(candidates)) if candidates.is_empty() => { self.query_trees = Vec::new(); return Ok(Some(CriterionResult { query_tree, - candidates: take(&mut self.candidates).into_inner(), + candidates: self.candidates.take(), bucket_candidates: take(&mut self.bucket_candidates), })); }, - (Some(qt), Allowed(candidates)) => { + (Some(qt), Some(candidates)) => { let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -74,41 +73,37 @@ impl<'t> Criterion for Words<'t> { return Ok(Some(CriterionResult { query_tree: Some(qt), - candidates: found_candidates, + candidates: Some(found_candidates), bucket_candidates, })); }, - (Some(qt), Forbidden(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; - found_candidates.difference_with(&candidates); - candidates.union_with(&found_candidates); - + (Some(qt), None) => { let bucket_candidates = match self.parent { Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), + None => RoaringBitmap::new(), }; return Ok(Some(CriterionResult { query_tree: Some(qt), - candidates: found_candidates, + candidates: None, bucket_candidates, })); }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); + (None, Some(_)) => { + let candidates = self.candidates.take(); return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: candidates, + bucket_candidates: candidates.unwrap_or_default(), })); }, - (None, Forbidden(_)) => { + (None, None) => { match self.parent.as_mut() { Some(parent) => { match parent.next(wdcache)? 
{ Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = Candidates::Allowed(candidates); + self.candidates = candidates; self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 34b3ffec9..7475ef473 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,7 +11,7 @@ use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::search::criteria::{Criterion, CriterionResult}; +use crate::search::criteria::fetcher::FetcherResult; use crate::{Index, DocumentId}; pub use self::facet::FacetIter; @@ -99,9 +99,8 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); - let mut words_derivations_cache = WordDerivationsCache::new(); let mut initial_candidates = RoaringBitmap::new(); - while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next(&mut words_derivations_cache)? { + while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { debug!("Number of candidates found {}", candidates.len()); From 42fd7dea78aa1903da70afb365f5fbcacb653699 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 9 Mar 2021 15:18:30 +0100 Subject: [PATCH 0568/1889] Remove the useless typo cache --- milli/src/search/criteria/typo.rs | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 8bead4661..5acc7a048 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -16,7 +16,6 @@ pub struct Typo<'t> { bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, - typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, } impl<'t> Typo<'t> { @@ -34,7 +33,6 @@ impl<'t> Typo<'t> { bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), - typo_cache: HashMap::new(), } } @@ -47,7 +45,6 @@ impl<'t> Typo<'t> { bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), - typo_cache: HashMap::new(), } } } @@ -74,9 +71,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -112,9 +109,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? 
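The common thread in the typo and words diffs above is a representation change: the old `Candidates::Allowed`/`Candidates::Forbidden` enum becomes a plain `Option<RoaringBitmap>`. Note that angle-bracketed type parameters were stripped from this listing, which is why fields read as `candidates: Option,`; the intended type is `Option<RoaringBitmap>`. `Some(set)` means the criterion may only return documents from `set`, while `None` means no restriction has been computed yet. A minimal, runnable sketch of the hand-off semantics follows; the function name and simplified control flow are illustrative, not milli's:

```rust
use roaring::RoaringBitmap;

/// `Some(set)`: only documents in `set` may still be returned.
/// `None`: no candidate restriction exists yet (the old `Forbidden` case).
fn next_bucket(candidates: &mut Option<RoaringBitmap>) -> Option<RoaringBitmap> {
    match candidates {
        // An empty allowed set means this criterion is exhausted
        // (the real criterion returns one final, empty result here).
        Some(set) if set.is_empty() => None,
        // Hand the remaining set to the caller and leave `None` behind,
        // exactly like the `self.candidates.take()` calls above.
        Some(_) => candidates.take(),
        None => None,
    }
}

fn main() {
    let mut candidates: Option<RoaringBitmap> = Some((1u32..=3).collect());
    assert_eq!(next_bucket(&mut candidates).map(|set| set.len()), Some(3));
    assert!(candidates.is_none()); // the restriction was consumed
    assert_eq!(next_bucket(&mut candidates), None);
}
```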
} else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -179,7 +176,6 @@ fn alterate_query_tree( words_fst: &fst::Set>, mut query_tree: Operation, number_typos: u8, - typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { @@ -187,7 +183,6 @@ fn alterate_query_tree( words_fst: &fst::Set>, operation: &mut Operation, number_typos: u8, - typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result<()> { @@ -195,7 +190,7 @@ fn alterate_query_tree( match operation { And(ops) | Consecutive(ops) | Or(_, ops) => { - ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache, wdcache)) + ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) }, Operation::Query(q) => { // TODO may be optimized when number_typos == 0 @@ -209,19 +204,11 @@ fn alterate_query_tree( }); } else { let typo = *typo.min(&number_typos); - let cache_key = (word.clone(), q.prefix, typo); - let words = if let Some(derivations) = typo_cache.get(&cache_key) { - derivations.clone() - } else { - let derivations = word_derivations(word, q.prefix, typo, words_fst, wdcache)?.to_vec(); - typo_cache.insert(cache_key, derivations.clone()); - derivations - }; - + let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; let queries = words.into_iter().map(|(word, typo)| { Operation::Query(Query { prefix: false, - kind: QueryKind::Exact { original_typo: typo, word }, + kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() }, }) }).collect(); @@ -234,7 +221,7 @@ fn alterate_query_tree( } } - recurse(words_fst, &mut query_tree, number_typos, typo_cache, wdcache)?; + recurse(words_fst, &mut query_tree, number_typos, wdcache)?; Ok(query_tree) } From facfb4b6151482628e82c90f36a19f97d3ff764a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 9 Mar 2021 15:55:59 +0100 Subject: [PATCH 0569/1889] Fix the bucket candidates --- milli/src/search/criteria/asc_desc.rs | 6 +++++- milli/src/search/criteria/proximity.rs | 7 ++++++- milli/src/search/criteria/typo.rs | 8 ++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 5b2ec32e8..6b8afad2c 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -175,6 +175,11 @@ impl<'t> Criterion for AscDesc<'t> { }, (None, None) => take(&mut self.faceted_candidates), }; + if bucket_candidates.is_empty() { + self.bucket_candidates.union_with(&candidates); + } else { + self.bucket_candidates.union_with(&bucket_candidates); + } self.candidates = facet_ordered( self.index, self.rtxn, @@ -183,7 +188,6 @@ impl<'t> Criterion for AscDesc<'t> { self.ascending, candidates, )?; - self.bucket_candidates = bucket_candidates; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index cb4fd257b..e5f010177 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -179,10 +179,15 @@ impl<'t> Criterion for Proximity<'t> { (None, None) => RoaringBitmap::new(), }; + if bucket_candidates.is_empty() { + self.bucket_candidates.union_with(&candidates); + } 
else { + self.bucket_candidates.union_with(&bucket_candidates); + } + self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); self.proximity = 0; self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates.union_with(&bucket_candidates); self.plane_sweep_cache = None; }, None => return Ok(None), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 5acc7a048..b17b7561b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -127,16 +127,12 @@ impl<'t> Criterion for Typo<'t> { new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; - - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; + self.bucket_candidates.union_with(&new_candidates); return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, From d301859bbd1c445040abd1bb1b833abcffc71e7c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 9 Mar 2021 17:48:05 +0100 Subject: [PATCH 0570/1889] Introduce a special word_derivations function for Proximity --- milli/src/search/criteria/mod.rs | 14 +++-- milli/src/search/criteria/proximity.rs | 73 ++++++++++++++------------ 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b2fd7803d..22f081871 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -67,7 +67,7 @@ pub trait Context { fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; - fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result>; + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -107,9 +107,13 @@ impl<'a> Context for CriteriaBuilder<'a> { self.words_prefixes_fst.contains(word) } - fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result> { - let key = (docid, word); - self.index.docid_word_positions.get(self.rtxn, &key) + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + let mut words_positions = HashMap::new(); + for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? 
{ + let ((_, word), positions) = result?; + words_positions.insert(word.to_string(), positions); + } + Ok(words_positions) } } @@ -391,7 +395,7 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result> { + fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result> { todo!() } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index e5f010177..b62eb8cfd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,14 +1,13 @@ -use std::borrow::Cow; use std::collections::btree_map::{self, BTreeMap}; -use std::collections::hash_map::{HashMap, Entry}; +use std::collections::hash_map::HashMap; use std::mem::take; use roaring::RoaringBitmap; use log::debug; -use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; +use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; -use crate::search::WordDerivationsCache; +use crate::search::{build_dfa, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; pub struct Proximity<'t> { @@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates( docid: DocumentId, consecutive: bool, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, - dwpcache: &mut HashMap>, + words_positions: &HashMap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates( let mut groups_positions = Vec::with_capacity(groups_len); for operation in operations { - let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?; + let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; groups_positions.push(positions.into_iter()); } @@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates( query_tree: &'a Operation, docid: DocumentId, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, - dwpcache: &mut HashMap>, + words_positions: &HashMap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates( } let result = match query_tree { - And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?, - Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?, + And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?, + Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?, Or(_, ops) => { let mut result = Vec::new(); for op in ops { - result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?) + result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?) } result.sort_unstable(); result }, - Operation::Query(Query {prefix, kind}) => { - let fst = ctx.words_fst(); - let words = match kind { + Operation::Query(Query { prefix, kind }) => { + let mut result = Vec::new(); + match kind { QueryKind::Exact { word, .. } => { if *prefix { - Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?) 
+ let iter = word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); + result.extend(iter); } else { - Cow::Owned(vec![(word.to_string(), 0)]) + if let Some(positions) = words_positions.get(word) { + result.extend(positions.iter().map(|p| (p, 0, p))); + } } }, QueryKind::Tolerant { typo, word } => { - Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?) - } - }; - - let mut result = Vec::new(); - for (word, _) in words.as_ref() { - let positions = match dwpcache.entry(word.to_string()) { - Entry::Occupied(entry) => entry.into_mut(), - Entry::Vacant(entry) => { - let positions = ctx.docid_word_positions(docid, word)?; - entry.insert(positions) - } - }; - - if let Some(positions) = positions { - let iter = positions.iter().map(|p| (p, 0, p)); + let iter = word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } } @@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates( Ok(result) } - let mut word_positions_cache = HashMap::new(); + fn word_derivations<'a>( + word: &str, + is_prefix: bool, + max_typo: u8, + words_positions: &'a HashMap, + ) -> impl Iterator + { + let dfa = build_dfa(word, max_typo, is_prefix); + words_positions.iter().filter_map(move |(document_word, positions)| { + use levenshtein_automata::Distance; + match dfa.eval(document_word) { + Distance::Exact(_) => Some(positions), + Distance::AtLeast(_) => None, + } + }) + } + let mut resolve_operation_cache = HashMap::new(); let mut candidates = BTreeMap::new(); for docid in allowed_candidates { - word_positions_cache.clear(); + let words_positions = ctx.docid_words_positions(docid)?; resolve_operation_cache.clear(); let positions = resolve_operation( ctx, query_tree, docid, &mut resolve_operation_cache, - &mut word_positions_cache, + &words_positions, wdcache, )?; let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); From 54b97ed8e1241072cd5e68e14267f61d96b8d5bf Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Mar 2021 10:56:26 +0100 Subject: [PATCH 0571/1889] Update the fetcher comments --- milli/src/search/criteria/fetcher.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 723b5a13a..fa204bdf2 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -8,12 +8,12 @@ use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; -/// The result of a call to the parent criterion. +/// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] pub struct FetcherResult { - /// The query tree that must be used by the children criterion to fetch candidates. + /// The query tree corresponding to the current bucket of the last criterion. pub query_tree: Option, - /// The candidates that this criterion is allowed to return subsets of. + /// The candidates of the current bucket of the last criterion. pub candidates: RoaringBitmap, /// Candidates that comes from the current bucket of the initial criterion. 
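The motivation for this patch is to stop issuing one LMDB lookup per `(document, word)` pair: `docid_words_positions` now fetches every word of a document in a single prefix iteration, and a local `word_derivations` matches the query word against that in-memory map with a Levenshtein automaton, where `Distance::Exact` means the word is within the allowed edit distance and `Distance::AtLeast` means it is not. Below is a self-contained sketch; milli's own `build_dfa` memoizes its automaton builders, which the simplified version here does not:

```rust
use std::collections::HashMap;

use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
use roaring::RoaringBitmap;

// Simplified stand-in for milli's `build_dfa`: rebuilds a builder per call
// for brevity, where the real helper caches them (construction is costly).
fn build_dfa(word: &str, max_typo: u8, is_prefix: bool) -> DFA {
    // `true`: a transposition counts as a single edit.
    let builder = LevenshteinAutomatonBuilder::new(max_typo, true);
    if is_prefix {
        builder.build_prefix_dfa(word)
    } else {
        builder.build_dfa(word)
    }
}

/// Yields the positions of every in-document word that is a derivation of
/// `word`, i.e. within `max_typo` edits according to the automaton.
fn word_derivations<'a>(
    word: &str,
    is_prefix: bool,
    max_typo: u8,
    words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap> {
    let dfa = build_dfa(word, max_typo, is_prefix);
    words_positions.iter().filter_map(move |(document_word, positions)| {
        match dfa.eval(document_word) {
            Distance::Exact(_) => Some(positions),
            Distance::AtLeast(_) => None,
        }
    })
}

fn main() {
    let mut words_positions = HashMap::new();
    words_positions.insert("hello".to_string(), (0u32..1).collect());
    words_positions.insert("hallo".to_string(), (5u32..6).collect());
    words_positions.insert("world".to_string(), (9u32..10).collect());

    // One allowed typo: "hello" matches itself and "hallo", but not "world".
    assert_eq!(word_derivations("hello", false, 1, &words_positions).count(), 2);
}
```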
pub bucket_candidates: RoaringBitmap, From d48008339e33ae615dc467e54e1eb290517117de Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Mar 2021 11:16:30 +0100 Subject: [PATCH 0572/1889] Introduce two new optional_words and authorize_typos Search options --- milli/src/search/mod.rs | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7475ef473..ce5a6bc88 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -33,13 +33,24 @@ pub struct Search<'a> { facet_condition: Option, offset: usize, limit: usize, + optional_words: bool, + authorize_typos: bool, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } impl<'a> Search<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { - Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index } + Search { + query: None, + facet_condition: None, + offset: 0, + limit: 20, + optional_words: true, + authorize_typos: true, + rtxn, + index, + } } pub fn query(&mut self, query: impl Into) -> &mut Search<'a> { @@ -57,6 +68,16 @@ impl<'a> Search<'a> { self } + pub fn optional_words(&mut self, value: bool) -> &mut Search<'a> { + self.optional_words = value; + self + } + + pub fn authorize_typos(&mut self, value: bool) -> &mut Search<'a> { + self.authorize_typos = value; + self + } + pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { self.facet_condition = Some(condition); self @@ -67,7 +88,9 @@ impl<'a> Search<'a> { let before = Instant::now(); let query_tree = match self.query.as_ref() { Some(query) => { - let builder = QueryTreeBuilder::new(self.rtxn, self.index); + let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); + builder.optional_words(self.optional_words); + builder.authorize_typos(self.authorize_typos); let stop_words = &Set::default(); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); let result = analyzer.analyze(query); @@ -129,12 +152,23 @@ impl<'a> Search<'a> { impl fmt::Debug for Search<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let Search { query, facet_condition, offset, limit, rtxn: _, index: _ } = self; + let Search { + query, + facet_condition, + offset, + limit, + optional_words, + authorize_typos, + rtxn: _, + index: _, + } = self; f.debug_struct("Search") .field("query", query) .field("facet_condition", facet_condition) .field("offset", offset) .field("limit", limit) + .field("optional_words", optional_words) + .field("authorize_typos", authorize_typos) .finish() } } From 0cc3132f5a6faf0cf427d3c1f3ab24b1845886cc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 14:44:47 +0100 Subject: [PATCH 0573/1889] Rename master into main in the Github CI --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e2487f707..c51430384 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,9 +2,9 @@ name: Continuous integration on: push: - branches: [ master ] + branches: [ main ] pull_request: - branches: [ master ] + branches: [ main ] jobs: ci: From 80d0f9c49d16ee94c6ac10471977d8617e9f884d Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 11 Mar 2021 18:32:04 +0100 Subject: [PATCH 0574/1889] methods to update index time metadata --- Cargo.lock | 2 ++ milli/Cargo.toml | 1 + milli/src/index.rs | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 37 
insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 930ace50f..0f21f2a83 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -302,6 +302,7 @@ dependencies = [ "libc", "num-integer", "num-traits", + "serde", "time", "winapi 0.3.9", ] @@ -1277,6 +1278,7 @@ dependencies = [ "anyhow", "bstr", "byteorder", + "chrono", "criterion", "crossbeam-channel", "csv", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2eb40dc94..b63e34b32 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" anyhow = "1.0.38" bstr = "0.2.15" byteorder = "1.4.2" +chrono = { version = "0.4.19", features = ["serde"] } crossbeam-channel = "0.5.0" csv = "1.1.5" either = "1.6.1" diff --git a/milli/src/index.rs b/milli/src/index.rs index c0a00080e..c7a855e1f 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -6,6 +6,7 @@ use anyhow::Context; use heed::types::*; use heed::{PolyDatabase, Database, RwTxn, RoTxn}; use roaring::RoaringBitmap; +use chrono::{Utc, DateTime}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; @@ -28,6 +29,8 @@ pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; +const CREATED_AT_KEY: &str = "created-at"; +const UPDATED_AT_KEY: &str ="updated-at"; #[derive(Clone)] pub struct Index { @@ -68,6 +71,17 @@ impl Index { let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; + { + let mut txn = env.write_txn()?; + // The db was just created, we update its metadata with the relevant information. + if main.get::<_, Str, SerdeJson>>(&txn, CREATED_AT_KEY)?.is_none() { + let now = Utc::now(); + main.put::<_, Str, SerdeJson>>(&mut txn, UPDATED_AT_KEY, &now)?; + main.put::<_, Str, SerdeJson>>(&mut txn, CREATED_AT_KEY, &now)?; + txn.commit()?; + } + } + Ok(Index { env, main, @@ -393,4 +407,24 @@ impl Index { pub fn search<'a>(&'a self, rtxn: &'a RoTxn) -> Search<'a> { Search::new(rtxn, self) } + + /// Returns the index creation time. + pub fn created_at(&self, rtxn: &RoTxn) -> heed::Result> { + let time = self.main + .get::<_, Str, SerdeJson>>(rtxn, CREATED_AT_KEY)? + .expect("Index without creation time"); + Ok(time) + } + + /// Returns the index creation time. + pub fn updated_at(&self, rtxn: &RoTxn) -> heed::Result> { + let time = self.main + .get::<_, Str, SerdeJson>>(rtxn, UPDATED_AT_KEY)? 
+ .expect("Index without update time"); + Ok(time) + } + + pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) + } } From 615fe095e160e46fe51306a3a90f6c635abbef47 Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 11 Mar 2021 18:42:21 +0100 Subject: [PATCH 0575/1889] update index updated at on index writes --- milli/src/index.rs | 4 ++-- milli/src/update/clear_documents.rs | 2 ++ milli/src/update/delete_documents.rs | 2 ++ milli/src/update/facets.rs | 2 ++ milli/src/update/index_documents/mod.rs | 2 ++ milli/src/update/settings.rs | 2 ++ milli/src/update/words_prefixes.rs | 2 ++ 7 files changed, 14 insertions(+), 2 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index c7a855e1f..cf31b54a8 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -30,7 +30,7 @@ pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; -const UPDATED_AT_KEY: &str ="updated-at"; +const UPDATED_AT_KEY: &str = "updated-at"; #[derive(Clone)] pub struct Index { @@ -416,7 +416,7 @@ impl Index { Ok(time) } - /// Returns the index creation time. + /// Returns the index last updated time. pub fn updated_at(&self, rtxn: &RoTxn) -> heed::Result> { let time = self.main .get::<_, Str, SerdeJson>>(rtxn, UPDATED_AT_KEY)? diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 82e35d703..5ae3680d3 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,3 +1,4 @@ +use chrono::Utc; use roaring::RoaringBitmap; use crate::{ExternalDocumentsIds, Index}; @@ -18,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { } pub fn execute(self) -> anyhow::Result { + self.index.set_updated_at(self.wtxn, &Utc::now())?; let Index { env: _env, main: _main, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index d1007376a..0b112ceb1 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,4 +1,5 @@ use anyhow::anyhow; +use chrono::Utc; use fst::IntoStreamer; use heed::types::ByteSlice; use roaring::RoaringBitmap; @@ -52,6 +53,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } pub fn execute(self) -> anyhow::Result { + self.index.set_updated_at(self.wtxn, &Utc::now())?; // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index bac5f3c86..62da5af7e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -2,6 +2,7 @@ use std::cmp; use std::fs::File; use std::num::NonZeroUsize; +use chrono::Utc; use grenad::{CompressionType, Reader, Writer, FileFuse}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; @@ -57,6 +58,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { } pub fn execute(self) -> anyhow::Result<()> { + self.index.set_updated_at(self.wtxn, &Utc::now())?; // We get the faceted fields to be able to create the facet levels. 
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d55f421dc..ccbd95c7f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -8,6 +8,7 @@ use std::time::Instant; use anyhow::Context; use bstr::ByteSlice as _; +use chrono::Utc; use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; use heed::types::ByteSlice; use log::{debug, info, error}; @@ -316,6 +317,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { R: io::Read, F: Fn(UpdateIndexingStep, u64) + Sync, { + self.index.set_updated_at(self.wtxn, &Utc::now())?; let before_transform = Instant::now(); let update_id = self.update_id; let progress_callback = |step| progress_callback(step, update_id); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index fd91d3468..7ce8b98c1 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -2,6 +2,7 @@ use std::collections::HashMap; use std::str::FromStr; use anyhow::Context; +use chrono::Utc; use grenad::CompressionType; use itertools::Itertools; use rayon::ThreadPool; @@ -249,6 +250,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { where F: Fn(UpdateIndexingStep, u64) + Sync { + self.index.set_updated_at(self.wtxn, &Utc::now())?; let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; self.update_displayed()?; let facets_updated = self.update_facets()?; diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs index 70b82b217..f2fe526a2 100644 --- a/milli/src/update/words_prefixes.rs +++ b/milli/src/update/words_prefixes.rs @@ -1,6 +1,7 @@ use std::iter::FromIterator; use std::str; +use chrono::Utc; use fst::automaton::Str; use fst::{Automaton, Streamer, IntoStreamer}; use grenad::CompressionType; @@ -68,6 +69,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { } pub fn execute(self) -> anyhow::Result<()> { + self.index.set_updated_at(self.wtxn, &Utc::now())?; // Clear the words prefixes datastructures. self.index.word_prefix_docids.clear(self.wtxn)?; self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; From f0210453a60c96b33b24aef4c04730773c97b446 Mon Sep 17 00:00:00 2001 From: mpostma Date: Fri, 12 Mar 2021 14:43:17 +0100 Subject: [PATCH 0576/1889] add updated at on put primary key --- milli/src/index.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index cf31b54a8..a14747788 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -142,6 +142,7 @@ impl Index { /// Writes the documents primary key, this is the field name that is used to store the id. 
pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { + self.set_updated_at(wtxn, &Utc::now())?; self.main.put::<_, Str, Str>(wtxn, PRIMARY_KEY_KEY, &primary_key) } From 3455082458e694d3afe3017a04c50e0c60a63ccd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 15 Mar 2021 14:13:44 +0100 Subject: [PATCH 0577/1889] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 70d8ffeb7..1df4fd69c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 Clément Renault +Copyright (c) 2021 Meili SAS Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 9c271838760257564ab03db28227332562823499 Mon Sep 17 00:00:00 2001 From: mpostma Date: Mon, 15 Mar 2021 20:23:50 +0100 Subject: [PATCH 0578/1889] fix broken offset --- milli/src/search/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index ce5a6bc88..7560fbf0a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -133,7 +133,7 @@ impl<'a> Search<'a> { initial_candidates.union_with(&bucket_candidates); if offset != 0 { - candidates.by_ref().skip(offset).for_each(drop); + candidates.by_ref().take(offset).for_each(drop); offset = offset.saturating_sub(len.min(offset)); len = len.saturating_sub(len.min(offset)); } From 73dcdb27f64c993b43af62ccacf03ef4fbeb8e26 Mon Sep 17 00:00:00 2001 From: tamo Date: Thu, 25 Mar 2021 15:00:18 +0100 Subject: [PATCH 0579/1889] select a specific release of the tokenizer instead of using the latests git commit --- Cargo.lock | 289 ++++++++++++++++++++------------------------- http-ui/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 3 files changed, 133 insertions(+), 160 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0f21f2a83..a53930367 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,10 +1,12 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
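Returning to the `fix broken offset` patch above, the one-word change matters because `for_each(drop)` drains its entire input: on `candidates.by_ref()`, `skip(offset)` discards the first `offset` documents and then consumes everything after them too, leaving nothing for the results loop, whereas `take(offset)` consumes exactly `offset` items and stops. A tiny demonstration:

```rust
fn main() {
    // The fixed behaviour: consume exactly `offset` items, keep the rest.
    let mut candidates = 0..10;
    candidates.by_ref().take(3).for_each(drop);
    assert_eq!(candidates.next(), Some(3)); // seven documents remain

    // The old behaviour: skip 3, then `for_each` drains the other 7 too.
    let mut broken = 0..10;
    broken.by_ref().skip(3).for_each(drop);
    assert_eq!(broken.next(), None); // everything is gone
}
```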
+version = 3 + [[package]] name = "adler" -version = "0.2.3" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" @@ -23,9 +25,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.38" +version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1" +checksum = "81cddc5f91628367664cc7c69714ff08deee8a3efc54623011c772544d7b2767" [[package]] name = "arrayvec" @@ -54,7 +56,7 @@ checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522" dependencies = [ "askama_shared", "proc-macro2 1.0.24", - "syn 1.0.60", + "syn 1.0.64", ] [[package]] @@ -77,7 +79,7 @@ dependencies = [ "proc-macro2 1.0.24", "quote 1.0.9", "serde", - "syn 1.0.60", + "syn 1.0.64", "toml", ] @@ -138,9 +140,9 @@ checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" [[package]] name = "bitvec" -version = "0.19.4" +version = "0.19.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7ba35e9565969edb811639dbebfe34edc0368e472c5018474c8eb2543397f81" +checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" dependencies = [ "funty", "radium", @@ -157,7 +159,7 @@ dependencies = [ "block-padding", "byte-tools", "byteorder", - "generic-array 0.12.3", + "generic-array 0.12.4", ] [[package]] @@ -214,9 +216,9 @@ checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" [[package]] name = "byte-unit" -version = "4.0.9" +version = "4.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c8758c32833faaae35b24a73d332e62d0528e89076ae841c63940e37008b153" +checksum = "b9520900471c3a9bbcfe0fd4c7b6bcfeff41b20a76cf91c59b7474b09be1ee27" dependencies = [ "utf8-width", ] @@ -229,9 +231,9 @@ checksum = "bed57e2090563b83ba8f83366628ce535a7584c9afa4c9fc0612a03925c6df58" [[package]] name = "byteorder" -version = "1.4.2" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" @@ -383,7 +385,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.2", + "crossbeam-utils 0.8.3", ] [[package]] @@ -394,19 +396,18 @@ checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.2", + "crossbeam-utils 0.8.3", ] [[package]] name = "crossbeam-epoch" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d60ab4a8dba064f2fbb5aa270c28da5cf4bbd0e72dae1140a6b0353a779dbe00" +checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.2", + "crossbeam-utils 0.8.3", "lazy_static", - "loom", "memoffset", "scopeguard", ] @@ -432,21 +433,20 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bae8f328835f8f5a6ceb6a7842a7f2d0c03692adb5c889347235d59194731fe3" +checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49" dependencies = [ "autocfg", "cfg-if 1.0.0", "lazy_static", - "loom", ] [[package]] name = "csv" -version = "1.1.5" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d58633299b24b515ac72a3f869f8b91306a3cec616a602843a383acd6f9e97" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" dependencies = [ "bstr", "csv-core", @@ -466,9 +466,9 @@ dependencies = [ [[package]] name = "deunicode" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80115a2dfde04491e181c2440a39e4be26e52d9ca4e92bed213f65b94e0b8db1" +checksum = "c0b7756d6eb729250618a3693b34b3311b282e12aeeee7970ae2a70997c03eb6" [[package]] name = "digest" @@ -476,7 +476,7 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" dependencies = [ - "generic-array 0.12.3", + "generic-array 0.12.4", ] [[package]] @@ -570,9 +570,9 @@ checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" [[package]] name = "futures" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9052a1a50244d8d5aa9bf55cbc2fb6f357c86cc52e46c62ed390a7180cf150" +checksum = "7f55667319111d593ba876406af7c409c0ebb44dc4be6132a783ccf163ea14c1" dependencies = [ "futures-channel", "futures-core", @@ -585,9 +585,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2d31b7ec7efab6eefc7c57233bb10b847986139d88cc2f5a02a1ae6871a1846" +checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" dependencies = [ "futures-core", "futures-sink", @@ -595,15 +595,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79e5145dde8da7d1b3892dad07a9c98fc04bc39892b1ecc9692cf53e2b780a65" +checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" [[package]] name = "futures-executor" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9e59fdc009a4b3096bf94f740a0f2424c082521f20a9b08c5c07c48d90fd9b9" +checksum = "891a4b7b96d84d5940084b2a37632dd65deeae662c114ceaa2c879629c9c0ad1" dependencies = [ "futures-core", "futures-task", @@ -612,42 +612,39 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28be053525281ad8259d47e4de5de657b25e7bac113458555bb4b70bc6870500" +checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" [[package]] name = "futures-macro" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c287d25add322d9f9abdcdc5927ca398917996600182178774032e9f8258fedd" +checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" dependencies = [ "proc-macro-hack", "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", ] [[package]] name = "futures-sink" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "caf5c69029bda2e743fddd0582d1083951d65cc9539aebf8812f36c3491342d6" +checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" [[package]] name = "futures-task" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13de07eb8ea81ae445aca7b69f5f7bf15d7bf4912d8ca37d6645c77ae8a58d86" -dependencies = [ - "once_cell", -] +checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" [[package]] name = "futures-util" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632a8cd0f2a4b3fdea1657f08bde063848c3bd00f9bbf6e256b8be78802e624b" +checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" dependencies = [ "futures-channel", "futures-core", @@ -656,7 +653,7 @@ dependencies = [ "futures-sink", "futures-task", "memchr", - "pin-project-lite 0.2.4", + "pin-project-lite 0.2.6", "pin-utils", "proc-macro-hack", "proc-macro-nested", @@ -672,24 +669,11 @@ dependencies = [ "byteorder", ] -[[package]] -name = "generator" -version = "0.6.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cdc09201b2e8ca1b19290cf7e65de2246b8e91fb6874279722189c4de7b94dc" -dependencies = [ - "cc", - "libc", - "log", - "rustc_version", - "winapi 0.3.9", -] - [[package]] name = "generic-array" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec" +checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd" dependencies = [ "typenum", ] @@ -723,7 +707,7 @@ checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8" dependencies = [ "cfg-if 1.0.0", "libc", - "wasi 0.10.2+wasi-snapshot-preview1", + "wasi 0.10.0+wasi-snapshot-preview1", ] [[package]] @@ -790,9 +774,9 @@ checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" [[package]] name = "headers" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62689dc57c7456e69712607ffcbd0aa1dfcccf9af73727e9b25bc1825375cac3" +checksum = "f0b7591fb62902706ae8e7aaff416b1b0fa2c0fd0878b46dc13baa3712d8a855" dependencies = [ "base64 0.13.0", "bitflags", @@ -800,7 +784,7 @@ dependencies = [ "headers-core", "http", "mime", - "sha-1 0.8.2", + "sha-1 0.9.4", "time", ] @@ -995,9 +979,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.6.1" +version = "1.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b" +checksum = "824845a0bf897a9042383849b02c1bc219c2383772efcd5c6f9766fa4b81aef3" dependencies = [ "autocfg", "hashbrown 0.9.1", @@ -1108,9 +1092,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.47" +version = "0.3.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cfb73131c35423a367daf8cbd24100af0d077668c8c2943f0e7dd775fef0f65" +checksum = "dc15e39392125075f60c95ba416f5381ff6c3a948ff02ab12464715adf56c821" dependencies = [ "wasm-bindgen", ] @@ -1155,9 +1139,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.86" +version = "0.2.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7282d924be3275cec7f6756ff4121987bc6481325397dde6ba3e7802b1a8b1c" +checksum = "8916b1f6ca17130ec6568feccee27c156ad12037880833a3b842a823236502e7" [[package]] 
name = "linked-hash-map" @@ -1207,17 +1191,6 @@ dependencies = [ "syn 0.15.44", ] -[[package]] -name = "loom" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d44c73b4636e497b4917eb21c33539efa3816741a2d3ff26c6316f1b529481a4" -dependencies = [ - "cfg-if 1.0.0", - "generator", - "scoped-tls", -] - [[package]] name = "maplit" version = "1.0.2" @@ -1233,7 +1206,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "meilisearch-tokenizer" version = "0.1.1" -source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#31ba3ff4a15501f12b7d37ac64ddce7c35a9757c" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.4#31ba3ff4a15501f12b7d37ac64ddce7c35a9757c" dependencies = [ "character_converter", "cow-utils", @@ -1335,9 +1308,9 @@ dependencies = [ [[package]] name = "miniz_oxide" -version = "0.4.3" +version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2d26ec3309788e423cfbf68ad1800f061638098d76a83681af979dc4eda19d" +checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" dependencies = [ "adler", "autocfg", @@ -1370,7 +1343,7 @@ checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" dependencies = [ "log", "mio", - "miow 0.3.6", + "miow 0.3.7", "winapi 0.3.9", ] @@ -1399,11 +1372,10 @@ dependencies = [ [[package]] name = "miow" -version = "0.3.6" +version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a33c1b55807fbed163481b5ba66db4b2fa6cde694a5027be10fb724206c5897" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" dependencies = [ - "socket2", "winapi 0.3.9", ] @@ -1450,11 +1422,12 @@ dependencies = [ [[package]] name = "nom" -version = "6.1.1" +version = "6.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d521ee2250f619dd5e06515ba405858d249edc8fae9ddee2dba0695e57db01b" +checksum = "e7413f999671bd4745a7b624bd370a569fb6bc574b23c83a3c5ed2e453f3d5e2" dependencies = [ "bitvec", + "funty", "lexical-core", "memchr", "version_check", @@ -1497,9 +1470,9 @@ checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8" [[package]] name = "once_cell" -version = "1.5.2" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0" +checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" [[package]] name = "oorandom" @@ -1547,8 +1520,7 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" dependencies = [ "ucd-trie", ] @@ -1556,7 +1528,8 @@ dependencies = [ [[package]] name = "pest" version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" dependencies = [ "ucd-trie", ] @@ -1581,7 +1554,7 @@ dependencies = [ "pest_meta", "proc-macro2 1.0.24", "quote 1.0.9", - "syn 
1.0.60", + "syn 1.0.64", ] [[package]] @@ -1659,7 +1632,7 @@ checksum = "65ad2ae56b6abe3a1ee25f15ee605bacadb9a764edaba9c2bf4103800d4a1895" dependencies = [ "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", ] [[package]] @@ -1670,20 +1643,20 @@ checksum = "758669ae3558c6f74bd2a18b41f7ac0b5a195aea6639d6a9b5e5d1ad5ba24c0b" dependencies = [ "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", ] [[package]] name = "pin-project-lite" -version = "0.1.11" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c917123afa01924fc84bb20c4c03f004d9c38e5127e3c039bbf7f4b9c76a2f6b" +checksum = "257b64915a082f7811703966789728173279bdebb956b143dbcd23f6f970a777" [[package]] name = "pin-project-lite" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439697af366c49a6d0a010c56a0d97685bc140ce0d377b13a2ea2aa42d64a827" +checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905" [[package]] name = "pin-utils" @@ -1740,7 +1713,7 @@ dependencies = [ "proc-macro-error-attr", "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", "version_check", ] @@ -1926,7 +1899,7 @@ checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.2", + "crossbeam-utils 0.8.3", "lazy_static", "num_cpus", ] @@ -1942,14 +1915,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.4.3" +version = "1.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a" +checksum = "957056ecddbeba1b26965114e191d2e8589ce74db242b6ea25fc4062427a5c19" dependencies = [ "aho-corasick", "memchr", "regex-syntax", - "thread_local", ] [[package]] @@ -1963,9 +1935,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.22" +version = "0.6.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" +checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" [[package]] name = "remove_dir_all" @@ -2067,9 +2039,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.123" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" +checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171" dependencies = [ "serde_derive", ] @@ -2086,20 +2058,20 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.123" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" +checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d" dependencies = [ "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", ] [[package]] name = "serde_json" -version = "1.0.62" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea1c6153794552ea7cf7cf63b1231a25de00ec90db326ba6264440fa08e31486" +checksum = "799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" dependencies = [ "indexmap", "itoa", @@ -2155,9 +2127,9 @@ dependencies = [ [[package]] name = "siphasher" -version = "0.3.3" +version = "0.3.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" +checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27" [[package]] name = "slab" @@ -2244,7 +2216,7 @@ dependencies = [ "proc-macro-error", "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", ] [[package]] @@ -2260,9 +2232,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.60" +version = "1.0.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081" +checksum = "3fd9d1e9976102a03c542daa2eff1b43f9d72306342f3f8b3ed5fb8908195d6f" dependencies = [ "proc-macro2 1.0.24", "quote 1.0.9", @@ -2286,7 +2258,7 @@ checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" dependencies = [ "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", "unicode-xid 0.2.1", ] @@ -2350,11 +2322,12 @@ dependencies = [ [[package]] name = "time" -version = "0.1.43" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca8a50ef2360fbd1eeb0ecd46795a87a19024eb4b53c5dc916ca1fd95fe62438" +checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" dependencies = [ "libc", + "wasi 0.10.0+wasi-snapshot-preview1", "winapi 0.3.9", ] @@ -2400,7 +2373,7 @@ dependencies = [ "mio-named-pipes", "mio-uds", "num_cpus", - "pin-project-lite 0.1.11", + "pin-project-lite 0.1.12", "signal-hook-registry", "slab", "tokio-macros", @@ -2415,7 +2388,7 @@ checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" dependencies = [ "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", ] [[package]] @@ -2441,7 +2414,7 @@ dependencies = [ "futures-core", "futures-sink", "log", - "pin-project-lite 0.1.11", + "pin-project-lite 0.1.12", "tokio", ] @@ -2462,13 +2435,13 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.24" +version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f77d3842f76ca899ff2dbcf231c5c65813dea431301d6eb686279c15c4464f12" +checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" dependencies = [ "cfg-if 1.0.0", "log", - "pin-project-lite 0.2.4", + "pin-project-lite 0.2.6", "tracing-core", ] @@ -2527,9 +2500,9 @@ dependencies = [ [[package]] name = "typenum" -version = "1.12.0" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" +checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" [[package]] name = "ucd-trie" @@ -2629,15 +2602,15 @@ dependencies = [ [[package]] name = "version_check" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" [[package]] name = "walkdir" -version = "2.3.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" dependencies = [ "same-file", "winapi 0.3.9", @@ -2690,15 +2663,15 @@ checksum = 
"cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" [[package]] name = "wasi" -version = "0.10.2+wasi-snapshot-preview1" +version = "0.10.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" [[package]] name = "wasm-bindgen" -version = "0.2.70" +version = "0.2.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55c0f7123de74f0dab9b7d00fd614e7b19349cd1e2f5252bbe9b1754b59433be" +checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe" dependencies = [ "cfg-if 1.0.0", "wasm-bindgen-macro", @@ -2706,24 +2679,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.70" +version = "0.2.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bc45447f0d4573f3d65720f636bbcc3dd6ce920ed704670118650bcd47764c7" +checksum = "046ceba58ff062da072c7cb4ba5b22a37f00a302483f7e2a6cdc18fedbdc1fd3" dependencies = [ "bumpalo", "lazy_static", "log", "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.70" +version = "0.2.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b8853882eef39593ad4174dd26fc9865a64e84026d223f63bb2c42affcbba2c" +checksum = "0ef9aa01d36cda046f797c57959ff5f3c615c9cc63997a8d545831ec7976819b" dependencies = [ "quote 1.0.9", "wasm-bindgen-macro-support", @@ -2731,28 +2704,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.70" +version = "0.2.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4133b5e7f2a531fa413b3a1695e925038a05a71cf67e87dafa295cb645a01385" +checksum = "96eb45c1b2ee33545a813a92dbb53856418bf7eb54ab34f7f7ff1448a5b3735d" dependencies = [ "proc-macro2 1.0.24", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.64", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.70" +version = "0.2.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd4945e4943ae02d15c13962b38a5b1e81eadd4b71214eee75af64a4d6a4fd64" +checksum = "b7148f4696fb4960a346eaa60bbfb42a1ac4ebba21f750f75fc1375b098d5ffa" [[package]] name = "web-sys" -version = "0.3.47" +version = "0.3.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c40dc691fc48003eba817c38da7113c15698142da971298003cac3ef175680b3" +checksum = "59fe19d70f5dacc03f6e46777213facae5ac3801575d56ca6cbd4c93dcd12310" dependencies = [ "js-sys", "wasm-bindgen", @@ -2843,7 +2816,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" dependencies = [ "proc-macro2 1.0.24", - "syn 1.0.60", + "syn 1.0.64", "synstructure", ] diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 9d8f79c08..75e5daebf 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = "0.10.6" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.4" } memmap = "0.7.0" 
milli = { path = "../milli" } once_cell = "1.5.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b63e34b32..d18628149 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -20,7 +20,7 @@ heed = { version = "0.10.6", default-features = false, features = ["lmdb", "sync human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.4" } memmap = "0.7.0" num-traits = "0.2.14" obkv = "0.1.1" From 522e79f2e0dec684cca7db9958f1851d43387db3 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Mon, 29 Mar 2021 19:07:22 +0300 Subject: [PATCH 0580/1889] feat(search, criteria): introduce a percentage threshold to the asc/desc --- milli/src/search/criteria/asc_desc.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6b8afad2c..d4b85b2bf 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -19,6 +19,10 @@ use crate::search::WordDerivationsCache; use crate::{FieldsIdsMap, FieldId, Index}; use super::{Criterion, CriterionResult}; +/// If the number of candidates is lower or equal to the specified % of total number of documents, +/// use simple sort. Otherwise, use facet database. +const CANDIDATES_THRESHOLD: f64 = 0.1; + pub struct AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn<'t>, @@ -237,7 +241,7 @@ fn field_id_facet_type( /// Returns an iterator over groups of the given candidates in ascending or descending order. /// -/// It will either use an iterative or a recusrsive method on the whole facet database depending +/// It will either use an iterative or a recursive method on the whole facet database depending /// on the number of candidates to rank. fn facet_ordered<'t>( index: &'t Index, @@ -248,9 +252,11 @@ fn facet_ordered<'t>( candidates: RoaringBitmap, ) -> anyhow::Result> + 't>> { + let number_of_documents = index.number_of_documents(&rtxn)? 
as f64; + match facet_type { FacetType::Float => { - if candidates.len() <= 1000 { + if candidates.len() / number_of_documents * 100 <= CANDIDATES_THRESHOLD { let iter = iterative_facet_ordered_iter::>( index, rtxn, field_id, ascending, candidates, )?; @@ -266,7 +272,7 @@ fn facet_ordered<'t>( } }, FacetType::Integer => { - if candidates.len() <= 1000 { + if candidates.len() / number_of_documents * 100 <= CANDIDATES_THRESHOLD { let iter = iterative_facet_ordered_iter::( index, rtxn, field_id, ascending, candidates, )?; From a776ec97185b449a718236bd231d2cf8ef7650b3 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Mon, 29 Mar 2021 19:16:35 +0300 Subject: [PATCH 0581/1889] fix division --- milli/src/search/criteria/asc_desc.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index d4b85b2bf..df9f164e2 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -256,7 +256,7 @@ fn facet_ordered<'t>( match facet_type { FacetType::Float => { - if candidates.len() / number_of_documents * 100 <= CANDIDATES_THRESHOLD { + if candidates.len() as f64 / number_of_documents * 100.0 <= CANDIDATES_THRESHOLD { let iter = iterative_facet_ordered_iter::>( index, rtxn, field_id, ascending, candidates, )?; @@ -272,7 +272,7 @@ fn facet_ordered<'t>( } }, FacetType::Integer => { - if candidates.len() / number_of_documents * 100 <= CANDIDATES_THRESHOLD { + if candidates.len() as f64 / number_of_documents * 100.0 <= CANDIDATES_THRESHOLD { let iter = iterative_facet_ordered_iter::( index, rtxn, field_id, ascending, candidates, )?; From 1e3f05db8f0cdd82f092c709c2e70522cc5f4902 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 30 Mar 2021 11:57:10 +0300 Subject: [PATCH 0582/1889] use fixed number of candidates as a threshold --- milli/src/search/criteria/asc_desc.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index df9f164e2..78ae540e4 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -19,9 +19,9 @@ use crate::search::WordDerivationsCache; use crate::{FieldsIdsMap, FieldId, Index}; use super::{Criterion, CriterionResult}; -/// If the number of candidates is lower or equal to the specified % of total number of documents, -/// use simple sort. Otherwise, use facet database. -const CANDIDATES_THRESHOLD: f64 = 0.1; +/// Threshold on the number of candidates that will make +/// the system to choose between one algorithm or another. +const CANDIDATES_THRESHOLD: u64 = 1000; pub struct AscDesc<'t> { index: &'t Index, @@ -252,11 +252,9 @@ fn facet_ordered<'t>( candidates: RoaringBitmap, ) -> anyhow::Result> + 't>> { - let number_of_documents = index.number_of_documents(&rtxn)? 
as f64; - match facet_type { FacetType::Float => { - if candidates.len() as f64 / number_of_documents * 100.0 <= CANDIDATES_THRESHOLD { + if candidates.len() <= CANDIDATES_THRESHOLD { let iter = iterative_facet_ordered_iter::>( index, rtxn, field_id, ascending, candidates, )?; @@ -272,7 +270,7 @@ fn facet_ordered<'t>( } }, FacetType::Integer => { - if candidates.len() as f64 / number_of_documents * 100.0 <= CANDIDATES_THRESHOLD { + if candidates.len() <= CANDIDATES_THRESHOLD { let iter = iterative_facet_ordered_iter::( index, rtxn, field_id, ascending, candidates, )?; From 2cb32edaa974e413c9a9beef65468bbfd218ed8f Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 30 Mar 2021 12:10:06 +0300 Subject: [PATCH 0583/1889] fix(criterion): compile asc/desc regex only once use once_cell instead of lazy_static reorder imports --- milli/src/criterion.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 5d8ba09ba..40f9a3e0b 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -4,9 +4,14 @@ use std::fmt; use anyhow::{Context, bail}; use regex::Regex; use serde::{Serialize, Deserialize}; +use once_cell::sync::Lazy; use crate::facet::FacetType; +static ASC_DESC_REGEX: Lazy = Lazy::new(|| { + Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() +}); + #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { /// Sorted by increasing number of typos. @@ -39,8 +44,7 @@ impl Criterion { "wordsposition" => Ok(Criterion::WordsPosition), "exactness" => Ok(Criterion::Exactness), text => { - let re = Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#)?; - let caps = re.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; + let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; let order = caps.get(1).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str(); faceted_attributes.get(field_name).with_context(|| format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name))?; From bcc131e8660d7bc84dea91ce29444b1d4340038e Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 31 Mar 2021 16:17:34 +0200 Subject: [PATCH 0584/1889] add a button to display or hide the facets --- http-ui/public/script.js | 17 +++++++++++++++++ http-ui/public/style.css | 10 +++++++++- http-ui/templates/index.html | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/http-ui/public/script.js b/http-ui/public/script.js index 4a16e8fc3..9d8c95b61 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -114,6 +114,23 @@ function selectedFacetsToArray(facets_obj) { return array; } +display_facets = false; +$('#display_facets').click(function() { + if (display_facets) { + display_facets = false; + $('#display_facets').html("Display facets") + $('#display_facets').removeClass("is-danger"); + $('#display_facets').addClass("is-success"); + $('#facets').hide(); + } else { + display_facets = true; + $('#display_facets').html("Hide facets") + $('#display_facets').addClass("is-danger"); + $('#display_facets').removeClass("is-success"); + $('#facets').show(); + } +}); + // Make the number of document a little bit prettier $('#docs-count').text(function(index, text) { return parseInt(text).toLocaleString() diff --git a/http-ui/public/style.css b/http-ui/public/style.css index 1de348082..ef032e51e 100644 --- a/http-ui/public/style.css +++ b/http-ui/public/style.css @@ -10,11 +10,19 @@ } #facets { + display: none; max-width: 
900px; margin: 20px auto 0 auto; padding: 0; max-height: 16em; - overflow: scroll; + overflow: scroll; +} + +#display_facets { + margin: 20px auto 0 auto; + padding: 5px; + max-height: 16em; + overflow: scroll; } #facets .tag:hover { diff --git a/http-ui/templates/index.html b/http-ui/templates/index.html index 83b1a3e49..49fb0eb2b 100644 --- a/http-ui/templates/index.html +++ b/http-ui/templates/index.html @@ -41,6 +41,7 @@

Number of Documents

{{ docs_count }}

+ <a id="display_facets" class="button is-success">Display facets</a> From 13ce0ebb873c5193cef38de61220d2cacd46c7ef Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 31 Mar 2021 16:27:32 +0200 Subject: [PATCH 0585/1889] stop requesting the facets if the user has hidden them --- http-ui/public/script.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/http-ui/public/script.js b/http-ui/public/script.js index 9d8c95b61..b621cd453 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -1,5 +1,6 @@ var request = null; var timeoutID = null; +var display_facets = false; $('#query, #filters').on('input', function () { var query = $('#query').val(); @@ -18,7 +19,7 @@ $('#query, #filters').on('input', function () { data: JSON.stringify({ 'query': query, 'filters': filters, - "facetDistribution": true, + "facetDistribution": display_facets, }), contentType: 'application/json', success: function (data, textStatus, request) { @@ -114,7 +115,6 @@ function selectedFacetsToArray(facets_obj) { return array; } -display_facets = false; $('#display_facets').click(function() { if (display_facets) { display_facets = false; From 9205b640a484a41cfe79ff6b7410dd08773f3090 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Wed, 31 Mar 2021 18:14:23 +0300 Subject: [PATCH 0586/1889] feat(index): introduce fields_ids_distribution --- milli/src/index.rs | 60 +++++++++++++++++++++++++ milli/src/update/index_documents/mod.rs | 2 +- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index a14747788..2e0d329ef 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -203,6 +203,25 @@ impl Index { Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default()) } + /* fields ids distribution */ + + /// Returns the fields ids distribution which associate the internal field ids + /// with the number of times it occurs in the obkv documents. + // TODO store in the index itself and change only within updates that modify the documents + pub fn fields_ids_distribution(&self, rtxn: &RoTxn) -> anyhow::Result> { + let mut distribution = HashMap::new(); + + for document in self.documents.iter(rtxn)? { + let (_, obkv) = document?; + + for (field_id, _) in obkv.iter() { + *distribution.entry(field_id).or_default() += 1; + } + } + + Ok(distribution) + } + /* displayed fields */ /// Writes the fields that must be displayed in the defined order.
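The new `fields_ids_distribution` getter above is a full scan: it decodes every stored obkv document and bumps one counter per field id it meets. Stripped of LMDB, the counting step is the standard `entry`/`or_default` idiom; here is a minimal sketch over hypothetical documents reduced to plain lists of field ids:

```rust
use std::collections::HashMap;

fn main() {
    // Hypothetical data: three documents, each reduced to its field ids.
    let docs: Vec<Vec<u8>> = vec![vec![0, 1], vec![0], vec![0, 2]];

    let mut distribution: HashMap<u8, u64> = HashMap::new();
    for doc in &docs {
        for &field_id in doc {
            // `or_default` inserts a 0 the first time a field id is seen.
            *distribution.entry(field_id).or_default() += 1;
        }
    }

    assert_eq!(distribution[&0], 3); // field 0 occurs in all three documents
    assert_eq!(distribution[&1], 1);
    assert_eq!(distribution[&2], 1);
}
```

The TODO in the patch already hints at the follow-up that lands a few commits later in this series: recomputing this on every call costs a pass over all documents, so the distribution ends up stored in the index and maintained incrementally by updates.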
@@ -429,3 +448,44 @@ impl Index { self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) } } + +#[cfg(test)] +mod tests { + use heed::EnvOpenOptions; + + use crate::Index; + use crate::update::{IndexDocuments, UpdateFormat}; + + fn prepare_index() -> Index { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = &br#" + { "name": "kevin" } + { "name": "bob", "age": 20 } + "#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::JsonStream); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + index + } + + #[test] + fn fields_ids_distribution() { + let index = prepare_index(); + + let rtxn = index.read_txn().unwrap(); + + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + + let fields_ids_distribution = index.fields_ids_distribution(&rtxn).unwrap(); + assert_eq!(fields_ids_distribution.len(), 2); + assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("age").unwrap()), Some(&1)); + assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("name").unwrap()), Some(&2)); + } +} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ccbd95c7f..a19d8c0a7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -925,7 +925,7 @@ mod tests { // one sent and that an UUID has been generated. assert_eq!(doc.get(0), Some(&br#""updated kevin""#[..])); // This is an UUID, it must be 36 bytes long plus the 2 surrounding string quotes ("). - assert!(doc.get(1).unwrap().len() == 36 + 2); + assert_eq!(doc.get(1).unwrap().len(), 36 + 2); drop(rtxn); } From 62a8f1d70798f5eb289b192f1aafddd0584365f7 Mon Sep 17 00:00:00 2001 From: tamo Date: Mon, 29 Mar 2021 19:16:36 +0200 Subject: [PATCH 0587/1889] bump the version of the tokenizer --- Cargo.lock | 2 +- http-ui/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a53930367..8f296f2fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1206,7 +1206,7 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "meilisearch-tokenizer" version = "0.1.1" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.1.4#31ba3ff4a15501f12b7d37ac64ddce7c35a9757c" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.0#833c48b2ee39071f8b4f51abd15122afdb3c8c06" dependencies = [ "character_converter", "cow-utils", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 75e5daebf..02a799091 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = "0.10.6" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.4" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.0" } memmap = "0.7.0" milli = { path = "../milli" } once_cell = "1.5.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d18628149..b1a54d22d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -20,7 +20,7 @@ heed = { version = "0.10.6", default-features = false, features = ["lmdb", "sync human_format = "1.0.3" 
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.1.4" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.0" } memmap = "0.7.0" num-traits = "0.2.14" obkv = "0.1.1" From a2f46029c7b5637d1c5249f6e021ee74c5e19123 Mon Sep 17 00:00:00 2001 From: tamo Date: Mon, 29 Mar 2021 19:15:47 +0200 Subject: [PATCH 0588/1889] implement a first version of the stop_words The front must provide a BTreeSet containing the stop words The stop_words are set at None if an empty Set is provided add the stop-words in the http-ui interface Use maplit in the test and remove all the useless drop(rtxn) at the end of all tests --- http-ui/src/main.rs | 22 ++- milli/src/index.rs | 17 +++ milli/src/search/mod.rs | 5 +- milli/src/search/query_tree.rs | 37 ++--- milli/src/update/index_documents/mod.rs | 5 +- milli/src/update/index_documents/store.rs | 8 +- milli/src/update/settings.rs | 165 +++++++++++++++++++--- 7 files changed, 203 insertions(+), 56 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 86f965368..f068b5b9a 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{File, create_dir_all}; use std::net::SocketAddr; @@ -128,7 +128,10 @@ struct Highlighter<'a, A> { impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { fn new(stop_words: &'a fst::Set
) -> Self { - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let mut config = AnalyzerConfig::default(); + config.stop_words(stop_words); + let analyzer = Analyzer::new(config); + Self { analyzer } } @@ -266,6 +269,13 @@ struct Settings { skip_serializing_if = "Option::is_none", )] criteria: Option>>, + + #[serde( + default, + deserialize_with = "deserialize_some", + skip_serializing_if = "Option::is_none", + )] + stop_words: Option>>, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -439,6 +449,14 @@ async fn main() -> anyhow::Result<()> { } } + // We transpose the settings JSON struct into a real setting update. + if let Some(stop_words) = settings.stop_words { + match stop_words { + Some(stop_words) => builder.set_stop_words(stop_words), + None => builder.reset_stop_words(), + } + } + let result = builder.execute(|indexing_step, update_id| { let (current, total) = match indexing_step { TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), diff --git a/milli/src/index.rs b/milli/src/index.rs index 2e0d329ef..642ad4ab7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -28,6 +28,7 @@ pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; +pub const STOP_WORDS_KEY: &str = "stop-words"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; const UPDATED_AT_KEY: &str = "updated-at"; @@ -377,6 +378,22 @@ impl Index { } } + /* stop words */ + + pub fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>(wtxn, STOP_WORDS_KEY, fst.as_fst().as_bytes()) + } + + pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY) + } + pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? { + Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), + None => Ok(None), + } + } + /* words prefixes fst */ /// Writes the FST which is the words prefixes dictionnary of the engine. diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7560fbf0a..c88800f38 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -4,7 +4,7 @@ use std::fmt; use std::str::Utf8Error; use std::time::Instant; -use fst::{IntoStreamer, Streamer, Set}; +use fst::{IntoStreamer, Streamer}; use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder}; use log::debug; use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; @@ -91,8 +91,7 @@ impl<'a> Search<'a> { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); builder.authorize_typos(self.authorize_typos); - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::>::new(AnalyzerConfig::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); builder.build(tokens)? 
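The `put_stop_words`/`stop_words` pair above persists the set as the raw bytes of its FST and rebuilds the set from those bytes on read, which is also what lets the settings update further down compare an old and a new set byte for byte. A minimal round trip of that storage scheme, using only the `fst` crate (the words are made up for the example):

```rust
use std::collections::BTreeSet;
use std::error::Error;

fn main() -> Result<(), Box<dyn Error>> {
    let words: BTreeSet<String> =
        ["are", "i", "the"].iter().map(|s| s.to_string()).collect();

    // A BTreeSet iterates in sorted order, which is what from_iter requires.
    let set = fst::Set::from_iter(&words)?;

    // What put_stop_words writes under the "stop-words" key in LMDB.
    let bytes: Vec<u8> = set.as_fst().as_bytes().to_vec();

    // What the stop_words getter does with the bytes it reads back.
    let reloaded = fst::Set::new(bytes)?;
    assert!(reloaded.contains("the"));
    assert!(!reloaded.contains("doggo"));
    Ok(())
}
```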
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 114032eb8..f7367d826 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -543,7 +543,6 @@ pub fn maximum_proximity(operation: &Operation) -> usize { mod test { use std::collections::HashMap; - use fst::Set; use maplit::{hashmap, hashset}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rand::{Rng, SeedableRng, rngs::StdRng}; @@ -646,8 +645,7 @@ mod test { #[test] fn prefix() { let query = "hey friends"; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -667,8 +665,7 @@ mod test { #[test] fn no_prefix() { let query = "hey friends "; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -688,8 +685,7 @@ mod test { #[test] fn synonyms() { let query = "hello world "; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -720,8 +716,7 @@ mod test { #[test] fn complex_synonyms() { let query = "new york city "; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -766,8 +761,7 @@ mod test { #[test] fn ngrams() { let query = "n grams "; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -787,8 +781,7 @@ mod test { #[test] fn word_split() { let query = "wordsplit fish "; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -814,8 +807,7 @@ mod test { #[test] fn phrase() { let query = "\"hey friends\" \" \" \"wooop"; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -835,8 +827,7 @@ mod test { #[test] fn optional_word() { let query = "hey my friend "; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -875,8 +866,7 @@ mod test { #[test] fn optional_word_phrase() { let query = "\"hey my\""; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -892,8 +882,7 @@ mod test { #[test] fn 
optional_word_multiple_phrases() { let query = r#""hey" my good "friend""#; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -927,8 +916,7 @@ mod test { #[test] fn no_typo() { let query = "hey friends "; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); @@ -947,8 +935,7 @@ mod test { #[test] fn fetching_words() { let query = "wordsplit nyc world"; - let stop_words = &Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let result = analyzer.analyze(query); let tokens = result.tokens(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a19d8c0a7..f4a7c7f25 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -410,6 +410,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { None => fields_ids_map.iter().map(|(id, _name)| id).collect(), }; + let stop_words = self.index.stop_words(self.wtxn)?; + let stop_words = stop_words.as_ref(); let linked_hash_map_size = self.linked_hash_map_size; let max_nb_chunks = self.max_nb_chunks; let max_memory = self.max_memory; @@ -436,7 +438,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let readers = rayon::iter::repeatn(documents, num_threads) .enumerate() .map(|(i, documents)| { - let stop_words = fst::Set::default(); let store = Store::new( searchable_fields.clone(), faceted_fields.clone(), @@ -446,7 +447,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, - &stop_words, + stop_words, )?; store.index( documents, diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 05767080a..03d91af24 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -86,7 +86,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { chunk_compression_type: CompressionType, chunk_compression_level: Option, chunk_fusing_shrink_size: Option, - stop_words: &'s Set, + stop_words: Option<&'s Set>, ) -> anyhow::Result { // We divide the max memory by the number of sorter the Store have. @@ -141,7 +141,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { create_writer(chunk_compression_type, chunk_compression_level, f) })?; - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + let mut config = AnalyzerConfig::default(); + if let Some(stop_words) = stop_words { + config.stop_words(stop_words); + } + let analyzer = Analyzer::new(config); Ok(Store { // Indexing parameters. 
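Both call sites this patch touches, `Highlighter::new` in http-ui and `Store::new` here, now build the analyzer the same way: start from `AnalyzerConfig::default()` and only register stop words when a set is actually present. Below is a hypothetical helper factoring that pattern out; it is written against the tokenizer API exactly as these hunks invoke it and has not been checked against the v0.2.0 crate itself:

```rust
use fst::Set;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};

// Assumed signatures: `AnalyzerConfig::default`, `AnalyzerConfig::stop_words`
// and `Analyzer::new` are used here just as Store::new uses them above.
fn analyzer_with_stop_words<'a, A: AsRef<[u8]>>(
    stop_words: Option<&'a Set<A>>,
) -> Analyzer<'a, A> {
    let mut config = AnalyzerConfig::default();
    if let Some(stop_words) = stop_words {
        config.stop_words(stop_words);
    }
    // Without stop words the default configuration is used unchanged.
    Analyzer::new(config)
}
```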
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 7ce8b98c1..451447102 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::str::FromStr; use anyhow::Context; @@ -32,6 +32,7 @@ pub struct Settings<'a, 't, 'u, 'i> { displayed_fields: Option>>, faceted_fields: Option>>, criteria: Option>>, + stop_words: Option>>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -55,6 +56,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { displayed_fields: None, faceted_fields: None, criteria: None, + stop_words: None, update_id, } } @@ -91,6 +93,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.criteria = Some(Some(criteria)); } + pub fn reset_stop_words(&mut self) { + self.stop_words = Some(None); + } + + pub fn set_stop_words(&mut self, stop_words: BTreeSet) { + self.stop_words = if stop_words.is_empty() { + Some(None) + } else { + Some(Some(stop_words)) + } + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync @@ -210,6 +224,28 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } + fn update_stop_words(&mut self) -> anyhow::Result { + match self.stop_words { + Some(Some(ref stop_words)) => { + let current = self.index.stop_words(self.wtxn)?; + // since we can't compare a BTreeSet with an FST we are going to convert the + // BTreeSet to an FST and then compare bytes per bytes the two FSTs. + let fst = fst::Set::from_iter(&*stop_words)?; + + // Does the new FST differ from the previous one? + if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) { + // we want to re-create our FST. + self.index.put_stop_words(self.wtxn, &fst)?; + Ok(true) + } else { + Ok(false) + } + } + Some(None) => Ok(self.index.delete_stop_words(self.wtxn)?), + None => Ok(false), + } + } + fn update_facets(&mut self) -> anyhow::Result { match self.faceted_fields { Some(Some(ref fields)) => { @@ -248,22 +284,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { pub fn execute(mut self, progress_callback: F) -> anyhow::Result<()> where - F: Fn(UpdateIndexingStep, u64) + Sync - { - self.index.set_updated_at(self.wtxn, &Utc::now())?; - let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; - self.update_displayed()?; - let facets_updated = self.update_facets()?; - // update_criteria MUST be called after update_facets, since criterion fields must be set - // as facets. - self.update_criteria()?; - let searchable_updated = self.update_searchable()?; + F: Fn(UpdateIndexingStep, u64) + Sync + { + self.index.set_updated_at(self.wtxn, &Utc::now())?; + let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; + self.update_displayed()?; + let stop_words_updated = self.update_stop_words()?; + let facets_updated = self.update_facets()?; + // update_criteria MUST be called after update_facets, since criterion fields must be set + // as facets. 
+ self.update_criteria()?; + let searchable_updated = self.update_searchable()?; - if facets_updated || searchable_updated { - self.reindex(&progress_callback, old_fields_ids_map)?; - } - Ok(()) + if facets_updated || searchable_updated || stop_words_updated { + self.reindex(&progress_callback, old_fields_ids_map)?; } + Ok(()) + } } #[cfg(test)] @@ -271,7 +308,7 @@ mod tests { use super::*; use heed::EnvOpenOptions; - use maplit::hashmap; + use maplit::{hashmap, btreeset}; use crate::facet::FacetType; use crate::update::{IndexDocuments, UpdateFormat}; @@ -328,7 +365,6 @@ mod tests { assert_eq!(result.documents_ids.len(), 1); let documents = index.documents(&rtxn, result.documents_ids).unwrap(); assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); - drop(rtxn); } #[test] @@ -372,7 +408,6 @@ mod tests { let rtxn = index.read_txn().unwrap(); let fields_ids = index.displayed_fields(&rtxn).unwrap(); assert_eq!(fields_ids.unwrap(), &["age"][..]); - drop(rtxn); } #[test] @@ -394,7 +429,6 @@ mod tests { let rtxn = index.read_txn().unwrap(); let fields_ids = index.displayed_fields(&rtxn).unwrap(); assert_eq!(fields_ids, None); - drop(rtxn); } #[test] @@ -434,7 +468,6 @@ mod tests { let rtxn = index.read_txn().unwrap(); let fields_ids = index.displayed_fields(&rtxn).unwrap(); assert_eq!(fields_ids, None); - drop(rtxn); } #[test] @@ -478,7 +511,96 @@ mod tests { // Only count the field_id 0 and level 0 facet values. let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 4); - drop(rtxn); + } + + #[test] + fn default_stop_words() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents with ids from 1 to 3. + let mut wtxn = index.write_txn().unwrap(); + let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Csv); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Ensure there is no stop_words by default + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_none()); + } + + #[test] + fn set_and_reset_stop_words() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents with ids from 1 to 3. 
+ let mut wtxn = index.write_txn().unwrap(); + let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Csv); + builder.execute(content, |_, _| ()).unwrap(); + + // In the same transaction we provide some stop_words + let mut builder = Settings::new(&mut wtxn, &index, 0); + let set = btreeset!{ "i".to_string(), "the".to_string(), "are".to_string() }; + builder.set_stop_words(set.clone()); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Ensure stop_words are effectively stored + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_some()); // at this point the index should return something + + let stop_words = stop_words.unwrap(); + let expected = fst::Set::from_iter(&set).unwrap(); + assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes()); + + // when we search for something that is a non prefix stop_words it should be ignored + let result = index.search(&rtxn).query("the ").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + let result = index.search(&rtxn).query("i ").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + let result = index.search(&rtxn).query("are ").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + + let result = index.search(&rtxn).query("dog").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos + let result = index.search(&rtxn).query("benoît").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data + + // now we'll reset the stop_words and ensure it's None + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.reset_stop_words(); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_none()); + + // now we can search for the stop words + let result = index.search(&rtxn).query("the").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + let result = index.search(&rtxn).query("i").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("are").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + + // the rest of the search is still not impacted + let result = index.search(&rtxn).query("dog").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos + let result = index.search(&rtxn).query("benoît").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data } #[test] @@ -519,6 +641,5 @@ mod tests { assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap()); assert!(index.primary_key(&rtxn).unwrap().is_none()); assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap()); - drop(rtxn); } } From 12fb509d8470e6d0c3a424756c9838a1efe306d2 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 31 Mar 2021 14:41:22 +0200 Subject: [PATCH 0589/1889] Integrate the stop_words in the querytree remove the stop_words from the querytree except if it was a prefix or a typo --- milli/src/search/query_tree.rs | 60 ++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 18 deletions(-) 
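The query-tree diff below enforces a compact rule: a stop word is dropped from the tree unless it is a prefix (the user may still be typing it) or its kind is typo-tolerant (the stop word could itself be a typo of a meaningful word, "the" standing in for "they"). Here is that rule as a standalone predicate, with simplified stand-ins for milli's query types rather than the crate's actual API:

```rust
// Simplified stand-ins, only to exercise the keep-or-drop decision.
#[derive(Clone, Copy)]
enum QueryKind { Exact, Tolerant }

fn keep_in_query_tree(prefix: bool, kind: QueryKind, is_stop_word: bool) -> bool {
    // Mirrors `query.prefix || query.kind.is_tolerant() || !is_stop_word`.
    prefix || matches!(kind, QueryKind::Tolerant) || !is_stop_word
}

fn main() {
    // A plain, exact stop word is dropped from the tree...
    assert!(!keep_in_query_tree(false, QueryKind::Exact, true));
    // ...but kept while the user may still be typing it (prefix)...
    assert!(keep_in_query_tree(true, QueryKind::Exact, true));
    // ...or when typos are allowed on it.
    assert!(keep_in_query_tree(false, QueryKind::Tolerant, true));
    // Words that are not stop words are always kept.
    assert!(keep_in_query_tree(false, QueryKind::Exact, false));
}
```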
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index f7367d826..fb5b5b87c 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use std::{fmt, cmp, mem}; +use fst::Set; use levenshtein_automata::{DFA, Distance}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use roaring::RoaringBitmap; @@ -154,6 +155,10 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; + fn stop_words(&self) -> anyhow::Result>>; + fn is_stop_word(&self, word: &str) -> anyhow::Result { + Ok(self.stop_words()?.map_or(false, |s| s.contains(word))) + } fn synonyms>(&self, words: &[S]) -> heed::Result>>>; fn word_documents_count(&self, word: &str) -> heed::Result> { match self.word_docids(word)? { @@ -183,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { fn synonyms>(&self, _words: &[S]) -> heed::Result>>> { Ok(None) } + + fn stop_words(&self) -> anyhow::Result>> { + self.index.stop_words(self.rtxn) + } } impl<'a> QueryTreeBuilder<'a> { @@ -331,8 +340,7 @@ fn create_query_tree( optional_words: bool, authorize_typos: bool, query: PrimitiveQuery, -) -> anyhow::Result -{ +) -> anyhow::Result { /// Matches on the `PrimitiveQueryPart` and create an operation from it. fn resolve_primitive_part( ctx: &impl Context, @@ -350,7 +358,12 @@ fn create_query_tree( if let Some(child) = split_best_frequency(ctx, &word)? { children.push(child); } - children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); + + let is_stop_word = ctx.is_stop_word(&word)?; + let query = Query { prefix, kind: typos(word, authorize_typos) }; + if query.prefix || query.kind.is_tolerant() || !is_stop_word { + children.push(Operation::Query(query)); + } Ok(Operation::or(false, children)) }, // create a CONSECUTIVE operation wrapping all word in the phrase @@ -365,12 +378,11 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - ) -> anyhow::Result - { + ) -> anyhow::Result { const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) { + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { let mut or_op_children = Vec::new(); for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { @@ -381,23 +393,31 @@ fn create_query_tree( match group { [part] => { - let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; + let operation = + resolve_primitive_part(ctx, authorize_typos, part.clone())?; and_op_children.push(operation); - }, + } words => { - let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); - let words: Vec<_> = words.iter().filter_map(| part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }).collect(); + let is_prefix = words.last().map_or(false, |part| part.is_prefix()); + let words: Vec<_> = words + .iter() + .filter_map(|part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }) + .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); + + let is_stop_word = ctx.is_stop_word(&concat)?; let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; - operations.push(Operation::Query(query)); - and_op_children.push(Operation::or(false, operations)); + if query.prefix || query.kind.is_tolerant() || 
!is_stop_word { + operations.push(Operation::Query(query)); + and_op_children.push(Operation::or(false, operations)); + } } } @@ -581,6 +601,10 @@ mod test { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) } + + fn stop_words(&self) -> anyhow::Result>> { + Ok(None) + } } impl Default for TestContext { From 27c7ab6e001826bdd233c8adfea2e2b3f3384394 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Wed, 31 Mar 2021 18:14:23 +0300 Subject: [PATCH 0590/1889] feat(index): store fields distribution in index --- Cargo.lock | 6 +-- milli/src/index.rs | 42 +++++++++---------- milli/src/update/index_documents/mod.rs | 4 ++ milli/src/update/index_documents/transform.rs | 17 ++++++++ milli/src/update/settings.rs | 2 +- 5 files changed, 45 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8f296f2fa..91e72450a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1520,7 +1520,8 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" dependencies = [ "ucd-trie", ] @@ -1528,8 +1529,7 @@ dependencies = [ [[package]] name = "pest" version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" dependencies = [ "ucd-trie", ] diff --git a/milli/src/index.rs b/milli/src/index.rs index 642ad4ab7..2b364b068 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,6 +23,7 @@ pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; +pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; @@ -33,6 +34,8 @@ pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; const UPDATED_AT_KEY: &str = "updated-at"; +pub type FieldsDistribution = HashMap; + #[derive(Clone)] pub struct Index { /// The LMDB environment which this index is associated with. @@ -204,23 +207,18 @@ impl Index { Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default()) } - /* fields ids distribution */ + /* fields distribution */ - /// Returns the fields ids distribution which associate the internal field ids - /// with the number of times it occurs in the obkv documents. - // TODO store in the index itself and change only within updates that modify the documents - pub fn fields_ids_distribution(&self, rtxn: &RoTxn) -> anyhow::Result> { - let mut distribution = HashMap::new(); + /// Writes the fields distribution which associate the field with the number of times + /// it occurs in the obkv documents. 
+ pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>(wtxn, FIELDS_DISTRIBUTION_KEY, &distribution) + } - for document in self.documents.iter(rtxn)? { - let (_, obkv) = document?; - - for (field_id, _) in obkv.iter() { - *distribution.entry(field_id).or_default() += 1; - } - } - - Ok(distribution) + /// Returns the fields distribution which associate the field with the number of times + /// it occurs in the obkv documents. + pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default()) } /* displayed fields */ @@ -469,6 +467,7 @@ impl Index { #[cfg(test)] mod tests { use heed::EnvOpenOptions; + use maplit::hashmap; use crate::Index; use crate::update::{IndexDocuments, UpdateFormat}; @@ -493,16 +492,15 @@ mod tests { } #[test] - fn fields_ids_distribution() { + fn initial_fields_distribution() { let index = prepare_index(); let rtxn = index.read_txn().unwrap(); - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - - let fields_ids_distribution = index.fields_ids_distribution(&rtxn).unwrap(); - assert_eq!(fields_ids_distribution.len(), 2); - assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("age").unwrap()), Some(&1)); - assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("name").unwrap()), Some(&2)); + let fields_distribution = index.fields_distribution(&rtxn).unwrap(); + assert_eq!(fields_distribution, hashmap!{ + "age".to_string() => 1, + "name".to_string() => 2 + }); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f4a7c7f25..fb1a2d6c0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -358,6 +358,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let TransformOutput { primary_key, fields_ids_map, + fields_distribution, external_documents_ids, new_documents_ids, replaced_documents_ids, @@ -551,6 +552,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the fields ids map into the main database self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + // We write the fields distribution into the main database + self.index.put_fields_distribution(self.wtxn, &fields_distribution)?; + // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b22cd14c6..f8aac60d7 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,12 +14,14 @@ use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; +use crate::index::FieldsDistribution; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, + pub fields_distribution: FieldsDistribution, pub external_documents_ids: ExternalDocumentsIds<'static>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, @@ -74,6 +76,7 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let mut fields_distribution = 
self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); // Deserialize the whole batch of documents in memory. @@ -103,6 +106,7 @@ impl Transform<'_, '_> { return Ok(TransformOutput { primary_key, fields_ids_map, + fields_distribution, external_documents_ids: ExternalDocumentsIds::default(), new_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(), @@ -148,6 +152,8 @@ impl Transform<'_, '_> { // We prepare the fields ids map with the documents keys. for (key, _value) in &document { fields_ids_map.insert(&key).context("field id limit reached")?; + + *fields_distribution.entry(key.to_owned()).or_default() += 1; } // We retrieve the user id from the document based on the primary key name, @@ -200,6 +206,7 @@ impl Transform<'_, '_> { sorter, primary_key, fields_ids_map, + fields_distribution, documents_count, external_documents_ids, progress_callback, @@ -212,6 +219,7 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let mut csv = csv::Reader::from_reader(reader); @@ -307,6 +315,10 @@ impl Transform<'_, '_> { json_buffer.clear(); serde_json::to_writer(&mut json_buffer, &field)?; writer.insert(*field_id, &json_buffer)?; + + let field_name = fields_ids_map.name(*field_id).unwrap(); + + *fields_distribution.entry(field_name.to_string()).or_default() += 1; } // We use the extracted/generated user id as the key for this document. @@ -328,6 +340,7 @@ impl Transform<'_, '_> { sorter, primary_key_name, fields_ids_map, + fields_distribution, documents_count, external_documents_ids, progress_callback, @@ -342,6 +355,7 @@ impl Transform<'_, '_> { sorter: grenad::Sorter, primary_key: String, fields_ids_map: FieldsIdsMap, + fields_distribution: FieldsDistribution, approximate_number_of_documents: usize, mut external_documents_ids: ExternalDocumentsIds<'_>, progress_callback: F, @@ -439,6 +453,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map, + fields_distribution, external_documents_ids: external_documents_ids.into_static(), new_documents_ids, replaced_documents_ids, @@ -457,6 +472,7 @@ impl Transform<'_, '_> { new_fields_ids_map: FieldsIdsMap, ) -> anyhow::Result { + let fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_count = documents_ids.len() as usize; @@ -492,6 +508,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, + fields_distribution, external_documents_ids: external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 451447102..45a4c204c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -183,7 +183,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } - /// Udpates the index's searchable attributes. This causes the field map to be recomputed to + /// Updates the index's searchable attributes. This causes the field map to be recomputed to /// reflect the order of the searchable attributes. 
fn update_searchable(&mut self) -> anyhow::Result { match self.searchable_fields { From 2658c5c545489dc1f0e177a9e3128153aa6bc6a7 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Thu, 1 Apr 2021 10:07:16 +0300 Subject: [PATCH 0591/1889] feat(index): update fields distribution in clear & delete operations fixes after review bump the version of the tokenizer implement a first version of the stop_words The front must provide a BTreeSet containing the stop words The stop_words are set at None if an empty Set is provided add the stop-words in the http-ui interface Use maplit in the test and remove all the useless drop(rtxn) at the end of all tests Integrate the stop_words in the querytree remove the stop_words from the querytree except if it was a prefix or a typo more fixes after review --- Cargo.lock | 6 +-- milli/src/external_documents_ids.rs | 5 ++ milli/src/index.rs | 36 +++++-------- milli/src/lib.rs | 1 + milli/src/update/clear_documents.rs | 54 ++++++++++++++++++- milli/src/update/delete_documents.rs | 34 +++++++++++- milli/src/update/index_documents/transform.rs | 26 ++++++--- 7 files changed, 128 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 91e72450a..8f296f2fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1520,8 +1520,7 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" dependencies = [ "ucd-trie", ] @@ -1529,7 +1528,8 @@ dependencies = [ [[package]] name = "pest" version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" dependencies = [ "ucd-trie", ] diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 7c81cdde8..ee2a6c7bb 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -19,6 +19,11 @@ impl<'a> ExternalDocumentsIds<'a> { } } + /// Returns `true` if hard and soft external documents lists are empty. 
+ pub fn is_empty(&self) -> bool { + self.hard.is_empty() && self.soft.is_empty() + } + pub fn get>(&self, external_id: A) -> Option { let external_id = external_id.as_ref(); match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { diff --git a/milli/src/index.rs b/milli/src/index.rs index 2b364b068..1150edbca 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -10,7 +10,7 @@ use chrono::{Utc, DateTime}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::{default_criteria, Criterion, Search, FacetDistribution}; +use crate::{default_criteria, Criterion, Search, FacetDistribution, FieldsDistribution}; use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds}; use crate::{ RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec, @@ -34,8 +34,6 @@ pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; const UPDATED_AT_KEY: &str = "updated-at"; -pub type FieldsDistribution = HashMap; - #[derive(Clone)] pub struct Index { /// The LMDB environment which this index is associated with. @@ -209,14 +207,14 @@ impl Index { /* fields distribution */ - /// Writes the fields distribution which associate the field with the number of times - /// it occurs in the obkv documents. + /// Writes the fields distribution which associates every field name with + /// the number of times it occurs in the documents. pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, FIELDS_DISTRIBUTION_KEY, &distribution) + self.main.put::<_, Str, SerdeJson>(wtxn, FIELDS_DISTRIBUTION_KEY, distribution) } - /// Returns the fields distribution which associate the field with the number of times - /// it occurs in the obkv documents. + /// Returns the fields distribution which associates every field name with + /// the number of times it occurs in the documents. 
pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result { Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default()) } @@ -472,35 +470,29 @@ mod tests { use crate::Index; use crate::update::{IndexDocuments, UpdateFormat}; - fn prepare_index() -> Index { + #[test] + fn initial_fields_distribution() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#" - { "name": "kevin" } - { "name": "bob", "age": 20 } - "#[..]; + let content = &br#"[ + { "name": "kevin" }, + { "name": "bob", "age": 20 } + ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::JsonStream); + builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); - index - } - - #[test] - fn initial_fields_distribution() { - let index = prepare_index(); - let rtxn = index.read_txn().unwrap(); let fields_distribution = index.fields_distribution(&rtxn).unwrap(); assert_eq!(fields_distribution, hashmap!{ + "name".to_string() => 2, "age".to_string() => 1, - "name".to_string() => 2 }); } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index d6a078a1f..fe9bd828b 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -41,6 +41,7 @@ pub type Attribute = u32; pub type DocumentId = u32; pub type FieldId = u8; pub type Position = u32; +pub type FieldsDistribution = HashMap; type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result>; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 5ae3680d3..2c24d9c07 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,6 +1,6 @@ use chrono::Utc; use roaring::RoaringBitmap; -use crate::{ExternalDocumentsIds, Index}; +use crate::{ExternalDocumentsIds, Index, FieldsDistribution}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -42,6 +42,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; + self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?; // We clean all the faceted documents ids. for (field_id, _) in faceted_fields { @@ -61,3 +62,54 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { Ok(number_of_documents) } } + +#[cfg(test)] +mod tests { + use heed::EnvOpenOptions; + + use crate::update::{IndexDocuments, UpdateFormat}; + use super::*; + + #[test] + fn clear_documents() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = &br#"[ + { "id": 0, "name": "kevin", "age": 20 }, + { "id": 1, "name": "kevina" }, + { "id": 2, "name": "benoit", "country": "France" } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + + // Clear all documents from the database. 
+ let builder = ClearDocuments::new(&mut wtxn, &index, 1); + assert_eq!(builder.execute().unwrap(), 3); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 4); + + assert!(index.words_fst(&rtxn).unwrap().is_empty()); + assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); + assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); + assert!(index.documents_ids(&rtxn).unwrap().is_empty()); + assert!(index.fields_distribution(&rtxn).unwrap().is_empty()); + + assert!(index.word_docids.is_empty(&rtxn).unwrap()); + assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); + assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); + assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); + assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); + assert!(index.facet_field_id_value_docids.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_values.is_empty(&rtxn).unwrap()); + assert!(index.documents.is_empty(&rtxn).unwrap()); + } +} diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 0b112ceb1..4c5bf0a8a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,3 +1,6 @@ +use std::collections::HashMap; +use std::collections::hash_map::Entry; + use anyhow::anyhow; use chrono::Utc; use fst::IntoStreamer; @@ -90,6 +93,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { documents, } = self.index; + // Number of fields for each document that has been deleted. + let mut fields_ids_distribution_diff = HashMap::new(); + // Retrieve the words and the external documents ids contained in the documents. let mut words = Vec::new(); let mut external_ids = Vec::new(); @@ -100,6 +106,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let key = BEU32::new(docid); let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; if let Some((_key, obkv)) = iter.next().transpose()? { + for (field_id, _) in obkv.iter() { + *fields_ids_distribution_diff.entry(field_id).or_default() += 1; + } + if let Some(content) = obkv.get(id_field) { let external_id = match serde_json::from_slice(content).unwrap() { Value::String(string) => SmallString32::from(string.as_str()), @@ -112,7 +122,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } drop(iter); - // We iterate througt the words positions of the document id, + // We iterate through the words positions of the document id, // retrieve the word and delete the positions. let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?; while let Some(result) = iter.next() { @@ -123,6 +133,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } + let mut fields_distribution = self.index.fields_distribution(self.wtxn)?; + + // We use pre-calculated number of fields occurrences that needs to be deleted + // to reflect deleted documents. + // If all field occurrences are removed, delete the entry from distribution. + // Otherwise, insert new number of occurrences (current_count - count_diff). 
+ for (field_id, count_diff) in fields_ids_distribution_diff { + let field_name = fields_ids_map.name(field_id).unwrap(); + if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { + match entry.get().checked_sub(count_diff) { + Some(0) | None => entry.remove(), + Some(count) => entry.insert(count) + }; + } + } + + self.index.put_fields_distribution(self.wtxn, &fields_distribution)?; + // We create the FST map of the external ids that we must delete. external_ids.sort_unstable(); let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?; @@ -347,5 +375,9 @@ mod tests { builder.execute().unwrap(); wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + assert!(index.fields_distribution(&rtxn).unwrap().is_empty()); } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f8aac60d7..308a24abc 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::iter::Peekable; @@ -10,11 +11,10 @@ use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId}; +use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; -use crate::index::FieldsDistribution; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -137,6 +137,8 @@ impl Transform<'_, '_> { let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; let mut documents_count = 0; + let mut fields_ids_distribution = HashMap::new(); + for result in documents { let document = result?; @@ -151,9 +153,9 @@ impl Transform<'_, '_> { // We prepare the fields ids map with the documents keys. for (key, _value) in &document { - fields_ids_map.insert(&key).context("field id limit reached")?; + let field_id = fields_ids_map.insert(&key).context("field id limit reached")?; - *fields_distribution.entry(key.to_owned()).or_default() += 1; + *fields_ids_distribution.entry(field_id).or_insert(0) += 1; } // We retrieve the user id from the document based on the primary key name, @@ -196,6 +198,11 @@ impl Transform<'_, '_> { documents_count += 1; } + for (field_id, count) in fields_ids_distribution { + let field_name = fields_ids_map.name(field_id).unwrap(); + *fields_distribution.entry(field_name.to_string()).or_default() += count; + } + progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { documents_seen: documents_count, }); @@ -277,6 +284,8 @@ impl Transform<'_, '_> { let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; let mut documents_count = 0; + let mut fields_ids_distribution = HashMap::new(); + let mut record = csv::StringRecord::new(); while csv.read_record(&mut record)? { obkv_buffer.clear(); @@ -316,9 +325,7 @@ impl Transform<'_, '_> { serde_json::to_writer(&mut json_buffer, &field)?; writer.insert(*field_id, &json_buffer)?; - let field_name = fields_ids_map.name(*field_id).unwrap(); - - *fields_distribution.entry(field_name.to_string()).or_default() += 1; + *fields_ids_distribution.entry(*field_id).or_insert(0) += 1; } // We use the extracted/generated user id as the key for this document. 
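Note how both transform paths above first tally occurrences per `FieldId` and resolve ids to field names only once, after the whole batch has been read. A rough sketch of that two-phase counting, with a plain `HashMap` standing in for `FieldsIdsMap` (the types here are simplified stand-ins, not milli's):

```rust
use std::collections::HashMap;

fn main() {
    // Toy stand-in for FieldsIdsMap: field id -> field name.
    let names: HashMap<u8, &str> = [(0u8, "name"), (1u8, "age")].into();

    // Phase 1: tally by small integer id while streaming the documents.
    let mut by_id: HashMap<u8, u64> = HashMap::new();
    for document_fields in [vec![0u8], vec![0, 1]] {
        for field_id in document_fields {
            *by_id.entry(field_id).or_insert(0) += 1;
        }
    }

    // Phase 2: one name lookup and one String allocation per distinct field,
    // instead of one per (document, field) pair.
    let mut distribution: HashMap<String, u64> = HashMap::new();
    for (field_id, count) in by_id {
        *distribution.entry(names[&field_id].to_string()).or_default() += count;
    }
    assert_eq!(distribution["name"], 2);
    assert_eq!(distribution["age"], 1);
}
```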
@@ -326,6 +333,11 @@ impl Transform<'_, '_> { documents_count += 1; } + for (field_id, count) in fields_ids_distribution { + let field_name = fields_ids_map.name(field_id).unwrap(); + *fields_distribution.entry(field_name.to_string()).or_default() += count; + } + progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { documents_seen: documents_count, }); From 0a4bde1f2fd87bf8e79f8d75176661527a0eb412 Mon Sep 17 00:00:00 2001 From: tamo Date: Thu, 1 Apr 2021 19:13:18 +0200 Subject: [PATCH 0592/1889] update the default ordering of the criterion --- milli/src/criterion.rs | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 40f9a3e0b..8bae99a20 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -14,18 +14,16 @@ static ASC_DESC_REGEX: Lazy = Lazy::new(|| { #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { + /// Sorted by decreasing number of matched query terms. + /// Query words at the front of an attribute is considered better than if it was at the back. + Words, /// Sorted by increasing number of typos. Typo, - /// Sorted by decreasing number of matched query terms. - Words, /// Sorted by increasing distance between matched query terms. Proximity, /// Documents with quey words contained in more important /// attributes are considred better. Attribute, - /// Documents with query words at the front of an attribute is - /// considered better than if it was at the back. - WordsPosition, /// Sorted by the similarity of the matched words with the query words. Exactness, /// Sorted by the increasing value of the field specified. @@ -37,11 +35,10 @@ pub enum Criterion { impl Criterion { pub fn from_str(faceted_attributes: &HashMap, txt: &str) -> anyhow::Result { match txt { - "typo" => Ok(Criterion::Typo), "words" => Ok(Criterion::Words), + "typo" => Ok(Criterion::Typo), "proximity" => Ok(Criterion::Proximity), "attribute" => Ok(Criterion::Attribute), - "wordsposition" => Ok(Criterion::WordsPosition), "exactness" => Ok(Criterion::Exactness), text => { let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; @@ -60,11 +57,10 @@ impl Criterion { pub fn default_criteria() -> Vec { vec![ - Criterion::Typo, Criterion::Words, + Criterion::Typo, Criterion::Proximity, Criterion::Attribute, - Criterion::WordsPosition, Criterion::Exactness, ] } @@ -74,11 +70,10 @@ impl fmt::Display for Criterion { use Criterion::*; match self { - Typo => f.write_str("typo"), Words => f.write_str("words"), + Typo => f.write_str("typo"), Proximity => f.write_str("proximity"), Attribute => f.write_str("attribute"), - WordsPosition => f.write_str("wordsPosition"), Exactness => f.write_str("exactness"), Asc(attr) => write!(f, "asc({})", attr), Desc(attr) => write!(f, "desc({})", attr), From dc636d190d6e6d332ff75e5811e0bf87011d474d Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Wed, 7 Apr 2021 14:33:44 +0300 Subject: [PATCH 0593/1889] refactor(http, update): introduce setting enum --- Cargo.lock | 6 +- http-ui/src/main.rs | 136 +++++++++++++------------------- milli/src/update/mod.rs | 19 ++--- milli/src/update/settings.rs | 147 ++++++++++++++++++++++------------- 4 files changed, 160 insertions(+), 148 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8f296f2fa..91e72450a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1520,7 +1520,8 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" 
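Returning to the criterion reorder above: with `wordsPosition` folded away, any unrecognized criterion name falls through to the `asc(field)`/`desc(field)` arm of `from_str`. A hedged sketch of that parse, assuming a regex of roughly the shape the `ASC_DESC_REGEX` name suggests (the real pattern lives in `criterion.rs` and may differ; this uses the `regex` crate):

```rust
use regex::Regex;

/// Parse "asc(age)" / "desc(age)" into (direction, field), the way the
/// fallback arm does. Returns None for anything else.
fn parse_asc_desc(txt: &str) -> Option<(String, String)> {
    // Assumed pattern; not copied from the patch.
    let re = Regex::new(r"^(asc|desc)\((\w+)\)$").unwrap();
    let caps = re.captures(txt)?;
    Some((caps[1].to_string(), caps[2].to_string()))
}

fn main() {
    assert_eq!(
        parse_asc_desc("asc(age)"),
        Some(("asc".to_string(), "age".to_string()))
    );
    // "wordsposition" no longer names a criterion, so it must not parse either.
    assert!(parse_asc_desc("wordsposition").is_none());
}
```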
[[package]] name = "pest" version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" dependencies = [ "ucd-trie", ] @@ -1528,8 +1529,7 @@ dependencies = [ [[package]] name = "pest" version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" dependencies = [ "ucd-trie", ] diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index f068b5b9a..6e9a07855 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,38 +1,38 @@ +use std::{io, mem}; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; -use std::fs::{File, create_dir_all}; +use std::fs::{create_dir_all, File}; use std::net::SocketAddr; use std::num::NonZeroUsize; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use std::time::Instant; -use std::{mem, io}; use askama_warp::Template; use byte_unit::Byte; use either::Either; use flate2::read::GzDecoder; -use futures::stream; use futures::{FutureExt, StreamExt}; +use futures::stream; use grenad::CompressionType; use heed::EnvOpenOptions; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::OnceCell; use rayon::ThreadPool; -use serde::{Serialize, Deserialize, Deserializer}; +use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; use tokio::sync::broadcast; -use warp::filters::ws::Message; use warp::{Filter, http::Response}; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; +use warp::filters::ws::Message; +use milli::{FacetCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore}; use milli::facet::FacetValue; +use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; use milli::update::UpdateIndexingStep::*; -use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; -use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -154,17 +154,17 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { } } Value::String(string) - }, + } Value::Array(values) => { Value::Array(values.into_iter() .map(|v| self.highlight_value(v, matching_words)) .collect()) - }, + } Value::Object(object) => { Value::Object(object.into_iter() .map(|(k, v)| (k, self.highlight_value(v, matching_words))) .collect()) - }, + } } } @@ -246,36 +246,20 @@ enum UpdateMetaProgress { #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] struct Settings { - #[serde( - default, - deserialize_with = "deserialize_some", - skip_serializing_if = "Option::is_none", - )] - displayed_attributes: Option>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + displayed_attributes: Setting>, - #[serde( - default, - deserialize_with = "deserialize_some", - skip_serializing_if = "Option::is_none", - )] - searchable_attributes: Option>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + searchable_attributes: Setting>, - #[serde(default)] - faceted_attributes: Option>, + #[serde(default, 
skip_serializing_if = "Setting::is_not_set")] + faceted_attributes: Setting>, - #[serde( - default, - deserialize_with = "deserialize_some", - skip_serializing_if = "Option::is_none", - )] - criteria: Option>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + criteria: Setting>, - #[serde( - default, - deserialize_with = "deserialize_some", - skip_serializing_if = "Option::is_none", - )] - stop_words: Option>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + stop_words: Setting>, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -294,14 +278,6 @@ struct WordsPrefixes { max_prefix_length: Option, } -// Any value that is present is considered Some value, including null. -fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> -where T: Deserialize<'de>, - D: Deserializer<'de> -{ - Deserialize::deserialize(deserializer).map(Some) -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); @@ -339,7 +315,7 @@ async fn main() -> anyhow::Result<()> { update_store_options, update_store_path, // the type hint is necessary: https://github.com/rust-lang/rust/issues/32600 - move |update_id, meta, content:&_| { + move |update_id, meta, content: &_| { // We prepare the update by using the update builder. let mut update_builder = UpdateBuilder::new(update_id); if let Some(max_nb_chunks) = indexer_opt_cloned.max_nb_chunks { @@ -396,7 +372,7 @@ async fn main() -> anyhow::Result<()> { total_steps: indexing_step.number_of_steps(), current, total, - } + }, }); }); @@ -404,7 +380,7 @@ async fn main() -> anyhow::Result<()> { Ok(_) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) } - }, + } UpdateMeta::ClearDocuments => { // We must use the write transaction of the update here. let mut wtxn = index_cloned.write_txn()?; @@ -414,47 +390,45 @@ async fn main() -> anyhow::Result<()> { Ok(_count) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) } - }, + } UpdateMeta::Settings(settings) => { // We must use the write transaction of the update here. let mut wtxn = index_cloned.write_txn()?; let mut builder = update_builder.settings(&mut wtxn, &index_cloned); // We transpose the settings JSON struct into a real setting update. - if let Some(names) = settings.searchable_attributes { - match names { - Some(names) => builder.set_searchable_fields(names), - None => builder.reset_searchable_fields(), - } + match settings.searchable_attributes { + Setting::Set(searchable_attributes) => builder.set_searchable_fields(searchable_attributes), + Setting::Reset => builder.reset_searchable_fields(), + Setting::NotSet => () } // We transpose the settings JSON struct into a real setting update. - if let Some(names) = settings.displayed_attributes { - match names { - Some(names) => builder.set_displayed_fields(names), - None => builder.reset_displayed_fields(), - } + match settings.displayed_attributes { + Setting::Set(displayed_attributes) => builder.set_displayed_fields(displayed_attributes), + Setting::Reset => builder.reset_displayed_fields(), + Setting::NotSet => () } // We transpose the settings JSON struct into a real setting update. - if let Some(facet_types) = settings.faceted_attributes { - builder.set_faceted_fields(facet_types); + match settings.faceted_attributes { + Setting::Set(faceted_attributes) => builder.set_faceted_fields(faceted_attributes), + Setting::Reset => builder.reset_faceted_fields(), + Setting::NotSet => () } // We transpose the settings JSON struct into a real setting update. 
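Every field in this handler now goes through the same three-armed `match` on `Setting`. If the repetition ever became a burden, a small generic dispatcher would be one way to factor it out; the helper below is hypothetical and not part of the patch, which deliberately keeps the explicit matches readable at each call site:

```rust
// Hypothetical helper, not in the patch: fold the three Setting states into
// a pair of closures so each field dispatches through one function.
enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

fn apply<T>(setting: Setting<T>, set: impl FnOnce(T), reset: impl FnOnce()) {
    match setting {
        Setting::Set(value) => set(value),
        Setting::Reset => reset(),
        Setting::NotSet => (), // nothing was sent for this field
    }
}

fn main() {
    let criteria = Setting::Set(vec!["words".to_string(), "typo".to_string()]);
    apply(criteria, |c| println!("set criteria to {:?}", c), || println!("reset criteria"));
    apply(Setting::<Vec<String>>::Reset, |_| (), || println!("reset stop words"));
    apply(Setting::<Vec<String>>::NotSet, |_| (), || ());
}
```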
- if let Some(criteria) = settings.criteria { - match criteria { - Some(criteria) => builder.set_criteria(criteria), - None => builder.reset_criteria(), - } + match settings.criteria { + Setting::Set(criteria) => builder.set_criteria(criteria), + Setting::Reset => builder.reset_criteria(), + Setting::NotSet => () } // We transpose the settings JSON struct into a real setting update. - if let Some(stop_words) = settings.stop_words { - match stop_words { - Some(stop_words) => builder.set_stop_words(stop_words), - None => builder.reset_stop_words(), - } + match settings.stop_words { + Setting::Set(stop_words) => builder.set_stop_words(stop_words), + Setting::Reset => builder.reset_stop_words(), + Setting::NotSet => () } let result = builder.execute(|indexing_step, update_id| { @@ -471,7 +445,7 @@ async fn main() -> anyhow::Result<()> { total_steps: indexing_step.number_of_steps(), current, total, - } + }, }); }); @@ -479,7 +453,7 @@ async fn main() -> anyhow::Result<()> { Ok(_count) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) } - }, + } UpdateMeta::Facets(levels) => { // We must use the write transaction of the update here. let mut wtxn = index_cloned.write_txn()?; @@ -494,7 +468,7 @@ async fn main() -> anyhow::Result<()> { Ok(()) => wtxn.commit().map_err(Into::into), Err(e) => Err(e.into()) } - }, + } UpdateMeta::WordsPrefixes(settings) => { // We must use the write transaction of the update here. let mut wtxn = index_cloned.write_txn()?; @@ -716,7 +690,7 @@ async fn main() -> anyhow::Result<()> { let filters = match query.filters { Some(condition) if !condition.trim().is_empty() => { Some(FacetCondition::from_str(&rtxn, &index, &condition).unwrap()) - }, + } _otherwise => None, }; @@ -724,14 +698,14 @@ async fn main() -> anyhow::Result<()> { Some(array) => { let eithers = array.into_iter().map(Into::into); FacetCondition::from_array(&rtxn, &index, eithers).unwrap() - }, + } _otherwise => None, }; let condition = match (filters, facet_filters) { (Some(filters), Some(facet_filters)) => { Some(FacetCondition::And(Box::new(filters), Box::new(facet_filters))) - }, + } (Some(condition), None) | (None, Some(condition)) => Some(condition), _otherwise => None, }; @@ -807,12 +781,12 @@ async fn main() -> anyhow::Result<()> { Response::builder() .header("Content-Type", "application/json") .body(serde_json::to_string(&document).unwrap()) - }, + } None => { Response::builder() .status(404) .body(format!("Document with id {:?} not found.", id)) - }, + } } }); @@ -978,11 +952,11 @@ async fn main() -> anyhow::Result<()> { Ok(status) => { let msg = serde_json::to_string(&status).unwrap(); stream::iter(Some(Ok(Message::text(msg)))) - }, + } Err(e) => { eprintln!("channel error: {:?}", e); stream::iter(None) - }, + } } }) .forward(websocket) diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index fcdcb33e9..c2df94468 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,3 +1,13 @@ +pub use self::available_documents_ids::AvailableDocumentsIds; +pub use self::clear_documents::ClearDocuments; +pub use self::delete_documents::DeleteDocuments; +pub use self::facets::Facets; +pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat}; +pub use self::settings::{Setting, Settings}; +pub use self::update_builder::UpdateBuilder; +pub use self::update_step::UpdateIndexingStep; +pub use self::words_prefixes::WordsPrefixes; + mod available_documents_ids; mod clear_documents; mod delete_documents; @@ -8,12 +18,3 @@ mod 
update_builder;
 mod update_step;
 mod words_prefixes;
 
-pub use self::available_documents_ids::AvailableDocumentsIds;
-pub use self::clear_documents::ClearDocuments;
-pub use self::delete_documents::DeleteDocuments;
-pub use self::facets::Facets;
-pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat, DocumentAdditionResult};
-pub use self::settings::Settings;
-pub use self::update_builder::UpdateBuilder;
-pub use self::update_step::UpdateIndexingStep;
-pub use self::words_prefixes::WordsPrefixes;
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 45a4c204c..f73d0f4d2 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -6,12 +6,51 @@ use chrono::Utc;
 use grenad::CompressionType;
 use itertools::Itertools;
 use rayon::ThreadPool;
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
 
+use crate::{FieldsIdsMap, Index};
 use crate::criterion::Criterion;
 use crate::facet::FacetType;
-use crate::update::index_documents::{Transform, IndexDocumentsMethod};
 use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
-use crate::{Index, FieldsIdsMap};
+use crate::update::index_documents::{IndexDocumentsMethod, Transform};
+
+#[derive(Debug, Clone)]
+pub enum Setting<T> {
+    Set(T),
+    NotSet,
+    Reset,
+}
+
+impl<T> Default for Setting<T> {
+    fn default() -> Self {
+        Self::NotSet
+    }
+}
+
+impl<T> Setting<T> {
+    pub const fn is_not_set(&self) -> bool {
+        matches!(self, Self::NotSet)
+    }
+}
+
+impl<T: Serialize> Serialize for Setting<T> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> where S: Serializer {
+        match self {
+            Self::Set(value) => Some(value),
+            // Usually not_set isn't serialized by setting skip_serializing_if field attribute
+            Self::NotSet | Self::Reset => None,
+        }.serialize(serializer)
+    }
+}
+
+impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> where D: Deserializer<'de> {
+        Deserialize::deserialize(deserializer).map(|x| match x {
+            Some(x) => Self::Set(x),
+            None => Self::Reset, // Reset is forced by sending null value
+        })
+    }
+}
 
 pub struct Settings<'a, 't, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -26,13 +65,11 @@ pub struct Settings<'a, 't, 'u, 'i> {
     pub(crate) thread_pool: Option<&'a ThreadPool>,
     update_id: u64,
 
-    // If a struct field is set to `None` it means that it hasn't been set by the user,
-    // however if it is `Some(None)` it means that the user forced a reset of the setting.
- searchable_fields: Option>>, - displayed_fields: Option>>, - faceted_fields: Option>>, - criteria: Option>>, - stop_words: Option>>, + searchable_fields: Setting>, + displayed_fields: Setting>, + faceted_fields: Setting>, + criteria: Setting>, + stop_words: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -52,62 +89,62 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { chunk_compression_level: None, chunk_fusing_shrink_size: None, thread_pool: None, - searchable_fields: None, - displayed_fields: None, - faceted_fields: None, - criteria: None, - stop_words: None, + searchable_fields: Setting::NotSet, + displayed_fields: Setting::NotSet, + faceted_fields: Setting::NotSet, + criteria: Setting::NotSet, + stop_words: Setting::NotSet, update_id, } } pub fn reset_searchable_fields(&mut self) { - self.searchable_fields = Some(None); + self.searchable_fields = Setting::Reset; } pub fn set_searchable_fields(&mut self, names: Vec) { - self.searchable_fields = Some(Some(names)); + self.searchable_fields = Setting::Set(names); } pub fn reset_displayed_fields(&mut self) { - self.displayed_fields = Some(None); + self.displayed_fields = Setting::Reset; } pub fn set_displayed_fields(&mut self, names: Vec) { - self.displayed_fields = Some(Some(names)); - } - - pub fn set_faceted_fields(&mut self, names_facet_types: HashMap) { - self.faceted_fields = Some(Some(names_facet_types)); + self.displayed_fields = Setting::Set(names); } pub fn reset_faceted_fields(&mut self) { - self.faceted_fields = Some(None); + self.faceted_fields = Setting::Reset; + } + + pub fn set_faceted_fields(&mut self, names_facet_types: HashMap) { + self.faceted_fields = Setting::Set(names_facet_types); } pub fn reset_criteria(&mut self) { - self.criteria = Some(None); + self.criteria = Setting::Reset; } pub fn set_criteria(&mut self, criteria: Vec) { - self.criteria = Some(Some(criteria)); + self.criteria = Setting::Set(criteria); } pub fn reset_stop_words(&mut self) { - self.stop_words = Some(None); + self.stop_words = Setting::Reset; } pub fn set_stop_words(&mut self, stop_words: BTreeSet) { self.stop_words = if stop_words.is_empty() { - Some(None) + Setting::Reset } else { - Some(Some(stop_words)) + Setting::Set(stop_words) } } fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> - where - F: Fn(UpdateIndexingStep, u64) + Sync + where + F: Fn(UpdateIndexingStep, u64) + Sync { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let update_id = self.update_id; @@ -115,7 +152,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // if the settings are set before any document update, we don't need to do anything, and // will set the primary key during the first document addition. if self.index.number_of_documents(&self.wtxn)? 
== 0 { - return Ok(()) + return Ok(()); } let transform = Transform { @@ -160,7 +197,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_displayed(&mut self) -> anyhow::Result { match self.displayed_fields { - Some(Some(ref fields)) => { + Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account let names: Vec<_> = fields @@ -177,8 +214,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_displayed_fields(self.wtxn, &names)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Some(None) => { self.index.delete_displayed_fields(self.wtxn)?; }, - None => return Ok(false), + Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; } + Setting::NotSet => return Ok(false), } Ok(true) } @@ -187,7 +224,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { /// reflect the order of the searchable attributes. fn update_searchable(&mut self) -> anyhow::Result { match self.searchable_fields { - Some(Some(ref fields)) => { + Setting::Set(ref fields) => { // every time the searchable attributes are updated, we need to update the // ids for any settings that uses the facets. (displayed_fields, // faceted_fields) @@ -218,15 +255,15 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_searchable_fields(self.wtxn, &names)?; self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; } - Some(None) => { self.index.delete_searchable_fields(self.wtxn)?; }, - None => return Ok(false), + Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; } + Setting::NotSet => return Ok(false), } Ok(true) } fn update_stop_words(&mut self) -> anyhow::Result { match self.stop_words { - Some(Some(ref stop_words)) => { + Setting::Set(ref stop_words) => { let current = self.index.stop_words(self.wtxn)?; // since we can't compare a BTreeSet with an FST we are going to convert the // BTreeSet to an FST and then compare bytes per bytes the two FSTs. 
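The stop-words comparison above relies on two facts: a `BTreeSet` iterates in lexicographic order, which is exactly what `fst::Set::from_iter` requires, and two FSTs built from the same word set are byte-identical. A standalone sketch of that equality check:

```rust
use std::collections::BTreeSet;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let words: BTreeSet<&str> = ["a", "of", "the"].into_iter().collect();

    // A BTreeSet iterates in lexicographic order, which satisfies the
    // sorted-input requirement of fst::Set::from_iter.
    let new_fst = fst::Set::from_iter(words.iter())?;
    let current = fst::Set::from_iter(["a", "of", "the"])?;

    // Byte-per-byte comparison of the two automata, as in update_stop_words.
    let changed = current.as_fst().as_bytes() != new_fst.as_fst().as_bytes();
    assert!(!changed);
    Ok(())
}
```

Skipping the write when the bytes match is what lets `update_stop_words` return `Ok(false)` and avoid a needless reindex.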
@@ -241,14 +278,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(false) } } - Some(None) => Ok(self.index.delete_stop_words(self.wtxn)?), - None => Ok(false), + Setting::Reset => Ok(self.index.delete_stop_words(self.wtxn)?), + Setting::NotSet => Ok(false), } } fn update_facets(&mut self) -> anyhow::Result { match self.faceted_fields { - Some(Some(ref fields)) => { + Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_facets = HashMap::new(); for (name, ty) in fields { @@ -259,15 +296,15 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_faceted_fields(self.wtxn, &new_facets)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Some(None) => { self.index.delete_faceted_fields(self.wtxn)?; }, - None => return Ok(false) + Setting::Reset => { self.index.delete_faceted_fields(self.wtxn)?; } + Setting::NotSet => return Ok(false) } Ok(true) } fn update_criteria(&mut self) -> anyhow::Result<()> { match self.criteria { - Some(Some(ref fields)) => { + Setting::Set(ref fields) => { let faceted_fields = self.index.faceted_fields(&self.wtxn)?; let mut new_criteria = Vec::new(); for name in fields { @@ -276,15 +313,15 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } self.index.put_criteria(self.wtxn, &new_criteria)?; } - Some(None) => { self.index.delete_criteria(self.wtxn)?; } - None => (), + Setting::Reset => { self.index.delete_criteria(self.wtxn)?; } + Setting::NotSet => (), } Ok(()) } pub fn execute(mut self, progress_callback: F) -> anyhow::Result<()> - where - F: Fn(UpdateIndexingStep, u64) + Sync + where + F: Fn(UpdateIndexingStep, u64) + Sync { self.index.set_updated_at(self.wtxn, &Utc::now())?; let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; @@ -305,14 +342,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { #[cfg(test)] mod tests { - use super::*; - use heed::EnvOpenOptions; - use maplit::{hashmap, btreeset}; + use maplit::{btreeset, hashmap}; use crate::facet::FacetType; use crate::update::{IndexDocuments, UpdateFormat}; + use super::*; + #[test] fn set_and_reset_searchable_fields() { let path = tempfile::tempdir().unwrap(); @@ -480,7 +517,7 @@ mod tests { // Set the faceted fields to be the age. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "age".into() => "integer".into() }); + builder.set_faceted_fields(hashmap! { "age".into() => "integer".into() }); builder.execute(|_, _| ()).unwrap(); // Then index some documents. @@ -493,7 +530,7 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashmap!{ "age".to_string() => FacetType::Integer }); + assert_eq!(fields_ids, hashmap! { "age".to_string() => FacetType::Integer }); // Only count the field_id 0 and level 0 facet values. let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 3); @@ -550,7 +587,7 @@ mod tests { // In the same transaction we provide some stop_words let mut builder = Settings::new(&mut wtxn, &index, 0); - let set = btreeset!{ "i".to_string(), "the".to_string(), "are".to_string() }; + let set = btreeset! 
{ "i".to_string(), "the".to_string(), "are".to_string() }; builder.set_stop_words(set.clone()); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -614,7 +651,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); - builder.set_faceted_fields(hashmap!{ + builder.set_faceted_fields(hashmap! { "age".into() => "integer".into(), "toto".into() => "integer".into(), }); From 84c1dda39d8aae1335570259aaac3b5e8f41398a Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Wed, 7 Apr 2021 15:06:14 +0300 Subject: [PATCH 0594/1889] test(http): setting enum serialize/deserialize --- Cargo.lock | 10 ++++++ http-ui/Cargo.toml | 3 ++ http-ui/src/main.rs | 61 +++++++++++++++++++++++++++++++++++- milli/src/update/settings.rs | 4 +-- 4 files changed, 75 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 91e72450a..a76ad8709 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -911,6 +911,7 @@ dependencies = [ "rayon", "serde", "serde_json", + "serde_test", "stderrlog", "structopt", "tempfile", @@ -2079,6 +2080,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_test" +version = "1.0.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4bb5fef7eaf5a97917567183607ac4224c5b451c15023930f23b937cce879fe" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.6.1" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 02a799091..748564c03 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -37,3 +37,6 @@ fst = "0.4.5" # Temporary fix for bitvec, remove once fixed. (https://github.com/bitvecto-rs/bitvec/issues/105) funty = "=1.1" + +[dev-dependencies] +serde_test = "1.0.125" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 6e9a07855..1b77e443e 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -242,7 +242,7 @@ enum UpdateMetaProgress { }, } -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] struct Settings { @@ -993,3 +993,62 @@ async fn main() -> anyhow::Result<()> { let addr = SocketAddr::from_str(&opt.http_listen_addr)?; Ok(warp::serve(routes).run(addr).await) } + +#[cfg(test)] +mod tests { + use serde_test::{assert_de_tokens, assert_ser_tokens, Token}; + + use milli::update::Setting; + + use crate::Settings; + + #[test] + fn serialize_settings() { + let settings = Settings { + displayed_attributes: Setting::Set(vec!["name".to_string()]), + searchable_attributes: Setting::Reset, + faceted_attributes: Setting::NotSet, + criteria: Setting::NotSet, + stop_words: Default::default(), + }; + + assert_ser_tokens(&settings, &[ + Token::Struct { name: "Settings", len: 3 }, + Token::Str("displayedAttributes"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("name"), + Token::SeqEnd, + Token::Str("searchableAttributes"), + Token::None, + Token::Str("facetedAttributes"), + Token::None, + Token::StructEnd, + ]); + } + + #[test] + fn deserialize_settings() { + let settings = Settings { + displayed_attributes: Setting::Set(vec!["name".to_string()]), + searchable_attributes: Setting::Reset, + faceted_attributes: Setting::Reset, + criteria: Setting::NotSet, + stop_words: Setting::NotSet, + }; + + assert_de_tokens(&settings, &[ + Token::Struct { name: "Settings", len: 3 }, + Token::Str("displayedAttributes"), + Token::Some, + Token::Seq { len: Some(1) }, + 
Token::Str("name"), + Token::SeqEnd, + Token::Str("searchableAttributes"), + Token::None, + Token::Str("facetedAttributes"), + Token::None, + Token::StructEnd, + ]); + } +} diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index f73d0f4d2..5ad942ad6 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -14,11 +14,11 @@ use crate::facet::FacetType; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub enum Setting { Set(T), - NotSet, Reset, + NotSet, } impl Default for Setting { From 3af8fa194c7f04975ba6d13f1245185ec375e7aa Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Sat, 10 Apr 2021 12:13:59 +0300 Subject: [PATCH 0595/1889] test(http): combine settings assert_(ser|de)_tokens into 1 test --- Cargo.lock | 1 + http-ui/Cargo.toml | 1 + http-ui/src/main.rs | 81 ++++++++++++++++++++++++++++++++------------- 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a76ad8709..186afd78b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -904,6 +904,7 @@ dependencies = [ "grenad", "heed", "log", + "maplit", "meilisearch-tokenizer", "memmap", "milli", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 748564c03..817de8ef2 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -39,4 +39,5 @@ fst = "0.4.5" funty = "=1.1" [dev-dependencies] +maplit = "1.0.2" serde_test = "1.0.125" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 1b77e443e..08e28be56 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -996,58 +996,93 @@ async fn main() -> anyhow::Result<()> { #[cfg(test)] mod tests { - use serde_test::{assert_de_tokens, assert_ser_tokens, Token}; + use maplit::{btreeset,hashmap}; + use serde_test::{assert_tokens, Token}; use milli::update::Setting; use crate::Settings; #[test] - fn serialize_settings() { + fn serde_settings_set() { let settings = Settings { displayed_attributes: Setting::Set(vec!["name".to_string()]), - searchable_attributes: Setting::Reset, - faceted_attributes: Setting::NotSet, - criteria: Setting::NotSet, - stop_words: Default::default(), + searchable_attributes: Setting::Set(vec!["age".to_string()]), + faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }), + criteria: Setting::Set(vec!["asc(age)".to_string()]), + stop_words: Setting::Set(btreeset! 
{ "and".to_string() }), }; - assert_ser_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 3 }, + assert_tokens(&settings, &[ + Token::Struct { name: "Settings", len: 5 }, Token::Str("displayedAttributes"), Token::Some, Token::Seq { len: Some(1) }, Token::Str("name"), Token::SeqEnd, Token::Str("searchableAttributes"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("age"), + Token::SeqEnd, + Token::Str("facetedAttributes"), + Token::Some, + Token::Map { len: Some(1) }, + Token::Str("age"), + Token::Str("integer"), + Token::MapEnd, + Token::Str("criteria"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("asc(age)"), + Token::SeqEnd, + Token::Str("stopWords"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("and"), + Token::SeqEnd, + Token::StructEnd, + ]); + } + + #[test] + fn serde_settings_reset() { + let settings = Settings { + displayed_attributes: Setting::Reset, + searchable_attributes: Setting::Reset, + faceted_attributes: Setting::Reset, + criteria: Setting::Reset, + stop_words: Setting::Reset, + }; + + assert_tokens(&settings, &[ + Token::Struct { name: "Settings", len: 5 }, + Token::Str("displayedAttributes"), + Token::None, + Token::Str("searchableAttributes"), Token::None, Token::Str("facetedAttributes"), Token::None, + Token::Str("criteria"), + Token::None, + Token::Str("stopWords"), + Token::None, Token::StructEnd, ]); } #[test] - fn deserialize_settings() { + fn serde_settings_notset() { let settings = Settings { - displayed_attributes: Setting::Set(vec!["name".to_string()]), - searchable_attributes: Setting::Reset, - faceted_attributes: Setting::Reset, + displayed_attributes: Setting::NotSet, + searchable_attributes: Setting::NotSet, + faceted_attributes: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, }; - assert_de_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 3 }, - Token::Str("displayedAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("name"), - Token::SeqEnd, - Token::Str("searchableAttributes"), - Token::None, - Token::Str("facetedAttributes"), - Token::None, + assert_tokens(&settings, &[ + Token::Struct { name: "Settings", len: 0 }, Token::StructEnd, ]); } From 6a128d4ec7805ef66a8d9862088189978ecb965c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 8 Apr 2021 19:21:27 +0200 Subject: [PATCH 0596/1889] Add release drafter files --- .github/release-draft-template.yml | 27 +++++++++++++++++++++++++++ .github/workflows/release-drafter.yml | 16 ++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 .github/release-draft-template.yml create mode 100644 .github/workflows/release-drafter.yml diff --git a/.github/release-draft-template.yml b/.github/release-draft-template.yml new file mode 100644 index 000000000..08e1f2fc7 --- /dev/null +++ b/.github/release-draft-template.yml @@ -0,0 +1,27 @@ +name-template: 'Milli v$RESOLVED_VERSION' +tag-template: 'v$RESOLVED_VERSION' +exclude-labels: + - 'skip-changelog' +version-resolver: + minor: + labels: + - 'breaking-change' + default: patch +categories: + - title: 'Breaking changes ⚠️' + label: 'breaking-change' +template: | + ## Changes + + $CHANGES + + Thanks again to $CONTRIBUTORS! 
🎉
+no-changes-template: 'Changes are coming soon 😎'
+sort-direction: 'ascending'
+replacers:
+  - search: '/(?:and )?@dependabot-preview(?:\[bot\])?,?/g'
+    replace: ''
+  - search: '/(?:and )?@bors(?:\[bot\])?,?/g'
+    replace: ''
+  - search: '/(?:and )?@meili-bot,?/g'
+    replace: ''
diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml
new file mode 100644
index 000000000..9ec8b9d64
--- /dev/null
+++ b/.github/workflows/release-drafter.yml
@@ -0,0 +1,16 @@
+name: Release Drafter
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  update_release_draft:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: release-drafter/release-drafter@v5
+        with:
+          config-name: release-draft-template.yml
+        env:
+          GITHUB_TOKEN: ${{ secrets.RELEASE_DRAFTER_TOKEN }}

From da036dcc3e63203c1a19d7ade7cf0ec0a4ebc598 Mon Sep 17 00:00:00 2001
From: tamo
Date: Thu, 8 Apr 2021 15:12:37 +0200
Subject: [PATCH 0597/1889] Revert "Integrate the stop_words in the querytree"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 12fb509d8470e6d0c3a424756c9838a1efe306d2.

We revert this commit because it's causing bug #150.
The initial algorithm we implemented for the stop_words was:

1. remove the stop_words from the dataset
2. keep the stop_words in the query to see if we can generate new words by
   integrating typos or if the word was a prefix

=> This was causing the bug since, in the case of “The hobbit”, we were
**always** looking for something starting with “t he” or “th e” instead of
ignoring the word completely.

For now we are going to fix the bug by completely ignoring the stop_words in
the query.

This could cause another problem where someone mistypes a normal word and
ends up typing a stop_word.
For example imagine someone searching for the music “Won't he do it”. If that
person misplaces one space and writes “Won' the do it” then we will lose part
of the request.
One fix would be to update our query tree to something like this:

---------------------
OR
  OR
    TOLERANT hobbit # the first option is to ignore the stop_word
    AND
      CONSECUTIVE   # the second option is to do as we are doing
        EXACT t     # currently
        EXACT he
      TOLERANT hobbit
---------------------

This would drastically increase the size of our query tree on requests with a
lot of stop_words. For example think of “The Lord Of The Rings”.

For now, however, we decided to ignore this problem and consider that it
doesn't reduce the relevancy of the search too much, while it improves the
performance.
---
 milli/src/search/query_tree.rs | 60 ++++++++++------------------------
 1 file changed, 18 insertions(+), 42 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index fb5b5b87c..f7367d826 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,7 +1,6 @@
 use std::collections::HashSet;
 use std::{fmt, cmp, mem};
 
-use fst::Set;
 use levenshtein_automata::{DFA, Distance};
 use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
 use roaring::RoaringBitmap;
@@ -155,10 +154,6 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>;
-    fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> {
-        Ok(self.stop_words()?.map_or(false, |s| s.contains(word)))
-    }
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)?
{ @@ -188,10 +183,6 @@ impl<'a> Context for QueryTreeBuilder<'a> { fn synonyms>(&self, _words: &[S]) -> heed::Result>>> { Ok(None) } - - fn stop_words(&self) -> anyhow::Result>> { - self.index.stop_words(self.rtxn) - } } impl<'a> QueryTreeBuilder<'a> { @@ -340,7 +331,8 @@ fn create_query_tree( optional_words: bool, authorize_typos: bool, query: PrimitiveQuery, -) -> anyhow::Result { +) -> anyhow::Result +{ /// Matches on the `PrimitiveQueryPart` and create an operation from it. fn resolve_primitive_part( ctx: &impl Context, @@ -358,12 +350,7 @@ fn create_query_tree( if let Some(child) = split_best_frequency(ctx, &word)? { children.push(child); } - - let is_stop_word = ctx.is_stop_word(&word)?; - let query = Query { prefix, kind: typos(word, authorize_typos) }; - if query.prefix || query.kind.is_tolerant() || !is_stop_word { - children.push(Operation::Query(query)); - } + children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); Ok(Operation::or(false, children)) }, // create a CONSECUTIVE operation wrapping all word in the phrase @@ -378,11 +365,12 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - ) -> anyhow::Result { + ) -> anyhow::Result + { const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) { let mut or_op_children = Vec::new(); for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { @@ -393,31 +381,23 @@ fn create_query_tree( match group { [part] => { - let operation = - resolve_primitive_part(ctx, authorize_typos, part.clone())?; + let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; and_op_children.push(operation); - } + }, words => { - let is_prefix = words.last().map_or(false, |part| part.is_prefix()); - let words: Vec<_> = words - .iter() - .filter_map(|part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }) - .collect(); + let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); + let words: Vec<_> = words.iter().filter_map(| part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }).collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); - - let is_stop_word = ctx.is_stop_word(&concat)?; let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; - if query.prefix || query.kind.is_tolerant() || !is_stop_word { - operations.push(Operation::Query(query)); - and_op_children.push(Operation::or(false, operations)); - } + operations.push(Operation::Query(query)); + and_op_children.push(Operation::or(false, operations)); } } @@ -601,10 +581,6 @@ mod test { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) } - - fn stop_words(&self) -> anyhow::Result>> { - Ok(None) - } } impl Default for TestContext { From dcb00b2e54480a199a4d60ccef31d4c3d021af6b Mon Sep 17 00:00:00 2001 From: tamo Date: Thu, 8 Apr 2021 21:21:20 +0200 Subject: [PATCH 0598/1889] test a new implementation of the stop_words --- milli/src/search/mod.rs | 9 ++++++++- milli/src/search/query_tree.rs | 22 +++++++++++++--------- milli/src/update/settings.rs | 7 ++++--- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index c88800f38..a8cde213b 
100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -91,7 +91,14 @@ impl<'a> Search<'a> { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); builder.authorize_typos(self.authorize_typos); - let analyzer = Analyzer::>::new(AnalyzerConfig::default()); + // We make sure that the analyzer is aware of the stop words + // this ensures that the query builder is able to properly remove them. + let mut config = AnalyzerConfig::default(); + let stop_words = self.index.stop_words(self.rtxn)?; + if let Some(ref stop_words) = stop_words { + config.stop_words(stop_words); + } + let analyzer = Analyzer::new(config); let result = analyzer.analyze(query); let tokens = result.tokens(); builder.build(tokens)? diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index f7367d826..1941f0c6f 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use std::{fmt, cmp, mem}; +use fst::Set; use levenshtein_automata::{DFA, Distance}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use roaring::RoaringBitmap; @@ -220,7 +221,8 @@ impl<'a> QueryTreeBuilder<'a> { /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) pub fn build(&self, query: TokenStream) -> anyhow::Result> { - let primitive_query = create_primitive_query(query); + let stop_words = self.index.stop_words(self.rtxn)?; + let primitive_query = create_primitive_query(query, stop_words); if !primitive_query.is_empty() { create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some) } else { @@ -370,7 +372,7 @@ fn create_query_tree( const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) { + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { let mut or_op_children = Vec::new(); for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { @@ -385,8 +387,8 @@ fn create_query_tree( and_op_children.push(operation); }, words => { - let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); - let words: Vec<_> = words.iter().filter_map(| part| { + let is_prefix = words.last().map_or(false, |part| part.is_prefix()); + let words: Vec<_> = words.iter().filter_map(|part| { if let PrimitiveQueryPart::Word(word, _) = part { Some(word.as_str()) } else { @@ -474,7 +476,7 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. -fn create_primitive_query(query: TokenStream) -> PrimitiveQuery { +fn create_primitive_query(query: TokenStream, stop_words: Option>) -> PrimitiveQuery { let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); let mut quoted = false; @@ -482,14 +484,16 @@ fn create_primitive_query(query: TokenStream) -> PrimitiveQuery { let mut peekable = query.peekable(); while let Some(token) = peekable.next() { match token.kind { - TokenKind::Word => { + TokenKind::Word | TokenKind::StopWord => { // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query we push it as a non-prefix word, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. 
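Concretely, with `the` as a stop word, the three rules above mean the query `the hobbit` keeps only `hobbit` (as a prefix), while a trailing `the` is kept, because a final token may still be the prefix of a longer word the user was typing. A toy model of the decision (hypothetical standalone function, not milli's):

```rust
/// Toy model: drop stop words unless they are quoted or in final position.
fn keep_token(is_stop_word: bool, quoted: bool, is_last: bool) -> bool {
    !is_stop_word || quoted || is_last
}

fn main() {
    // "the hobbit": "the" is a non-final stop word -> dropped.
    assert!(!keep_token(true, false, false));
    // "hobbit the": a final "the" is kept as a prefix candidate (e.g. "theater").
    assert!(keep_token(true, false, true));
    // Regular words always make it into the primitive query.
    assert!(keep_token(false, false, false));
}
```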
if quoted { phrase.push(token.word.to_string()); } else if peekable.peek().is_some() { - primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); + if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.word.as_ref())) { + primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); + } } else { primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); } @@ -563,7 +567,7 @@ mod test { query: TokenStream, ) -> anyhow::Result> { - let primitive_query = create_primitive_query(query); + let primitive_query = create_primitive_query(query, None); if !primitive_query.is_empty() { create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) } else { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5ad942ad6..a858aa1a9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -602,12 +602,13 @@ mod tests { assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes()); // when we search for something that is a non prefix stop_words it should be ignored + // thus we should get a placeholder search (all the results = 3) let result = index.search(&rtxn).query("the ").execute().unwrap(); - assert!(result.documents_ids.is_empty()); + assert_eq!(result.documents_ids.len(), 3); let result = index.search(&rtxn).query("i ").execute().unwrap(); - assert!(result.documents_ids.is_empty()); + assert_eq!(result.documents_ids.len(), 3); let result = index.search(&rtxn).query("are ").execute().unwrap(); - assert!(result.documents_ids.is_empty()); + assert_eq!(result.documents_ids.len(), 3); let result = index.search(&rtxn).query("dog").execute().unwrap(); assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos From 2c5c79d68ebdb8ccc2d28d34f668752fb858bb8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 14 Apr 2021 18:54:04 +0200 Subject: [PATCH 0599/1889] Update Tokenizer version to v0.2.1 --- Cargo.lock | 4 ++-- http-ui/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 186afd78b..5bedb4800 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1207,8 +1207,8 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "meilisearch-tokenizer" -version = "0.1.1" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.0#833c48b2ee39071f8b4f51abd15122afdb3c8c06" +version = "0.2.1" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.1#b7a89c682b9f5d23a1d8075a99cca76069fff6c6" dependencies = [ "character_converter", "cow-utils", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 817de8ef2..196b83c9f 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = "0.10.6" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.0" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.1" } memmap = "0.7.0" milli = { path = "../milli" } once_cell = "1.5.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b1a54d22d..1242194de 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -20,7 +20,7 @@ heed = { version = "0.10.6", default-features = false, features = ["lmdb", "sync 
human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.0" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.1" } memmap = "0.7.0" num-traits = "0.2.14" obkv = "0.1.1" From 45c45e11ddf9bc5c077ce07d7c7168df0ee61d2c Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 7 Apr 2021 12:38:48 +0200 Subject: [PATCH 0600/1889] implement distinct attribute distinct can return error facet distinct on numbers return distinct error review fixes make get_facet_value more generic fixes --- milli/src/index.rs | 13 ++ milli/src/search/criteria/asc_desc.rs | 9 +- milli/src/search/criteria/fetcher.rs | 10 +- milli/src/search/criteria/mod.rs | 7 +- milli/src/search/criteria/proximity.rs | 15 +- milli/src/search/criteria/typo.rs | 65 +++++-- milli/src/search/criteria/words.rs | 10 +- milli/src/search/distinct/facet_distinct.rs | 192 ++++++++++++++++++++ milli/src/search/distinct/map_distinct.rs | 109 +++++++++++ milli/src/search/distinct/mod.rs | 21 +++ milli/src/search/distinct/noop_distinct.rs | 36 ++++ milli/src/search/mod.rs | 63 +++++-- milli/src/update/settings.rs | 28 +++ 13 files changed, 525 insertions(+), 53 deletions(-) create mode 100644 milli/src/search/distinct/facet_distinct.rs create mode 100644 milli/src/search/distinct/map_distinct.rs create mode 100644 milli/src/search/distinct/mod.rs create mode 100644 milli/src/search/distinct/noop_distinct.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index 1150edbca..59f966b95 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -19,6 +19,7 @@ use crate::{ pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; +pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; @@ -460,6 +461,18 @@ impl Index { pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) } + + pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { + self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) + } + + pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { + self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) + } + + pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) + } } #[cfg(test)] diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 78ae540e4..ddd25009d 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -17,7 +17,7 @@ use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use crate::{FieldsIdsMap, FieldId, Index}; -use super::{Criterion, CriterionResult}; +use super::{Criterion, CriterionResult, CriterionContext}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. 
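This distinct commit also threads a `CriterionContext` through every criterion, bundling the word derivations cache with an `exclude` bitmap so that criteria can skip documents a `Distinct` implementation has already rejected. A toy sketch of the idea behind that loop (the `dedup_by_value` helper is hypothetical, not milli's API):

```rust
use roaring::RoaringBitmap;

// Sketch of distinct: keep the first document per distinct value and feed
// everything we reject back to the criteria through the `excluded` set.
fn dedup_by_value(
    candidates: &RoaringBitmap,
    value_of: impl Fn(u32) -> u64,
    excluded: &mut RoaringBitmap,
) -> Vec<u32> {
    let mut seen = std::collections::HashSet::new();
    let mut kept = Vec::new();
    for docid in candidates {
        if seen.insert(value_of(docid)) {
            kept.push(docid);
        } else {
            excluded.insert(docid); // criteria will skip it on the next call
        }
    }
    kept
}

fn main() {
    let candidates: RoaringBitmap = (0..4).collect();
    let mut excluded = RoaringBitmap::new();
    // Documents 0 and 2 share one distinct value, 1 and 3 another.
    let kept = dedup_by_value(&candidates, |id| (id % 2) as u64, &mut excluded);
    assert_eq!(kept, vec![0, 1]);
    assert!(excluded.contains(2) && excluded.contains(3));
}
```

The real implementations this commit adds (`FacetDistinct`, `MapDistinct`, `NoopDistinct`) read the distinct value from the index rather than from a closure, but the excluded-set contract is the same.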
@@ -151,7 +151,7 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, context: CriterionContext) -> anyhow::Result> { loop { debug!("Facet {}({}) iteration", if self.ascending { "Asc" } else { "Desc" }, self.field_name @@ -163,7 +163,8 @@ impl<'t> Criterion for AscDesc<'t> { let bucket_candidates = take(&mut self.bucket_candidates); match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { + let CriterionContext { word_cache, exclude } = context; + match parent.next(CriterionContext { exclude, word_cache })? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; let candidates = match (&self.query_tree, candidates) { @@ -173,7 +174,7 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), word_cache)?; candidates.intersect_with(&self.faceted_candidates); candidates }, diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index fa204bdf2..dcd40e43d 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context, CriterionContext}; /// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] @@ -61,7 +61,7 @@ impl<'t> Fetcher<'t> { } #[logging_timer::time("Fetcher::{}")] - pub fn next(&mut self) -> anyhow::Result> { + pub fn next(&mut self, exclude: &RoaringBitmap) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", @@ -90,7 +90,11 @@ impl<'t> Fetcher<'t> { Forbidden(_) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(&mut self.wdcache)? { + let context = CriterionContext { + word_cache: &mut self.wdcache, + exclude + }; + match parent.next(context)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 22f081871..5e25001a2 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -20,8 +20,13 @@ mod asc_desc; mod proximity; pub mod fetcher; +pub struct CriterionContext<'a, 'b> { + exclude: &'a RoaringBitmap, + word_cache: &'b mut WordDerivationsCache, +} + pub trait Criterion { - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; + fn next(&mut self, wdcache: CriterionContext) -> anyhow::Result>; } /// The result of a call to the parent criterion. 
diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b62eb8cfd..45cffb93d 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,7 +8,7 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree, CriterionContext}; pub struct Proximity<'t> { ctx: &'t dyn Context, @@ -56,8 +56,9 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, context: CriterionContext) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; + let CriterionContext { word_cache, exclude } = context; loop { debug!("Proximity at iteration {} (max {:?}) ({:?})", self.proximity, @@ -98,7 +99,7 @@ impl<'t> Criterion for Proximity<'t> { self.ctx, query_tree, candidates, - wdcache, + word_cache, )?; self.plane_sweep_cache = Some(cache.into_iter()); @@ -110,7 +111,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - wdcache, + word_cache, )? }; @@ -140,7 +141,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - wdcache, + word_cache, )?; new_candidates.difference_with(&candidates); @@ -170,11 +171,11 @@ impl<'t> Criterion for Proximity<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { + match parent.next(CriterionContext { exclude, word_cache })? 
{ Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), word_cache)?, (None, None) => RoaringBitmap::new(), }; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index b17b7561b..1c3942495 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, CriterionContext}; pub struct Typo<'t> { ctx: &'t dyn Context, @@ -51,8 +51,9 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { #[logging_timer::time("Typo::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, context: CriterionContext) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; + let CriterionContext { word_cache, exclude } = context; loop { debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); @@ -71,9 +72,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; query_tree.clone() } else { query_tree.clone() @@ -84,7 +85,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - wdcache, + word_cache, )?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); @@ -109,9 +110,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; query_tree.clone() } else { query_tree.clone() @@ -122,7 +123,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - wdcache, + word_cache, )?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); @@ -147,7 +148,7 @@ impl<'t> Criterion for Typo<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { + match parent.next(CriterionContext { exclude, word_cache })? 
{ Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; @@ -346,8 +347,12 @@ mod test { let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, query_tree, facet_candidates); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; - assert!(criteria.next(&mut wdcache).unwrap().is_none()); + assert!(criteria.next(sort_context).unwrap().is_none()); } #[test] @@ -381,7 +386,12 @@ mod test { bucket_candidates: candidates_1, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; + + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -403,7 +413,12 @@ mod test { bucket_candidates: candidates_2, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; + + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); } #[test] @@ -421,11 +436,19 @@ mod test { bucket_candidates: facet_candidates, }; + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; // first iteration, returns the facet candidates - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected)); + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; // second iteration, returns None because there is no more things to do - assert!(criteria.next(&mut wdcache).unwrap().is_none()); + assert!(criteria.next(sort_context ).unwrap().is_none()); } #[test] @@ -459,7 +482,12 @@ mod test { bucket_candidates: candidates_1 & &facet_candidates, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; + + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -481,7 +509,12 @@ mod test { bucket_candidates: candidates_2 & &facet_candidates, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); + let sort_context = CriterionContext { + word_cache: &mut wdcache, + exclude: &RoaringBitmap::new(), + }; + + assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); } } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 8774eed7c..b401f99fa 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -5,8 +5,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Criterion, CriterionResult, Context}; +use super::{resolve_query_tree, Criterion, CriterionResult, Context, CriterionContext}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -48,7 +47,8 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, context: CriterionContext) -> anyhow::Result> { + let CriterionContext { word_cache, exclude } = context; loop { 
debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); @@ -62,7 +62,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Some(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, word_cache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -100,7 +100,7 @@ impl<'t> Criterion for Words<'t> { (None, None) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { + match parent.next(CriterionContext { word_cache, exclude })? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); self.candidates = candidates; diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs new file mode 100644 index 000000000..cecb8ba4b --- /dev/null +++ b/milli/src/search/distinct/facet_distinct.rs @@ -0,0 +1,192 @@ +use std::mem::size_of; + +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::*; +use crate::{facet::FacetType, DocumentId, FieldId, Index}; +use super::{Distinct, DocIter}; + +pub struct FacetDistinct<'a> { + distinct: FieldId, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, + facet_type: FacetType, +} + +impl<'a> FacetDistinct<'a> { + pub fn new( + distinct: FieldId, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, + facet_type: FacetType, + ) -> Self { + Self { + distinct, + index, + txn, + facet_type, + } + } +} + +pub struct FacetDistinctIter<'a> { + candidates: RoaringBitmap, + distinct: FieldId, + excluded: RoaringBitmap, + facet_type: FacetType, + index: &'a Index, + iter_offset: usize, + txn: &'a heed::RoTxn<'a>, +} + +impl<'a> FacetDistinctIter<'a> { + fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result + where + KC: heed::BytesEncode<'c>, + { + let facet_docids = self + .index + .facet_field_id_value_docids + .remap_key_type::() + .get(self.txn, key)? 
+ .expect("Corrupted data: Facet values must exist"); + Ok(facet_docids) + } + + fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { + let iter = get_facet_values::( + id, + self.distinct, + self.index, + self.txn, + )?; + + for item in iter { + let ((_, _, value), _) = item?; + let key = (self.distinct, value); + let facet_docids = self.get_facet_docids::(&key)?; + self.excluded.union_with(&facet_docids); + } + + self.excluded.remove(id); + + Ok(()) + } + + fn distinct_integer(&mut self, id: DocumentId) -> anyhow::Result<()> { + let iter = get_facet_values::( + id, + self.distinct, + self.index, + self.txn, + )?; + + for item in iter { + let ((_, _, value), _) = item?; + // get facet docids on level 0 + let key = (self.distinct, 0, value, value); + let facet_docids = self.get_facet_docids::(&key)?; + self.excluded.union_with(&facet_docids); + } + + self.excluded.remove(id); + + Ok(()) + } + + fn distinct_float(&mut self, id: DocumentId) -> anyhow::Result<()> { + let iter = get_facet_values::(id, + self.distinct, + self.index, + self.txn, + )?; + + for item in iter { + let ((_, _, value), _) = item?; + // get facet docids on level 0 + let key = (self.distinct, 0, value, value); + let facet_docids = self.get_facet_docids::(&key)?; + self.excluded.union_with(&facet_docids); + } + + self.excluded.remove(id); + + Ok(()) + } + + fn next_inner(&mut self) -> anyhow::Result> { + // The first step is to remove all the excluded documents from our candidates + self.candidates.difference_with(&self.excluded); + + let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); + match candidates_iter.next() { + Some(id) => { + match self.facet_type { + FacetType::String => self.distinct_string(id)?, + FacetType::Integer => self.distinct_integer(id)?, + FacetType::Float => self.distinct_float(id)?, + }; + + // On every iteration, the first document is always a distinct one, since it + // hasn't been discarded by the previous difference. + self.iter_offset += 1; + Ok(Some(id)) + } + // no more candidate at this offset, return. + None => Ok(None), + } + } +} + +fn get_facet_values<'a, KC>( + id: DocumentId, + distinct: FieldId, + index: &Index, + txn: &'a heed::RoTxn, +) -> anyhow::Result> +where + KC: heed::BytesDecode<'a>, +{ + const FID_SIZE: usize = size_of::(); + const DOCID_SIZE: usize = size_of::(); + + let mut key = [0; FID_SIZE + DOCID_SIZE]; + key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); + key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); + + let iter = index + .field_id_docid_facet_values + .prefix_iter(txn, &key)? 
+ .remap_key_type::(); + Ok(iter) +} + +impl Iterator for FacetDistinctIter<'_> { + type Item = anyhow::Result; + + fn next(&mut self) -> Option { + self.next_inner().transpose() + } +} + +impl DocIter for FacetDistinctIter<'_> { + fn into_excluded(self) -> RoaringBitmap { + self.excluded + } +} + +impl<'a> Distinct<'_> for FacetDistinct<'a> { + type Iter = FacetDistinctIter<'a>; + + fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { + FacetDistinctIter { + candidates, + distinct: self.distinct, + excluded, + facet_type: self.facet_type, + index: self.index, + iter_offset: 0, + txn: self.txn, + } + } +} diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs new file mode 100644 index 000000000..411d63c87 --- /dev/null +++ b/milli/src/search/distinct/map_distinct.rs @@ -0,0 +1,109 @@ +use std::collections::HashMap; + +use roaring::RoaringBitmap; +use serde_json::Value; + +use super::{Distinct, DocIter}; +use crate::{DocumentId, FieldId, Index}; + +pub struct MapDistinct<'a> { + distinct: FieldId, + map: HashMap, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, +} + +impl<'a> MapDistinct<'a> { + pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { + let map = HashMap::new(); + Self { + distinct, + map, + index, + txn, + } + } +} + +pub struct MapDistinctIter<'a, 'b> { + distinct: FieldId, + map: &'b mut HashMap, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, + candidates: roaring::bitmap::IntoIter, + excluded: RoaringBitmap, +} + +impl<'a, 'b> MapDistinctIter<'a, 'b> { + fn next_inner(&mut self) -> anyhow::Result> { + let map = &mut self.map; + let mut filter = |value: Value| { + let entry = map.entry(value.to_string()).or_insert(0); + *entry += 1; + *entry <= 1 + }; + + while let Some(id) = self.candidates.next() { + let document = self.index.documents(&self.txn, Some(id))?[0].1; + let value = document + .get(self.distinct) + .map(serde_json::from_slice::) + .transpose()?; + + let accept = match value { + Some(value) => { + match value { + // Since we can't distinct these values, we always accept them + Value::Null | Value::Object(_) => true, + Value::Array(values) => { + let mut accept = true; + for value in values { + accept &= filter(value); + } + accept + } + value => filter(value), + } + } + // Accept values by default. 
+ _ => true, + }; + + if accept { + return Ok(Some(id)); + } else { + self.excluded.insert(id); + } + } + Ok(None) + } +} + +impl Iterator for MapDistinctIter<'_, '_> { + type Item = anyhow::Result; + + fn next(&mut self) -> Option { + self.next_inner().transpose() + } +} + +impl DocIter for MapDistinctIter<'_, '_> { + fn into_excluded(self) -> RoaringBitmap { + self.excluded + } +} + +impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { + type Iter = MapDistinctIter<'a, 'b>; + + fn distinct(&'b mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { + MapDistinctIter { + distinct: self.distinct, + map: &mut self.map, + index: &self.index, + txn: &self.txn, + candidates: candidates.into_iter(), + excluded, + } + } +} diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs new file mode 100644 index 000000000..0f24e2d03 --- /dev/null +++ b/milli/src/search/distinct/mod.rs @@ -0,0 +1,21 @@ +mod facet_distinct; +mod map_distinct; +mod noop_distinct; + +use roaring::RoaringBitmap; + +pub use facet_distinct::FacetDistinct; +pub use map_distinct::MapDistinct; +pub use noop_distinct::NoopDistinct; +use crate::DocumentId; + +pub trait DocIter: Iterator> { + /// Returns ownership on the internal RoaringBitmaps: (candidates, excluded) + fn into_excluded(self) -> RoaringBitmap; +} + +pub trait Distinct<'a> { + type Iter: DocIter; + + fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; +} diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs new file mode 100644 index 000000000..6484f4d64 --- /dev/null +++ b/milli/src/search/distinct/noop_distinct.rs @@ -0,0 +1,36 @@ +use roaring::RoaringBitmap; + +use crate::DocumentId; +use super::{DocIter, Distinct}; + +pub struct NoopDistinct; + +pub struct NoopDistinctIter { + candidates: roaring::bitmap::IntoIter, + excluded: RoaringBitmap, +} + +impl Iterator for NoopDistinctIter { + type Item = anyhow::Result; + + fn next(&mut self) -> Option { + self.candidates.next().map(Result::Ok) + } +} + +impl DocIter for NoopDistinctIter { + fn into_excluded(self) -> RoaringBitmap { + self.excluded + } +} + +impl Distinct<'_> for NoopDistinct { + type Iter = NoopDistinctIter; + + fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { + NoopDistinctIter { + candidates: candidates.into_iter(), + excluded, + } + } +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index a8cde213b..2c55330a7 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,22 +11,24 @@ use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::search::criteria::fetcher::FetcherResult; +use crate::search::criteria::fetcher::{FetcherResult, Fetcher}; use crate::{Index, DocumentId}; +use distinct::{MapDistinct, FacetDistinct, Distinct, DocIter, NoopDistinct}; +use self::query_tree::QueryTreeBuilder; pub use self::facet::FacetIter; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::query_tree::MatchingWords; -use self::query_tree::QueryTreeBuilder; // Building these factories is not free. 
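// ---- Editorial note (not part of the patch) -------------------------------
// The heart of MapDistinct above is a counting filter: a value is accepted
// only the first time it is seen. A self-contained sketch of that idea,
// independent of the index and of serde_json:
use std::collections::HashMap;

fn main() {
    let mut seen: HashMap<String, usize> = HashMap::new();
    let mut filter = |value: &str| {
        let entry = seen.entry(value.to_string()).or_insert(0);
        *entry += 1;
        *entry <= 1
    };

    let values = ["red", "blue", "red", "green", "blue"];
    let kept: Vec<&str> = values.iter().copied().filter(|v| filter(v)).collect();
    assert_eq!(kept, ["red", "blue", "green"]);
}
// ----------------------------------------------------------------------------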
static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); +mod criteria; +mod distinct; mod facet; mod query_tree; -mod criteria; pub struct Search<'a> { query: Option, @@ -123,33 +125,60 @@ impl<'a> Search<'a> { }; let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let mut criteria = criteria_builder.build(query_tree, facet_candidates)?; + let criteria = criteria_builder.build(query_tree, facet_candidates)?; + + match self.index.distinct_attribute(self.rtxn)? { + None => self.perform_sort(NoopDistinct, matching_words, criteria), + Some(name) => { + let field_ids_map = self.index.fields_ids_map(self.rtxn)?; + let id = field_ids_map.id(name).expect("distinct not present in field map"); + let faceted_fields = self.index.faceted_fields(self.rtxn)?; + match faceted_fields.get(name) { + Some(facet_type) => { + let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type); + self.perform_sort(distinct, matching_words, criteria) + } + None => { + let distinct = MapDistinct::new(id, self.index, self.rtxn); + self.perform_sort(distinct, matching_words, criteria) + } + } + } + } + } + + fn perform_sort( + &self, + mut distinct: impl for<'c> Distinct<'c>, + matching_words: MatchingWords, + mut criteria: Fetcher, + ) -> anyhow::Result { let mut offset = self.offset; - let mut limit = self.limit; - let mut documents_ids = Vec::new(); let mut initial_candidates = RoaringBitmap::new(); - while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { + let mut excluded_documents = RoaringBitmap::new(); + let mut documents_ids = Vec::with_capacity(self.limit); + + while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_documents)? 
{ debug!("Number of candidates found {}", candidates.len()); - let mut len = candidates.len() as usize; - let mut candidates = candidates.into_iter(); + let excluded = std::mem::take(&mut excluded_documents); + + let mut candidates = distinct.distinct(candidates, excluded); initial_candidates.union_with(&bucket_candidates); if offset != 0 { - candidates.by_ref().take(offset).for_each(drop); - offset = offset.saturating_sub(len.min(offset)); - len = len.saturating_sub(len.min(offset)); + let discarded = candidates.by_ref().take(offset).count(); + offset = offset.saturating_sub(discarded); } - if len != 0 { - documents_ids.extend(candidates.take(limit)); - limit = limit.saturating_sub(len.min(limit)); + for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { + documents_ids.push(candidate?); } - - if limit == 0 { break } + if documents_ids.len() == self.limit { break } + excluded_documents = candidates.into_excluded(); } Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index a858aa1a9..e63948082 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -70,6 +70,7 @@ pub struct Settings<'a, 't, 'u, 'i> { faceted_fields: Setting>, criteria: Setting>, stop_words: Setting>, + distinct_attribute: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -94,6 +95,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { faceted_fields: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, + distinct_attribute: Setting::NotSet, update_id, } } @@ -142,6 +144,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn set_distinct_attribute(&mut self, distinct_attribute: String) { + self.distinct_attribute = Setting::Set(distinct_attribute); + } + + pub fn reset_distinct_attribute(&mut self) { + self.distinct_attribute = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync @@ -220,6 +230,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } + fn update_distinct_attribute(&mut self) -> anyhow::Result { + match self.distinct_attribute { + Setting::Set(ref attr) => { + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + fields_ids_map + .insert(attr) + .context("field id limit exceeded")?; + + self.index.put_distinct_attribute(self.wtxn, &attr)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + } + Setting::Reset => { self.index.delete_distinct_attribute(self.wtxn)?; }, + Setting::NotSet => return Ok(false), + } + Ok(true) + } + /// Updates the index's searchable attributes. This causes the field map to be recomputed to /// reflect the order of the searchable attributes. fn update_searchable(&mut self) -> anyhow::Result { @@ -328,6 +355,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_displayed()?; let stop_words_updated = self.update_stop_words()?; let facets_updated = self.update_facets()?; + self.update_distinct_attribute()?; // update_criteria MUST be called after update_facets, since criterion fields must be set // as facets. 
self.update_criteria()?; From 2f73fa55ae7f9145f9dbbd2b2c83d2cee6b8e76d Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 14 Apr 2021 12:00:45 +0200 Subject: [PATCH 0601/1889] add documentation --- milli/src/index.rs | 25 ++++++++++++--------- milli/src/search/distinct/facet_distinct.rs | 15 +++++++++++-- milli/src/search/distinct/map_distinct.rs | 3 +++ milli/src/search/distinct/mod.rs | 8 ++++++- milli/src/search/distinct/noop_distinct.rs | 6 +++-- 5 files changed, 41 insertions(+), 16 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 59f966b95..a2b6cc440 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -343,6 +343,20 @@ impl Index { } } + /* Distinct attribute */ + + pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { + self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) + } + + pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { + self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) + } + + pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) + } + /* criteria */ pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { @@ -462,17 +476,6 @@ impl Index { self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) } - pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { - self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) - } - - pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) - } - - pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) - } } #[cfg(test)] diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index cecb8ba4b..053bbd705 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,6 +6,12 @@ use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; use super::{Distinct, DocIter}; +/// A distinct implementer that is backed by facets. On each iteration, the facet values for the +/// distinct attribute of the first document are retrieved. The document ids for these facet values +/// are then retrieved and taken out of the the candidate and added to the excluded set. We take +/// care to keep the document we are currently on, and remove it from the excluded list. The next +/// iterations will never contain any occurence of a document with the same distinct value as a +/// document from previous iterations. pub struct FacetDistinct<'a> { distinct: FieldId, index: &'a Index, @@ -114,6 +120,9 @@ impl<'a> FacetDistinctIter<'a> { Ok(()) } + /// Performs the next iteration of the facet distinct. This is a convenience method that is + /// called by the Iterator::next implementation that tranposes the result. It makes error + /// handling easier. 
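// ---- Editorial note (not part of the patch) -------------------------------
// The exclusion-based iteration documented above, reduced to bitmaps and a
// hypothetical value -> docids mapping (the real code reads these from the
// facet databases). Every document sharing a facet value with an already
// returned document lands in `excluded` and is subtracted from the
// candidates on the next pass.
use std::collections::HashMap;
use roaring::RoaringBitmap;

fn distinct_by_value(
    mut candidates: RoaringBitmap,
    value_of: &HashMap<u32, String>,            // docid -> distinct value
    docids_of: &HashMap<String, RoaringBitmap>, // distinct value -> docids
) -> Vec<u32> {
    let mut excluded = RoaringBitmap::new();
    let mut kept = Vec::new();
    let mut offset = 0;
    loop {
        candidates -= &excluded;
        match candidates.iter().nth(offset) {
            Some(id) => {
                excluded |= &docids_of[&value_of[&id]];
                excluded.remove(id); // keep the document we are currently on
                kept.push(id);
                offset += 1;
            }
            None => break kept,
        }
    }
}
// ----------------------------------------------------------------------------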
fn next_inner(&mut self) -> anyhow::Result> { // The first step is to remove all the excluded documents from our candidates self.candidates.difference_with(&self.excluded); @@ -127,8 +136,10 @@ impl<'a> FacetDistinctIter<'a> { FacetType::Float => self.distinct_float(id)?, }; - // On every iteration, the first document is always a distinct one, since it - // hasn't been discarded by the previous difference. + // The first document of each iteration is kept, since the next call to + // `difference_with` will filter out all the documents for that facet value. By + // increasing the offset we make sure to get the first valid value for the next + // distinct document to keep. self.iter_offset += 1; Ok(Some(id)) } diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index 411d63c87..37e52aa6f 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -6,6 +6,9 @@ use serde_json::Value; use super::{Distinct, DocIter}; use crate::{DocumentId, FieldId, Index}; +/// A distinct implementer that is backed by an `HashMap`. Each time a document is seen, the value +/// for its distinct field is added to the map. If the map already contains an entry for this +/// value, then the document is filtered out, and is added to the excluded set. pub struct MapDistinct<'a> { distinct: FieldId, map: HashMap, diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 0f24e2d03..5f2e260e4 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -9,11 +9,17 @@ pub use map_distinct::MapDistinct; pub use noop_distinct::NoopDistinct; use crate::DocumentId; +/// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. +/// It provides a way to get back the ownership to the excluded set. pub trait DocIter: Iterator> { - /// Returns ownership on the internal RoaringBitmaps: (candidates, excluded) + /// Returns ownership on the internal exluded set. fn into_excluded(self) -> RoaringBitmap; } +/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct +/// must return an iterator containing only distinct documents, and add the discarded documents to +/// the excluded set. The excluded set can later be retrieved by calling `DocIter::excluded` on the +/// returned iterator. pub trait Distinct<'a> { type Iter: DocIter; diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 6484f4d64..9fdf17187 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -1,12 +1,14 @@ -use roaring::RoaringBitmap; +use roaring::{RoaringBitmap, bitmap::IntoIter}; use crate::DocumentId; use super::{DocIter, Distinct}; +/// A distinct implementer that does not perform any distinct, and simply returns an iterator to +/// the candidates. 
pub struct NoopDistinct; pub struct NoopDistinctIter { - candidates: roaring::bitmap::IntoIter, + candidates: IntoIter, excluded: RoaringBitmap, } From 75464a1baadc86a8549a48a2ee3fff3f79588d30 Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 14 Apr 2021 12:18:13 +0200 Subject: [PATCH 0602/1889] review fixes --- milli/src/index.rs | 1 - milli/src/search/criteria/asc_desc.rs | 9 ++- milli/src/search/criteria/fetcher.rs | 10 +--- milli/src/search/criteria/mod.rs | 7 +-- milli/src/search/criteria/proximity.rs | 15 +++-- milli/src/search/criteria/typo.rs | 66 ++++++---------------- milli/src/search/criteria/words.rs | 9 ++- milli/src/search/distinct/map_distinct.rs | 27 ++++----- milli/src/search/distinct/noop_distinct.rs | 2 +- milli/src/search/mod.rs | 5 +- 10 files changed, 50 insertions(+), 101 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index a2b6cc440..643a9ffb9 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -475,7 +475,6 @@ impl Index { pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) } - } #[cfg(test)] diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index ddd25009d..78ae540e4 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -17,7 +17,7 @@ use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use crate::{FieldsIdsMap, FieldId, Index}; -use super::{Criterion, CriterionResult, CriterionContext}; +use super::{Criterion, CriterionResult}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. @@ -151,7 +151,7 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, context: CriterionContext) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { loop { debug!("Facet {}({}) iteration", if self.ascending { "Asc" } else { "Desc" }, self.field_name @@ -163,8 +163,7 @@ impl<'t> Criterion for AscDesc<'t> { let bucket_candidates = take(&mut self.bucket_candidates); match self.parent.as_mut() { Some(parent) => { - let CriterionContext { word_cache, exclude } = context; - match parent.next(CriterionContext { exclude, word_cache })? { + match parent.next(wdcache)? 
{ Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; let candidates = match (&self.query_tree, candidates) { @@ -174,7 +173,7 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), word_cache)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; candidates.intersect_with(&self.faceted_candidates); candidates }, diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index dcd40e43d..fa204bdf2 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context, CriterionContext}; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; /// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] @@ -61,7 +61,7 @@ impl<'t> Fetcher<'t> { } #[logging_timer::time("Fetcher::{}")] - pub fn next(&mut self, exclude: &RoaringBitmap) -> anyhow::Result> { + pub fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", @@ -90,11 +90,7 @@ impl<'t> Fetcher<'t> { Forbidden(_) => { match self.parent.as_mut() { Some(parent) => { - let context = CriterionContext { - word_cache: &mut self.wdcache, - exclude - }; - match parent.next(context)? { + match parent.next(&mut self.wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5e25001a2..22f081871 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -20,13 +20,8 @@ mod asc_desc; mod proximity; pub mod fetcher; -pub struct CriterionContext<'a, 'b> { - exclude: &'a RoaringBitmap, - word_cache: &'b mut WordDerivationsCache, -} - pub trait Criterion { - fn next(&mut self, wdcache: CriterionContext) -> anyhow::Result>; + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; } /// The result of a call to the parent criterion. 
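// ---- Editorial note (not part of the patch) -------------------------------
// This review-fix patch reverts the CriterionContext plumbing from the
// previous commits: the criteria return to their original signature,
//
//     fn next(&mut self, wdcache: &mut WordDerivationsCache)
//         -> anyhow::Result<Option<CriterionResult>>;
//
// and, as the search/mod.rs hunk later in this patch shows, the exclusion
// set is applied once per bucket by the distinct iterator in perform_sort,
// so it no longer needs to reach every criterion.
// ----------------------------------------------------------------------------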
diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 45cffb93d..b62eb8cfd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,7 +8,7 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree, CriterionContext}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; pub struct Proximity<'t> { ctx: &'t dyn Context, @@ -56,9 +56,8 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] - fn next(&mut self, context: CriterionContext) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; - let CriterionContext { word_cache, exclude } = context; loop { debug!("Proximity at iteration {} (max {:?}) ({:?})", self.proximity, @@ -99,7 +98,7 @@ impl<'t> Criterion for Proximity<'t> { self.ctx, query_tree, candidates, - word_cache, + wdcache, )?; self.plane_sweep_cache = Some(cache.into_iter()); @@ -111,7 +110,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - word_cache, + wdcache, )? }; @@ -141,7 +140,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - word_cache, + wdcache, )?; new_candidates.difference_with(&candidates); @@ -171,11 +170,11 @@ impl<'t> Criterion for Proximity<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(CriterionContext { exclude, word_cache })? { + match parent.next(wdcache)? 
{ Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), word_cache)?, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, (None, None) => RoaringBitmap::new(), }; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 1c3942495..3877f53ed 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, CriterionContext}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Typo<'t> { ctx: &'t dyn Context, @@ -51,9 +51,8 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { #[logging_timer::time("Typo::{}")] - fn next(&mut self, context: CriterionContext) -> anyhow::Result> { + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; - let CriterionContext { word_cache, exclude } = context; loop { debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); @@ -72,9 +71,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -85,7 +84,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - word_cache, + wdcache, )?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); @@ -110,9 +109,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, word_cache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -123,7 +122,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - word_cache, + wdcache, )?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); @@ -148,7 +147,7 @@ impl<'t> Criterion for Typo<'t> { (None, Forbidden(_)) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(CriterionContext { exclude, word_cache })? { + match parent.next(wdcache)? 
{ Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; @@ -347,12 +346,8 @@ mod test { let mut wdcache = WordDerivationsCache::new(); let mut criteria = Typo::initial(&context, query_tree, facet_candidates); - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - assert!(criteria.next(sort_context).unwrap().is_none()); + assert!(criteria.next(&mut wdcache).unwrap().is_none()); } #[test] @@ -386,12 +381,7 @@ mod test { bucket_candidates: candidates_1, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -413,12 +403,7 @@ mod test { bucket_candidates: candidates_2, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); } #[test] @@ -436,19 +421,11 @@ mod test { bucket_candidates: facet_candidates, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; // first iteration, returns the facet candidates - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected)); - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; // second iteration, returns None because there is no more things to do - assert!(criteria.next(sort_context ).unwrap().is_none()); + assert!(criteria.next(&mut wdcache).unwrap().is_none()); } #[test] @@ -482,12 +459,7 @@ mod test { bucket_candidates: candidates_1 & &facet_candidates, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_1)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -509,12 +481,6 @@ mod test { bucket_candidates: candidates_2 & &facet_candidates, }; - let sort_context = CriterionContext { - word_cache: &mut wdcache, - exclude: &RoaringBitmap::new(), - }; - - assert_eq!(criteria.next(sort_context).unwrap(), Some(expected_2)); + assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); } - } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index b401f99fa..0aa3b483a 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -5,7 +5,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use super::{resolve_query_tree, Criterion, CriterionResult, Context, CriterionContext}; +use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -47,8 +47,7 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] - fn next(&mut self, context: CriterionContext) -> anyhow::Result> { - let CriterionContext { word_cache, exclude } = context; + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { loop { debug!("Words at 
iteration {} ({:?})", self.query_trees.len(), self.candidates); @@ -62,7 +61,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Some(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, word_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -100,7 +99,7 @@ impl<'t> Criterion for Words<'t> { (None, None) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(CriterionContext { word_cache, exclude })? { + match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); self.candidates = candidates; diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index 37e52aa6f..f2e31bce4 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -18,10 +18,9 @@ pub struct MapDistinct<'a> { impl<'a> MapDistinct<'a> { pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { - let map = HashMap::new(); Self { distinct, - map, + map: HashMap::new(), index, txn, } @@ -38,6 +37,9 @@ pub struct MapDistinctIter<'a, 'b> { } impl<'a, 'b> MapDistinctIter<'a, 'b> { + /// Performs the next iteration of the mafacetp distinct. This is a convenience method that is + /// called by the Iterator::next implementation that tranposes the result. It makes error + /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { let map = &mut self.map; let mut filter = |value: Value| { @@ -54,22 +56,15 @@ impl<'a, 'b> MapDistinctIter<'a, 'b> { .transpose()?; let accept = match value { - Some(value) => { - match value { - // Since we can't distinct these values, we always accept them - Value::Null | Value::Object(_) => true, - Value::Array(values) => { - let mut accept = true; - for value in values { - accept &= filter(value); - } - accept - } - value => filter(value), + Some(Value::Array(values)) => { + let mut accept = true; + for value in values { + accept &= filter(value); } + accept } - // Accept values by default. - _ => true, + Some(Value::Null) | Some(Value::Object(_)) | None => true, + Some(value) => filter(value), }; if accept { diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 9fdf17187..8f7bc7d17 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -16,7 +16,7 @@ impl Iterator for NoopDistinctIter { type Item = anyhow::Result; fn next(&mut self) -> Option { - self.candidates.next().map(Result::Ok) + self.candidates.next().map(Ok) } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2c55330a7..7324ea72a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::collections::hash_map::{HashMap, Entry}; use std::fmt; +use std::mem::take; use std::str::Utf8Error; use std::time::Instant; @@ -159,11 +160,11 @@ impl<'a> Search<'a> { let mut excluded_documents = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_documents)? { + while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? 
{ debug!("Number of candidates found {}", candidates.len()); - let excluded = std::mem::take(&mut excluded_documents); + let excluded = take(&mut excluded_documents); let mut candidates = distinct.distinct(candidates, excluded); From 9c4660d3d6357c90b730b17ad6502cc4080d354c Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Thu, 15 Apr 2021 15:29:37 +0200 Subject: [PATCH 0603/1889] add tests --- milli/src/index.rs | 33 +++++- milli/src/search/distinct/facet_distinct.rs | 39 ++++++- milli/src/search/distinct/map_distinct.rs | 35 +++++- milli/src/search/distinct/mod.rs | 121 +++++++++++++++++++- milli/src/search/distinct/noop_distinct.rs | 23 +++- 5 files changed, 242 insertions(+), 9 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 643a9ffb9..7be618789 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -478,13 +478,44 @@ impl Index { } #[cfg(test)] -mod tests { +pub(crate) mod tests { + use std::ops::Deref; + use heed::EnvOpenOptions; use maplit::hashmap; + use tempfile::TempDir; use crate::Index; use crate::update::{IndexDocuments, UpdateFormat}; + pub(crate) struct TempIndex { + inner: Index, + _tempdir: TempDir, + } + + impl Deref for TempIndex { + type Target = Index; + + fn deref(&self) -> &Self::Target { + &self.inner + } + } + + impl TempIndex { + /// Creates a temporary index, with a default `4096 * 100` size. This should be enough for + /// most tests. + pub fn new() -> Self { + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 4096); + let _tempdir = TempDir::new_in(".").unwrap(); + let inner = Index::new(options, _tempdir.path()).unwrap(); + Self { + inner, + _tempdir + } + } + } + #[test] fn initial_fields_distribution() { let path = tempfile::tempdir().unwrap(); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 053bbd705..e97f8b922 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,7 +6,9 @@ use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; use super::{Distinct, DocIter}; -/// A distinct implementer that is backed by facets. On each iteration, the facet values for the +/// A distinct implementer that is backed by facets. +/// +/// On each iteration, the facet values for the /// distinct attribute of the first document are retrieved. The document ids for these facet values /// are then retrieved and taken out of the the candidate and added to the excluded set. We take /// care to keep the document we are currently on, and remove it from the excluded list. The next @@ -121,7 +123,7 @@ impl<'a> FacetDistinctIter<'a> { } /// Performs the next iteration of the facet distinct. This is a convenience method that is - /// called by the Iterator::next implementation that tranposes the result. It makes error + /// called by the Iterator::next implementation that transposes the result. It makes error /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { // The first step is to remove all the excluded documents from our candidates @@ -201,3 +203,36 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { } } } + +#[cfg(test)] +mod test { + use std::collections::HashMap; + + use super::*; + use super::super::test::{generate_index, validate_distinct_candidates}; + use crate::facet::FacetType; + + macro_rules! 
test_facet_distinct { + ($name:ident, $distinct:literal, $facet_type:expr) => { + #[test] + fn $name() { + use std::iter::FromIterator; + + let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let (index, fid, candidates) = generate_index($distinct, facets); + let txn = index.read_txn().unwrap(); + let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); + let excluded = RoaringBitmap::new(); + let mut iter = map_distinct.distinct(candidates.clone(), excluded); + let count = validate_distinct_candidates(iter.by_ref(), fid, &index); + let excluded = iter.into_excluded(); + assert_eq!(count as u64 + excluded.len(), candidates.len()); + } + }; + } + + test_facet_distinct!(test_string, "txt", FacetType::String); + test_facet_distinct!(test_strings, "txts", FacetType::String); + test_facet_distinct!(test_int, "cat-int", FacetType::Integer); + test_facet_distinct!(test_ints, "cat-ints", FacetType::Integer); +} diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index f2e31bce4..4c01d1ded 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -6,7 +6,9 @@ use serde_json::Value; use super::{Distinct, DocIter}; use crate::{DocumentId, FieldId, Index}; -/// A distinct implementer that is backed by an `HashMap`. Each time a document is seen, the value +/// A distinct implementer that is backed by an `HashMap`. +/// +/// Each time a document is seen, the value /// for its distinct field is added to the map. If the map already contains an entry for this /// value, then the document is filtered out, and is added to the excluded set. pub struct MapDistinct<'a> { @@ -38,7 +40,7 @@ pub struct MapDistinctIter<'a, 'b> { impl<'a, 'b> MapDistinctIter<'a, 'b> { /// Performs the next iteration of the mafacetp distinct. This is a convenience method that is - /// called by the Iterator::next implementation that tranposes the result. It makes error + /// called by the Iterator::next implementation that transposes the result. It makes error /// handling easier. fn next_inner(&mut self) -> anyhow::Result> { let map = &mut self.map; @@ -105,3 +107,32 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { } } } + +#[cfg(test)] +mod test { + use std::collections::HashMap; + + use super::*; + use super::super::test::{generate_index, validate_distinct_candidates}; + + macro_rules! 
test_map_distinct { + ($name:ident, $distinct:literal) => { + #[test] + fn $name() { + let (index, fid, candidates) = generate_index($distinct, HashMap::new()); + let txn = index.read_txn().unwrap(); + let mut map_distinct = MapDistinct::new(fid, &index, &txn); + let excluded = RoaringBitmap::new(); + let mut iter = map_distinct.distinct(candidates.clone(), excluded); + let count = validate_distinct_candidates(iter.by_ref(), fid, &index); + let excluded = iter.into_excluded(); + assert_eq!(count as u64 + excluded.len(), candidates.len()); + } + }; + } + + test_map_distinct!(test_string, "txt"); + test_map_distinct!(test_strings, "txts"); + test_map_distinct!(test_int, "cat-int"); + test_map_distinct!(test_ints, "cat-ints"); +} diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 5f2e260e4..776f0d2b3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -4,14 +4,14 @@ mod noop_distinct; use roaring::RoaringBitmap; +use crate::DocumentId; pub use facet_distinct::FacetDistinct; pub use map_distinct::MapDistinct; pub use noop_distinct::NoopDistinct; -use crate::DocumentId; /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. /// It provides a way to get back the ownership to the excluded set. -pub trait DocIter: Iterator> { +pub trait DocIter: Iterator> { /// Returns ownership on the internal exluded set. fn into_excluded(self) -> RoaringBitmap; } @@ -25,3 +25,120 @@ pub trait Distinct<'a> { fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; } + +#[cfg(test)] +mod test { + use std::collections::{HashMap, HashSet}; + + use once_cell::sync::Lazy; + use rand::{seq::SliceRandom, Rng}; + use roaring::RoaringBitmap; + use serde_json::{json, Value}; + + use crate::index::{Index, tests::TempIndex}; + use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; + use crate::{BEU32, FieldId, DocumentId}; + + static JSON: Lazy = Lazy::new(generate_json); + + fn generate_json() -> Value { + let mut rng = rand::thread_rng(); + let num_docs = rng.gen_range(10..30); + + let mut documents = Vec::new(); + + let txts = ["toto", "titi", "tata"]; + let cats = (1..10).map(|i| i.to_string()).collect::>(); + let cat_ints = (1..10).collect::>(); + + for i in 0..num_docs { + let txt = txts.choose(&mut rng).unwrap(); + let mut sample_txts = cats.clone(); + sample_txts.shuffle(&mut rng); + + let mut sample_ints = cat_ints.clone(); + sample_ints.shuffle(&mut rng); + + let doc = json!({ + "id": i, + "txt": txt, + "cat-int": rng.gen_range(0..3), + "txts": sample_txts[..(rng.gen_range(0..3))], + "cat-ints": sample_ints[..(rng.gen_range(0..3))], + }); + documents.push(doc); + } + + Value::Array(documents) + } + + /// Returns a temporary index populated with random test documents, the FieldId for the + /// distinct attribute, and the RoaringBitmap with the document ids. + pub(crate) fn generate_index(distinct: &str, facets: HashMap) -> (TempIndex, FieldId, RoaringBitmap) { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + // set distinct and faceted attributes for the index. 
+ let builder = UpdateBuilder::new(0); + let mut update = builder.settings(&mut txn, &index); + update.set_distinct_attribute(distinct.to_string()); + if !facets.is_empty() { + update.set_faceted_fields(facets) + } + update.execute(|_, _| ()).unwrap(); + + // add documents to the index + let builder = UpdateBuilder::new(1); + let mut addition = builder.index_documents(&mut txn, &index); + + addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + addition.update_format(UpdateFormat::Json); + + addition + .execute(JSON.to_string().as_bytes(), |_, _| ()) + .unwrap(); + + let fields_map = index.fields_ids_map(&txn).unwrap(); + let fid = fields_map.id(&distinct).unwrap(); + + let map = (0..JSON.as_array().unwrap().len() as u32).collect(); + + txn.commit().unwrap(); + + (index, fid, map) + } + + + /// Checks that all the candidates are distinct, and returns the candidates number. + pub(crate) fn validate_distinct_candidates( + candidates: impl Iterator>, + distinct: FieldId, + index: &Index, + ) -> usize { + fn test(seen: &mut HashSet, value: &Value) { + match value { + Value::Null | Value::Object(_) | Value::Bool(_) => (), + Value::Number(_) | Value::String(_) => { + let s = value.to_string(); + assert!(seen.insert(s)); + } + Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))} + } + } + + let mut seen = HashSet::::new(); + + let txn = index.read_txn().unwrap(); + let mut count = 0; + for candidate in candidates { + count += 1; + let candidate = candidate.unwrap(); + let id = BEU32::new(candidate); + let document = index.documents.get(&txn, &id).unwrap().unwrap(); + let value = document.get(distinct).unwrap(); + let value = serde_json::from_slice(value).unwrap(); + test(&mut seen, &value); + } + count + } +} diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 8f7bc7d17..3de9be631 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -3,8 +3,8 @@ use roaring::{RoaringBitmap, bitmap::IntoIter}; use crate::DocumentId; use super::{DocIter, Distinct}; -/// A distinct implementer that does not perform any distinct, and simply returns an iterator to -/// the candidates. +/// A distinct implementer that does not perform any distinct, +/// and simply returns an iterator to the candidates. 
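// ---- Editorial note (not part of the patch) -------------------------------
// A sketch of the TempIndex helper introduced earlier in this patch; Deref
// to Index means the usual accessors are available directly. The small
// 100 * 4096 map size suits tiny test indexes only.
#[test]
fn open_temp_index() {
    let index = TempIndex::new();
    let rtxn = index.read_txn().unwrap();
    // a fresh index has no distinct attribute configured
    assert!(index.distinct_attribute(&rtxn).unwrap().is_none());
}
// ----------------------------------------------------------------------------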
pub struct NoopDistinct; pub struct NoopDistinctIter { @@ -36,3 +36,22 @@ impl Distinct<'_> for NoopDistinct { } } } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_noop() { + let candidates = (1..10).collect(); + let excluded = RoaringBitmap::new(); + let mut iter = NoopDistinct.distinct(candidates, excluded); + assert_eq!( + iter.by_ref().map(Result::unwrap).collect::>(), + (1..10).collect::>() + ); + + let excluded = iter.into_excluded(); + assert!(excluded.is_empty()); + } +} From f6b06d6e5d8f80787e4b6da5e4609496a6339e8b Mon Sep 17 00:00:00 2001 From: Michael Chiche Date: Fri, 16 Apr 2021 20:08:43 +0200 Subject: [PATCH 0604/1889] typo: wrong command in example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3b7d8e264..50a64e079 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ You can specify the number of threads to use to index documents and many other s ```bash cd http-ui -cargo run --release -- serve --db my-database.mdb -vvv --indexing-jobs 8 +cargo run --release -- --db my-database.mdb -vvv --indexing-jobs 8 ``` ### Index your documents From e39aabbfe6e35b40261910d09c9b5b13cc2dfaa5 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Wed, 7 Apr 2021 11:53:57 +0300 Subject: [PATCH 0605/1889] feat(search, update): synonyms --- http-ui/src/main.rs | 12 ++++++ milli/src/index.rs | 52 +++++++++++++++++++----- milli/src/search/query_tree.rs | 21 +++++----- milli/src/update/settings.rs | 73 ++++++++++++++++++++++++++++++++-- 4 files changed, 132 insertions(+), 26 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 08e28be56..605b6a7ba 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -260,6 +260,9 @@ struct Settings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] stop_words: Setting>, + + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + synonyms: Setting>>, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -431,6 +434,13 @@ async fn main() -> anyhow::Result<()> { Setting::NotSet => () } + // We transpose the settings JSON struct into a real setting update. + match settings.synonyms { + Setting::Set(synonyms) => builder.set_synonyms(synonyms), + Setting::Reset => builder.reset_synonyms(), + Setting::NotSet => () + } + let result = builder.execute(|indexing_step, update_id| { let (current, total) = match indexing_step { TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), @@ -1011,6 +1021,7 @@ mod tests { faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }), criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! 
{ "and".to_string() }), + synonyms: Setting::NotSet }; assert_tokens(&settings, &[ @@ -1053,6 +1064,7 @@ mod tests { faceted_attributes: Setting::Reset, criteria: Setting::Reset, stop_words: Setting::Reset, + synonyms: Setting::NotSet }; assert_tokens(&settings, &[ diff --git a/milli/src/index.rs b/milli/src/index.rs index 7be618789..d743445e3 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -3,19 +3,19 @@ use std::collections::HashMap; use std::path::Path; use anyhow::Context; +use chrono::{DateTime, Utc}; +use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use heed::types::*; -use heed::{PolyDatabase, Database, RwTxn, RoTxn}; use roaring::RoaringBitmap; -use chrono::{Utc, DateTime}; +use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search}; +use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; +use crate::{ + BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, + ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec, +}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::{default_criteria, Criterion, Search, FacetDistribution, FieldsDistribution}; -use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds}; -use crate::{ - RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec, - StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, -}; pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; @@ -31,6 +31,7 @@ pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const WORDS_FST_KEY: &str = "words-fst"; pub const STOP_WORDS_KEY: &str = "stop-words"; +pub const SYNONYMS_KEY: &str = "synonyms"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; const UPDATED_AT_KEY: &str = "updated-at"; @@ -376,12 +377,12 @@ impl Index { /* words fst */ - /// Writes the FST which is the words dictionnary of the engine. + /// Writes the FST which is the words dictionary of the engine. pub fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes()) } - /// Returns the FST which is the words dictionnary of the engine. + /// Returns the FST which is the words dictionary of the engine. pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? { Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), @@ -398,6 +399,7 @@ impl Index { pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY) } + pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? { Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), @@ -405,6 +407,34 @@ impl Index { } } + /* synonyms */ + + pub fn put_synonyms(&self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<_>>(wtxn, SYNONYMS_KEY, synonyms) + } + + pub fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY) + } + + pub fn synonyms(&self, rtxn: &RoTxn) -> anyhow::Result, Vec>>>> { + match self.main.get::<_, Str, SerdeBincode, Vec>>>>(rtxn, SYNONYMS_KEY)? 
{ + Some(synonyms) => Ok(Some(synonyms)), + None => Ok(None), + } + } + + pub fn words_synonyms>(&self, rtxn: &RoTxn, words: &[S]) -> anyhow::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect(); + + match self.synonyms(rtxn)? { + Some(synonyms) => Ok(Some( + synonyms.get(&words).cloned().unwrap_or(Vec::default()) + )), + None => Ok(None) + } + } + /* words prefixes fst */ /// Writes the FST which is the words prefixes dictionary of the engine. @@ -536,7 +566,7 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); let fields_distribution = index.fields_distribution(&rtxn).unwrap(); - assert_eq!(fields_distribution, hashmap!{ + assert_eq!(fields_distribution, hashmap! { "name".to_string() => 2, "age".to_string() => 1, }); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 1941f0c6f..b2fd62771 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -155,7 +155,7 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; - fn synonyms>(&self, words: &[S]) -> heed::Result>>>; + fn synonyms>(&self, words: &[S]) -> anyhow::Result>>>; fn word_documents_count(&self, word: &str) -> heed::Result> { match self.word_docids(word)? { Some(rb) => Ok(Some(rb.len())), @@ -177,12 +177,12 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.word_docids.get(self.rtxn, word) } - fn word_documents_count(&self, word: &str) -> heed::Result> { - self.index.word_documents_count(self.rtxn, word) + fn synonyms>(&self, words: &[S]) -> anyhow::Result>>> { + self.index.words_synonyms(self.rtxn, words) } - fn synonyms>(&self, _words: &[S]) -> heed::Result>>> { - Ok(None) + fn word_documents_count(&self, word: &str) -> heed::Result> { + self.index.word_documents_count(self.rtxn, word) } } @@ -270,10 +270,10 @@ fn typos(word: String, authorize_typos: bool) -> QueryKind { } } -/// Fetch synonyms from the `Context` for the provided word +/// Fetch synonyms from the `Context` for the provided words /// and create the list of operations for the query tree -fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result>> { - let synonyms = ctx.synonyms(word)?; +fn synonyms(ctx: &impl Context, words: &[&str]) -> anyhow::Result>> { + let synonyms = ctx.synonyms(words)?; Ok(synonyms.map(|synonyms| { synonyms.into_iter().map(|synonym| { @@ -581,14 +581,13 @@ mod test { Ok(self.postings.get(word).cloned()) } - fn synonyms>(&self, words: &[S]) -> heed::Result>>> { - let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); + fn synonyms>(&self, words: &[S]) -> anyhow::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect(); Ok(self.synonyms.get(&words).cloned()) } } impl Default for TestContext { - fn default() -> TestContext { let mut rng = StdRng::seed_from_u64(102); let rng = &mut rng; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index e63948082..336c0e253 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -13,6 +13,7 @@ use crate::criterion::Criterion; use crate::facet::FacetType; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; +use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; #[derive(Debug, Clone, PartialEq)] pub enum Setting { @@ -71,6 +72,7 @@ pub struct Settings<'a, 't, 'u, 'i> { criteria: Setting>, stop_words: Setting>, distinct_attribute: Setting, + synonyms:
Setting>>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -96,6 +98,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { criteria: Setting::NotSet, stop_words: Setting::NotSet, distinct_attribute: Setting::NotSet, + synonyms: Setting::NotSet, update_id, } } @@ -144,12 +147,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn reset_distinct_attribute(&mut self) { + self.distinct_attribute = Setting::Reset; + } + pub fn set_distinct_attribute(&mut self, distinct_attribute: String) { self.distinct_attribute = Setting::Set(distinct_attribute); } - pub fn reset_distinct_attribute(&mut self) { - self.distinct_attribute = Setting::Reset; + pub fn reset_synonyms(&mut self) { + self.synonyms = Setting::Reset; + } + + pub fn set_synonyms(&mut self, synonyms: HashMap>) { + self.synonyms = if synonyms.is_empty() { + Setting::Reset + } else { + Setting::Set(synonyms) + } } fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> @@ -294,7 +309,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let current = self.index.stop_words(self.wtxn)?; // since we can't compare a BTreeSet with an FST we are going to convert the // BTreeSet to an FST and then compare the two FSTs byte by byte. - let fst = fst::Set::from_iter(&*stop_words)?; + let fst = fst::Set::from_iter(stop_words)?; // Does the new FST differ from the previous one? if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) { @@ -310,6 +325,55 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_synonyms(&mut self) -> anyhow::Result { + match self.synonyms { + Setting::Set(ref synonyms) => { + let old_synonyms = self.index.synonyms(self.wtxn)?.unwrap_or_default(); + + let mut config = AnalyzerConfig::default(); + + let stop_words = self.index.stop_words(self.wtxn)?; + if let Some(stop_words) = &stop_words { + config.stop_words(stop_words); + } + + let analyzer = Analyzer::new(config); + + let normalize = |text: &String| { + analyzer + .analyze(text) + .tokens() + .filter_map(|token| + if token.is_word() { Some(token.text().to_string()) } else { None } + ) + .collect::>() + }; + + let new_synonyms = synonyms + .iter() + .map(|(word, synonyms)| { + let normalized_word = normalize(word); + let normalized_synonyms = synonyms.iter() + .map(normalize) + .unique() + .collect::>(); + + (normalized_word, normalized_synonyms) + }) + .collect(); + + if new_synonyms != old_synonyms { + self.index.put_synonyms(self.wtxn, &new_synonyms)?; + Ok(true) + } else { + Ok(false) + } + } + Setting::Reset => Ok(self.index.delete_synonyms(self.wtxn)?), + Setting::NotSet => Ok(false), + } + } + fn update_facets(&mut self) -> anyhow::Result { match self.faceted_fields { Setting::Set(ref fields) => { @@ -359,9 +423,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // update_criteria MUST be called after update_facets, since criterion fields must be set // as facets.
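Stepping back from the diff: stripped of the tokenizer plumbing, `update_synonyms` above normalizes each word and each of its synonyms into a `Vec<String>` key and deduplicates the alternatives before comparing against the stored map. A sketch of that shape, assuming a trivial lowercase/whitespace normalizer in place of the meilisearch-tokenizer `Analyzer` (so the outputs only approximate the real ones):

```rust
use std::collections::HashMap;

// Hypothetical normalizer: lowercase and keep alphanumeric words. The real
// `normalize` above runs the meilisearch-tokenizer Analyzer instead.
fn normalize(text: &str) -> Vec<String> {
    text.split_whitespace()
        .map(|word| {
            word.chars()
                .filter(|c| c.is_alphanumeric())
                .collect::<String>()
                .to_lowercase()
        })
        .filter(|word| !word.is_empty())
        .collect()
}

// Mirror of the loop in `update_synonyms`: normalized word ->
// deduplicated normalized alternatives.
fn normalize_synonyms(
    synonyms: &HashMap<String, Vec<String>>,
) -> HashMap<Vec<String>, Vec<Vec<String>>> {
    let mut new_synonyms: HashMap<Vec<String>, Vec<Vec<String>>> = HashMap::new();
    for (word, alternatives) in synonyms {
        let entry = new_synonyms.entry(normalize(word)).or_default();
        entry.extend(alternatives.iter().map(|s| normalize(s)));
    }
    // Deduplicate, mirroring the `unique()` call in the diff.
    for alternatives in new_synonyms.values_mut() {
        alternatives.sort_unstable();
        alternatives.dedup();
    }
    new_synonyms
}

fn main() {
    let mut synonyms = HashMap::new();
    synonyms.insert("Super Like".to_string(), vec!["LOVE".to_string(), "love".to_string()]);
    let normalized = normalize_synonyms(&synonyms);
    let key = vec!["super".to_string(), "like".to_string()];
    assert_eq!(normalized[&key], vec![vec!["love".to_string()]]);
}
```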
self.update_criteria()?; + let synonyms_updated = self.update_synonyms()?; let searchable_updated = self.update_searchable()?; - if facets_updated || searchable_updated || stop_words_updated { + if stop_words_updated || facets_updated || synonyms_updated || searchable_updated { self.reindex(&progress_callback, old_fields_ids_map)?; } Ok(()) From 33860bc3b7fce784d6a9e9574f17e662d88aed94 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Fri, 9 Apr 2021 22:56:20 +0300 Subject: [PATCH 0606/1889] test(update, settings): set & reset synonyms fixes after review more fixes after review --- http-ui/src/main.rs | 19 ++++-- milli/src/index.rs | 19 ++---- milli/src/search/query_tree.rs | 14 ++-- milli/src/update/settings.rs | 117 ++++++++++++++++++++++++++------- 4 files changed, 119 insertions(+), 50 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 605b6a7ba..ad9f1646d 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1021,11 +1021,11 @@ mod tests { faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }), criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), - synonyms: Setting::NotSet + synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }) }; assert_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 5 }, + Token::Struct { name: "Settings", len: 6 }, Token::Str("displayedAttributes"), Token::Some, Token::Seq { len: Some(1) }, @@ -1052,6 +1052,14 @@ mod tests { Token::Seq { len: Some(1) }, Token::Str("and"), Token::SeqEnd, + Token::Str("synonyms"), + Token::Some, + Token::Map { len: Some(1) }, + Token::Str("alex"), + Token::Seq {len: Some(1) }, + Token::Str("alexey"), + Token::SeqEnd, + Token::MapEnd, Token::StructEnd, ]); } @@ -1064,11 +1072,11 @@ mod tests { faceted_attributes: Setting::Reset, criteria: Setting::Reset, stop_words: Setting::Reset, - synonyms: Setting::NotSet + synonyms: Setting::Reset, }; assert_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 5 }, + Token::Struct { name: "Settings", len: 6 }, Token::Str("displayedAttributes"), Token::None, Token::Str("searchableAttributes"), @@ -1079,6 +1087,8 @@ mod tests { Token::None, Token::Str("stopWords"), Token::None, + Token::Str("synonyms"), + Token::None, Token::StructEnd, ]); } @@ -1091,6 +1101,7 @@ mod tests { faceted_attributes: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, + synonyms: Setting::NotSet, }; assert_tokens(&settings, &[ diff --git a/milli/src/index.rs b/milli/src/index.rs index d743445e3..045eabc3c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -417,22 +417,13 @@ impl Index { self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY) } - pub fn synonyms(&self, rtxn: &RoTxn) -> anyhow::Result, Vec>>>> { - match self.main.get::<_, Str, SerdeBincode, Vec>>>>(rtxn, SYNONYMS_KEY)? { - Some(synonyms) => Ok(Some(synonyms)), - None => Ok(None), - } + pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result, Vec>>> { + Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, SYNONYMS_KEY)?.unwrap_or_default()) } - pub fn words_synonyms>(&self, rtxn: &RoTxn, words: &[S]) -> anyhow::Result>>> { - let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect(); - - match self.synonyms(rtxn)? 
{ - Some(synonyms) => Ok(Some( - synonyms.get(&words).cloned().unwrap_or(Vec::default()) - )), - None => Ok(None) - } + pub fn words_synonyms>(&self, rtxn: &RoTxn, words: &[S]) -> heed::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); + Ok(self.synonyms(rtxn)?.remove(&words)) } /* words prefixes fst */ diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index b2fd62771..d21227507 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -155,7 +155,7 @@ impl fmt::Debug for Query { trait Context { fn word_docids(&self, word: &str) -> heed::Result>; - fn synonyms>(&self, words: &[S]) -> anyhow::Result>>>; + fn synonyms>(&self, words: &[S]) -> heed::Result>>>; fn word_documents_count(&self, word: &str) -> heed::Result> { match self.word_docids(word)? { Some(rb) => Ok(Some(rb.len())), @@ -177,7 +177,7 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index.word_docids.get(self.rtxn, word) } - fn synonyms>(&self, words: &[S]) -> anyhow::Result>>> { + fn synonyms>(&self, words: &[S]) -> heed::Result>>> { self.index.words_synonyms(self.rtxn, words) } @@ -270,10 +270,10 @@ fn typos(word: String, authorize_typos: bool) -> QueryKind { } } -/// Fetch synonyms from the `Context` for the provided words +/// Fetch synonyms from the `Context` for the provided word /// and create the list of operations for the query tree -fn synonyms(ctx: &impl Context, words: &[&str]) -> anyhow::Result>> { - let synonyms = ctx.synonyms(words)?; +fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result>> { + let synonyms = ctx.synonyms(word)?; Ok(synonyms.map(|synonyms| { synonyms.into_iter().map(|synonym| { @@ -581,8 +581,8 @@ mod test { Ok(self.postings.get(word).cloned()) } - fn synonyms>(&self, words: &[S]) -> anyhow::Result>>> { - let words: Vec<_> = words.iter().map(|s| s.as_ref().to_string()).collect(); + fn synonyms>(&self, words: &[S]) -> heed::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 336c0e253..a0cfbd315 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -5,6 +5,7 @@ use anyhow::Context; use chrono::Utc; use grenad::CompressionType; use itertools::Itertools; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rayon::ThreadPool; use serde::{Deserialize, Deserializer, Serialize, Serializer}; @@ -13,7 +14,6 @@ use crate::criterion::Criterion; use crate::facet::FacetType; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; -use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; #[derive(Debug, Clone, PartialEq)] pub enum Setting { @@ -328,18 +328,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_synonyms(&mut self) -> anyhow::Result { match self.synonyms { Setting::Set(ref synonyms) => { - let old_synonyms = self.index.synonyms(self.wtxn)?.unwrap_or_default(); - - let mut config = AnalyzerConfig::default(); - - let stop_words = self.index.stop_words(self.wtxn)?; - if let Some(stop_words) = &stop_words { - config.stop_words(stop_words); - } - - let analyzer = Analyzer::new(config); - - let normalize = |text: &String| { + fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec { analyzer .analyze(text) .tokens() @@ -347,20 +336,40 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { if token.is_word() { 
Some(token.text().to_string()) } else { None } ) .collect::>() - }; + } - let new_synonyms = synonyms - .iter() - .map(|(word, synonyms)| { - let normalized_word = normalize(word); - let normalized_synonyms = synonyms.iter() - .map(normalize) - .unique() - .collect::>(); + let mut config = AnalyzerConfig::default(); + let stop_words = self.index.stop_words(self.wtxn)?; + if let Some(stop_words) = &stop_words { + config.stop_words(stop_words); + } + let analyzer = Analyzer::new(config); - (normalized_word, normalized_synonyms) - }) - .collect(); + let mut new_synonyms = HashMap::new(); + for (word, synonyms) in synonyms { + // Normalize both the word and associated synonyms. + let normalized_word = normalize(&analyzer, word); + let normalized_synonyms = synonyms + .iter() + .map(|synonym| normalize(&analyzer, synonym)); + + // Store the normalized synonyms under the normalized word, + // merging the possible duplicate words. + let entry = new_synonyms + .entry(normalized_word) + .or_insert_with(Vec::new); + entry.extend(normalized_synonyms); + } + + // Make sure that we don't have duplicate synonyms. + new_synonyms + .iter_mut() + .for_each(|(_, synonyms)| { + synonyms.sort_unstable(); + synonyms.dedup(); + }); + + let old_synonyms = self.index.synonyms(self.wtxn)?; if new_synonyms != old_synonyms { self.index.put_synonyms(self.wtxn, &new_synonyms)?; @@ -734,6 +743,64 @@ mod tests { assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data } + #[test] + fn set_and_reset_synonyms() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Send 3 documents with ids from 1 to 3. + let mut wtxn = index.write_txn().unwrap(); + let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Csv); + builder.execute(content, |_, _| ()).unwrap(); + + // In the same transaction provide some synonyms + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_synonyms(hashmap! 
{ + "blini".to_string() => vec!["crepes".to_string()], + "super like".to_string() => vec!["love".to_string()], + "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()] + }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Ensure synonyms are effectively stored + let rtxn = index.read_txn().unwrap(); + let synonyms = index.synonyms(&rtxn).unwrap(); + assert!(!synonyms.is_empty()); // at this point the index should return something + + // Check that we can use synonyms + let result = index.search(&rtxn).query("blini").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("super like").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("puppies").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + + // Reset the synonyms + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.reset_synonyms(); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Ensure synonyms are reset + let rtxn = index.read_txn().unwrap(); + let synonyms = index.synonyms(&rtxn).unwrap(); + assert!(synonyms.is_empty()); + + // Check that synonyms no longer work + let result = index.search(&rtxn).query("blini").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + let result = index.search(&rtxn).query("super like").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + let result = index.search(&rtxn).query("puppies").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + } + #[test] fn setting_searchable_recomputes_other_settings() { let path = tempfile::tempdir().unwrap(); From 127d3d028e94428c9e5f9331f03113baebe3d436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 19 Apr 2021 14:48:13 +0200 Subject: [PATCH 0607/1889] Update version for the next release (v0.1.1) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5bedb4800..d28745d8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -846,7 +846,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "byte-unit", @@ -889,7 +889,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "askama", @@ -991,7 +991,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "byte-unit", @@ -1248,7 +1248,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "bstr", @@ -2011,7 +2011,7 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "search" -version = "0.1.0" +version = "0.1.1" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index ea0e1ddc8..1db7caf4f 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.1.0" +version = "0.1.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 196b83c9f..fdb0a9596 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli
search engine" -version = "0.1.0" +version = "0.1.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 32dfed20a..59cfbd661 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.1.0" +version = "0.1.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1242194de..ffbaacc1c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.1.0" +version = "0.1.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index a2c79776a..ae22ea80a 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.1.0" +version = "0.1.1" authors = ["Clément Renault "] edition = "2018" From efbfa81fa7dcfee29fc261530d10f1647242a718 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 7 Apr 2021 11:13:47 +0200 Subject: [PATCH 0608/1889] Merge the Float and Integer enum variant into the Number one --- Cargo.lock | 6 +++--- milli/src/facet/facet_type.rs | 16 ++++++---------- milli/src/facet/facet_value.rs | 18 +++++++----------- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d28745d8b..7facf6e8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1522,8 +1522,7 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" dependencies = [ "ucd-trie", ] @@ -1531,7 +1530,8 @@ dependencies = [ [[package]] name = "pest" version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" dependencies = [ "ucd-trie", ] diff --git a/milli/src/facet/facet_type.rs b/milli/src/facet/facet_type.rs index 4fdc80798..09f29bc00 100644 --- a/milli/src/facet/facet_type.rs +++ b/milli/src/facet/facet_type.rs @@ -8,16 +8,14 @@ use serde::{Serialize, Deserialize}; #[derive(Serialize, Deserialize)] pub enum FacetType { String, - Float, - Integer, + Number, } impl fmt::Display for FacetType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { FacetType::String => f.write_str("string"), - FacetType::Float => f.write_str("float"), - FacetType::Integer => f.write_str("integer"), + FacetType::Number => f.write_str("number"), } } } @@ -26,12 +24,10 @@ impl FromStr for FacetType { type Err = InvalidFacetType; fn from_str(s: &str) -> Result { - if s.eq_ignore_ascii_case("string") { + if s.trim().eq_ignore_ascii_case("string") { Ok(FacetType::String) - } else if s.eq_ignore_ascii_case("float") { - Ok(FacetType::Float) - } else if s.eq_ignore_ascii_case("integer") { - Ok(FacetType::Integer) + } else if s.trim().eq_ignore_ascii_case("number") { + Ok(FacetType::Number) } else { Err(InvalidFacetType) } @@ -43,7 +39,7 @@ pub struct InvalidFacetType; impl fmt::Display for InvalidFacetType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str(r#"Invalid facet type, must be "string", "float" or "integer""#) + 
f.write_str(r#"Invalid facet type, must be "string" or "number""#) } } diff --git a/milli/src/facet/facet_value.rs b/milli/src/facet/facet_value.rs index f311ca3dd..2fd2fdf40 100644 --- a/milli/src/facet/facet_value.rs +++ b/milli/src/facet/facet_value.rs @@ -4,8 +4,7 @@ use serde::{Serialize, Serializer}; #[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] pub enum FacetValue { String(String), - Float(OrderedFloat), - Integer(i64), + Number(OrderedFloat), } impl From for FacetValue { @@ -22,24 +21,25 @@ impl From<&str> for FacetValue { impl From for FacetValue { fn from(float: f64) -> FacetValue { - FacetValue::Float(OrderedFloat(float)) + FacetValue::Number(OrderedFloat(float)) } } impl From> for FacetValue { fn from(float: OrderedFloat) -> FacetValue { - FacetValue::Float(float) + FacetValue::Number(float) } } impl From for FacetValue { fn from(integer: i64) -> FacetValue { - FacetValue::Integer(integer) + FacetValue::Number(integer as f64) } } /// We implement Serialize ourselves because we need to always serialize it as a string, /// JSON object keys must be strings not numbers. +// TODO remove this impl and convert them into string, by hand, when required. impl Serialize for FacetValue { fn serialize(&self, serializer: S) -> Result where @@ -47,12 +47,8 @@ impl Serialize for FacetValue { { match self { FacetValue::String(string) => serializer.serialize_str(string), - FacetValue::Float(float) => { - let string = float.to_string(); - serializer.serialize_str(&string) - }, - FacetValue::Integer(integer) => { - let string = integer.to_string(); + FacetValue::Number(number) => { + let string = number.to_string(); serializer.serialize_str(&string) }, } From 51767725b2fe108c50ebf63318a5596805a7add1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 7 Apr 2021 11:57:16 +0200 Subject: [PATCH 0609/1889] Simplify integer and float functions trait bounds --- Cargo.lock | 1 - infos/src/main.rs | 21 +-- milli/Cargo.toml | 1 - milli/src/facet/facet_value.rs | 2 +- milli/src/facet/value_encoding.rs | 21 --- .../facet/facet_level_value_i64_codec.rs | 44 ----- .../facet/field_doc_id_facet_i64_codec.rs | 34 ---- milli/src/heed_codec/facet/mod.rs | 4 - milli/src/search/criteria/asc_desc.rs | 44 ++--- milli/src/search/distinct/facet_distinct.rs | 29 +--- milli/src/search/facet/facet_condition.rs | 158 +++++++----------- milli/src/search/facet/facet_distribution.rs | 33 +--- milli/src/search/facet/mod.rs | 121 ++++++-------- milli/src/update/delete_documents.rs | 13 +- milli/src/update/facets.rs | 130 ++++++-------- milli/src/update/index_documents/store.rs | 72 +++----- milli/src/update/settings.rs | 10 +- 17 files changed, 217 insertions(+), 521 deletions(-) delete mode 100644 milli/src/heed_codec/facet/facet_level_value_i64_codec.rs delete mode 100644 milli/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs diff --git a/Cargo.lock b/Cargo.lock index 7facf6e8b..a78696e1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1272,7 +1272,6 @@ dependencies = [ "maplit", "meilisearch-tokenizer", "memmap", - "num-traits", "obkv", "once_cell", "ordered-float", diff --git a/infos/src/main.rs b/infos/src/main.rs index 376679656..cc1727a68 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -274,15 +274,12 @@ fn facet_values_iter<'txn, DC: 'txn, T>( facet_type: milli::facet::FacetType, string_fn: impl Fn(&str) -> T + 'txn, float_fn: impl Fn(u8, f64, f64) -> T + 'txn, - integer_fn: impl Fn(u8, i64, i64) -> T + 'txn, ) -> heed::Result> + 'txn>> where DC: heed::BytesDecode<'txn>, { use 
milli::facet::FacetType; - use milli::heed_codec::facet::{ - FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, - }; + use milli::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; let iter = db.prefix_iter(&rtxn, &[field_id])?; match facet_type { @@ -291,20 +288,13 @@ where .map(move |r| r.map(|((_, key), value)| (string_fn(key), value))); Ok(Box::new(iter) as Box>) }, - FacetType::Float => { + FacetType::Number => { let iter = iter.remap_key_type::() .map(move |r| r.map(|((_, level, left, right), value)| { (float_fn(level, left, right), value) })); Ok(Box::new(iter)) }, - FacetType::Integer => { - let iter = iter.remap_key_type::() - .map(move |r| r.map(|((_, level, left, right), value)| { - (integer_fn(level, left, right), value) - })); - Ok(Box::new(iter)) - }, } } @@ -413,11 +403,6 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let _ = write!(&mut output, " (level {})", level); output }, - |level, left, right| { - let mut output = facet_number_value_to_string(level, left, right).1; - let _ = write!(&mut output, " (level {})", level); - output - }, )?; for result in iter { @@ -523,7 +508,6 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam *field_type, |key| (0, key.to_owned()), facet_number_value_to_string, - facet_number_value_to_string, )?; for result in iter { @@ -590,7 +574,6 @@ fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow: *field_type, |_key| 0u8, |level, _left, _right| level, - |level, _left, _right| level, )?; println!("The database {:?} facet stats", field_name); diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ffbaacc1c..b198131c1 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -22,7 +22,6 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.1" } memmap = "0.7.0" -num-traits = "0.2.14" obkv = "0.1.1" once_cell = "1.5.2" ordered-float = "2.1.1" diff --git a/milli/src/facet/facet_value.rs b/milli/src/facet/facet_value.rs index 2fd2fdf40..99455fa27 100644 --- a/milli/src/facet/facet_value.rs +++ b/milli/src/facet/facet_value.rs @@ -33,7 +33,7 @@ impl From> for FacetValue { impl From for FacetValue { fn from(integer: i64) -> FacetValue { - FacetValue::Number(integer as f64) + FacetValue::Number(OrderedFloat(integer as f64)) } } diff --git a/milli/src/facet/value_encoding.rs b/milli/src/facet/value_encoding.rs index 3cb012a0e..7259243e5 100644 --- a/milli/src/facet/value_encoding.rs +++ b/milli/src/facet/value_encoding.rs @@ -13,16 +13,6 @@ pub fn f64_into_bytes(float: f64) -> Option<[u8; 8]> { None } -#[inline] -pub fn i64_into_bytes(int: i64) -> [u8; 8] { - xor_first_bit(int.to_be_bytes()) -} - -#[inline] -pub fn i64_from_bytes(bytes: [u8; 8]) -> i64 { - i64::from_be_bytes(xor_first_bit(bytes)) -} - #[inline] fn xor_first_bit(mut x: [u8; 8]) -> [u8; 8] { x[0] ^= 0x80; @@ -55,15 +45,4 @@ mod tests { let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect(); assert!(is_sorted(&vec), "{:?}", vec); } - - #[test] - fn ordered_i64_bytes() { - let a = -10_i64; - let b = -0_i64; - let c = 1_i64; - let d = 43_i64; - - let vec: Vec<_> = [a, b, c, d].iter().cloned().map(i64_into_bytes).collect(); - assert!(is_sorted(&vec), "{:?}", vec); - } } diff --git a/milli/src/heed_codec/facet/facet_level_value_i64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_i64_codec.rs 
deleted file mode 100644 index cc0d3120d..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_i64_codec.rs +++ /dev/null @@ -1,44 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; -use crate::FieldId; - -pub struct FacetLevelValueI64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec { - type DItem = (FieldId, u8, i64, i64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; - let (level, bytes) = bytes.split_first()?; - - let left = bytes[..8].try_into().map(i64_from_bytes).ok()?; - let right = if *level != 0 { - bytes[8..].try_into().map(i64_from_bytes).ok()? - } else { - left - }; - - Some((*field_id, *level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueI64Codec { - type EItem = (FieldId, u8, i64, i64); - - fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { - let left = i64_into_bytes(*left); - let right = i64_into_bytes(*right); - - let mut bytes = Vec::with_capacity(2 + left.len() + right.len()); - bytes.push(*field_id); - bytes.push(*level); - bytes.extend_from_slice(&left[..]); - if *level != 0 { - bytes.extend_from_slice(&right[..]); - } - - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs deleted file mode 100644 index a9eaf188c..000000000 --- a/milli/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs +++ /dev/null @@ -1,34 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::{i64_into_bytes, i64_from_bytes}; -use crate::{FieldId, DocumentId}; - -pub struct FieldDocIdFacetI64Codec; - -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetI64Codec { - type DItem = (FieldId, DocumentId, i64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; - - let (document_id_bytes, bytes) = bytes.split_at(4); - let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; - - let value = bytes[..8].try_into().map(i64_from_bytes).ok()?; - - Some((*field_id, document_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetI64Codec { - type EItem = (FieldId, DocumentId, i64); - - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(1 + 4 + 8); - bytes.push(*field_id); - bytes.extend_from_slice(&document_id.to_be_bytes()); - bytes.extend_from_slice(&i64_into_bytes(*value)); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index d8ce936e0..532da12fa 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,13 +1,9 @@ mod facet_level_value_f64_codec; -mod facet_level_value_i64_codec; mod facet_value_string_codec; mod field_doc_id_facet_f64_codec; -mod field_doc_id_facet_i64_codec; mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; -pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec; pub use self::facet_value_string_codec::FacetValueStringCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; -pub use self::field_doc_id_facet_i64_codec::FieldDocIdFacetI64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs 
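With the i64 codecs deleted above, ordering relies entirely on the order-preserving f64 byte encoding kept in `value_encoding.rs`. The classical trick behind such encodings can be sketched as follows — note this is the textbook version, not necessarily bit-for-bit what `f64_into_bytes` does (which returns an `Option` and may treat non-finite values differently):

```rust
// Textbook order-preserving f64 -> bytes encoding: flip every bit of a
// negative number (reversing its order) and only the sign bit of a
// non-negative one (moving it above all negatives), so that comparing the
// big-endian bytes lexicographically matches comparing the floats.
fn f64_to_ordered_bytes(float: f64) -> [u8; 8] {
    let mut bits = float.to_bits();
    if bits & (1 << 63) != 0 {
        bits = !bits;
    } else {
        bits ^= 1 << 63;
    }
    bits.to_be_bytes()
}

fn main() {
    let values = [f64::MIN, -10.0, -0.5, 0.0, 0.5, 42.0, f64::MAX];
    let encoded: Vec<[u8; 8]> = values.iter().map(|v| f64_to_ordered_bytes(*v)).collect();
    // Byte-wise order agrees with numeric order — the sortedness property the
    // `is_sorted` tests in value_encoding.rs check for `f64_into_bytes`.
    assert!(encoded.windows(2).all(|w| w[0] <= w[1]));
}
```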
index 78ae540e4..1dc186720 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -2,16 +2,13 @@ use std::collections::HashMap; use std::mem::take; use anyhow::{bail, Context as _}; -use heed::{BytesDecode, BytesEncode}; use itertools::Itertools; use log::debug; -use num_traits::Bounded; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; @@ -253,33 +250,17 @@ fn facet_ordered<'t>( ) -> anyhow::Result> + 't>> { match facet_type { - FacetType::Float => { + FacetType::Number => { if candidates.len() <= CANDIDATES_THRESHOLD { - let iter = iterative_facet_ordered_iter::>( + let iter = iterative_facet_ordered_iter( index, rtxn, field_id, ascending, candidates, )?; Ok(Box::new(iter.map(Ok)) as Box>) } else { let facet_fn = if ascending { - FacetIter::::new_reducing + FacetIter::new_reducing } else { - FacetIter::::new_reverse_reducing - }; - let iter = facet_fn(rtxn, index, field_id, candidates)?; - Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) - } - }, - FacetType::Integer => { - if candidates.len() <= CANDIDATES_THRESHOLD { - let iter = iterative_facet_ordered_iter::( - index, rtxn, field_id, ascending, candidates, - )?; - Ok(Box::new(iter.map(Ok)) as Box>) - } else { - let facet_fn = if ascending { - FacetIter::::new_reducing - } else { - FacetIter::::new_reverse_reducing + FacetIter::new_reverse_reducing }; let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) @@ -292,28 +273,23 @@ fn facet_ordered<'t>( /// Fetch the whole list of candidates facet values one by one and order them by it. /// /// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_ordered_iter<'t, KC, T, U>( +fn iterative_facet_ordered_iter<'t>( index: &'t Index, rtxn: &'t heed::RoTxn, field_id: FieldId, ascending: bool, candidates: RoaringBitmap, ) -> anyhow::Result + 't> -where - KC: BytesDecode<'t, DItem = (FieldId, u32, T)>, - KC: for<'a> BytesEncode<'a, EItem = (FieldId, u32, T)>, - T: Bounded, - U: From + Ord + Clone + 't, { - let db = index.field_id_docid_facet_values.remap_key_type::(); + let db = index.field_id_docid_facet_values.remap_key_type::(); let mut docids_values = Vec::with_capacity(candidates.len() as usize); for docid in candidates.iter() { - let left = (field_id, docid, T::min_value()); - let right = (field_id, docid, T::max_value()); + let left = (field_id, docid, f64::MIN); + let right = (field_id, docid, f64::MAX); let mut iter = db.range(rtxn, &(left..=right))?; let entry = if ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? 
{ - docids_values.push((docid, U::from(value))); + docids_values.push((docid, OrderedFloat(value))); } } docids_values.sort_unstable_by_key(|(_, v)| v.clone()); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index e97f8b922..3c508b25b 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -81,28 +81,7 @@ impl<'a> FacetDistinctIter<'a> { Ok(()) } - fn distinct_integer(&mut self, id: DocumentId) -> anyhow::Result<()> { - let iter = get_facet_values::( - id, - self.distinct, - self.index, - self.txn, - )?; - - for item in iter { - let ((_, _, value), _) = item?; - // get facet docids on level 0 - let key = (self.distinct, 0, value, value); - let facet_docids = self.get_facet_docids::(&key)?; - self.excluded.union_with(&facet_docids); - } - - self.excluded.remove(id); - - Ok(()) - } - - fn distinct_float(&mut self, id: DocumentId) -> anyhow::Result<()> { + fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> { let iter = get_facet_values::(id, self.distinct, self.index, @@ -134,8 +113,7 @@ impl<'a> FacetDistinctIter<'a> { Some(id) => { match self.facet_type { FacetType::String => self.distinct_string(id)?, - FacetType::Integer => self.distinct_integer(id)?, - FacetType::Float => self.distinct_float(id)?, + FacetType::Number => self.distinct_number(id)?, }; // The first document of each iteration is kept, since the next call to @@ -233,6 +211,5 @@ mod test { test_facet_distinct!(test_string, "txt", FacetType::String); test_facet_distinct!(test_strings, "txts", FacetType::String); - test_facet_distinct!(test_int, "cat-int", FacetType::Integer); - test_facet_distinct!(test_ints, "cat-ints", FacetType::Integer); + test_facet_distinct!(test_number, "cat-int", FacetType::Number); } diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index 42c2327a9..525450ee1 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -5,17 +5,15 @@ use std::str::FromStr; use anyhow::Context; use either::Either; -use heed::types::{ByteSlice, DecodeIgnore}; +use heed::types::DecodeIgnore; use log::debug; -use num_traits::Bounded; use pest::error::{Error as PestError, ErrorVariant}; use pest::iterators::{Pair, Pairs}; use pest::Parser; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::FacetValueStringCodec; -use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; +use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec}; use super::FacetRange; @@ -26,17 +24,17 @@ use self::FacetCondition::*; use self::FacetNumberOperator::*; #[derive(Debug, Copy, Clone, PartialEq)] -pub enum FacetNumberOperator { - GreaterThan(T), - GreaterThanOrEqual(T), - Equal(T), - NotEqual(T), - LowerThan(T), - LowerThanOrEqual(T), - Between(T, T), +pub enum FacetNumberOperator { + GreaterThan(f64), + GreaterThanOrEqual(f64), + Equal(f64), + NotEqual(f64), + LowerThan(f64), + LowerThanOrEqual(f64), + Between(f64, f64), } -impl FacetNumberOperator { +impl FacetNumberOperator { /// This method can return two operations in case it must express /// an OR operation for the between case (i.e. `TO`). 
fn negate(self) -> (Self, Option) { @@ -78,9 +76,8 @@ impl FacetStringOperator { #[derive(Debug, Clone, PartialEq)] pub enum FacetCondition { - OperatorI64(FieldId, FacetNumberOperator), - OperatorF64(FieldId, FacetNumberOperator), OperatorString(FieldId, FacetStringOperator), + OperatorNumber(FieldId, FacetNumberOperator), Or(Box, Box), And(Box, Box), } @@ -173,8 +170,7 @@ impl FacetCondition { let operator = match ftype { FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)), - FacetType::Float => OperatorF64(fid, FacetNumberOperator::Equal(value.parse()?)), - FacetType::Integer => OperatorI64(fid, FacetNumberOperator::Equal(value.parse()?)), + FacetType::Number => OperatorNumber(fid, FacetNumberOperator::Equal(value.parse()?)), }; if neg { Ok(operator.negate()) } else { Ok(operator) } @@ -267,15 +263,11 @@ impl FacetCondition { fn negate(self) -> FacetCondition { match self { - OperatorI64(fid, op) => match op.negate() { - (op, None) => OperatorI64(fid, op), - (a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))), - }, - OperatorF64(fid, op) => match op.negate() { - (op, None) => OperatorF64(fid, op), - (a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))), - }, OperatorString(fid, op) => OperatorString(fid, op.negate()), + OperatorNumber(fid, op) => match op.negate() { + (op, None) => OperatorNumber(fid, op), + (a, Some(b)) => Or(Box::new(OperatorNumber(fid, a)), Box::new(OperatorNumber(fid, b))), + }, Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), } @@ -293,16 +285,6 @@ impl FacetCondition { let lvalue = items.next().unwrap(); let rvalue = items.next().unwrap(); match ftype { - FacetType::Integer => { - let lvalue = pest_parse(lvalue)?; - let rvalue = pest_parse(rvalue)?; - Ok(OperatorI64(fid, Between(lvalue, rvalue))) - }, - FacetType::Float => { - let lvalue = pest_parse(lvalue)?; - let rvalue = pest_parse(rvalue)?; - Ok(OperatorF64(fid, Between(lvalue, rvalue))) - }, FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { @@ -311,6 +293,11 @@ impl FacetCondition { item_span, ).into()) }, + FacetType::Number => { + let lvalue = pest_parse(lvalue)?; + let rvalue = pest_parse(rvalue)?; + Ok(OperatorNumber(fid, Between(lvalue, rvalue))) + }, } } @@ -324,9 +311,8 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))), FacetType::String => Ok(OperatorString(fid, FacetStringOperator::equal(value.as_str()))), + FacetType::Number => Ok(OperatorNumber(fid, Equal(pest_parse(value)?))), } } @@ -341,8 +327,6 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))), FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { @@ -351,6 +335,7 @@ impl FacetCondition { item_span, ).into()) }, + FacetType::Number => Ok(OperatorNumber(fid, GreaterThan(pest_parse(value)?))), } } @@ -365,8 +350,6 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut 
items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))), FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { @@ -375,6 +358,7 @@ impl FacetCondition { item_span, ).into()) }, + FacetType::Number => Ok(OperatorNumber(fid, GreaterThanOrEqual(pest_parse(value)?))), } } @@ -389,8 +373,6 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))), FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { @@ -399,6 +381,7 @@ impl FacetCondition { item_span, ).into()) }, + FacetType::Number => Ok(OperatorNumber(fid, LowerThan(pest_parse(value)?))), } } @@ -413,8 +396,6 @@ impl FacetCondition { let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; let value = items.next().unwrap(); match ftype { - FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(pest_parse(value)?))), FacetType::String => { Err(PestError::::new_from_span( ErrorVariant::CustomError { @@ -423,6 +404,7 @@ impl FacetCondition { item_span, ).into()) }, + FacetType::Number => Ok(OperatorNumber(fid, LowerThanOrEqual(pest_parse(value)?))), } } } @@ -430,24 +412,20 @@ impl FacetCondition { impl FacetCondition { /// Aggregates the documents ids that are part of the specified range automatically /// going deeper through the levels. - fn explore_facet_levels<'t, T: 't, KC>( - rtxn: &'t heed::RoTxn, - db: heed::Database, + fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, field_id: FieldId, level: u8, - left: Bound, - right: Bound, + left: Bound, + right: Bound, output: &mut RoaringBitmap, ) -> anyhow::Result<()> - where - T: Copy + PartialEq + PartialOrd + Bounded + Debug, - KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { match (left, right) { // If the request is an exact value we must go directly to the deepest level. (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_levels::(rtxn, db, field_id, 0, left, right, output); + return Self::explore_facet_number_levels(rtxn, db, field_id, 0, left, right, output); }, // lower TO upper when lower > upper must return no result (Included(l), Included(r)) if l > r => return Ok(()), @@ -462,7 +440,7 @@ impl FacetCondition { // We must create a custom iterator to be able to iterate over the // requested range as the range iterator cannot express some conditions. 
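The descent that `explore_facet_number_levels` performs can be pictured on a toy integer version: at the current level, keep every pre-aggregated group that fits entirely inside the requested bounds, then recurse on the uncovered edges one level deeper. A self-contained sketch, with plain `Vec`s of `(left, right)` groups standing in for the LMDB level databases:

```rust
// Toy levels: levels[0] holds unit groups, higher levels hold wider
// pre-aggregated (left, right) groups, like the facet levels in LMDB.
fn explore(levels: &[Vec<(u32, u32)>], level: usize, left: u32, right: u32, out: &mut Vec<(u32, u32)>) {
    // Keep every group at this level that fits entirely inside [left, right].
    let inside: Vec<(u32, u32)> = levels[level]
        .iter()
        .copied()
        .filter(|&(l, r)| l >= left && r <= right)
        .collect();

    match (inside.first().copied(), inside.last().copied()) {
        (Some((first_left, _)), Some((_, last_right))) => {
            out.extend(inside.iter().copied());
            // Recurse on the uncovered edges at the next, more precise level.
            if level > 0 {
                if first_left > left {
                    explore(levels, level - 1, left, first_left - 1, out);
                }
                if last_right < right {
                    explore(levels, level - 1, last_right + 1, right, out);
                }
            }
        }
        // Nothing fits at this level: retry the same bounds one level deeper.
        _ => {
            if level > 0 {
                explore(levels, level - 1, left, right, out);
            }
        }
    }
}

fn main() {
    let levels = vec![
        (0..8).map(|i| (i, i)).collect::<Vec<_>>(), // level 0: units
        vec![(0, 1), (2, 3), (4, 5), (6, 7)],       // level 1: pairs
        vec![(0, 3), (4, 7)],                       // level 2: quadruples
    ];
    let mut out = Vec::new();
    explore(&levels, 2, 1, 6, &mut out);
    let mut covered: Vec<u32> = out.iter().flat_map(|&(l, r)| l..=r).collect();
    covered.sort_unstable();
    assert_eq!(covered, vec![1, 2, 3, 4, 5, 6]);
}
```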
- let iter = FacetRange::new(rtxn, db.remap_key_type::(), field_id, level, left, right)?; + let iter = FacetRange::new(rtxn, db, field_id, level, left, right)?; debug!("Iterating between {:?} and {:?} (level {})", left, right, level); @@ -489,64 +467,60 @@ impl FacetCondition { if !matches!(left, Included(l) if l == left_found) { let sub_right = Excluded(left_found); debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); - Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, sub_right, output)?; + Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, sub_right, output)?; } if !matches!(right, Included(r) if r == right_found) { let sub_left = Excluded(right_found); debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); - Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, sub_left, right, output)?; } }, None => { // If we found nothing at this level it means that we must find // the same bounds but at a deeper, more precise level. - Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, right, output)?; + Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, right, output)?; }, } Ok(()) } - fn evaluate_number_operator<'t, T: 't, KC>( - rtxn: &'t heed::RoTxn, + fn evaluate_number_operator<>( + rtxn: &heed::RoTxn, index: &Index, - db: heed::Database, + db: heed::Database, field_id: FieldId, - operator: FacetNumberOperator, + operator: FacetNumberOperator, ) -> anyhow::Result - where - T: Copy + PartialEq + PartialOrd + Bounded + Debug, - KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(T::max_value())), - GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), - Equal(val) => (Included(val), Included(val)), + GreaterThan(val) => (Excluded(val), Included(f64::MAX)), + GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)), + Equal(val) => (Included(val), Included(val)), NotEqual(val) => { let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; - let docids = Self::evaluate_number_operator::(rtxn, index, db, field_id, Equal(val))?; + let docids = Self::evaluate_number_operator(rtxn, index, db, field_id, Equal(val))?; return Ok(all_documents_ids - docids); }, - LowerThan(val) => (Included(T::min_value()), Excluded(val)), - LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), - Between(left, right) => (Included(left), Included(right)), + LowerThan(val) => (Included(f64::MIN), Excluded(val)), + LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)), + Between(left, right) => (Included(left), Included(right)), }; // Ask for the biggest value that can exist for this specific field, if it exists // that's fine; if it doesn't, the value just before will be returned instead. let biggest_level = db - .remap_types::() - .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))? + .remap_data_type::() + .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))?
.and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - Self::explore_facet_levels::(rtxn, db, field_id, level, left, right, &mut output)?; + Self::explore_facet_number_levels(rtxn, db, field_id, level, left, right, &mut output)?; Ok(output) }, None => Ok(RoaringBitmap::new()), @@ -585,16 +559,14 @@ impl FacetCondition { { let db = index.facet_field_id_value_docids; match self { - OperatorI64(fid, op) => { - Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) - }, - OperatorF64(fid, op) => { - Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) - }, OperatorString(fid, op) => { let db = db.remap_key_type::(); Self::evaluate_string_operator(rtxn, index, db, *fid, op) }, + OperatorNumber(fid, op) => { + let db = db.remap_key_type::(); + Self::evaluate_number_operator(rtxn, index, db, *fid, *op) + }, Or(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; let rhs = rhs.evaluate(rtxn, index)?; @@ -646,7 +618,7 @@ mod tests { } #[test] - fn i64() { + fn number() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -655,20 +627,20 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() }); + builder.set_faceted_fields(hashmap!{ "timestamp".into() => "number".into() }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = OperatorI64(0, Between(22, 44)); + let expected = OperatorNumber(0, Between(22.0, 44.0)); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); let expected = Or( - Box::new(OperatorI64(0, LowerThan(22))), - Box::new(OperatorI64(0, GreaterThan(44))), + Box::new(OperatorNumber(0, LowerThan(22.0))), + Box::new(OperatorNumber(0, GreaterThan(44.0))), ); assert_eq!(condition, expected); } @@ -686,7 +658,7 @@ mod tests { builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into(), - "timestamp".into() => "integer".into(), + "timestamp".into() => "number".into(), }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -700,7 +672,7 @@ mod tests { let expected = Or( Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), Box::new(And( - Box::new(OperatorI64(1, Between(22, 44))), + Box::new(OperatorNumber(1, Between(22.0, 44.0))), Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))), )) ); @@ -714,8 +686,8 @@ mod tests { Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), Box::new(Or( Box::new(Or( - Box::new(OperatorI64(1, LowerThan(22))), - Box::new(OperatorI64(1, GreaterThan(44))), + Box::new(OperatorNumber(1, LowerThan(22.0))), + Box::new(OperatorNumber(1, GreaterThan(44.0))), )), Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))), )), @@ -736,7 +708,7 @@ mod tests { builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into(), - 
"timestamp".into() => "integer".into(), + "timestamp".into() => "number".into(), }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index afa4f2a5a..7fd2d385b 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -7,8 +7,8 @@ use heed::BytesDecode; use roaring::RoaringBitmap; use crate::facet::{FacetType, FacetValue}; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; +use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::search::facet::{FacetIter, FacetRange}; use crate::{Index, FieldId, DocumentId}; @@ -102,12 +102,9 @@ impl<'a> FacetDistribution<'a> { FacetType::String => { fetch_facet_values::(index, rtxn, field_id, candidates) }, - FacetType::Float => { + FacetType::Number => { fetch_facet_values::(index, rtxn, field_id, candidates) }, - FacetType::Integer => { - fetch_facet_values::(index, rtxn, field_id, candidates) - }, } } @@ -122,18 +119,11 @@ impl<'a> FacetDistribution<'a> { { let iter = match facet_type { FacetType::String => unreachable!(), - FacetType::Float => { - let iter = FacetIter::::new_non_reducing( + FacetType::Number => { + let iter = FacetIter::new_non_reducing( self.rtxn, self.index, field_id, candidates.clone(), )?; - let iter = iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))); - Box::new(iter) as Box::> - }, - FacetType::Integer => { - let iter = FacetIter::::new_non_reducing( - self.rtxn, self.index, field_id, candidates.clone(), - )?; - Box::new(iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids)))) + iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))) }, }; @@ -170,16 +160,9 @@ impl<'a> FacetDistribution<'a> { .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); Box::new(iter) as Box::> }, - FacetType::Float => { + FacetType::Number => { let db = db.remap_key_type::(); - let range = FacetRange::::new( - self.rtxn, db, field_id, level, Unbounded, Unbounded, - )?; - Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) - }, - FacetType::Integer => { - let db = db.remap_key_type::(); - let range = FacetRange::::new( + let range = FacetRange::new( self.rtxn, db, field_id, level, Unbounded, Unbounded, )?; Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index e5b06185f..e4bef2b12 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,15 +1,13 @@ -use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded, Unbounded}; use either::Either::{self, Left, Right}; use heed::types::{DecodeIgnore, ByteSlice}; -use heed::{BytesEncode, BytesDecode}; use heed::{Database, RoRange, RoRevRange, LazyDecode}; use log::debug; -use num_traits::Bounded; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; +use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{Index, FieldId}; pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator}; @@ -19,43 +17,34 @@ mod facet_condition; mod facet_distribution; mod parser; -pub struct 
FacetRange<'t, T: 't, KC> { - iter: RoRange<'t, KC, LazyDecode>, - end: Bound, +pub struct FacetRange<'t> { + iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, + end: Bound, } -impl<'t, T: 't, KC> FacetRange<'t, T, KC> -where - KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, - T: PartialOrd + Copy + Bounded, -{ +impl<'t> FacetRange<'t> { pub fn new( rtxn: &'t heed::RoTxn, - db: Database, + db: Database, field_id: FieldId, level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> + left: Bound, + right: Bound, + ) -> heed::Result> { let left_bound = match left { - Included(left) => Included((field_id, level, left, T::min_value())), - Excluded(left) => Excluded((field_id, level, left, T::min_value())), - Unbounded => Included((field_id, level, T::min_value(), T::min_value())), + Included(left) => Included((field_id, level, left, f64::MIN)), + Excluded(left) => Excluded((field_id, level, left, f64::MIN)), + Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), }; - let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; Ok(FacetRange { iter, end: right }) } } -impl<'t, T, KC> Iterator for FacetRange<'t, T, KC> -where - KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, - KC: BytesDecode<'t, DItem = (FieldId, u8, T, T)>, - T: PartialOrd + Copy, -{ - type Item = heed::Result<((FieldId, u8, T, T), RoaringBitmap)>; +impl<'t> Iterator for FacetRange<'t> { + type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; fn next(&mut self) -> Option { match self.iter.next() { @@ -80,43 +69,34 @@ where } } -pub struct FacetRevRange<'t, T: 't, KC> { - iter: RoRevRange<'t, KC, LazyDecode>, - end: Bound, +pub struct FacetRevRange<'t> { + iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, + end: Bound, } -impl<'t, T: 't, KC> FacetRevRange<'t, T, KC> -where - KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, - T: PartialOrd + Copy + Bounded, -{ +impl<'t> FacetRevRange<'t> { pub fn new( rtxn: &'t heed::RoTxn, - db: Database, + db: Database, field_id: FieldId, level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> + left: Bound, + right: Bound, + ) -> heed::Result> { let left_bound = match left { - Included(left) => Included((field_id, level, left, T::min_value())), - Excluded(left) => Excluded((field_id, level, left, T::min_value())), - Unbounded => Included((field_id, level, T::min_value(), T::min_value())), + Included(left) => Included((field_id, level, left, f64::MIN)), + Excluded(left) => Excluded((field_id, level, left, f64::MIN)), + Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), }; - let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; Ok(FacetRevRange { iter, end: right }) } } -impl<'t, T, KC> Iterator for FacetRevRange<'t, T, KC> -where - KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, - KC: BytesDecode<'t, DItem = (FieldId, u8, T, T)>, - T: PartialOrd + Copy, -{ - type Item = heed::Result<((FieldId, u8, T, T), RoaringBitmap)>; +impl<'t> Iterator for FacetRevRange<'t> { + type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; fn next(&mut self) -> Option { loop { @@ -142,20 +122,15 @@ where } } -pub struct FacetIter<'t, T: 't, KC> { +pub struct FacetIter<'t> 
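// The iterator is now specialized to `f64` facet keys: the `T` and `KC` type
// parameters go away together with the removed `integer` facet type.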
{ rtxn: &'t heed::RoTxn<'t>, - db: Database, + db: Database, field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t, T, KC>>)>, + level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t>>)>, must_reduce: bool, } -impl<'t, T, KC> FacetIter<'t, T, KC> -where - KC: heed::BytesDecode<'t, DItem = (FieldId, u8, T, T)>, - KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, - T: PartialOrd + Copy + Bounded, -{ +impl<'t> FacetIter<'t> { /// Create a `FacetIter` that will iterate on the different facet entries /// (facet value + documents ids) and that will reduce the given documents ids /// while iterating on the different facet levels. @@ -164,9 +139,9 @@ where index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> + ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_field_id_value_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; @@ -181,9 +156,9 @@ where index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> + ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_field_id_value_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Right(highest_iter))]; @@ -199,32 +174,32 @@ where index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> + ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_field_id_value_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) } - fn highest_level(rtxn: &'t heed::RoTxn, db: Database, fid: FieldId) -> heed::Result> { + fn highest_level( + rtxn: &'t heed::RoTxn, + db: Database, + fid: FieldId, + ) -> heed::Result> + { let level = db.remap_types::() .prefix_iter(rtxn, &[fid][..])? - .remap_key_type::() + .remap_key_type::() .last().transpose()? 
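// Keys are ordered as (field_id, level, left, right), so the last entry
// under this field id prefix is the one carrying the highest level.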
.map(|((_, level, _, _), _)| level); Ok(level) } } -impl<'t, T: 't, KC> Iterator for FacetIter<'t, T, KC> -where - KC: heed::BytesDecode<'t, DItem = (FieldId, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (FieldId, u8, T, T)>, - T: PartialOrd + Copy + Bounded + Debug, -{ - type Item = heed::Result<(T, RoaringBitmap)>; +impl<'t> Iterator for FacetIter<'t> { + type Item = heed::Result<(f64, RoaringBitmap)>; fn next(&mut self) -> Option { 'outer: loop { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 4c5bf0a8a..8a2ba9bbf 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -10,7 +10,7 @@ use serde_json::Value; use crate::facet::FacetType; use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -302,7 +302,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } }, - FacetType::Float => { + FacetType::Number => { let mut iter = iter.remap_key_type::(); while let Some(result) = iter.next() { let ((_fid, docid, _value), ()) = result?; @@ -311,15 +311,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } }, - FacetType::Integer => { - let mut iter = iter.remap_key_type::(); - while let Some(result) = iter.next() { - let ((_fid, docid, _value), ()) = result?; - if self.documents_ids.contains(docid) { - iter.del_current()?; - } - } - }, } } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 62da5af7e..b9e4d7488 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -7,12 +7,11 @@ use grenad::{CompressionType, Reader, Writer, FileFuse}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; use log::debug; -use num_traits::{Bounded, Zero}; use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::CboRoaringBitmapCodec; -use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; +use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::Index; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; @@ -65,58 +64,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); for (field_id, facet_type) in faceted_fields { let (content, documents_ids) = match facet_type { - FacetType::Integer => { - clear_field_levels::( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; - - let documents_ids = compute_faceted_documents_ids( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; - - let content = compute_facet_levels::( - self.wtxn, - self.index.facet_field_id_value_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.level_group_size, - self.min_level_size, - field_id, - )?; - - (Some(content), documents_ids) - }, - FacetType::Float => { - clear_field_levels::( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; - - let documents_ids = compute_faceted_documents_ids( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; - - let content = compute_facet_levels::( - self.wtxn, - self.index.facet_field_id_value_docids, - 
self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.level_group_size, - self.min_level_size, - field_id, - )?; - - (Some(content), documents_ids) - }, FacetType::String => { let documents_ids = compute_faceted_documents_ids( self.wtxn, @@ -126,6 +73,32 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { (None, documents_ids) }, + FacetType::Number => { + clear_field_number_levels( + self.wtxn, + self.index.facet_field_id_value_docids.remap_key_type::(), + field_id, + )?; + + let documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_field_id_value_docids, + field_id, + )?; + + let content = compute_facet_number_levels( + self.wtxn, + self.index.facet_field_id_value_docids.remap_key_type::(), + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + field_id, + )?; + + (Some(content), documents_ids) + }, }; if let Some(content) = content { @@ -145,25 +118,21 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { } } -fn clear_field_levels<'t, T: 't, KC>( +fn clear_field_number_levels<'t, >( wtxn: &'t mut heed::RwTxn, - db: heed::Database, + db: heed::Database, field_id: u8, ) -> heed::Result<()> -where - T: Copy + Bounded, - KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { - let left = (field_id, 1, T::min_value(), T::min_value()); - let right = (field_id, u8::MAX, T::max_value(), T::max_value()); + let left = (field_id, 1, f64::MIN, f64::MIN); + let right = (field_id, u8::MAX, f64::MAX, f64::MAX); let range = left..=right; - db.remap_key_type::().delete_range(wtxn, &range).map(drop) + db.delete_range(wtxn, &range).map(drop) } -fn compute_facet_levels<'t, T: 't, KC>( +fn compute_facet_number_levels<'t>( rtxn: &'t heed::RoTxn, - db: heed::Database, + db: heed::Database, compression_type: CompressionType, compression_level: Option, shrink_size: Option, @@ -171,12 +140,10 @@ fn compute_facet_levels<'t, T: 't, KC>( min_level_size: NonZeroUsize, field_id: u8, ) -> anyhow::Result> -where - T: Copy + PartialEq + PartialOrd + Bounded + Zero, - KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { - let first_level_size = db.prefix_iter(rtxn, &[field_id])? + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &[field_id])? .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; @@ -187,8 +154,8 @@ where })?; let level_0_range = { - let left = (field_id, 0, T::min_value(), T::min_value()); - let right = (field_id, 0, T::max_value(), T::max_value()); + let left = (field_id, 0, f64::MIN, f64::MIN); + let right = (field_id, 0, f64::MAX, f64::MAX); left..=right }; @@ -199,11 +166,10 @@ where .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); for (level, group_size) in group_size_iter { - let mut left = T::zero(); - let mut right = T::zero(); + let mut left = 0.0; + let mut right = 0.0; let mut group_docids = RoaringBitmap::new(); - let db = db.remap_key_type::(); for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { let ((_field_id, _level, value, _right), docids) = result?; @@ -212,7 +178,7 @@ where } else if i % group_size == 0 { // we found the first bound of the next group, we must store the left // and right bounds associated with the docids. 
- write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + write_number_entry(&mut writer, field_id, level, left, right, &group_docids)?; // We save the left bound for the new group and also reset the docids. group_docids = RoaringBitmap::new(); @@ -225,7 +191,7 @@ where } if !group_docids.is_empty() { - write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + write_number_entry(&mut writer, field_id, level, left, right, &group_docids)?; } } @@ -246,19 +212,17 @@ fn compute_faceted_documents_ids( Ok(documents_ids) } -fn write_entry( +fn write_number_entry( writer: &mut Writer, field_id: u8, level: u8, - left: T, - right: T, + left: f64, + right: f64, ids: &RoaringBitmap, ) -> anyhow::Result<()> -where - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { let key = (field_id, level, left, right); - let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 03d91af24..79b3cfc5f 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -19,12 +19,12 @@ use roaring::RoaringBitmap; use serde_json::Value; use tempfile::tempfile; -use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; +use crate::facet::{FacetType, FacetValue}; +use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId}; +use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ @@ -365,8 +365,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for ((field_id, value), docids) in iter { let result = match value { String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), - Float(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), - Integer(i) => FacetLevelValueI64Codec::bytes_encode(&(field_id, 0, i, i)).map(Cow::into_owned), + Number(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), }; let key = result.context("could not serialize facet key")?; let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) @@ -390,8 +389,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let result = match value { String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned), - Float(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned), - Integer(i) => FieldDocIdFacetI64Codec::bytes_encode(&(field_id, document_id, *i)).map(Cow::into_owned), + Number(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned), }; let key = result.context("could not serialize facet key")?; @@ -605,13 +603,6 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool { 
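// LMDB rejects empty keys as well as keys longer than its maximum key length.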
!key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH
}
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-enum FacetValue {
- String(SmallString32),
- Float(OrderedFloat<f64>),
- Integer(i64),
-}
-
/// take an iterator on tokens and compute their relative position depending on separator kinds
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
/// else we keep the standard proximity of 1 between words.
@@ -654,54 +645,40 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
Value::Null => Ok(()),
- Value::Bool(b) => {
- output.push(Integer(*b as i64));
- Ok(())
+ Value::Bool(b) => match ftype {
+ FacetType::String => {
+ output.push(String(b.to_string()));
+ Ok(())
+ },
+ FacetType::Number => {
+ output.push(Number(OrderedFloat(if *b { 1.0 } else { 0.0 })));
+ Ok(())
+ },
},
Value::Number(number) => match ftype {
FacetType::String => {
- let string = SmallString32::from(number.to_string());
- output.push(String(string));
+ output.push(String(number.to_string()));
Ok(())
},
- FacetType::Float => match number.as_f64() {
+ FacetType::Number => match number.as_f64() {
Some(float) => {
- output.push(Float(OrderedFloat(float)));
+ output.push(Number(OrderedFloat(float)));
Ok(())
},
- None => bail!("invalid facet type, expecting {} found integer", ftype),
- },
- FacetType::Integer => match number.as_i64() {
- Some(integer) => {
- output.push(Integer(integer));
- Ok(())
- },
- None => if number.is_f64() {
- bail!("invalid facet type, expecting {} found float", ftype)
- } else {
- bail!("invalid facet type, expecting {} found out-of-bound integer (64bit)", ftype)
- },
+ None => bail!("invalid facet type, expecting {} found number", ftype),
},
},
Value::String(string) => {
+ // TODO must be normalized and not only lowercased.
let string = string.trim().to_lowercase();
-
if string.is_empty() { return Ok(()) }
match ftype {
FacetType::String => {
- let string = SmallString32::from(string);
output.push(String(string));
Ok(())
},
- FacetType::Float => match string.parse() {
+ FacetType::Number => match string.parse() {
Ok(float) => {
- output.push(Float(OrderedFloat(float)));
- Ok(())
- },
- Err(_err) => bail!("invalid facet type, expecting {} found string", ftype),
- },
- FacetType::Integer => match string.parse() {
- Ok(integer) => {
- output.push(Integer(integer));
+ output.push(Number(OrderedFloat(float)));
Ok(())
},
Err(_err) => bail!("invalid facet type, expecting {} found string", ftype),
@@ -711,7 +688,10 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
Value::Array(values) => if can_recurse {
values.iter().map(|v| inner_parse_facet_value(ftype, v, false, output)).collect()
} else {
- bail!("invalid facet type, expecting {} found sub-array ()", ftype)
+ bail!(
+ "invalid facet type, expecting {} found array (recursive arrays are not supported)",
+ ftype,
+ );
},
Value::Object(_) => bail!("invalid facet type, expecting {} found object", ftype),
}
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index a0cfbd315..62aa8db97 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -619,7 +619,7 @@ mod tests {
// Set the faceted fields to be the age.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
- builder.set_faceted_fields(hashmap! { "age".into() => "integer".into() });
+ builder.set_faceted_fields(hashmap!{ "age".into() => "number".into() });
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
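// A minimal, illustrative sketch of the conversion rules introduced above,
// not code from the patch itself: with the single `number` facet type,
// booleans, integers and floats all normalize to an `f64`. The helper name
// `normalize_number_facet` is hypothetical.
fn normalize_number_facet(value: &serde_json::Value) -> Option<f64> {
    match value {
        // Booleans are indexed as 1.0 / 0.0.
        serde_json::Value::Bool(b) => Some(if *b { 1.0 } else { 0.0 }),
        // Integers and floats both go through `as_f64`.
        serde_json::Value::Number(n) => n.as_f64(),
        _ => None,
    }
}

#[test]
fn normalize_number_facet_sketch() {
    assert_eq!(normalize_number_facet(&serde_json::json!(true)), Some(1.0));
    assert_eq!(normalize_number_facet(&serde_json::json!(22)), Some(22.0));
    assert_eq!(normalize_number_facet(&serde_json::json!(22.5)), Some(22.5));
}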
@@ -632,7 +632,7 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashmap! { "age".to_string() => FacetType::Integer }); + assert_eq!(fields_ids, hashmap!{ "age".to_string() => FacetType::Number }); // Only count the field_id 0 and level 0 facet values. let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 3); @@ -812,9 +812,9 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); - builder.set_faceted_fields(hashmap! { - "age".into() => "integer".into(), - "toto".into() => "integer".into(), + builder.set_faceted_fields(hashmap!{ + "age".into() => "number".into(), + "toto".into() => "number".into(), }); builder.set_criteria(vec!["asc(toto)".to_string()]); builder.execute(|_, _| ()).unwrap(); From 2aeef093169f6f0444382a470b8f524bb8d31c1e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 7 Apr 2021 14:21:47 +0200 Subject: [PATCH 0610/1889] Remove debug logs while iterating through the facet levels --- milli/src/search/facet/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index e4bef2b12..0252af963 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -3,7 +3,6 @@ use std::ops::Bound::{self, Included, Excluded, Unbounded}; use either::Either::{self, Left, Right}; use heed::types::{DecodeIgnore, ByteSlice}; use heed::{Database, RoRange, RoRevRange, LazyDecode}; -use log::debug; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; @@ -223,7 +222,6 @@ impl<'t> Iterator for FacetIter<'t> { } if level == 0 { - debug!("found {:?} at {:?}", docids, left); return Some(Ok((left, docids))); } @@ -233,10 +231,6 @@ impl<'t> Iterator for FacetIter<'t> { let left = Included(left); let right = Included(right); - debug!("calling with {:?} to {:?} (level {}) to find {:?}", - left, right, level - 1, docids, - ); - let result = if is_ascending { FacetRange::new(rtxn, db, fid, level - 1, left, right).map(Left) } else { From c9b2d3ae1a27a6660b763873a5fb92d51d4f4081 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 7 Apr 2021 14:52:51 +0200 Subject: [PATCH 0611/1889] Warn instead of returning an error when a conversion fails --- Cargo.lock | 6 ++--- milli/src/update/index_documents/mod.rs | 2 ++ milli/src/update/index_documents/store.rs | 33 +++++++++++++++++++---- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a78696e1e..bbe86a2a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1521,7 +1521,8 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" dependencies = [ "ucd-trie", ] @@ -1529,8 +1530,7 @@ dependencies = [ [[package]] name = "pest" version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +source = 
"git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" dependencies = [ "ucd-trie", ] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index fb1a2d6c0..52949c13c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -440,6 +440,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .enumerate() .map(|(i, documents)| { let store = Store::new( + primary_key.clone(), + fields_ids_map.clone(), searchable_fields.clone(), faceted_fields.clone(), linked_hash_map_size, diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 79b3cfc5f..0bd83b692 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -12,7 +12,7 @@ use fst::Set; use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; -use log::{debug, info}; +use log::{debug, info, warn}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; @@ -24,7 +24,7 @@ use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId}; +use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ @@ -50,6 +50,8 @@ pub struct Readers { pub struct Store<'s, A> { // Indexing parameters + primary_key: String, + fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, faceted_fields: HashMap, // Caches @@ -78,6 +80,8 @@ pub struct Store<'s, A> { impl<'s, A: AsRef<[u8]>> Store<'s, A> { pub fn new( + primary_key: String, + fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, faceted_fields: HashMap, linked_hash_map_size: Option, @@ -149,6 +153,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(Store { // Indexing parameters. + primary_key, + fields_ids_map, searchable_fields, faceted_fields, // Caches @@ -462,9 +468,26 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let value = serde_json::from_slice(content)?; if let Some(ftype) = self.faceted_fields.get(&attr) { - let mut values = parse_facet_value(*ftype, &value).with_context(|| { - format!("extracting facets from the value {}", value) - })?; + let mut values = match parse_facet_value(*ftype, &value) { + Ok(values) => values, + Err(e) => { + // We extract the name of the attribute and the document id + // to help users debug a facet type conversion. 
+ let attr_name = self.fields_ids_map.name(attr).unwrap(); + let document_id: Value = self.fields_ids_map.id(&self.primary_key) + .and_then(|fid| document.get(fid)) + .map(serde_json::from_slice) + .unwrap()?; + + let context = format!( + "while extracting facet from the {:?} attribute in the {} document", + attr_name, document_id, + ); + warn!("{}", e.context(context)); + + SmallVec8::default() + }, + }; facet_values.entry(attr).or_insert_with(SmallVec8::new).extend(values.drain(..)); } From 6fa00c61d2d3c2e24fa4d4e7e70046a34f251ab6 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 13 Apr 2021 20:10:58 +0300 Subject: [PATCH 0612/1889] feat(search): support words_limit --- milli/src/search/mod.rs | 35 +++++++++++------- milli/src/search/query_tree.rs | 66 +++++++++++++++++++++++++--------- 2 files changed, 72 insertions(+), 29 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7324ea72a..174fff35c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,25 +1,27 @@ use std::borrow::Cow; -use std::collections::hash_map::{HashMap, Entry}; +use std::collections::hash_map::{Entry, HashMap}; use std::fmt; use std::mem::take; use std::str::Utf8Error; use std::time::Instant; use fst::{IntoStreamer, Streamer}; -use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder}; +use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use log::debug; -use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::search::criteria::fetcher::{FetcherResult, Fetcher}; -use crate::{Index, DocumentId}; -use distinct::{MapDistinct, FacetDistinct, Distinct, DocIter, NoopDistinct}; -use self::query_tree::QueryTreeBuilder; +use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; -pub use self::facet::FacetIter; -pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; +use crate::search::criteria::fetcher::{Fetcher, FetcherResult}; +use crate::{DocumentId, Index}; + +pub use self::facet::{ + FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, +}; pub use self::query_tree::MatchingWords; +use self::query_tree::QueryTreeBuilder; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -38,6 +40,7 @@ pub struct Search<'a> { limit: usize, optional_words: bool, authorize_typos: bool, + words_limit: usize, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -51,6 +54,7 @@ impl<'a> Search<'a> { limit: 20, optional_words: true, authorize_typos: true, + words_limit: 10, rtxn, index, } @@ -81,6 +85,11 @@ impl<'a> Search<'a> { self } + pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> { + self.words_limit = value; + self + } + pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { self.facet_condition = Some(condition); self @@ -94,6 +103,7 @@ impl<'a> Search<'a> { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); builder.authorize_typos(self.authorize_typos); + builder.words_limit(self.words_limit); // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. 
let mut config = AnalyzerConfig::default(); @@ -154,14 +164,12 @@ impl<'a> Search<'a> { matching_words: MatchingWords, mut criteria: Fetcher, ) -> anyhow::Result { - let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_documents = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { - debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_documents); @@ -195,6 +203,7 @@ impl fmt::Debug for Search<'_> { limit, optional_words, authorize_typos, + words_limit, rtxn: _, index: _, } = self; @@ -205,6 +214,7 @@ impl fmt::Debug for Search<'_> { .field("limit", limit) .field("optional_words", optional_words) .field("authorize_typos", authorize_typos) + .field("words_limit", words_limit) .finish() } } @@ -225,8 +235,7 @@ pub fn word_derivations<'c>( max_typo: u8, fst: &fst::Set>, cache: &'c mut WordDerivationsCache, -) -> Result<&'c [(String, u8)], Utf8Error> -{ +) -> Result<&'c [(String, u8)], Utf8Error> { match cache.entry((word.to_string(), is_prefix, max_typo)) { Entry::Occupied(entry) => Ok(entry.into_mut()), Entry::Vacant(entry) => { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index d21227507..492b98a1e 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -170,6 +170,7 @@ pub struct QueryTreeBuilder<'a> { index: &'a Index, optional_words: bool, authorize_typos: bool, + words_limit: Option, } impl<'a> Context for QueryTreeBuilder<'a> { @@ -190,7 +191,7 @@ impl<'a> QueryTreeBuilder<'a> { /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` /// and an Index `index`. pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self { - Self { rtxn, index, optional_words: true, authorize_typos: true } + Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None } } /// if `optional_words` is set to `false` the query tree will be @@ -213,6 +214,13 @@ impl<'a> QueryTreeBuilder<'a> { self } + /// Limit words and phrases that will be taken for query building. + /// Any beyond `words_limit` will be ignored. + pub fn words_limit(&mut self, words_limit: usize) -> &mut Self { + self.words_limit = Some(words_limit); + self + } + /// Build the query tree: /// - if `optional_words` is set to `false` the query tree will be /// generated forcing all query words to be present in each matching documents @@ -222,7 +230,7 @@ impl<'a> QueryTreeBuilder<'a> { /// (the criterion `typo` will be ignored) pub fn build(&self, query: TokenStream) -> anyhow::Result> { let stop_words = self.index.stop_words(self.rtxn)?; - let primitive_query = create_primitive_query(query, stop_words); + let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some) } else { @@ -476,13 +484,18 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. 
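/// Parsing stops early once `words_limit` query parts have been collected.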
-fn create_primitive_query(query: TokenStream, stop_words: Option>) -> PrimitiveQuery { +fn create_primitive_query(query: TokenStream, stop_words: Option>, words_limit: Option) -> PrimitiveQuery { let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); let mut quoted = false; + let parts_limit = words_limit.unwrap_or(usize::MAX); + let mut peekable = query.peekable(); while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if primitive_query.len() >= parts_limit { return primitive_query } + match token.kind { TokenKind::Word | TokenKind::StopWord => { // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, @@ -564,10 +577,11 @@ mod test { &self, optional_words: bool, authorize_typos: bool, + words_limit: Option, query: TokenStream, ) -> anyhow::Result> { - let primitive_query = create_primitive_query(query, None); + let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) } else { @@ -660,7 +674,7 @@ mod test { Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -680,7 +694,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -711,7 +725,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -756,7 +770,7 @@ mod test { ]), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -776,7 +790,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -802,7 +816,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -822,7 +836,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -861,7 +875,7 @@ mod test { Operation::Query(Query { prefix: false, kind: 
QueryKind::tolerant(2, "heymyfriend".to_string()) }), ]), ]); - let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -877,7 +891,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), ]); - let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -911,7 +925,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), ]), ]); - let query_tree = TestContext::default().build(true, true, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -930,7 +944,7 @@ mod test { ]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, false, tokens).unwrap().unwrap(); + let query_tree = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -943,7 +957,7 @@ mod test { let tokens = result.tokens(); let context = TestContext::default(); - let query_tree = context.build(false, true, tokens).unwrap().unwrap(); + let query_tree = context.build(false, true, None, tokens).unwrap().unwrap(); let expected = hashset!{ ("word", 0, false), @@ -967,4 +981,24 @@ mod test { let words = fetch_queries(&query_tree); assert_eq!(expected, words); } + + #[test] + fn words_limit() { + let query = "\"hey my\" good friend"; + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + let result = analyzer.analyze(query); + let tokens = result.tokens(); + + let expected = Operation::And(vec![ + Operation::Consecutive(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + ]), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), + ]); + + let query_tree = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); + + assert_eq!(expected, query_tree); + } } From 7fa3a1d23efbd5f1109d7eb27499d73eddfb3552 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 21 Apr 2021 00:27:23 +0200 Subject: [PATCH 0613/1889] makes clippy happy http-ui --- http-ui/src/main.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ad9f1646d..b091985f3 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -381,7 +381,7 @@ async fn main() -> anyhow::Result<()> { match result { Ok(_) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()) + Err(e) => Err(e) } } UpdateMeta::ClearDocuments => { @@ -391,7 +391,7 @@ async fn main() -> anyhow::Result<()> { match builder.execute() { Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()) + Err(e) => Err(e) } } UpdateMeta::Settings(settings) => { @@ -461,7 +461,7 @@ async fn main() -> anyhow::Result<()> { match result { Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()) + Err(e) => Err(e) } } UpdateMeta::Facets(levels) => { @@ -476,7 +476,7 @@ async fn 
main() -> anyhow::Result<()> { } match builder.execute() { Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()) + Err(e) => Err(e) } } UpdateMeta::WordsPrefixes(settings) => { @@ -491,7 +491,7 @@ async fn main() -> anyhow::Result<()> { } match builder.execute() { Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()) + Err(e) => Err(e) } } }; @@ -1001,7 +1001,8 @@ async fn main() -> anyhow::Result<()> { .or(update_ws_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; - Ok(warp::serve(routes).run(addr).await) + warp::serve(routes).run(addr).await; + Ok(()) } #[cfg(test)] From f8dee1b402985d24475cb7f9fed2d2f4b8c4993f Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 21 Apr 2021 00:35:58 +0200 Subject: [PATCH 0614/1889] [makes clippy happy] search/criteria/proximity.rs --- milli/src/search/criteria/proximity.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b62eb8cfd..decd4c338 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -10,6 +10,8 @@ use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; +type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; + pub struct Proximity<'t> { ctx: &'t dyn Context, query_tree: Option<(usize, Operation)>, @@ -17,7 +19,7 @@ pub struct Proximity<'t> { candidates: Candidates, bucket_candidates: RoaringBitmap, parent: Option>, - candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + candidates_cache: Cache, plane_sweep_cache: Option>, } @@ -35,7 +37,7 @@ impl<'t> Proximity<'t> { candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), bucket_candidates: RoaringBitmap::new(), parent: None, - candidates_cache: HashMap::new(), + candidates_cache: Cache::new(), plane_sweep_cache: None, } } @@ -48,7 +50,7 @@ impl<'t> Proximity<'t> { candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), parent: Some(parent), - candidates_cache: HashMap::new(), + candidates_cache: Cache::new(), plane_sweep_cache: None, } } @@ -204,7 +206,7 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, query_tree: &Operation, proximity: u8, - cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + cache: &mut Cache, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { @@ -212,7 +214,7 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, query_tree: &Operation, proximity: u8, - cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + cache: &mut Cache, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -249,7 +251,7 @@ fn resolve_candidates<'t>( left: &Operation, right: &Operation, proximity: u8, - cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + cache: &mut Cache, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -303,7 +305,7 @@ fn resolve_candidates<'t>( ctx: &'t dyn Context, branches: &[Operation], proximity: u8, - cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + cache: &mut Cache, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -332,7 +334,7 @@ fn resolve_candidates<'t>( Ok(output) }, Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, 
wdcache), - None => return Ok(Default::default()), + None => Ok(Default::default()), } } @@ -505,10 +507,8 @@ fn resolve_plane_sweep_candidates( let iter = word_derivations(word, true, 0, &words_positions) .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); - } else { - if let Some(positions) = words_positions.get(word) { + } else if let Some(positions) = words_positions.get(word) { result.extend(positions.iter().map(|p| (p, 0, p))); - } } }, QueryKind::Tolerant { typo, word } => { From 0f4c0beffd0a25437c2e58cfb2f7686ab17da87f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 11:48:55 +0100 Subject: [PATCH 0615/1889] Introduce the Attribute criterion --- Cargo.lock | 7 ++ milli/Cargo.toml | 1 + milli/src/search/criteria/attribute.rs | 133 +++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 4 + 4 files changed, 145 insertions(+) create mode 100644 milli/src/search/criteria/attribute.rs diff --git a/Cargo.lock b/Cargo.lock index bbe86a2a7..065be362f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,12 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "big_s" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" + [[package]] name = "bincode" version = "1.3.1" @@ -1251,6 +1257,7 @@ name = "milli" version = "0.1.1" dependencies = [ "anyhow", + "big_s", "bstr", "byteorder", "chrono", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b198131c1..eefdfa7d5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -52,6 +52,7 @@ logging_timer = "1.0.0" tinytemplate = "=1.1.0" [dev-dependencies] +big_s = "1.0.2" criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs new file mode 100644 index 000000000..9c31740b1 --- /dev/null +++ b/milli/src/search/criteria/attribute.rs @@ -0,0 +1,133 @@ +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::criteria::Query; +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; +use super::{Criterion, CriterionResult, Context}; + +pub struct Attribute<'t> { + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + bucket_candidates: RoaringBitmap, + parent: Option>, +} + +impl<'t> Attribute<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Attribute { + ctx, + query_tree, + candidates, + bucket_candidates: RoaringBitmap::new(), + parent: None, + } + } + + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Attribute { + ctx, + query_tree: None, + candidates: None, + bucket_candidates: RoaringBitmap::new(), + parent: Some(parent), + } + } +} + +impl<'t> Criterion for Attribute<'t> { + #[logging_timer::time("Attribute::{}")] + fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + todo!("Attribute") + } +} + +// TODO can we keep refs of Query +fn explode_query_tree(query_tree: &Operation) -> Vec> { + use crate::search::criteria::Operation::{And, Or, Consecutive}; + + fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { + match tail.split_first() { + Some((thead, tail)) => { + let tail = and_recurse(thead, tail); + let mut out = Vec::new(); + for array in recurse(head) { + for tail_array in &tail { + let mut array = 
array.clone(); + array.extend(tail_array.iter().cloned()); + out.push(array); + } + } + out + }, + None => recurse(head), + } + } + + fn recurse(op: &Operation) -> Vec> { + match op { + And(ops) | Consecutive(ops) => { + ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) + }, + Or(_, ops) => ops.into_iter().map(recurse).flatten().collect(), + Operation::Query(query) => vec![vec![query.clone()]], + } + } + + recurse(query_tree) +} + +#[cfg(test)] +mod tests { + use big_s::S; + + use crate::search::criteria::QueryKind; + use super::*; + + #[test] + fn simple_explode_query_tree() { + let query_tree = Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), + ]), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), + ]), + ]), + ]), + ]); + + let expected = vec![ + vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("manythe")) }, + Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + ], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("many")) }, + Query { prefix: false, kind: QueryKind::exact(S("thefish")) }, + ], + vec![ + Query { prefix: false, kind: QueryKind::exact(S("many")) }, + Query { prefix: false, kind: QueryKind::exact(S("the")) }, + Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + ], + ]; + + let result = explode_query_tree(&query_tree); + assert_eq!(expected, result); + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 22f081871..8d9c21f6e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -12,12 +12,14 @@ use self::typo::Typo; use self::words::Words; use self::asc_desc::AscDesc; use self::proximity::Proximity; +use self::attribute::Attribute; use self::fetcher::Fetcher; mod typo; mod words; mod asc_desc; mod proximity; +mod attribute; pub mod fetcher; pub trait Criterion { @@ -139,6 +141,7 @@ impl<'t> CriteriaBuilder<'t> { Name::Typo => Box::new(Typo::new(self, father)), Name::Words => Box::new(Words::new(self, father)), Name::Proximity => Box::new(Proximity::new(self, father)), + Name::Attribute => Box::new(Attribute::new(self, father)), Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), _otherwise => father, @@ -147,6 +150,7 @@ impl<'t> CriteriaBuilder<'t> { Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), + Name::Attribute => Box::new(Attribute::initial(self, query_tree.take(), facet_candidates.take())), Name::Asc(field) => { Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) 
}, From 4ff67ec2ee16d9b02362c85ab582dad9898b4a66 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 11 Mar 2021 17:31:02 +0100 Subject: [PATCH 0616/1889] Implement attribute criterion for small amounts of candidates --- milli/src/search/criteria/attribute.rs | 164 +++++++++++++++++++++++-- 1 file changed, 157 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 9c31740b1..7f8b5c622 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,10 +1,13 @@ -use log::debug; +use std::collections::{BTreeMap, HashMap, btree_map}; +use std::mem::take; + use roaring::RoaringBitmap; +use crate::{search::build_dfa}; use crate::search::criteria::Query; -use crate::search::query_tree::Operation; +use crate::search::query_tree::{Operation, QueryKind}; use crate::search::WordDerivationsCache; -use super::{Criterion, CriterionResult, Context}; +use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { ctx: &'t dyn Context, @@ -12,6 +15,8 @@ pub struct Attribute<'t> { candidates: Option, bucket_candidates: RoaringBitmap, parent: Option>, + flattened_query_tree: Option>>, + current_buckets: Option>, } impl<'t> Attribute<'t> { @@ -27,6 +32,8 @@ impl<'t> Attribute<'t> { candidates, bucket_candidates: RoaringBitmap::new(), parent: None, + flattened_query_tree: None, + current_buckets: None, } } @@ -37,6 +44,8 @@ impl<'t> Attribute<'t> { candidates: None, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), + flattened_query_tree: None, + current_buckets: None, } } } @@ -44,12 +53,153 @@ impl<'t> Attribute<'t> { impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - todo!("Attribute") + loop { + match (&self.query_tree, &mut self.candidates) { + (_, Some(candidates)) if candidates.is_empty() => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + (Some(qt), Some(candidates)) => { + let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| flatten_query_tree(&qt)); + let current_buckets = if let Some(current_buckets) = self.current_buckets.as_mut() { + current_buckets + } else { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }; + + let found_candidates = if let Some((_score, candidates)) = current_buckets.next() { + candidates + } else { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }; + candidates.difference_with(&found_candidates); + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => found_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(found_candidates), + bucket_candidates: bucket_candidates, + })); + }, + (Some(qt), None) => { + let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; + self.bucket_candidates.union_with(&query_tree_candidates); + self.candidates = Some(query_tree_candidates); + }, + (None, Some(_)) => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: 
self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + (None, None) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next(wdcache)? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree; + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); + self.flattened_query_tree = None; + self.current_buckets = None; + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } } } +fn linear_compute_candidates( + ctx: &dyn Context, + branches: &Vec>, + allowed_candidates: &RoaringBitmap, +) -> anyhow::Result> +{ + fn compute_candidate_rank(branches: &Vec>, words_positions: HashMap) -> u64 { + let mut min_rank = u64::max_value(); + for branch in branches { + let mut branch_rank = 0; + for Query { prefix, kind } in branch { + // find the best position of the current word in the document. + let position = match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + } else { + words_positions.get(word) + .map(|positions| positions.iter().next()) + .flatten() + } + }, + QueryKind::Tolerant { typo, word } => { + word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + }, + }; + + // if a position is found, we add it to the branch score, + // otherwise the branch is considered as unfindable in this document and we break. + if let Some(position) = position { + branch_rank += position as u64; + } else { + branch_rank = u64::max_value(); + break; + } + } + min_rank = min_rank.min(branch_rank); + } + + min_rank + } + + fn word_derivations<'a>( + word: &str, + is_prefix: bool, + max_typo: u8, + words_positions: &'a HashMap, + ) -> impl Iterator + { + let dfa = build_dfa(word, max_typo, is_prefix); + words_positions.iter().filter_map(move |(document_word, positions)| { + use levenshtein_automata::Distance; + match dfa.eval(document_word) { + Distance::Exact(_) => Some(positions), + Distance::AtLeast(_) => None, + } + }) + } + + let mut candidates = BTreeMap::new(); + for docid in allowed_candidates { + let words_positions = ctx.docid_words_positions(docid)?; + let rank = compute_candidate_rank(branches, words_positions); + candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid); + } + + Ok(candidates) +} + // TODO can we keep refs of Query -fn explode_query_tree(query_tree: &Operation) -> Vec> { +fn flatten_query_tree(query_tree: &Operation) -> Vec> { use crate::search::criteria::Operation::{And, Or, Consecutive}; fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { @@ -91,7 +241,7 @@ mod tests { use super::*; #[test] - fn simple_explode_query_tree() { + fn simple_flatten_query_tree() { let query_tree = Operation::Or(false, vec![ Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), Operation::And(vec![ @@ -127,7 +277,7 @@ mod tests { ], ]; - let result = explode_query_tree(&query_tree); + let result = flatten_query_tree(&query_tree); assert_eq!(expected, result); } } From 75e7b1e3dadb46c0761e07a14b3a70dfe6e3c01d Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Mar 2021 13:49:55 +0100 Subject: [PATCH 0617/1889] Implement test Context methods --- milli/src/search/criteria/mod.rs | 99 +++++++++++++++++++------------- 1 file changed, 59 insertions(+), 40 deletions(-) diff --git a/milli/src/search/criteria/mod.rs 
b/milli/src/search/criteria/mod.rs index 8d9c21f6e..1d7026d71 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -366,6 +366,7 @@ pub mod test { word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + docid_words: HashMap>, } impl<'a> Context for TestContext<'a> { @@ -399,8 +400,17 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result> { - todo!() + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + if let Some(docid_words) = self.docid_words.get(&docid) { + Ok(docid_words + .iter() + .enumerate() + .map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) + .collect() + ) + } else { + Ok(HashMap::new()) + } } } @@ -435,50 +445,58 @@ pub mod test { s("morning") => random_postings(rng, 125), }; + let mut docid_words = HashMap::new(); + for (word, docids) in word_docids.iter() { + for docid in docids { + let words = docid_words.entry(docid).or_insert(vec![]); + words.push(word.clone()); + } + } + let word_prefix_docids = hashmap!{ s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], }; - let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; - let hello_world_split = (hello_world.len() / 2) as usize; - let hello_world_1 = hello_world.iter().take(hello_world_split).collect(); - let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); - - let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; - let hello_word_split = (hello_word.len() / 2) as usize; - let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); - let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); - let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); - let word_pair_proximity_docids = hashmap!{ - (s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")], - (s("hello"), s("world"), 1) => hello_world_1, - (s("hello"), s("world"), 4) => hello_world_2, - (s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")], - (s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], - (s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), - (s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], - (s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), - (s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")], - (s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")], - (s("hello"), s("word"), 4) => hello_word_4, - (s("hello"), s("word"), 6) => hello_word_6, - (s("hello"), s("word"), 7) => hello_word_7, - (s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")], - (s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], - (s("this"), s("ngrams"), 1) => 
(&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")], - (s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], - }; - - let word_prefix_pair_proximity_docids = hashmap!{ - (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(), - (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), - (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), - (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), - (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), - (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), - }; + let mut word_pair_proximity_docids = HashMap::new(); + let mut word_prefix_pair_proximity_docids = HashMap::new(); + for (lword, lcandidates) in &word_docids { + for (rword, rcandidates) in &word_docids { + if lword == rword { continue } + let candidates = lcandidates & rcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w == rword).unwrap(); + let key = if lposition < rposition { + (s(lword), s(rword), (rposition - lposition) as i32) + } else { + (s(lword), s(rword), (lposition - rposition + 1) as i32) + }; + let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + for (pword, pcandidates) in &word_prefix_docids { + if lword.starts_with(pword) { continue } + let candidates = lcandidates & pcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); + let key = if lposition < rposition { + (s(lword), s(pword), (rposition - lposition) as i32) + } else { + (s(lword), s(pword), (lposition - rposition + 1) as i32) + }; + let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + } let mut keys = word_docids.keys().collect::>(); keys.sort_unstable(); @@ -490,6 +508,7 @@ pub mod test { word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + docid_words, } } } From b0a417f342de6afe8678c628b5e3be9c30f9c302 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 17:24:35 +0100 Subject: [PATCH 0618/1889] Introduce the word_level_position_docids Index database --- infos/src/main.rs | 1 + milli/src/heed_codec/mod.rs | 2 + .../heed_codec/str_level_position_codec.rs | 42 +++++++++++++++++++ milli/src/index.rs | 8 +++- milli/src/lib.rs | 2 +- milli/src/update/clear_documents.rs | 1 + milli/src/update/delete_documents.rs | 1 + 7 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 milli/src/heed_codec/str_level_position_codec.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index cc1727a68..356a5417c 100644 --- 
a/infos/src/main.rs +++ b/infos/src/main.rs @@ -319,6 +319,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index a070c66eb..cc73cdc65 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -2,6 +2,7 @@ mod beu32_str_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; +mod str_level_position_codec; mod str_str_u8_codec; pub mod facet; @@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; +pub use self::str_level_position_codec::StrLevelPositionCodec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs new file mode 100644 index 000000000..c421c04b5 --- /dev/null +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -0,0 +1,42 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::mem::size_of; +use std::str; + +pub struct StrLevelPositionCodec; + +impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { + type DItem = (&'a str, u8, u32, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::() + size_of::() * 2; + + if bytes.len() < footer_len { return None } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + + let (level, bytes) = bytes.split_first()?; + let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; + let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; + + Some((word, *level, left, right)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { + type EItem = (&'a str, u8, u32, u32); + + fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { + let left = left.to_be_bytes(); + let right = right.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); + bytes.extend_from_slice(word.as_bytes()); + bytes.push(*level); + bytes.extend_from_slice(&left[..]); + bytes.extend_from_slice(&right[..]); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 045eabc3c..0659b207a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, - ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec, + ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, }; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; @@ -52,6 +52,8 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the word, level and position range with the docids that corresponds to it. 
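/// The `StrLevelPositionCodec` key is the `(word, level, left, right)` tuple:
/// level 0 entries cover a single position (`left == right`), while an entry
/// at level `n` covers a contiguous range of positions built by grouping
/// entries of the level below.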
+ pub word_level_position_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. @@ -62,7 +64,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(9); + options.max_dbs(10); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -71,6 +73,7 @@ impl Index { let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; + let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -94,6 +97,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index fe9bd828b..de5c6511e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; -pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; +pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 2c24d9c07..250e4b13a 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,6 +28,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 8a2ba9bbf..b60b7bac2 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -88,6 +88,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + word_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, From 9242f2f1d451807e45f29462c0126c992d8950af Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 11 Mar 2021 17:23:46 +0100 Subject: [PATCH 0619/1889] Store the first word positions levels --- .../update/index_documents/merge_function.rs | 4 + milli/src/update/index_documents/mod.rs | 42 +++- milli/src/update/index_documents/store.rs | 58 +++++- milli/src/update/mod.rs | 2 + milli/src/update/words_level_positions.rs | 184 ++++++++++++++++++ 5 files changed, 284 insertions(+), 6 deletions(-) create mode 100644 
milli/src/update/words_level_positions.rs diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 6f24fcad9..54f994fc0 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -52,6 +52,10 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) - cbo_roaring_bitmap_merge(values) } +pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 52949c13c..8fc35b654 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -18,11 +18,12 @@ use rayon::prelude::*; use serde::{Serialize, Deserialize}; use crate::index::Index; -use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep}; +use crate::update::{Facets, WordsLevelPositions, WordsPrefixes, UpdateIndexingStep}; use self::store::{Store, Readers}; pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, + docid_word_positions_merge, documents_merge, + word_level_position_docids_merge, facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -402,6 +403,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { enum DatabaseType { Main, WordDocids, + WordLevel0PositionDocids, FacetLevel0ValuesDocids, } @@ -467,6 +469,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut word_docids_readers = Vec::with_capacity(readers.len()); let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); + let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); @@ -476,6 +479,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_docids, docid_word_positions, words_pairs_proximities_docids, + word_level_position_docids, facet_field_value_docids, field_id_docid_facet_values, documents @@ -484,6 +488,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_docids_readers.push(word_docids); docid_word_positions_readers.push(docid_word_positions); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); + word_level_position_docids_readers.push(word_level_position_docids); facet_field_value_docids_readers.push(facet_field_value_docids); field_id_docid_facet_values_readers.push(field_id_docid_facet_values); documents_readers.push(documents); @@ -514,6 +519,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_field_value_docids_readers, facet_field_value_docids_merge, ), + ( + DatabaseType::WordLevel0PositionDocids, + word_level_position_docids_readers, + word_level_position_docids_merge, + ), ] .into_par_iter() .for_each(|(dbtype, readers, merge)| { @@ -569,7 +579,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, 
&documents_ids)?; let mut database_count = 0; - let total_databases = 7; + let total_databases = 8; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -661,7 +671,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { )?; }, DatabaseType::FacetLevel0ValuesDocids => { - debug!("Writing the facet values docids into LMDB on disk..."); + debug!("Writing the facet level 0 values docids into LMDB on disk..."); let db = *self.index.facet_field_id_value_docids.as_polymorph(); write_into_lmdb_database( self.wtxn, @@ -671,6 +681,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, + DatabaseType::WordLevel0PositionDocids => { + debug!("Writing the word level 0 positions docids into LMDB on disk..."); + let db = *self.index.word_level_position_docids.as_polymorph(); + write_into_lmdb_database( + self.wtxn, + db, + content, + word_level_position_docids_merge, + write_method, + )?; + } } database_count += 1; @@ -693,6 +714,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the words positions update operation. + let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.facet_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.facet_min_level_size { + builder.min_level_size(value); + } + builder.execute()?; + // Run the words prefixes update operation. let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 0bd83b692..358552768 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -29,7 +29,8 @@ use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - facet_field_value_docids_merge, field_id_docid_facet_values_merge, + word_level_position_docids_merge, facet_field_value_docids_merge, + field_id_docid_facet_values_merge, }; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -43,6 +44,7 @@ pub struct Readers { pub word_docids: Reader, pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, + pub word_level_position_docids: Reader, pub facet_field_value_docids: Reader, pub field_id_docid_facet_values: Reader, pub documents: Reader, @@ -69,6 +71,7 @@ pub struct Store<'s, A> { main_sorter: Sorter, word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, + word_level_position_docids_sorter: Sorter, facet_field_value_docids_sorter: Sorter, field_id_docid_facet_values_sorter: Sorter, // MTBL writers @@ -94,7 +97,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { ) -> anyhow::Result { // We divide the max memory by the number of sorter the Store have. 
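// (A dedicated sorter for the word level position docids is introduced below,
// which is why the divisor grows from 4 to 5.)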
- let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); + let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let main_sorter = create_sorter( @@ -121,6 +124,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); + let word_level_position_docids_sorter = create_sorter( + word_level_position_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + max_memory, + ); let facet_field_value_docids_sorter = create_sorter( facet_field_value_docids_merge, chunk_compression_type, @@ -172,6 +183,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { main_sorter, word_docids_sorter, words_pairs_proximities_docids_sorter, + word_level_position_docids_sorter, facet_field_value_docids_sorter, field_id_docid_facet_values_sorter, // MTBL writers @@ -290,6 +302,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { self.documents_writer.insert(document_id.to_be_bytes(), record)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; + Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; words_positions.clear(); @@ -360,6 +373,42 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } + fn write_word_position_docids( + writer: &mut Sorter, + document_id: DocumentId, + words_positions: &HashMap>, + ) -> anyhow::Result<()> + { + let mut key_buffer = Vec::new(); + let mut data_buffer = Vec::new(); + + for (word, positions) in words_positions { + key_buffer.clear(); + key_buffer.extend_from_slice(word.as_bytes()); + key_buffer.push(0); // level 0 + + for position in positions { + key_buffer.truncate(word.len()); + let position_bytes = position.to_be_bytes(); + key_buffer.extend_from_slice(position_bytes.as_bytes()); + key_buffer.extend_from_slice(position_bytes.as_bytes()); + + data_buffer.clear(); + let positions = RoaringBitmap::from_iter(Some(document_id)); + // We serialize the positions into a buffer. 
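// The intended level 0 key built above is:
//   [word bytes][0u8 level byte][position as big-endian u32, written twice]
// i.e. a range covering exactly one position, matching StrLevelPositionCodec.
// Note that `truncate(word.len())` drops the level byte again; a later commit
// in this series ("Fix and improve the words-level-positions computation")
// corrects it to `word.len() + 1`.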
+ CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer) + .with_context(|| "could not serialize positions")?; + + // that we write under the generated key into MTBL + if lmdb_key_valid_size(&key_buffer) { + writer.insert(&key_buffer, &data_buffer)?; + } + } + } + + Ok(()) + } + fn write_facet_field_value_docids( sorter: &mut Sorter, iter: I, @@ -561,6 +610,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; + let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; + let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; @@ -570,6 +622,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; + let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; @@ -580,6 +633,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { word_docids, docid_word_positions, words_pairs_proximities_docids, + word_level_position_docids, facet_field_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index c2df94468..1fc4890fb 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,6 +6,7 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; +pub use self::words_level_positions::WordsLevelPositions; pub use self::words_prefixes::WordsPrefixes; mod available_documents_ids; @@ -16,5 +17,6 @@ mod index_documents; mod settings; mod update_builder; mod update_step; +mod words_level_positions; mod words_prefixes; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs new file mode 100644 index 000000000..983f82657 --- /dev/null +++ b/milli/src/update/words_level_positions.rs @@ -0,0 +1,184 @@ +use std::cmp; +use std::fs::File; +use std::num::NonZeroUsize; + +use grenad::{CompressionType, Reader, Writer, FileFuse}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesEncode, Error}; +use log::debug; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; +use crate::Index; +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; + +pub struct WordsLevelPositions<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) 
chunk_fusing_shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + _update_id: u64, +} + +impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> WordsLevelPositions<'t, 'u, 'i> + { + WordsLevelPositions { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + level_group_size: NonZeroUsize::new(4).unwrap(), + min_level_size: NonZeroUsize::new(5).unwrap(), + _update_id: update_id, + } + } + + pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); + self + } + + pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + self.min_level_size = value; + self + } + + pub fn execute(self) -> anyhow::Result<()> { + debug!("Computing and writing the word levels positions docids into LMDB on disk..."); + + clear_non_zero_levels_positions(self.wtxn, self.index.word_level_position_docids)?; + + let entries = compute_positions_levels( + self.wtxn, + self.index.word_level_position_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_field_id_value_docids.as_polymorph(), + entries, + |_, _| anyhow::bail!("invalid facet level merging"), + WriteMethod::GetMergePut, + )?; + + Ok(()) + } +} + +fn clear_non_zero_levels_positions( + wtxn: &mut heed::RwTxn, + db: heed::Database, +) -> heed::Result<()> +{ + let mut iter = db.iter_mut(wtxn)?.lazily_decode_data(); + while let Some(result) = iter.next() { + let ((_, level, _, _), _) = result?; + if level != 0 { + iter.del_current()?; + } + } + Ok(()) +} + +/// Generates all the words positions levels (including the level zero). +fn compute_positions_levels( + rtxn: &heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, +) -> anyhow::Result> +{ + // let first_level_size = db.prefix_iter(rtxn, &[field_id])? + // .remap_types::() + // .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + // // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // // therefore we write the facet levels entries into a grenad file before transfering them. + // let mut writer = tempfile::tempfile().and_then(|file| { + // create_writer(compression_type, compression_level, file) + // })?; + + // let level_0_range = { + // let left = (field_id, 0, T::min_value(), T::min_value()); + // let right = (field_id, 0, T::max_value(), T::max_value()); + // left..=right + // }; + + // // Groups sizes are always a power of the original level_group_size and therefore a group + // // always maps groups of the previous level and never splits previous levels groups in half. + // let group_size_iter = (1u8..) 
+ // .map(|l| (l, level_group_size.get().pow(l as u32))) + // .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + + // for (level, group_size) in group_size_iter { + // let mut left = T::zero(); + // let mut right = T::zero(); + // let mut group_docids = RoaringBitmap::new(); + + // let db = db.remap_key_type::(); + // for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + // let ((_field_id, _level, value, _right), docids) = result?; + + // if i == 0 { + // left = value; + // } else if i % group_size == 0 { + // // we found the first bound of the next group, we must store the left + // // and right bounds associated with the docids. + // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + + // // We save the left bound for the new group and also reset the docids. + // group_docids = RoaringBitmap::new(); + // left = value; + // } + + // // The right bound is always the bound we run through. + // group_docids.union_with(&docids); + // right = value; + // } + + // if !group_docids.is_empty() { + // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + // } + // } + + // writer_into_reader(writer, shrink_size) + + todo!() +} + +fn write_entry( + writer: &mut Writer, + field_id: u8, + level: u8, + left: T, + right: T, + ids: &RoaringBitmap, +) -> anyhow::Result<()> +where + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, +{ + let key = (field_id, level, left, right); + let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} From c765f277a3328be0bae4e9ae173de2fa61f23962 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:34:21 +0100 Subject: [PATCH 0620/1889] Introduce the WordsLevelPositions update --- milli/src/update/words_level_positions.rs | 117 +++++++++++----------- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 983f82657..0a7bc484d 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -3,12 +3,11 @@ use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::{ByteSlice, DecodeIgnore}; +use heed::types::DecodeIgnore; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::Index; use crate::update::index_documents::WriteMethod; @@ -69,12 +68,16 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.min_level_size, )?; + // The previously computed entries also defines the level 0 entries + // so we can clear the database and append all of these entries. + self.index.word_level_position_docids.clear(self.wtxn)?; + write_into_lmdb_database( self.wtxn, *self.index.facet_field_id_value_docids.as_polymorph(), entries, |_, _| anyhow::bail!("invalid facet level merging"), - WriteMethod::GetMergePut, + WriteMethod::Append, )?; Ok(()) @@ -107,77 +110,79 @@ fn compute_positions_levels( min_level_size: NonZeroUsize, ) -> anyhow::Result> { - // let first_level_size = db.prefix_iter(rtxn, &[field_id])? 
- // .remap_types::() - // .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(compression_type, compression_level, file) + })?; - // // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // // therefore we write the facet levels entries into a grenad file before transfering them. - // let mut writer = tempfile::tempfile().and_then(|file| { - // create_writer(compression_type, compression_level, file) - // })?; + for result in db.iter(rtxn)? { + let ((word, level, left, right), docids) = result?; - // let level_0_range = { - // let left = (field_id, 0, T::min_value(), T::min_value()); - // let right = (field_id, 0, T::max_value(), T::max_value()); - // left..=right - // }; + let first_level_size = db.remap_data_type::() + .prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))? + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - // // Groups sizes are always a power of the original level_group_size and therefore a group - // // always maps groups of the previous level and never splits previous levels groups in half. - // let group_size_iter = (1u8..) - // .map(|l| (l, level_group_size.get().pow(l as u32))) - // .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + let level_0_range = { + let left = (word, 0, u32::min_value(), u32::min_value()); + let right = (word, 0, u32::max_value(), u32::max_value()); + left..=right + }; - // for (level, group_size) in group_size_iter { - // let mut left = T::zero(); - // let mut right = T::zero(); - // let mut group_docids = RoaringBitmap::new(); + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - // let db = db.remap_key_type::(); - // for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { - // let ((_field_id, _level, value, _right), docids) = result?; + // As specified in the documentation, we also write the level 0 entries. + write_level_entry(&mut writer, word, level, left, right, &docids)?; - // if i == 0 { - // left = value; - // } else if i % group_size == 0 { - // // we found the first bound of the next group, we must store the left - // // and right bounds associated with the docids. - // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; + for (level, group_size) in group_size_iter { + let mut left = 0; + let mut right = 0; + let mut group_docids = RoaringBitmap::new(); - // // We save the left bound for the new group and also reset the docids. - // group_docids = RoaringBitmap::new(); - // left = value; - // } + for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + let ((_field_id, _level, value, _right), docids) = result?; - // // The right bound is always the bound we run through. - // group_docids.union_with(&docids); - // right = value; - // } + if i == 0 { + left = value; + } else if i % group_size == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. 
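// Worked example with the defaults (level_group_size = 4, min_level_size = 5):
// a word occurring at the 20 positions 0..=19 yields 20 level 0 entries, so
// level 1 gets five entries, (0..=3), (4..=7), (8..=11), (12..=15) and
// (16..=19), each holding the union of the docids of the level 0 entries it
// spans; level 2 is not built because 20 / 4^2 = 1 group is below
// min_level_size.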
+ write_level_entry(&mut writer, word, level, left, right, &group_docids)?; - // if !group_docids.is_empty() { - // write_entry::(&mut writer, field_id, level, left, right, &group_docids)?; - // } - // } + // We save the left bound for the new group and also reset the docids. + group_docids = RoaringBitmap::new(); + left = value; + } - // writer_into_reader(writer, shrink_size) + // The right bound is always the bound we run through. + group_docids.union_with(&docids); + right = value; + } - todo!() + if !group_docids.is_empty() { + write_level_entry(&mut writer, word, level, left, right, &group_docids)?; + } + } + } + + writer_into_reader(writer, shrink_size) } -fn write_entry( +fn write_level_entry( writer: &mut Writer, - field_id: u8, + word: &str, level: u8, - left: T, - right: T, + left: u32, + right: u32, ids: &RoaringBitmap, ) -> anyhow::Result<()> -where - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, { - let key = (field_id, level, left, right); - let key = KC::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = (word, level, left, right); + let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) From 3a25137ee42d1f6d98db6f9e569baae40cb1949f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 13:55:24 +0100 Subject: [PATCH 0621/1889] Expose and use the WordsLevelPositions update --- milli/src/update/index_documents/mod.rs | 17 +++++++++++++++++ milli/src/update/update_builder.rs | 20 +++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 8fc35b654..e7143bde0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -263,6 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: Option, words_prefix_threshold: Option, max_prefix_length: Option, + words_positions_level_group_size: Option, + words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, @@ -290,6 +292,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: None, words_prefix_threshold: None, max_prefix_length: None, + words_positions_level_group_size: None, + words_positions_min_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, autogenerate_docids: true, @@ -740,6 +744,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the words level positions update operation. 
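// The builder recomputes every level from the level 0 entries that were just
// merged into LMDB: it first streams the computed entries into a temporary
// grenad file (LMDB forbids iterating a database while writing to it), then
// clears the database and appends everything back in sorted order.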
+ let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + if let Some(value) = self.words_positions_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.words_positions_min_level_size { + builder.min_level_size(value); + } + builder.execute()?; + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index c966f72d2..9a4fb850e 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -2,7 +2,10 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets, WordsPrefixes}; +use super::{ + ClearDocuments, DeleteDocuments, IndexDocuments, Settings, + Facets, WordsPrefixes, WordsLevelPositions, +}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -150,4 +153,19 @@ impl<'a> UpdateBuilder<'a> { builder } + + pub fn words_level_positions<'t, 'u, 'i>( + self, + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordsLevelPositions<'t, 'u, 'i> + { + let mut builder = WordsLevelPositions::new(wtxn, index, self.update_id); + + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + + builder + } } From e8cc7f9cee818ecc18fff3c65f7b7566fb75a836 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:32:00 +0100 Subject: [PATCH 0622/1889] Expose a route in the http-ui to update the WordsLevelPositions --- http-ui/src/main.rs | 46 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b091985f3..dbf7aadce 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -229,6 +229,7 @@ enum UpdateMeta { Settings(Settings), Facets(Facets), WordsPrefixes(WordsPrefixes), + WordsLevelPositions(WordsLevelPositions), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -281,6 +282,22 @@ struct WordsPrefixes { max_prefix_length: Option, } +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +struct WordsLevelPositions { + level_group_size: Option, + min_level_size: Option, +} + +// Any value that is present is considered Some value, including null. +fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> +where T: Deserialize<'de>, + D: Deserializer<'de> +{ + Deserialize::deserialize(deserializer).map(Some) +} + #[tokio::main] async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); @@ -493,6 +510,21 @@ async fn main() -> anyhow::Result<()> { Ok(()) => wtxn.commit().map_err(Into::into), Err(e) => Err(e) } + }, + UpdateMeta::WordsLevelPositions(levels) => { + // We must use the write transaction of the update here. 
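// A request to the matching route (declared further below) might look like
// this, assuming http-ui listens on its default 127.0.0.1:9700 address:
//
//     curl -X POST 'http://127.0.0.1:9700/words-level-positions' \
//          -H 'Content-Type: application/json' \
//          -d '{"levelGroupSize": 4, "minLevelSize": 5}'
//
// Both fields are optional; an absent field keeps the builder's default.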
+ let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.words_level_positions(&mut wtxn, &index_cloned); + if let Some(value) = levels.level_group_size { + builder.level_group_size(value); + } + if let Some(value) = levels.min_level_size { + builder.min_level_size(value); + } + match builder.execute() { + Ok(()) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()) + } } }; @@ -923,6 +955,19 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); + let update_store_cloned = update_store.clone(); + let update_status_sender_cloned = update_status_sender.clone(); + let change_words_level_positions_route = warp::filters::method::post() + .and(warp::path!("words-level-positions")) + .and(warp::body::json()) + .map(move |levels: WordsLevelPositions| { + let meta = UpdateMeta::WordsLevelPositions(levels); + let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); + let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); + eprintln!("update {} registered", update_id); + warp::reply() + }); + let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); let abort_update_id_route = warp::filters::method::delete() @@ -998,6 +1043,7 @@ async fn main() -> anyhow::Result<()> { .or(change_settings_route) .or(change_facet_levels_route) .or(change_words_prefixes_route) + .or(change_words_level_positions_route) .or(update_ws_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; From 6b1b42b928685f468507f0c2fccd8ff6a2925e99 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 14:22:01 +0100 Subject: [PATCH 0623/1889] Introduce an infos wordsLevelPositionsDocids subcommand --- infos/src/main.rs | 61 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 6 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 356a5417c..e4d59c641 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -19,9 +19,10 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids"; const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; -const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids"; -const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; +const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; +const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; +const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; const DOCUMENTS_DB_NAME: &str = "documents"; const ALL_DATABASE_NAMES: &[&str] = &[ @@ -31,8 +32,9 @@ const ALL_DATABASE_NAMES: &[&str] = &[ DOCID_WORD_POSITIONS_DB_NAME, WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, - FACET_FIELD_ID_VALUE_DOCIDS_NAME, - FIELD_ID_DOCID_FACET_VALUES_NAME, + WORD_LEVEL_POSITION_DOCIDS_DB_NAME, + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, + FIELD_ID_DOCID_FACET_VALUES_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -114,6 +116,16 @@ enum Command { field_name: String, }, + /// Outputs a CSV with the documents ids along with the word level positions where it appears. + WordsLevelPositionsDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// The field name in the document. 
+ words: Vec, + }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -221,6 +233,9 @@ fn main() -> anyhow::Result<()> { FacetValuesDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, field_name) }, + WordsLevelPositionsDocids { full_display, words } => { + words_level_positions_docids(&index, &rtxn, !full_display, words) + }, DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -525,6 +540,40 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam Ok(wtr.flush()?) } +fn words_level_positions_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + words: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?; + + for word in words.iter().map(AsRef::as_ref) { + let range = { + let left = (word, 0, u32::min_value(), u32::min_value()); + let right = (word, u8::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + for result in index.word_level_position_docids.range(rtxn, &range)? { + let ((word, level, left, right), docids) = result?; + let level = level.to_string(); + let count = docids.len().to_string(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + let position_range = format!("{:?}", left..=right); + wtr.write_record(&[word, &level, &position_range, &count, &docids])?; + } + } + + Ok(wtr.flush()?) +} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -730,8 +779,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => index.facet_field_id_value_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_VALUES_DB_NAME => index.field_id_docid_facet_values.as_polymorph(), DOCUMENTS_DB_NAME => index.documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; From 3069bf4f4a3ad50a89a1573b49dec92c61107678 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 15:40:38 +0100 Subject: [PATCH 0624/1889] Fix and improve the words-level-positions computation --- infos/src/main.rs | 6 ++-- milli/src/update/index_documents/store.rs | 2 +- milli/src/update/words_level_positions.rs | 42 ++++++++--------------- 3 files changed, 20 insertions(+), 30 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index e4d59c641..c219c5758 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -558,7 +558,9 @@ fn words_level_positions_docids( left..=right }; for result in index.word_level_position_docids.range(rtxn, &range)? 
{ - let ((word, level, left, right), docids) = result?; + let ((w, level, left, right), docids) = result?; + if word != w { break } + let level = level.to_string(); let count = docids.len().to_string(); let docids = if debug { @@ -567,7 +569,7 @@ fn words_level_positions_docids( format!("{:?}", docids.iter().collect::>()) }; let position_range = format!("{:?}", left..=right); - wtr.write_record(&[word, &level, &position_range, &count, &docids])?; + wtr.write_record(&[w, &level, &position_range, &count, &docids])?; } } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 358552768..0f97476d9 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -388,7 +388,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { key_buffer.push(0); // level 0 for position in positions { - key_buffer.truncate(word.len()); + key_buffer.truncate(word.len() + 1); let position_bytes = position.to_be_bytes(); key_buffer.extend_from_slice(position_bytes.as_bytes()); key_buffer.extend_from_slice(position_bytes.as_bytes()); diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 0a7bc484d..77cec246a 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::num::NonZeroUsize; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::DecodeIgnore; +use heed::types::{DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; @@ -56,10 +56,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); - clear_non_zero_levels_positions(self.wtxn, self.index.word_level_position_docids)?; - let entries = compute_positions_levels( self.wtxn, + self.index.word_docids.remap_data_type::(), self.index.word_level_position_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -74,7 +73,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { write_into_lmdb_database( self.wtxn, - *self.index.facet_field_id_value_docids.as_polymorph(), + *self.index.word_level_position_docids.as_polymorph(), entries, |_, _| anyhow::bail!("invalid facet level merging"), WriteMethod::Append, @@ -84,25 +83,11 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { } } -fn clear_non_zero_levels_positions( - wtxn: &mut heed::RwTxn, - db: heed::Database, -) -> heed::Result<()> -{ - let mut iter = db.iter_mut(wtxn)?.lazily_decode_data(); - while let Some(result) = iter.next() { - let ((_, level, _, _), _) = result?; - if level != 0 { - iter.del_current()?; - } - } - Ok(()) -} - -/// Generates all the words positions levels (including the level zero). +/// Generates all the words positions levels based on the levels zero (including the level zero). fn compute_positions_levels( rtxn: &heed::RoTxn, - db: heed::Database, + words_db: heed::Database, + words_positions_db: heed::Database, compression_type: CompressionType, compression_level: Option, shrink_size: Option, @@ -116,11 +101,11 @@ fn compute_positions_levels( create_writer(compression_type, compression_level, file) })?; - for result in db.iter(rtxn)? { - let ((word, level, left, right), docids) = result?; + for result in words_db.iter(rtxn)? 
{ + let (word, ()) = result?; - let first_level_size = db.remap_data_type::() - .prefix_iter(rtxn, &(word, level, u32::min_value(), u32::min_value()))? + let first_level_size = words_positions_db.remap_data_type::() + .prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))? .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; let level_0_range = { @@ -136,14 +121,17 @@ fn compute_positions_levels( .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); // As specified in the documentation, we also write the level 0 entries. - write_level_entry(&mut writer, word, level, left, right, &docids)?; + for result in words_positions_db.range(rtxn, &level_0_range)? { + let ((word, level, left, right), docids) = result?; + write_level_entry(&mut writer, word, level, left, right, &docids)?; + } for (level, group_size) in group_size_iter { let mut left = 0; let mut right = 0; let mut group_docids = RoaringBitmap::new(); - for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { + for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { let ((_field_id, _level, value, _right), docids) = result?; if i == 0 { From f7138284066887cf3bc610b35b48c8f2393bb448 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 15:47:41 +0100 Subject: [PATCH 0625/1889] Implement the clear and delete documents for the word-level-positions database --- milli/src/update/clear_documents.rs | 1 + milli/src/update/delete_documents.rs | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 250e4b13a..6d7dd72b8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -56,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; + word_level_position_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b60b7bac2..f9303d339 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -330,6 +330,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the word level position docids. 
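// The key type is remapped to a raw byte slice below so that each entry can
// be rewritten in place with `put_current` without decoding and re-encoding
// the (word, level, left, right) key; only the docids bitmap changes.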
+ let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + drop(iter); + Ok(self.documents_ids.len()) } } From 8bd4f5d93ec5e212197a8e662a37431bfdf0c865 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 17 Mar 2021 16:09:18 +0100 Subject: [PATCH 0626/1889] Compute the biggest values of the words_level_positions_docids --- infos/src/main.rs | 18 +++++++++++++++--- milli/src/update/words_level_positions.rs | 10 +++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index c219c5758..2c11d3783 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -346,6 +346,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let docid_word_positions_name = "docid_word_positions"; let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; + let word_level_position_docids_name = "word_level_position_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -402,6 +403,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_level_position_docids.remap_data_type::().iter(rtxn)? { + let ((word, level, left, right), value) = result?; + let key = format!("{} {} {:?}", word, level, left..=right); + heap.push(Reverse((value.len(), key, word_level_position_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { @@ -549,7 +557,7 @@ fn words_level_positions_docids( { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?; + wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; for word in words.iter().map(AsRef::as_ref) { let range = { @@ -561,14 +569,18 @@ fn words_level_positions_docids( let ((w, level, left, right), docids) = result?; if word != w { break } - let level = level.to_string(); let count = docids.len().to_string(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = format!("{:?}", left..=right); + let position_range = if level == 0 { + format!("{:?}", left) + } else { + format!("{:?}", left..=right) + }; + let level = level.to_string(); wtr.write_record(&[w, &level, &position_range, &count, &docids])?; } } diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 77cec246a..a7be248b6 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -104,16 +104,16 @@ fn compute_positions_levels( for result in words_db.iter(rtxn)? { let (word, ()) = result?; - let first_level_size = words_positions_db.remap_data_type::() - .prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))? 
- .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_range = { let left = (word, 0, u32::min_value(), u32::min_value()); let right = (word, 0, u32::max_value(), u32::max_value()); left..=right }; + let first_level_size = words_positions_db.remap_data_type::() + .range(rtxn, &level_0_range)? + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. let group_size_iter = (1u8..) @@ -132,7 +132,7 @@ fn compute_positions_levels( let mut group_docids = RoaringBitmap::new(); for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { - let ((_field_id, _level, value, _right), docids) = result?; + let ((_word, _level, value, _right), docids) = result?; if i == 0 { left = value; From bd1a371c62cf7d1fb79b29c1b5ccde30d63aa0ca Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Mar 2021 15:41:44 +0100 Subject: [PATCH 0627/1889] Compute the WordsLevelPositions only once --- milli/src/update/index_documents/mod.rs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e7143bde0..3a41a52ae 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -718,19 +718,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; - // Run the words positions update operation. - let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - if let Some(value) = self.facet_level_group_size { - builder.level_group_size(value); - } - if let Some(value) = self.facet_min_level_size { - builder.min_level_size(value); - } - builder.execute()?; - // Run the words prefixes update operation. 
let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; From 89ee2cf576858398ee160a0ed54d6494aedcecfc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 18 Mar 2021 17:20:16 +0100 Subject: [PATCH 0628/1889] Introduce the TreeLevel struct --- infos/src/main.rs | 9 ++-- .../heed_codec/str_level_position_codec.rs | 13 +++-- milli/src/lib.rs | 2 + milli/src/tree_level.rs | 47 +++++++++++++++++++ milli/src/update/words_level_positions.rs | 11 +++-- 5 files changed, 67 insertions(+), 15 deletions(-) create mode 100644 milli/src/tree_level.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index 2c11d3783..0e6403d7b 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -5,7 +5,7 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; -use milli::Index; +use milli::{Index, TreeLevel}; use structopt::StructOpt; use Command::*; @@ -561,13 +561,12 @@ fn words_level_positions_docids( for word in words.iter().map(AsRef::as_ref) { let range = { - let left = (word, 0, u32::min_value(), u32::min_value()); - let right = (word, u8::max_value(), u32::max_value(), u32::max_value()); + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); left..=right }; for result in index.word_level_position_docids.range(rtxn, &range)? { let ((w, level, left, right), docids) = result?; - if word != w { break } let count = docids.len().to_string(); let docids = if debug { @@ -575,7 +574,7 @@ fn words_level_positions_docids( } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = if level == 0 { + let position_range = if level == TreeLevel::min_value() { format!("{:?}", left) } else { format!("{:?}", left..=right) diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs index c421c04b5..810e91940 100644 --- a/milli/src/heed_codec/str_level_position_codec.rs +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -1,12 +1,14 @@ use std::borrow::Cow; -use std::convert::TryInto; +use std::convert::{TryFrom, TryInto}; use std::mem::size_of; use std::str; +use crate::TreeLevel; + pub struct StrLevelPositionCodec; impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { - type DItem = (&'a str, u8, u32, u32); + type DItem = (&'a str, TreeLevel, u32, u32); fn bytes_decode(bytes: &'a [u8]) -> Option { let footer_len = size_of::() + size_of::() * 2; @@ -19,13 +21,14 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { let (level, bytes) = bytes.split_first()?; let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; + let level = TreeLevel::try_from(*level).ok()?; - Some((word, *level, left, right)) + Some((word, level, left, right)) } } impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { - type EItem = (&'a str, u8, u32, u32); + type EItem = (&'a str, TreeLevel, u32, u32); fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { let left = left.to_be_bytes(); @@ -33,7 +36,7 @@ impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); bytes.extend_from_slice(word.as_bytes()); - bytes.push(*level); + bytes.push((*level).into()); bytes.extend_from_slice(&left[..]); bytes.extend_from_slice(&right[..]); diff --git a/milli/src/lib.rs 
b/milli/src/lib.rs index de5c6511e..03169bce7 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -9,6 +9,7 @@ pub mod facet; pub mod heed_codec; pub mod index; pub mod proximity; +pub mod tree_level; pub mod update; use std::borrow::Cow; @@ -27,6 +28,7 @@ pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringB pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; +pub use self::tree_level::TreeLevel; pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs new file mode 100644 index 000000000..7ce2904e2 --- /dev/null +++ b/milli/src/tree_level.rs @@ -0,0 +1,47 @@ +use std::convert::TryFrom; +use std::fmt; + +/// This is just before the lowest printable character (space, sp, 32) +const MAX_VALUE: u8 = 31; + +#[derive(Debug, Copy, Clone)] +pub enum Error { + LevelTooHigh(u8), +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[repr(transparent)] +pub struct TreeLevel(u8); + +impl TreeLevel { + pub const fn max_value() -> TreeLevel { + TreeLevel(MAX_VALUE) + } + + pub const fn min_value() -> TreeLevel { + TreeLevel(0) + } +} + +impl Into for TreeLevel { + fn into(self) -> u8 { + self.0 + } +} + +impl TryFrom for TreeLevel { + type Error = Error; + + fn try_from(value: u8) -> Result { + match value { + 0..=MAX_VALUE => Ok(TreeLevel(value)), + _ => Err(Error::LevelTooHigh(value)), + } + } +} + +impl fmt::Display for TreeLevel { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.0) + } +} diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index a7be248b6..4286fc780 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,4 +1,5 @@ use std::cmp; +use std::convert::TryFrom; use std::fs::File; use std::num::NonZeroUsize; @@ -9,9 +10,9 @@ use log::debug; use roaring::RoaringBitmap; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; -use crate::Index; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::{Index, TreeLevel}; pub struct WordsLevelPositions<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -105,8 +106,8 @@ fn compute_positions_levels( let (word, ()) = result?; let level_0_range = { - let left = (word, 0, u32::min_value(), u32::min_value()); - let right = (word, 0, u32::max_value(), u32::max_value()); + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); left..=right }; @@ -117,7 +118,7 @@ fn compute_positions_levels( // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) + .map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32))) .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); // As specified in the documentation, we also write the level 0 entries. 
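[Editor's note, not part of the patch: a minimal sketch of how the new TreeLevel type is meant to round-trip, assuming only the TryFrom and Into impls shown in tree_level.rs above.]

    use std::convert::TryFrom;
    use milli::TreeLevel;

    let level = TreeLevel::try_from(3u8).unwrap(); // any value in 0..=31 is accepted
    let raw: u8 = level.into();                    // the exact byte written into the LMDB key
    assert_eq!(raw, 3);
    assert!(TreeLevel::try_from(32u8).is_err());   // 32 is the space character, rejected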
@@ -163,7 +164,7 @@ fn compute_positions_levels( fn write_level_entry( writer: &mut Writer, word: &str, - level: u8, + level: TreeLevel, left: u32, right: u32, ids: &RoaringBitmap, From 658f316511faf6f87d4d7733236887e80d5eef79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 23 Mar 2021 15:25:46 +0100 Subject: [PATCH 0629/1889] Introduce the Initial Criterion --- milli/Cargo.toml | 3 - milli/src/search/criteria/asc_desc.rs | 161 +++++++------------------ milli/src/search/criteria/attribute.rs | 88 ++++++-------- milli/src/search/criteria/fetcher.rs | 135 --------------------- milli/src/search/criteria/final.rs | 57 +++++++++ milli/src/search/criteria/initial.rs | 28 +++++ milli/src/search/criteria/mod.rs | 65 ++++------ milli/src/search/criteria/proximity.rs | 151 ++++++++--------------- milli/src/search/criteria/typo.rs | 66 ++++------ milli/src/search/criteria/words.rs | 56 +++------ milli/src/search/mod.rs | 9 +- 11 files changed, 286 insertions(+), 533 deletions(-) delete mode 100644 milli/src/search/criteria/fetcher.rs create mode 100644 milli/src/search/criteria/final.rs create mode 100644 milli/src/search/criteria/initial.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index eefdfa7d5..ef9c64b7b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,9 +57,6 @@ criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" -[build-dependencies] -fst = "0.4.5" - [features] default = [] diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 1dc186720..d2841d449 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -31,32 +31,10 @@ pub struct AscDesc<'t> { candidates: Box> + 't>, bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, - parent: Option>, + parent: Box, } impl<'t> AscDesc<'t> { - pub fn initial_asc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ) -> anyhow::Result - { - Self::initial(index, rtxn, query_tree, candidates, field_name, true) - } - - pub fn initial_desc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ) -> anyhow::Result - { - Self::initial(index, rtxn, query_tree, candidates, field_name, false) - } - pub fn asc( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -77,47 +55,6 @@ impl<'t> AscDesc<'t> { Self::new(index, rtxn, parent, field_name, false) } - fn initial( - index: &'t Index, - rtxn: &'t heed::RoTxn, - query_tree: Option, - candidates: Option, - field_name: String, - ascending: bool, - ) -> anyhow::Result - { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; - let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; - - let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; - let candidates = match &query_tree { - Some(qt) => { - let context = CriteriaBuilder::new(rtxn, index)?; - let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), &mut WordDerivationsCache::new())?; - if let Some(candidates) = candidates { - qt_candidates.intersect_with(&candidates); - } - qt_candidates - }, - None => candidates.unwrap_or(faceted_candidates.clone()), - }; - - Ok(AscDesc { - index, - rtxn, - field_name, - field_id, - facet_type, - ascending, - query_tree, - candidates: facet_ordered(index, rtxn, field_id, facet_type, ascending, candidates)?, - faceted_candidates, - bucket_candidates: 
RoaringBitmap::new(), - parent: None, - }) - } - fn new( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -141,7 +78,7 @@ impl<'t> AscDesc<'t> { candidates: Box::new(std::iter::empty()), faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, }) } } @@ -156,64 +93,56 @@ impl<'t> Criterion for AscDesc<'t> { match self.candidates.next().transpose()? { None => { - let query_tree = self.query_tree.take(); - let bucket_candidates = take(&mut self.bucket_candidates); - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - let candidates = match (&self.query_tree, candidates) { - (_, Some(mut candidates)) => { - candidates.intersect_with(&self.faceted_candidates); - candidates - }, - (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; - candidates.intersect_with(&self.faceted_candidates); - candidates - }, - (None, None) => take(&mut self.faceted_candidates), - }; - if bucket_candidates.is_empty() { - self.bucket_candidates.union_with(&candidates); - } else { - self.bucket_candidates.union_with(&bucket_candidates); - } - self.candidates = facet_ordered( - self.index, - self.rtxn, - self.field_id, - self.facet_type, - self.ascending, - candidates, - )?; + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates_is_some = candidates.is_some(); + self.query_tree = query_tree; + let candidates = match (&self.query_tree, candidates) { + (_, Some(mut candidates)) => { + candidates.intersect_with(&self.faceted_candidates); + candidates }, - None => return Ok(None), - } - }, - None => if query_tree.is_none() && bucket_candidates.is_empty() { - return Ok(None) - }, - } + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (None, None) => take(&mut self.faceted_candidates), + }; - return Ok(Some(CriterionResult { - query_tree, - candidates: Some(RoaringBitmap::new()), - bucket_candidates, - })); + // If our parent returns candidates it means that the bucket + // candidates were already computed before and we can use them. + // + // If not, we must use the just computed candidates as our bucket + // candidates. 
+ if candidates_is_some { + self.bucket_candidates.union_with(&bucket_candidates); + } else { + self.bucket_candidates.union_with(&candidates); + } + + if candidates.is_empty() { + continue; + } + + self.candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.facet_type, + self.ascending, + candidates, + )?; + }, + None => return Ok(None), + } }, Some(candidates) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 7f8b5c622..6398c7d87 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -14,36 +14,19 @@ pub struct Attribute<'t> { query_tree: Option, candidates: Option, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, flattened_query_tree: Option>>, current_buckets: Option>, } impl<'t> Attribute<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Attribute { - ctx, - query_tree, - candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - flattened_query_tree: None, - current_buckets: None, - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Attribute { ctx, query_tree: None, candidates: None, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, flattened_query_tree: None, current_buckets: None, } @@ -63,34 +46,35 @@ impl<'t> Criterion for Attribute<'t> { })); }, (Some(qt), Some(candidates)) => { - let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| flatten_query_tree(&qt)); - let current_buckets = if let Some(current_buckets) = self.current_buckets.as_mut() { - current_buckets - } else { - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; - self.current_buckets.get_or_insert(new_buckets.into_iter()) + let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| { + flatten_query_tree(&qt) + }); + + let current_buckets = match self.current_buckets.as_mut() { + Some(current_buckets) => current_buckets, + None => { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }, }; - let found_candidates = if let Some((_score, candidates)) = current_buckets.next() { - candidates - } else { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); + let found_candidates = match current_buckets.next() { + Some((_score, candidates)) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, }; + candidates.difference_with(&found_candidates); - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(found_candidates), - bucket_candidates: bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (Some(qt), None) => { @@ -106,18 
+90,20 @@ impl<'t> Criterion for Attribute<'t> { })); }, (None, None) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - self.flattened_query_tree = None; - self.current_buckets = None; - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree; + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); + self.flattened_query_tree = None; + self.current_buckets = None; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs deleted file mode 100644 index fa204bdf2..000000000 --- a/milli/src/search/criteria/fetcher.rs +++ /dev/null @@ -1,135 +0,0 @@ -use std::collections::HashMap; -use std::mem::take; - -use log::debug; -use roaring::RoaringBitmap; - -use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; - -/// The result of a call to the fetcher. -#[derive(Debug, Clone, PartialEq)] -pub struct FetcherResult { - /// The query tree corresponding to the current bucket of the last criterion. - pub query_tree: Option, - /// The candidates of the current bucket of the last criterion. - pub candidates: RoaringBitmap, - /// Candidates that comes from the current bucket of the initial criterion. 
- pub bucket_candidates: RoaringBitmap, -} - -pub struct Fetcher<'t> { - ctx: &'t dyn Context, - query_tree: Option, - candidates: Candidates, - parent: Option>, - should_get_documents_ids: bool, - wdcache: WordDerivationsCache, -} - -impl<'t> Fetcher<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Fetcher { - ctx, - query_tree, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - parent: None, - should_get_documents_ids: true, - wdcache: WordDerivationsCache::new(), - } - } - - pub fn new( - ctx: &'t dyn Context, - parent: Box, - ) -> Self - { - Fetcher { - ctx, - query_tree: None, - candidates: Candidates::default(), - parent: Some(parent), - should_get_documents_ids: true, - wdcache: WordDerivationsCache::new(), - } - } - - #[logging_timer::time("Fetcher::{}")] - pub fn next(&mut self) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; - loop { - debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", - self.should_get_documents_ids, self.candidates, - ); - - let should_get_documents_ids = take(&mut self.should_get_documents_ids); - match &mut self.candidates { - Allowed(_) => { - let candidates = take(&mut self.candidates).into_inner(); - let candidates = match &self.query_tree { - Some(qt) if should_get_documents_ids => { - let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?; - docids.intersect_with(&candidates); - docids - }, - _ => candidates, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.take(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - Forbidden(_) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(&mut self.wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, - (None, None) => RoaringBitmap::new(), - }; - - return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates })) - }, - None => if should_get_documents_ids { - let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.clone(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - } - }, - None => if should_get_documents_ids { - let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - - return Ok(Some(FetcherResult { - query_tree: self.query_tree.clone(), - candidates: candidates.clone(), - bucket_candidates: candidates, - })); - }, - } - return Ok(None); - }, - } - } - } -} diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs new file mode 100644 index 000000000..fe224ef94 --- /dev/null +++ b/milli/src/search/criteria/final.rs @@ -0,0 +1,57 @@ +use std::collections::HashMap; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; +use super::{resolve_query_tree, Criterion, CriterionResult, Context}; + +/// The result of a call to the fetcher. 
+#[derive(Debug, Clone, PartialEq)] +pub struct FinalResult { + /// The query tree corresponding to the current bucket of the last criterion. + pub query_tree: Option<Operation>, + /// The candidates of the current bucket of the last criterion. + pub candidates: RoaringBitmap, + /// Candidates that come from the current bucket of the initial criterion. + pub bucket_candidates: RoaringBitmap, +} + +pub struct Final<'t> { + ctx: &'t dyn Context, + parent: Box<dyn Criterion + 't>, + wdcache: WordDerivationsCache, +} + +impl<'t> Final<'t> { + pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Final<'t> { + Final { ctx, parent, wdcache: WordDerivationsCache::new() } + } + + #[logging_timer::time("Final::{}")] + pub fn next(&mut self) -> anyhow::Result<Option<FinalResult>> { + loop { + debug!("Final iteration"); + + match self.parent.next(&mut self.wdcache)? { + Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, + }; + + bucket_candidates.union_with(&candidates); + + return Ok(Some(FinalResult { + query_tree, + candidates, + bucket_candidates, + })); + }, + None => return Ok(None), + } + } + } +} diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs new file mode 100644 index 000000000..d4b9e1379 --- /dev/null +++ b/milli/src/search/criteria/initial.rs @@ -0,0 +1,28 @@ +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; + +use super::{Criterion, CriterionResult}; + +pub struct Initial { + answer: Option<CriterionResult> +} + +impl Initial { + pub fn new(query_tree: Option<Operation>, mut candidates: Option<RoaringBitmap>) -> Initial { + let answer = CriterionResult { + query_tree, + candidates: candidates.clone(), + bucket_candidates: candidates.take().unwrap_or_default(), + }; + Initial { answer: Some(answer) } + } +} + +impl Criterion for Initial { + #[logging_timer::time("Initial::{}")] + fn next(&mut self, _: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { + Ok(self.answer.take()) + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1d7026d71..5e75be6ce 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -8,19 +8,21 @@ use crate::search::{word_derivations, WordDerivationsCache}; use crate::{Index, DocumentId}; use super::query_tree::{Operation, Query, QueryKind}; +use self::asc_desc::AscDesc; +use self::attribute::Attribute; +use self::r#final::Final; +use self::initial::Initial; +use self::proximity::Proximity; use self::typo::Typo; use self::words::Words; -use self::asc_desc::AscDesc; -use self::proximity::Proximity; -use self::attribute::Attribute; -use self::fetcher::Fetcher; +mod asc_desc; +mod attribute; +mod initial; +mod proximity; mod typo; mod words; -mod asc_desc; -mod proximity; -mod attribute; -pub mod fetcher; +pub mod r#final; pub trait Criterion { fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>>; @@ -61,6 +63,7 @@ impl Default for Candidates { Self::Forbidden(RoaringBitmap::new()) } } + pub trait Context { fn documents_ids(&self) -> heed::Result<RoaringBitmap>; fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; @@ -128,44 +131,26 @@ impl<'t> CriteriaBuilder<'t> { pub fn build( &'t self, - mut query_tree: Option<Operation>, - mut facet_candidates: Option<RoaringBitmap>, + query_tree: Option<Operation>, + facet_candidates: Option<RoaringBitmap>, ) -> 
anyhow::Result> { use crate::criterion::Criterion as Name; - let mut criterion = None as Option>; + let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? { - criterion = Some(match criterion.take() { - Some(father) => match name { - Name::Typo => Box::new(Typo::new(self, father)), - Name::Words => Box::new(Words::new(self, father)), - Name::Proximity => Box::new(Proximity::new(self, father)), - Name::Attribute => Box::new(Attribute::new(self, father)), - Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), - Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), - _otherwise => father, - }, - None => match name { - Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), - Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), - Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), - Name::Attribute => Box::new(Attribute::initial(self, query_tree.take(), facet_candidates.take())), - Name::Asc(field) => { - Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) - }, - Name::Desc(field) => { - Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) - }, - _otherwise => continue, - }, - }); + criterion = match name { + Name::Typo => Box::new(Typo::new(self, criterion)), + Name::Words => Box::new(Words::new(self, criterion)), + Name::Proximity => Box::new(Proximity::new(self, criterion)), + Name::Attribute => Box::new(Attribute::new(self, criterion)), + Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), + Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), + _otherwise => criterion, + }; } - match criterion { - Some(criterion) => Ok(Fetcher::new(self, criterion)), - None => Ok(Fetcher::initial(self, query_tree, facet_candidates)), - } + Ok(Final::new(self, criterion)) } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index decd4c338..dc1daafb2 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,48 +8,29 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; +use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; pub struct Proximity<'t> { ctx: &'t dyn Context, - query_tree: Option<(usize, Operation)>, + /// ((max_proximity, query_tree), allowed_candidates) + state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, proximity: u8, - candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: Cache, plane_sweep_cache: Option>, } impl<'t> Proximity<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Proximity { - ctx, - query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), - proximity: 0, - candidates: candidates.map_or_else(Candidates::default, 
Candidates::Allowed), - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: Cache::new(), - plane_sweep_cache: None, - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Proximity { ctx, - query_tree: None, + state: None, proximity: 0, - candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent: parent, candidates_cache: Cache::new(), plane_sweep_cache: None, } @@ -59,27 +40,20 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; loop { - debug!("Proximity at iteration {} (max {:?}) ({:?})", + debug!("Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, - self.query_tree.as_ref().map(|(mp, _)| mp), - self.candidates, + self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), + self.state.as_ref().map(|(_, cd)| cd), ); - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: Some(take(&mut self.candidates).into_inner()), - bucket_candidates: take(&mut self.bucket_candidates), - })); + match &mut self.state { + Some((_, candidates)) if candidates.is_empty() => { + self.state = None; // reset state }, - (Some((max_prox, query_tree)), Allowed(candidates)) => { + Some((Some((max_prox, query_tree)), candidates)) => { if self.proximity as usize > *max_prox { - // reset state to (None, Forbidden(_)) - self.query_tree = None; - self.candidates = Candidates::default(); + self.state = None; // reset state } else { let mut new_candidates = if candidates.len() <= 1000 { if let Some(cache) = self.plane_sweep_cache.as_mut() { @@ -89,9 +63,7 @@ impl<'t> Criterion for Proximity<'t> { candidates }, None => { - // reset state to (None, Forbidden(_)) - self.query_tree = None; - self.candidates = Candidates::default(); + self.state = None; // reset state continue }, } @@ -120,79 +92,54 @@ impl<'t> Criterion for Proximity<'t> { candidates.difference_with(&new_candidates); self.proximity += 1; - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, - (Some((max_prox, query_tree)), Forbidden(candidates)) => { - if self.proximity as usize > *max_prox { - self.query_tree = None; - self.candidates = Candidates::default(); - } else { - let mut new_candidates = resolve_candidates( - self.ctx, - &query_tree, - self.proximity, - &mut self.candidates_cache, - wdcache, - )?; - - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.proximity += 1; - - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - bucket_candidates, - })); - } - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); + Some((None, candidates)) => { + let candidates = take(candidates); + self.state = None; // reset state return Ok(Some(CriterionResult { query_tree: None, candidates: 
Some(candidates.clone()), bucket_candidates: candidates, })); }, - (None, Forbidden(_)) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, - (None, None) => RoaringBitmap::new(), - }; + None => { + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates_is_some = candidates.is_some(); + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (None, None) => RoaringBitmap::new(), + }; - if bucket_candidates.is_empty() { - self.bucket_candidates.union_with(&candidates); - } else { - self.bucket_candidates.union_with(&bucket_candidates); - } - - self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); - self.proximity = 0; - self.candidates = Candidates::Allowed(candidates); - self.plane_sweep_cache = None; - }, - None => return Ok(None), + // If our parent returns candidates it means that the bucket + // candidates were already computed before and we can use them. + // + // If not, we must use the just computed candidates as our bucket + // candidates. + if candidates_is_some { + self.bucket_candidates.union_with(&bucket_candidates); + } else { + self.bucket_candidates.union_with(&candidates); } + + let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); + self.state = Some((query_tree, candidates)); + self.proximity = 0; + self.plane_sweep_cache = None; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 3877f53ed..40b06afc4 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -14,28 +14,11 @@ pub struct Typo<'t> { number_typos: u8, candidates: Candidates, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Typo<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Typo { - ctx, - query_tree: query_tree.map(|op| (maximum_typo(&op), op)), - number_typos: 0, - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: HashMap::new(), - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Typo { ctx, @@ -43,7 +26,7 @@ impl<'t> Typo<'t> { number_typos: 0, candidates: Candidates::default(), bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: HashMap::new(), } } @@ -90,15 +73,10 @@ impl<'t> Criterion for Typo<'t> { candidates.difference_with(&new_candidates); self.number_typos += 1; - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, @@ -145,17 +123,19 
@@ impl<'t> Criterion for Typo<'t> { })); }, (None, Forbidden(_)) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); - self.number_typos = 0; - self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); + self.number_typos = 0; + self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } @@ -334,8 +314,8 @@ fn resolve_candidates<'t>( #[cfg(test)] mod test { - use super::*; + use super::super::initial::Initial; use super::super::test::TestContext; #[test] @@ -345,7 +325,8 @@ mod test { let facet_candidates = None; let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, query_tree, facet_candidates); + let parent = Initial::new(query_tree, facet_candidates); + let mut criteria = Typo::new(&context, Box::new(parent)); assert!(criteria.next(&mut wdcache).unwrap().is_none()); } @@ -364,7 +345,8 @@ mod test { let facet_candidates = None; let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); + let parent = Initial::new(Some(query_tree), facet_candidates); + let mut criteria = Typo::new(&context, Box::new(parent)); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() @@ -413,7 +395,8 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); + let parent = Initial::new(query_tree, Some(facet_candidates.clone())); + let mut criteria = Typo::new(&context, Box::new(parent)); let expected = CriterionResult { query_tree: None, @@ -442,7 +425,8 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); let mut wdcache = WordDerivationsCache::new(); - let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); + let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); + let mut criteria = Typo::new(&context, Box::new(parent)); let candidates_1 = context.word_docids("split").unwrap().unwrap() & context.word_docids("this").unwrap().unwrap() diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 0aa3b483a..5bb9d8d90 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -12,34 +12,18 @@ pub struct Words<'t> { query_trees: Vec, candidates: Option, bucket_candidates: RoaringBitmap, - parent: Option>, + parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } impl<'t> Words<'t> { - pub fn initial( - ctx: &'t dyn Context, - query_tree: Option, - candidates: Option, - ) -> Self - { - Words { - ctx, - query_trees: 
query_tree.map(explode_query_tree).unwrap_or_default(), - candidates, - bucket_candidates: RoaringBitmap::new(), - parent: None, - candidates_cache: HashMap::default(), - } - } - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { Words { ctx, query_trees: Vec::default(), candidates: None, bucket_candidates: RoaringBitmap::new(), - parent: Some(parent), + parent, candidates_cache: HashMap::default(), } } @@ -65,27 +49,17 @@ impl<'t> Criterion for Words<'t> { found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), - }; - return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: Some(found_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (Some(qt), None) => { - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => RoaringBitmap::new(), - }; - return Ok(Some(CriterionResult { query_tree: Some(qt), candidates: None, - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); }, (None, Some(_)) => { @@ -97,16 +71,18 @@ impl<'t> Criterion for Words<'t> { })); }, (None, None) => { - match self.parent.as_mut() { - Some(parent) => { - match parent.next(wdcache)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - }, - None => return Ok(None), - } + match self.parent.next(wdcache)? { + Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates: None, + bucket_candidates, + })); + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); + self.candidates = candidates; + self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 174fff35c..4f0bde422 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -13,9 +13,8 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; - -use crate::search::criteria::fetcher::{Fetcher, FetcherResult}; -use crate::{DocumentId, Index}; +use crate::search::criteria::r#final::{Final, FinalResult}; +use crate::{Index, DocumentId}; pub use self::facet::{ FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, @@ -162,14 +161,14 @@ impl<'a> Search<'a> { &self, mut distinct: impl for<'c> Distinct<'c>, matching_words: MatchingWords, - mut criteria: Fetcher, + mut criteria: Final, ) -> anyhow::Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_documents = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { + while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next()? 
{ debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_documents); From 7aa5753ed282afd2df90f1fae07beb2a1b8eeb68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Mar 2021 15:06:54 +0100 Subject: [PATCH 0630/1889] Make the attribute positions range bounds to be fixed --- http-ui/src/main.rs | 6 +-- milli/src/update/index_documents/mod.rs | 6 +-- milli/src/update/words_level_positions.rs | 47 +++++++++++++++-------- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index dbf7aadce..c85bd9b15 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; use std::net::SocketAddr; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -286,8 +286,8 @@ struct WordsPrefixes { #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] struct WordsLevelPositions { - level_group_size: Option, - min_level_size: Option, + level_group_size: Option, + min_level_size: Option, } // Any value that is present is considered Some value, including null. diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3a41a52ae..7a2196481 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; -use std::num::NonZeroUsize; +use std::num::{NonZeroU32, NonZeroUsize}; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -263,8 +263,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { facet_min_level_size: Option, words_prefix_threshold: Option, max_prefix_length: Option, - words_positions_level_group_size: Option, - words_positions_min_level_size: Option, + words_positions_level_group_size: Option, + words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, update_format: UpdateFormat, autogenerate_docids: bool, diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 4286fc780..eb8d3bb3c 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,7 +1,7 @@ use std::cmp; use std::convert::TryFrom; use std::fs::File; -use std::num::NonZeroUsize; +use std::num::NonZeroU32; use grenad::{CompressionType, Reader, Writer, FileFuse}; use heed::types::{DecodeIgnore, Str}; @@ -20,8 +20,8 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, _update_id: u64, } @@ -38,18 +38,18 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { chunk_compression_type: CompressionType::None, chunk_compression_level: None, chunk_fusing_shrink_size: None, - level_group_size: NonZeroUsize::new(4).unwrap(), - min_level_size: NonZeroUsize::new(5).unwrap(), + level_group_size: NonZeroU32::new(4).unwrap(), + min_level_size: NonZeroU32::new(5).unwrap(), _update_id: update_id, } } - pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 
2)).unwrap(); + pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { + self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); self } - pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { + pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { self.min_level_size = value; self } @@ -84,6 +84,20 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { } } +/// Returns the next number after or equal to `x` that is divisible by `d`. +fn next_divisible(x: u32, d: u32) -> u32 { + (x.saturating_sub(1) | (d - 1)) + 1 +} + +/// Returns the previous number before or equal to `x` that is divisible by `d`, +/// saturates on zero. +fn previous_divisible(x: u32, d: u32) -> u32 { + match x.checked_sub(d - 1) { + Some(0) | None => 0, + Some(x) => next_divisible(x, d), + } +} + /// Generates all the words positions levels based on the level zero (including the level zero). fn compute_positions_levels( rtxn: &heed::RoTxn, @@ -92,8 +106,8 @@ fn compute_positions_levels( compression_type: CompressionType, compression_level: Option<u32>, shrink_size: Option<u64>, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, ) -> anyhow::Result<Reader<FileFuse>> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB @@ -113,7 +127,7 @@ fn compute_positions_levels( let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>() .range(rtxn, &level_0_range)? - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. @@ -136,20 +150,23 @@ fn compute_positions_levels( let ((_word, _level, value, _right), docids) = result?; if i == 0 { - left = value; - } else if i % group_size == 0 { + left = previous_divisible(value, group_size); + right = left + (group_size - 1); + } + + if value > right { // we found the first bound of the next group, we must store the left // and right bounds associated with the docids. write_level_entry(&mut writer, word, level, left, right, &group_docids)?; // We save the left bound for the new group and also reset the docids. group_docids = RoaringBitmap::new(); - left = value; + left = previous_divisible(value, group_size); + right = left + (group_size - 1); } // The right bound is always the bound we run through.
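[Editor's note, not part of the patch: a worked example of the fixed bounds computed above, following next_divisible and previous_divisible as written. With group_size = 4 (the default), a level 0 position value of 5 gives left = previous_divisible(5, 4) = 4 and right = 4 + (4 - 1) = 7, so a group always covers the fixed window [4k, 4k + 3] no matter which positions actually occur, which is what makes the bounds deterministic across updates.]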
group_docids.union_with(&docids); - right = value; } if !group_docids.is_empty() { From 0ad9499b935db85347323f14b2144c1c6a45d924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 24 Mar 2021 15:37:03 +0100 Subject: [PATCH 0631/1889] Fix an indexing bug in the words level positions --- milli/src/update/words_level_positions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index eb8d3bb3c..70bc89860 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -121,7 +121,7 @@ fn compute_positions_levels( let level_0_range = { let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value()); left..=right }; From ab92c814c3247b03bf2447ef5a346688a80cef95 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 24 Mar 2021 18:20:13 +0100 Subject: [PATCH 0632/1889] Fix attributes score --- milli/src/search/criteria/attribute.rs | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 6398c7d87..160807847 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -122,7 +122,8 @@ fn linear_compute_candidates( fn compute_candidate_rank(branches: &Vec<Vec<Query>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { - let mut branch_rank = 0; + let branch_len = branch.len(); + let mut branch_rank = Vec::with_capacity(branch_len); for Query { prefix, kind } in branch { // find the best position of the current word in the document. let position = match kind { @@ -145,13 +146,21 @@ fn linear_compute_candidates( // if a position is found, we add it to the branch score, // otherwise the branch is considered as unfindable in this document and we break. if let Some(position) = position { - branch_rank += position as u64; + branch_rank.push(position as u64); } else { - branch_rank = u64::max_value(); + branch_rank.clear(); break; } } - min_rank = min_rank.min(branch_rank); + + if !branch_rank.is_empty() { + branch_rank.sort_unstable(); + // because several words in the same query can't all match at position 0, + // we subtract the word index from the position.
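[Editor's note, not part of the patch: a worked example of the rank computed below. If a three-word branch matches at positions 3, 4 and 5, the sorted ranks minus their indexes are 3 - 0, 4 - 1 and 5 - 2, i.e. 3 each, so branch_rank is 9 and the mean is 9 / 3 = 3, the same score as a consecutive phrase starting at position 3 rather than the raw sum 12.]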
+ let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); + // here we compute the mean of the positions of the words in the branch + min_rank = min_rank.min(branch_rank / branch_len as u64); + } } min_rank From e65bad16ccd273625a624d44bde06e01eaf08bdb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 11:10:12 +0100 Subject: [PATCH 0633/1889] Compute the words prefixes at the end of an update --- http-ui/src/main.rs | 68 ------ infos/src/main.rs | 1 + milli/src/index.rs | 6 +- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 16 ++ .../update/index_documents/merge_function.rs | 4 + milli/src/update/index_documents/mod.rs | 37 +++- milli/src/update/mod.rs | 9 +- milli/src/update/update_builder.rs | 35 +--- milli/src/update/word_prefix_docids.rs | 75 +++++++ .../word_prefix_pair_proximity_docids.rs | 89 ++++++++ milli/src/update/words_level_positions.rs | 90 ++++++-- milli/src/update/words_prefixes.rs | 196 ------------------ milli/src/update/words_prefixes_fst.rs | 104 ++++++++++ 14 files changed, 409 insertions(+), 323 deletions(-) create mode 100644 milli/src/update/word_prefix_docids.rs create mode 100644 milli/src/update/word_prefix_pair_proximity_docids.rs delete mode 100644 milli/src/update/words_prefixes.rs create mode 100644 milli/src/update/words_prefixes_fst.rs diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index c85bd9b15..00618f58a 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -228,8 +228,6 @@ enum UpdateMeta { ClearDocuments, Settings(Settings), Facets(Facets), - WordsPrefixes(WordsPrefixes), - WordsLevelPositions(WordsLevelPositions), } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -290,14 +288,6 @@ struct WordsLevelPositions { min_level_size: Option, } -// Any value that is present is considered Some value, including null. -fn deserialize_some<'de, T, D>(deserializer: D) -> Result, D::Error> -where T: Deserialize<'de>, - D: Deserializer<'de> -{ - Deserialize::deserialize(deserializer).map(Some) -} - #[tokio::main] async fn main() -> anyhow::Result<()> { let opt = Opt::from_args(); @@ -496,36 +486,6 @@ async fn main() -> anyhow::Result<()> { Err(e) => Err(e) } } - UpdateMeta::WordsPrefixes(settings) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned); - if let Some(value) = settings.threshold { - builder.threshold(value); - } - if let Some(value) = settings.max_prefix_length { - builder.max_prefix_length(value); - } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) - } - }, - UpdateMeta::WordsLevelPositions(levels) => { - // We must use the write transaction of the update here.
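Patch 0631 above is a one-word fix, but an easy one to miss: the keys of the word level position database sort lexicographically as `(word, level, left, right)` tuples, so ending `level_0_range` at `TreeLevel::max_value()` swept every higher level into what was supposed to be a level-0 scan. A hypothetical miniature of that keyspace with a `BTreeMap`, plain `u8` standing in for `TreeLevel`, shows the difference:

```rust
use std::collections::BTreeMap;

fn main() {
    // Keys ordered like the LMDB entries: (word, level, left, right).
    let mut db = BTreeMap::new();
    db.insert(("fish", 0u8, 0u32, 0u32), "level 0");
    db.insert(("fish", 0, 7, 7), "level 0");
    db.insert(("fish", 1, 0, 3), "level 1 group");

    // Old upper bound: the maximum level also sweeps in the group entries.
    let broken = db
        .range(("fish", 0, 0, 0)..=("fish", u8::MAX, u32::MAX, u32::MAX))
        .count();
    assert_eq!(broken, 3);

    // Fixed upper bound: both ends pinned to the minimum level.
    let fixed = db
        .range(("fish", 0, 0, 0)..=("fish", 0, u32::MAX, u32::MAX))
        .count();
    assert_eq!(fixed, 2);
}
```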
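Patch 0632 above then replaces the plain sum of best positions with a sorted, index-adjusted mean. A standalone sketch of the fixed scoring, with made-up positions:

```rust
/// Sketch of the rank computed by the fix above: the best position of
/// every word in the branch is sorted, the word index is subtracted
/// (n words can never all sit at position 0), and the mean is taken
/// so longer branches aren't penalized for carrying more words.
fn branch_rank(mut positions: Vec<u64>) -> u64 {
    let branch_len = positions.len() as u64;
    positions.sort_unstable();
    let sum: u64 = positions
        .into_iter()
        .enumerate()
        .map(|(i, r)| r - i as u64)
        .sum();
    sum / branch_len
}

fn main() {
    // Three words found at positions 3, 4 and 6:
    // (3 - 0) + (4 - 1) + (6 - 2) = 10, and 10 / 3 = 3 in integer math.
    assert_eq!(branch_rank(vec![4, 3, 6]), 3);
    // A single word at position 5 simply ranks 5.
    assert_eq!(branch_rank(vec![5]), 5);
}
```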
- let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.words_level_positions(&mut wtxn, &index_cloned); - if let Some(value) = levels.level_group_size { - builder.level_group_size(value); - } - if let Some(value) = levels.min_level_size { - builder.min_level_size(value); - } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()) - } - } }; let meta = match result { @@ -942,32 +902,6 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let change_words_prefixes_route = warp::filters::method::post() - .and(warp::path!("words-prefixes")) - .and(warp::body::json()) - .map(move |settings: WordsPrefixes| { - let meta = UpdateMeta::WordsPrefixes(settings); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - warp::reply() - }); - - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let change_words_level_positions_route = warp::filters::method::post() - .and(warp::path!("words-level-positions")) - .and(warp::body::json()) - .map(move |levels: WordsLevelPositions| { - let meta = UpdateMeta::WordsLevelPositions(levels); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - warp::reply() - }); - let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); let abort_update_id_route = warp::filters::method::delete() @@ -1042,8 +976,6 @@ async fn main() -> anyhow::Result<()> { .or(clearing_route) .or(change_settings_route) .or(change_facet_levels_route) - .or(change_words_prefixes_route) - .or(change_words_level_positions_route) .or(update_ws_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; diff --git a/infos/src/main.rs b/infos/src/main.rs index 0e6403d7b..e730a8b43 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -338,6 +338,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho facet_field_id_value_docids, field_id_docid_facet_values: _, documents, + .. } = index; let main_name = "main"; diff --git a/milli/src/index.rs b/milli/src/index.rs index 0659b207a..ba7747250 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -54,6 +54,8 @@ pub struct Index { pub word_prefix_pair_proximity_docids: Database, /// Maps the word, level and position range with the docids that corresponds to it. pub word_level_position_docids: Database, + /// Maps the level positions of a word prefix with all the docids where this prefix appears. + pub word_prefix_level_position_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, /// Maps the document id, the facet field id and the globally ordered value. 
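The `Index::new` hunk just below also explains the `max_dbs(10)` to `max_dbs(11)` bump: LMDB requires the number of named databases to be declared before the environment is opened, so every new database costs one slot. A minimal sketch of that pattern, assuming the heed 0.x API used throughout this series:

```rust
use std::path::Path;

use heed::types::{ByteSlice, Str};
use heed::{Database, EnvOpenOptions};

fn open_index(path: &Path) -> heed::Result<()> {
    // LMDB must know how many named databases exist up front, hence the
    // bump to max_dbs(11) when word-prefix-level-position-docids is added.
    let mut options = EnvOpenOptions::new();
    options.max_dbs(11);
    let env = options.open(path)?;

    // Opening (or creating) the database added by this patch.
    let _db: Database<Str, ByteSlice> =
        env.create_database(Some("word-prefix-level-position-docids"))?;
    Ok(())
}
```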
@@ -64,7 +66,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(10); + options.max_dbs(11); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -74,6 +76,7 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; + let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; @@ -98,6 +101,7 @@ impl Index { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6d7dd72b8..f89c2d00c 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_level_position_docids.clear(self.wtxn)?; + word_prefix_level_position_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index f9303d339..4c5f8d61a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -89,6 +89,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values, documents, @@ -345,6 +346,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the word prefix level position docids. 
+ let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + drop(iter); + Ok(self.documents_ids.len()) } } diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 54f994fc0..a6d008513 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -52,6 +52,10 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) - cbo_roaring_bitmap_merge(values) } +pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7a2196481..8ebdf1634 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -3,6 +3,7 @@ use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::num::{NonZeroU32, NonZeroUsize}; +use std::str; use std::sync::mpsc::sync_channel; use std::time::Instant; @@ -13,18 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy use heed::types::ByteSlice; use log::{debug, info, error}; use memmap::Mmap; -use rayon::ThreadPool; use rayon::prelude::*; +use rayon::ThreadPool; use serde::{Serialize, Deserialize}; use crate::index::Index; -use crate::update::{Facets, WordsLevelPositions, WordsPrefixes, UpdateIndexingStep}; +use crate::update::{ + Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, + WordPrefixPairProximityDocids, +}; use self::store::{Store, Readers}; pub use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, docid_word_positions_merge, documents_merge, - word_level_position_docids_merge, facet_field_value_docids_merge, - field_id_docid_facet_values_merge, + word_level_position_docids_merge, word_prefix_level_positions_docids_merge, + facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -719,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.execute()?; // Run the words prefixes update operation. - let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); if let Some(value) = self.words_prefix_threshold { builder.threshold(value); } @@ -731,8 +732,26 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + // Run the word prefix docids update operation. 
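The `DeleteDocuments` hunk above repeats a pattern used for every docids database: walk all entries, subtract the deleted ids from each bitmap, delete entries that become empty and rewrite the ones that merely shrank. A standalone sketch with a plain map standing in for the LMDB cursor (`del_current`/`put_current` in the real code):

```rust
use std::collections::HashMap;

use roaring::RoaringBitmap;

fn prune(db: &mut HashMap<Vec<u8>, RoaringBitmap>, deleted: &RoaringBitmap) {
    db.retain(|_key, docids| {
        // Remove every deleted document id from the entry...
        docids.difference_with(deleted);
        // ...and drop the entry entirely once its bitmap is empty.
        !docids.is_empty()
    });
}

fn main() {
    let mut db = HashMap::new();
    db.insert(b"he".to_vec(), (0..4u32).collect::<RoaringBitmap>());
    db.insert(b"wo".to_vec(), (2..4u32).collect::<RoaringBitmap>());

    let deleted: RoaringBitmap = (2..4u32).collect();
    prune(&mut db, &deleted);

    assert_eq!(db.len(), 1); // "wo" lost all its documents and was removed
    assert_eq!(db[&b"he".to_vec()].len(), 2); // "he" kept docids 0 and 1
}
```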
+ let mut builder = WordPrefixDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.execute()?; + + // Run the word prefix pair proximity docids update operation. + let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.chunk_compression_type; + builder.chunk_compression_level = self.chunk_compression_level; + builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + builder.max_nb_chunks = self.max_nb_chunks; + builder.max_memory = self.max_memory; + builder.execute()?; + // Run the words level positions update operation. - let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id); + let mut builder = WordsLevelPositions::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 1fc4890fb..203937e2f 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,8 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; +pub use self::word_prefix_docids::WordPrefixDocids; +pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; pub use self::words_level_positions::WordsLevelPositions; -pub use self::words_prefixes::WordsPrefixes; +pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; @@ -17,6 +19,7 @@ mod index_documents; mod settings; mod update_builder; mod update_step; +mod word_prefix_docids; +mod word_prefix_pair_proximity_docids; mod words_level_positions; -mod words_prefixes; - +mod words_prefixes_fst; diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 9a4fb850e..8d6eb034d 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -2,10 +2,7 @@ use grenad::CompressionType; use rayon::ThreadPool; use crate::Index; -use super::{ - ClearDocuments, DeleteDocuments, IndexDocuments, Settings, - Facets, WordsPrefixes, WordsLevelPositions, -}; +use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -138,34 +135,4 @@ impl<'a> UpdateBuilder<'a> { builder } - - pub fn words_prefixes<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsPrefixes<'t, 'u, 'i> - { - let mut builder = WordsPrefixes::new(wtxn, index, self.update_id); - - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - - builder - } - - pub fn words_level_positions<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsLevelPositions<'t, 'u, 'i> - { - let mut builder = WordsLevelPositions::new(wtxn, index, self.update_id); - - builder.chunk_compression_type = self.chunk_compression_type; - 
builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - - builder - } } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs new file mode 100644 index 000000000..58c984212 --- /dev/null +++ b/milli/src/update/word_prefix_docids.rs @@ -0,0 +1,75 @@ +use std::str; + +use crate::Index; +use fst::Streamer; +use grenad::CompressionType; +use heed::types::ByteSlice; + +use crate::update::index_documents::WriteMethod; +use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database}; + +pub struct WordPrefixDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { + WordPrefixDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + } + } + + pub fn execute(self) -> anyhow::Result<()> { + // Clear the word prefix docids database. + self.index.word_prefix_docids.clear(self.wtxn)?; + + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // It is forbidden to keep a mutable reference into the database + // and write into it at the same time, therefore we write into another file. + let mut prefix_docids_sorter = create_sorter( + word_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We iterate over all the prefixes and retrieve the corresponding docids. + let mut prefix_stream = prefix_fst.stream(); + while let Some(bytes) = prefix_stream.next() { + let prefix = str::from_utf8(bytes)?; + let db = self.index.word_docids.remap_data_type::(); + for result in db.prefix_iter(self.wtxn, prefix)? { + let (_word, data) = result?; + prefix_docids_sorter.insert(prefix, data)?; + } + } + + drop(prefix_fst); + + // We finally write the word prefix docids into the LMDB database. 
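The sorter above may collect one docids payload per word sharing a prefix, all under the same key, and `word_docids_merge` is what reconciles them. A sketch of such a union-style merge function, assuming plain roaring serialization where the real code goes through milli's CBO bitmap codec:

```rust
use std::borrow::Cow;

use roaring::RoaringBitmap;

/// Union-style merge: every serialized bitmap collected under one key is
/// deserialized, unioned with the others and written back as one value.
fn bitmaps_union_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
    let mut acc = RoaringBitmap::new();
    for value in values {
        acc.union_with(&RoaringBitmap::deserialize_from(&value[..])?);
    }
    let mut out = Vec::with_capacity(acc.serialized_size());
    acc.serialize_into(&mut out)?;
    Ok(out)
}
```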
+ sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_docids.as_polymorph(), + prefix_docids_sorter, + word_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs new file mode 100644 index 000000000..c972efc4f --- /dev/null +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -0,0 +1,89 @@ +use std::str; + +use fst::automaton::{Automaton, Str}; +use fst::{Streamer, IntoStreamer}; +use grenad::CompressionType; +use heed::BytesEncode; +use heed::types::ByteSlice; +use log::debug; + +use crate::Index; +use crate::heed_codec::StrStrU8Codec; +use crate::update::index_documents::{ + WriteMethod, create_sorter, sorter_into_lmdb_database, + words_pairs_proximities_docids_merge, +}; + +pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> + { + WordPrefixPairProximityDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, + } + } + + pub fn execute(self) -> anyhow::Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; + + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // Here we create a sorter akin to the previous one. + let mut word_prefix_pair_proximity_docids_sorter = create_sorter( + words_pairs_proximities_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert all the word pairs corresponding to the word-prefix pairs + // where the prefixes appears in the prefix FST previously constructed. + let db = self.index.word_pair_proximity_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? { + let ((word1, word2, prox), data) = result?; + let automaton = Str::new(word2).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let pair = (word1, prefix, prox); + let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); + word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; + } + } + + drop(prefix_fst); + + // We finally write the word prefix pair proximity docids into the LMDB database. 
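Conceptually, the loop above fans each `(word1, word2, proximity)` entry out into one `(word1, prefix, proximity)` entry per prefix of `word2` present in the prefix FST, and pairs that collapse onto the same prefix get their docids unioned by the sorter's merge function. A sketch of that fan-out with plain string matching standing in for the FST automaton; the names and shapes here are illustrative, not milli's:

```rust
use std::collections::{BTreeSet, HashMap};

use roaring::RoaringBitmap;

fn prefix_pairs(
    pairs: &[((&str, &str, u8), RoaringBitmap)],
    prefixes: &BTreeSet<String>,
) -> HashMap<(String, String, u8), RoaringBitmap> {
    let mut out = HashMap::new();
    for ((word1, word2, prox), docids) in pairs {
        for prefix in prefixes {
            // Keep the prefixes that word2 starts with...
            if word2.starts_with(prefix.as_str()) {
                // ...and union the docids of pairs that collapse together.
                out.entry((word1.to_string(), prefix.clone(), *prox))
                    .or_insert_with(RoaringBitmap::new)
                    .union_with(docids);
            }
        }
    }
    out
}
```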
+ sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + word_prefix_pair_proximity_docids_sorter, + words_pairs_proximities_docids_merge, + WriteMethod::Append, + )?; + + Ok(()) + } +} diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 70bc89860..1b772c37d 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,17 +1,22 @@ -use std::cmp; +use std::{cmp, str}; use std::convert::TryFrom; use std::fs::File; use std::num::NonZeroU32; +use fst::automaton::{self, Automaton}; +use fst::{Streamer, IntoStreamer}; use grenad::{CompressionType, Reader, Writer, FileFuse}; -use heed::types::{DecodeIgnore, Str}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::update::index_documents::{ + create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, + word_prefix_level_positions_docids_merge, sorter_into_lmdb_database +}; use crate::{Index, TreeLevel}; pub struct WordsLevelPositions<'t, 'u, 'i> { @@ -20,27 +25,24 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) chunk_fusing_shrink_size: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, - _update_id: u64, } impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> WordsLevelPositions<'t, 'u, 'i> - { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> { WordsLevelPositions { wtxn, index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, chunk_fusing_shrink_size: None, + max_nb_chunks: None, + max_memory: None, level_group_size: NonZeroU32::new(4).unwrap(), min_level_size: NonZeroU32::new(5).unwrap(), - _update_id: update_id, } } @@ -76,7 +78,71 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_level_position_docids.as_polymorph(), entries, - |_, _| anyhow::bail!("invalid facet level merging"), + |_, _| anyhow::bail!("invalid word level position merging"), + WriteMethod::Append, + )?; + + // We compute the word prefix level positions database. + self.index.word_prefix_level_position_docids.clear(self.wtxn)?; + + let mut word_prefix_level_positions_docids_sorter = create_sorter( + word_prefix_level_positions_docids_merge, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.max_nb_chunks, + self.max_memory, + ); + + // We insert the word prefix level positions where the level is equal to 0 and + // corresponds to the word-prefix level positions where the prefixes appears + // in the prefix FST previously constructed. + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + let db = self.index.word_level_position_docids.remap_data_type::(); + for result in db.iter(self.wtxn)? 
{ + let ((word, level, left, right), data) = result?; + if level == TreeLevel::min_value() { + let automaton = automaton::Str::new(word).starts_with(); + let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); + while let Some(prefix) = matching_prefixes.next() { + let prefix = str::from_utf8(prefix)?; + let key = (prefix, level, left, right); + let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); + word_prefix_level_positions_docids_sorter.insert(bytes, data)?; + } + } + } + + // We finally write all the word prefix level positions docids with + // a level equal to 0 into the LMDB database. + sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_level_position_docids.as_polymorph(), + word_prefix_level_positions_docids_sorter, + word_prefix_level_positions_docids_merge, + WriteMethod::Append, + )?; + + let entries = compute_positions_levels( + self.wtxn, + self.index.word_prefix_docids.remap_data_type::(), + self.index.word_prefix_level_position_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + )?; + + // The previously computed entries also defines the level 0 entries + // so we can clear the database and append all of these entries. + self.index.word_prefix_level_position_docids.clear(self.wtxn)?; + + write_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_level_position_docids.as_polymorph(), + entries, + |_, _| anyhow::bail!("invalid word prefix level position merging"), WriteMethod::Append, )?; diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs deleted file mode 100644 index f2fe526a2..000000000 --- a/milli/src/update/words_prefixes.rs +++ /dev/null @@ -1,196 +0,0 @@ -use std::iter::FromIterator; -use std::str; - -use chrono::Utc; -use fst::automaton::Str; -use fst::{Automaton, Streamer, IntoStreamer}; -use grenad::CompressionType; -use heed::BytesEncode; -use heed::types::ByteSlice; - -use crate::heed_codec::StrStrU8Codec; -use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database}; -use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge}; -use crate::{Index, SmallString32}; - -pub struct WordsPrefixes<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - threshold: f64, - max_prefix_length: usize, - _update_id: u64, -} - -impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> WordsPrefixes<'t, 'u, 'i> - { - WordsPrefixes { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - chunk_fusing_shrink_size: None, - max_nb_chunks: None, - max_memory: None, - threshold: 0.1 / 100.0, // .01% - max_prefix_length: 4, - _update_id: update_id, - } - } - - /// Set the ratio of concerned words required to make a prefix be part of the words prefixes - /// database. If a word prefix is supposed to match more than this number of words in the - /// dictionnary, therefore this prefix is added to the words prefixes datastructures. - /// - /// Default value is `0.01` or `1%`. 
This value must be between 0 and 1 and will be clamped - /// to these bounds otherwise. - pub fn threshold(&mut self, value: f64) -> &mut Self { - self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] - self - } - - /// Set the maximum length of prefixes in bytes. - /// - /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped - /// to these bounds, otherwise. - pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] - self - } - - pub fn execute(self) -> anyhow::Result<()> { - self.index.set_updated_at(self.wtxn, &Utc::now())?; - // Clear the words prefixes datastructures. - self.index.word_prefix_docids.clear(self.wtxn)?; - self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; - - let words_fst = self.index.words_fst(&self.wtxn)?; - let number_of_words = words_fst.len(); - let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; - - // It is forbidden to keep a mutable reference into the database - // and write into it at the same time, therefore we write into another file. - let mut prefix_docids_sorter = create_sorter( - word_docids_merge, - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.max_nb_chunks, - self.max_memory, - ); - - let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); - for n in 1..=self.max_prefix_length { - - let mut current_prefix = SmallString32::new(); - let mut current_prefix_count = 0; - let mut builder = fst::SetBuilder::memory(); - - let mut stream = words_fst.stream(); - while let Some(bytes) = stream.next() { - // We try to get the first n bytes out of this string but we only want - // to split at valid characters bounds. If we try to split in the middle of - // a character we ignore this word and go to the next one. - let word = str::from_utf8(bytes)?; - let prefix = match word.get(..n) { - Some(prefix) => prefix, - None => continue, - }; - - // This is the first iteration of the loop, - // or the current word doesn't starts with the current prefix. - if current_prefix_count == 0 || prefix != current_prefix.as_str() { - current_prefix = SmallString32::from(prefix); - current_prefix_count = 0; - } - - current_prefix_count += 1; - - // There is enough words corresponding to this prefix to add it to the cache. - if current_prefix_count == min_number_of_words { - builder.insert(prefix)?; - } - } - - // We construct the final set for prefixes of size n. - prefix_fsts.push(builder.into_set()); - } - - // We merge all of the previously computed prefixes into on final set. - let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); - let mut builder = fst::SetBuilder::memory(); - builder.extend_stream(op.r#union())?; - let prefix_fst = builder.into_set(); - - // We iterate over all the prefixes and retrieve the corresponding docids. - let mut prefix_stream = prefix_fst.stream(); - while let Some(bytes) = prefix_stream.next() { - let prefix = str::from_utf8(bytes)?; - let db = self.index.word_docids.remap_data_type::(); - for result in db.prefix_iter(self.wtxn, prefix)? { - let (_word, data) = result?; - prefix_docids_sorter.insert(prefix, data)?; - } - } - - // Set the words prefixes FST in the dtabase. - self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; - - // We finally write the word prefix docids into the LMDB database. 
- sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_docids.as_polymorph(), - prefix_docids_sorter, - word_docids_merge, - WriteMethod::Append, - )?; - - // We compute the word prefix pair proximity database. - - // Here we create a sorter akin to the previous one. - let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - words_pairs_proximities_docids_merge, - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.max_nb_chunks, - self.max_memory, - ); - - // We insert all the word pairs corresponding to the word-prefix pairs - // where the prefixes appears in the prefix FST previously constructed. - let db = self.index.word_pair_proximity_docids.remap_data_type::(); - for result in db.iter(self.wtxn)? { - let ((word1, word2, prox), data) = result?; - let automaton = Str::new(word2).starts_with(); - let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); - while let Some(prefix) = matching_prefixes.next() { - let prefix = str::from_utf8(prefix)?; - let pair = (word1, prefix, prox); - let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); - word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; - } - } - - // We finally write the word prefix pair proximity docids into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - word_prefix_pair_proximity_docids_sorter, - words_pairs_proximities_docids_merge, - WriteMethod::Append, - )?; - - Ok(()) - } -} diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs new file mode 100644 index 000000000..f53b0ee00 --- /dev/null +++ b/milli/src/update/words_prefixes_fst.rs @@ -0,0 +1,104 @@ +use std::iter::FromIterator; +use std::str; + +use fst::Streamer; +use crate::{Index, SmallString32}; + +pub struct WordsPrefixesFst<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + threshold: f64, + max_prefix_length: usize, + _update_id: u64, +} + +impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + update_id: u64, + ) -> WordsPrefixesFst<'t, 'u, 'i> + { + WordsPrefixesFst { + wtxn, + index, + threshold: 0.1 / 100.0, // 0.1% + max_prefix_length: 4, + _update_id: update_id, + } + } + + /// Set the ratio of concerned words required to make a prefix be part of the words prefixes + /// database. If a word prefix is supposed to match more than this number of words in the + /// dictionary, then this prefix is added to the words prefixes datastructures. + /// + /// Default value is `0.001`, or `0.1%`. This value must be between 0 and 1 and will be clamped + /// to these bounds otherwise. + pub fn threshold(&mut self, value: f64) -> &mut Self { + self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] + self + } + + /// Set the maximum length of prefixes in bytes. + /// + /// Default value is `4` bytes. This value must be between 1 and 25 and will be clamped + /// to these bounds otherwise.
+ pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] + self + } + + pub fn execute(self) -> anyhow::Result<()> { + let words_fst = self.index.words_fst(&self.wtxn)?; + let number_of_words = words_fst.len(); + let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; + + let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); + for n in 1..=self.max_prefix_length { + + let mut current_prefix = SmallString32::new(); + let mut current_prefix_count = 0; + let mut builder = fst::SetBuilder::memory(); + + let mut stream = words_fst.stream(); + while let Some(bytes) = stream.next() { + // We try to get the first n bytes out of this string but we only want + // to split at valid characters bounds. If we try to split in the middle of + // a character we ignore this word and go to the next one. + let word = str::from_utf8(bytes)?; + let prefix = match word.get(..n) { + Some(prefix) => prefix, + None => continue, + }; + + // This is the first iteration of the loop, + // or the current word doesn't start with the current prefix. + if current_prefix_count == 0 || prefix != current_prefix.as_str() { + current_prefix = SmallString32::from(prefix); + current_prefix_count = 0; + } + + current_prefix_count += 1; + + // There are enough words corresponding to this prefix to add it to the cache. + if current_prefix_count == min_number_of_words { + builder.insert(prefix)?; + } + } + + // We construct the final set for prefixes of size n. + prefix_fsts.push(builder.into_set()); + } + + // We merge all of the previously computed prefixes into one final set. + let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(op.r#union())?; + let prefix_fst = builder.into_set(); + + // Set the words prefixes FST in the database. + self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; + + Ok(()) + } +} From 1aad66bdaafcf29428f30c3cf7463c0635396a7e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 11:17:32 +0100 Subject: [PATCH 0634/1889] Compute stats about the word prefix level positions database in the infos crate --- infos/src/main.rs | 101 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index e730a8b43..81b753084 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -21,6 +21,7 @@ const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; +const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; const DOCUMENTS_DB_NAME: &str = "documents"; @@ -33,6 +34,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_LEVEL_POSITION_DOCIDS_DB_NAME, + WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, FIELD_ID_DOCID_FACET_VALUES_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -122,10 +124,21 @@ enum Command { #[structopt(long)] full_display: bool, - /// The field name in the document.
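The counting pass in `WordsPrefixesFst::execute` above leans on the FST streaming its words in sorted order, so words sharing a prefix are adjacent and a single running counter suffices. The same logic on a sorted slice, as a standalone sketch:

```rust
/// A prefix of length `n` is kept once at least `min_count`
/// consecutive words share it, mirroring the loop above.
fn prefixes_of_len(words: &[&str], n: usize, min_count: usize) -> Vec<String> {
    let mut kept = Vec::new();
    let mut current = String::new();
    let mut count = 0;
    for word in words {
        // Only split at a character boundary, like `word.get(..n)` above.
        let prefix = match word.get(..n) {
            Some(prefix) => prefix,
            None => continue,
        };
        if count == 0 || prefix != current {
            current = prefix.to_string();
            count = 0;
        }
        count += 1;
        // Insert exactly once, the moment the threshold is reached.
        if count == min_count {
            kept.push(current.clone());
        }
    }
    kept
}

fn main() {
    let words = ["he", "hell", "hello", "help", "wolf", "world"];
    assert_eq!(prefixes_of_len(&words, 3, 3), ["hel"]);
    assert_eq!(prefixes_of_len(&words, 2, 2), ["he", "wo"]);
}
```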
+ /// Words appearing in the documents. words: Vec, }, + /// Outputs a CSV with the documents ids along with + /// the word prefix level positions where it appears. + WordPrefixesLevelPositionsDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// Prefixes of words appearing in the documents. + prefixes: Vec, + }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -236,6 +249,9 @@ fn main() -> anyhow::Result<()> { WordsLevelPositionsDocids { full_display, words } => { words_level_positions_docids(&index, &rtxn, !full_display, words) }, + WordPrefixesLevelPositionsDocids { full_display, prefixes } => { + word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) + }, DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -335,6 +351,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, documents, @@ -348,6 +365,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_level_position_docids_name = "word_level_position_docids"; + let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -411,6 +429,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_prefix_level_position_docids.remap_data_type::().iter(rtxn)? { + let ((word, level, left, right), value) = result?; + let key = format!("{} {} {:?}", word, level, left..=right); + heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { @@ -588,6 +613,45 @@ fn words_level_positions_docids( Ok(wtr.flush()?) } +fn word_prefixes_level_positions_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + prefixes: Vec, +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; + + for word in prefixes.iter().map(AsRef::as_ref) { + let range = { + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + for result in index.word_prefix_level_position_docids.range(rtxn, &range)? 
{ + let ((w, level, left, right), docids) = result?; + + let count = docids.len().to_string(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + let position_range = if level == TreeLevel::min_value() { + format!("{:?}", left) + } else { + format!("{:?}", left..=right) + }; + let level = level.to_string(); + wtr.write_record(&[w, &level, &position_range, &count, &docids])?; + } + } + + Ok(wtr.flush()?) +} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -779,6 +843,21 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { use heed::types::ByteSlice; + let Index { + env: _, + main, + word_docids, + word_prefix_docids, + docid_word_positions, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + word_level_position_docids, + word_prefix_level_position_docids, + facet_field_id_value_docids, + field_id_docid_facet_values, + documents, + } = index; + let names = if names.is_empty() { ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect() } else { @@ -787,15 +866,17 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a for name in names { let database = match name.as_str() { - MAIN_DB_NAME => &index.main, - WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), - WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), - DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => index.facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_DB_NAME => index.field_id_docid_facet_values.as_polymorph(), - DOCUMENTS_DB_NAME => index.documents.as_polymorph(), + MAIN_DB_NAME => &main, + WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(), + WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(), + DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(), + WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), + WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), + WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), + FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(), + DOCUMENTS_DB_NAME => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; From 7ff4a2a708d4d08a25fd800348316ee361108c5d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 25 Mar 2021 23:45:06 +0100 Subject: [PATCH 0635/1889] Display the number of entries in the infos crate --- infos/src/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index 81b753084..5a12a9d4d 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -882,16 +882,19 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a let mut key_size: u64 = 0; let mut val_size: u64 = 0; + let mut number_entries: u64 = 0; for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? 
{ let (k, v) = result?; key_size += k.len() as u64; val_size += v.len() as u64; + number_entries += 1; } println!("The {} database weigh:", name); println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); + println!("\tnumber of entries: {}", number_entries); } Ok(()) From 361193099fdeedb8b4b6fb5bf450bc9baa07f5cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 29 Mar 2021 16:25:14 +0200 Subject: [PATCH 0636/1889] Reduce the amount of branches when query tree flattened --- milli/src/search/criteria/attribute.rs | 83 +++++++++++++++----------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 160807847..31c11e7bb 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,3 +1,4 @@ +use std::cmp; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; @@ -15,7 +16,7 @@ pub struct Attribute<'t> { candidates: Option, bucket_candidates: RoaringBitmap, parent: Box, - flattened_query_tree: Option>>, + flattened_query_tree: Option>>>, current_buckets: Option>, } @@ -115,33 +116,43 @@ impl<'t> Criterion for Attribute<'t> { fn linear_compute_candidates( ctx: &dyn Context, - branches: &Vec>, + branches: &Vec>>, allowed_candidates: &RoaringBitmap, ) -> anyhow::Result> { - fn compute_candidate_rank(branches: &Vec>, words_positions: HashMap) -> u64 { + fn compute_candidate_rank(branches: &Vec>>, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { + let branch_len = branch.len(); let mut branch_rank = Vec::with_capacity(branch_len); - for Query { prefix, kind } in branch { - // find the best position of the current word in the document. - let position = match kind { - QueryKind::Exact { word, .. } => { - if *prefix { - word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()).min() - } else { - words_positions.get(word) - .map(|positions| positions.iter().next()) - .flatten() - } - }, - QueryKind::Tolerant { typo, word } => { - word_derivations(word, *prefix, *typo, &words_positions) - .flat_map(|positions| positions.iter().next()).min() - }, - }; + for derivates in branch { + let mut position = None; + for Query { prefix, kind } in derivates { + // find the best position of the current word in the document. + let current_position = match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + } else { + words_positions.get(word) + .map(|positions| positions.iter().next()) + .flatten() + } + }, + QueryKind::Tolerant { typo, word } => { + word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().next()).min() + }, + }; + + match (position, current_position) { + (Some(p), Some(cp)) => position = Some(cmp::min(p, cp)), + (None, Some(cp)) => position = Some(cp), + _ => (), + } + } // if a position is found, we add it to the branch score, // otherwise the branch is considered as unfindable in this document and we break. 
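The hunk that follows changes `flatten_query_tree`'s output from `Vec<Vec<Query>>` to `Vec<Vec<Vec<Query>>>`: a branch becomes a list of words, each word carrying its own list of derivations, so an `Or` made only of plain queries folds into a single derivation group instead of multiplying whole branches. A hypothetical miniature with plain strings in place of `Query`:

```rust
fn main() {
    // "new york" where "york" also derives into "yorks":
    // the old shape multiplied the branches...
    let before: Vec<Vec<&str>> = vec![
        vec!["new", "york"],
        vec!["new", "yorks"],
    ];
    // ...the new shape keeps one branch and turns the Or into a
    // derivation list attached to a single word slot.
    let after: Vec<Vec<Vec<&str>>> = vec![
        vec![vec!["new"], vec!["york", "yorks"]],
    ];
    assert_eq!(before.len(), 2); // branches ranked before the change
    assert_eq!(after.len(), 1); // branches ranked after it
}
```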
@@ -194,10 +205,10 @@ fn linear_compute_candidates( } // TODO can we keep refs of Query -fn flatten_query_tree(query_tree: &Operation) -> Vec> { +fn flatten_query_tree(query_tree: &Operation) -> Vec>> { use crate::search::criteria::Operation::{And, Or, Consecutive}; - fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec> { + fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec>> { match tail.split_first() { Some((thead, tail)) => { let tail = and_recurse(thead, tail); @@ -215,13 +226,17 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec> { } } - fn recurse(op: &Operation) -> Vec> { + fn recurse(op: &Operation) -> Vec>> { match op { And(ops) | Consecutive(ops) => { ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) }, - Or(_, ops) => ops.into_iter().map(recurse).flatten().collect(), - Operation::Query(query) => vec![vec![query.clone()]], + Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { + vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] + } else { + ops.into_iter().map(recurse).flatten().collect() + }, + Operation::Query(query) => vec![vec![vec![query.clone()]]], } } @@ -256,19 +271,19 @@ mod tests { ]); let expected = vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }], + vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], vec![ - Query { prefix: false, kind: QueryKind::exact(S("manythe")) }, - Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], ], vec![ - Query { prefix: false, kind: QueryKind::exact(S("many")) }, - Query { prefix: false, kind: QueryKind::exact(S("thefish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }], ], vec![ - Query { prefix: false, kind: QueryKind::exact(S("many")) }, - Query { prefix: false, kind: QueryKind::exact(S("the")) }, - Query { prefix: false, kind: QueryKind::exact(S("fish")) }, + vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }], + vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], ], ]; From 59f58c15f7cedf16acdbb3a89bae17247ffcc3ab Mon Sep 17 00:00:00 2001 From: many Date: Wed, 31 Mar 2021 19:23:02 +0200 Subject: [PATCH 0637/1889] Implement attribute criterion * Implement WordLevelIterator * Implement QueryLevelIterator * Implement set algorithm based on iterators Not tested + Some TODO to fix --- milli/src/search/criteria/attribute.rs | 354 +++++++++++++++++++++++-- milli/src/search/criteria/final.rs | 4 +- milli/src/search/criteria/mod.rs | 52 +++- milli/src/search/criteria/proximity.rs | 4 +- milli/src/search/criteria/typo.rs | 4 +- milli/src/search/criteria/words.rs | 4 +- milli/src/tree_level.rs | 4 + 7 files changed, 394 insertions(+), 32 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 31c11e7bb..af336c21f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,17 +1,17 @@ -use std::cmp; +use std::{cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; use roaring::RoaringBitmap; -use crate::{search::build_dfa}; +use crate::{TreeLevel, search::build_dfa}; use crate::search::criteria::Query; use 
crate::search::query_tree::{Operation, QueryKind}; use crate::search::WordDerivationsCache; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_tree: Option, candidates: Option, bucket_candidates: RoaringBitmap, @@ -21,7 +21,7 @@ pub struct Attribute<'t> { } impl<'t> Attribute<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Attribute { ctx, query_tree: None, @@ -51,23 +51,27 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let current_buckets = match self.current_buckets.as_mut() { - Some(current_buckets) => current_buckets, - None => { - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; - self.current_buckets.get_or_insert(new_buckets.into_iter()) - }, - }; + let found_candidates = if candidates.len() < 1000 { + let current_buckets = match self.current_buckets.as_mut() { + Some(current_buckets) => current_buckets, + None => { + let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + self.current_buckets.get_or_insert(new_buckets.into_iter()) + }, + }; - let found_candidates = match current_buckets.next() { - Some((_score, candidates)) => candidates, - None => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, + match current_buckets.next() { + Some((_score, candidates)) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + } + } else { + set_compute_candidates(self.ctx, flattened_query_tree, candidates)? }; candidates.difference_with(&found_candidates); @@ -114,6 +118,316 @@ impl<'t> Criterion for Attribute<'t> { } } +struct WordLevelIterator<'t, 'q> { + inner: Box> + 't>, + level: TreeLevel, + interval_size: u32, + word: &'q str, + in_prefix_cache: bool, + inner_next: Option<(u32, u32, RoaringBitmap)>, + current_interval: Option<(u32, u32)>, +} + +impl<'t, 'q> WordLevelIterator<'t, 'q> { + fn new(ctx: &'t dyn Context<'t>, query: &'q Query) -> heed::Result> { + // TODO make it typo/prefix tolerant + let word = query.kind.word(); + let in_prefix_cache = query.prefix && ctx.in_prefix_cache(word); + match ctx.word_position_last_level(word, in_prefix_cache)? 
{ + Some(level) => { + let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) + }, + None => Ok(None), + } + } + + fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { + let level = level.min(&self.level).clone(); + let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); + let word = self.word; + let in_prefix_cache = self.in_prefix_cache; + // TODO try to dig starting from the current interval + // let left = self.current_interval.map(|(left, _)| left); + let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + + Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) + } + + fn next(&mut self) -> heed::Result> { + fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left } + + let inner_next = match self.inner_next.take() { + Some(inner_next) => Some(inner_next), + None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)), + }; + + match inner_next { + Some((left, right, docids)) => { + match self.current_interval { + Some((last_left, last_right)) if !is_next_interval(last_right, left) => { + let blank_left = last_left + self.interval_size; + let blank_right = last_right + self.interval_size; + self.current_interval = Some((blank_left, blank_right)); + self.inner_next = Some((left, right, docids)); + Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) + }, + _ => { + self.current_interval = Some((left, right)); + Ok(Some((left, right, docids))) + } + } + }, + None => Ok(None), + } + } +} + +struct QueryLevelIterator<'t, 'q> { + previous: Option>>, + inner: Vec>, + level: TreeLevel, + accumulator: Vec>, + previous_accumulator: Vec>, +} + +impl<'t, 'q> QueryLevelIterator<'t, 'q> { + fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec) -> heed::Result> { + let mut inner = Vec::with_capacity(queries.len()); + for query in queries { + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, query)? 
{ + inner.push(word_level_iterator); + } + } + + let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); + match highest { + Some(level) => Ok(Some(Self { + previous: None, + inner, + level, + accumulator: vec![], + previous_accumulator: vec![], + })), + None => Ok(None), + } + } + + fn previous(&mut self, previous: QueryLevelIterator<'t, 'q>) -> &Self { + self.previous = Some(Box::new(previous)); + self + } + + fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result { + let (level, previous) = match &self.previous { + Some(previous) => { + let previous = previous.dig(ctx)?; + (previous.level.min(self.level), Some(Box::new(previous))) + }, + None => (self.level.saturating_sub(1), None), + }; + + let mut inner = Vec::with_capacity(self.inner.len()); + for word_level_iterator in self.inner.iter() { + inner.push(word_level_iterator.dig(ctx, &level)?); + } + + Ok(Self {previous, inner, level, accumulator: vec![], previous_accumulator: vec![]}) + } + + + + fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { + let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; + let u8_level = Into::::into(level); + let interval_size = 4u32.pow(u8_level as u32); + for wli in self.inner.iter_mut() { + let wli_u8_level = Into::::into(wli.level.clone()); + let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); + for _ in 0..accumulated_count { + if let Some((next_left, _, next_docids)) = wli.next()? { + accumulated = accumulated.take().map( + |(acc_left, acc_right, mut acc_docids)| { + acc_docids.union_with(&next_docids); + (acc_left, acc_right, acc_docids) + } + ).or_else(|| Some((next_left, next_left + interval_size, next_docids))); + } + } + } + + Ok(accumulated) + } + + fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { + let previous_result = match self.previous.as_mut() { + Some(previous) => { + Some(previous.next()?) + }, + None => None, + }; + + match previous_result { + Some((previous_level, previous_next)) => { + let inner_next = self.inner_next(previous_level)?; + self.accumulator.push(inner_next); + self.previous_accumulator.push(previous_next); + // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, + // WARNING the cleaned intervals count needs to be kept to skip at the end + let mut merged_interval = None; + for current in self.accumulator.iter().rev().zip(self.previous_accumulator.iter()) { + if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { + let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); + merged_docids.union_with(&(a & b)); + } + } + Ok((previous_level, merged_interval)) + }, + None => { + let level = self.level.clone(); + let next_interval = self.inner_next(level.clone())?; + self.accumulator = vec![next_interval.clone()]; + Ok((level, next_interval)) + } + } + } +} + +struct Branch<'t, 'q> { + query_level_iterator: QueryLevelIterator<'t, 'q>, + last_result: Option<(u32, u32, RoaringBitmap)>, + tree_level: TreeLevel, + branch_size: u32, +} + +impl<'t, 'q> Branch<'t, 'q> { + fn cmp(&self, other: &Self) -> Ordering { + fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((1..branch_size).sum()) / branch_size } + match (&self.last_result, &other.last_result) { + (Some((s_left, _, _)), Some((o_left, _, _))) => { + // we compute a rank form the left interval. 
+ let self_rank = compute_rank(*s_left, self.branch_size); + let other_rank = compute_rank(*o_left, other.branch_size); + let left_cmp = self_rank.cmp(&other_rank).reverse(); + // on level: higher is better, + // we want to reduce highest levels first. + let level_cmp = self.tree_level.cmp(&other.tree_level); + + left_cmp.then(level_cmp) + }, + (Some(_), None) => Ordering::Greater, + (None, Some(_)) => Ordering::Less, + (None, None) => Ordering::Equal, + } + } +} + +impl<'t, 'q> Ord for Branch<'t, 'q> { + fn cmp(&self, other: &Self) -> Ordering { + self.cmp(other) + } +} + +impl<'t, 'q> PartialOrd for Branch<'t, 'q> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'t, 'q> PartialEq for Branch<'t, 'q> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'t, 'q> Eq for Branch<'t, 'q> {} + +fn initialize_query_level_iterators<'t, 'q>( + ctx: &'t dyn Context<'t>, + branches: &'q Vec>>, +) -> heed::Result>> { + + let mut positions = BinaryHeap::with_capacity(branches.len()); + for branch in branches { + let mut branch_positions = Vec::with_capacity(branch.len()); + for query in branch { + match QueryLevelIterator::new(ctx, query)? { + Some(qli) => branch_positions.push(qli), + None => { + // the branch seems to be invalid, so we skip it. + branch_positions.clear(); + break; + }, + } + } + // QueryLevelIterator need to be sorted by level and folded in descending order. + branch_positions.sort_unstable_by_key(|qli| qli.level); + let folded_query_level_iterators = branch_positions + .into_iter() + .rev() + .fold(None, |fold: Option, qli| match fold { + Some(mut fold) => { + fold.previous(qli); + Some(fold) + }, + None => Some(qli), + }); + + if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { + let (tree_level, last_result) = folded_query_level_iterators.next()?; + let branch = Branch { + last_result, + tree_level, + query_level_iterator: folded_query_level_iterators, + branch_size: branch.len() as u32, + }; + positions.push(branch); + } + } + + Ok(positions) +} + +fn set_compute_candidates<'t>( + ctx: &'t dyn Context<'t>, + branches: &Vec>>, + allowed_candidates: &RoaringBitmap, +) -> anyhow::Result +{ + let mut branches_heap = initialize_query_level_iterators(ctx, branches)?; + let lowest_level = TreeLevel::min_value(); + + while let Some(mut branch) = branches_heap.peek_mut() { + let is_lowest_level = branch.tree_level == lowest_level; + match branch.last_result.as_mut() { + Some((_, _, candidates)) => { + candidates.intersect_with(&allowed_candidates); + if candidates.len() > 0 && is_lowest_level { + // we have candidates, but we can't dig deeper, return candidates. + return Ok(std::mem::take(candidates)); + } else if candidates.len() > 0 { + // we have candidates, lets dig deeper in levels. + let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; + let (tree_level, last_result) = query_level_iterator.next()?; + branch.query_level_iterator = query_level_iterator; + branch.tree_level = tree_level; + branch.last_result = last_result; + } else { + // we don't have candidates, get next interval. + let (_, last_result) = branch.query_level_iterator.next()?; + branch.last_result = last_result; + } + }, + // None = no candidates to find. + None => return Ok(RoaringBitmap::new()), + } + } + + // we made all iterations without finding anything. 
+ Ok(RoaringBitmap::new()) +} + fn linear_compute_candidates( ctx: &dyn Context, branches: &Vec>>, diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index fe224ef94..d3c394467 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -19,13 +19,13 @@ pub struct FinalResult { } pub struct Final<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, parent: Box, wdcache: WordDerivationsCache, } impl<'t> Final<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Final<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { Final { ctx, parent, wdcache: WordDerivationsCache::new() } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 5e75be6ce..b972a0b2c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -4,7 +4,7 @@ use std::borrow::Cow; use anyhow::bail; use roaring::RoaringBitmap; -use crate::search::{word_derivations, WordDerivationsCache}; +use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{Index, DocumentId}; use super::query_tree::{Operation, Query, QueryKind}; @@ -64,7 +64,7 @@ impl Default for Candidates { } } -pub trait Context { +pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; @@ -73,6 +73,8 @@ pub trait Context { fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; + fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; + fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -81,7 +83,7 @@ pub struct CriteriaBuilder<'t> { words_prefixes_fst: fst::Set>, } -impl<'a> Context for CriteriaBuilder<'a> { +impl<'c> Context<'c> for CriteriaBuilder<'c> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) } @@ -120,6 +122,40 @@ impl<'a> Context for CriteriaBuilder<'a> { } Ok(words_positions) } + + fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>> { + let range = { + let left = left.unwrap_or(u32::min_value()); + let right = right.unwrap_or(u32::max_value()); + let left = (word, level, left, left); + let right = (word, level, right, right); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_level_position_docids, + false => self.index.word_level_position_docids, + }; + + Ok(Box::new(db.range(self.rtxn, &range)?)) + } + + fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result> { + let range = { + let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); + let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_level_position_docids, + false => self.index.word_level_position_docids, + }; + let last_level = db + .remap_data_type::() + .range(self.rtxn, &range)?.last().transpose()? 
+ .map(|((_, level, _, _), _)| level); + + Ok(last_level) + } } impl<'t> CriteriaBuilder<'t> { @@ -354,7 +390,7 @@ pub mod test { docid_words: HashMap>, } - impl<'a> Context for TestContext<'a> { + impl<'c> Context<'c> for TestContext<'c> { fn documents_ids(&self) -> heed::Result { Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) } @@ -397,6 +433,14 @@ pub mod test { Ok(HashMap::new()) } } + + fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option, _right: Option) -> heed::Result> + 'c>> { + todo!() + } + + fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result> { + todo!() + } } impl<'a> Default for TestContext<'a> { diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index dc1daafb2..ca412bf28 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -13,7 +13,7 @@ use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proxim type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; pub struct Proximity<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, /// ((max_proximity, query_tree), allowed_candidates) state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, proximity: u8, @@ -24,7 +24,7 @@ pub struct Proximity<'t> { } impl<'t> Proximity<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Proximity { ctx, state: None, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 40b06afc4..bf58fa258 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -9,7 +9,7 @@ use crate::search::{word_derivations, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; pub struct Typo<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_tree: Option<(usize, Operation)>, number_typos: u8, candidates: Candidates, @@ -19,7 +19,7 @@ pub struct Typo<'t> { } impl<'t> Typo<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Typo { ctx, query_tree: None, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 5bb9d8d90..047b3c5f0 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -8,7 +8,7 @@ use crate::search::query_tree::Operation; use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache}; pub struct Words<'t> { - ctx: &'t dyn Context, + ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, bucket_candidates: RoaringBitmap, @@ -17,7 +17,7 @@ pub struct Words<'t> { } impl<'t> Words<'t> { - pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Words { ctx, query_trees: Vec::default(), diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs index 7ce2904e2..b69316cf6 100644 --- a/milli/src/tree_level.rs +++ b/milli/src/tree_level.rs @@ -21,6 +21,10 @@ impl TreeLevel { pub const fn min_value() -> TreeLevel { TreeLevel(0) } + + pub fn saturating_sub(&self, lhs: u8) -> TreeLevel { + TreeLevel(self.0.saturating_sub(lhs)) + } } impl Into for TreeLevel { From 1eee0029a8d3633f42a045d654c94048cd9f4e40 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 1 Apr 2021 14:42:23 +0200 Subject: [PATCH 
0638/1889] Make attribute criterion typo/prefix tolerant --- milli/src/search/criteria/attribute.rs | 57 ++++++++++++++++++-------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index af336c21f..87f9d4dde 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,4 +1,4 @@ -use std::{cmp::{self, Ordering}, collections::BinaryHeap}; +use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; use std::mem::take; @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use crate::{TreeLevel, search::build_dfa}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::WordDerivationsCache; +use crate::search::{word_derivations, WordDerivationsCache}; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; pub struct Attribute<'t> { @@ -71,7 +71,7 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - set_compute_candidates(self.ctx, flattened_query_tree, candidates)? + set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? }; candidates.difference_with(&found_candidates); @@ -122,21 +122,18 @@ struct WordLevelIterator<'t, 'q> { inner: Box> + 't>, level: TreeLevel, interval_size: u32, - word: &'q str, + word: Cow<'q, str>, in_prefix_cache: bool, inner_next: Option<(u32, u32, RoaringBitmap)>, current_interval: Option<(u32, u32)>, } impl<'t, 'q> WordLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, query: &'q Query) -> heed::Result> { - // TODO make it typo/prefix tolerant - let word = query.kind.word(); - let in_prefix_cache = query.prefix && ctx.in_prefix_cache(word); - match ctx.word_position_last_level(word, in_prefix_cache)? { + fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { + match ctx.word_position_last_level(&word, in_prefix_cache)? 
{ Some(level) => { let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); - let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) }, None => Ok(None), @@ -146,11 +143,11 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { let level = level.min(&self.level).clone(); let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); - let word = self.word; + let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; // TODO try to dig starting from the current interval // let left = self.current_interval.map(|(left, _)| left); - let inner = ctx.word_position_iterator(word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) } @@ -193,11 +190,33 @@ struct QueryLevelIterator<'t, 'q> { } impl<'t, 'q> QueryLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec) -> heed::Result> { + fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { let mut inner = Vec::with_capacity(queries.len()); for query in queries { - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, query)? { - inner.push(word_level_iterator); + match &query.kind { + QueryKind::Exact { word, .. } => { + if !query.prefix || ctx.in_prefix_cache(&word) { + let word = Cow::Borrowed(query.kind.word()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? { + inner.push(word_level_iterator); + } + } else { + for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { + let word = Cow::Owned(word.to_owned()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + inner.push(word_level_iterator); + } + } + } + }, + QueryKind::Tolerant { typo, word } => { + for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { + let word = Cow::Owned(word.to_owned()); + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + inner.push(word_level_iterator); + } + } + } } } @@ -346,13 +365,14 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, branches: &'q Vec>>, -) -> heed::Result>> { + wdcache: &mut WordDerivationsCache, +) -> anyhow::Result>> { let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { let mut branch_positions = Vec::with_capacity(branch.len()); for query in branch { - match QueryLevelIterator::new(ctx, query)? { + match QueryLevelIterator::new(ctx, query, wdcache)? { Some(qli) => branch_positions.push(qli), None => { // the branch seems to be invalid, so we skip it. 
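Summed up, the expansion above maps every query onto the concrete words that a `WordLevelIterator` can be opened on. A condensed sketch of that decision tree follows; the helper signatures are simplified stand-ins for `ctx.in_prefix_cache` and `word_derivations`, not milli's real APIs:

    enum QueryKind { Exact { word: String }, Tolerant { typo: u8, word: String } }
    struct Query { prefix: bool, kind: QueryKind }

    // stand-ins: the real calls also take the words FST and a derivations cache
    fn in_prefix_cache(_word: &str) -> bool { false }
    fn word_derivations(word: &str, _prefix: bool, _typos: u8) -> Vec<String> {
        vec![word.to_owned()]
    }

    // every returned (word, is_prefix) pair becomes one WordLevelIterator
    fn expand(query: &Query) -> Vec<(String, bool)> {
        match &query.kind {
            // exact word: usable as-is when it is not a prefix, or when the
            // prefix database already knows it; otherwise widen it to its
            // zero-typo derivations
            QueryKind::Exact { word } => {
                if !query.prefix || in_prefix_cache(word) {
                    vec![(word.clone(), query.prefix)]
                } else {
                    word_derivations(word, true, 0).into_iter().map(|w| (w, false)).collect()
                }
            }
            // tolerant word: widen to every derivation within `typo` typos
            QueryKind::Tolerant { typo, word } => {
                word_derivations(word, query.prefix, *typo).into_iter().map(|w| (w, false)).collect()
            }
        }
    }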
@@ -393,9 +413,10 @@ fn set_compute_candidates<'t>( ctx: &'t dyn Context<'t>, branches: &Vec>>, allowed_candidates: &RoaringBitmap, + wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { - let mut branches_heap = initialize_query_level_iterators(ctx, branches)?; + let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); while let Some(mut branch) = branches_heap.peek_mut() { From b3e2280bb93fa4806229c8ee4188c4903b654887 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 1 Apr 2021 19:02:13 +0200 Subject: [PATCH 0639/1889] Debug attribute criterion * debug folding when initializing iterators --- milli/src/search/criteria/attribute.rs | 28 ++++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 87f9d4dde..d96ec493f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -51,7 +51,7 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let found_candidates = if candidates.len() < 1000 { + let found_candidates = if candidates.len() < 1_000 { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { @@ -322,10 +322,10 @@ struct Branch<'t, 'q> { impl<'t, 'q> Branch<'t, 'q> { fn cmp(&self, other: &Self) -> Ordering { - fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((1..branch_size).sum()) / branch_size } + fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((0..branch_size).sum()) / branch_size } match (&self.last_result, &other.last_result) { (Some((s_left, _, _)), Some((o_left, _, _))) => { - // we compute a rank form the left interval. + // we compute a rank from the left interval. let self_rank = compute_rank(*s_left, self.branch_size); let other_rank = compute_rank(*o_left, other.branch_size); let left_cmp = self_rank.cmp(&other_rank).reverse(); @@ -371,8 +371,8 @@ fn initialize_query_level_iterators<'t, 'q>( let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { let mut branch_positions = Vec::with_capacity(branch.len()); - for query in branch { - match QueryLevelIterator::new(ctx, query, wdcache)? { + for queries in branch { + match QueryLevelIterator::new(ctx, queries, wdcache)? { Some(qli) => branch_positions.push(qli), None => { // the branch seems to be invalid, so we skip it. @@ -386,10 +386,10 @@ fn initialize_query_level_iterators<'t, 'q>( let folded_query_level_iterators = branch_positions .into_iter() .rev() - .fold(None, |fold: Option, qli| match fold { - Some(mut fold) => { - fold.previous(qli); - Some(fold) + .fold(None, |fold: Option, mut qli| match fold { + Some(fold) => { + qli.previous(fold); + Some(qli) }, None => Some(qli), }); @@ -418,6 +418,7 @@ fn set_compute_candidates<'t>( { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); + let mut final_candidates = RoaringBitmap::new(); while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; @@ -426,7 +427,8 @@ fn set_compute_candidates<'t>( candidates.intersect_with(&allowed_candidates); if candidates.len() > 0 && is_lowest_level { // we have candidates, but we can't dig deeper, return candidates. 
- return Ok(std::mem::take(candidates)); + final_candidates = std::mem::take(candidates); + break; } else if candidates.len() > 0 { // we have candidates, lets dig deeper in levels. let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; @@ -441,12 +443,12 @@ fn set_compute_candidates<'t>( } }, // None = no candidates to find. - None => return Ok(RoaringBitmap::new()), + None => break, } + } - // we made all iterations without finding anything. - Ok(RoaringBitmap::new()) + Ok(final_candidates) } fn linear_compute_candidates( From 17c8c6f945bdffebdf6a935160566f9e6deaa8be Mon Sep 17 00:00:00 2001 From: many Date: Tue, 6 Apr 2021 15:03:41 +0200 Subject: [PATCH 0640/1889] Make set algorithm return None when nothing can be returned --- milli/src/search/criteria/attribute.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index d96ec493f..12c6b36b8 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -71,7 +71,18 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? + let found_candidates = set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)?; + + match found_candidates { + Some(candidates) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: self.candidates.take(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + } }; candidates.difference_with(&found_candidates); @@ -414,11 +425,11 @@ fn set_compute_candidates<'t>( branches: &Vec>>, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> anyhow::Result> { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); - let mut final_candidates = RoaringBitmap::new(); + let mut final_candidates = None; while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; @@ -427,7 +438,7 @@ fn set_compute_candidates<'t>( candidates.intersect_with(&allowed_candidates); if candidates.len() > 0 && is_lowest_level { // we have candidates, but we can't dig deeper, return candidates. - final_candidates = std::mem::take(candidates); + final_candidates = Some(std::mem::take(candidates)); break; } else if candidates.len() > 0 { // we have candidates, lets dig deeper in levels. 
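With this change, `set_compute_candidates` distinguishes "nothing conclusive was found" from "a valid, possibly empty bucket": on `None` the criterion passes its untouched query tree and candidates on to the next criterion instead of emitting an empty result. A minimal sketch of that calling convention, using a stand-in candidates type rather than milli's `RoaringBitmap`:

    type Candidates = Vec<u32>;

    enum Outcome {
        // a concluded bucket, ranked by attribute position
        Bucket(Candidates),
        // no conclusion: forward the remaining candidates unchanged
        PassThrough,
    }

    fn handle(found: Option<Candidates>) -> Outcome {
        match found {
            Some(candidates) => Outcome::Bucket(candidates),
            None => Outcome::PassThrough,
        }
    }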
From 0efa011e0965fa6e6a1da630d6a3c1cead9ba0e4 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 12 Apr 2021 11:19:25 +0200 Subject: [PATCH 0641/1889] Make a small code clean-up --- milli/src/search/criteria/attribute.rs | 90 ++++++++++++++------------ 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 12c6b36b8..af3e08af1 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -326,30 +326,25 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { struct Branch<'t, 'q> { query_level_iterator: QueryLevelIterator<'t, 'q>, - last_result: Option<(u32, u32, RoaringBitmap)>, + last_result: (u32, u32, RoaringBitmap), tree_level: TreeLevel, branch_size: u32, } impl<'t, 'q> Branch<'t, 'q> { fn cmp(&self, other: &Self) -> Ordering { - fn compute_rank(left: u32, branch_size: u32) -> u32 { left.saturating_sub((0..branch_size).sum()) / branch_size } - match (&self.last_result, &other.last_result) { - (Some((s_left, _, _)), Some((o_left, _, _))) => { - // we compute a rank from the left interval. - let self_rank = compute_rank(*s_left, self.branch_size); - let other_rank = compute_rank(*o_left, other.branch_size); - let left_cmp = self_rank.cmp(&other_rank).reverse(); - // on level: higher is better, - // we want to reduce highest levels first. - let level_cmp = self.tree_level.cmp(&other.tree_level); + let compute_rank = |left: u32, branch_size: u32| left.saturating_sub((0..branch_size).sum()) / branch_size; + let (s_left, _, _) = self.last_result; + let (o_left, _, _) = other.last_result; + // we compute a rank from the left interval. + let self_rank = compute_rank(s_left, self.branch_size); + let other_rank = compute_rank(o_left, other.branch_size); + let left_cmp = self_rank.cmp(&other_rank).reverse(); + // on level: higher is better, + // we want to reduce highest levels first. + let level_cmp = self.tree_level.cmp(&other.tree_level); - left_cmp.then(level_cmp) - }, - (Some(_), None) => Ordering::Greater, - (None, Some(_)) => Ordering::Less, - (None, None) => Ordering::Equal, - } + left_cmp.then(level_cmp) } } @@ -407,13 +402,15 @@ fn initialize_query_level_iterators<'t, 'q>( if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { let (tree_level, last_result) = folded_query_level_iterators.next()?; - let branch = Branch { - last_result, - tree_level, - query_level_iterator: folded_query_level_iterators, - branch_size: branch.len() as u32, - }; - positions.push(branch); + if let Some(last_result) = last_result { + let branch = Branch { + last_result, + tree_level, + query_level_iterator: folded_query_level_iterators, + branch_size: branch.len() as u32, + }; + positions.push(branch); + } } } @@ -433,28 +430,35 @@ fn set_compute_candidates<'t>( while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; - match branch.last_result.as_mut() { - Some((_, _, candidates)) => { - candidates.intersect_with(&allowed_candidates); - if candidates.len() > 0 && is_lowest_level { - // we have candidates, but we can't dig deeper, return candidates. - final_candidates = Some(std::mem::take(candidates)); - break; - } else if candidates.len() > 0 { - // we have candidates, lets dig deeper in levels. 
- let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; - let (tree_level, last_result) = query_level_iterator.next()?; + let (_, _, candidates) = &mut branch.last_result; + candidates.intersect_with(&allowed_candidates); + if candidates.is_empty() { + // we don't have candidates, get next interval. + match branch.query_level_iterator.next()? { + (_, Some(last_result)) => { + branch.last_result = last_result; + }, + // TODO clean up this + (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, + } + + } + else if is_lowest_level { + // we have candidates, but we can't dig deeper, return candidates. + final_candidates = Some(take(candidates)); + break; + } else { + // we have candidates, lets dig deeper in levels. + let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; + match query_level_iterator.next()? { + (tree_level, Some(last_result)) => { branch.query_level_iterator = query_level_iterator; branch.tree_level = tree_level; branch.last_result = last_result; - } else { - // we don't have candidates, get next interval. - let (_, last_result) = branch.query_level_iterator.next()?; - branch.last_result = last_result; - } - }, - // None = no candidates to find. - None => break, + }, + // TODO clean up this + (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, + } } } From 2b036449be4f2c4a1ca15d5b4d1cfed3a6828e07 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 13 Apr 2021 15:06:12 +0200 Subject: [PATCH 0642/1889] Fix the return of equal candidates in different pages --- milli/src/search/criteria/attribute.rs | 79 +++++++++++++++++--------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index af3e08af1..8d150730f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,5 +1,6 @@ use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; use std::collections::{BTreeMap, HashMap, btree_map}; +use std::collections::binary_heap::PeekMut; use std::mem::take; use roaring::RoaringBitmap; @@ -332,13 +333,26 @@ struct Branch<'t, 'q> { } impl<'t, 'q> Branch<'t, 'q> { - fn cmp(&self, other: &Self) -> Ordering { - let compute_rank = |left: u32, branch_size: u32| left.saturating_sub((0..branch_size).sum()) / branch_size; - let (s_left, _, _) = self.last_result; - let (o_left, _, _) = other.last_result; + fn next(&mut self) -> heed::Result { + match self.query_level_iterator.next()? { + (tree_level, Some(last_result)) => { + self.last_result = last_result; + self.tree_level = tree_level; + Ok(true) + }, + (_, None) => Ok(false), + } + } + + fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. - let self_rank = compute_rank(s_left, self.branch_size); - let other_rank = compute_rank(o_left, other.branch_size); + let (left, _, _) = self.last_result; + left.saturating_sub((0..self.branch_size).sum()) * 60 / self.branch_size + } + + fn cmp(&self, other: &Self) -> Ordering { + let self_rank = self.compute_rank(); + let other_rank = other.compute_rank(); let left_cmp = self_rank.cmp(&other_rank).reverse(); // on level: higher is better, // we want to reduce highest levels first. 
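A standalone check of the rank arithmetic above (a hedged sketch: the `* 60` scaling, which only serves to keep later divisions exact, is dropped here). The merged interval's left bound is the sum of the component words' start positions, so stripping the minimal consecutive-word offsets and dividing by the branch size recovers an average start position:

    fn compute_rank(left: u32, branch_size: u32) -> u32 {
        left.saturating_sub((0..branch_size).sum()) / branch_size
    }

    fn main() {
        // three consecutive words at positions 10, 11 and 12 merge into an
        // interval whose left bound is 10 + 11 + 12 = 33; their rank is the
        // common start position of the phrase
        assert_eq!(compute_rank(33, 3), 10);
    }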
@@ -426,44 +440,53 @@ fn set_compute_candidates<'t>( { let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; let lowest_level = TreeLevel::min_value(); - let mut final_candidates = None; + let mut final_candidates: Option<(u32, RoaringBitmap)> = None; while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; + let branch_rank = branch.compute_rank(); let (_, _, candidates) = &mut branch.last_result; candidates.intersect_with(&allowed_candidates); if candidates.is_empty() { // we don't have candidates, get next interval. - match branch.query_level_iterator.next()? { - (_, Some(last_result)) => { - branch.last_result = last_result; - }, - // TODO clean up this - (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, - } - + if !branch.next()? { PeekMut::pop(branch); } } else if is_lowest_level { // we have candidates, but we can't dig deeper, return candidates. - final_candidates = Some(take(candidates)); - break; + final_candidates = match final_candidates.take() { + Some((best_rank, mut best_candidates)) => { + // if current is worst than best we break to return + // candidates that correspond to the best rank + if branch_rank > best_rank { + final_candidates = Some((best_rank, best_candidates)); + break; + // else we add current candidates to best candidates + // and we fetch the next page + } else { + best_candidates.union_with(candidates); + if !branch.next()? { PeekMut::pop(branch); } + Some((best_rank, best_candidates)) + } + }, + // we take current candidates as best candidates + // and we fetch the next page + None => { + let candidates = take(candidates); + if !branch.next()? { PeekMut::pop(branch); } + Some((branch_rank, candidates)) + }, + }; } else { // we have candidates, lets dig deeper in levels. - let mut query_level_iterator = branch.query_level_iterator.dig(ctx)?; - match query_level_iterator.next()? { - (tree_level, Some(last_result)) => { - branch.query_level_iterator = query_level_iterator; - branch.tree_level = tree_level; - branch.last_result = last_result; - }, - // TODO clean up this - (_, None) => { std::collections::binary_heap::PeekMut::<'_, Branch<'_, '_>>::pop(branch); }, - } + branch.query_level_iterator = branch.query_level_iterator.dig(ctx)?; + if !branch.next()? { PeekMut::pop(branch); } } } - Ok(final_candidates) + Ok(final_candidates.map(|(_rank, candidates)| { + candidates + })) } fn linear_compute_candidates( From f8537900168265841993c4eb0bbd3fcc539a76b4 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 13 Apr 2021 18:25:38 +0200 Subject: [PATCH 0643/1889] Use the LCM of 10 first numbers to compute attribute rank --- milli/src/search/criteria/attribute.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 8d150730f..5ab60c58d 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -11,6 +11,10 @@ use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use super::{Criterion, CriterionResult, Context, resolve_query_tree}; +/// To be able to divide integers by the number of words in the query +/// we want to find a multiplier that allow us to divide by any number between 1 and 10. +/// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). 
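+/// For example, 2520 / 7 = 360 and 2520 / 9 = 280: every branch size from
+/// 1 to 10 divides 2520, so scaling a rank by this constant before the
+/// final division by the branch size keeps the integer arithmetic exact.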
+const LCM_10_FIRST_NUMBERS: u32 = 2520; pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -347,7 +351,7 @@ impl<'t, 'q> Branch<'t, 'q> { fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. let (left, _, _) = self.last_result; - left.saturating_sub((0..self.branch_size).sum()) * 60 / self.branch_size + left.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size } fn cmp(&self, other: &Self) -> Ordering { @@ -545,7 +549,7 @@ fn linear_compute_candidates( // we substract the word index to the position. let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); // here we do the means of the words of the branch - min_rank = min_rank.min(branch_rank / branch_len as u64); + min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); } } From 716c8e22b0bb82a65f2d9320af8d3d68ffc9a79f Mon Sep 17 00:00:00 2001 From: many Date: Thu, 15 Apr 2021 10:44:27 +0200 Subject: [PATCH 0644/1889] Add style and comments --- milli/src/search/criteria/attribute.rs | 51 +++++++++++++++----------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5ab60c58d..2672169de 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -15,6 +15,7 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// we want to find a multiplier that allow us to divide by any number between 1 and 10. /// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -134,6 +135,9 @@ impl<'t> Criterion for Attribute<'t> { } } +/// WordLevelIterator is an pseudo-Iterator over intervals of word-position for one word, +/// it will begin at the first non-empty interval and will return every interval without +/// jumping over empty intervals. struct WordLevelIterator<'t, 'q> { inner: Box> + 't>, level: TreeLevel, @@ -197,12 +201,14 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { } } +/// QueryLevelIterator is an pseudo-Iterator for a Query, +/// It contains WordLevelIterators and is chainned with other QueryLevelIterator. struct QueryLevelIterator<'t, 'q> { - previous: Option>>, + parent: Option>>, inner: Vec>, level: TreeLevel, accumulator: Vec>, - previous_accumulator: Vec>, + parent_accumulator: Vec>, } impl<'t, 'q> QueryLevelIterator<'t, 'q> { @@ -239,26 +245,27 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); match highest { Some(level) => Ok(Some(Self { - previous: None, + parent: None, inner, level, accumulator: vec![], - previous_accumulator: vec![], + parent_accumulator: vec![], })), None => Ok(None), } } - fn previous(&mut self, previous: QueryLevelIterator<'t, 'q>) -> &Self { - self.previous = Some(Box::new(previous)); + fn parent(&mut self, parent: QueryLevelIterator<'t, 'q>) -> &Self { + self.parent = Some(Box::new(parent)); self } + /// create a new QueryLevelIterator with a lower level than the current one. 
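+    /// The parent, if any, digs first; the new iterator adopts the parent's
+    /// new level capped at its own current level (one level lower when there
+    /// is no parent) and re-opens every inner `WordLevelIterator` there.
+    /// Each level down splits an interval into four finer ones, since an
+    /// interval at level `n` spans `4^n` positions.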
fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result { - let (level, previous) = match &self.previous { - Some(previous) => { - let previous = previous.dig(ctx)?; - (previous.level.min(self.level), Some(Box::new(previous))) + let (level, parent) = match &self.parent { + Some(parent) => { + let parent = parent.dig(ctx)?; + (parent.level.min(self.level), Some(Box::new(parent))) }, None => (self.level.saturating_sub(1), None), }; @@ -268,7 +275,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { inner.push(word_level_iterator.dig(ctx, &level)?); } - Ok(Self {previous, inner, level, accumulator: vec![], previous_accumulator: vec![]}) + Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![]}) } @@ -295,29 +302,31 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { Ok(accumulated) } + /// return the next meta-interval created from inner WordLevelIterators, + /// and from eventual chainned QueryLevelIterator. fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { - let previous_result = match self.previous.as_mut() { - Some(previous) => { - Some(previous.next()?) + let parent_result = match self.parent.as_mut() { + Some(parent) => { + Some(parent.next()?) }, None => None, }; - match previous_result { - Some((previous_level, previous_next)) => { - let inner_next = self.inner_next(previous_level)?; + match parent_result { + Some((parent_level, parent_next)) => { + let inner_next = self.inner_next(parent_level)?; self.accumulator.push(inner_next); - self.previous_accumulator.push(previous_next); + self.parent_accumulator.push(parent_next); // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, // WARNING the cleaned intervals count needs to be kept to skip at the end let mut merged_interval = None; - for current in self.accumulator.iter().rev().zip(self.previous_accumulator.iter()) { + for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()) { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); merged_docids.union_with(&(a & b)); } } - Ok((previous_level, merged_interval)) + Ok((parent_level, merged_interval)) }, None => { let level = self.level.clone(); @@ -412,7 +421,7 @@ fn initialize_query_level_iterators<'t, 'q>( .rev() .fold(None, |fold: Option, mut qli| match fold { Some(fold) => { - qli.previous(fold); + qli.parent(fold); Some(qli) }, None => Some(qli), From e77291a6f3065824779d630c9fd4449869b338ee Mon Sep 17 00:00:00 2001 From: many Date: Thu, 15 Apr 2021 12:22:44 +0200 Subject: [PATCH 0645/1889] Optimize Atrribute criterion on big requests --- milli/src/search/criteria/attribute.rs | 160 +++++++++++++++---------- 1 file changed, 97 insertions(+), 63 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 2672169de..745d8cdb0 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -101,7 +101,7 @@ impl<'t> Criterion for Attribute<'t> { }, (Some(qt), None) => { let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; - self.bucket_candidates.union_with(&query_tree_candidates); + self.bucket_candidates |= &query_tree_candidates; self.candidates = Some(query_tree_candidates); }, (None, Some(_)) => { @@ -123,7 +123,7 @@ impl<'t> Criterion for Attribute<'t> { Some(CriterionResult { query_tree, candidates, 
bucket_candidates }) => { self.query_tree = query_tree; self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); + self.bucket_candidates |= bucket_candidates; self.flattened_query_tree = None; self.current_buckets = None; }, @@ -160,14 +160,12 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { } } - fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel) -> heed::Result { + fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { let level = level.min(&self.level).clone(); let interval_size = 4u32.pow(Into::::into(level.clone()) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; - // TODO try to dig starting from the current interval - // let left = self.current_interval.map(|(left, _)| left); - let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; + let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) } @@ -209,6 +207,7 @@ struct QueryLevelIterator<'t, 'q> { level: TreeLevel, accumulator: Vec>, parent_accumulator: Vec>, + interval_to_skip: usize, } impl<'t, 'q> QueryLevelIterator<'t, 'q> { @@ -250,6 +249,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { level, accumulator: vec![], parent_accumulator: vec![], + interval_to_skip: 0, })), None => Ok(None), } @@ -270,16 +270,15 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { None => (self.level.saturating_sub(1), None), }; + let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten(); let mut inner = Vec::with_capacity(self.inner.len()); for word_level_iterator in self.inner.iter() { - inner.push(word_level_iterator.dig(ctx, &level)?); + inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); } - Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![]}) + Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0}) } - - fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; let u8_level = Into::::into(level); @@ -289,12 +288,13 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { if let Some((next_left, _, next_docids)) = wli.next()? { - accumulated = accumulated.take().map( - |(acc_left, acc_right, mut acc_docids)| { - acc_docids.union_with(&next_docids); - (acc_left, acc_right, acc_docids) - } - ).or_else(|| Some((next_left, next_left + interval_size, next_docids))); + accumulated = match accumulated.take(){ + Some((acc_left, acc_right, mut acc_docids)) => { + acc_docids |= next_docids; + Some((acc_left, acc_right, acc_docids)) + }, + None => Some((next_left, next_left + interval_size, next_docids)), + }; } } } @@ -304,35 +304,59 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { /// return the next meta-interval created from inner WordLevelIterators, /// and from eventual chainned QueryLevelIterator. - fn next(&mut self) -> heed::Result<(TreeLevel, Option<(u32, u32, RoaringBitmap)>)> { + fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result> { let parent_result = match self.parent.as_mut() { Some(parent) => { - Some(parent.next()?) + Some(parent.next(allowed_candidates, tree_level)?) 
}, None => None, }; match parent_result { - Some((parent_level, parent_next)) => { - let inner_next = self.inner_next(parent_level)?; + Some(parent_next) => { + let inner_next = self.inner_next(tree_level)?; + self.interval_to_skip += self.accumulator.iter().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip).take_while(|current| { + match current { + (Some((_, _, inner)), Some((_, _, parent))) => { + inner.is_disjoint(allowed_candidates) && parent.is_empty() + }, + (Some((_, _, inner)), None) => { + inner.is_disjoint(allowed_candidates) + }, + (None, Some((_, _, parent))) => { + parent.is_empty() + }, + (None, None) => true, + } + }).count(); self.accumulator.push(inner_next); self.parent_accumulator.push(parent_next); - // TODO @many clean firsts intervals of both accumulators when both RoaringBitmap are empty, - // WARNING the cleaned intervals count needs to be kept to skip at the end - let mut merged_interval = None; - for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()) { + let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; + + for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { - let (_, _, merged_docids) = merged_interval.get_or_insert_with(|| (left_a + left_b, right_a + right_b, RoaringBitmap::new())); - merged_docids.union_with(&(a & b)); + match merged_interval.as_mut() { + Some((_, _, merged_docids)) => *merged_docids |= a & b, + None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)), + } } } - Ok((parent_level, merged_interval)) + Ok(merged_interval) }, None => { - let level = self.level.clone(); - let next_interval = self.inner_next(level.clone())?; - self.accumulator = vec![next_interval.clone()]; - Ok((level, next_interval)) + let level = self.level; + match self.inner_next(level)? { + Some((left, right, mut candidates)) => { + self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; + candidates &= allowed_candidates; + Ok(Some((left, right, candidates))) + + }, + None => { + self.accumulator = vec![None]; + Ok(None) + }, + } } } } @@ -346,17 +370,31 @@ struct Branch<'t, 'q> { } impl<'t, 'q> Branch<'t, 'q> { - fn next(&mut self) -> heed::Result { - match self.query_level_iterator.next()? { - (tree_level, Some(last_result)) => { + fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result { + let tree_level = self.query_level_iterator.level; + match self.query_level_iterator.next(allowed_candidates, tree_level)? { + Some(last_result) => { self.last_result = last_result; self.tree_level = tree_level; Ok(true) }, - (_, None) => Ok(false), + None => Ok(false), } } + fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> { + self.query_level_iterator = self.query_level_iterator.dig(ctx)?; + Ok(()) + } + + fn lazy_next(&mut self) { + let u8_level = Into::::into(self.tree_level.clone()); + let interval_size = 4u32.pow(u8_level as u32); + let (left, right, _) = self.last_result; + + self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); + } + fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. let (left, _, _) = self.last_result; @@ -367,11 +405,11 @@ impl<'t, 'q> Branch<'t, 'q> { let self_rank = self.compute_rank(); let other_rank = other.compute_rank(); let left_cmp = self_rank.cmp(&other_rank).reverse(); - // on level: higher is better, - // we want to reduce highest levels first. 
- let level_cmp = self.tree_level.cmp(&other.tree_level); + // on level: lower is better, + // we want to dig faster into levels on interesting branches. + let level_cmp = self.tree_level.cmp(&other.tree_level).reverse(); - left_cmp.then(level_cmp) + left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len())) } } @@ -398,6 +436,7 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, branches: &'q Vec>>, + allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result>> { @@ -418,7 +457,6 @@ fn initialize_query_level_iterators<'t, 'q>( branch_positions.sort_unstable_by_key(|qli| qli.level); let folded_query_level_iterators = branch_positions .into_iter() - .rev() .fold(None, |fold: Option, mut qli| match fold { Some(fold) => { qli.parent(fold); @@ -428,7 +466,8 @@ fn initialize_query_level_iterators<'t, 'q>( }); if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { - let (tree_level, last_result) = folded_query_level_iterators.next()?; + let tree_level = folded_query_level_iterators.level; + let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; if let Some(last_result) = last_result { let branch = Branch { last_result, @@ -451,48 +490,43 @@ fn set_compute_candidates<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { - let mut branches_heap = initialize_query_level_iterators(ctx, branches, wdcache)?; + let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; let lowest_level = TreeLevel::min_value(); let mut final_candidates: Option<(u32, RoaringBitmap)> = None; + let mut allowed_candidates = allowed_candidates.clone(); while let Some(mut branch) = branches_heap.peek_mut() { let is_lowest_level = branch.tree_level == lowest_level; let branch_rank = branch.compute_rank(); - let (_, _, candidates) = &mut branch.last_result; - candidates.intersect_with(&allowed_candidates); + // if current is worst than best we break to return + // candidates that correspond to the best rank + if let Some((best_rank, _)) = final_candidates { if branch_rank > best_rank { break; } } + let _left = branch.last_result.0; + let candidates = take(&mut branch.last_result.2); if candidates.is_empty() { // we don't have candidates, get next interval. - if !branch.next()? { PeekMut::pop(branch); } + if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } } else if is_lowest_level { - // we have candidates, but we can't dig deeper, return candidates. + // we have candidates, but we can't dig deeper. + allowed_candidates -= &candidates; final_candidates = match final_candidates.take() { + // we add current candidates to best candidates Some((best_rank, mut best_candidates)) => { - // if current is worst than best we break to return - // candidates that correspond to the best rank - if branch_rank > best_rank { - final_candidates = Some((best_rank, best_candidates)); - break; - // else we add current candidates to best candidates - // and we fetch the next page - } else { - best_candidates.union_with(candidates); - if !branch.next()? { PeekMut::pop(branch); } - Some((best_rank, best_candidates)) - } + best_candidates |= candidates; + branch.lazy_next(); + Some((best_rank, best_candidates)) }, // we take current candidates as best candidates - // and we fetch the next page None => { - let candidates = take(candidates); - if !branch.next()? 
{ PeekMut::pop(branch); } + branch.lazy_next(); Some((branch_rank, candidates)) }, }; } else { // we have candidates, lets dig deeper in levels. - branch.query_level_iterator = branch.query_level_iterator.dig(ctx)?; - if !branch.next()? { PeekMut::pop(branch); } + branch.dig(ctx)?; + if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } } } From 71740805a7c2f45eff9db63f5ae4e4705352d189 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 21 Apr 2021 11:44:29 +0200 Subject: [PATCH 0646/1889] Fix forgotten typo tests --- Cargo.lock | 14 ++++++++++++-- milli/Cargo.toml | 2 +- milli/src/search/criteria/typo.rs | 5 +++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 065be362f..0e42f60f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1005,7 +1005,7 @@ dependencies = [ "heed", "jemallocator", "milli", - "roaring", + "roaring 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json", "stderrlog", "structopt", @@ -1287,7 +1287,7 @@ dependencies = [ "rand 0.8.3", "rayon", "regex", - "roaring", + "roaring 0.6.5 (git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops)", "serde", "serde_json", "slice-group-by", @@ -1973,6 +1973,16 @@ dependencies = [ "retain_mut", ] +[[package]] +name = "roaring" +version = "0.6.5" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops#6689f8c9dd2efdbfde4442d4d803e87169780593" +dependencies = [ + "bytemuck", + "byteorder", + "retain_mut", +] + [[package]] name = "rustc_version" version = "0.2.3" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ef9c64b7b..b54c0d768 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,7 +27,7 @@ once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" regex = "1.4.3" -roaring = "0.6.5" +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "optimize-ops" } serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } slice-group-by = "0.2.6" diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index bf58fa258..5a3c93ac8 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -328,6 +328,7 @@ mod test { let parent = Initial::new(query_tree, facet_candidates); let mut criteria = Typo::new(&context, Box::new(parent)); + assert!(criteria.next(&mut wdcache).unwrap().unwrap().candidates.is_none()); assert!(criteria.next(&mut wdcache).unwrap().is_none()); } @@ -440,7 +441,7 @@ mod test { ]), ])), candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: candidates_1 & &facet_candidates, + bucket_candidates: facet_candidates.clone(), }; assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); @@ -462,7 +463,7 @@ mod test { ]), ])), candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: candidates_2 & &facet_candidates, + bucket_candidates: RoaringBitmap::new(), }; assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); From 0d7d3ce802d4e1ef5226bb90d1bc65f140fdf104 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 21 Apr 2021 11:53:07 +0200 Subject: [PATCH 0647/1889] Update roaring package --- Cargo.lock | 18 ++++-------------- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e42f60f4..6a30891ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1005,7 +1005,7 @@ dependencies = [ "heed", "jemallocator", "milli", - "roaring 0.6.5 
(registry+https://github.com/rust-lang/crates.io-index)", + "roaring", "serde_json", "stderrlog", "structopt", @@ -1287,7 +1287,7 @@ dependencies = [ "rand 0.8.3", "rayon", "regex", - "roaring 0.6.5 (git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops)", + "roaring", "serde", "serde_json", "slice-group-by", @@ -1964,19 +1964,9 @@ checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" [[package]] name = "roaring" -version = "0.6.5" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6744a4a918e91359ad1d356a91e2e943a86d9fb9ae77f715d617032ea2af88f" -dependencies = [ - "bytemuck", - "byteorder", - "retain_mut", -] - -[[package]] -name = "roaring" -version = "0.6.5" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=optimize-ops#6689f8c9dd2efdbfde4442d4d803e87169780593" +checksum = "a4b2e7ab0bbb2d144558ae3f4761a0db06d21463b45756fc64c3393cdba3d447" dependencies = [ "bytemuck", "byteorder", diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 59cfbd661..8b5867fde 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -11,7 +11,7 @@ csv = "1.1.5" heed = "0.10.6" jemallocator = "0.3.2" milli = { path = "../milli" } -roaring = "0.6.5" +roaring = "0.6.6" serde_json = "1.0.62" stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b54c0d768..8b359a09b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,7 +27,7 @@ once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" regex = "1.4.3" -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "optimize-ops" } +roaring = "0.6.6" serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } slice-group-by = "0.2.6" From 0daa0e170ac5f81d517aca2384ec4fcb237fe76e Mon Sep 17 00:00:00 2001 From: Many Date: Mon, 26 Apr 2021 11:30:42 +0200 Subject: [PATCH 0648/1889] Fix PR comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- infos/src/main.rs | 3 +-- milli/src/search/criteria/attribute.rs | 2 +- milli/src/search/criteria/final.rs | 6 +----- milli/src/search/criteria/mod.rs | 10 +++++++++- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 5a12a9d4d..902394af8 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -354,8 +354,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_level_position_docids, facet_field_id_value_docids, field_id_docid_facet_values: _, - documents, - .. + documents } = index; let main_name = "main"; diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 745d8cdb0..18a18816c 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -13,7 +13,7 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// To be able to divide integers by the number of words in the query /// we want to find a multiplier that allow us to divide by any number between 1 and 10. -/// We Choosed the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). +/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). 
const LCM_10_FIRST_NUMBERS: u32 = 2520; pub struct Attribute<'t> { diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index d3c394467..f8bc43204 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -44,11 +44,7 @@ impl<'t> Final<'t> { bucket_candidates.union_with(&candidates); - return Ok(Some(FinalResult { - query_tree, - candidates, - bucket_candidates, - })); + return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); }, None => return Ok(None), } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b972a0b2c..d3eac94fd 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -123,7 +123,15 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { Ok(words_positions) } - fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>> { + fn word_position_iterator( + &self, + word: &str, + level: TreeLevel, + in_prefix_cache: bool, + left: Option, + right: Option + ) -> heed::Result> + 'c>> + { let range = { let left = left.unwrap_or(u32::min_value()); let right = right.unwrap_or(u32::max_value()); From 47d780b8ce43fad2631efb473e25b5ea12992476 Mon Sep 17 00:00:00 2001 From: Many Date: Mon, 26 Apr 2021 14:51:52 +0200 Subject: [PATCH 0649/1889] Update milli/src/search/criteria/mod.rs Co-authored-by: Irevoire --- milli/src/search/criteria/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d3eac94fd..01af1ffbd 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -130,7 +130,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { in_prefix_cache: bool, left: Option, right: Option - ) -> heed::Result> + 'c>> + ) -> heed::Result> + 'c>> { let range = { let left = left.unwrap_or(u32::min_value()); From 0e4e6dfada834728644df9a5ab5afb8698a6c85f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:29:52 +0200 Subject: [PATCH 0650/1889] Update milli/src/search/criteria/proximity.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/proximity.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index ca412bf28..4c73d7459 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -30,7 +30,7 @@ impl<'t> Proximity<'t> { state: None, proximity: 0, bucket_candidates: RoaringBitmap::new(), - parent: parent, + parent, candidates_cache: Cache::new(), plane_sweep_cache: None, } From 498c2b298c795810726c18f8c95ef16ce490c02d Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:30:02 +0200 Subject: [PATCH 0651/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 18a18816c..820085c31 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -57,7 +57,7 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let found_candidates = if candidates.len() < 
1_000 { + let found_candidates = if candidates.len() < 1000 { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { From b3d6c6a9a0e8447daefa92b022c03fe39d5b08e3 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:31:13 +0200 Subject: [PATCH 0652/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 820085c31..31725e221 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -77,9 +77,7 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - let found_candidates = set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)?; - - match found_candidates { + match set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? { Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { From e92d13767667f7ee5c4bfca6fca339d8c26c5e85 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:31:42 +0200 Subject: [PATCH 0653/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 31725e221..a1a31247b 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -453,7 +453,7 @@ fn initialize_query_level_iterators<'t, 'q>( } // QueryLevelIterators need to be sorted by level and folded in descending order. 
branch_positions.sort_unstable_by_key(|qli| qli.level); - let folded_query_level_iterators = branch_positions + let folded_query_level_iterators = branch_positions .into_iter() .fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold { Some(fold) => { From c862b1bc6be8c364a474104139f90272ccd1787f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:32:10 +0200 Subject: [PATCH 0654/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index a1a31247b..c7d10e431 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -465,7 +465,7 @@ fn initialize_query_level_iterators<'t, 'q>( if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { let tree_level = folded_query_level_iterators.level; - let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; + let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; if let Some(last_result) = last_result { let branch = Branch { last_result, From 3b1358b62f539ec6f74a74deb40850dbff6ba34f Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:32:19 +0200 Subject: [PATCH 0655/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index c7d10e431..8f2e34ca9 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -554,7 +554,7 @@ fn linear_compute_candidates( QueryKind::Exact { word, .. } => { if *prefix { word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()).min() + .flat_map(|positions| positions.iter().next()).min() } else { words_positions.get(word) .map(|positions| positions.iter().next()) From 329bd4a1bbe4ddfe408fb33611f7d7d8e6d91661 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:39:03 +0200 Subject: [PATCH 0656/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 8f2e34ca9..62e992fad 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -304,9 +304,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { /// and from any chained QueryLevelIterator. fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { let parent_result = match self.parent.as_mut() { - Some(parent) => { - Some(parent.next(allowed_candidates, tree_level)?) 
- }, + Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), None => None, }; From 3794ffc9529d89bf6e965ad0c99eb477a5881bc6 Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:39:23 +0200 Subject: [PATCH 0657/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 62e992fad..3d7132e77 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -496,7 +496,9 @@ fn set_compute_candidates<'t>( let branch_rank = branch.compute_rank(); // if current is worse than best we break to return // candidates that correspond to the best rank - if let Some((best_rank, _)) = final_candidates { if branch_rank > best_rank { break; } } + if let Some((best_rank, _)) = final_candidates { + if branch_rank > best_rank { break } + } let _left = branch.last_result.0; let candidates = take(&mut branch.last_result.2); if candidates.is_empty() { From 0add4d735c95ed8bcddeb2e2afa853bab7dcf62e Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 27 Apr 2021 17:40:34 +0200 Subject: [PATCH 0658/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 3d7132e77..e1069b5f5 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -529,9 +529,7 @@ fn set_compute_candidates<'t>( } - Ok(final_candidates.map(|(_rank, candidates)| { - candidates - })) + Ok(final_candidates.map(|(_rank, candidates)| candidates)) } fn linear_compute_candidates( From 3b7e6afb55e76749bb69b11e5ab15d488bc8924f Mon Sep 17 00:00:00 2001 From: many Date: Wed, 28 Apr 2021 13:53:27 +0200 Subject: [PATCH 0659/1889] Make some refactoring and add documentation --- milli/src/search/criteria/attribute.rs | 64 ++++++++++++++++++-------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index e1069b5f5..bbbc0de1a 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -16,6 +16,10 @@ use super::{Criterion, CriterionResult, Context, resolve_query_tree}; /// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; +/// To compute the interval size of a level, +/// we use 4 as the exponentiation base and the level as the exponent. +const LEVEL_EXPONENTIATION_BASE: u32 = 4; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -150,7 +154,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> { match ctx.word_position_last_level(&word, in_prefix_cache)? 
{ Some(level) => { - let interval_size = 4u32.pow(Into::<u8>::into(level.clone()) as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32); let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) }, @@ -160,7 +164,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> { let level = level.min(&self.level).clone(); - let interval_size = 4u32.pow(Into::<u8>::into(level.clone()) as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; @@ -280,10 +284,10 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; let u8_level = Into::<u8>::into(level); - let interval_size = 4u32.pow(u8_level as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); for wli in self.inner.iter_mut() { let wli_u8_level = Into::<u8>::into(wli.level.clone()); - let accumulated_count = 4u32.pow((u8_level - wli_u8_level) as u32); + let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { if let Some((next_left, _, next_docids)) = wli.next()? { accumulated = match accumulated.take() { @@ -311,20 +315,12 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { match parent_result { Some(parent_next) => { let inner_next = self.inner_next(tree_level)?; - self.interval_to_skip += self.accumulator.iter().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip).take_while(|current| { - match current { - (Some((_, _, inner)), Some((_, _, parent))) => { - inner.is_disjoint(allowed_candidates) && parent.is_empty() - }, - (Some((_, _, inner)), None) => { - inner.is_disjoint(allowed_candidates) - }, - (None, Some((_, _, parent))) => { - parent.is_empty() - }, - (None, None) => true, - } - }).count(); + self.interval_to_skip += interval_to_skip( + &self.parent_accumulator, + &self.accumulator, + self.interval_to_skip, + allowed_candidates + ); self.accumulator.push(inner_next); self.parent_accumulator.push(parent_next); let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; @@ -358,6 +354,29 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { } } +/// Count the number of intervals that can be skipped when we make the cross-intersections +/// in order to compute the next meta-interval. +/// A pair of intervals is skipped when neither interval contains any allowed docids. 
+fn interval_to_skip( + parent_accumulator: &[Option<(u32, u32, RoaringBitmap)>], + current_accumulator: &[Option<(u32, u32, RoaringBitmap)>], + already_skipped: usize, + allowed_candidates: &RoaringBitmap, +) -> usize { + parent_accumulator.into_iter() + .zip(current_accumulator.into_iter()) + .skip(already_skipped) + .take_while(|(parent, current)| { + let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); + let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); + skip_parent && skip_current + }) + .count() + +} + +/// A Branch represents a possible alternative of the original query and is built from the Query Tree; +/// it allows us to iterate over meta-intervals of positions and to dig into one when it contains interesting candidates. struct Branch<'t, 'q> { query_level_iterator: QueryLevelIterator<'t, 'q>, last_result: (u32, u32, RoaringBitmap), @@ -366,6 +385,8 @@ struct Branch<'t, 'q> { } impl<'t, 'q> Branch<'t, 'q> { + /// return the next meta-interval of the branch, + /// and update the inner interval in order to be ranked by the BinaryHeap. fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result<bool> { let tree_level = self.query_level_iterator.level; match self.query_level_iterator.next(allowed_candidates, tree_level)? { @@ -378,19 +399,24 @@ impl<'t, 'q> Branch<'t, 'q> { } } + /// make the current Branch iterate over smaller intervals. fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> { self.query_level_iterator = self.query_level_iterator.dig(ctx)?; Ok(()) } + /// because the next() method can be time consuming, + /// update the inner interval so it can be ranked by the binary_heap without computing it; + /// the next() method should only be called when the real interval is needed. fn lazy_next(&mut self) { let u8_level = Into::<u8>::into(self.tree_level.clone()); - let interval_size = 4u32.pow(u8_level as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); let (left, right, _) = self.last_result; self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); } + /// return the score of the current inner interval. fn compute_rank(&self) -> u32 { // we compute a rank from the left interval. 
let (left, _, _) = self.last_result; From 31607bf9cd223df41c61651d6b1c9384bc4e50bd Mon Sep 17 00:00:00 2001 From: many Date: Tue, 27 Apr 2021 14:53:37 +0200 Subject: [PATCH 0660/1889] Add a threshold on proximity when choosing between linear/set algorithm --- milli/src/search/criteria/proximity.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 4c73d7459..4ed6dd401 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -55,7 +55,7 @@ impl<'t> Criterion for Proximity<'t> { if self.proximity as usize > *max_prox { self.state = None; // reset state } else { - let mut new_candidates = if candidates.len() <= 1000 { + let mut new_candidates = if candidates.len() <= 1000 && self.proximity > 0 { if let Some(cache) = self.plane_sweep_cache.as_mut() { match cache.next() { Some((p, candidates)) => { From 566c4a53c57e4daa67b1ddbac1d8df837f3e495b Mon Sep 17 00:00:00 2001 From: Yann Simon Date: Thu, 29 Apr 2021 09:25:35 +0200 Subject: [PATCH 0661/1889] do not use echo that escapes newlines Fix https://github.com/meilisearch/milli/issues/175 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 50a64e079..13d35380a 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ All of that on a 39$/month machine with 4cores. You can feed the engine with your CSV (comma-separated, yes) data like this: ```bash -echo "name,age\nhello,32\nkiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv +printf "name,age\nhello,32\nkiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv ``` Here ids will be automatically generated as UUID v4 if they don't exist in some or all documents. From ee09e50e7f96a6144aa8bbbc9a850775281bb38b Mon Sep 17 00:00:00 2001 From: many Date: Wed, 28 Apr 2021 18:01:23 +0200 Subject: [PATCH 0662/1889] Remove excluded documents in criteria iterations - pass excluded documents to criteria to remove them in higher levels of the bucket-sort - merge already returned documents with excluded documents to avoid duplicates Related to #125 and #112 Fix #170 --- milli/src/search/criteria/asc_desc.rs | 13 ++--- milli/src/search/criteria/attribute.rs | 22 +++++--- milli/src/search/criteria/final.rs | 30 +++++++---- milli/src/search/criteria/initial.rs | 5 +- milli/src/search/criteria/mod.rs | 8 ++- milli/src/search/criteria/proximity.rs | 38 ++++++++++--- milli/src/search/criteria/typo.rs | 74 ++++++++++++++++++-------- milli/src/search/criteria/words.rs | 13 +++-- milli/src/search/mod.rs | 8 +-- 9 files changed, 149 insertions(+), 62 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index d2841d449..54cbb0fae 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -12,9 +12,8 @@ use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; use crate::{FieldsIdsMap, FieldId, Index}; -use super::{Criterion, CriterionResult}; +use super::{Criterion, CriterionParameters, CriterionResult}; /// Threshold on the number of candidates that will make /// the system choose between one algorithm or another. 
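This patch threads an exclusion set through the whole criterion chain: before a criterion buckets its candidates, it subtracts the documents that were already returned lower in the chain. The flow can be pictured with a minimal, self-contained sketch; `Bucket` and its `next` method are hypothetical stand-ins for the real milli criteria, and only the `RoaringBitmap` operations mirror the patch:

```rust
use roaring::RoaringBitmap;

// Simplified stand-in for milli's CriterionParameters.
struct CriterionParameters<'a> {
    excluded_candidates: &'a RoaringBitmap,
}

// Hypothetical one-bucket criterion.
struct Bucket {
    candidates: RoaringBitmap,
}

impl Bucket {
    // Remove the already-returned documents before handing the bucket out,
    // the same subtraction the patched criteria perform.
    fn next(&mut self, params: &CriterionParameters) -> Option<RoaringBitmap> {
        self.candidates -= params.excluded_candidates;
        if self.candidates.is_empty() {
            None
        } else {
            Some(self.candidates.clone())
        }
    }
}

fn main() {
    let already_returned: RoaringBitmap = (1..3u32).collect();
    let mut bucket = Bucket { candidates: (0..5u32).collect() };
    let params = CriterionParameters { excluded_candidates: &already_returned };
    let next = bucket.next(&params).unwrap();
    // documents 1 and 2 were already returned, so only 0, 3 and 4 remain
    assert_eq!(next.iter().collect::<Vec<u32>>(), vec![0, 3, 4]);
}
```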
@@ -85,7 +84,7 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { + fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> { loop { debug!("Facet {}({}) iteration", if self.ascending { "Asc" } else { "Desc" }, self.field_name @@ -93,7 +92,7 @@ impl<'t> Criterion for AscDesc<'t> { match self.candidates.next().transpose()? { None => { - match self.parent.next(wdcache)? { + match self.parent.next(params)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { let candidates_is_some = candidates.is_some(); self.query_tree = query_tree; @@ -104,7 +103,8 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), params.wdcache)?; + candidates -= params.excluded_candidates; candidates.intersect_with(&self.faceted_candidates); candidates }, @@ -138,7 +138,8 @@ impl<'t> Criterion for AscDesc<'t> { None => return Ok(None), } }, - Some(candidates) => { + Some(mut candidates) => { + candidates -= params.excluded_candidates; return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index bbbc0de1a..5993f03bd 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -9,7 +9,7 @@ use crate::{TreeLevel, search::build_dfa}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; -use super::{Criterion, CriterionResult, Context, resolve_query_tree}; +use super::{Criterion, CriterionParameters, CriterionResult, Context, resolve_query_tree}; /// To be able to divide integers by the number of words in the query /// we want to find a multiplier that allows us to divide by any number between 1 and 10. @@ -20,6 +20,10 @@ const LCM_10_FIRST_NUMBERS: u32 = 2520; /// To compute the interval size of a level, /// we use 4 as the exponentiation base and the level as the exponent. const LEVEL_EXPONENTIATION_BASE: u32 = 4; +/// Threshold on the number of candidates that will make +/// the system choose between one algorithm or another. +const CANDIDATES_THRESHOLD: u64 = 1000; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, @@ -46,7 +50,12 @@ impl<'t> Attribute<'t> { impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { + fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> { + // remove excluded candidates when next is called, instead of doing it in the loop. 
+ if let Some(candidates) = self.candidates.as_mut() { + *candidates -= params.excluded_candidates; + } + loop { match (&self.query_tree, &mut self.candidates) { (_, Some(candidates)) if candidates.is_empty() => { @@ -61,7 +70,7 @@ impl<'t> Criterion for Attribute<'t> { flatten_query_tree(&qt) }); - let found_candidates = if candidates.len() < 1000 { + let found_candidates = if candidates.len() < CANDIDATES_THRESHOLD { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { @@ -81,7 +90,7 @@ impl<'t> Criterion for Attribute<'t> { }, } } else { - match set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? { + match set_compute_candidates(self.ctx, flattened_query_tree, candidates, params.wdcache)? { Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { @@ -102,7 +111,8 @@ impl<'t> Criterion for Attribute<'t> { })); }, (Some(qt), None) => { - let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; + let mut query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), params.wdcache)?; + query_tree_candidates -= params.excluded_candidates; self.bucket_candidates |= &query_tree_candidates; self.candidates = Some(query_tree_candidates); }, @@ -114,7 +124,7 @@ impl<'t> Criterion for Attribute<'t> { })); }, (None, None) => { - match self.parent.next(wdcache)? { + match self.parent.next(params)? { Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index f8bc43204..707195ba7 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Criterion, CriterionResult, Context}; +use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context}; /// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] @@ -22,27 +22,39 @@ pub struct Final<'t> { ctx: &'t dyn Context<'t>, parent: Box, wdcache: WordDerivationsCache, + returned_candidates: RoaringBitmap, } impl<'t> Final<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { - Final { ctx, parent, wdcache: WordDerivationsCache::new() } + Final { ctx, parent, wdcache: WordDerivationsCache::new(), returned_candidates: RoaringBitmap::new() } } #[logging_timer::time("Final::{}")] - pub fn next(&mut self) -> anyhow::Result<Option<FinalResult>> { + pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> anyhow::Result<Option<FinalResult>> { loop { debug!("Final iteration"); + let mut criterion_parameters = CriterionParameters { + wdcache: &mut self.wdcache, + // returned_candidates is merged with excluded_candidates to avoid duplicates + excluded_candidates: &(&self.returned_candidates | excluded_candidates), + }; - match self.parent.next(&mut self.wdcache)? { + match self.parent.next(&mut criterion_parameters)? 
{ Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, - (None, None) => self.ctx.documents_ids()?, + let candidates = match candidates { + Some(candidates) => candidates, + None => { + let candidates = match query_tree.as_ref() { + Some(qt) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, + None => self.ctx.documents_ids()?, + }; + bucket_candidates |= &candidates; + candidates + } }; - bucket_candidates.union_with(&candidates); + self.returned_candidates |= &candidates; return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); }, diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index d4b9e1379..10858dd99 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,9 +1,8 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use crate::search::WordDerivationsCache; -use super::{Criterion, CriterionResult}; +use super::{Criterion, CriterionResult, CriterionParameters}; pub struct Initial { answer: Option @@ -22,7 +21,7 @@ impl Initial { impl Criterion for Initial { #[logging_timer::time("Initial::{}")] - fn next(&mut self, _: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, _: &mut CriterionParameters) -> anyhow::Result> { Ok(self.answer.take()) } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 01af1ffbd..164937dec 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -25,7 +25,7 @@ mod words; pub mod r#final; pub trait Criterion { - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result>; + fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result>; } /// The result of a call to the parent criterion. @@ -40,6 +40,12 @@ pub struct CriterionResult { bucket_candidates: RoaringBitmap, } +#[derive(Debug, PartialEq)] +pub struct CriterionParameters<'a> { + wdcache: &'a mut WordDerivationsCache, + excluded_candidates: &'a RoaringBitmap, +} + /// Either a set of candidates that defines the candidates /// that are allowed to be returned, /// or the candidates that must never be returned. diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 4ed6dd401..08fba1447 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -8,10 +8,26 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; -use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; +use super::{ + Context, + Criterion, + CriterionParameters, + CriterionResult, + query_docids, + query_pair_proximity_docids, + resolve_query_tree, +}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; +/// Threshold on the number of candidates that will make +/// the system choose between one algorithm or another. +const CANDIDATES_THRESHOLD: u64 = 1000; + +/// Threshold on the number of proximity that will make +/// the system choose between one algorithm or another. 
+const PROXIMITY_THRESHOLD: u8 = 0; + pub struct Proximity<'t> { ctx: &'t dyn Context<'t>, /// ((max_proximity, query_tree), allowed_candidates) @@ -39,7 +55,12 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. + if let Some((_, candidates)) = self.state.as_mut() { + *candidates -= params.excluded_candidates; + } + loop { debug!("Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, @@ -55,7 +76,7 @@ impl<'t> Criterion for Proximity<'t> { if self.proximity as usize > *max_prox { self.state = None; // reset state } else { - let mut new_candidates = if candidates.len() <= 1000 && self.proximity > 0 { + let mut new_candidates = if candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { if let Some(cache) = self.plane_sweep_cache.as_mut() { match cache.next() { Some((p, candidates)) => { @@ -72,7 +93,7 @@ impl<'t> Criterion for Proximity<'t> { self.ctx, query_tree, candidates, - wdcache, + params.wdcache, )?; self.plane_sweep_cache = Some(cache.into_iter()); @@ -84,7 +105,7 @@ impl<'t> Criterion for Proximity<'t> { &query_tree, self.proximity, &mut self.candidates_cache, - wdcache, + params.wdcache, )? }; @@ -109,7 +130,7 @@ impl<'t> Criterion for Proximity<'t> { })); }, None => { - match self.parent.next(wdcache)? { + match self.parent.next(params)? { Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, @@ -121,7 +142,10 @@ impl<'t> Criterion for Proximity<'t> { let candidates_is_some = candidates.is_some(); let candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, - (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (Some(qt), None) => { + let candidates = resolve_query_tree(self.ctx, qt, &mut HashMap::new(), params.wdcache)?; + candidates - params.excluded_candidates + }, (None, None) => RoaringBitmap::new(), }; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 5a3c93ac8..f265b30ae 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -6,7 +6,15 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{ + Candidates, + Context, + Criterion, + CriterionParameters, + CriterionResult, + query_docids, + query_pair_proximity_docids +}; pub struct Typo<'t> { ctx: &'t dyn Context<'t>, @@ -34,8 +42,14 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { #[logging_timer::time("Typo::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; + // remove excluded candidates when next is called, instead of doing it in the loop. 
+ match &mut self.candidates { + Allowed(candidates) => *candidates -= params.excluded_candidates, + Forbidden(candidates) => *candidates |= params.excluded_candidates, + } + loop { debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); @@ -54,9 +68,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -67,7 +81,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - wdcache, + params.wdcache, )?; new_candidates.intersect_with(&candidates); candidates.difference_with(&new_candidates); @@ -87,9 +101,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -100,7 +114,7 @@ impl<'t> Criterion for Typo<'t> { &new_query_tree, self.number_typos, &mut self.candidates_cache, - wdcache, + params.wdcache, )?; new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); @@ -123,7 +137,7 @@ impl<'t> Criterion for Typo<'t> { })); }, (None, Forbidden(_)) => { - match self.parent.next(wdcache)? { + match self.parent.next(params)? 
{ Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, @@ -134,7 +148,9 @@ impl<'t> Criterion for Typo<'t> { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; - self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); + self.candidates = candidates.map_or_else(|| { + Candidates::Forbidden(params.excluded_candidates.clone()) + }, Candidates::Allowed); self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), @@ -324,12 +340,16 @@ mod test { let query_tree = None; let facet_candidates = None; - let mut wdcache = WordDerivationsCache::new(); + let mut criterion_parameters = CriterionParameters { + wdcache: &mut WordDerivationsCache::new(), + excluded_candidates: &RoaringBitmap::new(), + }; + let parent = Initial::new(query_tree, facet_candidates); let mut criteria = Typo::new(&context, Box::new(parent)); - assert!(criteria.next(&mut wdcache).unwrap().unwrap().candidates.is_none()); - assert!(criteria.next(&mut wdcache).unwrap().is_none()); + assert!(criteria.next(&mut criterion_parameters).unwrap().unwrap().candidates.is_none()); + assert!(criteria.next(&mut criterion_parameters).unwrap().is_none()); } #[test] @@ -345,7 +365,10 @@ mod test { let facet_candidates = None; - let mut wdcache = WordDerivationsCache::new(); + let mut criterion_parameters = CriterionParameters { + wdcache: &mut WordDerivationsCache::new(), + excluded_candidates: &RoaringBitmap::new(), + }; let parent = Initial::new(Some(query_tree), facet_candidates); let mut criteria = Typo::new(&context, Box::new(parent)); @@ -364,7 +387,7 @@ mod test { bucket_candidates: candidates_1, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); + assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -386,7 +409,7 @@ mod test { bucket_candidates: candidates_2, }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); + assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); } #[test] @@ -395,7 +418,10 @@ mod test { let query_tree = None; let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut wdcache = WordDerivationsCache::new(); + let mut criterion_parameters = CriterionParameters { + wdcache: &mut WordDerivationsCache::new(), + excluded_candidates: &RoaringBitmap::new(), + }; let parent = Initial::new(query_tree, Some(facet_candidates.clone())); let mut criteria = Typo::new(&context, Box::new(parent)); @@ -406,10 +432,10 @@ mod test { }; // first iteration, returns the facet candidates - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected)); + assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected)); // second iteration, returns None because there is no more things to do - assert!(criteria.next(&mut wdcache).unwrap().is_none()); + assert!(criteria.next(&mut criterion_parameters).unwrap().is_none()); } #[test] @@ -425,7 +451,11 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut wdcache = WordDerivationsCache::new(); + + let mut criterion_parameters = CriterionParameters { + wdcache: &mut WordDerivationsCache::new(), + excluded_candidates: &RoaringBitmap::new(), + }; let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); 
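These tests drive a two-level chain (an `Initial` criterion feeding `Typo`) by hand with an empty exclusion set. The consuming side of the same contract is the bucket-sort loop; a condensed, hypothetical driver (simplified types, error handling elided, not the actual milli code) might look like:

```rust
use roaring::RoaringBitmap;

// Keep asking the criterion chain for buckets, excluding from every
// following bucket whatever was already pushed to the result list.
fn drain_buckets(
    mut next_bucket: impl FnMut(&RoaringBitmap) -> Option<RoaringBitmap>,
    limit: usize,
) -> Vec<u32> {
    let mut excluded_candidates = RoaringBitmap::new();
    let mut documents_ids = Vec::with_capacity(limit);
    while let Some(candidates) = next_bucket(&excluded_candidates) {
        for docid in candidates.iter() {
            if documents_ids.len() == limit {
                return documents_ids;
            }
            documents_ids.push(docid);
        }
        excluded_candidates |= &candidates;
    }
    documents_ids
}
```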
let mut criteria = Typo::new(&context, Box::new(parent)); @@ -444,7 +474,7 @@ mod test { bucket_candidates: facet_candidates.clone(), }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); + assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); let candidates_2 = ( context.word_docids("split").unwrap().unwrap() @@ -466,6 +496,6 @@ mod test { bucket_candidates: RoaringBitmap::new(), }; - assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); + assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); } } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 047b3c5f0..23a45223a 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -5,7 +5,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache}; +use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree}; pub struct Words<'t> { ctx: &'t dyn Context<'t>, @@ -31,7 +31,12 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. + if let Some(candidates) = self.candidates.as_mut() { + *candidates -= params.excluded_candidates; + } + loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); @@ -45,7 +50,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Some(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, params.wdcache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -71,7 +76,7 @@ impl<'t> Criterion for Words<'t> { })); }, (None, None) => { - match self.parent.next(wdcache)? { + match self.parent.next(params)? { Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 4f0bde422..4227ab0a6 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -165,13 +165,13 @@ impl<'a> Search<'a> { ) -> anyhow::Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); - let mut excluded_documents = RoaringBitmap::new(); + let mut excluded_candidates = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next()? { + while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_candidates)? 
{ debug!("Number of candidates found {}", candidates.len()); - let excluded = take(&mut excluded_documents); + let excluded = take(&mut excluded_candidates); let mut candidates = distinct.distinct(candidates, excluded); @@ -186,7 +186,7 @@ impl<'a> Search<'a> { documents_ids.push(candidate?); } if documents_ids.len() == self.limit { break } - excluded_documents = candidates.into_excluded(); + excluded_candidates = candidates.into_excluded(); } Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) From e8e32e0ba1c4cb8c07e23a8f83c2c526571ca647 Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Thu, 29 Apr 2021 20:05:07 +0200 Subject: [PATCH 0663/1889] make document addition number visible --- milli/src/update/index_documents/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 8ebdf1634..ff7dcec77 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -41,7 +41,7 @@ mod transform; #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DocumentAdditionResult { - nb_documents: usize, + pub nb_documents: usize, } #[derive(Debug, Copy, Clone)] From d81c0e8bba9747732a9761d5a63dcc252b8f30a6 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Fri, 30 Apr 2021 21:34:29 +0300 Subject: [PATCH 0664/1889] feat(update): disable autogenerate_docids by default --- milli/src/index.rs | 1 + milli/src/update/index_documents/mod.rs | 8 +++++--- milli/src/update/settings.rs | 7 +++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index ba7747250..584ffab56 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -558,6 +558,7 @@ pub(crate) mod tests { { "name": "bob", "age": 20 } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ff7dcec77..3acae7821 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -300,7 +300,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { words_positions_min_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, update_format: UpdateFormat::Json, - autogenerate_docids: true, + autogenerate_docids: false, update_id, } } @@ -901,7 +901,6 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.disable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); @@ -928,7 +927,6 @@ mod tests { { "name": "benoit" } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.disable_autogenerate_docids(); builder.update_format(UpdateFormat::Json); assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); @@ -951,6 +949,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1066,6 +1065,7 @@ 
mod tests { { "name": "benoit" } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1088,6 +1088,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"[]"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1114,6 +1115,7 @@ mod tests { { "name": "benoit" } "#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::JsonStream); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 62aa8db97..c4d4fcfce 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -517,6 +517,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -560,6 +561,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -581,6 +583,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); @@ -625,6 +628,7 @@ mod tests { // Then index some documents. 
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -663,6 +667,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -684,6 +689,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); @@ -754,6 +760,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); From 34e02aba42fafb9005d9ceed516d01c56a96e8f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 3 May 2021 10:54:50 +0200 Subject: [PATCH 0665/1889] Upgrade Tokenizer version (v0.2.2) --- Cargo.lock | 4 ++-- http-ui/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6a30891ec..3dd98053c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1213,8 +1213,8 @@ checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "meilisearch-tokenizer" -version = "0.2.1" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.1#b7a89c682b9f5d23a1d8075a99cca76069fff6c6" +version = "0.2.2" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.2#eda4ed4968c8ac973cf1707ef89bd7012bb2722f" dependencies = [ "character_converter", "cow-utils", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index fdb0a9596..d0a077448 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = "0.10.6" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.1" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.2" } memmap = "0.7.0" milli = { path = "../milli" } once_cell = "1.5.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 8b359a09b..34e977138 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -20,7 +20,7 @@ heed = { version = "0.10.6", default-features = false, features = ["lmdb", "sync human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.1" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.2" } memmap = "0.7.0" obkv = "0.1.1" once_cell = "1.5.2" From 
c30f17fafbc68dd455489b9d480ea176e908a1c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 29 Apr 2021 17:20:52 +0200 Subject: [PATCH 0666/1889] Add bors --- .github/workflows/test.yml | 2 +- bors.toml | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 bors.toml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c51430384..7ba6f4d3a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Continuous integration on: push: - branches: [ main ] + branches: [ main, staging, trying ] pull_request: branches: [ main ] diff --git a/bors.toml b/bors.toml new file mode 100644 index 000000000..3fbc6159e --- /dev/null +++ b/bors.toml @@ -0,0 +1,5 @@ +status = [ + 'ci (stable)' +] +# 3 hours timeout +timeout-sec = 10800 From a8680887d8c2e73ccc834f27de907409fe1b3bb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 3 May 2021 14:50:47 +0200 Subject: [PATCH 0667/1889] Upgrade Milli version (v0.2.0) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3dd98053c..e065cc260 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.1.1" +version = "0.2.0" dependencies = [ "anyhow", "byte-unit", @@ -895,7 +895,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.1.1" +version = "0.2.0" dependencies = [ "anyhow", "askama", @@ -997,7 +997,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.1.1" +version = "0.2.0" dependencies = [ "anyhow", "byte-unit", @@ -1254,7 +1254,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.1.1" +version = "0.2.0" dependencies = [ "anyhow", "big_s", @@ -2017,7 +2017,7 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "search" -version = "0.1.1" +version = "0.2.0" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 1db7caf4f..75674e357 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.1.1" +version = "0.2.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index d0a077448..1636c3a65 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.1.1" +version = "0.2.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 8b5867fde..36c222148 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.1.1" +version = "0.2.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 34e977138..757df02d4 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.1.1" +version = "0.2.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index ae22ea80a..7c7d7fbc8 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.1.1" +version = "0.2.0" authors = ["Clément Renault "] edition = "2018" From bb5823c7752e6b2c9970c48bb5e0641d8e39d99f Mon Sep 
17 00:00:00 2001 From: Marin Postma Date: Mon, 3 May 2021 15:21:20 +0200 Subject: [PATCH 0668/1889] remove tests on main --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7ba6f4d3a..ff8342620 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -2,7 +2,7 @@ name: Continuous integration on: push: - branches: [ main, staging, trying ] + branches: [ staging, trying ] pull_request: branches: [ main ] From d61566787eedd64ee5f3e40d1786714893910f16 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 4 May 2021 11:23:51 +0200 Subject: [PATCH 0669/1889] provide an iterator over all the documents in a milli index --- milli/src/index.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index 584ffab56..945567cdb 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -477,6 +477,18 @@ impl Index { Ok(documents) } + /// Returns an iterator over all the documents in the index. + pub fn all_documents<'t>( + &self, + rtxn: &'t RoTxn, + ) -> anyhow::Result)>>> { + Ok(self + .documents + .iter(rtxn)? + // we cast the BEU32 to a DocumentId + .map(|document| document.map(|(id, obkv)| (id.get(), obkv)))) + } + pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { FacetDistribution::new(rtxn, self) } From f8d0f5265fea004057749d9b8c67897dce471f0c Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 4 May 2021 22:01:11 +0300 Subject: [PATCH 0670/1889] fix(update): fields distribution after documents merge --- milli/src/index.rs | 7 ++-- milli/src/update/index_documents/transform.rs | 36 +++++-------------- 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 945567cdb..f222069f6 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -566,11 +566,11 @@ pub(crate) mod tests { let mut wtxn = index.write_txn().unwrap(); let content = &br#"[ - { "name": "kevin" }, - { "name": "bob", "age": 20 } + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.enable_autogenerate_docids(); builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -579,6 +579,7 @@ pub(crate) mod tests { let fields_distribution = index.fields_distribution(&rtxn).unwrap(); assert_eq!(fields_distribution, hashmap! { + "id".to_string() => 2, "name".to_string() => 2, "age".to_string() => 1, }); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 308a24abc..e029a5135 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::collections::HashMap; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::iter::Peekable; @@ -76,7 +75,6 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); // Deserialize the whole batch of documents in memory. 
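A possible usage sketch for the `all_documents` iterator added above; the opened `Index` is assumed, and each item must be unwrapped because the underlying heed iterator can fail:

```rust
use milli::Index;

// Iterate over every document in the index and print its internal id.
fn print_all_document_ids(index: &Index) -> anyhow::Result<()> {
    let rtxn = index.read_txn()?;
    for entry in index.all_documents(&rtxn)? {
        let (docid, _obkv) = entry?;
        println!("{}", docid);
    }
    Ok(())
}
```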
@@ -106,7 +104,7 @@ impl Transform<'_, '_> { return Ok(TransformOutput { primary_key, fields_ids_map, - fields_distribution, + fields_distribution: self.index.fields_distribution(self.rtxn)?, external_documents_ids: ExternalDocumentsIds::default(), new_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(), @@ -137,8 +135,6 @@ impl Transform<'_, '_> { let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; let mut documents_count = 0; - let mut fields_ids_distribution = HashMap::new(); - for result in documents { let document = result?; @@ -153,9 +149,7 @@ impl Transform<'_, '_> { // We prepare the fields ids map with the documents keys. for (key, _value) in &document { - let field_id = fields_ids_map.insert(&key).context("field id limit reached")?; - - *fields_ids_distribution.entry(field_id).or_insert(0) += 1; + fields_ids_map.insert(&key).context("field id limit reached")?; } // We retrieve the user id from the document based on the primary key name, @@ -198,11 +192,6 @@ impl Transform<'_, '_> { documents_count += 1; } - for (field_id, count) in fields_ids_distribution { - let field_name = fields_ids_map.name(field_id).unwrap(); - *fields_distribution.entry(field_name.to_string()).or_default() += count; - } - progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { documents_seen: documents_count, }); @@ -213,7 +202,6 @@ impl Transform<'_, '_> { sorter, primary_key, fields_ids_map, - fields_distribution, documents_count, external_documents_ids, progress_callback, @@ -226,7 +214,6 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let mut csv = csv::Reader::from_reader(reader); @@ -284,8 +271,6 @@ impl Transform<'_, '_> { let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; let mut documents_count = 0; - let mut fields_ids_distribution = HashMap::new(); - let mut record = csv::StringRecord::new(); while csv.read_record(&mut record)? { obkv_buffer.clear(); @@ -324,8 +309,6 @@ impl Transform<'_, '_> { json_buffer.clear(); serde_json::to_writer(&mut json_buffer, &field)?; writer.insert(*field_id, &json_buffer)?; - - *fields_ids_distribution.entry(*field_id).or_insert(0) += 1; } // We use the extracted/generated user id as the key for this document. 
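The bookkeeping that this patch moves into the merge loop can be reproduced in isolation. Below is a simplified sketch with plain field names in place of obkv readers and field ids, matching the `id`/`name`/`age` distribution asserted in the updated `index.rs` test:

```rust
use std::collections::HashMap;

// Bump one counter per field actually present in a merged document.
fn bump_distribution(fields: &[&str], fields_distribution: &mut HashMap<String, u64>) {
    for name in fields {
        *fields_distribution.entry(name.to_string()).or_default() += 1;
    }
}

fn main() {
    let mut distribution = HashMap::new();
    // Two documents survive the merge: the duplicated "bob" counts once.
    for document in [&["id", "name"][..], &["id", "name", "age"][..]] {
        bump_distribution(document, &mut distribution);
    }
    assert_eq!(distribution["id"], 2);
    assert_eq!(distribution["name"], 2);
    assert_eq!(distribution["age"], 1);
}
```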
@@ -333,11 +316,6 @@ impl Transform<'_, '_> { documents_count += 1; } - for (field_id, count) in fields_ids_distribution { - let field_name = fields_ids_map.name(field_id).unwrap(); - *fields_distribution.entry(field_name.to_string()).or_default() += count; - } - progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { documents_seen: documents_count, }); @@ -352,7 +330,6 @@ impl Transform<'_, '_> { sorter, primary_key_name, fields_ids_map, - fields_distribution, documents_count, external_documents_ids, progress_callback, @@ -367,7 +344,6 @@ impl Transform<'_, '_> { sorter: grenad::Sorter, primary_key: String, fields_ids_map: FieldsIdsMap, - fields_distribution: FieldsDistribution, approximate_number_of_documents: usize, mut external_documents_ids: ExternalDocumentsIds<'_>, progress_callback: F, @@ -376,6 +352,7 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let documents_ids = self.index.documents_ids(self.rtxn)?; + let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); // Once we have sort and deduplicated the documents we write them into a final file. @@ -396,7 +373,6 @@ impl Transform<'_, '_> { let mut documents_count = 0; let mut iter = sorter.into_iter()?; while let Some((external_id, update_obkv)) = iter.next()? { - if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { documents_seen: documents_count, @@ -438,6 +414,12 @@ impl Transform<'_, '_> { // We insert the document under the documents ids map into the final file. final_sorter.insert(docid.to_be_bytes(), obkv)?; documents_count += 1; + + let reader = obkv::KvReader::new(obkv); + for (field_id, _) in reader.iter() { + let field_name = fields_ids_map.name(field_id).unwrap(); + *fields_distribution.entry(field_name.to_string()).or_default() += 1; + } } progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { From 1e11578ef077cf5b9616553a0f83b5a0730fa59f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 5 May 2021 14:57:34 +0200 Subject: [PATCH 0671/1889] Update version for the next release (v0.2.1) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e065cc260..0b1da2b3f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -852,7 +852,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.2.0" +version = "0.2.1" dependencies = [ "anyhow", "byte-unit", @@ -895,7 +895,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.2.0" +version = "0.2.1" dependencies = [ "anyhow", "askama", @@ -997,7 +997,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.2.0" +version = "0.2.1" dependencies = [ "anyhow", "byte-unit", @@ -1254,7 +1254,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.2.0" +version = "0.2.1" dependencies = [ "anyhow", "big_s", @@ -2017,7 +2017,7 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "search" -version = "0.2.0" +version = "0.2.1" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 75674e357..0afd05b13 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.2.0" 
+version = "0.2.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 1636c3a65..36745d567 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.2.0" +version = "0.2.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 36c222148..c94ca63e2 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.2.0" +version = "0.2.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 757df02d4..3b25bb268 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.2.0" +version = "0.2.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 7c7d7fbc8..1b7cd3a45 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.2.0" +version = "0.2.1" authors = ["Clément Renault "] edition = "2018" From a3f8686fbfdbdb0707561a9b497878fd86f0a0fa Mon Sep 17 00:00:00 2001 From: many Date: Tue, 4 May 2021 13:44:55 +0200 Subject: [PATCH 0672/1889] Introduce exactness criterion --- milli/src/fields_ids_map.rs | 10 + milli/src/search/criteria/exactness.rs | 335 +++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 40 ++- milli/src/search/mod.rs | 8 +- milli/src/search/query_tree.rs | 46 ++-- 5 files changed, 412 insertions(+), 27 deletions(-) create mode 100644 milli/src/search/criteria/exactness.rs diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index ce79e6e04..6eed9c41f 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -65,6 +65,16 @@ impl FieldsIdsMap { pub fn iter(&self) -> impl Iterator { self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) } + + /// Iterate over the ids in the ids order. + pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + self.ids_names.keys().copied() + } + + /// Iterate over the names in the ids order. 
+ pub fn names(&self) -> impl Iterator { + self.ids_names.values().map(AsRef::as_ref) + } } impl Default for FieldsIdsMap { diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs new file mode 100644 index 000000000..a67b9ed3c --- /dev/null +++ b/milli/src/search/criteria/exactness.rs @@ -0,0 +1,335 @@ +use std::{collections::HashMap, mem}; + +use log::debug; +use roaring::RoaringBitmap; +use itertools::Itertools; +use std::ops::BitOr; + +use crate::search::query_tree::{Operation, PrimitiveQueryPart}; +use crate::search::criteria::{ + Context, + Criterion, + CriterionParameters, + CriterionResult, + resolve_query_tree, +}; +use crate::TreeLevel; + +pub struct Exactness<'t> { + ctx: &'t dyn Context<'t>, + query_tree: Option, + state: Option, + bucket_candidates: RoaringBitmap, + parent: Box, + query: Vec, +} + +impl<'t> Exactness<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box, primitive_query: &[PrimitiveQueryPart]) -> heed::Result { + let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); + for part in primitive_query { + query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); + } + + Ok(Exactness { + ctx, + query_tree: None, + state: None, + bucket_candidates: RoaringBitmap::new(), + parent, + query, + }) + } +} + +impl<'t> Criterion for Exactness<'t> { + #[logging_timer::time("Exactness::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. + if let Some(state) = self.state.as_mut() { + state.difference_with(params.excluded_candidates); + } + + loop { + debug!("Exactness for query {:?} at state {:?}", self.query, self.state); + + match self.state.as_mut() { + Some(state) if state.is_empty() => { + // reset state + self.state = None; + self.query_tree = None; + }, + Some(state) => { + let (candidates, state) = resolve_state(self.ctx, mem::take(state), &self.query)?; + self.state = state; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(candidates), + bucket_candidates: mem::take(&mut self.bucket_candidates), + })); + }, + None => { + match self.parent.next(params)? { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + let candidates = match candidates { + Some(candidates) => candidates, + None => resolve_query_tree(self.ctx, &query_tree, &mut HashMap::new(), params.wdcache)?, + }; + self.state = Some(State::new(candidates)); + self.query_tree = Some(query_tree); + self.bucket_candidates |= bucket_candidates; + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree, + candidates, + bucket_candidates, + })); + }, + None => return Ok(None), + } + }, + } + } + } +} + +#[derive(Debug)] +enum State { + /// Extract the documents that have an attribute that contains exactly the query. + ExactAttribute(RoaringBitmap), + /// Extract the documents that have an attribute that starts with exactly the query. + AttributeStartsWith(RoaringBitmap), + /// Rank the remaining documents by the number of exact words contained. 
+ ExactWords(RoaringBitmap), + Remainings(Vec), +} + +impl State { + fn new(candidates: RoaringBitmap) -> Self { + Self::ExactAttribute(candidates) + } + + fn difference_with(&mut self, lhs: &RoaringBitmap) { + match self { + Self::ExactAttribute(candidates) | + Self::AttributeStartsWith(candidates) | + Self::ExactWords(candidates) => *candidates -= lhs, + Self::Remainings(candidates_array) => { + candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); + candidates_array.retain(|candidates| !candidates.is_empty()); + } + } + } + + fn is_empty(&self) -> bool { + match self { + Self::ExactAttribute(candidates) | + Self::AttributeStartsWith(candidates) | + Self::ExactWords(candidates) => candidates.is_empty(), + Self::Remainings(candidates_array) => { + candidates_array.iter().all(RoaringBitmap::is_empty) + } + } + } +} + +impl Default for State { + fn default() -> Self { + Self::Remainings(vec![]) + } +} + +#[logging_timer::time("Exactness::{}")] +fn resolve_state( + ctx: &dyn Context, + state: State, + query: &[ExactQueryPart], +) -> anyhow::Result<(RoaringBitmap, Option)> +{ + use State::*; + match state { + ExactAttribute(mut allowed_candidates) | + AttributeStartsWith(mut allowed_candidates) => { + let mut candidates = RoaringBitmap::new(); + let attributes_ids = ctx.searchable_fields_ids()?; + for id in attributes_ids { + let attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; + candidates |= intersection_of(attribute_candidates_array.iter().collect()); + } + + // only keep allowed candidates + candidates &= &allowed_candidates; + // remove current candidates from allowed candidates + allowed_candidates -= &candidates; + Ok((candidates, Some(ExactWords(allowed_candidates)))) + }, + ExactWords(mut allowed_candidates) => { + let number_of_part = query.len(); + let mut parts_candidates_array = Vec::with_capacity(number_of_part); + + for part in query { + let mut candidates = RoaringBitmap::new(); + use ExactQueryPart::*; + match part { + Synonyms(synonyms) => { + for synonym in synonyms { + if let Some(synonym_candidates) = ctx.word_docids(synonym)? { + candidates |= synonym_candidates; + } + } + }, + // compute intersection on pair of words with a proximity of 0. + Phrase(phrase) => { + let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); + for words in phrase.windows(2) { + if let [left, right] = words { + match ctx.word_pair_proximity_docids(left, right, 0)? { + Some(docids) => bitmaps.push(docids), + None => { + bitmaps.clear(); + break + }, + } + } + } + candidates |= intersection_of(bitmaps.iter().collect()); + } + } + parts_candidates_array.push(candidates); + } + + let mut candidates_array = Vec::new(); + + // compute documents that contain all exact words. + let mut all_exact_candidates = intersection_of(parts_candidates_array.iter().collect()); + all_exact_candidates &= &allowed_candidates; + allowed_candidates -= &all_exact_candidates; + + // push the result of combinations of exact words grouped by the number of exact words contained by documents. 
+ for c_count in (1..number_of_part).rev() { + let mut combinations_candidates = parts_candidates_array + .iter() + // create all `c_count` combinations of exact words + .combinations(c_count) + // intersect each word candidates in combinations + .map(intersection_of) + // union combinations of `c_count` exact words + .fold(RoaringBitmap::new(), RoaringBitmap::bitor); + // only keep allowed candidates + combinations_candidates &= &allowed_candidates; + // remove current candidates from allowed candidates + allowed_candidates -= &combinations_candidates; + candidates_array.push(combinations_candidates); + } + + // push remainings allowed candidates as the worst valid candidates + candidates_array.push(allowed_candidates); + // reverse the array to be able to pop candidates from the best to the worst. + candidates_array.reverse(); + + Ok((all_exact_candidates, Some(Remainings(candidates_array)))) + }, + // pop remainings candidates until the emptiness + Remainings(mut candidates_array) => { + let candidates = candidates_array.pop().unwrap_or_default(); + if !candidates_array.is_empty() { + Ok((candidates, Some(Remainings(candidates_array)))) + } else { + Ok((candidates, None)) + } + }, + + } +} + +fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[ExactQueryPart]) -> heed::Result> { + let lowest_level = TreeLevel::min_value(); + let mut attribute_candidates_array = Vec::new(); + // start from attribute first position + let mut pos = attribute_id * 1000; + for part in query { + use ExactQueryPart::*; + match part { + Synonyms(synonyms) => { + let mut synonyms_candidates = RoaringBitmap::new(); + for word in synonyms { + let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; + if let Some(word_candidates) = wc { + synonyms_candidates |= word_candidates; + } + } + attribute_candidates_array.push(synonyms_candidates); + pos += 1; + }, + Phrase(phrase) => { + for word in phrase { + let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; + if let Some(word_candidates) = wc { + attribute_candidates_array.push(word_candidates); + } + pos += 1; + } + } + } + } + + Ok(attribute_candidates_array) +} + +fn intersection_of(mut to_intersect: Vec<&RoaringBitmap>) -> RoaringBitmap { + match to_intersect.len() { + 0 => RoaringBitmap::new(), + 1 => to_intersect[0].clone(), + 2 => to_intersect[0] & to_intersect[1], + _ => { + to_intersect.sort_unstable_by(|a, b| a.len().cmp(&b.len()).reverse()); + + match to_intersect.pop() { + None => RoaringBitmap::new(), + Some(candidates) => { + let mut candidates = candidates.clone(); + while let Some(bitmap) = to_intersect.pop() { + if candidates.is_empty() { break; } + candidates &= bitmap; + } + + candidates + }, + } + } + } +} + +#[derive(Debug, Clone)] +pub enum ExactQueryPart { + Phrase(Vec), + Synonyms(Vec), +} + +impl ExactQueryPart { + fn from_primitive_query_part(ctx: &dyn Context, part: &PrimitiveQueryPart) -> heed::Result { + let part = match part { + PrimitiveQueryPart::Word(word, _) => { + match ctx.synonyms(word)? { + Some(synonyms) => { + let mut synonyms: Vec<_> = synonyms.into_iter().filter_map(|mut array| { + // keep 1 word synonyms only. 
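// A standalone sketch of the bucketing performed by resolve_state above,
// assuming only roaring and itertools (as this patch already does): given
// one candidate bitmap per exact word, documents matching all N words come
// first, then documents matching any N-1 of them, and so on.
use itertools::Itertools;
use roaring::RoaringBitmap;

fn exactness_buckets(parts: &[RoaringBitmap]) -> Vec<RoaringBitmap> {
    let mut buckets = Vec::new();
    let mut already_ranked = RoaringBitmap::new();
    for count in (1..=parts.len()).rev() {
        // union of the intersections of every `count`-sized combination
        let mut bucket = parts
            .iter()
            .combinations(count)
            .map(|combination| {
                let mut iter = combination.into_iter();
                let first = iter.next().cloned().unwrap_or_default();
                iter.fold(first, |acc, rb| acc & rb)
            })
            .fold(RoaringBitmap::new(), |acc, rb| acc | rb);
        // a document only belongs to the best bucket it qualifies for
        bucket -= &already_ranked;
        already_ranked |= &bucket;
        buckets.push(bucket);
    }
    buckets
}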
+ match array.pop() { + Some(word) if array.is_empty() => Some(word), + _ => None, + } + }).collect(); + synonyms.push(word.clone()); + ExactQueryPart::Synonyms(synonyms) + }, + None => ExactQueryPart::Synonyms(vec![word.clone()]), + } + }, + PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), + }; + + Ok(part) + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 164937dec..1c626e183 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -7,9 +7,10 @@ use roaring::RoaringBitmap; use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{Index, DocumentId}; -use super::query_tree::{Operation, Query, QueryKind}; +use super::query_tree::{Operation, PrimitiveQuery, PrimitiveQueryPart, Query, QueryKind}; use self::asc_desc::AscDesc; use self::attribute::Attribute; +use self::exactness::Exactness; use self::r#final::Final; use self::initial::Initial; use self::proximity::Proximity; @@ -18,6 +19,7 @@ use self::words::Words; mod asc_desc; mod attribute; +mod exactness; mod initial; mod proximity; mod typo; @@ -81,6 +83,9 @@ pub trait Context<'c> { fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; + fn synonyms(&self, word: &str) -> heed::Result>>>; + fn searchable_fields_ids(&self) -> heed::Result>; + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -170,6 +175,23 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { Ok(last_level) } + + fn synonyms(&self, word: &str) -> heed::Result>>> { + self.index.words_synonyms(self.rtxn, &[word]) + } + + fn searchable_fields_ids(&self) -> heed::Result> { + match self.index.searchable_fields_ids(self.rtxn)? { + Some(searchable_fields_ids) => Ok(searchable_fields_ids), + None => Ok(self.index.fields_ids_map(self.rtxn)?.ids().collect()), + } + + } + + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { + let key = (word, level, left, right); + self.index.word_level_position_docids.get(self.rtxn, &key) + } } impl<'t> CriteriaBuilder<'t> { @@ -182,11 +204,14 @@ impl<'t> CriteriaBuilder<'t> { pub fn build( &'t self, query_tree: Option, + primitive_query: Option>, facet_candidates: Option, ) -> anyhow::Result> { use crate::criterion::Criterion as Name; + let primitive_query = primitive_query.unwrap_or_default(); + let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? 
{ criterion = match name { @@ -194,6 +219,7 @@ impl<'t> CriteriaBuilder<'t> { Name::Words => Box::new(Words::new(self, criterion)), Name::Proximity => Box::new(Proximity::new(self, criterion)), Name::Attribute => Box::new(Attribute::new(self, criterion)), + Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), _otherwise => criterion, @@ -455,6 +481,18 @@ pub mod test { fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result> { todo!() } + + fn synonyms(&self, word: &str) -> heed::Result>>> { + todo!() + } + + fn searchable_fields_ids(&self) -> heed::Result> { + todo!() + } + + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { + todo!() + } } impl<'a> Default for TestContext<'a> { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 4227ab0a6..be107bf72 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -97,7 +97,7 @@ impl<'a> Search<'a> { pub fn execute(&self) -> anyhow::Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); - let query_tree = match self.query.as_ref() { + let (query_tree, primitive_query) = match self.query.as_ref() { Some(query) => { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); @@ -113,9 +113,9 @@ impl<'a> Search<'a> { let analyzer = Analyzer::new(config); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(tokens)? + builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) }, - None => None, + None => (None, None), }; debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); @@ -135,7 +135,7 @@ impl<'a> Search<'a> { }; let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build(query_tree, facet_candidates)?; + let criteria = criteria_builder.build(query_tree, primitive_query, facet_candidates)?; match self.index.distinct_attribute(self.rtxn)? 
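// A sketch of the chaining pattern used by CriteriaBuilder::build above:
// every ranking rule boxes the previous one as its parent and refines the
// buckets it produces. The trait and rules below are simplified stand-ins
// for milli's Criterion implementations, not the real ones.
trait Rule {
    fn next_bucket(&mut self) -> Option<Vec<u32>>;
}

struct AllDocuments(Option<Vec<u32>>);

impl Rule for AllDocuments {
    fn next_bucket(&mut self) -> Option<Vec<u32>> {
        self.0.take()
    }
}

struct SortById(Box<dyn Rule>);

impl Rule for SortById {
    fn next_bucket(&mut self) -> Option<Vec<u32>> {
        // refine the parent's bucket instead of recomputing from scratch
        self.0.next_bucket().map(|mut bucket| {
            bucket.sort_unstable();
            bucket
        })
    }
}

fn build_chain(docids: Vec<u32>) -> Box<dyn Rule> {
    let mut rule: Box<dyn Rule> = Box::new(AllDocuments(Some(docids)));
    rule = Box::new(SortById(rule));
    rule
}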
{ None => self.perform_sort(NoopDistinct, matching_words, criteria), diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 492b98a1e..b74b8af58 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -228,11 +228,12 @@ impl<'a> QueryTreeBuilder<'a> { /// - if `authorize_typos` is set to `false` the query tree will be generated /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) - pub fn build(&self, query: TokenStream) -> anyhow::Result> { + pub fn build(&self, query: TokenStream) -> anyhow::Result> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { - create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some) + let qt = create_query_tree(self, self.optional_words, self.authorize_typos, &primitive_query)?; + Ok(Some((qt, primitive_query))) } else { Ok(None) } @@ -340,7 +341,7 @@ fn create_query_tree( ctx: &impl Context, optional_words: bool, authorize_typos: bool, - query: PrimitiveQuery, + query: &[PrimitiveQueryPart], ) -> anyhow::Result { /// Matches on the `PrimitiveQueryPart` and create an operation from it. @@ -458,16 +459,16 @@ fn create_query_tree( } if optional_words { - optional_word(ctx, authorize_typos, query) + optional_word(ctx, authorize_typos, query.to_vec()) } else { - ngrams(ctx, authorize_typos, query.as_slice()) + ngrams(ctx, authorize_typos, query) } } -type PrimitiveQuery = Vec; +pub type PrimitiveQuery = Vec; #[derive(Debug, Clone)] -enum PrimitiveQueryPart { +pub enum PrimitiveQueryPart { Phrase(Vec), Word(String, IsPrefix), } @@ -579,11 +580,12 @@ mod test { authorize_typos: bool, words_limit: Option, query: TokenStream, - ) -> anyhow::Result> + ) -> anyhow::Result> { let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { - create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) + let qt = create_query_tree(self, optional_words, authorize_typos, &primitive_query)?; + Ok(Some((qt, primitive_query))) } else { Ok(None) } @@ -674,7 +676,7 @@ mod test { Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -694,7 +696,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -725,7 +727,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -770,7 +772,7 @@ mod test { ]), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); 
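// Why build() now returns a pair, in short: the Exactness criterion needs
// the raw query parts (words and phrases, before ngram and typo expansion),
// so the builder hands back the PrimitiveQuery next to the Operation tree
// instead of discarding it. Callers split the pair as in search/mod.rs:
//
//     let (query_tree, primitive_query) = match builder.build(tokens)? {
//         Some((qt, pq)) => (Some(qt), Some(pq)),
//         None => (None, None),
//     };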
assert_eq!(expected, query_tree); } @@ -790,7 +792,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -816,7 +818,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -836,7 +838,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -875,7 +877,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }), ]), ]); - let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -891,7 +893,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), ]); - let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -925,7 +927,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), ]), ]); - let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -944,7 +946,7 @@ mod test { ]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -957,7 +959,7 @@ mod test { let tokens = result.tokens(); let context = TestContext::default(); - let query_tree = context.build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = context.build(false, true, None, tokens).unwrap().unwrap(); let expected = hashset!{ ("word", 0, false), @@ -997,7 +999,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), ]); - let query_tree = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } From c1ce4e4ca91a4751819d1016c1205d80614e62aa Mon Sep 17 00:00:00 2001 From: many Date: Tue, 4 May 2021 15:28:05 +0200 Subject: [PATCH 0673/1889] Introduce mocked ExactAttribute step in exactness criterion --- milli/src/search/criteria/exactness.rs | 20 
+++++++++++++++++++- milli/src/search/criteria/mod.rs | 16 ++++++++++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index a67b9ed3c..e7ece6e91 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -150,7 +150,25 @@ fn resolve_state( { use State::*; match state { - ExactAttribute(mut allowed_candidates) | + ExactAttribute(mut allowed_candidates) => { + let query_len = query.len() as u32; + let mut candidates = RoaringBitmap::new(); + let attributes_ids = ctx.searchable_fields_ids()?; + for id in attributes_ids { + if let Some(attribute_allowed_docids) = ctx.field_id_len_docids(id, query_len)? { + let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; + attribute_candidates_array.push(attribute_allowed_docids); + candidates |= intersection_of(attribute_candidates_array.iter().collect()); + } + } + + // only keep allowed candidates + candidates &= &allowed_candidates; + // remove current candidates from allowed candidates + allowed_candidates -= &candidates; + Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) + + }, AttributeStartsWith(mut allowed_candidates) => { let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1c626e183..d2fd808f9 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -4,7 +4,7 @@ use std::borrow::Cow; use anyhow::bail; use roaring::RoaringBitmap; -use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}}; +use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{Index, DocumentId}; use super::query_tree::{Operation, PrimitiveQuery, PrimitiveQueryPart, Query, QueryKind}; @@ -84,7 +84,8 @@ pub trait Context<'c> { fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; fn synonyms(&self, word: &str) -> heed::Result>>>; - fn searchable_fields_ids(&self) -> heed::Result>; + fn searchable_fields_ids(&self) -> heed::Result>; + fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result>; fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error>; } pub struct CriteriaBuilder<'t> { @@ -180,12 +181,15 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.words_synonyms(self.rtxn, &[word]) } - fn searchable_fields_ids(&self) -> heed::Result> { + fn searchable_fields_ids(&self) -> heed::Result> { match self.index.searchable_fields_ids(self.rtxn)? 
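// In other words, the mocked ExactAttribute step above keeps, per
// searchable field, the documents whose field both contains exactly
// `query.len()` words (field_id_len_docids) and starts with the query
// words (attribute_start_with_docids): the attribute must be exactly the
// query. field_id_len_docids is still a stub returning Ok(None) here, so
// this step matches nothing until the corresponding database exists.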
{ Some(searchable_fields_ids) => Ok(searchable_fields_ids), None => Ok(self.index.fields_ids_map(self.rtxn)?.ids().collect()), } + } + fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result> { + Ok(None) } fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { @@ -486,13 +490,17 @@ pub mod test { todo!() } - fn searchable_fields_ids(&self) -> heed::Result> { + fn searchable_fields_ids(&self) -> heed::Result> { todo!() } fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { todo!() } + + fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result> { + todo!() + } } impl<'a> Default for TestContext<'a> { From 44b6843de78925860239c9977048fe31fc11fcc8 Mon Sep 17 00:00:00 2001 From: Many Date: Thu, 6 May 2021 11:24:46 +0200 Subject: [PATCH 0674/1889] Fix pull request reviews Update milli/src/fields_ids_map.rs Update milli/src/search/criteria/exactness.rs Update milli/src/search/criteria/mod.rs --- milli/src/fields_ids_map.rs | 4 ++-- milli/src/search/criteria/exactness.rs | 29 +++++++------------------- milli/src/search/criteria/mod.rs | 11 +++++----- 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 6eed9c41f..76ff2d281 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -66,12 +66,12 @@ impl FieldsIdsMap { self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) } - /// Iterate over the ids in the ids order. + /// Iterate over the ids in the order of the ids. pub fn ids<'a>(&'a self) -> impl Iterator + 'a { self.ids_names.keys().copied() } - /// Iterate over the names in the ids order. + /// Iterate over the names in the order of the ids. 
pub fn names(&self) -> impl Iterator { self.ids_names.values().map(AsRef::as_ref) } diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index e7ece6e91..c004f4a51 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -51,7 +51,7 @@ impl<'t> Criterion for Exactness<'t> { } loop { - debug!("Exactness for query {:?} at state {:?}", self.query, self.state); + debug!("Exactness at state {:?}", self.state); match self.state.as_mut() { Some(state) if state.is_empty() => { @@ -296,27 +296,12 @@ fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[Ex Ok(attribute_candidates_array) } -fn intersection_of(mut to_intersect: Vec<&RoaringBitmap>) -> RoaringBitmap { - match to_intersect.len() { - 0 => RoaringBitmap::new(), - 1 => to_intersect[0].clone(), - 2 => to_intersect[0] & to_intersect[1], - _ => { - to_intersect.sort_unstable_by(|a, b| a.len().cmp(&b.len()).reverse()); - - match to_intersect.pop() { - None => RoaringBitmap::new(), - Some(candidates) => { - let mut candidates = candidates.clone(); - while let Some(bitmap) = to_intersect.pop() { - if candidates.is_empty() { break; } - candidates &= bitmap; - } - - candidates - }, - } - } +fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap { + rbs.sort_unstable_by_key(|rb| rb.len()); + let mut iter = rbs.into_iter(); + match iter.next() { + Some(first) => iter.fold(first.clone(), |acc, rb| acc & rb), + None => RoaringBitmap::new(), } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d2fd808f9..76e263036 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{Index, DocumentId}; -use super::query_tree::{Operation, PrimitiveQuery, PrimitiveQueryPart, Query, QueryKind}; +use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use self::asc_desc::AscDesc; use self::attribute::Attribute; use self::exactness::Exactness; @@ -188,7 +188,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { } } - fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result> { + fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result> { Ok(None) } @@ -226,7 +226,6 @@ impl<'t> CriteriaBuilder<'t> { Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), - _otherwise => criterion, }; } @@ -486,7 +485,7 @@ pub mod test { todo!() } - fn synonyms(&self, word: &str) -> heed::Result>>> { + fn synonyms(&self, _word: &str) -> heed::Result>>> { todo!() } @@ -494,11 +493,11 @@ pub mod test { todo!() } - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { + fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> Result, heed::Error> { todo!() } - fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result> { + fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result> { todo!() } } From 313c36246159f185c6e2734aa7294e047babee91 Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Thu, 6 May 2021 18:14:16 +0200 Subject: [PATCH 0675/1889] early return on empty document addition --- 
milli/src/update/index_documents/mod.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3acae7821..a9ebcd20a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; -use std::io::{self, Seek, SeekFrom}; +use std::io::{self, Seek, SeekFrom, BufReader, BufRead}; use std::num::{NonZeroU32, NonZeroUsize}; use std::str; use std::sync::mpsc::sync_channel; @@ -326,6 +326,16 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { R: io::Read, F: Fn(UpdateIndexingStep, u64) + Sync, { + let mut reader = BufReader::new(reader); + reader.fill_buf()?; + + // Early return when there are no document to add + if reader.buffer().is_empty() { + return Ok(DocumentAdditionResult { + nb_documents: 0, + }) + } + self.index.set_updated_at(self.wtxn, &Utc::now())?; let before_transform = Instant::now(); let update_id = self.update_id; From eeb0c70ea2a3f78038d98c20c82a789390dc3319 Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Thu, 6 May 2021 21:16:40 +0200 Subject: [PATCH 0676/1889] meilisearch compatible primary key inference --- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a9ebcd20a..82f494591 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -329,7 +329,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut reader = BufReader::new(reader); reader.fill_buf()?; - // Early return when there are no document to add + // Early return when there is no document to add if reader.buffer().is_empty() { return Ok(DocumentAdditionResult { nb_documents: 0, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e029a5135..ced5fe2c7 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -47,6 +47,10 @@ pub struct Transform<'t, 'i> { pub autogenerate_docids: bool, } +fn is_primary_key(field: impl AsRef) -> bool { + field.as_ref().to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME) +} + impl Transform<'_, '_> { pub fn output_from_json(self, reader: R, progress_callback: F) -> anyhow::Result where @@ -92,7 +96,7 @@ impl Transform<'_, '_> { // We extract the primary key from the first document in // the batch if it hasn't already been defined in the index let first = documents.peek().and_then(|r| r.as_ref().ok()); - let alternative_name = first.and_then(|doc| doc.keys().find(|k| k.contains(DEFAULT_PRIMARY_KEY_NAME)).cloned()); + let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); let (primary_key_id, primary_key) = compute_primary_key_pair( self.index.primary_key(self.rtxn)?, &mut fields_ids_map, @@ -232,7 +236,7 @@ impl Transform<'_, '_> { // The primary key is known so we must find the position in the CSV headers. 
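// A self-contained sketch of the emptiness check introduced above, using
// only std: fill_buf() loads data into the internal buffer without
// consuming it, so an empty buffer right after filling means the reader
// is at end of input and there is no document to add.
use std::io::{self, BufRead, BufReader, Read};

fn input_is_empty<R: Read>(reader: R) -> io::Result<bool> {
    let mut reader = BufReader::new(reader);
    reader.fill_buf()?;
    Ok(reader.buffer().is_empty())
}

// e.g. input_is_empty(std::io::Cursor::new(b"")) returns Ok(true)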
headers.iter().position(|h| h == primary_key) }, - None => headers.iter().position(|h| h.contains("id")), + None => headers.iter().position(|f| is_primary_key(&f)), }; // Returns the field id in the fields ids map, create an "id" field From e923d51b8f928e6bd1d5ccab765eb8b2c5f81ecf Mon Sep 17 00:00:00 2001 From: many Date: Wed, 5 May 2021 20:46:56 +0200 Subject: [PATCH 0677/1889] Make bucket candidates optionals --- milli/src/search/criteria/asc_desc.rs | 17 +- milli/src/search/criteria/attribute.rs | 111 ++++++------- milli/src/search/criteria/exactness.rs | 15 +- milli/src/search/criteria/final.rs | 20 +-- milli/src/search/criteria/initial.rs | 4 +- milli/src/search/criteria/mod.rs | 19 +-- milli/src/search/criteria/proximity.rs | 146 +++++++---------- milli/src/search/criteria/typo.rs | 210 ++++++++++++------------- milli/src/search/criteria/words.rs | 80 +++++----- 9 files changed, 276 insertions(+), 346 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 54cbb0fae..7b619f26a 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -94,7 +94,6 @@ impl<'t> Criterion for AscDesc<'t> { None => { match self.parent.next(params)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates_is_some = candidates.is_some(); self.query_tree = query_tree; let candidates = match (&self.query_tree, candidates) { (_, Some(mut candidates)) => { @@ -103,7 +102,7 @@ impl<'t> Criterion for AscDesc<'t> { }, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), params.wdcache)?; + let mut candidates = resolve_query_tree(&context, qt, params.wdcache)?; candidates -= params.excluded_candidates; candidates.intersect_with(&self.faceted_candidates); candidates @@ -111,15 +110,9 @@ impl<'t> Criterion for AscDesc<'t> { (None, None) => take(&mut self.faceted_candidates), }; - // If our parent returns candidates it means that the bucket - // candidates were already computed before and we can use them. - // - // If not, we must use the just computed candidates as our bucket - // candidates. - if candidates_is_some { - self.bucket_candidates.union_with(&bucket_candidates); - } else { - self.bucket_candidates.union_with(&candidates); + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, } if candidates.is_empty() { @@ -143,7 +136,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates: take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5993f03bd..fc7050a7f 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -24,13 +24,12 @@ const LEVEL_EXPONENTIATION_BASE: u32 = 4; /// the system to choose between one algorithm or another. 
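// The inference rule introduced above, shown standalone: a field is a
// primary-key candidate when its lowercased name contains the default
// primary key name (assumed to be "id" here, as DEFAULT_PRIMARY_KEY_NAME
// suggests), which accepts names such as "id", "ID", "product_id" or "uuid".
fn is_primary_key(field: impl AsRef<str>) -> bool {
    field.as_ref().to_lowercase().contains("id")
}

// e.g. is_primary_key("ProductId") == true, is_primary_key("name") == false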
const CANDIDATES_THRESHOLD: u64 = 1000; +type FlattenedQueryTree = Vec>>; pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, - query_tree: Option, - candidates: Option, + state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, bucket_candidates: RoaringBitmap, parent: Box, - flattened_query_tree: Option>>>, current_buckets: Option>, } @@ -38,11 +37,9 @@ impl<'t> Attribute<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Attribute { ctx, - query_tree: None, - candidates: None, + state: None, bucket_candidates: RoaringBitmap::new(), parent, - flattened_query_tree: None, current_buckets: None, } } @@ -52,29 +49,25 @@ impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { // remove excluded candidates when next is called, instead of doing it in the loop. - if let Some(candidates) = self.candidates.as_mut() { - *candidates -= params.excluded_candidates; + if let Some((_, _, allowed_candidates)) = self.state.as_mut() { + *allowed_candidates -= params.excluded_candidates; } loop { - match (&self.query_tree, &mut self.candidates) { - (_, Some(candidates)) if candidates.is_empty() => { + match self.state.take() { + Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, - (Some(qt), Some(candidates)) => { - let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| { - flatten_query_tree(&qt) - }); - - let found_candidates = if candidates.len() < CANDIDATES_THRESHOLD { + Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { + let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { - let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; + let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?; self.current_buckets.get_or_insert(new_buckets.into_iter()) }, }; @@ -83,62 +76,60 @@ impl<'t> Criterion for Attribute<'t> { Some((_score, candidates)) => candidates, None => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } } else { - match set_compute_candidates(self.ctx, flattened_query_tree, candidates, params.wdcache)? { + match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? 
{ Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, } }; - candidates.difference_with(&found_candidates); + allowed_candidates -= &found_candidates; + + self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), + query_tree: Some(query_tree), candidates: Some(found_candidates), - bucket_candidates: take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, - (Some(qt), None) => { - let mut query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), params.wdcache)?; - query_tree_candidates -= params.excluded_candidates; - self.bucket_candidates |= &query_tree_candidates; - self.candidates = Some(query_tree_candidates); - }, - (None, Some(_)) => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take(), - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (None, None) => { + None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + let candidates = match candidates { + Some(candidates) => candidates, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, + }; + + let flattened_query_tree = flatten_query_tree(&query_tree); + + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + self.state = Some((query_tree, flattened_query_tree, candidates)); + self.current_buckets = None; + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, - candidates: None, + candidates, bucket_candidates, })); }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree; - self.candidates = candidates; - self.bucket_candidates |= bucket_candidates; - self.flattened_query_tree = None; - self.current_buckets = None; - }, None => return Ok(None), } }, @@ -467,7 +458,7 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {} fn initialize_query_level_iterators<'t, 'q>( ctx: &'t dyn Context<'t>, - branches: &'q Vec>>, + branches: &'q FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result>> { @@ -517,7 +508,7 @@ fn initialize_query_level_iterators<'t, 'q>( fn set_compute_candidates<'t>( ctx: &'t dyn Context<'t>, - branches: &Vec>>, + branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> @@ -570,11 +561,11 @@ fn set_compute_candidates<'t>( fn linear_compute_candidates( ctx: &dyn Context, - branches: &Vec>>, + branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, ) -> anyhow::Result> { - fn compute_candidate_rank(branches: &Vec>>, words_positions: HashMap) -> u64 { + fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { @@ -659,10 +650,10 @@ 
fn linear_compute_candidates( } // TODO can we keep refs of Query -fn flatten_query_tree(query_tree: &Operation) -> Vec>> { +fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { use crate::search::criteria::Operation::{And, Or, Consecutive}; - fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec>> { + fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { match tail.split_first() { Some((thead, tail)) => { let tail = and_recurse(thead, tail); @@ -680,7 +671,7 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec>> { } } - fn recurse(op: &Operation) -> Vec>> { + fn recurse(op: &Operation) -> FlattenedQueryTree { match op { And(ops) | Consecutive(ops) => { ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index c004f4a51..2fd216053 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, mem}; +use std::mem::take; use log::debug; use roaring::RoaringBitmap; @@ -60,13 +60,13 @@ impl<'t> Criterion for Exactness<'t> { self.query_tree = None; }, Some(state) => { - let (candidates, state) = resolve_state(self.ctx, mem::take(state), &self.query)?; + let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; self.state = state; return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), - bucket_candidates: mem::take(&mut self.bucket_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { @@ -74,11 +74,16 @@ impl<'t> Criterion for Exactness<'t> { Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { let candidates = match candidates { Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, &mut HashMap::new(), params.wdcache)?, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)?, }; + + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + self.state = Some(State::new(candidates)); self.query_tree = Some(query_tree); - self.bucket_candidates |= bucket_candidates; }, Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 707195ba7..0dbf3ee1a 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -1,5 +1,3 @@ -use std::collections::HashMap; - use log::debug; use roaring::RoaringBitmap; @@ -41,19 +39,15 @@ impl<'t> Final<'t> { }; match self.parent.next(&mut criterion_parameters)? 
{ - Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { - let candidates = match candidates { - Some(candidates) => candidates, - None => { - let candidates = match query_tree.as_ref() { - Some(qt) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, - None => self.ctx.documents_ids()?, - }; - bucket_candidates |= &candidates; - candidates - } + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (candidates, query_tree.as_ref()) { + (Some(candidates), _) => candidates, + (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, }; + let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); + self.returned_candidates |= &candidates; return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 10858dd99..eb2bf0b95 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -12,8 +12,8 @@ impl Initial { pub fn new(query_tree: Option, mut candidates: Option) -> Initial { let answer = CriterionResult { query_tree, - candidates: candidates.clone(), - bucket_candidates: candidates.take().unwrap_or_default(), + candidates: candidates.take(), + bucket_candidates: None, }; Initial { answer: Some(answer) } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 76e263036..f9fca2624 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -39,7 +39,7 @@ pub struct CriterionResult { /// if None, it is up to the child to compute the candidates itself. candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. 
- bucket_candidates: RoaringBitmap, + bucket_candidates: Option, } #[derive(Debug, PartialEq)] @@ -57,15 +57,6 @@ enum Candidates { Forbidden(RoaringBitmap) } -impl Candidates { - fn into_inner(self) -> RoaringBitmap { - match self { - Self::Allowed(inner) => inner, - Self::Forbidden(inner) => inner, - } - } -} - impl Default for Candidates { fn default() -> Self { Self::Forbidden(RoaringBitmap::new()) @@ -236,14 +227,12 @@ impl<'t> CriteriaBuilder<'t> { pub fn resolve_query_tree<'t>( ctx: &'t dyn Context, query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { @@ -252,7 +241,7 @@ pub fn resolve_query_tree<'t>( match query_tree { And(ops) => { let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, cache, wdcache) + resolve_operation(ctx, op, wdcache) }).collect::>>()?; ops.sort_unstable_by_key(|cds| cds.len()); @@ -296,7 +285,7 @@ pub fn resolve_query_tree<'t>( Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { - let docids = resolve_operation(ctx, op, cache, wdcache)?; + let docids = resolve_operation(ctx, op, wdcache)?; candidates.union_with(&docids); } Ok(candidates) @@ -305,7 +294,7 @@ pub fn resolve_query_tree<'t>( } } - resolve_operation(ctx, query_tree, cache, wdcache) + resolve_operation(ctx, query_tree, wdcache) } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 08fba1447..e50d3941d 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -30,8 +30,8 @@ const PROXIMITY_THRESHOLD: u8 = 0; pub struct Proximity<'t> { ctx: &'t dyn Context<'t>, - /// ((max_proximity, query_tree), allowed_candidates) - state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, + /// (max_proximity, query_tree, allowed_candidates) + state: Option<(u8, Operation, RoaringBitmap)>, proximity: u8, bucket_candidates: RoaringBitmap, parent: Box, @@ -57,114 +57,90 @@ impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { // remove excluded candidates when next is called, instead of doing it in the loop. 
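// The contract made optional above, in short: a parent that already knows
// its bucket candidates passes Some(..); on None, the child falls back to
// the candidates it just resolved. The Final criterion collapses it the
// same way (simplified, with roaring as used in these files):
use roaring::RoaringBitmap;

fn effective_bucket_candidates(
    bucket_candidates: Option<RoaringBitmap>,
    resolved_candidates: &RoaringBitmap,
) -> RoaringBitmap {
    bucket_candidates.unwrap_or_else(|| resolved_candidates.clone())
}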
- if let Some((_, candidates)) = self.state.as_mut() { - *candidates -= params.excluded_candidates; + if let Some((_, _, allowed_candidates)) = self.state.as_mut() { + *allowed_candidates -= params.excluded_candidates; } loop { debug!("Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, - self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), - self.state.as_ref().map(|(_, cd)| cd), + self.state.as_ref().map(|(mp, _, _)| mp), + self.state.as_ref().map(|(_, _, cd)| cd), ); match &mut self.state { - Some((_, candidates)) if candidates.is_empty() => { + Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => { self.state = None; // reset state }, - Some((Some((max_prox, query_tree)), candidates)) => { - if self.proximity as usize > *max_prox { - self.state = None; // reset state - } else { - let mut new_candidates = if candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { - if let Some(cache) = self.plane_sweep_cache.as_mut() { - match cache.next() { - Some((p, candidates)) => { - self.proximity = p; - candidates - }, - None => { - self.state = None; // reset state - continue - }, - } - } else { - let cache = resolve_plane_sweep_candidates( - self.ctx, - query_tree, - candidates, - params.wdcache, - )?; - self.plane_sweep_cache = Some(cache.into_iter()); - - continue + Some((_, query_tree, allowed_candidates)) => { + let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { + if let Some(cache) = self.plane_sweep_cache.as_mut() { + match cache.next() { + Some((p, candidates)) => { + self.proximity = p; + candidates + }, + None => { + self.state = None; // reset state + continue + }, } - } else { // use set theory based algorithm - resolve_candidates( - self.ctx, - &query_tree, - self.proximity, - &mut self.candidates_cache, - params.wdcache, - )? - }; + } else { + let cache = resolve_plane_sweep_candidates( + self.ctx, + query_tree, + allowed_candidates, + params.wdcache, + )?; + self.plane_sweep_cache = Some(cache.into_iter()); - new_candidates.intersect_with(&candidates); - candidates.difference_with(&new_candidates); - self.proximity += 1; + continue + } + } else { // use set theory based algorithm + resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + params.wdcache, + )? + }; + + new_candidates &= &*allowed_candidates; + *allowed_candidates -= &new_candidates; + self.proximity += 1; - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - Some((None, candidates)) => { - let candidates = take(candidates); - self.state = None; // reset state return Ok(Some(CriterionResult { - query_tree: None, - candidates: Some(candidates.clone()), - bucket_candidates: candidates, + query_tree: Some(query_tree.clone()), + candidates: Some(new_candidates), + bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? 
{ - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates: None, - bucket_candidates, - })); - }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates_is_some = candidates.is_some(); - let candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let candidates = resolve_query_tree(self.ctx, qt, &mut HashMap::new(), params.wdcache)?; - candidates - params.excluded_candidates - }, - (None, None) => RoaringBitmap::new(), + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + let candidates = match candidates { + Some(candidates) => candidates, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, }; - // If our parent returns candidates it means that the bucket - // candidates were already computed before and we can use them. - // - // If not, we must use the just computed candidates as our bucket - // candidates. - if candidates_is_some { - self.bucket_candidates.union_with(&bucket_candidates); - } else { - self.bucket_candidates.union_with(&candidates); + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, } - let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); - self.state = Some((query_tree, candidates)); + let maximum_proximity = maximum_proximity(&query_tree); + self.state = Some((maximum_proximity as u8, query_tree, candidates)); self.proximity = 0; self.plane_sweep_cache = None; }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + bucket_candidates, + })); + }, None => return Ok(None), } }, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index f265b30ae..288a92f65 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -13,15 +13,19 @@ use super::{ CriterionParameters, CriterionResult, query_docids, - query_pair_proximity_docids + query_pair_proximity_docids, + resolve_query_tree, }; +/// Maximum number of typo for a word of any length. +const MAX_TYPOS_PER_WORD: u8 = 2; + pub struct Typo<'t> { ctx: &'t dyn Context<'t>, - query_tree: Option<(usize, Operation)>, - number_typos: u8, - candidates: Candidates, - bucket_candidates: RoaringBitmap, + /// (max_typos, query_tree, candidates) + state: Option<(u8, Operation, Candidates)>, + typos: u8, + bucket_candidates: Option, parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } @@ -30,10 +34,9 @@ impl<'t> Typo<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { Typo { ctx, - query_tree: None, - number_typos: 0, - candidates: Candidates::default(), - bucket_candidates: RoaringBitmap::new(), + state: None, + typos: 0, + bucket_candidates: None, parent, candidates_cache: HashMap::new(), } @@ -45,113 +48,101 @@ impl<'t> Criterion for Typo<'t> { fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; // remove excluded candidates when next is called, instead of doing it in the loop. 
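// [Editor's aside — illustrative sketch, not part of the patch.] The
// Allowed/Forbidden pair matched below encodes two complementary ways of
// tracking candidates: an allow-list that shrinks, or a deny-list that
// grows. A free-standing sketch of the exclusion step that follows
// (CandidatesSketch is a stand-in for the crate's Candidates enum):
use roaring::RoaringBitmap;
enum CandidatesSketch {
    Allowed(RoaringBitmap),
    Forbidden(RoaringBitmap),
}
fn apply_exclusions(state: &mut CandidatesSketch, excluded: &RoaringBitmap) {
    match state {
        // the allow-list loses the excluded ids: they can no longer be returned
        CandidatesSketch::Allowed(ids) => *ids -= excluded,
        // the deny-list gains the excluded ids: they are forbidden from now on
        CandidatesSketch::Forbidden(ids) => *ids |= excluded,
    }
}
// [End of aside.]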
- match &mut self.candidates { - Allowed(candidates) => *candidates -= params.excluded_candidates, - Forbidden(candidates) => *candidates |= params.excluded_candidates, + match self.state.as_mut() { + Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates, + Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates, + None => (), } loop { - debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); + debug!("Typo at iteration {} (max typos {:?}) ({:?})", + self.typos, + self.state.as_ref().map(|(mt, _, _)| mt), + self.state.as_ref().map(|(_, _, cd)| cd), + ); - match (&mut self.query_tree, &mut self.candidates) { - (_, Allowed(candidates)) if candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: Some(take(&mut self.candidates).into_inner()), - bucket_candidates: take(&mut self.bucket_candidates), - })); + match self.state.as_mut() { + Some((max_typos, _, _)) if self.typos > *max_typos => { + self.state = None; // reset state }, - (Some((max_typos, query_tree)), Allowed(candidates)) => { - if self.number_typos as usize > *max_typos { - self.query_tree = None; - self.candidates = Candidates::default(); - } else { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? - } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; - query_tree.clone() - } else { - query_tree.clone() + Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { + self.state = None; // reset state + }, + Some((_, query_tree, candidates_authorization)) => { + let fst = self.ctx.words_fst(); + let new_query_tree = if self.typos < MAX_TYPOS_PER_WORD { + alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? + } else if self.typos == MAX_TYPOS_PER_WORD { + // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, + // we keep the altered query tree + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; + // we compute the allowed candidates + let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; + // we assign the allowed candidates to the candidates authorization. + *candidates_authorization = match take(candidates_authorization) { + Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), + Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), }; - - let mut new_candidates = resolve_candidates( - self.ctx, - &new_query_tree, - self.number_typos, - &mut self.candidates_cache, - params.wdcache, - )?; - new_candidates.intersect_with(&candidates); - candidates.difference_with(&new_candidates); - self.number_typos += 1; - - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - (Some((max_typos, query_tree)), Forbidden(candidates)) => { - if self.number_typos as usize > *max_typos { - self.query_tree = None; - self.candidates = Candidates::default(); + query_tree.clone() } else { - let fst = self.ctx.words_fst(); - let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? 
- } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; - query_tree.clone() - } else { - query_tree.clone() - }; + query_tree.clone() + }; - let mut new_candidates = resolve_candidates( - self.ctx, - &new_query_tree, - self.number_typos, - &mut self.candidates_cache, - params.wdcache, - )?; - new_candidates.difference_with(&candidates); - candidates.union_with(&new_candidates); - self.number_typos += 1; - self.bucket_candidates.union_with(&new_candidates); + let mut candidates = resolve_candidates( + self.ctx, + &new_query_tree, + self.typos, + &mut self.candidates_cache, + params.wdcache, + )?; - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: Some(new_candidates), - bucket_candidates: take(&mut self.bucket_candidates), - })); - } - }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); - return Ok(Some(CriterionResult { - query_tree: None, - candidates: Some(candidates.clone()), - bucket_candidates: candidates, - })); - }, - (None, Forbidden(_)) => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates: None, - bucket_candidates, - })); + match candidates_authorization { + Allowed(allowed_candidates) => { + candidates &= &*allowed_candidates; + *allowed_candidates -= &candidates; }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); - self.number_typos = 0; - self.candidates = candidates.map_or_else(|| { + Forbidden(forbidden_candidates) => { + candidates -= &*forbidden_candidates; + *forbidden_candidates |= &candidates; + }, + } + + let bucket_candidates = match self.bucket_candidates.as_mut() { + Some(bucket_candidates) => take(bucket_candidates), + None => candidates.clone(), + }; + + self.typos += 1; + + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: Some(candidates), + bucket_candidates: Some(bucket_candidates), + })); + }, + None => { + match self.parent.next(params)? 
{ + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), + (self_bc, parent_bc) => self_bc.or(parent_bc), + }; + + let candidates = candidates.map_or_else(|| { Candidates::Forbidden(params.excluded_candidates.clone()) }, Candidates::Allowed); - self.bucket_candidates.union_with(&bucket_candidates); + + let maximum_typos = maximum_typo(&query_tree) as u8; + self.state = Some((maximum_typos, query_tree, candidates)); + self.typos = 0; + + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + bucket_candidates, + })); }, None => return Ok(None), } @@ -185,7 +176,6 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) }, Operation::Query(q) => { - // TODO may be optimized when number_typos == 0 if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, // and directly create an Exact query @@ -384,7 +374,7 @@ mod test { ]), ])), candidates: Some(candidates_1.clone()), - bucket_candidates: candidates_1, + bucket_candidates: Some(candidates_1), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -406,7 +396,7 @@ mod test { ]), ])), candidates: Some(candidates_2.clone()), - bucket_candidates: candidates_2, + bucket_candidates: Some(candidates_2), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); @@ -428,7 +418,7 @@ mod test { let expected = CriterionResult { query_tree: None, candidates: Some(facet_candidates.clone()), - bucket_candidates: facet_candidates, + bucket_candidates: None, }; // first iteration, returns the facet candidates @@ -471,7 +461,7 @@ mod test { ]), ])), candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: facet_candidates.clone(), + bucket_candidates: Some(&candidates_1 & &facet_candidates), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -493,7 +483,7 @@ mod test { ]), ])), candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: RoaringBitmap::new(), + bucket_candidates: Some(&candidates_2 & &facet_candidates), }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 23a45223a..2f7ebbfbf 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::mem::take; use log::debug; @@ -11,9 +10,9 @@ pub struct Words<'t> { ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, - bucket_candidates: RoaringBitmap, + bucket_candidates: Option, parent: Box, - candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, + compute_candidates: bool, } impl<'t> Words<'t> { @@ -22,9 +21,9 @@ impl<'t> Words<'t> { ctx, query_trees: Vec::default(), candidates: None, - bucket_candidates: RoaringBitmap::new(), + bucket_candidates: None, parent, - candidates_cache: HashMap::default(), + compute_candidates: false, } } } @@ -40,55 +39,48 @@ impl<'t> Criterion for Words<'t> { loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); - match (self.query_trees.pop(), &mut self.candidates) { - (query_tree, Some(candidates)) if 
candidates.is_empty() => { - self.query_trees = Vec::new(); - return Ok(Some(CriterionResult { - query_tree, - candidates: self.candidates.take(), - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (Some(qt), Some(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, params.wdcache)?; - found_candidates.intersect_with(&candidates); - candidates.difference_with(&found_candidates); + match self.query_trees.pop() { + Some(query_tree) => { + let candidates = match self.candidates.as_mut() { + Some(allowed_candidates) if self.compute_candidates => { + let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; + candidates &= &*allowed_candidates; + *allowed_candidates -= &candidates; + Some(candidates) + }, + candidates => candidates.cloned(), + }; + + let bucket_candidates = match self.bucket_candidates.as_mut() { + Some(bucket_candidates) => Some(take(bucket_candidates)), + None => None, + }; return Ok(Some(CriterionResult { - query_tree: Some(qt), - candidates: Some(found_candidates), - bucket_candidates: take(&mut self.bucket_candidates), + query_tree: Some(query_tree), + candidates, + bucket_candidates, })); }, - (Some(qt), None) => { - return Ok(Some(CriterionResult { - query_tree: Some(qt), - candidates: None, - bucket_candidates: take(&mut self.bucket_candidates), - })); - }, - (None, Some(_)) => { - let candidates = self.candidates.take(); - return Ok(Some(CriterionResult { - query_tree: None, - candidates: candidates.clone(), - bucket_candidates: candidates.unwrap_or_default(), - })); - }, - (None, None) => { + None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + self.query_trees = explode_query_tree(query_tree); + self.candidates = candidates; + self.compute_candidates = bucket_candidates.is_some(); + + self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), + (self_bc, parent_bc) => self_bc.or(parent_bc), + }; + }, + Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, - candidates: None, + candidates, bucket_candidates, })); }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = candidates; - self.bucket_candidates.union_with(&bucket_candidates); - }, None => return Ok(None), } }, From efba662ca6ffc785acc5ae2ca7372016f010ea35 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 10 May 2021 10:27:18 +0200 Subject: [PATCH 0678/1889] Fix clippy warnings in criteria --- milli/src/search/criteria/asc_desc.rs | 4 +-- milli/src/search/criteria/attribute.rs | 20 +++++++------- milli/src/search/criteria/final.rs | 38 ++++++++++++-------------- milli/src/search/criteria/typo.rs | 36 ++++++++++++------------ milli/src/search/query_tree.rs | 10 +++---- 5 files changed, 54 insertions(+), 54 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 7b619f26a..31edc453e 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -215,7 +215,7 @@ fn iterative_facet_ordered_iter<'t>( docids_values.push((docid, OrderedFloat(value))); } } -
docids_values.sort_unstable_by_key(|(_, v)| v.clone()); + docids_values.sort_unstable_by_key(|(_, v)| *v); let iter = docids_values.into_iter(); let iter = if ascending { Box::new(iter) as Box> @@ -226,7 +226,7 @@ fn iterative_facet_ordered_iter<'t>( // The itertools GroupBy iterator doesn't provide an owned version, we are therefore // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 - let vec: Vec<_> = iter.group_by(|(_, v)| v.clone()) + let vec: Vec<_> = iter.group_by(|(_, v)| *v) .into_iter() .map(|(_, ids)| ids.map(|(id, _)| id).collect()) .collect(); diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index fc7050a7f..9d4d88d0d 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -155,7 +155,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { match ctx.word_position_last_level(&word, in_prefix_cache)? { Some(level) => { - let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level.clone()) as u32); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) }, @@ -164,8 +164,8 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { } fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { - let level = level.min(&self.level).clone(); - let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level.clone()) as u32); + let level = *level.min(&self.level); + let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; @@ -214,7 +214,7 @@ struct QueryLevelIterator<'t, 'q> { } impl<'t, 'q> QueryLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn new(ctx: &'t dyn Context<'t>, queries: &'q [Query], wdcache: &mut WordDerivationsCache) -> anyhow::Result> { let mut inner = Vec::with_capacity(queries.len()); for query in queries { match &query.kind { @@ -244,7 +244,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { } } - let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); + let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level); match highest { Some(level) => Ok(Some(Self { parent: None, @@ -287,7 +287,7 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let u8_level = Into::::into(level); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); for wli in self.inner.iter_mut() { - let wli_u8_level = Into::::into(wli.level.clone()); + let wli_u8_level = Into::::into(wli.level); let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { if let Some((next_left, _, next_docids)) = wli.next()? 
{ @@ -364,8 +364,8 @@ fn interval_to_skip( already_skiped: usize, allowed_candidates: &RoaringBitmap, ) -> usize { - parent_accumulator.into_iter() - .zip(current_accumulator.into_iter()) + parent_accumulator.iter() + .zip(current_accumulator.iter()) .skip(already_skiped) .take_while(|(parent, current)| { let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); @@ -410,7 +410,7 @@ impl<'t, 'q> Branch<'t, 'q> { /// update inner interval in order to be ranked by the binary_heap without computing it, /// the next() method should be called when the real interval is needed. fn lazy_next(&mut self) { - let u8_level = Into::::into(self.tree_level.clone()); + let u8_level = Into::::into(self.tree_level); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); let (left, right, _) = self.last_result; @@ -679,7 +679,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] } else { - ops.into_iter().map(recurse).flatten().collect() + ops.iter().map(recurse).flatten().collect() }, Operation::Query(query) => vec![vec![vec![query.clone()]]], } diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 0dbf3ee1a..e2fb81aaf 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -30,30 +30,28 @@ impl<'t> Final<'t> { #[logging_timer::time("Final::{}")] pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> anyhow::Result> { - loop { - debug!("Final iteration"); - let mut criterion_parameters = CriterionParameters { - wdcache: &mut self.wdcache, - // returned_candidates is merged with excluded_candidates to avoid duplicas - excluded_candidates: &(&self.returned_candidates | excluded_candidates), - }; + debug!("Final iteration"); + let mut criterion_parameters = CriterionParameters { + wdcache: &mut self.wdcache, + // returned_candidates is merged with excluded_candidates to avoid duplicas + excluded_candidates: &(&self.returned_candidates | excluded_candidates), + }; - match self.parent.next(&mut criterion_parameters)? { - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (candidates, query_tree.as_ref()) { - (Some(candidates), _) => candidates, - (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, - (None, None) => self.ctx.documents_ids()?, - }; + match self.parent.next(&mut criterion_parameters)? 
{ + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (candidates, query_tree.as_ref()) { + (Some(candidates), _) => candidates, + (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, + (None, None) => self.ctx.documents_ids()?, + }; - let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); + let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); - self.returned_candidates |= &candidates; + self.returned_candidates |= &candidates; - return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); - }, - None => return Ok(None), - } + return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); + }, + None => return Ok(None), } } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 288a92f65..059d52e48 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -70,22 +70,24 @@ impl<'t> Criterion for Typo<'t> { }, Some((_, query_tree, candidates_authorization)) => { let fst = self.ctx.words_fst(); - let new_query_tree = if self.typos < MAX_TYPOS_PER_WORD { - alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? - } else if self.typos == MAX_TYPOS_PER_WORD { - // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, - // we keep the altered query tree - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; - // we compute the allowed candidates - let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; - // we assign the allowed candidates to the candidates authorization. - *candidates_authorization = match take(candidates_authorization) { - Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), - Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), - }; - query_tree.clone() - } else { - query_tree.clone() + let new_query_tree = match self.typos { + typos if typos < MAX_TYPOS_PER_WORD => { + alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? + }, + MAX_TYPOS_PER_WORD => { + // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, + // we keep the altered query tree + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; + // we compute the allowed candidates + let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; + // we assign the allowed candidates to the candidates authorization. 
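// [Editor's aside — illustrative sketch, not part of the patch.] take()
// swaps the enum for its Default (an empty Forbidden set) so the old value
// can be matched by value; both arms then collapse into an Allowed set. A
// minimal sketch of that conversion, where `universe` stands in for the
// docids resolved from the query tree (CandidatesSketch as in the earlier
// aside):
use roaring::RoaringBitmap;
enum CandidatesSketch { Allowed(RoaringBitmap), Forbidden(RoaringBitmap) }
fn narrow_to_allowed(auth: CandidatesSketch, universe: RoaringBitmap) -> CandidatesSketch {
    match auth {
        // keep only the resolved ids that were already allowed
        CandidatesSketch::Allowed(allowed) => CandidatesSketch::Allowed(universe & allowed),
        // allow the resolved ids, minus the ones that were forbidden
        CandidatesSketch::Forbidden(forbidden) => CandidatesSketch::Allowed(universe - forbidden),
    }
}
// [End of aside.]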
+ *candidates_authorization = match take(candidates_authorization) { + Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), + Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), + }; + query_tree.clone() + }, + _otherwise => query_tree.clone(), }; let mut candidates = resolve_candidates( @@ -187,7 +189,7 @@ fn alterate_query_tree( } else { let typo = *typo.min(&number_typos); let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; - let queries = words.into_iter().map(|(word, typo)| { + let queries = words.iter().map(|(word, typo)| { Operation::Query(Query { prefix: false, kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() }, diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index b74b8af58..4876e37c8 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -241,7 +241,7 @@ impl<'a> QueryTreeBuilder<'a> { } /// Split the word depending on the frequency of subwords in the database documents. -fn split_best_frequency<'a>(ctx: &impl Context, word: &'a str) -> heed::Result> { +fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result> { let chars = word.char_indices().skip(1); let mut best = None; @@ -438,14 +438,14 @@ fn create_query_tree( let start = number_phrases + (number_phrases == 0) as usize; for len in start..=query.len() { let mut word_count = len - number_phrases; - let query: Vec<_> = query.iter().filter_map(|p| { + let query: Vec<_> = query.iter().filter(|p| { if p.is_phrase() { - Some(p) + true } else if word_count != 0 { word_count -= 1; - Some(p) + true } else { - None + false } }) .cloned() From a3944a7083ebe3a4729051b59da03f30d2e78512 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 10 May 2021 12:33:37 +0200 Subject: [PATCH 0679/1889] Introduce a filtered_candidates field --- milli/src/search/criteria/asc_desc.rs | 20 ++++++++++---------- milli/src/search/criteria/attribute.rs | 16 +++++++++++++--- milli/src/search/criteria/exactness.rs | 16 +++++++++++----- milli/src/search/criteria/final.rs | 19 ++++++++++++------- milli/src/search/criteria/initial.rs | 5 +++-- milli/src/search/criteria/mod.rs | 2 ++ milli/src/search/criteria/proximity.rs | 12 +++++++++--- milli/src/search/criteria/typo.rs | 20 ++++++++++++++------ milli/src/search/criteria/words.rs | 16 +++++++++------- 9 files changed, 83 insertions(+), 43 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 31edc453e..0511ce319 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -93,23 +93,22 @@ impl<'t> Criterion for AscDesc<'t> { match self.candidates.next().transpose()? { None => { match self.parent.next(params)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { self.query_tree = query_tree; - let candidates = match (&self.query_tree, candidates) { - (_, Some(mut candidates)) => { - candidates.intersect_with(&self.faceted_candidates); - candidates - }, + let mut candidates = match (&self.query_tree, candidates) { + (_, Some(candidates)) => candidates & &self.faceted_candidates, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let mut candidates = resolve_query_tree(&context, qt, params.wdcache)?; - candidates -= params.excluded_candidates; - candidates.intersect_with(&self.faceted_candidates); - candidates + let candidates = resolve_query_tree(&context, qt, params.wdcache)?; + candidates & &self.faceted_candidates }, (None, None) => take(&mut self.faceted_candidates), }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + match bucket_candidates { Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, None => self.bucket_candidates |= &candidates, @@ -136,6 +135,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 9d4d88d0d..6818e02fd 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -25,6 +25,7 @@ const LEVEL_EXPONENTIATION_BASE: u32 = 4; const CANDIDATES_THRESHOLD: u64 = 1000; type FlattenedQueryTree = Vec>>; + pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, @@ -59,6 +60,7 @@ impl<'t> Criterion for Attribute<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, @@ -78,6 +80,7 @@ impl<'t> Criterion for Attribute<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, @@ -89,6 +92,7 @@ impl<'t> Criterion for Attribute<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, @@ -102,17 +106,22 @@ impl<'t> Criterion for Attribute<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates: Some(found_candidates), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { - let candidates = match candidates { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { + let mut candidates = match candidates { Some(candidates) => candidates, None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- params.excluded_candidates, }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + let flattened_query_tree = flatten_query_tree(&query_tree); match bucket_candidates { @@ -123,10 +132,11 @@ impl<'t> Criterion for Attribute<'t> { self.state = Some((query_tree, flattened_query_tree, candidates)); self.current_buckets = None; }, - Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 2fd216053..b1026ccc2 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -66,17 +66,22 @@ impl<'t> Criterion for Exactness<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { - let candidates = match candidates { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { + let mut candidates = match candidates { Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)?, + None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + match bucket_candidates { Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, None => self.bucket_candidates |= &candidates, @@ -85,10 +90,11 @@ impl<'t> Criterion for Exactness<'t> { self.state = Some(State::new(candidates)); self.query_tree = Some(query_tree); }, - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { - query_tree, + query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index e2fb81aaf..860362f51 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -31,27 +31,32 @@ impl<'t> Final<'t> { #[logging_timer::time("Final::{}")] pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> anyhow::Result> { debug!("Final iteration"); + let excluded_candidates = &self.returned_candidates | excluded_candidates; let mut criterion_parameters = CriterionParameters { wdcache: &mut self.wdcache, // returned_candidates is merged with excluded_candidates to avoid duplicas - excluded_candidates: &(&self.returned_candidates | excluded_candidates), + excluded_candidates: &excluded_candidates, }; match self.parent.next(&mut criterion_parameters)? 
{ - Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { - let candidates = match (candidates, query_tree.as_ref()) { + Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { + let mut candidates = match (candidates, query_tree.as_ref()) { (Some(candidates), _) => candidates, - (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?, - (None, None) => self.ctx.documents_ids()?, + (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates, + (None, None) => self.ctx.documents_ids()? - excluded_candidates, }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); self.returned_candidates |= &candidates; - return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); + Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) }, - None => return Ok(None), + None => Ok(None), } } } diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index eb2bf0b95..5d242a0eb 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -9,10 +9,11 @@ pub struct Initial { } impl Initial { - pub fn new(query_tree: Option, mut candidates: Option) -> Initial { + pub fn new(query_tree: Option, filtered_candidates: Option) -> Initial { let answer = CriterionResult { query_tree, - candidates: candidates.take(), + candidates: None, + filtered_candidates, bucket_candidates: None, }; Initial { answer: Some(answer) } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index f9fca2624..99e4a4209 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -38,6 +38,8 @@ pub struct CriterionResult { /// The candidates that this criterion is allowed to return subsets of, /// if None, it is up to the child to compute the candidates itself. candidates: Option, + /// The candidates, coming from facet filters, that this criterion is allowed to return subsets of. + filtered_candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. bucket_candidates: Option, } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index e50d3941d..bf9be9b9f 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -113,17 +113,22 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), candidates: Some(new_candidates), + filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { - let candidates = match candidates { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { + let mut candidates = match candidates { Some(candidates) => candidates, None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- params.excluded_candidates, }; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + match bucket_candidates { Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, None => self.bucket_candidates |= &candidates, @@ -134,10 +139,11 @@ impl<'t> Criterion for Proximity<'t> { self.proximity = 0; self.plane_sweep_cache = None; }, - Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 059d52e48..a844417eb 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -119,30 +119,33 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: Some(candidates), + filtered_candidates: None, bucket_candidates: Some(bucket_candidates), })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; - let candidates = candidates.map_or_else(|| { - Candidates::Forbidden(params.excluded_candidates.clone()) - }, Candidates::Allowed); + let candidates = match candidates.or(filtered_candidates) { + Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates), + None => Candidates::Forbidden(params.excluded_candidates.clone()), + }; let maximum_typos = maximum_typo(&query_tree) as u8; self.state = Some((maximum_typos, query_tree, candidates)); self.typos = 0; }, - Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, @@ -377,6 +380,7 @@ mod test { ])), candidates: Some(candidates_1.clone()), bucket_candidates: Some(candidates_1), + filtered_candidates: None, }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -399,6 +403,7 @@ mod test { ])), candidates: Some(candidates_2.clone()), bucket_candidates: Some(candidates_2), + filtered_candidates: None, }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); @@ -419,8 +424,9 @@ mod test { let expected = CriterionResult { query_tree: None, - candidates: Some(facet_candidates.clone()), + candidates: None, bucket_candidates: None, + filtered_candidates: Some(facet_candidates.clone()), }; // first iteration, returns the facet candidates @@ -464,6 +470,7 @@ mod test { ])), candidates: Some(&candidates_1 & &facet_candidates), bucket_candidates: Some(&candidates_1 & &facet_candidates), + filtered_candidates: None, }; assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); @@ -486,6 +493,7 @@ mod test { ])), candidates: Some(&candidates_2 & &facet_candidates), bucket_candidates: Some(&candidates_2 & &facet_candidates), + filtered_candidates: None, }; 
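// [Editor's aside — illustrative sketch, not part of the patch.] The
// expected results above now carry the new filtered_candidates field; each
// criterion that receives one from its parent folds it into its candidates
// with the same recurring step, sketched here:
use roaring::RoaringBitmap;
fn fold_filtered_candidates(
    mut candidates: RoaringBitmap,
    filtered_candidates: Option<RoaringBitmap>,
) -> RoaringBitmap {
    if let Some(filtered_candidates) = filtered_candidates {
        // keep only the ids that the facet filters allow
        candidates &= filtered_candidates;
    }
    candidates
}
// [End of aside.]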
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 2f7ebbfbf..8730fa331 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -11,8 +11,8 @@ pub struct Words<'t> { query_trees: Vec, candidates: Option, bucket_candidates: Option, + filtered_candidates: Option, parent: Box, - compute_candidates: bool, } impl<'t> Words<'t> { @@ -23,7 +23,7 @@ impl<'t> Words<'t> { candidates: None, bucket_candidates: None, parent, - compute_candidates: false, + filtered_candidates: None, } } } @@ -42,13 +42,13 @@ impl<'t> Criterion for Words<'t> { match self.query_trees.pop() { Some(query_tree) => { let candidates = match self.candidates.as_mut() { - Some(allowed_candidates) if self.compute_candidates => { + Some(allowed_candidates) => { let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; candidates &= &*allowed_candidates; *allowed_candidates -= &candidates; Some(candidates) }, - candidates => candidates.cloned(), + None => None, }; let bucket_candidates = match self.bucket_candidates.as_mut() { @@ -59,25 +59,27 @@ impl<'t> Criterion for Words<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates, + filtered_candidates: self.filtered_candidates.clone(), bucket_candidates, })); }, None => { match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { self.query_trees = explode_query_tree(query_tree); self.candidates = candidates; - self.compute_candidates = bucket_candidates.is_some(); + self.filtered_candidates = filtered_candidates; self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; }, - Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, + filtered_candidates, bucket_candidates, })); }, From df7a32e3d0589b72c82a323968a5e72edf34f912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 21 Apr 2021 11:48:23 +0200 Subject: [PATCH 0680/1889] Move the creation date initialization into a function --- milli/src/index.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f222069f6..07af6286b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -81,16 +81,7 @@ impl Index { let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; - { - let mut txn = env.write_txn()?; - // The db was just created, we update its metadata with the relevant information. 
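// [Editor's note — not part of the patch.] The block being removed here is
// moved, essentially unchanged, into the new Index::initialize_creation_dates
// helper whose definition appears in the context of the next commit below;
// the stored created-at/updated-at metadata itself does not change.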
- if main.get::<_, Str, SerdeJson>>(&txn, CREATED_AT_KEY)?.is_none() { - let now = Utc::now(); - main.put::<_, Str, SerdeJson>>(&mut txn, UPDATED_AT_KEY, &now)?; - main.put::<_, Str, SerdeJson>>(&mut txn, CREATED_AT_KEY, &now)?; - txn.commit()?; - } - } + Index::initialize_creation_dates(&env, main)?; Ok(Index { env, From a56c46b6f1dbfe22bef9b33535d5e4ae236030ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 21 Apr 2021 11:49:26 +0200 Subject: [PATCH 0681/1889] Explode the string and f64 facet databases into two --- milli/src/index.rs | 50 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 07af6286b..305d95cc7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -14,6 +14,10 @@ use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, }; +use crate::heed_codec::facet::{ + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FacetValueStringCodec, FacetLevelValueF64Codec, +}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; @@ -40,33 +44,45 @@ const UPDATED_AT_KEY: &str = "updated-at"; pub struct Index { /// The LMDB environment which this index is associated with. pub env: heed::Env, + /// Contains many different types (e.g. the fields ids map). pub main: PolyDatabase, + /// A word and all the documents ids containing the word. pub word_docids: Database, /// A prefix of word and all the documents ids containing this prefix. pub word_prefix_docids: Database, + /// Maps a word and a document id (u32) to all the positions where the given word appears. pub docid_word_positions: Database, + /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the word, level and position range with the docids that corresponds to it. pub word_level_position_docids: Database, /// Maps the level positions of a word prefix with all the docids where this prefix appears. pub word_prefix_level_position_docids: Database, - /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. - pub facet_field_id_value_docids: Database, - /// Maps the document id, the facet field id and the globally ordered value. - pub field_id_docid_facet_values: Database, + + /// Maps the facet field id, level and the number with the docids that corresponds to it. + pub facet_id_f64_docids: Database, + /// Maps the facet field id and the string with the docids that corresponds to it. + pub facet_id_string_docids: Database, + + /// Maps the document id, the facet field id and the numbers. + pub field_id_docid_facet_f64s: Database, + /// Maps the document id, the facet field id and the strings. + pub field_id_docid_facet_strings: Database, + /// Maps the document id to the document as an obkv store. 
pub documents: Database, ObkvCodec>, } impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(11); + options.max_dbs(13); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -77,8 +93,10 @@ impl Index { let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; - let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; - let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; + let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?; + let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?; + let field_id_docid_facet_f64s = env.create_database(Some("field-id-docid-facet-f64s"))?; + let field_id_docid_facet_strings = env.create_database(Some("field-id-docid-facet-strings"))?; let documents = env.create_database(Some("documents"))?; Index::initialize_creation_dates(&env, main)?; @@ -93,12 +111,26 @@ impl Index { word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values, + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, documents, }) } + fn initialize_creation_dates(env: &heed::Env, main: PolyDatabase) -> heed::Result<()> { + let mut txn = env.write_txn()?; + // The db was just created, we update its metadata with the relevant information. + if main.get::<_, Str, SerdeJson>>(&txn, CREATED_AT_KEY)?.is_none() { + let now = Utc::now(); + main.put::<_, Str, SerdeJson>>(&mut txn, UPDATED_AT_KEY, &now)?; + main.put::<_, Str, SerdeJson>>(&mut txn, CREATED_AT_KEY, &now)?; + txn.commit()?; + } + Ok(()) + } + /// Create a write transaction to be able to write into the index. 
pub fn write_txn(&self) -> heed::Result { self.env.write_txn() From 837c1041c7c02f47100c78a6ae9b18fc315f68e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 21 Apr 2021 15:43:44 +0200 Subject: [PATCH 0682/1889] Clear and delete the documents from the facet database --- milli/src/update/clear_documents.rs | 12 ++- milli/src/update/delete_documents.rs | 144 +++++++++++++++++---------- 2 files changed, 100 insertions(+), 56 deletions(-) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index f89c2d00c..ba0c9e58e 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -30,8 +30,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values, + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, documents, } = self.index; @@ -59,8 +61,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_level_position_docids.clear(self.wtxn)?; word_prefix_level_position_docids.clear(self.wtxn)?; - facet_field_id_value_docids.clear(self.wtxn)?; - field_id_docid_facet_values.clear(self.wtxn)?; + facet_id_f64_docids.clear(self.wtxn)?; + facet_id_string_docids.clear(self.wtxn)?; + field_id_docid_facet_f64s.clear(self.wtxn)?; + field_id_docid_facet_strings.clear(self.wtxn)?; documents.clear(self.wtxn)?; Ok(number_of_documents) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 4c5f8d61a..b2b1e8410 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -4,13 +4,12 @@ use std::collections::hash_map::Entry; use anyhow::anyhow; use chrono::Utc; use fst::IntoStreamer; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; use serde_json::Value; -use crate::facet::FacetType; -use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -90,8 +89,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values, + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, documents, } = self.index; @@ -285,52 +286,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); - // Remove the documents ids from the faceted documents ids. - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; - for (field_id, facet_type) in faceted_fields { - let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; - docids.difference_with(&self.documents_ids); - self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; - - // We delete the entries that are part of the documents ids. 
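// [Editor's aside — illustrative sketch, not part of the patch.] Both the
// removed loops and the new helpers below share one heed idiom: open a
// mutable prefix iterator and call del_current() on the entries to drop. A
// schematic sketch of that idiom (ByteSlice/Unit stand in for the real
// codecs, and DocumentId is a u32 here):
use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap;
fn delete_entries_with_prefix(
    wtxn: &mut heed::RwTxn,
    db: &heed::Database<ByteSlice, Unit>,
    prefix: &[u8],
    to_remove: &RoaringBitmap,
    docid_of: impl Fn(&[u8]) -> u32,
) -> heed::Result<()> {
    let mut iter = db.prefix_iter_mut(wtxn, prefix)?;
    while let Some(result) = iter.next() {
        let (key, ()) = result?;
        if to_remove.contains(docid_of(key)) {
            // delete the entry the cursor currently points at
            iter.del_current()?;
        }
    }
    Ok(())
}
// [End of aside.]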
- let iter = field_id_docid_facet_values.prefix_iter_mut(self.wtxn, &[field_id])?; - match facet_type { - FacetType::String => { - let mut iter = iter.remap_key_type::(); - while let Some(result) = iter.next() { - let ((_fid, docid, _value), ()) = result?; - if self.documents_ids.contains(docid) { - iter.del_current()?; - } - } - }, - FacetType::Number => { - let mut iter = iter.remap_key_type::(); - while let Some(result) = iter.next() { - let ((_fid, docid, _value), ()) = result?; - if self.documents_ids.contains(docid) { - iter.del_current()?; - } - } - }, - } - } - - // We delete the documents ids that are under the facet field id values. - let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); - if docids.is_empty() { - iter.del_current()?; - } else if docids.len() != previous_len { - iter.put_current(bytes, &docids)?; - } - } - - drop(iter); - // We delete the documents ids that are under the word level position docids. let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { @@ -361,10 +316,95 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the facet field id values. + remove_docids_from_facet_field_id_value_docids( + self.wtxn, + facet_id_f64_docids, + &self.documents_ids, + )?; + + remove_docids_from_facet_field_id_value_docids( + self.wtxn, + facet_id_string_docids, + &self.documents_ids, + )?; + + // Remove the documents ids from the faceted documents ids. + let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + for (field_id, facet_type) in faceted_fields { + let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; + docids.difference_with(&self.documents_ids); + self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; + + remove_docids_from_field_id_docid_facet_value( + self.wtxn, + field_id_docid_facet_f64s, + field_id, + &self.documents_ids, + |(_fid, docid, _value)| docid, + )?; + + remove_docids_from_field_id_docid_facet_value( + self.wtxn, + field_id_docid_facet_strings, + field_id, + &self.documents_ids, + |(_fid, docid, _value)| docid, + )?; + } + Ok(self.documents_ids.len()) } } +fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database, + field_id: FieldId, + to_remove: &RoaringBitmap, + convert: F, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>, + F: Fn(K) -> DocumentId, +{ + let mut iter = db.remap_key_type::() + .prefix_iter_mut(wtxn, &[field_id])? 
+ .remap_key_type::(); + + while let Some(result) = iter.next() { + let (key, ()) = result?; + if to_remove.contains(convert(key)) { + iter.del_current()?; + } + } + + Ok(()) +} + +fn remove_docids_from_facet_field_id_value_docids<'a, C>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database, + to_remove: &RoaringBitmap, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, +{ + let mut iter = db.remap_key_type::().iter_mut(wtxn)?; + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(to_remove); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + Ok(()) +} + #[cfg(test)] mod tests { use heed::EnvOpenOptions; From 597144b0b93ff23ef7523685d5112ecc4ce799d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 28 Apr 2021 10:20:31 +0200 Subject: [PATCH 0683/1889] Use both number and string facet databases in the distinct system --- milli/src/search/criteria/asc_desc.rs | 57 ++++++----- milli/src/search/distinct/facet_distinct.rs | 101 +++++++++++--------- 2 files changed, 85 insertions(+), 73 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 0511ce319..9e8bebb8f 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -8,7 +8,6 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; @@ -39,8 +38,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result - { + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_name, true) } @@ -49,8 +47,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result - { + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_name, false) } @@ -60,11 +57,11 @@ impl<'t> AscDesc<'t> { parent: Box, field_name: String, ascending: bool, - ) -> anyhow::Result - { + ) -> anyhow::Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?; - let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; + let (field_id, facet_type) = + field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; Ok(AscDesc { index, @@ -86,8 +83,10 @@ impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { loop { - debug!("Facet {}({}) iteration", - if self.ascending { "Asc" } else { "Desc" }, self.field_name + debug!( + "Facet {}({}) iteration", + if self.ascending { "Asc" } else { "Desc" }, + self.field_name ); match self.candidates.next().transpose()? 
{ @@ -138,7 +137,7 @@ impl<'t> Criterion for AscDesc<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } } } } @@ -148,14 +147,13 @@ fn field_id_facet_type( fields_ids_map: &FieldsIdsMap, faceted_fields: &HashMap, field: &str, -) -> anyhow::Result<(FieldId, FacetType)> -{ - let id = fields_ids_map.id(field).with_context(|| { - format!("field {:?} isn't registered", field) - })?; - let facet_type = faceted_fields.get(field).with_context(|| { - format!("field {:?} isn't faceted", field) - })?; +) -> anyhow::Result<(FieldId, FacetType)> { + let id = fields_ids_map + .id(field) + .with_context(|| format!("field {:?} isn't registered", field))?; + let facet_type = faceted_fields + .get(field) + .with_context(|| format!("field {:?} isn't faceted", field))?; Ok((id, *facet_type)) } @@ -170,14 +168,12 @@ fn facet_ordered<'t>( facet_type: FacetType, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result> + 't>> -{ +) -> anyhow::Result> + 't>> { match facet_type { FacetType::Number => { if candidates.len() <= CANDIDATES_THRESHOLD { - let iter = iterative_facet_ordered_iter( - index, rtxn, field_id, ascending, candidates, - )?; + let iter = + iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) } else { let facet_fn = if ascending { @@ -188,7 +184,7 @@ fn facet_ordered<'t>( let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } - }, + } FacetType::String => bail!("criteria facet type must be a number"), } } @@ -202,14 +198,14 @@ fn iterative_facet_ordered_iter<'t>( field_id: FieldId, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result + 't> -{ - let db = index.field_id_docid_facet_values.remap_key_type::(); +) -> anyhow::Result + 't> { let mut docids_values = Vec::with_capacity(candidates.len() as usize); for docid in candidates.iter() { let left = (field_id, docid, f64::MIN); let right = (field_id, docid, f64::MAX); - let mut iter = db.range(rtxn, &(left..=right))?; + let mut iter = index + .field_id_docid_facet_f64s + .range(rtxn, &(left..=right))?; let entry = if ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? { docids_values.push((docid, OrderedFloat(value))); @@ -226,7 +222,8 @@ fn iterative_facet_ordered_iter<'t>( // The itertools GroupBy iterator doesn't provide an owned version, we are therefore // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 - let vec: Vec<_> = iter.group_by(|(_, v)| *v) + let vec: Vec<_> = iter + .group_by(|(_, v)| v.clone()) .into_iter() .map(|(_, ids)| ids.map(|(id, _)| id).collect()) .collect(); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 3c508b25b..f3952e6f1 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,10 +1,14 @@ use std::mem::size_of; +use heed::types::ByteSlice; use roaring::RoaringBitmap; +use super::{Distinct, DocIter}; use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; -use super::{Distinct, DocIter}; + +const FID_SIZE: usize = size_of::(); +const DOCID_SIZE: usize = size_of::(); /// A distinct implementer that is backed by facets. 
/// @@ -48,31 +52,27 @@ pub struct FacetDistinctIter<'a> { } impl<'a> FacetDistinctIter<'a> { - fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result - where - KC: heed::BytesEncode<'c>, - { - let facet_docids = self - .index - .facet_field_id_value_docids - .remap_key_type::() - .get(self.txn, key)? - .expect("Corrupted data: Facet values must exist"); - Ok(facet_docids) + fn facet_string_docids(&self, key: &str) -> heed::Result> { + self.index + .facet_id_string_docids + .get(self.txn, &(self.distinct, key)) + } + + fn facet_number_docids(&self, key: f64) -> heed::Result> { + // get facet docids on level 0 + self.index + .facet_id_f64_docids + .get(self.txn, &(self.distinct, 0, key, key)) } fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { - let iter = get_facet_values::( - id, - self.distinct, - self.index, - self.txn, - )?; + let iter = facet_string_values(id, self.distinct, self.index, self.txn)?; for item in iter { let ((_, _, value), _) = item?; - let key = (self.distinct, value); - let facet_docids = self.get_facet_docids::(&key)?; + let facet_docids = self + .facet_string_docids(value)? + .expect("Corrupted data: Facet values must exist"); self.excluded.union_with(&facet_docids); } @@ -82,17 +82,13 @@ impl<'a> FacetDistinctIter<'a> { } fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> { - let iter = get_facet_values::(id, - self.distinct, - self.index, - self.txn, - )?; + let iter = facet_number_values(id, self.distinct, self.index, self.txn)?; for item in iter { let ((_, _, value), _) = item?; - // get facet docids on level 0 - let key = (self.distinct, 0, value, value); - let facet_docids = self.get_facet_docids::(&key)?; + let facet_docids = self + .facet_number_docids(value)? + .expect("Corrupted data: Facet values must exist"); self.excluded.union_with(&facet_docids); } @@ -129,26 +125,44 @@ impl<'a> FacetDistinctIter<'a> { } } -fn get_facet_values<'a, KC>( +fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { + let mut key = [0; FID_SIZE + DOCID_SIZE]; + key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); + key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); + key +} + +fn facet_number_values<'a>( id: DocumentId, distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> anyhow::Result> -where - KC: heed::BytesDecode<'a>, -{ - const FID_SIZE: usize = size_of::(); - const DOCID_SIZE: usize = size_of::(); - - let mut key = [0; FID_SIZE + DOCID_SIZE]; - key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); - key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); +) -> anyhow::Result> { + let key = facet_values_prefix_key(distinct, id); let iter = index - .field_id_docid_facet_values + .field_id_docid_facet_f64s + .remap_key_type::() .prefix_iter(txn, &key)? - .remap_key_type::(); + .remap_key_type::(); + + Ok(iter) +} + +fn facet_string_values<'a>( + id: DocumentId, + distinct: FieldId, + index: &Index, + txn: &'a heed::RoTxn, +) -> anyhow::Result> { + let key = facet_values_prefix_key(distinct, id); + + let iter = index + .field_id_docid_facet_strings + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_key_type::(); + Ok(iter) } @@ -186,8 +200,8 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { mod test { use std::collections::HashMap; - use super::*; use super::super::test::{generate_index, validate_distinct_candidates}; + use super::*; use crate::facet::FacetType; macro_rules! 
test_facet_distinct { @@ -196,7 +210,8 @@ mod test { fn $name() { use std::iter::FromIterator; - let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let facets = + HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); let (index, fid, candidates) = generate_index($distinct, facets); let txn = index.read_txn().unwrap(); let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); From 038e03a4e42a4c195cab2cd6730cde541af52c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 28 Apr 2021 10:22:48 +0200 Subject: [PATCH 0684/1889] Use both facet databases in the FacetIter type --- milli/src/search/facet/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 0252af963..26bcf1b83 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -140,7 +140,7 @@ impl<'t> FacetIter<'t> { documents_ids: RoaringBitmap, ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; @@ -157,7 +157,7 @@ impl<'t> FacetIter<'t> { documents_ids: RoaringBitmap, ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Right(highest_iter))]; @@ -175,7 +175,7 @@ impl<'t> FacetIter<'t> { documents_ids: RoaringBitmap, ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; From bd7b285bae9b66866219eb8b73dd8f178f3810e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 28 Apr 2021 17:58:16 +0200 Subject: [PATCH 0685/1889] Split the update side to use the number and the strings facet databases --- milli/src/criterion.rs | 10 +- milli/src/index.rs | 96 ++++-- milli/src/search/criteria/asc_desc.rs | 41 +-- milli/src/search/distinct/facet_distinct.rs | 17 +- milli/src/search/mod.rs | 2 +- milli/src/update/clear_documents.rs | 6 +- milli/src/update/delete_documents.rs | 13 +- milli/src/update/facets.rs | 88 +++-- milli/src/update/index_documents/mod.rs | 63 ++-- milli/src/update/index_documents/store.rs | 341 ++++++++++++-------- milli/src/update/settings.rs | 14 +- 11 files changed, 406 insertions(+), 285 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 8bae99a20..1d7326db7 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::HashSet; use std::fmt; use anyhow::{Context, bail}; @@ -6,8 +6,6 @@ use regex::Regex; use serde::{Serialize, Deserialize}; use once_cell::sync::Lazy; -use crate::facet::FacetType; - static ASC_DESC_REGEX: Lazy = Lazy::new(|| { 
Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() }); @@ -33,7 +31,7 @@ pub enum Criterion { } impl Criterion { - pub fn from_str(faceted_attributes: &HashMap, txt: &str) -> anyhow::Result { + pub fn from_str(faceted_attributes: &HashSet, txt: &str) -> anyhow::Result { match txt { "words" => Ok(Criterion::Words), "typo" => Ok(Criterion::Typo), @@ -44,7 +42,9 @@ impl Criterion { let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; let order = caps.get(1).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str(); - faceted_attributes.get(field_name).with_context(|| format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name))?; + faceted_attributes.get(field_name).with_context(|| { + format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name) + })?; match order { "asc" => Ok(Criterion::Asc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())), diff --git a/milli/src/index.rs b/milli/src/index.rs index 305d95cc7..14b153a2e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::Path; use anyhow::Context; @@ -18,24 +18,24 @@ use crate::heed_codec::facet::{ FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FacetValueStringCodec, FacetLevelValueF64Codec, }; -use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; -pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; -pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; +pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; +pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; +pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; -pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; -pub const WORDS_FST_KEY: &str = "words-fst"; pub const STOP_WORDS_KEY: &str = "stop-words"; +pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; pub const SYNONYMS_KEY: &str = "synonyms"; +pub const WORDS_FST_KEY: &str = "words-fst"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; const UPDATED_AT_KEY: &str = "updated-at"; @@ -321,53 +321,97 @@ impl Index { /* faceted fields */ - /// Writes the facet fields associated with their facet type or `None` if - /// the facet type is currently unknown. - pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) + /// Writes the facet fields names in the database. + pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields) } - /// Deletes the facet fields ids associated with their facet type. 
+    /// Deletes the facet fields ids in the database.
     pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
         self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY)
     }
 
-    /// Returns the facet fields names associated with their facet type.
-    pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, FacetType>> {
+    /// Returns the facet fields names.
+    pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
         Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
     }
 
     /// Same as `faceted_fields`, but returns ids instead.
-    pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<HashMap<FieldId, FacetType>> {
+    pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<HashSet<FieldId>> {
         let faceted_fields = self.faceted_fields(rtxn)?;
         let fields_ids_map = self.fields_ids_map(rtxn)?;
         let faceted_fields = faceted_fields
             .iter()
-            .map(|(k, v)| {
-                let kid = fields_ids_map
+            .map(|k| {
+                fields_ids_map
                     .id(k)
                     .ok_or_else(|| format!("{:?} should be present in the field id map", k))
-                    .expect("corrupted data: ");
-                (kid, *v)
+                    .expect("corrupted data: ")
             })
             .collect();
+
         Ok(faceted_fields)
     }
 
     /* faceted documents ids */
 
-    /// Writes the documents ids that are faceted under this field id.
-    pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap) -> heed::Result<()> {
-        let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
-        buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
+    /// Writes the documents ids that are faceted with numbers under this field id.
+    pub fn put_number_faceted_documents_ids(
+        &self,
+        wtxn: &mut RwTxn,
+        field_id: FieldId,
+        docids: &RoaringBitmap,
+    ) -> heed::Result<()>
+    {
+        let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
+        buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
+            .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
         *buffer.last_mut().unwrap() = field_id;
         self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
     }
 
-    /// Retrieve all the documents ids that faceted under this field id.
-    pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: FieldId) -> heed::Result<RoaringBitmap> {
-        let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
-        buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
+    /// Retrieve all the documents ids that are faceted with numbers under this field id.
+    pub fn number_faceted_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        field_id: FieldId,
+    ) -> heed::Result<RoaringBitmap>
+    {
+        let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
+        buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()]
+            .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
+        *buffer.last_mut().unwrap() = field_id;
+        match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
+            Some(docids) => Ok(docids),
+            None => Ok(RoaringBitmap::new()),
+        }
+    }
+
+    /// Writes the documents ids that are faceted with strings under this field id.
+    pub fn put_string_faceted_documents_ids(
+        &self,
+        wtxn: &mut RwTxn,
+        field_id: FieldId,
+        docids: &RoaringBitmap,
+    ) -> heed::Result<()>
+    {
+        let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
+        buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
+            .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
+        *buffer.last_mut().unwrap() = field_id;
+        self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
+    }
+
+    /// Retrieve all the documents ids that are faceted with strings under this field id.
+    pub fn string_faceted_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        field_id: FieldId,
+    ) -> heed::Result<RoaringBitmap>
+    {
+        let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1];
+        buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()]
+            .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes());
         *buffer.last_mut().unwrap() = field_id;
         match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
             Some(docids) => Ok(docids),
diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs
index 9e8bebb8f..32857b8d7 100644
--- a/milli/src/search/criteria/asc_desc.rs
+++ b/milli/src/search/criteria/asc_desc.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 use std::mem::take;
 
-use anyhow::{bail, Context as _};
+use anyhow::Context;
 use itertools::Itertools;
 use log::debug;
 use ordered_float::OrderedFloat;
@@ -23,7 +23,6 @@ pub struct AscDesc<'t> {
     rtxn: &'t heed::RoTxn<'t>,
     field_name: String,
     field_id: FieldId,
-    facet_type: FacetType,
     ascending: bool,
     query_tree: Option<Operation>,
     candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
@@ -51,6 +50,7 @@ impl<'t> AscDesc<'t> {
         Self::new(index, rtxn, parent, field_name, false)
     }
 
+
     fn new(
         index: &'t Index,
         rtxn: &'t heed::RoTxn,
@@ -60,19 +60,19 @@ impl<'t> AscDesc<'t> {
     ) -> anyhow::Result<Self> {
         let fields_ids_map = index.fields_ids_map(rtxn)?;
         let faceted_fields = index.faceted_fields(rtxn)?;
-        let (field_id, facet_type) =
-            field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?;
+        let field_id = fields_ids_map
+            .id(&field_name)
+            .with_context(|| format!("field {:?} isn't registered", field_name))?;
 
         Ok(AscDesc {
             index,
             rtxn,
             field_name,
             field_id,
-            facet_type,
             ascending,
             query_tree: None,
             candidates: Box::new(std::iter::empty()),
-            faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?,
+            faceted_candidates: index.number_faceted_documents_ids(rtxn, field_id)?,
             bucket_candidates: RoaringBitmap::new(),
             parent,
         })
@@ -165,27 +165,20 @@ fn facet_ordered<'t>(
     index: &'t Index,
     rtxn: &'t heed::RoTxn,
     field_id: FieldId,
-    facet_type: FacetType,
     ascending: bool,
     candidates: RoaringBitmap,
 ) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
-    match facet_type {
-        FacetType::Number => {
-            if candidates.len() <= CANDIDATES_THRESHOLD {
-                let iter =
-                    iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
-                Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
-            } else {
-                let facet_fn = if ascending {
-                    FacetIter::new_reducing
-                } else {
-                    FacetIter::new_reverse_reducing
-                };
-                let iter = facet_fn(rtxn, index, field_id, candidates)?;
-                Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
-            }
-        }
-        FacetType::String => bail!("criteria facet type must be a number"),
+    if candidates.len() <= CANDIDATES_THRESHOLD {
+        let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
+        Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
+    } else {
+        let facet_fn = if ascending {
+            FacetIter::new_reducing
+        } else {
+            FacetIter::new_reverse_reducing
+        };
+        let iter = facet_fn(rtxn, index, field_id,
candidates)?; + Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index f3952e6f1..44dd6bc66 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::heed_codec::facet::*; -use crate::{facet::FacetType, DocumentId, FieldId, Index}; +use crate::{DocumentId, FieldId, Index}; const FID_SIZE: usize = size_of::(); const DOCID_SIZE: usize = size_of::(); @@ -22,7 +22,6 @@ pub struct FacetDistinct<'a> { distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>, - facet_type: FacetType, } impl<'a> FacetDistinct<'a> { @@ -30,14 +29,9 @@ impl<'a> FacetDistinct<'a> { distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>, - facet_type: FacetType, - ) -> Self { - Self { - distinct, - index, - txn, - facet_type, - } + ) -> Self + { + Self { distinct, index, txn } } } @@ -45,7 +39,6 @@ pub struct FacetDistinctIter<'a> { candidates: RoaringBitmap, distinct: FieldId, excluded: RoaringBitmap, - facet_type: FacetType, index: &'a Index, iter_offset: usize, txn: &'a heed::RoTxn<'a>, @@ -117,6 +110,7 @@ impl<'a> FacetDistinctIter<'a> { // increasing the offset we make sure to get the first valid value for the next // distinct document to keep. self.iter_offset += 1; + Ok(Some(id)) } // no more candidate at this offset, return. @@ -188,7 +182,6 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { candidates, distinct: self.distinct, excluded, - facet_type: self.facet_type, index: self.index, iter_offset: 0, txn: self.txn, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index be107bf72..640f081ba 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -145,7 +145,7 @@ impl<'a> Search<'a> { let faceted_fields = self.index.faceted_fields(self.rtxn)?; match faceted_fields.get(name) { Some(facet_type) => { - let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type); + let distinct = FacetDistinct::new(id, self.index, self.rtxn); self.perform_sort(distinct, matching_words, criteria) } None => { diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ba0c9e58e..c163046ec 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -49,8 +49,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?; // We clean all the faceted documents ids. - for (field_id, _) in faceted_fields { - self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?; + let empty = RoaringBitmap::default(); + for field_id in faceted_fields { + self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?; + self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?; } // Clear the other databases. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b2b1e8410..e93ff9a0a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -330,11 +330,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { )?; // Remove the documents ids from the faceted documents ids. 
- let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; - for (field_id, facet_type) in faceted_fields { - let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; + for field_id in self.index.faceted_fields_ids(self.wtxn)? { + // Remove docids from the number faceted documents ids + let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; docids.difference_with(&self.documents_ids); - self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; + self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; remove_docids_from_field_id_docid_facet_value( self.wtxn, @@ -344,6 +344,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { |(_fid, docid, _value)| docid, )?; + // Remove docids from the string faceted documents ids + let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; + docids.difference_with(&self.documents_ids); + self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; + remove_docids_from_field_id_docid_facet_value( self.wtxn, field_id_docid_facet_strings, diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index b9e4d7488..af72133a2 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -9,7 +9,6 @@ use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::Index; @@ -62,56 +61,51 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - for (field_id, facet_type) in faceted_fields { - let (content, documents_ids) = match facet_type { - FacetType::String => { - let documents_ids = compute_faceted_documents_ids( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; - (None, documents_ids) - }, - FacetType::Number => { - clear_field_number_levels( - self.wtxn, - self.index.facet_field_id_value_docids.remap_key_type::(), - field_id, - )?; + for field_id in faceted_fields { + // Compute and store the faceted strings documents ids. + let string_documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_id_string_docids.remap_key_type::(), + field_id, + )?; - let documents_ids = compute_faceted_documents_ids( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; + // Clear the facet number levels. + clear_field_number_levels( + self.wtxn, + self.index.facet_id_f64_docids, + field_id, + )?; - let content = compute_facet_number_levels( - self.wtxn, - self.index.facet_field_id_value_docids.remap_key_type::(), - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.level_group_size, - self.min_level_size, - field_id, - )?; + // Compute and store the faceted numbers documents ids. 
+            let number_documents_ids = compute_faceted_documents_ids(
+                self.wtxn,
+                self.index.facet_id_f64_docids.remap_key_type::<ByteSlice>(),
+                field_id,
+            )?;
 
-                    (Some(content), documents_ids)
-                },
-            };
+            let content = compute_facet_number_levels(
+                self.wtxn,
+                self.index.facet_id_f64_docids,
+                self.chunk_compression_type,
+                self.chunk_compression_level,
+                self.chunk_fusing_shrink_size,
+                self.level_group_size,
+                self.min_level_size,
+                field_id,
+            )?;
 
-            if let Some(content) = content {
-                write_into_lmdb_database(
-                    self.wtxn,
-                    *self.index.facet_field_id_value_docids.as_polymorph(),
-                    content,
-                    |_, _| anyhow::bail!("invalid facet level merging"),
-                    WriteMethod::GetMergePut,
-                )?;
-            }
+            self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?;
+            self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?;
 
-            self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?;
+            // Store the facet number levels docids into the database.
+            write_into_lmdb_database(
+                self.wtxn,
+                *self.index.facet_id_f64_docids.as_polymorph(),
+                content,
+                |_, _| anyhow::bail!("invalid facet number level merging"),
+                WriteMethod::GetMergePut,
+            )?;
         }
 
         Ok(())
@@ -205,10 +199,12 @@ fn compute_faceted_documents_ids(
 ) -> anyhow::Result<RoaringBitmap>
 {
     let mut documents_ids = RoaringBitmap::new();
+
     for result in db.prefix_iter(rtxn, &[field_id])? {
         let (_key, docids) = result?;
-        documents_ids.union_with(&docids);
+        documents_ids |= docids;
     }
+
     Ok(documents_ids)
 }
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 3acae7821..10c2e41e7 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -412,7 +412,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         Main,
         WordDocids,
         WordLevel0PositionDocids,
-        FacetLevel0ValuesDocids,
+        FacetLevel0NumbersDocids,
     }
 
     let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
@@ -478,8 +478,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let mut docid_word_positions_readers = Vec::with_capacity(readers.len());
         let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len());
         let mut word_level_position_docids_readers = Vec::with_capacity(readers.len());
-        let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len());
-        let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len());
+        let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len());
+        let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len());
+        let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len());
+        let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len());
         let mut documents_readers = Vec::with_capacity(readers.len());
         readers.into_iter().for_each(|readers| {
             let Readers {
                 main,
                 word_docids,
                 docid_word_positions,
                 words_pairs_proximities_docids,
                 word_level_position_docids,
-                facet_field_value_docids,
-                field_id_docid_facet_values,
-                documents
+                facet_field_numbers_docids,
+                facet_field_strings_docids,
+                field_id_docid_facet_numbers,
+                field_id_docid_facet_strings,
+                documents,
             } = readers;
             main_readers.push(main);
             word_docids_readers.push(word_docids);
             docid_word_positions_readers.push(docid_word_positions);
             words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids);
             word_level_position_docids_readers.push(word_level_position_docids);
facet_field_value_docids_readers.push(facet_field_value_docids); - field_id_docid_facet_values_readers.push(field_id_docid_facet_values); + facet_field_numbers_docids_readers.push(facet_field_numbers_docids); + facet_field_strings_docids_readers.push(facet_field_strings_docids); + field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers); + field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings); documents_readers.push(documents); }); @@ -523,8 +529,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { (DatabaseType::Main, main_readers, main_merge as MergeFn), (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), ( - DatabaseType::FacetLevel0ValuesDocids, - facet_field_value_docids_readers, + DatabaseType::FacetLevel0NumbersDocids, + facet_field_numbers_docids_readers, facet_field_value_docids_merge, ), ( @@ -547,7 +553,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - field_id_docid_facet_values_readers, + facet_field_numbers_docids_readers, + facet_field_strings_docids_readers, + field_id_docid_facet_numbers_readers, + field_id_docid_facet_strings_readers, )) as anyhow::Result<_> })?; @@ -556,7 +565,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - field_id_docid_facet_values_readers, + facet_field_numbers_docids_readers, + facet_field_strings_docids_readers, + field_id_docid_facet_numbers_readers, + field_id_docid_facet_strings_readers, ) = readers; let mut documents_ids = self.index.documents_ids(self.wtxn)?; @@ -624,11 +636,26 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); - debug!("Writing the field id docid facet values into LMDB on disk..."); + debug!("Writing the field id docid facet numbers into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, - *self.index.field_id_docid_facet_values.as_polymorph(), - field_id_docid_facet_values_readers, + *self.index.field_id_docid_facet_f64s.as_polymorph(), + field_id_docid_facet_numbers_readers, + field_id_docid_facet_values_merge, + write_method, + )?; + + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases, + }); + + debug!("Writing the field id docid facet strings into LMDB on disk..."); + merge_into_lmdb_database( + self.wtxn, + *self.index.field_id_docid_facet_strings.as_polymorph(), + field_id_docid_facet_strings_readers, field_id_docid_facet_values_merge, write_method, )?; @@ -678,9 +705,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, - DatabaseType::FacetLevel0ValuesDocids => { - debug!("Writing the facet level 0 values docids into LMDB on disk..."); - let db = *self.index.facet_field_id_value_docids.as_polymorph(); + DatabaseType::FacetLevel0NumbersDocids => { + debug!("Writing the facet numbers docids into LMDB on disk..."); + let db = *self.index.facet_id_f64_docids.as_polymorph(); write_into_lmdb_database( self.wtxn, db, diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 0f97476d9..ba8da6d16 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -6,25 +6,24 @@ use std::iter::FromIterator; use std::time::Instant; use std::{cmp, iter}; -use anyhow::{bail, Context}; +use anyhow::Context; use bstr::ByteSlice as _; use fst::Set; 
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; -use log::{debug, info, warn}; +use log::{debug, info}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; use tempfile::tempfile; -use crate::facet::{FacetType, FacetValue}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; +use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ @@ -45,8 +44,10 @@ pub struct Readers { pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, pub word_level_position_docids: Reader, - pub facet_field_value_docids: Reader, - pub field_id_docid_facet_values: Reader, + pub facet_field_numbers_docids: Reader, + pub facet_field_strings_docids: Reader, + pub field_id_docid_facet_numbers: Reader, + pub field_id_docid_facet_strings: Reader, pub documents: Reader, } @@ -55,13 +56,14 @@ pub struct Store<'s, A> { primary_key: String, fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, - faceted_fields: HashMap, + faceted_fields: HashSet, // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, - facet_field_value_docids: LinkedHashMap<(u8, FacetValue), RoaringBitmap>, + facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, + facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, facet_field_value_docids_limit: usize, // MTBL parameters chunk_compression_type: CompressionType, @@ -72,8 +74,10 @@ pub struct Store<'s, A> { word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, word_level_position_docids_sorter: Sorter, - facet_field_value_docids_sorter: Sorter, - field_id_docid_facet_values_sorter: Sorter, + facet_field_numbers_docids_sorter: Sorter, + facet_field_strings_docids_sorter: Sorter, + field_id_docid_facet_numbers_sorter: Sorter, + field_id_docid_facet_strings_sorter: Sorter, // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, @@ -86,7 +90,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { primary_key: String, fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, - faceted_fields: HashMap, + faceted_fields: HashSet, linked_hash_map_size: Option, max_nb_chunks: Option, max_memory: Option, @@ -132,7 +136,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); - let facet_field_value_docids_sorter = create_sorter( + let facet_field_numbers_docids_sorter = create_sorter( facet_field_value_docids_merge, chunk_compression_type, chunk_compression_level, @@ -140,7 +144,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); - let field_id_docid_facet_values_sorter = create_sorter( + let facet_field_strings_docids_sorter = create_sorter( + facet_field_value_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + 
max_nb_chunks,
+            max_memory,
+        );
+        let field_id_docid_facet_numbers_sorter = create_sorter(
+            field_id_docid_facet_values_merge,
+            chunk_compression_type,
+            chunk_compression_level,
+            chunk_fusing_shrink_size,
+            max_nb_chunks,
+            Some(1024 * 1024 * 1024), // 1 GiB
+        );
+        let field_id_docid_facet_strings_sorter = create_sorter(
             field_id_docid_facet_values_merge,
             chunk_compression_type,
             chunk_compression_level,
@@ -173,7 +193,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             word_docids_limit: linked_hash_map_size,
             words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
             words_pairs_proximities_docids_limit: linked_hash_map_size,
-            facet_field_value_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
+            facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
+            facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size),
             facet_field_value_docids_limit: linked_hash_map_size,
             // MTBL parameters
             chunk_compression_type,
@@ -184,8 +205,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             word_docids_sorter,
             words_pairs_proximities_docids_sorter,
             word_level_position_docids_sorter,
-            facet_field_value_docids_sorter,
-            field_id_docid_facet_values_sorter,
+            facet_field_numbers_docids_sorter,
+            facet_field_strings_docids_sorter,
+            field_id_docid_facet_numbers_sorter,
+            field_id_docid_facet_strings_sorter,
             // MTBL writers
             docid_word_positions_writer,
             documents_writer,
@@ -215,34 +238,68 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         Ok(())
     }
 
-    // Save the documents ids under the facet field id and value we have seen it.
-    fn insert_facet_values_docid(
+    fn insert_facet_number_values_docid(
         &mut self,
         field_id: FieldId,
-        field_value: FacetValue,
+        value: OrderedFloat<f64>,
         id: DocumentId,
     ) -> anyhow::Result<()>
     {
-        Self::write_field_id_docid_facet_value(&mut self.field_id_docid_facet_values_sorter, field_id, id, &field_value)?;
+        let sorter = &mut self.field_id_docid_facet_numbers_sorter;
+        Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
 
-        let key = (field_id, field_value);
+        let key = (field_id, value);
         // If get_refresh finds the element, it is assured to be at the end of the linked hash map.
-        match self.facet_field_value_docids.get_refresh(&key) {
+        match self.facet_field_number_docids.get_refresh(&key) {
             Some(old) => { old.insert(id); },
             None => {
                 // A newly inserted element is appended at the end of the linked hash map.
-                self.facet_field_value_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
+                self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
                 // If the facet number docids map just reached its capacity we must make sure to remove
                 // one element, this way the next time we insert we don't grow the capacity.
-                if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit {
+                if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit {
                     // Removing the front element is equivalent to removing the LRU element.
-                    Self::write_facet_field_value_docids(
-                        &mut self.facet_field_value_docids_sorter,
-                        self.facet_field_value_docids.pop_front(),
+                    Self::write_facet_field_number_docids(
+                        &mut self.facet_field_numbers_docids_sorter,
+                        self.facet_field_number_docids.pop_front(),
                     )?;
                 }
             }
         }
+
+        Ok(())
+    }
+
+    // Save the documents ids under the facet field id and value where we have seen them.
+    fn insert_facet_string_values_docid(
+        &mut self,
+        field_id: FieldId,
+        value: String,
+        id: DocumentId,
+    ) -> anyhow::Result<()>
+    {
+        let sorter = &mut self.field_id_docid_facet_strings_sorter;
+        Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
+
+        let key = (field_id, value);
+        // If get_refresh finds the element, it is assured to be at the end of the linked hash map.
+        match self.facet_field_string_docids.get_refresh(&key) {
+            Some(old) => { old.insert(id); },
+            None => {
+                // A newly inserted element is appended at the end of the linked hash map.
+                self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
+                // If the facet string docids map just reached its capacity we must make sure to remove
+                // one element, this way the next time we insert we don't grow the capacity.
+                if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit {
+                    // Removing the front element is equivalent to removing the LRU element.
+                    Self::write_facet_field_string_docids(
+                        &mut self.facet_field_strings_docids_sorter,
+                        self.facet_field_string_docids.pop_front(),
+                    )?;
+                }
+            }
+        }
+
         Ok(())
     }
 
@@ -287,7 +344,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         &mut self,
         document_id: DocumentId,
         words_positions: &mut HashMap<String, SmallVec32<Position>>,
-        facet_values: &mut HashMap<FieldId, SmallVec8<FacetValue>>,
+        facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
+        facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
         record: &[u8],
     ) -> anyhow::Result<()>
     {
@@ -306,10 +364,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
 
         words_positions.clear();
 
-        // We store document_id associated with all the field id and values.
-        for (field, values) in facet_values.drain() {
+        // We store document_id associated with all the facet number fields ids and values.
+        for (field, values) in facet_numbers_values.drain() {
             for value in values {
-                self.insert_facet_values_docid(field, value, document_id)?;
+                let value = OrderedFloat::from(value);
+                self.insert_facet_number_values_docid(field, value, document_id)?;
+            }
+        }
+
+        // We store document_id associated with all the facet string fields ids and values.
+ for (field, values) in facet_strings_values.drain() { + for value in values { + self.insert_facet_string_values_docid(field, value, document_id)?; } } @@ -409,20 +475,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_value_docids( + fn write_facet_field_string_docids( sorter: &mut Sorter, iter: I, ) -> anyhow::Result<()> - where I: IntoIterator + where I: IntoIterator { - use FacetValue::*; - for ((field_id, value), docids) in iter { - let result = match value { - String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), - Number(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), - }; - let key = result.context("could not serialize facet key")?; + let key = FacetValueStringCodec::bytes_encode(&(field_id, &value)) + .map(Cow::into_owned) + .context("could not serialize facet key")?; let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) .context("could not serialize docids")?; if lmdb_key_valid_size(&key) { @@ -433,21 +495,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_field_id_docid_facet_value( + fn write_facet_field_number_docids( + sorter: &mut Sorter, + iter: I, + ) -> anyhow::Result<()> + where I: IntoIterator), RoaringBitmap)> + { + for ((field_id, value), docids) in iter { + let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) + .map(Cow::into_owned) + .context("could not serialize facet key")?; + let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) + .context("could not serialize docids")?; + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &bytes)?; + } + } + + Ok(()) + } + + fn write_field_id_docid_facet_number_value( sorter: &mut Sorter, field_id: FieldId, document_id: DocumentId, - value: &FacetValue, + value: OrderedFloat, ) -> anyhow::Result<()> { - use FacetValue::*; + let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) + .map(Cow::into_owned) + .context("could not serialize facet key")?; - let result = match value { - String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned), - Number(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned), - }; + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &[])?; + } + + Ok(()) + } + + fn write_field_id_docid_facet_string_value( + sorter: &mut Sorter, + field_id: FieldId, + document_id: DocumentId, + value: &str, + ) -> anyhow::Result<()> + { + let key = FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, value)) + .map(Cow::into_owned) + .context("could not serialize facet key")?; - let key = result.context("could not serialize facet key")?; if lmdb_key_valid_size(&key) { sorter.insert(&key, &[])?; } @@ -493,7 +589,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut before = Instant::now(); let mut words_positions = HashMap::new(); - let mut facet_values = HashMap::new(); + let mut facet_numbers_values = HashMap::new(); + let mut facet_strings_values = HashMap::new(); let mut count: usize = 0; while let Some((key, value)) = documents.next()? 
{ @@ -513,32 +610,12 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } for (attr, content) in document.iter() { - if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) { + if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { let value = serde_json::from_slice(content)?; - if let Some(ftype) = self.faceted_fields.get(&attr) { - let mut values = match parse_facet_value(*ftype, &value) { - Ok(values) => values, - Err(e) => { - // We extract the name of the attribute and the document id - // to help users debug a facet type conversion. - let attr_name = self.fields_ids_map.name(attr).unwrap(); - let document_id: Value = self.fields_ids_map.id(&self.primary_key) - .and_then(|fid| document.get(fid)) - .map(serde_json::from_slice) - .unwrap()?; - - let context = format!( - "while extracting facet from the {:?} attribute in the {} document", - attr_name, document_id, - ); - warn!("{}", e.context(context)); - - SmallVec8::default() - }, - }; - facet_values.entry(attr).or_insert_with(SmallVec8::new).extend(values.drain(..)); - } + let (facet_numbers, facet_strings) = extract_facet_values(&value); + facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); + facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings); if self.searchable_fields.contains(&attr) { let content = match json_to_string(&value) { @@ -558,7 +635,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } // We write the document in the documents store. - self.write_document(document_id, &mut words_positions, &mut facet_values, value)?; + self.write_document( + document_id, + &mut words_positions, + &mut facet_numbers_values, + &mut facet_strings_values, + value, + )?; } // Compute the document id of the next document. 
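The indexing loop above no longer dispatches on a declared facet type: every faceted attribute is fed through the new extract_facet_values helper (defined at the end of this patch), which collects numbers and strings side by side. As a minimal sketch of the expected behaviour, assuming serde_json's json! macro is in scope and using an invented sample value:

    let value = serde_json::json!([42, "Blue", true, null]);
    let (numbers, strings) = extract_facet_values(&value);
    assert_eq!(numbers, vec![42.0]);
    assert_eq!(strings, vec!["blue".to_string(), "true".to_string()]);

Booleans are stringified into the strings bucket, strings are trimmed and lowercased, nulls and objects are skipped, and arrays are only flattened one level deep since the recursive call passes can_recurse = false.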
@@ -585,9 +668,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { &mut self.words_pairs_proximities_docids_sorter, self.words_pairs_proximities_docids, )?; - Self::write_facet_field_value_docids( - &mut self.facet_field_value_docids_sorter, - self.facet_field_value_docids, + Self::write_facet_field_number_docids( + &mut self.facet_field_numbers_docids_sorter, + self.facet_field_number_docids, + )?; + + Self::write_facet_field_string_docids( + &mut self.facet_field_strings_docids_sorter, + self.facet_field_string_docids, )?; let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; @@ -613,18 +701,26 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; - let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; + let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; - let mut field_id_docid_facet_values_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_values_sorter.write_into(&mut field_id_docid_facet_values_wtr)?; + let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; + + let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?; + + let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?; let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; - let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; - let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; + let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; + let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; + let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; + let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?; @@ -634,8 +730,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { docid_word_positions, words_pairs_proximities_docids, word_level_position_docids, - facet_field_value_docids, - field_id_docid_facet_values, + facet_field_numbers_docids, + facet_field_strings_docids, + field_id_docid_facet_numbers, + field_id_docid_facet_strings, documents, }) } @@ 
-710,71 +808,36 @@ fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator< .filter(|(_, t)| t.is_word()) } -fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result> { - use FacetValue::*; - - fn inner_parse_facet_value( - ftype: FacetType, +fn extract_facet_values(value: &Value) -> (Vec, Vec) { + fn inner_extract_facet_values( value: &Value, can_recurse: bool, - output: &mut SmallVec8, - ) -> anyhow::Result<()> - { + output_numbers: &mut Vec, + output_strings: &mut Vec, + ) { match value { - Value::Null => Ok(()), - Value::Bool(b) => match ftype { - FacetType::String => { - output.push(String(b.to_string())); - Ok(()) - }, - FacetType::Number => { - output.push(Number(OrderedFloat(if *b { 1.0 } else { 0.0 }))); - Ok(()) - }, - }, - Value::Number(number) => match ftype { - FacetType::String => { - output.push(String(number.to_string())); - Ok(()) - }, - FacetType::Number => match number.as_f64() { - Some(float) => { - output.push(Number(OrderedFloat(float))); - Ok(()) - }, - None => bail!("invalid facet type, expecting {} found number", ftype), - }, + Value::Null => (), + Value::Bool(b) => output_strings.push(b.to_string()), + Value::Number(number) => if let Some(float) = number.as_f64() { + output_numbers.push(float); }, Value::String(string) => { // TODO must be normalized and not only lowercased. let string = string.trim().to_lowercase(); - match ftype { - FacetType::String => { - output.push(String(string)); - Ok(()) - }, - FacetType::Number => match string.parse() { - Ok(float) => { - output.push(Number(OrderedFloat(float))); - Ok(()) - }, - Err(_err) => bail!("invalid facet type, expecting {} found string", ftype), - }, - } + output_strings.push(string); }, Value::Array(values) => if can_recurse { - values.iter().map(|v| inner_parse_facet_value(ftype, v, false, output)).collect() - } else { - bail!( - "invalid facet type, expecting {} found array (recursive arrays are not supported)", - ftype, - ); + for value in values { + inner_extract_facet_values(value, false, output_numbers, output_strings); + } }, - Value::Object(_) => bail!("invalid facet type, expecting {} found object", ftype), + Value::Object(_) => (), } } - let mut facet_values = SmallVec8::new(); - inner_parse_facet_value(ftype, value, true, &mut facet_values)?; - Ok(facet_values) + let mut facet_number_values = Vec::new(); + let mut facet_string_values = Vec::new(); + inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); + + (facet_number_values, facet_string_values) } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c4d4fcfce..79c447834 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeSet, HashMap}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::str::FromStr; use anyhow::Context; @@ -11,7 +11,6 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::{FieldsIdsMap, Index}; use crate::criterion::Criterion; -use crate::facet::FacetType; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; @@ -68,7 +67,7 @@ pub struct Settings<'a, 't, 'u, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, - faceted_fields: Setting>, + faceted_fields: Setting>, criteria: Setting>, stop_words: Setting>, distinct_attribute: Setting, @@ -123,7 +122,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.faceted_fields = Setting::Reset; } - pub 
fn set_faceted_fields(&mut self, names_facet_types: HashMap) { + pub fn set_faceted_fields(&mut self, names_facet_types: HashSet) { self.faceted_fields = Setting::Set(names_facet_types); } @@ -387,11 +386,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match self.faceted_fields { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let mut new_facets = HashMap::new(); - for (name, ty) in fields { + let mut new_facets = HashSet::new(); + for name in fields { fields_ids_map.insert(name).context("field id limit exceeded")?; - let ty = FacetType::from_str(&ty)?; - new_facets.insert(name.clone(), ty); + new_facets.insert(name.clone()); } self.index.put_faceted_fields(self.wtxn, &new_facets)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; From e62b89a2edaf252fb7713074b171b82fe851258f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 10:05:36 +0200 Subject: [PATCH 0686/1889] Make the facet distinct work with the new split facets --- milli/src/search/distinct/facet_distinct.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 44dd6bc66..7411e4af9 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -25,13 +25,12 @@ pub struct FacetDistinct<'a> { } impl<'a> FacetDistinct<'a> { - pub fn new( - distinct: FieldId, - index: &'a Index, - txn: &'a heed::RoTxn<'a>, - ) -> Self - { - Self { distinct, index, txn } + pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { + Self { + distinct, + index, + txn, + } } } @@ -100,10 +99,9 @@ impl<'a> FacetDistinctIter<'a> { let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); match candidates_iter.next() { Some(id) => { - match self.facet_type { - FacetType::String => self.distinct_string(id)?, - FacetType::Number => self.distinct_number(id)?, - }; + // We distinct the document id on its facet strings and facet numbers. + self.distinct_string(id)?; + self.distinct_number(id)?; // The first document of each iteration is kept, since the next call to // `difference_with` will filter out all the documents for that facet value. 
By From f7efde11d9bf60a012141efeae63ecb309fd6f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 11:45:45 +0200 Subject: [PATCH 0687/1889] Refine the facet condition to use both facet databases --- milli/src/search/facet/facet_condition.rs | 304 ++++++++-------------- milli/src/search/facet/mod.rs | 2 +- milli/src/search/mod.rs | 4 +- 3 files changed, 111 insertions(+), 199 deletions(-) diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index 525450ee1..a02a08571 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded}; use std::str::FromStr; @@ -21,76 +21,52 @@ use super::parser::Rule; use super::parser::{PREC_CLIMBER, FilterParser}; use self::FacetCondition::*; -use self::FacetNumberOperator::*; +use self::Operator::*; -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum FacetNumberOperator { +#[derive(Debug, Clone, PartialEq)] +pub enum Operator { GreaterThan(f64), GreaterThanOrEqual(f64), - Equal(f64), - NotEqual(f64), + Equal(Option, String), + NotEqual(Option, String), LowerThan(f64), LowerThanOrEqual(f64), Between(f64, f64), } -impl FacetNumberOperator { +impl Operator { /// This method can return two operations in case it must express /// an OR operation for the between case (i.e. `TO`). fn negate(self) -> (Self, Option) { match self { - GreaterThan(x) => (LowerThanOrEqual(x), None), - GreaterThanOrEqual(x) => (LowerThan(x), None), - Equal(x) => (NotEqual(x), None), - NotEqual(x) => (Equal(x), None), - LowerThan(x) => (GreaterThanOrEqual(x), None), - LowerThanOrEqual(x) => (GreaterThan(x), None), - Between(x, y) => (LowerThan(x), Some(GreaterThan(y))), - } - } -} - -#[derive(Debug, Clone, PartialEq)] -pub enum FacetStringOperator { - Equal(String), - NotEqual(String), -} - -impl FacetStringOperator { - fn equal(s: &str) -> Self { - FacetStringOperator::Equal(s.to_lowercase()) - } - - #[allow(dead_code)] - fn not_equal(s: &str) -> Self { - FacetStringOperator::equal(s).negate() - } - - fn negate(self) -> Self { - match self { - FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x), - FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x), + GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThanOrEqual(n) => (LowerThan(n), None), + Equal(n, s) => (NotEqual(n, s), None), + NotEqual(n, s) => (Equal(n, s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), } } } #[derive(Debug, Clone, PartialEq)] pub enum FacetCondition { - OperatorString(FieldId, FacetStringOperator), - OperatorNumber(FieldId, FacetNumberOperator), + Operator(FieldId, Operator), Or(Box, Box), And(Box, Box), } -fn get_field_id_facet_type<'a>( +fn field_id<'a>( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, items: &mut Pairs<'a, Rule>, -) -> Result<(FieldId, FacetType), PestError> +) -> Result> { // lexing ensures that we at least have a key let key = items.next().unwrap(); - let field_id = fields_ids_map + + fields_ids_map .id(key.as_str()) .ok_or_else(|| { PestError::new_from_span( @@ -103,32 +79,14 @@ fn get_field_id_facet_type<'a>( }, key.as_span(), ) - })?; - - let facet_type = faceted_fields - .get(&field_id) - .copied() - .ok_or_else(|| { - 
PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` is not faceted, available faceted attributes are: {}", - key.as_str(), - faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::>().join(", ") - ), - }, - key.as_span(), - ) - })?; - - Ok((field_id, facet_type)) + }) } -fn pest_parse(pair: Pair) -> Result> +fn pest_parse(pair: Pair) -> (Result>, String) where T: FromStr, T::Err: ToString, { - match pair.as_str().parse() { + let result = match pair.as_str().parse::() { Ok(value) => Ok(value), Err(e) => { Err(PestError::::new_from_span( @@ -136,7 +94,9 @@ where T: FromStr, pair.as_span(), )) } - } + }; + + (result, pair.as_str().to_string()) } impl FacetCondition { @@ -232,7 +192,7 @@ impl FacetCondition { fn from_pairs( fim: &FieldsIdsMap, - ff: &HashMap, + ff: &HashSet, expression: Pairs, ) -> anyhow::Result { @@ -263,10 +223,9 @@ impl FacetCondition { fn negate(self) -> FacetCondition { match self { - OperatorString(fid, op) => OperatorString(fid, op.negate()), - OperatorNumber(fid, op) => match op.negate() { - (op, None) => OperatorNumber(fid, op), - (a, Some(b)) => Or(Box::new(OperatorNumber(fid, a)), Box::new(OperatorNumber(fid, b))), + Operator(fid, op) => match op.negate() { + (op, None) => Operator(fid, op), + (a, Some(b)) => Or(Box::new(Operator(fid, a)), Box::new(Operator(fid, b))), }, Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), @@ -275,137 +234,100 @@ impl FacetCondition { fn between( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; - let lvalue = items.next().unwrap(); - let rvalue = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => { - let lvalue = pest_parse(lvalue)?; - let rvalue = pest_parse(rvalue)?; - Ok(OperatorNumber(fid, Between(lvalue, rvalue))) - }, - } + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + + let (lresult, _) = pest_parse(items.next().unwrap()); + let (rresult, _) = pest_parse(items.next().unwrap()); + + let lvalue = lresult?; + let rvalue = rresult?; + + Ok(Operator(fid, Between(lvalue, rvalue))) } fn equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => Ok(OperatorString(fid, FacetStringOperator::equal(value.as_str()))), - FacetType::Number => Ok(OperatorNumber(fid, Equal(pest_parse(value)?))), - } + let (result, svalue) = pest_parse(value); + + Ok(Operator(fid, Equal(Some(result?), svalue))) } fn greater_than( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut 
items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => Ok(OperatorNumber(fid, GreaterThan(pest_parse(value)?))), - } + let (result, _svalue) = pest_parse(value); + + Ok(Operator(fid, GreaterThan(result?))) } fn greater_than_or_equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => Ok(OperatorNumber(fid, GreaterThanOrEqual(pest_parse(value)?))), - } + let (result, _svalue) = pest_parse(value); + + Ok(Operator(fid, GreaterThanOrEqual(result?))) } fn lower_than( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => Ok(OperatorNumber(fid, LowerThan(pest_parse(value)?))), - } + let (result, _svalue) = pest_parse(value); + + Ok(Operator(fid, LowerThan(result?))) } fn lower_than_or_equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => Ok(OperatorNumber(fid, LowerThanOrEqual(pest_parse(value)?))), - } + let (result, _svalue) = pest_parse(value); + + Ok(Operator(fid, LowerThanOrEqual(result?))) } } @@ -485,34 +407,53 @@ impl FacetCondition { Ok(()) } - fn evaluate_number_operator<>( + fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, - db: heed::Database, + numbers_db: heed::Database, + strings_db: heed::Database, field_id: FieldId, - operator: FacetNumberOperator, + operator: &Operator, ) -> anyhow::Result { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. 
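        // For reference, keys in the numbers database are laid out as
        // (field_id, level, left_bound, right_bound), which is why the
        // traversal below can start at the highest level of this field and
        // narrow down without ever touching another field's entries.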
let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(f64::MAX)), - GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)), - Equal(val) => (Included(val), Included(val)), - NotEqual(val) => { - let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; - let docids = Self::evaluate_number_operator(rtxn, index, db, field_id, Equal(val))?; - return Ok(all_documents_ids - docids); + GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), + GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), + Equal(number, string) => { + let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); + let number_docids = match number { + Some(n) => { + let n = Included(*n); + let mut output = RoaringBitmap::new(); + Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?; + output + }, + None => RoaringBitmap::new(), + }; + return Ok(string_docids | number_docids); }, - LowerThan(val) => (Included(f64::MIN), Excluded(val)), - LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)), - Between(left, right) => (Included(left), Included(right)), + NotEqual(number, string) => { + let all_numbers_ids = if number.is_some() { + index.number_faceted_documents_ids(rtxn, field_id)? + } else { + RoaringBitmap::new() + }; + let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; + let operator = Equal(*number, string.clone()); + let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?; + return Ok((all_numbers_ids | all_strings_ids) - docids); + }, + LowerThan(val) => (Included(f64::MIN), Excluded(*val)), + LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), + Between(left, right) => (Included(*left), Included(*right)), }; // Ask for the biggest value that can exist for this specific field, if it exists // that's fine if it don't, the value just before will be returned instead. - let biggest_level = db + let biggest_level = numbers_db .remap_data_type::() .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))? .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); @@ -520,52 +461,25 @@ impl FacetCondition { match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, db, field_id, level, left, right, &mut output)?; + Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?; Ok(output) }, None => Ok(RoaringBitmap::new()), } } - fn evaluate_string_operator( - rtxn: &heed::RoTxn, - index: &Index, - db: heed::Database, - field_id: FieldId, - operator: &FacetStringOperator, - ) -> anyhow::Result - { - match operator { - FacetStringOperator::Equal(string) => { - match db.get(rtxn, &(field_id, string))? 
{ - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()) - } - }, - FacetStringOperator::NotEqual(string) => { - let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; - let op = FacetStringOperator::Equal(string.clone()); - let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?; - Ok(all_documents_ids - docids) - }, - } - } - pub fn evaluate( &self, rtxn: &heed::RoTxn, index: &Index, ) -> anyhow::Result { - let db = index.facet_field_id_value_docids; + let numbers_db = index.facet_id_f64_docids; + let strings_db = index.facet_id_string_docids; + match self { - OperatorString(fid, op) => { - let db = db.remap_key_type::(); - Self::evaluate_string_operator(rtxn, index, db, *fid, op) - }, - OperatorNumber(fid, op) => { - let db = db.remap_key_type::(); - Self::evaluate_number_operator(rtxn, index, db, *fid, *op) + Operator(fid, op) => { + Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op) }, Or(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 26bcf1b83..fff1d14a8 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -9,7 +9,7 @@ use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{Index, FieldId}; -pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator}; +pub use self::facet_condition::{FacetCondition, Operator}; pub use self::facet_distribution::FacetDistribution; mod facet_condition; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 640f081ba..f2211ad78 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -16,9 +16,7 @@ use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{Index, DocumentId}; -pub use self::facet::{ - FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, -}; +pub use self::facet::{FacetCondition, FacetDistribution, FacetIter, Operator}; pub use self::query_tree::MatchingWords; use self::query_tree::QueryTreeBuilder; From 79efded841368f23393b877543fc4b1d59efeafc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 12:51:33 +0200 Subject: [PATCH 0688/1889] Refine the FacetCondition from_array constructor --- milli/src/search/facet/facet_condition.rs | 68 ++++++----------------- 1 file changed, 16 insertions(+), 52 deletions(-) diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index a02a08571..899b99b71 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -1,9 +1,8 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded}; use std::str::FromStr; -use anyhow::Context; use either::Either; use heed::types::DecodeIgnore; use log::debug; @@ -12,7 +11,6 @@ use pest::iterators::{Pair, Pairs}; use pest::Parser; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec}; @@ -65,21 +63,19 @@ fn field_id<'a>( { // lexing ensures that we at least have a key let key = items.next().unwrap(); - - fields_ids_map - .id(key.as_str()) - .ok_or_else(|| { - PestError::new_from_span( - 
ErrorVariant::CustomError { - message: format!( - "attribute `{}` not found, available attributes are: {}", - key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") - ), - }, - key.as_span(), - ) - }) + match fields_ids_map.id(key.as_str()) { + Some(field_id) => Ok(field_id), + None => Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` not found, available attributes are: {}", + key.as_str(), + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") + ), + }, + key.as_span(), + )), + } } fn pest_parse(pair: Pair) -> (Result>, String) @@ -110,32 +106,6 @@ impl FacetCondition { A: AsRef, B: AsRef, { - fn facet_condition( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - key: &str, - value: &str, - ) -> anyhow::Result - { - let fid = fields_ids_map.id(key).with_context(|| { - format!("{:?} isn't present in the fields ids map", key) - })?; - let ftype = faceted_fields.get(key).copied().with_context(|| { - format!("{:?} isn't a faceted field", key) - })?; - let (neg, value) = match value.trim().strip_prefix('-') { - Some(value) => (true, value.trim()), - None => (false, value.trim()), - }; - - let operator = match ftype { - FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)), - FacetType::Number => OperatorNumber(fid, FacetNumberOperator::Equal(value.parse()?)), - }; - - if neg { Ok(operator.negate()) } else { Ok(operator) } - } - let fields_ids_map = index.fields_ids_map(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?; let mut ands = None; @@ -145,10 +115,7 @@ impl FacetCondition { Either::Left(array) => { let mut ors = None; for rule in array { - let mut iter = rule.as_ref().splitn(2, ':'); - let key = iter.next().context("missing facet condition key")?; - let value = iter.next().context("missing facet condition value")?; - let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?; + let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?; ors = match ors.take() { Some(ors) => Some(Or(Box::new(ors), Box::new(condition))), None => Some(condition), @@ -163,10 +130,7 @@ impl FacetCondition { } }, Either::Right(rule) => { - let mut iter = rule.as_ref().splitn(2, ':'); - let key = iter.next().context("missing facet condition key")?; - let value = iter.next().context("missing facet condition value")?; - let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?; + let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?; ands = match ands.take() { Some(ands) => Some(And(Box::new(ands), Box::new(condition))), None => Some(condition), From 02c655ff1a153eb548509284eec73e5e024a9d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 15:17:24 +0200 Subject: [PATCH 0689/1889] Refine the facet distribution to use both databases --- milli/src/search/criteria/asc_desc.rs | 2 - milli/src/search/facet/facet_condition.rs | 47 +++--- milli/src/search/facet/facet_distribution.rs | 156 +++++++++---------- milli/src/search/mod.rs | 15 +- 4 files changed, 106 insertions(+), 114 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 32857b8d7..f57d6d54f 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -50,7 +50,6 @@ impl<'t> AscDesc<'t> { Self::new(index, rtxn, parent, field_name, false) } - fn new( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -59,7 +58,6 @@ impl<'t> AscDesc<'t> { ascending: 
bool, ) -> anyhow::Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; let field_id = fields_ids_map .id(&field_name) .with_context(|| format!("field {:?} isn't registered", field_name))?; diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index 899b99b71..b189f5f0f 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -55,27 +55,45 @@ pub enum FacetCondition { And(Box, Box), } -fn field_id<'a>( +fn field_id( fields_ids_map: &FieldsIdsMap, faceted_fields: &HashSet, - items: &mut Pairs<'a, Rule>, + items: &mut Pairs, ) -> Result> { // lexing ensures that we at least have a key let key = items.next().unwrap(); - match fields_ids_map.id(key.as_str()) { - Some(field_id) => Ok(field_id), - None => Err(PestError::new_from_span( + + let field_id = match fields_ids_map.id(key.as_str()) { + Some(field_id) => field_id, + None => return Err(PestError::new_from_span( ErrorVariant::CustomError { message: format!( "attribute `{}` not found, available attributes are: {}", key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", "), ), }, key.as_span(), )), + }; + + if !faceted_fields.contains(&field_id) { + return Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` is not faceted, available faceted attributes are: {}", + key.as_str(), + faceted_fields.iter().flat_map(|id| { + fields_ids_map.name(*id) + }).collect::>().join(", "), + ), + }, + key.as_span(), + )); } + + Ok(field_id) } fn pest_parse(pair: Pair) -> (Result>, String) @@ -84,12 +102,10 @@ where T: FromStr, { let result = match pair.as_str().parse::() { Ok(value) => Ok(value), - Err(e) => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { message: e.to_string() }, - pair.as_span(), - )) - } + Err(e) => Err(PestError::::new_from_span( + ErrorVariant::CustomError { message: e.to_string() }, + pair.as_span(), + )), }; (result, pair.as_str().to_string()) @@ -106,8 +122,6 @@ impl FacetCondition { A: AsRef, B: AsRef, { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; let mut ands = None; for either in array { @@ -202,7 +216,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; @@ -236,7 +249,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; @@ -252,7 +264,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; @@ -268,7 +279,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; @@ -284,7 +294,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7fd2d385b..c6122cc77 100644 --- 
a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -3,12 +3,12 @@ use std::ops::Bound::Unbounded; use std::{cmp, fmt}; use anyhow::Context; -use heed::BytesDecode; +use heed::{Database, BytesDecode}; +use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; -use crate::facet::{FacetType, FacetValue}; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; +use crate::facet::FacetType; +use crate::heed_codec::facet::FacetValueStringCodec; use crate::search::facet::{FacetIter, FacetRange}; use crate::{Index, FieldId, DocumentId}; @@ -60,86 +60,81 @@ impl<'a> FacetDistribution<'a> { /// There is a small amount of candidates OR we ask for facet string values so we /// decide to iterate over the facet values of each one of them, one by one. - fn facet_values_from_documents( + fn facet_distribution_from_documents( &self, field_id: FieldId, facet_type: FacetType, candidates: &RoaringBitmap, - ) -> heed::Result> + distribution: &mut BTreeMap, + ) -> heed::Result<()> { fn fetch_facet_values<'t, KC, K: 't>( - index: &Index, rtxn: &'t heed::RoTxn, + db: Database, field_id: FieldId, candidates: &RoaringBitmap, - ) -> heed::Result> + distribution: &mut BTreeMap, + ) -> heed::Result<()> where + K: fmt::Display, KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, - K: Into, { - let mut facet_values = BTreeMap::new(); let mut key_buffer = vec![field_id]; for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = index.field_id_docid_facet_values + let iter = db + .remap_key_type::() .prefix_iter(rtxn, &key_buffer)? .remap_key_type::(); for result in iter { let ((_, _, value), ()) = result?; - *facet_values.entry(value.into()).or_insert(0) += 1; + *distribution.entry(value.to_string()).or_insert(0) += 1; } } - Ok(facet_values) + Ok(()) } - let index = self.index; - let rtxn = self.rtxn; match facet_type { - FacetType::String => { - fetch_facet_values::(index, rtxn, field_id, candidates) - }, FacetType::Number => { - fetch_facet_values::(index, rtxn, field_id, candidates) + let db = self.index.field_id_docid_facet_f64s; + fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) }, + FacetType::String => { + let db = self.index.field_id_docid_facet_strings; + fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + } } } /// There is too much documents, we use the facet levels to move throught /// the facet values, to find the candidates and values associated. 
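    /// (Level 0 of the numbers database stores each facet value on its own;
    /// every level above groups consecutive values into wider ranges, so the
    /// iterator can skip whole groups that do not intersect the candidates.)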
- fn facet_values_from_facet_levels( + fn facet_numbers_distribution_from_facet_levels( &self, field_id: FieldId, - facet_type: FacetType, candidates: &RoaringBitmap, - ) -> heed::Result> + distribution: &mut BTreeMap, + ) -> heed::Result<()> { - let iter = match facet_type { - FacetType::String => unreachable!(), - FacetType::Number => { - let iter = FacetIter::new_non_reducing( - self.rtxn, self.index, field_id, candidates.clone(), - )?; - iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))) - }, - }; + let iter = FacetIter::new_non_reducing( + self.rtxn, self.index, field_id, candidates.clone(), + )?; - let mut facet_values = BTreeMap::new(); for result in iter { let (value, mut docids) = result?; docids.intersect_with(candidates); if !docids.is_empty() { - facet_values.insert(value, docids.len()); + distribution.insert(value.to_string(), docids.len()); } - if facet_values.len() == self.max_values_by_facet { + if distribution.len() == self.max_values_by_facet { break; } } - Ok(facet_values) + Ok(()) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -147,80 +142,73 @@ impl<'a> FacetDistribution<'a> { fn facet_values_from_raw_facet_database( &self, field_id: FieldId, - facet_type: FacetType, - ) -> heed::Result> + ) -> heed::Result> { - let db = self.index.facet_field_id_value_docids; - let level = 0; - let iter = match facet_type { - FacetType::String => { - let iter = db - .prefix_iter(self.rtxn, &[field_id])? - .remap_key_type::() - .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); - Box::new(iter) as Box::> - }, - FacetType::Number => { - let db = db.remap_key_type::(); - let range = FacetRange::new( - self.rtxn, db, field_id, level, Unbounded, Unbounded, - )?; - Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) - }, - }; + let mut distribution = BTreeMap::new(); - let mut facet_values = BTreeMap::new(); - for result in iter { - let (value, docids) = result?; - facet_values.insert(value, docids.len()); - if facet_values.len() == self.max_values_by_facet { + let db = self.index.facet_id_f64_docids; + let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + + for result in range { + let ((_, _, value, _), docids) = result?; + distribution.insert(value.to_string(), docids.len()); + if distribution.len() == self.max_values_by_facet { break; } } - Ok(facet_values) + let iter = self.index + .facet_id_string_docids + .remap_key_type::() + .prefix_iter(self.rtxn, &[field_id])? + .remap_key_type::(); + + for result in iter { + let ((_, value), docids) = result?; + distribution.insert(value.to_string(), docids.len()); + if distribution.len() == self.max_values_by_facet { + break; + } + } + + Ok(distribution) } - fn facet_values( - &self, - field_id: FieldId, - facet_type: FacetType, - ) -> heed::Result> - { + fn facet_values(&self, field_id: FieldId) -> heed::Result> { + use FacetType::{Number, String}; + if let Some(candidates) = self.candidates.as_ref() { // Classic search, candidates were specified, we must return facet values only related // to those candidates. We also enter here for facet strings for performance reasons. 
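            // Below CANDIDATES_THRESHOLD we pay one database lookup per
            // candidate document, which stays cheap for small sets; above it,
            // numbers switch to the level-based traversal while strings always
            // take the per-document path.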
- if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String { - self.facet_values_from_documents(field_id, facet_type, candidates) + let mut distribution = BTreeMap::new(); + if candidates.len() <= CANDIDATES_THRESHOLD { + self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?; + self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; } else { - self.facet_values_from_facet_levels(field_id, facet_type, candidates) + self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?; + self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; } + + Ok(distribution) } else { - self.facet_values_from_raw_facet_database(field_id, facet_type) + self.facet_values_from_raw_facet_database(field_id) } } - pub fn execute(&self) -> anyhow::Result>> { + pub fn execute(&self) -> anyhow::Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let faceted_fields = self.index.faceted_fields(self.rtxn)?; - let fields_ids: Vec<_> = match &self.facets { - Some(names) => names - .iter() - .filter_map(|n| faceted_fields.get(n).map(|t| (n.to_string(), *t))) - .collect(), - None => faceted_fields.into_iter().collect(), - }; - let mut facets_values = BTreeMap::new(); - for (name, ftype) in fields_ids { + let mut distribution = BTreeMap::new(); + for name in faceted_fields { let fid = fields_ids_map.id(&name).with_context(|| { format!("missing field name {:?} from the fields id map", name) })?; - let values = self.facet_values(fid, ftype)?; - facets_values.insert(name, values); + let values = self.facet_values(fid)?; + distribution.insert(name, values); } - Ok(facets_values) + Ok(distribution) } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f2211ad78..623581706 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -141,15 +141,12 @@ impl<'a> Search<'a> { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; let id = field_ids_map.id(name).expect("distinct not present in field map"); let faceted_fields = self.index.faceted_fields(self.rtxn)?; - match faceted_fields.get(name) { - Some(facet_type) => { - let distinct = FacetDistinct::new(id, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) - } - None => { - let distinct = MapDistinct::new(id, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) - } + if faceted_fields.contains(name) { + let distinct = FacetDistinct::new(id, self.index, self.rtxn); + self.perform_sort(distinct, matching_words, criteria) + } else { + let distinct = MapDistinct::new(id, self.index, self.rtxn); + self.perform_sort(distinct, matching_words, criteria) } } } From 3a4a150ef04b0b866b6061036c2cd0cb13b0fcdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 15:58:47 +0200 Subject: [PATCH 0690/1889] Fix the tests and remaining warnings --- milli/src/search/criteria/asc_desc.rs | 19 +------ milli/src/search/distinct/facet_distinct.rs | 16 +++--- milli/src/search/distinct/map_distinct.rs | 4 +- milli/src/search/distinct/mod.rs | 4 +- milli/src/search/facet/facet_condition.rs | 56 ++++++++++----------- milli/src/update/clear_documents.rs | 6 ++- milli/src/update/index_documents/mod.rs | 21 ++++++-- milli/src/update/index_documents/store.rs | 8 +-- milli/src/update/settings.rs | 43 ++++++++++------ 9 files changed, 88 insertions(+), 89 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs 
b/milli/src/search/criteria/asc_desc.rs index f57d6d54f..c80bb38f1 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::mem::take; use anyhow::Context; @@ -7,11 +6,10 @@ use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; -use crate::{FieldsIdsMap, FieldId, Index}; +use crate::{FieldId, Index}; use super::{Criterion, CriterionParameters, CriterionResult}; /// Threshold on the number of candidates that will make @@ -119,7 +117,6 @@ impl<'t> Criterion for AscDesc<'t> { self.index, self.rtxn, self.field_id, - self.facet_type, self.ascending, candidates, )?; @@ -141,20 +138,6 @@ impl<'t> Criterion for AscDesc<'t> { } } -fn field_id_facet_type( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - field: &str, -) -> anyhow::Result<(FieldId, FacetType)> { - let id = fields_ids_map - .id(field) - .with_context(|| format!("field {:?} isn't registered", field))?; - let facet_type = faceted_fields - .get(field) - .with_context(|| format!("field {:?} isn't faceted", field))?; - Ok((id, *facet_type)) -} - /// Returns an iterator over groups of the given candidates in ascending or descending order. /// /// It will either use an iterative or a recursive method on the whole facet database depending diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 7411e4af9..9485087d3 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -189,23 +189,21 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { #[cfg(test)] mod test { - use std::collections::HashMap; + use std::collections::HashSet; use super::super::test::{generate_index, validate_distinct_candidates}; use super::*; - use crate::facet::FacetType; macro_rules! 
test_facet_distinct { - ($name:ident, $distinct:literal, $facet_type:expr) => { + ($name:ident, $distinct:literal) => { #[test] fn $name() { use std::iter::FromIterator; - let facets = - HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let facets = HashSet::from_iter(Some(($distinct.to_string()))); let (index, fid, candidates) = generate_index($distinct, facets); let txn = index.read_txn().unwrap(); - let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); + let mut map_distinct = FacetDistinct::new(fid, &index, &txn); let excluded = RoaringBitmap::new(); let mut iter = map_distinct.distinct(candidates.clone(), excluded); let count = validate_distinct_candidates(iter.by_ref(), fid, &index); @@ -215,7 +213,7 @@ mod test { }; } - test_facet_distinct!(test_string, "txt", FacetType::String); - test_facet_distinct!(test_strings, "txts", FacetType::String); - test_facet_distinct!(test_number, "cat-int", FacetType::Number); + test_facet_distinct!(test_string, "txt"); + test_facet_distinct!(test_strings, "txts"); + test_facet_distinct!(test_number, "cat-int"); } diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index 4c01d1ded..465af2c3b 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -110,7 +110,7 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { #[cfg(test)] mod test { - use std::collections::HashMap; + use std::collections::HashSet; use super::*; use super::super::test::{generate_index, validate_distinct_candidates}; @@ -119,7 +119,7 @@ mod test { ($name:ident, $distinct:literal) => { #[test] fn $name() { - let (index, fid, candidates) = generate_index($distinct, HashMap::new()); + let (index, fid, candidates) = generate_index($distinct, HashSet::new()); let txn = index.read_txn().unwrap(); let mut map_distinct = MapDistinct::new(fid, &index, &txn); let excluded = RoaringBitmap::new(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 776f0d2b3..0dd628d5b 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -28,7 +28,7 @@ pub trait Distinct<'a> { #[cfg(test)] mod test { - use std::collections::{HashMap, HashSet}; + use std::collections::HashSet; use once_cell::sync::Lazy; use rand::{seq::SliceRandom, Rng}; @@ -74,7 +74,7 @@ mod test { /// Returns a temporary index populated with random test documents, the FieldId for the /// distinct attribute, and the RoaringBitmap with the document ids. - pub(crate) fn generate_index(distinct: &str, facets: HashMap) -> (TempIndex, FieldId, RoaringBitmap) { + pub(crate) fn generate_index(distinct: &str, facets: HashSet) -> (TempIndex, FieldId, RoaringBitmap) { let index = TempIndex::new(); let mut txn = index.write_txn().unwrap(); diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index b189f5f0f..e7917df97 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -240,7 +240,10 @@ impl FacetCondition { let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); - Ok(Operator(fid, Equal(Some(result?), svalue))) + // TODO we must normalize instead of lowercase. 
+ let svalue = svalue.to_lowercase(); + + Ok(Operator(fid, Equal(result.ok(), svalue))) } fn greater_than( @@ -473,7 +476,8 @@ mod tests { use super::*; use crate::update::Settings; use heed::EnvOpenOptions; - use maplit::hashmap; + use maplit::hashset; + use big_s::S; #[test] fn string() { @@ -485,22 +489,22 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); + builder.set_faceted_fields(hashset!{ S("channel") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::equal("Ponce")); + let condition = FacetCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); + let expected = Operator(0, Operator::Equal(None, S("ponce"))); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::not_equal("ponce")); + let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::not_equal("ponce")); + let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); } @@ -514,20 +518,20 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "timestamp".into() => "number".into() }); + builder.set_faceted_fields(hashset!{ "timestamp".into() }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = OperatorNumber(0, Between(22.0, 44.0)); + let expected = Operator(0, Between(22.0, 44.0)); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); let expected = Or( - Box::new(OperatorNumber(0, LowerThan(22.0))), - Box::new(OperatorNumber(0, GreaterThan(44.0))), + Box::new(Operator(0, LowerThan(22.0))), + Box::new(Operator(0, GreaterThan(44.0))), ); assert_eq!(condition, expected); } @@ -542,11 +546,8 @@ mod tests { // Set the faceted fields to be the channel. 
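        // (`S` is the `big_s` shorthand for `String::from`, which keeps the
        // expected values below readable.)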
let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order - builder.set_faceted_fields(hashmap!{ - "channel".into() => "string".into(), - "timestamp".into() => "number".into(), - }); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -557,10 +558,10 @@ mod tests { "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", ).unwrap(); let expected = Or( - Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(And( - Box::new(OperatorNumber(1, Between(22.0, 44.0))), - Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))), + Box::new(Operator(1, Between(22.0, 44.0))), + Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), )) ); assert_eq!(condition, expected); @@ -570,13 +571,13 @@ mod tests { "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", ).unwrap(); let expected = Or( - Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(Or( Box::new(Or( - Box::new(OperatorNumber(1, LowerThan(22.0))), - Box::new(OperatorNumber(1, GreaterThan(44.0))), + Box::new(Operator(1, LowerThan(22.0))), + Box::new(Operator(1, GreaterThan(44.0))), )), - Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))), + Box::new(Operator(0, Operator::Equal(None, S("ponce")))), )), ); assert_eq!(condition, expected); @@ -592,11 +593,8 @@ mod tests { // Set the faceted fields to be the channel. 
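        // Both `channel` and `timestamp` are faceted here because the
        // expressions below mix string and number operators.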
let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order - builder.set_faceted_fields(hashmap!{ - "channel".into() => "string".into(), - "timestamp".into() => "number".into(), - }); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -604,7 +602,7 @@ mod tests { let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_array( &rtxn, &index, - vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])], + vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])], ).unwrap().unwrap(); let expected = FacetCondition::from_str( &rtxn, &index, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index c163046ec..e4c1d35f8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -118,8 +118,10 @@ mod tests { assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); - assert!(index.facet_field_id_value_docids.is_empty(&rtxn).unwrap()); - assert!(index.field_id_docid_facet_values.is_empty(&rtxn).unwrap()); + assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); + assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap()); assert!(index.documents.is_empty(&rtxn).unwrap()); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 10c2e41e7..064f4e6fd 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -450,8 +450,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .enumerate() .map(|(i, documents)| { let store = Store::new( - primary_key.clone(), - fields_ids_map.clone(), searchable_fields.clone(), faceted_fields.clone(), linked_hash_map_size, @@ -553,7 +551,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - facet_field_numbers_docids_readers, facet_field_strings_docids_readers, field_id_docid_facet_numbers_readers, field_id_docid_facet_strings_readers, @@ -565,7 +562,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - facet_field_numbers_docids_readers, facet_field_strings_docids_readers, field_id_docid_facet_numbers_readers, field_id_docid_facet_strings_readers, @@ -599,7 +595,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 8; + let total_databases = 10; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -636,6 +632,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); + debug!("Writing the facet id string docids into LMDB on disk..."); + merge_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + 
facet_field_strings_docids_readers, + facet_field_value_docids_merge, + write_method, + )?; + + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases, + }); + debug!("Writing the field id docid facet numbers into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index ba8da6d16..afc199293 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -23,7 +23,7 @@ use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; +use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ @@ -53,8 +53,6 @@ pub struct Readers { pub struct Store<'s, A> { // Indexing parameters - primary_key: String, - fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, faceted_fields: HashSet, // Caches @@ -87,8 +85,6 @@ pub struct Store<'s, A> { impl<'s, A: AsRef<[u8]>> Store<'s, A> { pub fn new( - primary_key: String, - fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, faceted_fields: HashSet, linked_hash_map_size: Option, @@ -184,8 +180,6 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(Store { // Indexing parameters. - primary_key, - fields_ids_map, searchable_fields, faceted_fields, // Caches diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 79c447834..1571f627d 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,5 +1,4 @@ use std::collections::{BTreeSet, HashMap, HashSet}; -use std::str::FromStr; use anyhow::Context; use chrono::Utc; @@ -443,9 +442,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { #[cfg(test)] mod tests { use heed::EnvOpenOptions; - use maplit::{btreeset, hashmap}; + use heed::types::ByteSlice; + use maplit::{btreeset, hashmap, hashset}; + use big_s::S; - use crate::facet::FacetType; use crate::update::{IndexDocuments, UpdateFormat}; use super::*; @@ -620,37 +620,53 @@ mod tests { // Set the faceted fields to be the age. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "age".into() => "number".into() }); + builder.set_faceted_fields(hashset!{ S("age") }); builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = &br#"[ + { "name": "kevin", "age": 23 }, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set. 
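        // (The counts below scan the level 0 entries of field id 0 in the
        // numbers database: one entry per distinct `age` value.)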
let rtxn = index.read_txn().unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashmap!{ "age".to_string() => FacetType::Number }); + assert_eq!(fields_ids, hashset!{ S("age") }); // Only count the field_id 0 and level 0 facet values. - let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + // TODO we must support typed CSVs for numbers to be understood. + let count = index.facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 3); drop(rtxn); // Index a little more documents with new and current facets values. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..]; + let content = &br#"[ + { "name": "kevin2", "age": 23 }, + { "name": "kevina2", "age": 21 }, + { "name": "benoit", "age": 35 } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); - builder.update_format(UpdateFormat::Csv); + builder.enable_autogenerate_docids(); + builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); // Only count the field_id 0 and level 0 facet values. - let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + // TODO we must support typed CSVs for numbers to be understood. + let count = index.facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 4); } @@ -817,10 +833,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); - builder.set_faceted_fields(hashmap!{ - "age".into() => "number".into(), - "toto".into() => "number".into(), - }); + builder.set_faceted_fields(hashset!{ S("age"), S("toto") }); builder.set_criteria(vec!["asc(toto)".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); From 28bd9e183e789f03388ce48d44d0e9df0699f330 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 4 May 2021 12:05:43 +0200 Subject: [PATCH 0691/1889] Fix the infos crate to support split facet databases --- infos/src/main.rs | 219 +++++++++++++++++++++++++--------------------- 1 file changed, 121 insertions(+), 98 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 902394af8..ee2060d38 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -5,6 +5,7 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; +use milli::facet::FacetType; use milli::{Index, TreeLevel}; use structopt::StructOpt; @@ -22,8 +23,11 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; -const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; -const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; +const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids"; +const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids"; +const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s"; +const FIELD_ID_DOCID_FACET_STRINGS_DB_NAME: &str = "field-id-docid-facet-strings"; + const DOCUMENTS_DB_NAME: &str = 
"documents"; const ALL_DATABASE_NAMES: &[&str] = &[ @@ -35,8 +39,10 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_LEVEL_POSITION_DOCIDS_DB_NAME, WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, - FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, - FIELD_ID_DOCID_FACET_VALUES_DB_NAME, + FACET_ID_F64_DOCIDS_DB_NAME, + FACET_ID_STRING_DOCIDS_DB_NAME, + FIELD_ID_DOCID_FACET_F64S_DB_NAME, + FIELD_ID_DOCID_FACET_STRINGS_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -108,8 +114,18 @@ enum Command { prefixes: Vec, }, - /// Outputs a CSV with the documents ids along with the facet values where it appears. - FacetValuesDocids { + /// Outputs a CSV with the documents ids along with the facet numbers where it appears. + FacetNumbersDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// The field name in the document. + field_name: String, + }, + + /// Outputs a CSV with the documents ids along with the facet strings where it appears. + FacetStringsDocids { /// Display the whole documents ids in details. #[structopt(long)] full_display: bool, @@ -149,8 +165,8 @@ enum Command { internal_documents_ids: Vec, }, - /// Outputs some facets statistics for the given facet name. - FacetStats { + /// Outputs some facets numbers statistics for the given facet name. + FacetNumberStats { /// The field name in the document. field_name: String, }, @@ -243,8 +259,11 @@ fn main() -> anyhow::Result<()> { WordsPrefixesDocids { full_display, prefixes } => { words_prefixes_docids(&index, &rtxn, !full_display, prefixes) }, - FacetValuesDocids { full_display, field_name } => { - facet_values_docids(&index, &rtxn, !full_display, field_name) + FacetNumbersDocids { full_display, field_name } => { + facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name) + }, + FacetStringsDocids { full_display, field_name } => { + facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) }, WordsLevelPositionsDocids { full_display, words } => { words_level_positions_docids(&index, &rtxn, !full_display, words) @@ -255,7 +274,7 @@ fn main() -> anyhow::Result<()> { DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, - FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), + FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { average_number_of_positions_by_word(&index, &rtxn) @@ -297,36 +316,22 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: } /// Helper function that converts the facet value key to a unique type -/// that can be used to log or display purposes. -fn facet_values_iter<'txn, DC: 'txn, T>( +/// that can be used for log or display purposes. +fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( rtxn: &'txn heed::RoTxn, - db: heed::Database, + db: heed::Database, field_id: u8, - facet_type: milli::facet::FacetType, - string_fn: impl Fn(&str) -> T + 'txn, - float_fn: impl Fn(u8, f64, f64) -> T + 'txn, -) -> heed::Result> + 'txn>> +) -> heed::Result> + 'txn>> where + KC: heed::BytesDecode<'txn>, DC: heed::BytesDecode<'txn>, { - use milli::facet::FacetType; - use milli::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; + let iter = db + .remap_key_type::() + .prefix_iter(&rtxn, &[field_id])? 
+ .remap_key_type::(); - let iter = db.prefix_iter(&rtxn, &[field_id])?; - match facet_type { - FacetType::String => { - let iter = iter.remap_key_type::() - .map(move |r| r.map(|((_, key), value)| (string_fn(key), value))); - Ok(Box::new(iter) as Box>) - }, - FacetType::Number => { - let iter = iter.remap_key_type::() - .map(move |r| r.map(|((_, level, left, right), value)| { - (float_fn(level, left, right), value) - })); - Ok(Box::new(iter)) - }, - } + Ok(Box::new(iter)) } fn facet_number_value_to_string(level: u8, left: T, right: T) -> (u8, String) { @@ -352,9 +357,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values: _, - documents + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s: _, + field_id_docid_facet_strings: _, + documents, } = index; let main_name = "main"; @@ -365,7 +372,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_level_position_docids_name = "word_level_position_docids"; let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; - let facet_field_id_value_docids_name = "facet_field_id_value_docids"; + let facet_id_f64_docids_name = "facet_id_f64_docids"; + let facet_id_string_docids_name = "facet_id_string_docids"; let documents_name = "documents"; let mut heap = BinaryHeap::with_capacity(limit + 1); @@ -437,27 +445,27 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; - for (field_id, field_type) in faceted_fields { - let facet_name = fields_ids_map.name(field_id).unwrap(); - let db = facet_field_id_value_docids.remap_data_type::(); - let iter = facet_values_iter( - rtxn, - db, - field_id, - field_type, - |key| key.to_owned(), - |level, left, right| { - let mut output = facet_number_value_to_string(level, left, right).1; - let _ = write!(&mut output, " (level {})", level); - output - }, - )?; + for facet_id in faceted_fields { + let facet_name = fields_ids_map.name(facet_id).unwrap(); - for result in iter { - let (fvalue, value) = result?; + // List the facet numbers of this facet id. + let db = facet_id_f64_docids.remap_data_type::(); + for result in facet_values_iter(rtxn, db, facet_id)? { + let ((_fid, level, left, right), value) = result?; + let mut output = facet_number_value_to_string(level, left, right).1; + write!(&mut output, " (level {})", level)?; + let key = format!("{} {}", facet_name, output); + heap.push(Reverse((value.len(), key, facet_id_f64_docids_name))); + if heap.len() > limit { heap.pop(); } + } + + // List the facet strings of this facet id. + let db = facet_id_string_docids.remap_data_type::(); + for result in facet_values_iter(rtxn, db, facet_id)? { + let ((_fid, fvalue), value) = result?; let key = format!("{} {}", facet_name, fvalue); - heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name))); + heap.push(Reverse((value.len(), key, facet_id_string_docids_name))); if heap.len() > limit { heap.pop(); } } } @@ -536,38 +544,55 @@ fn words_prefixes_docids( Ok(wtr.flush()?) 
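// NOTE: the loops above keep only the `limit` biggest entries by pushing
// `Reverse((value.len(), key, db_name))` into a `BinaryHeap` and popping
// whenever the heap grows past `limit`; wrapping in `Reverse` turns the
// max-heap into a min-heap, so the smallest entry is always the one evicted.
// A minimal sketch of the same pattern, with hypothetical values:
//
//     use std::cmp::Reverse;
//     use std::collections::BinaryHeap;
//
//     let limit = 3;
//     let mut heap = BinaryHeap::with_capacity(limit + 1);
//     for (len, key) in [(10, "a"), (50, "b"), (20, "c"), (40, "d")] {
//         heap.push(Reverse((len, key)));
//         if heap.len() > limit { heap.pop(); } // evicts (10, "a") at the end
//     }
//     // the heap now holds the three biggest entries: b, d and c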
} -fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { +fn facet_values_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + facet_type: FacetType, + field_name: String, +) -> anyhow::Result<()> +{ let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; let field_id = fields_ids_map.id(&field_name) .with_context(|| format!("field {} not found", field_name))?; - let field_type = faceted_fields.get(&field_id) - .with_context(|| format!("field {} is not faceted", field_name))?; + + if !faceted_fields.contains(&field_id) { + anyhow::bail!("field {} is not faceted", field_name); + } let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["facet_value", "facet_level", "documents_count", "documents_ids"])?; - let db = index.facet_field_id_value_docids; - let iter = facet_values_iter( - rtxn, - db, - field_id, - *field_type, - |key| (0, key.to_owned()), - facet_number_value_to_string, - )?; - - for result in iter { - let ((level, value), docids) = result?; - let count = docids.len(); - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; + match facet_type { + FacetType::Number => { + wtr.write_record(&["facet_number", "facet_level", "documents_count", "documents_ids"])?; + for result in facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)? { + let ((_fid, level, left, right), docids) = result?; + let value = facet_number_value_to_string(level, left, right).1; + let count = docids.len(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; + } + }, + FacetType::String => { + wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; + for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { + let ((_fid, value), docids) = result?; + let count = docids.len(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[value.to_string(), count.to_string(), docids])?; + } + } } Ok(wtr.flush()?) @@ -684,31 +709,24 @@ fn docids_words_positions( Ok(wtr.flush()?) 
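// NOTE: with the facet storage split by type, the rewritten
// `facet_values_docids` above no longer dispatches through closures: the
// generic `facet_values_iter` is instantiated once per database and the key
// codec decides the shape of the decoded key. The two call shapes, as used
// in this patch:
//
//     // numbers: the key decodes to (field_id, level, left, right)
//     for result in facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)? {
//         let ((_fid, level, left, right), docids) = result?;
//         /* ... */
//     }
//
//     // strings: the key decodes to (field_id, value)
//     for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? {
//         let ((_fid, value), docids) = result?;
//         /* ... */
//     }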
} -fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { +fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; let field_id = fields_ids_map.id(&field_name) .with_context(|| format!("field {} not found", field_name))?; - let field_type = faceted_fields.get(&field_id) - .with_context(|| format!("field {} is not faceted", field_name))?; - let db = index.facet_field_id_value_docids; - let iter = facet_values_iter( - rtxn, - db, - field_id, - *field_type, - |_key| 0u8, - |level, _left, _right| level, - )?; + if !faceted_fields.contains(&field_id) { + anyhow::bail!("field {} is not faceted", field_name); + } + let iter = facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)?; println!("The database {:?} facet stats", field_name); let mut level_size = 0; let mut current_level = None; for result in iter { - let (level, _) = result?; + let ((_fid, level, _left, _right), _) = result?; if let Some(current) = current_level { if current != level { println!("\tnumber of groups at level {}: {}", current, level_size); @@ -843,7 +861,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a use heed::types::ByteSlice; let Index { - env: _, + env: _env, main, word_docids, word_prefix_docids, @@ -852,8 +870,10 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values, + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, documents, } = index; @@ -873,8 +893,11 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(), + FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(), + FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(), + FIELD_ID_DOCID_FACET_STRINGS_DB_NAME => field_id_docid_facet_strings.as_polymorph(), + DOCUMENTS_DB_NAME => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; From 5012cc3a32ffd7345ec929b74484b9216cc20db2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 4 May 2021 12:09:43 +0200 Subject: [PATCH 0692/1889] Fix the http-ui crate to support split facet databases --- http-ui/src/main.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 00618f58a..7d51098c3 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -30,7 +30,6 @@ use warp::{Filter, http::Response}; use warp::filters::ws::Message; use milli::{FacetCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore}; -use milli::facet::FacetValue; use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; use milli::update::UpdateIndexingStep::*; @@ -252,7 +251,7 @@ 
struct Settings { searchable_attributes: Setting>, #[serde(default, skip_serializing_if = "Setting::is_not_set")] - faceted_attributes: Setting>, + faceted_attributes: Setting>, #[serde(default, skip_serializing_if = "Setting::is_not_set")] criteria: Setting>, @@ -671,7 +670,7 @@ async fn main() -> anyhow::Result<()> { struct Answer { documents: Vec>, number_of_candidates: u64, - facets: BTreeMap>, + facets: BTreeMap>, } let disable_highlighting = opt.disable_highlighting; @@ -985,7 +984,7 @@ async fn main() -> anyhow::Result<()> { #[cfg(test)] mod tests { - use maplit::{btreeset,hashmap}; + use maplit::{btreeset,hashmap, hashset}; use serde_test::{assert_tokens, Token}; use milli::update::Setting; @@ -997,10 +996,10 @@ mod tests { let settings = Settings { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), - faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }), + faceted_attributes: Setting::Set(hashset!{ "age".to_string() }), criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), - synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }) + synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] }) }; assert_tokens(&settings, &[ From a5e98cf46d0db546efd833b9886e23c1ae162ce7 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 12 May 2021 14:15:38 +0200 Subject: [PATCH 0693/1889] Fix plane sweep algorithm --- milli/src/search/criteria/proximity.rs | 33 ++++++++++++++++---------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index bf9be9b9f..d190ef031 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -324,18 +324,30 @@ fn resolve_plane_sweep_candidates( // take the inner proximity of the first group as initial let (_, (_, mut proximity, _)) = groups.first()?; let (_, (left_most_pos, _, _)) = groups.first()?; - let (_, (_, _, right_most_pos)) = groups.last()?; + let (_, (_, _, right_most_pos)) = groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; for pair in groups.windows(2) { - if let [(i1, (_, _, rpos1)), (i2, (lpos2, prox2, _))] = pair { - // if a pair overlap, meaning that they share at least a word, we return None - if rpos1 >= lpos2 { return None } + if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair { + // if two positions are equal, meaning that they share at least a word, we return None + if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 { + return None + } + + let pair_proximity = { + // if intervals are disjoint [..].(..) + if lpos2 > rpos1 { lpos2 - rpos1 } + // if the second interval is a subset of the first [.(..).] + else if rpos2 < rpos1 { (lpos2 - lpos1).min(rpos1 - rpos2) } + // if intervals overlaps [.(..].) + else { (lpos2 - lpos1).min(rpos2 - rpos1) } + }; + // if groups are in the good order (query order) we remove 1 to the proximity // the proximity is clamped to 7 let pair_proximity = if i1 < i2 { - (*lpos2 - *rpos1 - 1).min(7) + (pair_proximity - 1).min(7) } else { - (*lpos2 - *rpos1).min(7) + pair_proximity.min(7) }; proximity += pair_proximity as u8 + prox2; @@ -385,26 +397,21 @@ fn resolve_plane_sweep_candidates( // let q be the position q of second group of the interval. 
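// NOTE: a worked example for the pair proximity arithmetic patched above,
// with hypothetical word positions. Disjoint intervals [2, 3] and [5, 8]:
// lpos2 > rpos1, so the proximity is lpos2 - rpos1 = 2. Nested intervals
// [2, 6] and [4, 5]: rpos2 < rpos1, so it is
// min(lpos2 - lpos1, rpos1 - rpos2) = min(2, 1) = 1. Equal endpoints mean
// two groups share a word and still yield None, but plain overlaps, which
// the old `if rpos1 >= lpos2 { return None }` rejected wholesale, now get a
// proper proximity.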
let q = current[1]; - let mut leftmost_index = 0; - // If p > r, then the interval [l, r] is minimal and // we insert it into the heap according to its size. if p.map_or(true, |p| p.1 > rightmost.1) { - leftmost_index = current[0].0; if let Some(group) = compute_groups_proximity(¤t, consecutive) { output.push(group); } } - // TODO not sure about breaking here or when the p list is found empty. let p = match p { Some(p) => p, None => break, }; - // Remove the leftmost group P in the interval, - // and pop the same group from a list. - current[leftmost_index] = p; + // Replace the leftmost group P in the interval. + current[0] = p; if p.1 > rightmost.1 { // if [l, r] is minimal, let r = p and l = q. From 1c0a5cd136146e0b290445ed16130e48014402d8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 31 May 2021 15:22:50 +0200 Subject: [PATCH 0694/1889] Resolve code modification suggestions --- milli/src/search/facet/facet_condition.rs | 2 -- milli/src/update/facets.rs | 1 - milli/src/update/index_documents/store.rs | 1 - 3 files changed, 4 deletions(-) diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index e7917df97..fd7053269 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -240,9 +240,7 @@ impl FacetCondition { let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); - // TODO we must normalize instead of lowercase. let svalue = svalue.to_lowercase(); - Ok(Operator(fid, Equal(result.ok(), svalue))) } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index af72133a2..f0eab6023 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -98,7 +98,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?; self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?; - // Store the write_into_lmdb_database( self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index afc199293..4f65d77e1 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -816,7 +816,6 @@ fn extract_facet_values(value: &Value) -> (Vec, Vec) { output_numbers.push(float); }, Value::String(string) => { - // TODO must be normalized and not only lowercased. 
let string = string.trim().to_lowercase(); output_strings.push(string); }, From 4ddf008be28f9507a821fa7468ef022722a4264a Mon Sep 17 00:00:00 2001 From: many Date: Thu, 27 May 2021 15:27:41 +0200 Subject: [PATCH 0695/1889] add field id word count database --- .../heed_codec/field_id_word_count_codec.rs | 22 ++++++++++++ milli/src/heed_codec/mod.rs | 2 ++ milli/src/index.rs | 9 +++-- milli/src/lib.rs | 2 +- milli/src/update/clear_documents.rs | 3 ++ milli/src/update/delete_documents.rs | 15 ++++++++ .../update/index_documents/merge_function.rs | 4 +++ milli/src/update/index_documents/mod.rs | 23 ++++++++++++- milli/src/update/index_documents/store.rs | 34 ++++++++++++++++++- 9 files changed, 109 insertions(+), 5 deletions(-) create mode 100644 milli/src/heed_codec/field_id_word_count_codec.rs diff --git a/milli/src/heed_codec/field_id_word_count_codec.rs b/milli/src/heed_codec/field_id_word_count_codec.rs new file mode 100644 index 000000000..5796e5020 --- /dev/null +++ b/milli/src/heed_codec/field_id_word_count_codec.rs @@ -0,0 +1,22 @@ +use std::{borrow::Cow, convert::TryInto}; + +use crate::FieldId; + +pub struct FieldIdWordCountCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec { + type DItem = (FieldId, u8); + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?; + Some((field_id, word_count)) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec { + type EItem = (FieldId, u8); + + fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> { + Some(Cow::Owned(vec![*field_id, *word_count])) + } +} diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index cc73cdc65..65a06573e 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -4,6 +4,7 @@ mod roaring_bitmap; mod roaring_bitmap_length; mod str_level_position_codec; mod str_str_u8_codec; +mod field_id_word_count_codec; pub mod facet; pub use self::beu32_str_codec::BEU32StrCodec; @@ -12,3 +13,4 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; pub use self::str_level_position_codec::StrLevelPositionCodec; pub use self::str_str_u8_codec::StrStrU8Codec; +pub use self::field_id_word_count_codec::FieldIdWordCountCodec; diff --git a/milli/src/index.rs b/milli/src/index.rs index 14b153a2e..bd057a02a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -13,6 +13,7 @@ use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, + FieldIdWordCountCodec, }; use crate::heed_codec::facet::{ FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, @@ -60,9 +61,11 @@ pub struct Index { pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>, - + /// Maps the word, level and position range with the docids that corresponds to it. pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, + /// Maps the field id and the word count with the docids that corresponds to it. + pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>, /// Maps the level positions of a word prefix with all the docids where this prefix appears.
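// NOTE: a round-trip sketch for the `FieldIdWordCountCodec` introduced
// above: the key is exactly two bytes, [field_id, word_count], and decoding
// fails on any other length because the `[u8; 2]` conversion does. Assuming
// heed's `BytesEncode`/`BytesDecode` traits are in scope:
//
//     use heed::{BytesDecode, BytesEncode};
//
//     let bytes = FieldIdWordCountCodec::bytes_encode(&(3, 7)).unwrap();
//     assert_eq!(&bytes[..], &[3, 7][..]);
//     assert_eq!(FieldIdWordCountCodec::bytes_decode(&bytes), Some((3, 7)));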
pub word_prefix_level_position_docids: Database, @@ -82,7 +85,7 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(13); + options.max_dbs(14); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -92,6 +95,7 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; + let field_id_word_count_docids = env.create_database(Some("field-id-word-count-docids"))?; let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?; let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?; @@ -111,6 +115,7 @@ impl Index { word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, + field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 03169bce7..e4b58765e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -23,7 +23,7 @@ use serde_json::{Map, Value}; pub use self::criterion::{Criterion, default_criteria}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; -pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec}; +pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index e4c1d35f8..f4c13e8f8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_pair_proximity_docids, word_prefix_pair_proximity_docids, word_level_position_docids, + field_id_word_count_docids, word_prefix_level_position_docids, facet_id_f64_docids, facet_id_string_docids, @@ -62,6 +63,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_level_position_docids.clear(self.wtxn)?; + field_id_word_count_docids.clear(self.wtxn)?; word_prefix_level_position_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; @@ -117,6 +119,7 @@ mod tests { assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); + assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e93ff9a0a..bf49603ce 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -86,6 
+86,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + field_id_word_count_docids, word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, @@ -316,6 +317,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // Remove the documents ids from field id word count database. + let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; + while let Some((key, mut docids)) = iter.next().transpose()? { + let previous_len = docids.len(); + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(&key, &docids)?; + } + } + + drop(iter); + // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_field_id_value_docids( self.wtxn, diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index a6d008513..230116e99 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -60,6 +60,10 @@ pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> an cbo_roaring_bitmap_merge(values) } +pub fn field_id_word_count_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) +} + pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 064f4e6fd..71f281e98 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -29,6 +29,7 @@ pub use self::merge_function::{ docid_word_positions_merge, documents_merge, word_level_position_docids_merge, word_prefix_level_positions_docids_merge, facet_field_value_docids_merge, field_id_docid_facet_values_merge, + field_id_word_count_docids_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -412,6 +413,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { Main, WordDocids, WordLevel0PositionDocids, + FieldIdWordCountDocids, FacetLevel0NumbersDocids, } @@ -476,6 +478,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); + let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len()); let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len()); @@ -488,6 +491,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions, words_pairs_proximities_docids, word_level_position_docids, + field_id_word_count_docids, facet_field_numbers_docids, facet_field_strings_docids, field_id_docid_facet_numbers, @@ -499,6 +503,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers.push(docid_word_positions); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); word_level_position_docids_readers.push(word_level_position_docids); + field_id_word_count_docids_readers.push(field_id_word_count_docids); 
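// NOTE: like the other `*_docids_merge` functions added above, the new
// `field_id_word_count_docids_merge` delegates to `cbo_roaring_bitmap_merge`:
// values that collide on one key are decoded and unioned, so each indexing
// chunk can emit docids independently. A sketch of that union semantics (the
// CBO codec is only a compact serialization around `RoaringBitmap`):
//
//     use roaring::RoaringBitmap;
//
//     let a: RoaringBitmap = (1..4).collect(); // docids from one chunk
//     let b: RoaringBitmap = (3..6).collect(); // docids from another chunk
//     assert_eq!((a | b).len(), 5);            // merged key maps to {1..5}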
facet_field_numbers_docids_readers.push(facet_field_numbers_docids); facet_field_strings_docids_readers.push(facet_field_strings_docids); field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers); @@ -536,6 +541,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { word_level_position_docids_readers, word_level_position_docids_merge, ), + ( + DatabaseType::FieldIdWordCountDocids, + field_id_word_count_docids_readers, + field_id_word_count_docids_merge, + ), ] .into_par_iter() .for_each(|(dbtype, readers, merge)| { @@ -595,7 +605,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 10; + let total_databases = 11; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -727,6 +737,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, + DatabaseType::FieldIdWordCountDocids => { + debug!("Writing the field id word count docids into LMDB on disk..."); + let db = *self.index.field_id_word_count_docids.as_polymorph(); + write_into_lmdb_database( + self.wtxn, + db, + content, + field_id_word_count_docids_merge, + write_method, + )?; + }, DatabaseType::WordLevel0PositionDocids => { debug!("Writing the word level 0 positions docids into LMDB on disk..."); let db = *self.index.word_level_position_docids.as_polymorph(); diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 4f65d77e1..78ff6cbb0 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -29,7 +29,7 @@ use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, word_level_position_docids_merge, facet_field_value_docids_merge, - field_id_docid_facet_values_merge, + field_id_docid_facet_values_merge, field_id_word_count_docids_merge, }; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -44,6 +44,7 @@ pub struct Readers { pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, pub word_level_position_docids: Reader, + pub field_id_word_count_docids: Reader, pub facet_field_numbers_docids: Reader, pub facet_field_strings_docids: Reader, pub field_id_docid_facet_numbers: Reader, @@ -58,6 +59,7 @@ pub struct Store<'s, A> { // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, + field_id_word_count_docids: HashMap<(u8, u8), RoaringBitmap>, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, @@ -72,6 +74,7 @@ pub struct Store<'s, A> { word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, word_level_position_docids_sorter: Sorter, + field_id_word_count_docids_sorter: Sorter, facet_field_numbers_docids_sorter: Sorter, facet_field_strings_docids_sorter: Sorter, field_id_docid_facet_numbers_sorter: Sorter, @@ -132,6 +135,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); + let field_id_word_count_docids_sorter = create_sorter( + field_id_word_count_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + max_memory, + ); let facet_field_numbers_docids_sorter = create_sorter( facet_field_value_docids_merge, chunk_compression_type, @@ -184,6 +195,7 
@@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { faceted_fields, // Caches word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), + field_id_word_count_docids: HashMap::new(), word_docids_limit: linked_hash_map_size, words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), words_pairs_proximities_docids_limit: linked_hash_map_size, @@ -199,6 +211,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { word_docids_sorter, words_pairs_proximities_docids_sorter, word_level_position_docids_sorter, + field_id_word_count_docids_sorter, facet_field_numbers_docids_sorter, facet_field_strings_docids_sorter, field_id_docid_facet_numbers_sorter, @@ -620,10 +633,17 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let analyzed = self.analyzer.analyze(&content); let tokens = process_tokens(analyzed.tokens()); + let mut last_pos = None; for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { + last_pos = Some(pos); let position = (attr as usize * MAX_POSITION + pos) as u32; words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position); } + + if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { + let key = (attr, last_pos as u8 + 1); + self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id); + } } } } @@ -683,6 +703,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { word_docids_wtr.insert(word, val)?; } + let mut docids_buffer = Vec::new(); + for ((fid, count), docids) in self.field_id_word_count_docids { + docids_buffer.clear(); + CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer)?; + self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?; + } + let fst = builder.into_set(); self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?; @@ -695,6 +722,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; + let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; + let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; @@ -711,6 +741,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; + let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; @@ -724,6 +755,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { docid_word_positions, words_pairs_proximities_docids, word_level_position_docids, + field_id_word_count_docids, facet_field_numbers_docids, facet_field_strings_docids, field_id_docid_facet_numbers, From c701f8bf36fbba25413f00ab1e8ebf7abeefb0a4 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 27 May 2021 
15:32:14 +0200 Subject: [PATCH 0696/1889] Use field id word count database in exactness criterion --- milli/src/search/criteria/exactness.rs | 4 ++-- milli/src/search/criteria/mod.rs | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index b1026ccc2..7f27287b7 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -162,11 +162,11 @@ fn resolve_state( use State::*; match state { ExactAttribute(mut allowed_candidates) => { - let query_len = query.len() as u32; + let query_len = query.len() as u8; let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { - if let Some(attribute_allowed_docids) = ctx.field_id_len_docids(id, query_len)? { + if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; attribute_candidates_array.push(attribute_allowed_docids); candidates |= intersection_of(attribute_candidates_array.iter().collect()); diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 99e4a4209..456d16e1a 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -78,7 +78,7 @@ pub trait Context<'c> { fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; fn synonyms(&self, word: &str) -> heed::Result>>>; fn searchable_fields_ids(&self) -> heed::Result>; - fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result>; + fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result>; fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error>; } pub struct CriteriaBuilder<'t> { @@ -181,8 +181,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { } } - fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result> { - Ok(None) + fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result> { + let key = (field_id, word_count); + self.index.field_id_word_count_docids.get(self.rtxn, &key) } fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { @@ -488,7 +489,7 @@ pub mod test { todo!() } - fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result> { + fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result> { todo!() } } From b8e6db0feb9ac9d9860629996863b14b5c9b1583 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 31 May 2021 16:14:51 +0200 Subject: [PATCH 0697/1889] Add database in infos crate --- infos/src/main.rs | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/infos/src/main.rs b/infos/src/main.rs index ee2060d38..a00c882b7 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -23,6 +23,7 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; +const FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME: &str = "field-id-word-count-docids"; const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids"; const 
FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids"; const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s"; @@ -39,6 +40,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_LEVEL_POSITION_DOCIDS_DB_NAME, WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, + FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME, FACET_ID_F64_DOCIDS_DB_NAME, FACET_ID_STRING_DOCIDS_DB_NAME, FIELD_ID_DOCID_FACET_F64S_DB_NAME, @@ -155,6 +157,15 @@ enum Command { prefixes: Vec, }, + FieldIdWordCountDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// The field name in the document. + field_name: String, + }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -271,6 +282,9 @@ fn main() -> anyhow::Result<()> { WordPrefixesLevelPositionsDocids { full_display, prefixes } => { word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) }, + FieldIdWordCountDocids { full_display, field_name } => { + field_id_word_count_docids(&index, &rtxn, !full_display, field_name) + }, DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -357,6 +371,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, + field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s: _, @@ -372,6 +387,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_level_position_docids_name = "word_level_position_docids"; let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; + let field_id_word_count_docids_name = "field_id_word_count_docids"; let facet_id_f64_docids_name = "facet_id_f64_docids"; let facet_id_string_docids_name = "facet_id_string_docids"; let documents_name = "documents"; @@ -443,6 +459,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in field_id_word_count_docids.remap_data_type::().iter(rtxn)? { + let ((field_id, word_count), docids) = result?; + let key = format!("{} {}", field_id, word_count); + heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; @@ -676,6 +699,39 @@ fn word_prefixes_level_positions_docids( Ok(wtr.flush()?) } +fn field_id_word_count_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + field_name: String +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["field_name", "word_count", "docids"])?; + + let field_id = index.fields_ids_map(rtxn)? 
+ .id(&field_name) + .with_context(|| format!("unknown field name: {}", &field_name))?; + + let left = (field_id, 1); + let right = (field_id, 11); + let iter = index.field_id_word_count_docids + .range(rtxn, &(left..=right))?; + + for result in iter { + let ((_, word_count), docids) = result?; + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[&field_name, &format!("{}", word_count), &docids])?; + } + + Ok(wtr.flush()?) +} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -870,6 +926,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, + field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s, @@ -893,6 +950,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), + FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => field_id_word_count_docids.as_polymorph(), FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(), FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(), FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(), @@ -999,6 +1057,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu let db = index.word_prefix_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, + FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => { + let db = index.field_id_word_count_docids.as_polymorph(); + compute_stats::(*db, rtxn, name) + }, unknown => anyhow::bail!("unknown database {:?}", unknown), } } From 1df68d342a69d33e9b14f73787877eb014c2d502 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 31 May 2021 18:22:29 +0200 Subject: [PATCH 0698/1889] Make the MatchingWords return the number of matching bytes --- milli/src/search/matching_words.rs | 204 +++++++++++++++++++++++++++++ milli/src/search/mod.rs | 3 +- milli/src/search/query_tree.rs | 75 ----------- 3 files changed, 206 insertions(+), 76 deletions(-) create mode 100644 milli/src/search/matching_words.rs diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs new file mode 100644 index 000000000..37a4f49c0 --- /dev/null +++ b/milli/src/search/matching_words.rs @@ -0,0 +1,204 @@ +use std::collections::HashSet; +use std::cmp::{min, Reverse}; +use std::collections::BTreeMap; +use std::ops::{Index, IndexMut}; + +use levenshtein_automata::{DFA, Distance}; + +use crate::search::query_tree::{Operation, Query}; + +use super::build_dfa; + +type IsPrefix = bool; + +/// The query tree builder is the interface to build a query tree. +#[derive(Default)] +pub struct MatchingWords { + dfas: Vec<(DFA, String, u8, IsPrefix)>, +} + +impl MatchingWords { + /// Lists all words which can be considered as a match for the query tree. + pub fn from_query_tree(tree: &Operation) -> Self { + let mut dfas: Vec<_> = fetch_queries(tree) + .into_iter() + .map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p)) + .collect(); + dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len())); + Self { dfas } + } + + /// Returns the number of matching bytes if the word matches. 
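// NOTE: this is the behavioral change of the patch: the old `matches` only
// answered yes/no, while `matching_bytes` reports how many bytes of the
// candidate word are covered by a query word, so a highlighter can emphasize
// just the matched prefix. The tests below pin this down, e.g.
// `matching_bytes("splitted")` returns `Some(5)` because only `split` out of
// `splitted` matches the query word `split` used as a prefix.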
+ pub fn matching_bytes(&self, word: &str) -> Option<usize> { + self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { + Distance::Exact(t) if t <= *typo => { + if *is_prefix { + let (_dist, len) = prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); + Some(len) + } else { + Some(word.len()) + } + }, + _otherwise => None, + }) + } +} + +/// Lists all words which can be considered as a match for the query tree. +fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { + fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { + match tree { + Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { + ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); + }, + Operation::Query(Query { prefix, kind }) => { + let typo = if kind.is_exact() { 0 } else { kind.typo() }; + out.insert((kind.word(), typo, *prefix)); + }, + } + } + + let mut queries = HashSet::new(); + resolve_ops(tree, &mut queries); + queries +} + +// A simple wrapper around vec so we can get contiguous but index it like it's 2D array. +struct N2Array<T> { + y_size: usize, + buf: Vec<T>, +} + +impl<T: Clone> N2Array<T> { + fn new(x: usize, y: usize, value: T) -> N2Array<T> { + N2Array { + y_size: y, + buf: vec![value; x * y], + } + } +} + +impl<T> Index<(usize, usize)> for N2Array<T> { + type Output = T; + + #[inline] + fn index(&self, (x, y): (usize, usize)) -> &T { + &self.buf[(x * self.y_size) + y] + } +} + +impl<T> IndexMut<(usize, usize)> for N2Array<T> { + #[inline] + fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T { + &mut self.buf[(x * self.y_size) + y] + } +} + +fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) { + let (n, m) = (source.len(), target.len()); + + if n == 0 { + return (m as u32, 0); + } + if m == 0 { + return (n as u32, 0); + } + + if n == m && source == target { + return (0, m); + } + + let inf = n + m; + let mut matrix = N2Array::new(n + 2, m + 2, 0); + + matrix[(0, 0)] = inf; + for i in 0..n + 1 { + matrix[(i + 1, 0)] = inf; + matrix[(i + 1, 1)] = i; + } + for j in 0..m + 1 { + matrix[(0, j + 1)] = inf; + matrix[(1, j + 1)] = j; + } + + let mut last_row = BTreeMap::new(); + + for (row, char_s) in source.iter().enumerate() { + let mut last_match_col = 0; + let row = row + 1; + + for (col, char_t) in target.iter().enumerate() { + let col = col + 1; + let last_match_row = *last_row.get(&char_t).unwrap_or(&0); + let cost = if char_s == char_t { 0 } else { 1 }; + + let dist_add = matrix[(row, col + 1)] + 1; + let dist_del = matrix[(row + 1, col)] + 1; + let dist_sub = matrix[(row, col)] + cost; + let dist_trans = matrix[(last_match_row, last_match_col)] + + (row - last_match_row - 1) + + 1 + + (col - last_match_col - 1); + + let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans)); + + matrix[(row + 1, col + 1)] = dist; + + if cost == 0 { + last_match_col = col; + } + } + + last_row.insert(char_s, row); + } + + let mut minimum = (u32::max_value(), 0); + + for x in 0..=m { + let dist = matrix[(n + 1, x + 1)] as u32; + if dist < minimum.0 { + minimum = (dist, x) + } + } + + minimum +} + +#[cfg(test)] +mod tests { + use super::*; + + use crate::MatchingWords; + use crate::search::query_tree::{Operation, Query, QueryKind}; + + #[test] + fn matched_length() { + let query = "Levenste"; + let text = "Levenshtein"; + + let (dist, length) = prefix_damerau_levenshtein(query.as_bytes(), text.as_bytes()); + assert_eq!(dist, 1); + assert_eq!(&text[..length], "Levenshte"); + } + + #[test] + fn
matching_words() { + let query_tree = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: true, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) }), + ]), + ]); + + let matching_words = MatchingWords::from_query_tree(&query_tree); + + assert_eq!(matching_words.matching_bytes("word"), Some(4)); + assert_eq!(matching_words.matching_bytes("nyc"), None); + assert_eq!(matching_words.matching_bytes("world"), Some(5)); + assert_eq!(matching_words.matching_bytes("splitted"), Some(5)); + assert_eq!(matching_words.matching_bytes("thisnew"), None); + assert_eq!(matching_words.matching_bytes("borld"), Some(5)); + assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4)); + } +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 623581706..fc64d020f 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{Index, DocumentId}; pub use self::facet::{FacetCondition, FacetDistribution, FacetIter, Operator}; -pub use self::query_tree::MatchingWords; +pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; // Building these factories is not free. @@ -29,6 +29,7 @@ mod criteria; mod distinct; mod facet; mod query_tree; +mod matching_words; pub struct Search<'a> { query: Option, diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4876e37c8..3125664ab 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -294,48 +294,6 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result, -} - -impl MatchingWords { - /// List all words which can be considered as a match for the query tree. - pub fn from_query_tree(tree: &Operation) -> Self { - Self { - dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect() - } - } - - /// Return true if the word match. - pub fn matches(&self, word: &str) -> bool { - self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) { - Distance::Exact(t) => t <= *typo, - Distance::AtLeast(_) => false, - }) - } -} - -/// Lists all words which can be considered as a match for the query tree. -fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { - fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { - match tree { - Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { - ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); - }, - Operation::Query(Query { prefix, kind }) => { - let typo = if kind.is_exact() { 0 } else { kind.typo() }; - out.insert((kind.word(), typo, *prefix)); - }, - } - } - - let mut queries = HashSet::new(); - resolve_ops(tree, &mut queries); - queries -} - /// Main function that creates the final query tree from the primitive query. 
fn create_query_tree( ctx: &impl Context, @@ -951,39 +909,6 @@ mod test { assert_eq!(expected, query_tree); } - #[test] - fn fetching_words() { - let query = "wordsplit nyc world"; - let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); - - let context = TestContext::default(); - let (query_tree, _) = context.build(false, true, None, tokens).unwrap().unwrap(); - - let expected = hashset!{ - ("word", 0, false), - ("nyc", 0, false), - ("wordsplit", 2, false), - ("wordsplitnycworld", 2, true), - ("nature", 0, false), - ("new", 0, false), - ("city", 0, false), - ("world", 1, true), - ("york", 0, false), - ("split", 0, false), - ("nycworld", 1, true), - ("earth", 0, false), - ("wordsplitnyc", 2, false), - }; - - let mut keys = context.postings.keys().collect::<Vec<_>>(); - keys.sort_unstable(); - - let words = fetch_queries(&query_tree); - assert_eq!(expected, words); - } - #[test] fn words_limit() { let query = "\"hey my\" good friend"; From 1373637da18b3bc58eaddae2ad5614cac94137c1 Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Tue, 27 Apr 2021 14:22:33 +0200 Subject: [PATCH 0699/1889] optimize roaring codec --- .../roaring_bitmap/bo_roaring_bitmap_codec.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs index 7ceb69f9a..25c8afe2f 100644 --- a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs @@ -1,4 +1,6 @@ use std::borrow::Cow; +use std::mem::size_of; + use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use roaring::RoaringBitmap; @@ -7,9 +9,14 @@ pub struct BoRoaringBitmapCodec; impl heed::BytesDecode<'_> for BoRoaringBitmapCodec { type DItem = RoaringBitmap; - fn bytes_decode(mut bytes: &[u8]) -> Option<Self::DItem> { + fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { let mut bitmap = RoaringBitmap::new(); - while let Ok(integer) = bytes.read_u32::<NativeEndian>() { + let num_u32 = bytes.len() / size_of::<u32>(); + for i in 0..num_u32 { + let start = i * size_of::<u32>(); + let end = (i + 1) * size_of::<u32>(); + let mut bytes = bytes.get(start..end)?; + let integer = bytes.read_u32::<NativeEndian>().ok()?; bitmap.insert(integer); } Some(bitmap) From 984dc7c1ed157a3691b77a0982d06390c13ea2ce Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Tue, 27 Apr 2021 18:42:08 +0200 Subject: [PATCH 0700/1889] rewrite roaring codec without byteorder.
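[Note] The rewrite below drops the byteorder crate by combining
slice::chunks with u32::from_ne_bytes / to_ne_bytes, and it replaces
bitmap.insert with bitmap.push, which is valid here because the encoder
serializes bitmap.iter() in strictly increasing order. A standalone sketch
of the same decoding technique (not the codec itself):

    use std::convert::TryInto;
    use std::mem::size_of;

    fn decode_ne_u32s(bytes: &[u8]) -> Option<Vec<u32>> {
        bytes
            .chunks(size_of::<u32>())
            .map(|chunk| chunk.try_into().ok().map(u32::from_ne_bytes))
            .collect()
    }

    // decode_ne_u32s(&[1, 0, 0, 0, 2, 0, 0, 0]) == Some(vec![1, 2]) on a
    // little-endian machine; a trailing partial chunk yields None.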
--- .../roaring_bitmap/bo_roaring_bitmap_codec.rs | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs index 25c8afe2f..8d1eb79dd 100644 --- a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; +use std::convert::TryInto; use std::mem::size_of; -use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use roaring::RoaringBitmap; pub struct BoRoaringBitmapCodec; @@ -11,14 +11,12 @@ impl heed::BytesDecode<'_> for BoRoaringBitmapCodec { fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { let mut bitmap = RoaringBitmap::new(); - let num_u32 = bytes.len() / size_of::<u32>(); - for i in 0..num_u32 { - let start = i * size_of::<u32>(); - let end = (i + 1) * size_of::<u32>(); - let mut bytes = bytes.get(start..end)?; - let integer = bytes.read_u32::<NativeEndian>().ok()?; - bitmap.insert(integer); + + for chunk in bytes.chunks(size_of::<u32>()) { + let bytes = chunk.try_into().ok()?; + bitmap.push(u32::from_ne_bytes(bytes)); } + Some(bitmap) } } @@ -27,10 +25,12 @@ impl heed::BytesEncode<'_> for BoRoaringBitmapCodec { type EItem = RoaringBitmap; fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> { - let mut bytes = Vec::with_capacity(item.len() as usize * 4); - for integer in item.iter() { - bytes.write_u32::<NativeEndian>(integer).ok()?; - } - Some(Cow::Owned(bytes)) + let mut out = Vec::with_capacity(item.len() as usize * size_of::<u32>()); + + item.iter() + .map(|i| i.to_ne_bytes()) + .for_each(|bytes| out.extend_from_slice(&bytes)); + + Some(Cow::Owned(out)) } } From 225ae6fd254d64ba863338f0f93c5a16ac0d20a7 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 1 Jun 2021 11:48:56 +0200 Subject: [PATCH 0701/1889] Resolve PR comments --- milli/src/search/matching_words.rs | 12 +++++++++--- milli/src/search/query_tree.rs | 5 +---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 37a4f49c0..17649849d 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -11,24 +11,28 @@ use super::build_dfa; type IsPrefix = bool; -/// The query tree builder is the interface to build a query tree. +/// Structure created from a query tree +/// referencing words that match the given query tree. #[derive(Default)] pub struct MatchingWords { dfas: Vec<(DFA, String, u8, IsPrefix)>, } impl MatchingWords { pub fn from_query_tree(tree: &Operation) -> Self { + // fetch matchable words from the query tree let mut dfas: Vec<_> = fetch_queries(tree) .into_iter() + // create DFAs for each word .map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p)) .collect(); + // Sort word by len in DESC order prioritizing the longuest word, + // in order to highlight the longuest part of the matched word. dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len())); Self { dfas } } - /// Returns the number of matching bytes if the word matches. + /// Returns the number of matching bytes if the word matches one of the query words.
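// NOTE: the DESC-by-length sort called out above matters because
// `matching_bytes` returns the first DFA that accepts the word: with a
// hypothetical query containing both `word` and `wordsplit` as prefixes,
// the candidate `wordsplit` must be tried against `wordsplit` (9 matched
// bytes) before the shorter `word` DFA can claim it with only 4 highlighted
// bytes.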
pub fn matching_bytes(&self, word: &str) -> Option { self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { Distance::Exact(t) if t <= *typo => { @@ -94,6 +98,8 @@ impl IndexMut<(usize, usize)> for N2Array { } } +/// Returns the distance between the source word and the target word, +/// and the number of byte matching in the target word. fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) { let (n, m) = (source.len(), target.len()); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 3125664ab..03305943b 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,14 +1,11 @@ -use std::collections::HashSet; use std::{fmt, cmp, mem}; use fst::Set; -use levenshtein_automata::{DFA, Distance}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use roaring::RoaringBitmap; use slice_group_by::GroupBy; use crate::Index; -use super::build_dfa; type IsOptionalWord = bool; type IsPrefix = bool; @@ -519,7 +516,7 @@ pub fn maximum_proximity(operation: &Operation) -> usize { mod test { use std::collections::HashMap; - use maplit::{hashmap, hashset}; + use maplit::hashmap; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rand::{Rng, SeedableRng, rngs::StdRng}; From 608c5bad24edb63f5dbe3d038665fa1b3a8563c9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 1 Jun 2021 16:24:46 +0200 Subject: [PATCH 0702/1889] fix http-ui --- http-ui/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 7d51098c3..da3b6204c 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -144,7 +144,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed = self.analyzer.analyze(&old_string); for (word, token) in analyzed.reconstruct() { if token.is_word() { - let to_highlight = matching_words.matches(token.text()); + let to_highlight = matching_words.matching_bytes(token.text()).is_some(); if to_highlight { string.push_str("") } string.push_str(word); if to_highlight { string.push_str("") } From 8e6d1ff0dc76750bbfa4432ea484ceb6e5bd5cee Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 1 Jun 2021 17:04:02 +0200 Subject: [PATCH 0703/1889] Update milli/src/update/index_documents/store.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/update/index_documents/store.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 78ff6cbb0..08050092e 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -59,7 +59,7 @@ pub struct Store<'s, A> { // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, - field_id_word_count_docids: HashMap<(u8, u8), RoaringBitmap>, + field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, From ab2cf69e8d87eec4e722ba1b4852a31fdf49d10c Mon Sep 17 00:00:00 2001 From: Many Date: Tue, 1 Jun 2021 17:04:10 +0200 Subject: [PATCH 0704/1889] Update milli/src/update/delete_documents.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- 
From 8e6d1ff0dc76750bbfa4432ea484ceb6e5bd5cee Mon Sep 17 00:00:00 2001
From: Many
Date: Tue, 1 Jun 2021 17:04:02 +0200
Subject: [PATCH 0703/1889] Update milli/src/update/index_documents/store.rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clément Renault

---
 milli/src/update/index_documents/store.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs
index 78ff6cbb0..08050092e 100644
--- a/milli/src/update/index_documents/store.rs
+++ b/milli/src/update/index_documents/store.rs
@@ -59,7 +59,7 @@ pub struct Store<'s, A> {
     // Caches
     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
     word_docids_limit: usize,
-    field_id_word_count_docids: HashMap<(u8, u8), RoaringBitmap>,
+    field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
     words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
     words_pairs_proximities_docids_limit: usize,
     facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,

From ab2cf69e8d87eec4e722ba1b4852a31fdf49d10c Mon Sep 17 00:00:00 2001
From: Many
Date: Tue, 1 Jun 2021 17:04:10 +0200
Subject: [PATCH 0704/1889] Update milli/src/update/delete_documents.rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clément Renault

---
 milli/src/update/delete_documents.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index bf49603ce..f0f4788fb 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -317,7 +317,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {

         drop(iter);

-        // Remove the documents ids from field id word count database.
+        // Remove the documents ids from the field id word count database.
         let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
         while let Some((key, mut docids)) = iter.next().transpose()? {
             let previous_len = docids.len();

From e857ca4d7db7748c2a3a0f8af0899fba8f9bc149 Mon Sep 17 00:00:00 2001
From: many
Date: Tue, 1 Jun 2021 17:23:29 +0200
Subject: [PATCH 0705/1889] Fix PR comments

---
 infos/src/main.rs | 6 ++++--
 milli/src/search/criteria/exactness.rs | 28 ++++++++++++++------------
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/infos/src/main.rs b/infos/src/main.rs
index a00c882b7..d6aa1f854 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -157,6 +157,8 @@ enum Command {
         prefixes: Vec<String>,
     },

+    /// Outputs a CSV with the documents ids along with
+    /// the field id and the word count where it appears.
     FieldIdWordCountDocids {
         /// Display the whole documents ids in details.
         #[structopt(long)]
@@ -714,8 +716,8 @@ fn field_id_word_count_docids(
         .id(&field_name)
         .with_context(|| format!("unknown field name: {}", &field_name))?;

-    let left = (field_id, 1);
-    let right = (field_id, 11);
+    let left = (field_id, 0);
+    let right = (field_id, u8::max_value());
     let iter = index.field_id_word_count_docids
         .range(rtxn, &(left..=right))?;

diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs
index 7f27287b7..4d9e54f6e 100644
--- a/milli/src/search/criteria/exactness.rs
+++ b/milli/src/search/criteria/exactness.rs
@@ -1,9 +1,10 @@
+use std::convert::TryFrom;
 use std::mem::take;
+use std::ops::BitOr;

 use log::debug;
 use roaring::RoaringBitmap;
 use itertools::Itertools;
-use std::ops::BitOr;

 use crate::search::query_tree::{Operation, PrimitiveQueryPart};
 use crate::search::criteria::{
@@ -162,23 +163,24 @@ fn resolve_state(
     use State::*;
     match state {
         ExactAttribute(mut allowed_candidates) => {
-            let query_len = query.len() as u8;
             let mut candidates = RoaringBitmap::new();
-            let attributes_ids = ctx.searchable_fields_ids()?;
-            for id in attributes_ids {
-                if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? {
-                    let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?;
-                    attribute_candidates_array.push(attribute_allowed_docids);
-                    candidates |= intersection_of(attribute_candidates_array.iter().collect());
+            if let Ok(query_len) = u8::try_from(query.len()) {
+                let attributes_ids = ctx.searchable_fields_ids()?;
+                for id in attributes_ids {
+                    if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)?
{ + let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; + attribute_candidates_array.push(attribute_allowed_docids); + candidates |= intersection_of(attribute_candidates_array.iter().collect()); + } } + + // only keep allowed candidates + candidates &= &allowed_candidates; + // remove current candidates from allowed candidates + allowed_candidates -= &candidates; } - // only keep allowed candidates - candidates &= &allowed_candidates; - // remove current candidates from allowed candidates - allowed_candidates -= &candidates; Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) - }, AttributeStartsWith(mut allowed_candidates) => { let mut candidates = RoaringBitmap::new(); From 4fdbfd6048531c0cc2666062f8fdf7325480d5a8 Mon Sep 17 00:00:00 2001 From: tamo Date: Thu, 1 Apr 2021 18:54:14 +0200 Subject: [PATCH 0706/1889] push a first version of the benchmark for the typo --- milli/Cargo.toml | 2 +- milli/benches/README.md | 8 ++++++ milli/benches/{search.rs => typo.rs} | 33 ++++++++++++---------- milli/benches/utils.rs | 41 ++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 15 deletions(-) create mode 100644 milli/benches/README.md rename milli/benches/{search.rs => typo.rs} (52%) create mode 100644 milli/benches/utils.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3b25bb268..175c15679 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -61,5 +61,5 @@ rand = "0.8.3" default = [] [[bench]] -name = "search" +name = "typo" harness = false diff --git a/milli/benches/README.md b/milli/benches/README.md new file mode 100644 index 000000000..c02af0084 --- /dev/null +++ b/milli/benches/README.md @@ -0,0 +1,8 @@ +Benchmarks +========== + +For our benchmark we are using a small subset of the dataset songs.csv. 
It was generated with this command:
+```
+xsv sample --seed 42 song.csv -o smol_songs.csv
+```
+The original songs.csv dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz)

diff --git a/milli/benches/search.rs b/milli/benches/typo.rs
similarity index 52%
rename from milli/benches/search.rs
rename to milli/benches/typo.rs
index a201e241c..9fbce8038 100644
--- a/milli/benches/search.rs
+++ b/milli/benches/typo.rs
@@ -1,22 +1,27 @@
-use std::time::Duration;
+mod utils;

-use heed::EnvOpenOptions;
-use milli::Index;
+use std::time::Duration;
 use criterion::{criterion_group, criterion_main, BenchmarkId};

-fn bench_search(c: &mut criterion::Criterion) {
-    let database = "books-4cpu.mmdb";
+fn bench_typo(c: &mut criterion::Criterion) {
+    let index = utils::base_setup(Some(vec!["typo".to_string()]));
+
     let queries = [
-        "minogue kylie",
-        "minogue kylie live",
+        "mongus ",
+        "thelonius monk ",
+        "Disnaylande ",
+        "the white striper ",
+        "indochie ",
+        "indochien ",
+        "klub des loopers ",
+        "fear of the duck ",
+        "michel depech ",
+        "stromal ",
+        "dire straights ",
+        "Arethla Franklin ",
     ];

-    let mut options = EnvOpenOptions::new();
-    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-    options.max_readers(10);
-    let index = Index::new(options, database).unwrap();
-
-    let mut group = c.benchmark_group("search");
+    let mut group = c.benchmark_group("typo");
     group.sample_size(10);
     group.measurement_time(Duration::from_secs(12));

@@ -32,5 +37,5 @@ fn bench_search(c: &mut criterion::Criterion) {
     group.finish();
 }

-criterion_group!(benches, bench_search);
+criterion_group!(benches, bench_typo);
 criterion_main!(benches);

diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs
new file mode 100644
index 000000000..23c47ea76
--- /dev/null
+++ b/milli/benches/utils.rs
@@ -0,0 +1,41 @@
+use std::{fs::{File, create_dir_all}};
+
+use heed::EnvOpenOptions;
+use milli::{Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}};
+
+pub fn base_setup(criteria: Option>) -> Index {
+    let database = "songs.mmdb";
+    create_dir_all(&database).unwrap();
+
+    let mut options = EnvOpenOptions::new();
+    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+    options.max_readers(10);
+    let index = Index::new(options, database).unwrap();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    if let Some(criteria) = criteria {
+        builder.reset_faceted_fields();
+        builder.reset_criteria();
+        builder.reset_stop_words();
+
+        builder.set_criteria(criteria);
+    }
+
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.index_documents(&mut wtxn, &index);
+    builder.update_format(UpdateFormat::Csv);
+    builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+    // we called from cargo the current directory is supposed to be milli/milli
+    let reader = File::open("benches/smol_songs.csv").unwrap();
+    builder.execute(reader, |_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    index
+}
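The patch above introduces the first Criterion benchmark and the shared `base_setup` helper. Stripped of the milli-specific index setup, the harness pattern it follows is the standard Criterion one; a hedged, self-contained sketch (the string-search closure is a stand-in for the real index search built by `base_setup`):

```
use std::time::Duration;

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};

fn bench_typo(c: &mut Criterion) {
    // stand-in for the index built by `utils::base_setup`
    let haystack = "thelonious monk plays with charles mingus";

    let queries = ["thelonius monk ", "mongus "];

    let mut group = c.benchmark_group("typo");
    group.sample_size(10);
    group.measurement_time(Duration::from_secs(12));

    for query in &queries {
        group.bench_with_input(BenchmarkId::from_parameter(query), query, |b, query| {
            // the closure body is what gets timed, once per iteration
            b.iter(|| haystack.contains(query.trim()));
        });
    }

    group.finish();
}

criterion_group!(benches, bench_typo);
criterion_main!(benches);
```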
From e425f70ef9ef14ea3242ec5f9e9f18d09d92ea55 Mon Sep 17 00:00:00 2001
From: tamo
Date: Thu, 1 Apr 2021 19:27:12 +0200
Subject: [PATCH 0707/1889] let criterion decide how many iterations it wants to do in 10s

---
 milli/benches/typo.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/milli/benches/typo.rs b/milli/benches/typo.rs
index 9fbce8038..1bbe1aecb 100644
--- a/milli/benches/typo.rs
+++ b/milli/benches/typo.rs
@@ -22,8 +22,7 @@ fn bench_typo(c: &mut criterion::Criterion) {
     ];

     let mut group = c.benchmark_group("typo");
-    group.sample_size(10);
-    group.measurement_time(Duration::from_secs(12));
+    group.measurement_time(Duration::from_secs(10));

     for query in &queries {
         group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {

From 15cce89a45d7032abc1e9e622ce0ce2b200e5273 Mon Sep 17 00:00:00 2001
From: tamo
Date: Tue, 6 Apr 2021 16:06:49 +0200
Subject: [PATCH 0708/1889] update the README with instructions to download the dataset

---
 milli/benches/README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/milli/benches/README.md b/milli/benches/README.md
index c02af0084..9b53fc0d1 100644
--- a/milli/benches/README.md
+++ b/milli/benches/README.md
@@ -3,6 +3,13 @@ Benchmarks

 For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command:
 ```
-xsv sample --seed 42 song.csv -o smol_songs.csv
+xsv sample --seed 42 song.csv -o smol-songs.csv
+```
+You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)
+And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz).
+
+You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz`
+You can run the following command from the root of this git repository:
+```
+wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz
 ```
-The original songs.csv dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz)

From 49e4cc3daf85bd4a86325988e23f68de2f64b700 Mon Sep 17 00:00:00 2001
From: tamo
Date: Tue, 6 Apr 2021 19:17:24 +0200
Subject: [PATCH 0709/1889] add the words criterion to the bench

---
 milli/Cargo.toml | 4 ++++
 milli/benches/words.rs | 35 +++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 milli/benches/words.rs

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 175c15679..5184d028b 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -63,3 +63,7 @@ default = []
 [[bench]]
 name = "typo"
 harness = false
+
+[[bench]]
+name = "words"
+harness = false

diff --git a/milli/benches/words.rs b/milli/benches/words.rs
new file mode 100644
index 000000000..92ca0a784
--- /dev/null
+++ b/milli/benches/words.rs
@@ -0,0 +1,35 @@
+mod utils;
+
+use std::time::Duration;
+use criterion::{criterion_group, criterion_main, BenchmarkId};
+
+fn bench_words(c: &mut criterion::Criterion) {
+    let index = utils::base_setup(Some(vec!["words".to_string()]));
+
+    let queries = [
+        "the black saint and the sinner lady and the good doggo ", // four words to pop
+        "les liaisons dangeureuses 1793 ", // one word to pop
+        "The Disneyland Children's Sing-Alone song ", // two words to pop
+        "seven nation mummy ", // one word to pop
+        "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop
+        "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
+        "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16
+    ];
+
+    let mut group = c.benchmark_group("words");
+    group.measurement_time(Duration::from_secs(10));
+
+    for query in &queries {
+        group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| {
+            b.iter(|| {
+                let rtxn =
index.read_txn().unwrap(); + let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap(); + }); + }); + } + + group.finish(); +} + +criterion_group!(benches, bench_words); +criterion_main!(benches); From aee49bb3cd20a88c0d62c735268086d0033e1ed5 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 7 Apr 2021 11:04:53 +0200 Subject: [PATCH 0710/1889] add the proximity criterion --- milli/Cargo.toml | 4 ++++ milli/benches/proximity.rs | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 milli/benches/proximity.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 5184d028b..156518e19 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -67,3 +67,7 @@ harness = false [[bench]] name = "words" harness = false + +[[bench]] +name = "proximity" +harness = false diff --git a/milli/benches/proximity.rs b/milli/benches/proximity.rs new file mode 100644 index 000000000..5b687855f --- /dev/null +++ b/milli/benches/proximity.rs @@ -0,0 +1,33 @@ +mod utils; + +use std::time::Duration; +use criterion::{criterion_group, criterion_main, BenchmarkId}; + +fn bench_proximity(c: &mut criterion::Criterion) { + let index = utils::base_setup(Some(vec!["words".to_string()])); + + let queries = [ + "black saint sinner lady ", + "les dangeureuses 1960 ", + "The Disneyland Sing-Alone song ", + "Under Great Northern Lights ", + "7000 Danses Un Jour Dans Notre Vie", + ]; + + let mut group = c.benchmark_group("proximity"); + group.measurement_time(Duration::from_secs(10)); + + for query in &queries { + group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let _documents_ids = index.search(&rtxn).query(*query).optional_words(false).execute().unwrap(); + }); + }); + } + + group.finish(); +} + +criterion_group!(benches, bench_proximity); +criterion_main!(benches); From a2bff68c1a16e780e572d2e0aa3b304abd6c47c2 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 7 Apr 2021 11:05:10 +0200 Subject: [PATCH 0711/1889] remove the optional words for the typo criterion --- milli/benches/typo.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/benches/typo.rs b/milli/benches/typo.rs index 1bbe1aecb..184f1e5df 100644 --- a/milli/benches/typo.rs +++ b/milli/benches/typo.rs @@ -28,7 +28,7 @@ fn bench_typo(c: &mut criterion::Criterion) { group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { b.iter(|| { let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap(); + let _documents_ids = index.search(&rtxn).query(*query).optional_words(false).execute().unwrap(); }); }); } From 3def42abd8e14ccc28b9d6e8cb622ec37034ea52 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 7 Apr 2021 11:50:38 +0200 Subject: [PATCH 0712/1889] merge all the criterion only benchmarks in one file --- milli/Cargo.toml | 10 +------ milli/benches/criterion.rs | 58 ++++++++++++++++++++++++++++++++++++++ milli/benches/proximity.rs | 33 ---------------------- milli/benches/typo.rs | 40 -------------------------- milli/benches/utils.rs | 36 ++++++++++++++++++++--- milli/benches/words.rs | 35 ----------------------- 6 files changed, 91 insertions(+), 121 deletions(-) create mode 100644 milli/benches/criterion.rs delete mode 100644 milli/benches/proximity.rs delete mode 100644 milli/benches/typo.rs delete mode 100644 milli/benches/words.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 156518e19..399b04428 100644 --- a/milli/Cargo.toml +++ 
b/milli/Cargo.toml @@ -61,13 +61,5 @@ rand = "0.8.3" default = [] [[bench]] -name = "typo" -harness = false - -[[bench]] -name = "words" -harness = false - -[[bench]] -name = "proximity" +name = "criterion" harness = false diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs new file mode 100644 index 000000000..3f0b6d6b7 --- /dev/null +++ b/milli/benches/criterion.rs @@ -0,0 +1,58 @@ +mod utils; + +use criterion::{criterion_group, criterion_main}; + +fn bench_criterion(c: &mut criterion::Criterion) { + let confs = &[ + utils::Conf { + group_name: "proximity", + queries: &[ + "black saint sinner lady ", + "les dangeureuses 1960 ", + "The Disneyland Sing-Along Chorus ", + "Under Great Northern Lights ", + "7000 Danses Un Jour Dans Notre Vie", + ], + criterion: Some(&["proximity"]), + optional_words: false, + }, + utils::Conf { + group_name: "typo", + queries: &[ + "mongus ", + "thelonius monk ", + "Disnaylande ", + "the white striper ", + "indochie ", + "indochien ", + "klub des loopers ", + "fear of the duck ", + "michel depech ", + "stromal ", + "dire straights ", + "Arethla Franklin ", + ], + criterion: Some(&["typo"]), + optional_words: false, + }, + utils::Conf { + group_name: "words", + queries: &[ + "the black saint and the sinner lady and the good doggo ", // four words to pop + "les liaisons dangeureuses 1793 ", // one word to pop + "The Disneyland Children's Sing-Alone song ", // two words to pop + "seven nation mummy ", // one word to pop + "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop + "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop + "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 + ], + criterion: Some(&["words"]), + optional_words: true, + } + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_criterion); +criterion_main!(benches); diff --git a/milli/benches/proximity.rs b/milli/benches/proximity.rs deleted file mode 100644 index 5b687855f..000000000 --- a/milli/benches/proximity.rs +++ /dev/null @@ -1,33 +0,0 @@ -mod utils; - -use std::time::Duration; -use criterion::{criterion_group, criterion_main, BenchmarkId}; - -fn bench_proximity(c: &mut criterion::Criterion) { - let index = utils::base_setup(Some(vec!["words".to_string()])); - - let queries = [ - "black saint sinner lady ", - "les dangeureuses 1960 ", - "The Disneyland Sing-Alone song ", - "Under Great Northern Lights ", - "7000 Danses Un Jour Dans Notre Vie", - ]; - - let mut group = c.benchmark_group("proximity"); - group.measurement_time(Duration::from_secs(10)); - - for query in &queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).optional_words(false).execute().unwrap(); - }); - }); - } - - group.finish(); -} - -criterion_group!(benches, bench_proximity); -criterion_main!(benches); diff --git a/milli/benches/typo.rs b/milli/benches/typo.rs deleted file mode 100644 index 184f1e5df..000000000 --- a/milli/benches/typo.rs +++ /dev/null @@ -1,40 +0,0 @@ -mod utils; - -use std::time::Duration; -use criterion::{criterion_group, criterion_main, BenchmarkId}; - -fn bench_typo(c: &mut criterion::Criterion) { - let index = utils::base_setup(Some(vec!["typo".to_string()])); - - let queries = [ - "mongus ", - "thelonius monk ", - "Disnaylande ", - "the white striper ", - "indochie ", - "indochien ", - 
"klub des loopers ", - "fear of the duck ", - "michel depech ", - "stromal ", - "dire straights ", - "Arethla Franklin ", - ]; - - let mut group = c.benchmark_group("typo"); - group.measurement_time(Duration::from_secs(10)); - - for query in &queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).optional_words(false).execute().unwrap(); - }); - }); - } - - group.finish(); -} - -criterion_group!(benches, bench_typo); -criterion_main!(benches); diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 23c47ea76..c608a3ef3 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -1,9 +1,17 @@ -use std::{fs::{File, create_dir_all}}; +use std::{fs::{File, create_dir_all}, time::Duration}; use heed::EnvOpenOptions; +use criterion::BenchmarkId; use milli::{Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; -pub fn base_setup(criteria: Option>) -> Index { +pub struct Conf<'a> { + pub group_name: &'a str, + pub queries: &'a[&'a str], + pub criterion: Option<&'a [&'a str]>, + pub optional_words: bool, +} + +pub fn base_setup(criterion: Option>) -> Index { let database = "songs.mmdb"; create_dir_all(&database).unwrap(); @@ -16,12 +24,12 @@ pub fn base_setup(criteria: Option>) -> Index { let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); - if let Some(criteria) = criteria { + if let Some(criterion) = criterion { builder.reset_faceted_fields(); builder.reset_criteria(); builder.reset_stop_words(); - builder.set_criteria(criteria); + builder.set_criteria(criterion); } builder.execute(|_, _| ()).unwrap(); @@ -39,3 +47,23 @@ pub fn base_setup(criteria: Option>) -> Index { index } + +pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { + for conf in confs { + let criterion = conf.criterion.map(|s| s.iter().map(|s| s.to_string()).collect()); + let index = base_setup(criterion); + + let mut group = c.benchmark_group(conf.group_name); + group.measurement_time(Duration::from_secs(10)); + + for &query in conf.queries { + group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let _documents_ids = index.search(&rtxn).query(query).optional_words(conf.optional_words).execute().unwrap(); + }); + }); + } + group.finish(); + } +} diff --git a/milli/benches/words.rs b/milli/benches/words.rs deleted file mode 100644 index 92ca0a784..000000000 --- a/milli/benches/words.rs +++ /dev/null @@ -1,35 +0,0 @@ -mod utils; - -use std::time::Duration; -use criterion::{criterion_group, criterion_main, BenchmarkId}; - -fn bench_words(c: &mut criterion::Criterion) { - let index = utils::base_setup(Some(vec!["words".to_string()])); - - let queries = [ - "the black saint and the sinner lady and the good doggo ", // four words to pop - "les liaisons dangeureuses 1793 ", // one word to pop - "The Disneyland Children's Sing-Alone song ", // two words to pop - "seven nation mummy ", // one word to pop - "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop - "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop - "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 - ]; - - let mut group = c.benchmark_group("words"); - group.measurement_time(Duration::from_secs(10)); - - for query in &queries { 
- group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(*query).execute().unwrap(); - }); - }); - } - - group.finish(); -} - -criterion_group!(benches, bench_words); -criterion_main!(benches); From ea0c6d8c401a3ee37c14a62878e4b1641e08d726 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 10:44:27 +0200 Subject: [PATCH 0713/1889] add a bunch of queries and start the introduction of the filters and the new dataset --- milli/benches/criterion.rs | 48 ++++++++++++++++++++++++++++++-- milli/benches/normal_search.rs | 51 ++++++++++++++++++++++++++++++++++ milli/benches/utils.rs | 43 ++++++++++++++++++++++------ 3 files changed, 132 insertions(+), 10 deletions(-) create mode 100644 milli/benches/normal_search.rs diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs index 3f0b6d6b7..bdfe3d478 100644 --- a/milli/benches/criterion.rs +++ b/milli/benches/criterion.rs @@ -3,6 +3,24 @@ mod utils; use criterion::{criterion_group, criterion_main}; fn bench_criterion(c: &mut criterion::Criterion) { + let songs_base_queries = &[ + "mingus ", + "thelonious monk ", + "Disneyland ", + "the white stripes ", + "indochine ", + "klub des loosers ", + "fear of the dark ", + "michel delpech ", + "stromae ", + "dire straits ", + "aretha franklin ", + ]; + let default_criterion: Vec = milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); + let default_criterion = default_criterion.iter().map(|s| s.as_str()); + let asc_default: Vec<&str> = std::iter::once("asc").chain(default_criterion.clone()).collect(); + let desc_default: Vec<&str> = std::iter::once("desc").chain(default_criterion.clone()).collect(); + let confs = &[ utils::Conf { group_name: "proximity", @@ -15,6 +33,7 @@ fn bench_criterion(c: &mut criterion::Criterion) { ], criterion: Some(&["proximity"]), optional_words: false, + ..utils::Conf::BASE }, utils::Conf { group_name: "typo", @@ -34,6 +53,7 @@ fn bench_criterion(c: &mut criterion::Criterion) { ], criterion: Some(&["typo"]), optional_words: false, + ..utils::Conf::BASE }, utils::Conf { group_name: "words", @@ -47,8 +67,32 @@ fn bench_criterion(c: &mut criterion::Criterion) { "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 ], criterion: Some(&["words"]), - optional_words: true, - } + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "asc", + queries: songs_base_queries, + criterion: Some(&["asc"]), + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "desc", + queries: songs_base_queries, + criterion: Some(&["desc"]), + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "asc + default", + queries: songs_base_queries, + criterion: Some(&asc_default[..]), + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "desc + default", + queries: songs_base_queries, + criterion: Some(&desc_default[..]), + ..utils::Conf::BASE + }, ]; utils::run_benches(c, confs); diff --git a/milli/benches/normal_search.rs b/milli/benches/normal_search.rs new file mode 100644 index 000000000..39a343cf0 --- /dev/null +++ b/milli/benches/normal_search.rs @@ -0,0 +1,51 @@ +mod utils; + +use criterion::{criterion_group, criterion_main}; + +fn bench_normal(c: &mut criterion::Criterion) { + let confs = &[ + utils::Conf { + group_name: "basic placeholder", + queries: &[ + "", + ], + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "basic without quote", + queries: &[ + "david bowie", // 
1200 + "michael jackson", // 600 + "marcus miller", // 60 + "Notstandskomitee", // 4 + ], + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "basic with quote", + queries: &[ + "\"david\" \"bowie\"", // 1200 + "\"michael\" \"jackson\"", // 600 + "\"marcus\" \"miller\"", // 60 + "\"Notstandskomitee\"", // 4 + ], + ..utils::Conf::BASE + }, + utils::Conf { + group_name: "prefix search", + queries: &[ + "s", // 500k+ results + "a", + "b", + "i", + "x", // only 7k results + ], + ..utils::Conf::BASE + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_normal); +criterion_main!(benches); diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index c608a3ef3..6c8360fe2 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -1,18 +1,40 @@ -use std::{fs::{File, create_dir_all}, time::Duration}; +use std::{fs::{File, create_dir_all, remove_dir_all}, time::Duration}; use heed::EnvOpenOptions; use criterion::BenchmarkId; -use milli::{Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; +use milli::{FacetCondition, Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; pub struct Conf<'a> { + /// where we are going to create our database.mmdb directory + /// each benchmark will first try to delete it and then recreate it + pub database_name: &'a str, + /// the dataset to be used, it must be an uncompressed csv + pub dataset: &'a str, pub group_name: &'a str, pub queries: &'a[&'a str], pub criterion: Option<&'a [&'a str]>, + pub facet_condition: Option, pub optional_words: bool, } -pub fn base_setup(criterion: Option>) -> Index { - let database = "songs.mmdb"; +impl Conf<'_> { + pub const BASE: Self = Conf { + database_name: "benches.mmdb", + dataset: "", + group_name: "", + queries: &[], + criterion: None, + facet_condition: None, + optional_words: true, + }; +} + +pub fn base_setup(database: &str, dataset: &str, criterion: Option>) -> Index { + match remove_dir_all(&database) { + Ok(_) => (), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), + Err(e) => panic!("{}", e), + } create_dir_all(&database).unwrap(); let mut options = EnvOpenOptions::new(); @@ -41,7 +63,7 @@ pub fn base_setup(criterion: Option>) -> Index { builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli - let reader = File::open("benches/smol_songs.csv").unwrap(); + let reader = File::open(dataset).unwrap(); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -51,16 +73,21 @@ pub fn base_setup(criterion: Option>) -> Index { pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { for conf in confs { let criterion = conf.criterion.map(|s| s.iter().map(|s| s.to_string()).collect()); - let index = base_setup(criterion); + let index = base_setup(conf.database_name, conf.dataset, criterion); - let mut group = c.benchmark_group(conf.group_name); + let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); group.measurement_time(Duration::from_secs(10)); for &query in conf.queries { group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { b.iter(|| { let rtxn = index.read_txn().unwrap(); - let _documents_ids = index.search(&rtxn).query(query).optional_words(conf.optional_words).execute().unwrap(); + let mut search = index.search(&rtxn); + search.query(query).optional_words(conf.optional_words); + if let Some(facet_condition) = 
conf.facet_condition.clone() { + search.facet_condition(facet_condition); + } + let _ids = search.execute().unwrap(); }); }); } From 4b78ef31b649a32f9d5274f872413c26b7b40910 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 11:40:16 +0200 Subject: [PATCH 0714/1889] add the configuration of the searchable fields and displayed fields and a default configuration for the songs --- milli/benches/criterion.rs | 14 +++++----- milli/benches/normal_search.rs | 8 +++--- milli/benches/utils.rs | 51 ++++++++++++++++++++++++++++------ 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs index bdfe3d478..fb79a597d 100644 --- a/milli/benches/criterion.rs +++ b/milli/benches/criterion.rs @@ -33,7 +33,7 @@ fn bench_criterion(c: &mut criterion::Criterion) { ], criterion: Some(&["proximity"]), optional_words: false, - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "typo", @@ -53,7 +53,7 @@ fn bench_criterion(c: &mut criterion::Criterion) { ], criterion: Some(&["typo"]), optional_words: false, - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "words", @@ -67,31 +67,31 @@ fn bench_criterion(c: &mut criterion::Criterion) { "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 ], criterion: Some(&["words"]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "asc", queries: songs_base_queries, criterion: Some(&["asc"]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "desc", queries: songs_base_queries, criterion: Some(&["desc"]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "asc + default", queries: songs_base_queries, criterion: Some(&asc_default[..]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "desc + default", queries: songs_base_queries, criterion: Some(&desc_default[..]), - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, ]; diff --git a/milli/benches/normal_search.rs b/milli/benches/normal_search.rs index 39a343cf0..bd57a8c45 100644 --- a/milli/benches/normal_search.rs +++ b/milli/benches/normal_search.rs @@ -9,7 +9,7 @@ fn bench_normal(c: &mut criterion::Criterion) { queries: &[ "", ], - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "basic without quote", @@ -19,7 +19,7 @@ fn bench_normal(c: &mut criterion::Criterion) { "marcus miller", // 60 "Notstandskomitee", // 4 ], - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "basic with quote", @@ -29,7 +29,7 @@ fn bench_normal(c: &mut criterion::Criterion) { "\"marcus\" \"miller\"", // 60 "\"Notstandskomitee\"", // 4 ], - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "prefix search", @@ -40,7 +40,7 @@ fn bench_normal(c: &mut criterion::Criterion) { "i", "x", // only 7k results ], - ..utils::Conf::BASE + ..utils::Conf::BASE_SONGS }, ]; diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 6c8360fe2..2eb067a02 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -2,7 +2,7 @@ use std::{fs::{File, create_dir_all, remove_dir_all}, time::Duration}; use heed::EnvOpenOptions; use criterion::BenchmarkId; -use milli::{FacetCondition, Index, update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}}; +use milli::{FacetCondition, Index, update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}}; pub struct Conf<'a> { /// where we 
are going to create our database.mmdb directory @@ -12,48 +12,82 @@ pub struct Conf<'a> { pub dataset: &'a str, pub group_name: &'a str, pub queries: &'a[&'a str], + /// here you can change which criterion are used and in which order. + /// - if you specify something all the base configuration will be thrown out + /// - if you don't specify anything (None) the default configuration will be kept pub criterion: Option<&'a [&'a str]>, + /// the last chance to configure your database as you want + pub configure: fn(&mut Settings), pub facet_condition: Option, + /// enable or disable the optional words on the query pub optional_words: bool, } impl Conf<'_> { + fn nop(_builder: &mut Settings) {} + + fn songs_conf(builder: &mut Settings) { + let displayed_fields = [ + "id", "title", "album", "artist", "genre", "country", "released", "duration", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_searchable_fields(searchable_fields); + } + pub const BASE: Self = Conf { database_name: "benches.mmdb", dataset: "", group_name: "", queries: &[], criterion: None, + configure: Self::nop, facet_condition: None, optional_words: true, }; + + pub const BASE_SONGS: Self = Conf { + dataset: "smol-songs", + configure: Self::songs_conf, + ..Self::BASE + }; } -pub fn base_setup(database: &str, dataset: &str, criterion: Option>) -> Index { - match remove_dir_all(&database) { +pub fn base_setup(conf: &Conf) -> Index { + match remove_dir_all(&conf.database_name) { Ok(_) => (), Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), Err(e) => panic!("{}", e), } - create_dir_all(&database).unwrap(); + create_dir_all(&conf.database_name).unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(10); - let index = Index::new(options, database).unwrap(); + let index = Index::new(options, conf.database_name).unwrap(); let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); - if let Some(criterion) = criterion { + if let Some(criterion) = conf.criterion { builder.reset_faceted_fields(); builder.reset_criteria(); builder.reset_stop_words(); + let criterion = criterion.iter().map(|s| s.to_string()).collect(); builder.set_criteria(criterion); } + (conf.configure)(&mut builder); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -63,7 +97,7 @@ pub fn base_setup(database: &str, dataset: &str, criterion: Option>) builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli - let reader = File::open(dataset).unwrap(); + let reader = File::open(conf.dataset).unwrap(); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -72,8 +106,7 @@ pub fn base_setup(database: &str, dataset: &str, criterion: Option>) pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { for conf in confs { - let criterion = conf.criterion.map(|s| s.iter().map(|s| s.to_string()).collect()); - let index = base_setup(conf.database_name, conf.dataset, criterion); + let index = base_setup(conf); let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); group.measurement_time(Duration::from_secs(10)); From 
136efd6b534dc864e7b61efe478c37c7bc5ee7ee Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 14:26:08 +0200 Subject: [PATCH 0715/1889] fix the benches --- milli/benches/criterion.rs | 8 ++++---- milli/benches/utils.rs | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs index fb79a597d..3049557f0 100644 --- a/milli/benches/criterion.rs +++ b/milli/benches/criterion.rs @@ -18,8 +18,8 @@ fn bench_criterion(c: &mut criterion::Criterion) { ]; let default_criterion: Vec = milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); let default_criterion = default_criterion.iter().map(|s| s.as_str()); - let asc_default: Vec<&str> = std::iter::once("asc").chain(default_criterion.clone()).collect(); - let desc_default: Vec<&str> = std::iter::once("desc").chain(default_criterion.clone()).collect(); + let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); + let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); let confs = &[ utils::Conf { @@ -72,13 +72,13 @@ fn bench_criterion(c: &mut criterion::Criterion) { utils::Conf { group_name: "asc", queries: songs_base_queries, - criterion: Some(&["asc"]), + criterion: Some(&["asc(released-timestamp)"]), ..utils::Conf::BASE_SONGS }, utils::Conf { group_name: "desc", queries: songs_base_queries, - criterion: Some(&["desc"]), + criterion: Some(&["desc(released-timestamp)"]), ..utils::Conf::BASE_SONGS }, utils::Conf { diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 2eb067a02..9b58b54b8 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -40,6 +40,18 @@ impl Conf<'_> { .map(|s| s.to_string()) .collect(); builder.set_searchable_fields(searchable_fields); + + let faceted_fields = [ + ("released-timestamp", "integer"), + ("duration-float", "float"), + ("genre", "string"), + ("country", "string"), + ("artist", "string"), + ] + .iter() + .map(|(a, b)| (a.to_string(), b.to_string())) + .collect(); + builder.set_faceted_fields(faceted_fields); } pub const BASE: Self = Conf { @@ -54,7 +66,7 @@ impl Conf<'_> { }; pub const BASE_SONGS: Self = Conf { - dataset: "smol-songs", + dataset: "smol-songs.csv", configure: Self::songs_conf, ..Self::BASE }; @@ -97,7 +109,8 @@ pub fn base_setup(conf: &Conf) -> Index { builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli - let reader = File::open(conf.dataset).unwrap(); + let dataset_path = format!("benches/{}", conf.dataset); + let reader = File::open(&dataset_path).expect(&format!("could not find the dataset in: {}", &dataset_path)); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); From 5132a106a160641513084f6a880bf7ba09a03d18 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 18:34:00 +0200 Subject: [PATCH 0716/1889] refactorize everything related to the songs dataset in a songs benchmark file --- milli/Cargo.toml | 2 +- milli/benches/criterion.rs | 102 ------------------ milli/benches/normal_search.rs | 51 --------- milli/benches/songs.rs | 185 +++++++++++++++++++++++++++++++++ milli/benches/utils.rs | 33 ------ 5 files changed, 186 insertions(+), 187 deletions(-) delete mode 100644 milli/benches/criterion.rs delete mode 100644 milli/benches/normal_search.rs create mode 100644 
milli/benches/songs.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 399b04428..2bdb3f4dc 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -61,5 +61,5 @@ rand = "0.8.3" default = [] [[bench]] -name = "criterion" +name = "songs" harness = false diff --git a/milli/benches/criterion.rs b/milli/benches/criterion.rs deleted file mode 100644 index 3049557f0..000000000 --- a/milli/benches/criterion.rs +++ /dev/null @@ -1,102 +0,0 @@ -mod utils; - -use criterion::{criterion_group, criterion_main}; - -fn bench_criterion(c: &mut criterion::Criterion) { - let songs_base_queries = &[ - "mingus ", - "thelonious monk ", - "Disneyland ", - "the white stripes ", - "indochine ", - "klub des loosers ", - "fear of the dark ", - "michel delpech ", - "stromae ", - "dire straits ", - "aretha franklin ", - ]; - let default_criterion: Vec = milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); - let default_criterion = default_criterion.iter().map(|s| s.as_str()); - let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); - let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); - - let confs = &[ - utils::Conf { - group_name: "proximity", - queries: &[ - "black saint sinner lady ", - "les dangeureuses 1960 ", - "The Disneyland Sing-Along Chorus ", - "Under Great Northern Lights ", - "7000 Danses Un Jour Dans Notre Vie", - ], - criterion: Some(&["proximity"]), - optional_words: false, - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "typo", - queries: &[ - "mongus ", - "thelonius monk ", - "Disnaylande ", - "the white striper ", - "indochie ", - "indochien ", - "klub des loopers ", - "fear of the duck ", - "michel depech ", - "stromal ", - "dire straights ", - "Arethla Franklin ", - ], - criterion: Some(&["typo"]), - optional_words: false, - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "words", - queries: &[ - "the black saint and the sinner lady and the good doggo ", // four words to pop - "les liaisons dangeureuses 1793 ", // one word to pop - "The Disneyland Children's Sing-Alone song ", // two words to pop - "seven nation mummy ", // one word to pop - "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop - "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop - "whathavenotnsuchforth and then a good amount of words tot pop in order to match the first one ", // 16 - ], - criterion: Some(&["words"]), - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "asc", - queries: songs_base_queries, - criterion: Some(&["asc(released-timestamp)"]), - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "desc", - queries: songs_base_queries, - criterion: Some(&["desc(released-timestamp)"]), - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "asc + default", - queries: songs_base_queries, - criterion: Some(&asc_default[..]), - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "desc + default", - queries: songs_base_queries, - criterion: Some(&desc_default[..]), - ..utils::Conf::BASE_SONGS - }, - ]; - - utils::run_benches(c, confs); -} - -criterion_group!(benches, bench_criterion); -criterion_main!(benches); diff --git a/milli/benches/normal_search.rs b/milli/benches/normal_search.rs deleted file mode 100644 index bd57a8c45..000000000 --- a/milli/benches/normal_search.rs +++ /dev/null @@ -1,51 +0,0 @@ -mod utils; - -use 
criterion::{criterion_group, criterion_main}; - -fn bench_normal(c: &mut criterion::Criterion) { - let confs = &[ - utils::Conf { - group_name: "basic placeholder", - queries: &[ - "", - ], - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "basic without quote", - queries: &[ - "david bowie", // 1200 - "michael jackson", // 600 - "marcus miller", // 60 - "Notstandskomitee", // 4 - ], - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "basic with quote", - queries: &[ - "\"david\" \"bowie\"", // 1200 - "\"michael\" \"jackson\"", // 600 - "\"marcus\" \"miller\"", // 60 - "\"Notstandskomitee\"", // 4 - ], - ..utils::Conf::BASE_SONGS - }, - utils::Conf { - group_name: "prefix search", - queries: &[ - "s", // 500k+ results - "a", - "b", - "i", - "x", // only 7k results - ], - ..utils::Conf::BASE_SONGS - }, - ]; - - utils::run_benches(c, confs); -} - -criterion_group!(benches, bench_normal); -criterion_main!(benches); diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs new file mode 100644 index 000000000..586b8d4ef --- /dev/null +++ b/milli/benches/songs.rs @@ -0,0 +1,185 @@ +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = [ + "id", "title", "album", "artist", "genre", "country", "released", "duration", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = [ + ("released-timestamp", "integer"), + ("duration-float", "float"), + ("genre", "string"), + ("country", "string"), + ("artist", "string"), + ] + .iter() + .map(|(a, b)| (a.to_string(), b.to_string())) + .collect(); + builder.set_faceted_fields(faceted_fields); +} + +const BASE_CONF: Conf = Conf { + dataset: "smol-songs.csv", + queries: &[ + "mingus ", + "thelonious monk ", + "Disneyland ", + "the white stripes ", + "indochine ", + "klub des loosers ", + "fear of the dark ", + "michel delpech ", + "stromae ", + "dire straits ", + "aretha franklin ", + ], + configure: base_conf, + ..Conf::BASE +}; + +fn bench_songs(c: &mut criterion::Criterion) { + let default_criterion: Vec = milli::default_criteria() + .iter() + .map(|criteria| criteria.to_string()) + .collect(); + let default_criterion = default_criterion.iter().map(|s| s.as_str()); + let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)") + .chain(default_criterion.clone()) + .collect(); + let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)") + .chain(default_criterion.clone()) + .collect(); + + let confs = &[ + /* first we bench each criterion alone */ + utils::Conf { + group_name: "proximity", + queries: &[ + "black saint sinner lady ", + "les dangeureuses 1960 ", + "The Disneyland Sing-Along Chorus ", + "Under Great Northern Lights ", + "7000 Danses Un Jour Dans Notre Vie", + ], + criterion: Some(&["proximity"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "typo", + queries: &[ + "mongus ", + "thelonius monk ", + "Disnaylande ", + "the white striper ", + "indochie ", + "indochien ", + "klub des loopers ", + "fear of the duck ", + "michel depech ", + "stromal ", + "dire straights ", + "Arethla Franklin ", + ], + criterion: Some(&["typo"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "words", + 
queries: &[ + "the black saint and the sinner lady and the good doggo ", // four words to pop + "les liaisons dangeureuses 1793 ", // one word to pop + "The Disneyland Children's Sing-Alone song ", // two words to pop + "seven nation mummy ", // one word to pop + "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop + "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop + "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 16 + ], + criterion: Some(&["words"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "asc", + criterion: Some(&["asc(released-timestamp)"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc", + criterion: Some(&["desc(released-timestamp)"]), + ..BASE_CONF + }, + + /* then we bench the asc and desc criterion on top of the default criterion */ + utils::Conf { + group_name: "asc + default", + criterion: Some(&asc_default[..]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc + default", + criterion: Some(&desc_default[..]), + ..BASE_CONF + }, + + /* the we bench some global / normal search with all the default criterion in the default + * order */ + utils::Conf { + group_name: "basic placeholder", + queries: &[ + "", + ], + ..BASE_CONF + }, + utils::Conf { + group_name: "basic without quote", + queries: &[ + "david bowie", // 1200 + "michael jackson", // 600 + "marcus miller", // 60 + "Notstandskomitee", // 4 + ], + ..BASE_CONF + }, + utils::Conf { + group_name: "basic with quote", + queries: &[ + "\"david\" \"bowie\"", // 1200 + "\"michael\" \"jackson\"", // 600 + "\"marcus\" \"miller\"", // 60 + "\"Notstandskomitee\"", // 4 + ], + ..BASE_CONF + }, + utils::Conf { + group_name: "prefix search", + queries: &[ + "s", // 500k+ results + "a", + "b", + "i", + "x", // only 7k results + ], + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_songs); +criterion_main!(benches); diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 9b58b54b8..b101adb63 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -26,34 +26,6 @@ pub struct Conf<'a> { impl Conf<'_> { fn nop(_builder: &mut Settings) {} - fn songs_conf(builder: &mut Settings) { - let displayed_fields = [ - "id", "title", "album", "artist", "genre", "country", "released", "duration", - ] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "album", "artist"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = [ - ("released-timestamp", "integer"), - ("duration-float", "float"), - ("genre", "string"), - ("country", "string"), - ("artist", "string"), - ] - .iter() - .map(|(a, b)| (a.to_string(), b.to_string())) - .collect(); - builder.set_faceted_fields(faceted_fields); - } - pub const BASE: Self = Conf { database_name: "benches.mmdb", dataset: "", @@ -65,11 +37,6 @@ impl Conf<'_> { optional_words: true, }; - pub const BASE_SONGS: Self = Conf { - dataset: "smol-songs.csv", - configure: Self::songs_conf, - ..Self::BASE - }; } pub fn base_setup(conf: &Conf) -> Index { From beae84376658257f5a538aced80e3b5898f4f022 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 13 Apr 2021 18:39:34 +0200 Subject: [PATCH 0717/1889] add a missing space --- milli/benches/songs.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index 
586b8d4ef..71bc164ab 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -73,7 +73,7 @@ fn bench_songs(c: &mut criterion::Criterion) { "les dangeureuses 1960 ", "The Disneyland Sing-Along Chorus ", "Under Great Northern Lights ", - "7000 Danses Un Jour Dans Notre Vie", + "7000 Danses Un Jour Dans Notre Vie ", ], criterion: Some(&["proximity"]), optional_words: false, From d0b44c380f6bb98aba0c57ee32910d4d6f71a948 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 12:09:51 +0200 Subject: [PATCH 0718/1889] add benchmarks on a wiki dataset --- milli/Cargo.toml | 4 ++ milli/benches/wiki.rs | 127 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 milli/benches/wiki.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2bdb3f4dc..1c0f74613 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -63,3 +63,7 @@ default = [] [[bench]] name = "songs" harness = false + +[[bench]] +name = "wiki" +harness = false diff --git a/milli/benches/wiki.rs b/milli/benches/wiki.rs new file mode 100644 index 000000000..fc8af02e5 --- /dev/null +++ b/milli/benches/wiki.rs @@ -0,0 +1,127 @@ +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = ["title", "body", "url"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); +} + +const BASE_CONF: Conf = Conf { + dataset: "smol-wiki-articles.csv", + queries: &[ + "mingus ", // 46 candidates + "miles davis ", // 159 + "rock and roll ", // 1007 + "machine ", // 3448 + "spain ", // 7002 + "japan ", // 10.593 + "france ", // 17.616 + "film ", // 24.959 + ], + configure: base_conf, + ..Conf::BASE +}; + +fn bench_songs(c: &mut criterion::Criterion) { + let basic_with_quote: Vec = BASE_CONF + .queries + .iter() + .map(|s| { + s.trim() + .split(' ') + .map(|s| format!(r#""{}""#, s)).collect::>().join(" ") + }) + .collect(); + let basic_with_quote: &[&str] = &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + + let confs = &[ + /* first we bench each criterion alone */ + utils::Conf { + group_name: "proximity", + queries: &[ + "herald sings ", + "april paris ", + "tea two ", + "diesel engine ", + ], + criterion: Some(&["proximity"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "typo", + queries: &[ + "migrosoft ", + "linax ", + "Disnaylande ", + "phytogropher ", + "nympalidea ", + "aritmetric ", + "the fronce ", + "sisan ", + ], + criterion: Some(&["typo"]), + optional_words: false, + ..BASE_CONF + }, + utils::Conf { + group_name: "words", + queries: &[ + "the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results + "Kameya Tokujirō mingus monk ", // two words to pop, 55 + "Ulrich Hensel meilisearch milli ", // two words to pop, 306 + "Idaho Bellevue pizza ", // one word to pop, 800 + "Abraham machin ", // one word to pop, 1141 + ], + criterion: Some(&["words"]), + ..BASE_CONF + }, + /* the we bench some global / normal search with all the default criterion in the default + * order */ + utils::Conf { + group_name: "basic placeholder", + queries: &[""], + ..BASE_CONF + }, + utils::Conf { + group_name: "basic without quote", + queries: &BASE_CONF + .queries + .iter() + .map(|s| s.trim()) // we remove the space at the end of each 
request + .collect::>(), + ..BASE_CONF + }, + utils::Conf { + group_name: "basic with quote", + queries: basic_with_quote, + ..BASE_CONF + }, + utils::Conf { + group_name: "prefix search", + queries: &[ + "t", // 453k results + "c", // 405k + "g", // 318k + "j", // 227k + "q", // 71k + "x", // 17k + ], + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_songs); +criterion_main!(benches); From 7086009f9350b2ef72c126a2b0ffa342b405869b Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 12:36:12 +0200 Subject: [PATCH 0719/1889] improve the base search --- milli/benches/songs.rs | 52 +++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index 71bc164ab..f6e36262d 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -35,17 +35,16 @@ fn base_conf(builder: &mut Settings) { const BASE_CONF: Conf = Conf { dataset: "smol-songs.csv", queries: &[ - "mingus ", - "thelonious monk ", - "Disneyland ", - "the white stripes ", - "indochine ", - "klub des loosers ", - "fear of the dark ", - "michel delpech ", - "stromae ", - "dire straits ", - "aretha franklin ", + "john ", // 9097 + "david ", // 4794 + "charles ", // 1957 + "david bowie ", // 1200 + "michael jackson ", // 600 + "thelonious monk ", // 303 + "charles mingus ", // 142 + "marcus miller ", // 60 + "tamo ", // 13 + "Notstandskomitee ", // 4 ], configure: base_conf, ..Conf::BASE @@ -64,6 +63,17 @@ fn bench_songs(c: &mut criterion::Criterion) { .chain(default_criterion.clone()) .collect(); + let basic_with_quote: Vec = BASE_CONF + .queries + .iter() + .map(|s| { + s.trim() + .split(' ') + .map(|s| format!(r#""{}""#, s)).collect::>().join(" ") + }) + .collect(); + let basic_with_quote: &[&str] = &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + let confs = &[ /* first we bench each criterion alone */ utils::Conf { @@ -108,7 +118,7 @@ fn bench_songs(c: &mut criterion::Criterion) { "seven nation mummy ", // one word to pop "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop - "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 16 + "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13 ], criterion: Some(&["words"]), ..BASE_CONF @@ -147,22 +157,16 @@ fn bench_songs(c: &mut criterion::Criterion) { }, utils::Conf { group_name: "basic without quote", - queries: &[ - "david bowie", // 1200 - "michael jackson", // 600 - "marcus miller", // 60 - "Notstandskomitee", // 4 - ], + queries: &BASE_CONF + .queries + .iter() + .map(|s| s.trim()) // we remove the space at the end of each request + .collect::>(), ..BASE_CONF }, utils::Conf { group_name: "basic with quote", - queries: &[ - "\"david\" \"bowie\"", // 1200 - "\"michael\" \"jackson\"", // 600 - "\"marcus\" \"miller\"", // 60 - "\"Notstandskomitee\"", // 4 - ], + queries: basic_with_quote, ..BASE_CONF }, utils::Conf { From 5d5d11560890bf27e4254686d6ba44c3aab5afcc Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 13:13:33 +0200 Subject: [PATCH 0720/1889] reformat all the files --- milli/benches/songs.rs | 48 ++++++++++++++++++++---------------------- milli/benches/utils.rs | 18 ++++++++++------ milli/benches/wiki.rs | 9 ++++++-- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs 
index f6e36262d..0c056d93f 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -35,15 +35,15 @@ fn base_conf(builder: &mut Settings) { const BASE_CONF: Conf = Conf { dataset: "smol-songs.csv", queries: &[ - "john ", // 9097 - "david ", // 4794 - "charles ", // 1957 - "david bowie ", // 1200 - "michael jackson ", // 600 - "thelonious monk ", // 303 - "charles mingus ", // 142 - "marcus miller ", // 60 - "tamo ", // 13 + "john ", // 9097 + "david ", // 4794 + "charles ", // 1957 + "david bowie ", // 1200 + "michael jackson ", // 600 + "thelonious monk ", // 303 + "charles mingus ", // 142 + "marcus miller ", // 60 + "tamo ", // 13 "Notstandskomitee ", // 4 ], configure: base_conf, @@ -69,10 +69,15 @@ fn bench_songs(c: &mut criterion::Criterion) { .map(|s| { s.trim() .split(' ') - .map(|s| format!(r#""{}""#, s)).collect::>().join(" ") + .map(|s| format!(r#""{}""#, s)) + .collect::>() + .join(" ") }) .collect(); - let basic_with_quote: &[&str] = &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + let basic_with_quote: &[&str] = &basic_with_quote + .iter() + .map(|s| s.as_str()) + .collect::>(); let confs = &[ /* first we bench each criterion alone */ @@ -113,10 +118,10 @@ fn bench_songs(c: &mut criterion::Criterion) { group_name: "words", queries: &[ "the black saint and the sinner lady and the good doggo ", // four words to pop - "les liaisons dangeureuses 1793 ", // one word to pop - "The Disneyland Children's Sing-Alone song ", // two words to pop - "seven nation mummy ", // one word to pop - "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop + "les liaisons dangeureuses 1793 ", // one word to pop + "The Disneyland Children's Sing-Alone song ", // two words to pop + "seven nation mummy ", // one word to pop + "7000 Danses / Le Baiser / je me trompe de mots ", // four words to pop "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13 ], @@ -133,7 +138,6 @@ fn bench_songs(c: &mut criterion::Criterion) { criterion: Some(&["desc(released-timestamp)"]), ..BASE_CONF }, - /* then we bench the asc and desc criterion on top of the default criterion */ utils::Conf { group_name: "asc + default", @@ -145,14 +149,11 @@ fn bench_songs(c: &mut criterion::Criterion) { criterion: Some(&desc_default[..]), ..BASE_CONF }, - /* the we bench some global / normal search with all the default criterion in the default * order */ utils::Conf { group_name: "basic placeholder", - queries: &[ - "", - ], + queries: &[""], ..BASE_CONF }, utils::Conf { @@ -173,14 +174,11 @@ fn bench_songs(c: &mut criterion::Criterion) { group_name: "prefix search", queries: &[ "s", // 500k+ results - "a", - "b", - "i", - "x", // only 7k results + "a", "b", "i", "x", // only 7k results ], ..BASE_CONF }, - ]; + ]; utils::run_benches(c, confs); } diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index b101adb63..4c8fb347d 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -1,8 +1,14 @@ -use std::{fs::{File, create_dir_all, remove_dir_all}, time::Duration}; +use std::{ + fs::{create_dir_all, remove_dir_all, File}, + time::Duration, +}; -use heed::EnvOpenOptions; use criterion::BenchmarkId; -use milli::{FacetCondition, Index, update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}}; +use heed::EnvOpenOptions; +use milli::{ + update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, + FacetCondition, Index, +}; 
pub struct Conf<'a> { /// where we are going to create our database.mmdb directory @@ -11,7 +17,7 @@ pub struct Conf<'a> { /// the dataset to be used, it must be an uncompressed csv pub dataset: &'a str, pub group_name: &'a str, - pub queries: &'a[&'a str], + pub queries: &'a [&'a str], /// here you can change which criterion are used and in which order. /// - if you specify something all the base configuration will be thrown out /// - if you don't specify anything (None) the default configuration will be kept @@ -36,7 +42,6 @@ impl Conf<'_> { facet_condition: None, optional_words: true, }; - } pub fn base_setup(conf: &Conf) -> Index { @@ -77,7 +82,8 @@ pub fn base_setup(conf: &Conf) -> Index { builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli let dataset_path = format!("benches/{}", conf.dataset); - let reader = File::open(&dataset_path).expect(&format!("could not find the dataset in: {}", &dataset_path)); + let reader = File::open(&dataset_path) + .expect(&format!("could not find the dataset in: {}", &dataset_path)); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/benches/wiki.rs b/milli/benches/wiki.rs index fc8af02e5..d876814a9 100644 --- a/milli/benches/wiki.rs +++ b/milli/benches/wiki.rs @@ -38,10 +38,15 @@ fn bench_songs(c: &mut criterion::Criterion) { .map(|s| { s.trim() .split(' ') - .map(|s| format!(r#""{}""#, s)).collect::>().join(" ") + .map(|s| format!(r#""{}""#, s)) + .collect::>() + .join(" ") }) .collect(); - let basic_with_quote: &[&str] = &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + let basic_with_quote: &[&str] = &basic_with_quote + .iter() + .map(|s| s.as_str()) + .collect::>(); let confs = &[ /* first we bench each criterion alone */ From 7c7fba4e577edacf668f8cec6a24e0f48cea854d Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 14:39:36 +0200 Subject: [PATCH 0721/1889] remove the time limitation to let criterion do what it wants --- milli/benches/utils.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 4c8fb347d..3d91f726a 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -95,7 +95,6 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let index = base_setup(conf); let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); - group.measurement_time(Duration::from_secs(10)); for &query in conf.queries { group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { From e5dfde88fd042708d3765dd703e542c7a3a0a512 Mon Sep 17 00:00:00 2001 From: tamo Date: Wed, 14 Apr 2021 16:26:21 +0200 Subject: [PATCH 0722/1889] fix the facets conditions --- milli/benches/songs.rs | 24 +++++++++++++++++++++++- milli/benches/utils.rs | 10 ++++------ milli/benches/wiki.rs | 2 +- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index 0c056d93f..8ef6df3c8 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -138,6 +138,7 @@ fn bench_songs(c: &mut criterion::Criterion) { criterion: Some(&["desc(released-timestamp)"]), ..BASE_CONF }, + /* then we bench the asc and desc criterion on top of the default criterion */ utils::Conf { group_name: "asc + default", @@ -149,6 +150,24 @@ fn bench_songs(c: &mut criterion::Criterion) { criterion: Some(&desc_default[..]), ..BASE_CONF }, + + /* we bench the filters with the default request */ + 
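// Note: the filter strings in the configurations below are parsed at search
// time with `FacetCondition::from_str(&rtxn, &index, expr)` (see the utils.rs
// change in this same patch). They exercise comparisons (`<=`, `=`, `!=`),
// ranges (`low TO high`) and the combinators `AND`, `OR` and `NOT` over the
// faceted fields declared in `base_conf`.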
utils::Conf { + group_name: "basic filter: <=", + facet_condition: Some("released-timestamp <= 946728000"), // year 2000 + ..BASE_CONF + }, + utils::Conf { + group_name: "basic filter: TO", + facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010 + ..BASE_CONF + }, + utils::Conf { + group_name: "big filter", + facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"), + ..BASE_CONF + }, + /* the we bench some global / normal search with all the default criterion in the default * order */ utils::Conf { @@ -174,7 +193,10 @@ fn bench_songs(c: &mut criterion::Criterion) { group_name: "prefix search", queries: &[ "s", // 500k+ results - "a", "b", "i", "x", // only 7k results + "a", // + "b", // + "i", // + "x", // only 7k results ], ..BASE_CONF }, diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 3d91f726a..460623ab5 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -1,7 +1,4 @@ -use std::{ - fs::{create_dir_all, remove_dir_all, File}, - time::Duration, -}; +use std::fs::{create_dir_all, remove_dir_all, File}; use criterion::BenchmarkId; use heed::EnvOpenOptions; @@ -24,7 +21,7 @@ pub struct Conf<'a> { pub criterion: Option<&'a [&'a str]>, /// the last chance to configure your database as you want pub configure: fn(&mut Settings), - pub facet_condition: Option, + pub facet_condition: Option<&'a str>, /// enable or disable the optional words on the query pub optional_words: bool, } @@ -102,7 +99,8 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let rtxn = index.read_txn().unwrap(); let mut search = index.search(&rtxn); search.query(query).optional_words(conf.optional_words); - if let Some(facet_condition) = conf.facet_condition.clone() { + if let Some(facet_condition) = conf.facet_condition { + let facet_condition = FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); search.facet_condition(facet_condition); } let _ids = search.execute().unwrap(); diff --git a/milli/benches/wiki.rs b/milli/benches/wiki.rs index d876814a9..8c15f11ca 100644 --- a/milli/benches/wiki.rs +++ b/milli/benches/wiki.rs @@ -25,7 +25,7 @@ const BASE_CONF: Conf = Conf { "spain ", // 7002 "japan ", // 10.593 "france ", // 17.616 - "film ", // 24.959 + "film ", // 24.959 ], configure: base_conf, ..Conf::BASE From 4969abeaab9711c953aea88efa65316c23756bd0 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 27 Apr 2021 15:02:14 +0200 Subject: [PATCH 0723/1889] update the facets for the benchmarks --- milli/benches/songs.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/benches/songs.rs b/milli/benches/songs.rs index 8ef6df3c8..430b73a40 100644 --- a/milli/benches/songs.rs +++ b/milli/benches/songs.rs @@ -20,8 +20,8 @@ fn base_conf(builder: &mut Settings) { builder.set_searchable_fields(searchable_fields); let faceted_fields = [ - ("released-timestamp", "integer"), - ("duration-float", "float"), + ("released-timestamp", "number"), + ("duration-float", "number"), ("genre", "string"), ("country", "string"), ("artist", "string"), From 3c84075d2d38b707c4df77deb960341f8c4bbedc Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 27 Apr 2021 15:41:16 +0200 Subject: [PATCH 0724/1889] uses an env variable to find the datasets --- milli/benches/README.md | 12 ++++++++++++ milli/benches/utils.rs | 17 +++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git 
a/milli/benches/README.md b/milli/benches/README.md index 9b53fc0d1..b2c1aec15 100644 --- a/milli/benches/README.md +++ b/milli/benches/README.md @@ -13,3 +13,15 @@ You can run the following command from the root of this git repository ``` wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz ``` + +- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h +- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h +- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h + +By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that: +``` +MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs +``` + +Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html) + diff --git a/milli/benches/utils.rs b/milli/benches/utils.rs index 460623ab5..f3f5e9bf6 100644 --- a/milli/benches/utils.rs +++ b/milli/benches/utils.rs @@ -7,6 +7,15 @@ use milli::{ FacetCondition, Index, }; +/// The name of the environment variable used to select the path +/// of the directory containing the datasets +const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; + +/// The default path for the dataset if nothing is specified +/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be +/// executed with a pwd of `milli/milli` +const DEFAULT_DATASETS_PATH: &str = "milli/benches"; + pub struct Conf<'a> { /// where we are going to create our database.mmdb directory /// each benchmark will first try to delete it and then recreate it @@ -78,7 +87,10 @@ pub fn base_setup(conf: &Conf) -> Index { builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); // we called from cargo the current directory is supposed to be milli/milli - let dataset_path = format!("benches/{}", conf.dataset); + let base_dataset_path = std::env::vars() + .find(|var| var.0 == BASE_DATASETS_PATH_KEY) + .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value); + let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset); let reader = File::open(&dataset_path) .expect(&format!("could not find the dataset in: {}", &dataset_path)); builder.execute(reader, |_, _| ()).unwrap(); @@ -100,7 +112,8 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let mut search = index.search(&rtxn); search.query(query).optional_words(conf.optional_words); if let Some(facet_condition) = conf.facet_condition { - let facet_condition = FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); + let facet_condition = + FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); search.facet_condition(facet_condition); } let _ids = search.execute().unwrap(); From 06c414a75388bd34f7dc3e768f433e0b5bafac23 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 25 May 2021 17:09:14 +0200 Subject: [PATCH 0725/1889] move the benchmarks to another crate so we can download the datasets automatically without adding overhead to the build of milli --- Cargo.toml | 2 +- benchmarks/Cargo.toml | 29 ++++++++++ benchmarks/README.md | 30 ++++++++++ {milli => benchmarks}/benches/songs.rs | 
3 +- {milli => benchmarks}/benches/utils.rs | 21 ++----- {milli => benchmarks}/benches/wiki.rs | 3 +- benchmarks/build.rs | 80 ++++++++++++++++++++++++++ benchmarks/src/lib.rs | 5 ++ milli/Cargo.toml | 9 --- milli/benches/README.md | 27 --------- 10 files changed, 154 insertions(+), 55 deletions(-) create mode 100644 benchmarks/Cargo.toml create mode 100644 benchmarks/README.md rename {milli => benchmarks}/benches/songs.rs (99%) rename {milli => benchmarks}/benches/utils.rs (81%) rename {milli => benchmarks}/benches/wiki.rs (98%) create mode 100644 benchmarks/build.rs create mode 100644 benchmarks/src/lib.rs delete mode 100644 milli/benches/README.md diff --git a/Cargo.toml b/Cargo.toml index a60c293e3..ff0b2582a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["milli", "http-ui", "infos", "helpers", "search"] +members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"] default-members = ["milli"] [profile.release] diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml new file mode 100644 index 000000000..f7b66fe3a --- /dev/null +++ b/benchmarks/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "benchmarks" +version = "0.1.0" +edition = "2018" +publish = false + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +milli = { path = "../milli" } + +[dev-dependencies] +heed = "*" # we want to use the version milli uses +criterion = "0.3.4" + +[build-dependencies] +anyhow = "1.0" +bytes = "1.0" +flate2 = "1.0.20" +convert_case = "0.4" +reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false } + +[[bench]] +name = "songs" +harness = false + +[[bench]] +name = "wiki" +harness = false diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..8c91700e9 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,30 @@ +Benchmarks +========== + +For our benchmark we are using a small subset of the dataset `songs.csv`. It was generated with this command: +``` +xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv +``` +You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz) +And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). + +We also use a subset of `wikipedia-articles.csv` that was generated with the following command: +``` +xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv +``` +You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz). + +----- + +- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h +- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h +- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h + +By default the benchmarks will be downloaded and uncompressed automatically in the target directory. 
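Here the "target directory" is the Cargo build-script `OUT_DIR`. As a minimal sketch (assuming the `anyhow` crate, which is already in the build dependencies above), the directory resolution performed by the new `benchmarks/build.rs` shown further down in this patch boils down to:

```rust
use std::{env, path::PathBuf};

fn main() -> anyhow::Result<()> {
    // Prefer the user-provided directory; fall back to Cargo's OUT_DIR,
    // which is only set while a build script is running.
    let out_dir = PathBuf::from(
        env::var("MILLI_BENCH_DATASETS_PATH").unwrap_or(env::var("OUT_DIR")?),
    );
    println!("datasets are downloaded and uncompressed into: {}", out_dir.display());
    Ok(())
}
```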
+If you don't want to download the datasets everytime you updates something on the code you can specify a custom directory with the env variable `MILLI_BENCH_DATASETS_PATH`: +``` +mkdir ~/datasets +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded +touch build.rs +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded +``` diff --git a/milli/benches/songs.rs b/benchmarks/benches/songs.rs similarity index 99% rename from milli/benches/songs.rs rename to benchmarks/benches/songs.rs index 430b73a40..dd52a0afc 100644 --- a/milli/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -1,3 +1,4 @@ +mod datasets_paths; mod utils; use criterion::{criterion_group, criterion_main}; @@ -33,7 +34,7 @@ fn base_conf(builder: &mut Settings) { } const BASE_CONF: Conf = Conf { - dataset: "smol-songs.csv", + dataset: datasets_paths::SMOL_SONGS, queries: &[ "john ", // 9097 "david ", // 4794 diff --git a/milli/benches/utils.rs b/benchmarks/benches/utils.rs similarity index 81% rename from milli/benches/utils.rs rename to benchmarks/benches/utils.rs index f3f5e9bf6..e0feb9b0e 100644 --- a/milli/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -7,15 +7,6 @@ use milli::{ FacetCondition, Index, }; -/// The name of the environment variable used to select the path -/// of the directory containing the datasets -const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; - -/// The default path for the dataset if nothing is specified -/// By default we chose `milli/benches` because any cargo command ran in `milli/milli/**` will be -/// executed with a pwd of `milli/milli` -const DEFAULT_DATASETS_PATH: &str = "milli/benches"; - pub struct Conf<'a> { /// where we are going to create our database.mmdb directory /// each benchmark will first try to delete it and then recreate it @@ -33,6 +24,8 @@ pub struct Conf<'a> { pub facet_condition: Option<&'a str>, /// enable or disable the optional words on the query pub optional_words: bool, + /// primary key, if there is None we'll auto-generate docids for every documents + pub primary_key: Option<&'a str>, } impl Conf<'_> { @@ -47,6 +40,7 @@ impl Conf<'_> { configure: Self::nop, facet_condition: None, optional_words: true, + primary_key: None, }; } @@ -86,13 +80,8 @@ pub fn base_setup(conf: &Conf) -> Index { let mut builder = update_builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - // we called from cargo the current directory is supposed to be milli/milli - let base_dataset_path = std::env::vars() - .find(|var| var.0 == BASE_DATASETS_PATH_KEY) - .map_or(DEFAULT_DATASETS_PATH.to_owned(), |(_key, value)| value); - let dataset_path = format!("{}/{}", base_dataset_path, conf.dataset); - let reader = File::open(&dataset_path) - .expect(&format!("could not find the dataset in: {}", &dataset_path)); + let reader = File::open(conf.dataset) + .expect(&format!("could not find the dataset in: {}", conf.dataset)); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/benches/wiki.rs b/benchmarks/benches/wiki.rs similarity index 98% rename from milli/benches/wiki.rs rename to benchmarks/benches/wiki.rs index 8c15f11ca..99ecff2ce 100644 --- a/milli/benches/wiki.rs +++ b/benchmarks/benches/wiki.rs @@ -1,3 +1,4 @@ +mod datasets_paths; mod utils; use criterion::{criterion_group, criterion_main}; @@ -16,7 +17,7 @@ fn 
base_conf(builder: &mut Settings) { } const BASE_CONF: Conf = Conf { - dataset: "smol-wiki-articles.csv", + dataset: datasets_paths::SMOL_WIKI_ARTICLES, queries: &[ "mingus ", // 46 candidates "miles davis ", // 159 diff --git a/benchmarks/build.rs b/benchmarks/build.rs new file mode 100644 index 000000000..dc92a1a4c --- /dev/null +++ b/benchmarks/build.rs @@ -0,0 +1,80 @@ +use std::path::{Path, PathBuf}; +use std::{env, fs}; +use std::{ + fs::File, + io::{Cursor, Read, Seek, Write}, +}; + +use bytes::Bytes; +use convert_case::{Case, Casing}; +use flate2::read::GzDecoder; +use reqwest::IntoUrl; + +const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks"; + +const DATASET_SONGS: &str = "smol-songs"; +const DATASET_WIKI: &str = "smol-wiki-articles"; + +/// The name of the environment variable used to select the path +/// of the directory containing the datasets +const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; + +fn main() -> anyhow::Result<()> { + let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?)); + + let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches"); + let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?; + writeln!( + manifest_paths_file, + r#"//! This file is generated by the build script. +//! Do not modify by hand, use the build.rs file. +#![allow(dead_code)] +"# + )?; + writeln!(manifest_paths_file)?; + + for dataset in &[DATASET_SONGS, DATASET_WIKI] { + let out_path = out_dir.join(dataset); + let out_file = out_path.with_extension("csv"); + + writeln!( + &mut manifest_paths_file, + r#"pub const {}: &str = {:?};"#, + dataset.to_case(Case::ScreamingSnake), + out_file.display(), + )?; + + if out_file.exists() { + eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset); + continue; + } + let url = format!("{}/{}.csv.gz", BASE_URL, dataset); + eprintln!("downloading: {}", url); + let bytes = download_dataset(url.clone())?; + eprintln!("{} downloaded successfully", url); + eprintln!("uncompressing in {}", out_path.display()); + uncompress_in_file(bytes, &out_file)?; + } + + Ok(()) +} + +fn download_dataset(url: U) -> anyhow::Result> { + let bytes = reqwest::blocking::Client::builder() + .timeout(None) + .build()? + .get(url) + .send()? + .bytes()?; + Ok(Cursor::new(bytes)) +} + +fn uncompress_in_file>(bytes: R, path: P) -> anyhow::Result<()> { + let path = path.as_ref(); + let mut gz = GzDecoder::new(bytes); + let mut dataset = Vec::new(); + gz.read_to_end(&mut dataset)?; + + fs::write(path, dataset)?; + Ok(()) +} diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs new file mode 100644 index 000000000..4281ec115 --- /dev/null +++ b/benchmarks/src/lib.rs @@ -0,0 +1,5 @@ +//! This library is only used to isolate the benchmarks +//! from the original milli library. +//! +//! It does not include interesting functions for milli library +//! users only for milli contributors. 
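For reference, the `benches/datasets_paths.rs` module generated by this build script ends up looking like the sketch below: the constant names are the dataset names converted to screaming snake case, and since the actual paths depend on the machine's `OUT_DIR` or `MILLI_BENCH_DATASETS_PATH`, the values here are placeholders:

```rust
//! This file is generated by the build script.
//! Do not modify by hand, use the build.rs file.
#![allow(dead_code)]

pub const SMOL_SONGS: &str = "/path/to/out/smol-songs.csv";
pub const SMOL_WIKI_ARTICLES: &str = "/path/to/out/smol-wiki-articles.csv";
```

This is also why `benches/songs.rs` and `benches/wiki.rs` can refer to `datasets_paths::SMOL_SONGS` and `datasets_paths::SMOL_WIKI_ARTICLES` instead of hard-coding the file locations.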
diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1c0f74613..2af6a9042 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -53,17 +53,8 @@ tinytemplate = "=1.1.0" [dev-dependencies] big_s = "1.0.2" -criterion = "0.3.4" maplit = "1.0.2" rand = "0.8.3" [features] default = [] - -[[bench]] -name = "songs" -harness = false - -[[bench]] -name = "wiki" -harness = false diff --git a/milli/benches/README.md b/milli/benches/README.md deleted file mode 100644 index b2c1aec15..000000000 --- a/milli/benches/README.md +++ /dev/null @@ -1,27 +0,0 @@ -Benchmarks -========== - -For our benchmark we are using a small subset of the dataset songs.csv. It was generated with this command: -``` -xsv sample --seed 42 song.csv -o smol-songs.csv -``` -You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz) -And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). - -You need to put this file in the current directory: `milli/milli/benches/smol-songs.csv.gz` -You can run the following command from the root of this git repository -``` -wget https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz -O milli/benches/smol-songs.csv.gz -``` - -- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h -- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h -- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h - -By default the benchmarks expect the datasets to be uncompressed and present in `milli/milli/benches`, but you can also specify your own path with the environment variable `MILLI_BENCH_DATASETS_PATH` like that: -``` -MILLI_BENCH_DATASETS_PATH=~/Downloads/datasets cargo bench --bench songs -``` - -Our benchmarking suite uses criterion which allow you to do a lot of configuration, see the documentation [here](https://bheisler.github.io/criterion.rs/book/user_guide/user_guide.html) - From 4536dfccd05c522b3fc720e3d3dc51a2c68a6d65 Mon Sep 17 00:00:00 2001 From: tamo Date: Tue, 25 May 2021 17:55:45 +0200 Subject: [PATCH 0726/1889] add a way to provide primary_key or autogenerate documents ids --- Cargo.lock | 346 ++++++++++++++++++++++++++++++++++-- benchmarks/benches/songs.rs | 1 + benchmarks/benches/utils.rs | 7 + 3 files changed, 338 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0b1da2b3f..04fd284c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,20 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" +[[package]] +name = "benchmarks" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes 1.0.1", + "convert_case", + "criterion", + "flate2", + "heed", + "milli", + "reqwest", +] + [[package]] name = "big_s" version = "1.0.2" @@ -327,6 +341,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "cow-utils" version = "0.1.2" @@ -506,6 +526,15 @@ version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" +[[package]] +name = "encoding_rs" +version = "0.8.28" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "fake-simd" version = "0.1.2" @@ -750,12 +779,31 @@ dependencies = [ "http", "indexmap", "slab", - "tokio", - "tokio-util", + "tokio 0.2.25", + "tokio-util 0.3.1", "tracing", "tracing-futures", ] +[[package]] +name = "h2" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" +dependencies = [ + "bytes 1.0.1", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio 1.6.0", + "tokio-util 0.6.7", + "tracing", +] + [[package]] name = "half" version = "1.7.1" @@ -893,6 +941,17 @@ dependencies = [ "http", ] +[[package]] +name = "http-body" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" +dependencies = [ + "bytes 1.0.1", + "http", + "pin-project-lite 0.2.6", +] + [[package]] name = "http-ui" version = "0.2.1" @@ -922,7 +981,7 @@ dependencies = [ "stderrlog", "structopt", "tempfile", - "tokio", + "tokio 0.2.25", "warp", ] @@ -960,20 +1019,59 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2", + "h2 0.2.7", "http", - "http-body", + "http-body 0.3.1", "httparse", "httpdate", "itoa", "pin-project 1.0.5", - "socket2", - "tokio", + "socket2 0.3.19", + "tokio 0.2.25", "tower-service", "tracing", "want", ] +[[package]] +name = "hyper" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1" +dependencies = [ + "bytes 1.0.1", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.3", + "http", + "http-body 0.4.2", + "httparse", + "httpdate", + "itoa", + "pin-project 1.0.5", + "socket2 0.4.0", + "tokio 1.6.0", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64" +dependencies = [ + "futures-util", + "hyper 0.14.5", + "log", + "rustls", + "tokio 1.6.0", + "tokio-rustls", + "webpki", +] + [[package]] name = "idna" version = "0.2.2" @@ -1029,6 +1127,12 @@ dependencies = [ "libc", ] +[[package]] +name = "ipnet" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" + [[package]] name = "itertools" version = "0.9.0" @@ -1261,7 +1365,6 @@ dependencies = [ "bstr", "byteorder", "chrono", - "criterion", "crossbeam-channel", "csv", "either", @@ -1343,6 +1446,19 @@ dependencies = [ "winapi 0.2.8", ] +[[package]] +name = "mio" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956" +dependencies = [ + "libc", + "log", + "miow 0.3.7", + "ntapi", + "winapi 0.3.9", +] + [[package]] name = "mio-named-pipes" version = "0.1.7" @@ -1350,7 +1466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" dependencies = [ "log", - "mio", + "mio 0.6.23", "miow 0.3.7", "winapi 0.3.9", ] @@ -1363,7 +1479,7 @@ checksum = 
"afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" dependencies = [ "iovec", "libc", - "mio", + "mio 0.6.23", ] [[package]] @@ -1441,6 +1557,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi 0.3.9", +] + [[package]] name = "num-integer" version = "0.1.44" @@ -1956,12 +2081,62 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "reqwest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124" +dependencies = [ + "base64 0.13.0", + "bytes 1.0.1", + "encoding_rs", + "futures-core", + "futures-util", + "http", + "http-body 0.4.2", + "hyper 0.14.5", + "hyper-rustls", + "ipnet", + "js-sys", + "lazy_static", + "log", + "mime", + "percent-encoding", + "pin-project-lite 0.2.6", + "rustls", + "serde", + "serde_urlencoded 0.7.0", + "tokio 1.6.0", + "tokio-rustls", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", + "winreg", +] + [[package]] name = "retain_mut" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi 0.3.9", +] + [[package]] name = "roaring" version = "0.6.6" @@ -1982,6 +2157,19 @@ dependencies = [ "semver", ] +[[package]] +name = "rustls" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" +dependencies = [ + "base64 0.13.0", + "log", + "ring", + "sct", + "webpki", +] + [[package]] name = "ryu" version = "1.0.5" @@ -2015,6 +2203,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "sct" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "search" version = "0.2.1" @@ -2108,6 +2306,18 @@ dependencies = [ "url", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "sha-1" version = "0.8.2" @@ -2193,6 +2403,22 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "socket2" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e3dfc207c526015c632472a77be09cf1b6e46866581aecae5cc38fb4235dea2" +dependencies = [ + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + [[package]] name = "static_assertions" version = "1.1.0" @@ -2386,7 +2612,7 @@ 
dependencies = [ "lazy_static", "libc", "memchr", - "mio", + "mio 0.6.23", "mio-named-pipes", "mio-uds", "num_cpus", @@ -2397,6 +2623,21 @@ dependencies = [ "winapi 0.3.9", ] +[[package]] +name = "tokio" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" +dependencies = [ + "autocfg", + "bytes 1.0.1", + "libc", + "memchr", + "mio 0.7.11", + "num_cpus", + "pin-project-lite 0.2.6", +] + [[package]] name = "tokio-macros" version = "0.2.6" @@ -2408,6 +2649,17 @@ dependencies = [ "syn 1.0.64", ] +[[package]] +name = "tokio-rustls" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" +dependencies = [ + "rustls", + "tokio 1.6.0", + "webpki", +] + [[package]] name = "tokio-tungstenite" version = "0.11.0" @@ -2417,7 +2669,7 @@ dependencies = [ "futures-util", "log", "pin-project 0.4.27", - "tokio", + "tokio 0.2.25", "tungstenite", ] @@ -2432,7 +2684,21 @@ dependencies = [ "futures-sink", "log", "pin-project-lite 0.1.12", - "tokio", + "tokio 0.2.25", +] + +[[package]] +name = "tokio-util" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592" +dependencies = [ + "bytes 1.0.1", + "futures-core", + "futures-sink", + "log", + "pin-project-lite 0.2.6", + "tokio 1.6.0", ] [[package]] @@ -2578,6 +2844,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + [[package]] name = "url" version = "2.2.1" @@ -2654,7 +2926,7 @@ dependencies = [ "futures", "headers", "http", - "hyper", + "hyper 0.13.10", "log", "mime", "mime_guess", @@ -2663,8 +2935,8 @@ dependencies = [ "scoped-tls", "serde", "serde_json", - "serde_urlencoded", - "tokio", + "serde_urlencoded 0.6.1", + "tokio 0.2.25", "tokio-tungstenite", "tower-service", "tracing", @@ -2691,6 +2963,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe" dependencies = [ "cfg-if 1.0.0", + "serde", + "serde_json", "wasm-bindgen-macro", ] @@ -2709,6 +2983,18 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.72" @@ -2748,6 +3034,25 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "webpki-roots" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940" +dependencies = [ + "webpki", +] + [[package]] name = "whatlang" version = "0.9.0" @@ -2800,6 +3105,15 @@ 
version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "winreg" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +dependencies = [ + "winapi 0.3.9", +] + [[package]] name = "ws2_32-sys" version = "0.2.1" diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs index dd52a0afc..dea8cd605 100644 --- a/benchmarks/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -48,6 +48,7 @@ const BASE_CONF: Conf = Conf { "Notstandskomitee ", // 4 ], configure: base_conf, + primary_key: Some("id"), ..Conf::BASE }; diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index e0feb9b0e..6fa5f2d19 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -56,6 +56,10 @@ pub fn base_setup(conf: &Conf) -> Index { options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(10); let index = Index::new(options, conf.database_name).unwrap(); + if let Some(primary_key) = conf.primary_key { + let mut wtxn = index.write_txn().unwrap(); + index.put_primary_key(&mut wtxn, primary_key).unwrap(); + } let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); @@ -78,6 +82,9 @@ pub fn base_setup(conf: &Conf) -> Index { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.index_documents(&mut wtxn, &index); + if let None = conf.primary_key { + builder.enable_autogenerate_docids(); + } builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); let reader = File::open(conf.dataset) From 0d0e900158cba3c450c99e241db18c224d1a4a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 26 May 2021 15:57:22 +0200 Subject: [PATCH 0727/1889] Add CI for benchmarks --- .github/workflows/benchmarks.yml | 63 +++++++++++++++++++ benchmarks/README.md | 101 +++++++++++++++++++++++++++---- benchmarks/scripts/compare.sh | 58 ++++++++++++++++++ 3 files changed, 209 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/benchmarks.yml create mode 100644 benchmarks/scripts/compare.sh diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 000000000..867e13132 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,63 @@ +name: Benchmarks + +on: + workflow_dispatch: + inputs: + dataset_name: + description: 'The name of the dataset used to benchmark (songs or wiki)' + required: false + default: 'songs' + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + id: commit_sha + - name: Set file basename with format 
"dataset_branch_commitSHA" + shell: bash + run: echo "##[set-output name=basename;]$(echo ${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmpf files + - name: Install critcmp + run: cargo install critcmp + - name: Export cripcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results diff --git a/benchmarks/README.md b/benchmarks/README.md index 8c91700e9..cde4062e5 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,30 +1,105 @@ Benchmarks ========== -For our benchmark we are using a small subset of the dataset `songs.csv`. It was generated with this command: -``` +## TOC + +- [Datasets](#datasets) +- [Run the benchmarks](#run-the-benchmarks) +- [Comparison between benchmarks](#comparison-between-benchmarks) + +## Datasets + +The benchmarks are available for the following datasets: +- `songs` +- `wiki` + +### Songs + +`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). + +It was generated with this command: + +```bash xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv ``` -You can download it [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz) -And the original `songs.csv` dataset is available [here](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). -We also use a subset of `wikipedia-articles.csv` that was generated with the following command: -``` +_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._ + +### Wiki + +`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz). + +It was generated with the following command: + +```bash xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv ``` -You can download the original [here](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz) and the subset [here](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz). ------ +_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz)._ -- To run all the benchmarks we recommand using `cargo bench`, this should takes around ~4h -- You can also run the benchmarks on the `songs` dataset with `cargo bench --bench songs`, it should takes around 1h -- And on the `wiki` dataset with `cargo bench --bench wiki`, it should takes around 3h +## Run the benchmarks -By default the benchmarks will be downloaded and uncompressed automatically in the target directory. 
-If you don't want to download the datasets everytime you updates something on the code you can specify a custom directory with the env variable `MILLI_BENCH_DATASETS_PATH`: +### On our private server + +The Meili team has self-hosted his own GitHub runner to run benchmarks on our dedicated bare metal server. + +To trigger the benchmark workflow: +- Go to the `Actions` tab of this repository. +- Select the `Benchmarks` workflow on the left. +- Click on `Run workflow` in the blue banner. +- Select the branch on which you want to run the benchmarks and select the dataset you want (default: `songs`). +- Finally, click on `Run workflow`. + +This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3). + +_[More about critcmp](https://github.com/BurntSushi/critcmp)._ + +### On your machine + +To run all the benchmarks (~4h): + +```bash +cargo bench ``` + +To run only the `songs` (~1h) or `wiki` (~3h) benchmark: + +```bash +cargo bench --bench +``` + +By default, the benchmarks will be downloaded and uncompressed automatically in the target directory.
+If you don't want to download the datasets every time you update something on the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`: + +```bash mkdir ~/datasets MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded touch build.rs MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded ``` + +## Comparison between benchmarks + +The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to generate comparison results between 2 benchmarks. + +We provide a script to download and display the comparison report. + +Requirements: +- [`s3cmd`](https://github.com/s3tools/s3cmd) and being logged to the DigitalOcean Space "milli-benchmarks". See the [DigitalOcean guide](https://docs.digitalocean.com/products/spaces/resources/s3cmd/) +- [`critcmp`](https://github.com/BurntSushi/critcmp) + +List the available file in the DO Space: + +```bash +s3cmd ls s3://milli-benchmarks/critcmp_results/ +``` +```bash +2021-05-31 14:40 279890 s3://milli-benchmarks/critcmp_results/songs_main_09a4321.json +2021-05-31 13:49 279576 s3://milli-benchmarks/critcmp_results/songs_geosearch_24ec456.json +``` + +Run the comparison script: + +```bash +bash benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json +``` diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh new file mode 100644 index 000000000..868baeacf --- /dev/null +++ b/benchmarks/scripts/compare.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Requirements: +# - s3cmd and being logged to the DO Space "milli-benchmarks". See: https://docs.digitalocean.com/products/spaces/resources/s3cmd/ +# - critcmp. See: https://github.com/BurntSushi/critcmp + +# Usage +# $ bash compare.sh json_file1 json_file1 +# ex: bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json + +# Checking that critcmp is installed +command -v critcmp > /dev/null 2>&1 +if [[ "$?" -ne 0 ]]; then + echo 'You must install critcmp to make this script working.' + echo '$ cargo install critcmp' + echo 'See: https://github.com/BurntSushi/critcmp' + exit 1 +fi + +# Checking that s3cmd is installed +command -v s3cmd > /dev/null 2>&1 +if [[ "$?" -ne 0 ]]; then + echo 'You must install s3cmd to make this script working.' + echo 'See: https://github.com/s3tools/s3cmd' + exit 1 +fi + +if [[ $# -ne 2 ]] + then + echo 'Need 2 arguments.' + echo 'Usage: ' + echo ' $ bash compare.sh file_to_download1 file_to_download2' + echo 'Ex:' + echo ' $ bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' + exit 1 +fi + +file1="$1" +file2="$2" +s3_path='s3://milli-benchmarks/critcmp_results' +file1_s3_path="$s3_path/$file1" +file2_s3_path="$s3_path/$file2" +file1_local_path="/tmp/$file1" +file2_local_path="/tmp/$file2" + +if [[ ! -f "$file1_local_path" ]]; then + s3cmd get "$file1_s3_path" "$file1_local_path" +else + echo "$file1 already present in /tmp, no need to download." +fi + +if [[ ! -f "$file2_local_path" ]]; then + s3cmd get "$file2_s3_path" "$file2_local_path" +else + echo "$file2 already present in /tmp, no need to download." 
+fi + +critcmp --color always "$file1_local_path" "$file2_local_path" From b3c0d438902ab069b4e4e9492c8c2c7e5ff87ac1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 13:56:11 +0200 Subject: [PATCH 0728/1889] Update benchmarks/scripts/compare.sh Co-authored-by: Irevoire --- benchmarks/scripts/compare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 868baeacf..6bd260122 100644 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # Requirements: # - s3cmd and being logged to the DO Space "milli-benchmarks". See: https://docs.digitalocean.com/products/spaces/resources/s3cmd/ From 57ed96622b92e066f0b1220e32c09161bf84bf7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 13:56:26 +0200 Subject: [PATCH 0729/1889] Update benchmarks/scripts/compare.sh Co-authored-by: Irevoire --- benchmarks/scripts/compare.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 6bd260122..4d3205c96 100644 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -45,6 +45,10 @@ file2_local_path="/tmp/$file2" if [[ ! -f "$file1_local_path" ]]; then s3cmd get "$file1_s3_path" "$file1_local_path" + if [[ "$?" -ne 0 ]]; then + echo 's3cmd command failed. Check your configuration' + exit 1 + fi else echo "$file1 already present in /tmp, no need to download." fi From 61fe422a884fe299a36ee3a0c455beef6648825b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 13:56:42 +0200 Subject: [PATCH 0730/1889] Update benchmarks/scripts/compare.sh Co-authored-by: Irevoire --- benchmarks/scripts/compare.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 4d3205c96..02f903bee 100644 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -55,6 +55,10 @@ fi if [[ ! -f "$file2_local_path" ]]; then s3cmd get "$file2_s3_path" "$file2_local_path" + if [[ "$?" -ne 0 ]]; then + echo 's3cmd command failed. Check your configuration' + exit 1 + fi else echo "$file2 already present in /tmp, no need to download." fi From bc4f4ee829fba22ba766e9b9f5a1a8f1a3d8bc79 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 1 Jun 2021 14:43:47 +0200 Subject: [PATCH 0731/1889] remove s3cmd as a dependency and provide a script to list all the available benchmarks --- benchmarks/scripts/compare.sh | 23 +++++++++++------------ benchmarks/scripts/list.sh | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) mode change 100644 => 100755 benchmarks/scripts/compare.sh create mode 100755 benchmarks/scripts/list.sh diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh old mode 100644 new mode 100755 index 02f903bee..e4231131d --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash # Requirements: -# - s3cmd and being logged to the DO Space "milli-benchmarks". See: https://docs.digitalocean.com/products/spaces/resources/s3cmd/ # - critcmp. See: https://github.com/BurntSushi/critcmp +# - wget # Usage # $ bash compare.sh json_file1 json_file1 @@ -17,11 +17,10 @@ if [[ "$?" 
-ne 0 ]]; then exit 1 fi -# Checking that s3cmd is installed -command -v s3cmd > /dev/null 2>&1 +# Checking that wget is installed +command -v wget > /dev/null 2>&1 if [[ "$?" -ne 0 ]]; then - echo 'You must install s3cmd to make this script working.' - echo 'See: https://github.com/s3tools/s3cmd' + echo 'You must install wget to make this script working.' exit 1 fi @@ -37,16 +36,16 @@ fi file1="$1" file2="$2" -s3_path='s3://milli-benchmarks/critcmp_results' -file1_s3_path="$s3_path/$file1" -file2_s3_path="$s3_path/$file2" +s3_url='https://milli-benchmarks.fra1.digitaloceanspaces.com/critcmp_results' +file1_s3_url="$s3_url/$file1" +file2_s3_url="$s3_url/$file2" file1_local_path="/tmp/$file1" file2_local_path="/tmp/$file2" if [[ ! -f "$file1_local_path" ]]; then - s3cmd get "$file1_s3_path" "$file1_local_path" + wget "$file1_s3_url" -O "$file1_local_path" if [[ "$?" -ne 0 ]]; then - echo 's3cmd command failed. Check your configuration' + echo 'wget command failed. Check your configuration' exit 1 fi else @@ -54,9 +53,9 @@ else fi if [[ ! -f "$file2_local_path" ]]; then - s3cmd get "$file2_s3_path" "$file2_local_path" + wget "$file2_s3_url" -O "$file2_local_path" if [[ "$?" -ne 0 ]]; then - echo 's3cmd command failed. Check your configuration' + echo 'wget command failed. Check your configuration' exit 1 fi else diff --git a/benchmarks/scripts/list.sh b/benchmarks/scripts/list.sh new file mode 100755 index 000000000..b368028da --- /dev/null +++ b/benchmarks/scripts/list.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# Requirements: +# - curl +# - grep + +res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -oP "(?<=)[^<]+" | grep -oP --color=never "(?<=^critcmp_results/).+") + +for pattern in "$@" +do + res=$(echo "$res" | grep $pattern) +done + +echo "$res" From 3c91a9a551e19bc15449837b093de55fe594e743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 16:37:57 +0200 Subject: [PATCH 0732/1889] Update following reviews --- .github/workflows/benchmarks.yml | 14 +++++++++++--- benchmarks/README.md | 11 ++++++++--- benchmarks/scripts/compare.sh | 17 +++++------------ 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 867e13132..a2da8e6d5 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -39,12 +39,12 @@ jobs: id: file # Run benchmarks - - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | cd benchmarks cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }} - # Generate critcmpf files + # Generate critcmp files - name: Install critcmp run: cargo install critcmp - name: Export cripcmp file @@ -52,7 +52,7 @@ jobs: critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json # Upload benchmarks - - name: Upload to DO Spaces # DigitalOcean Spaces = S3 + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 uses: BetaHuhn/do-spaces-action@v2 with: access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} @@ -61,3 +61,11 @@ jobs: space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} source: ${{ 
steps.file.outputs.basename }}.json out_dir: critcmp_results + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' + echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmaks/scipts/compare.sh ${{ steps.file.outputs.basename }}.json " diff --git a/benchmarks/README.md b/benchmarks/README.md index cde4062e5..caa4e163f 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -52,8 +52,12 @@ To trigger the benchmark workflow: This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3). +The name of the uploaded file is displayed in the workflow. + _[More about critcmp](https://github.com/BurntSushi/critcmp)._ +💡 To compare the just-uploaded benchmark with another one, check out the [next section](#comparison-between-benchmarks). + ### On your machine To run all the benchmarks (~4h): @@ -85,13 +89,14 @@ The benchmark reports we push are generated with `critcmp`. Thus, we use `critcm We provide a script to download and display the comparison report. Requirements: -- [`s3cmd`](https://github.com/s3tools/s3cmd) and being logged to the DigitalOcean Space "milli-benchmarks". See the [DigitalOcean guide](https://docs.digitalocean.com/products/spaces/resources/s3cmd/) +- `grep` +- `curl` - [`critcmp`](https://github.com/BurntSushi/critcmp) List the available file in the DO Space: ```bash -s3cmd ls s3://milli-benchmarks/critcmp_results/ +./benchmarks/script/list.sh ``` ```bash 2021-05-31 14:40 279890 s3://milli-benchmarks/critcmp_results/songs_main_09a4321.json @@ -101,5 +106,5 @@ s3cmd ls s3://milli-benchmarks/critcmp_results/ Run the comparison script: ```bash -bash benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json +./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json ``` diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index e4231131d..6f8d0c5af 100755 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -2,7 +2,7 @@ # Requirements: # - critcmp. See: https://github.com/BurntSushi/critcmp -# - wget +# - curl # Usage # $ bash compare.sh json_file1 json_file1 @@ -17,13 +17,6 @@ if [[ "$?" -ne 0 ]]; then exit 1 fi -# Checking that wget is installed -command -v wget > /dev/null 2>&1 -if [[ "$?" -ne 0 ]]; then - echo 'You must install wget to make this script working.' - exit 1 -fi - if [[ $# -ne 2 ]] then echo 'Need 2 arguments.' @@ -43,9 +36,9 @@ file1_local_path="/tmp/$file1" file2_local_path="/tmp/$file2" if [[ ! -f "$file1_local_path" ]]; then - wget "$file1_s3_url" -O "$file1_local_path" + curl "$file1_s3_url" -O "$file1_local_path" if [[ "$?" -ne 0 ]]; then - echo 'wget command failed. Check your configuration' + echo 'curl command failed. Check your configuration' exit 1 fi else @@ -53,9 +46,9 @@ else fi if [[ ! -f "$file2_local_path" ]]; then - wget "$file2_s3_url" -O "$file2_local_path" + curl "$file2_s3_url" -O "$file2_local_path" if [[ "$?" -ne 0 ]]; then - echo 'wget command failed. Check your configuration' + echo 'curl command failed. 
Check your configuration' exit 1 fi else From edfcdb171c8a85f56d0d26dd5e6eb5a841c13690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 18:54:39 +0200 Subject: [PATCH 0733/1889] Update benchmarks/scripts/list.sh Co-authored-by: Irevoire --- benchmarks/scripts/list.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/scripts/list.sh b/benchmarks/scripts/list.sh index b368028da..764193329 100755 --- a/benchmarks/scripts/list.sh +++ b/benchmarks/scripts/list.sh @@ -4,7 +4,7 @@ # - curl # - grep -res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -oP "(?<=)[^<]+" | grep -oP --color=never "(?<=^critcmp_results/).+") +res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -o '[^<]\+' | cut -c 5- | grep critcmp_results/ | cut -c 18-) for pattern in "$@" do From ef1ac8a0cb9c0ef39f220bd553dbc02d0450f167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Jun 2021 18:57:35 +0200 Subject: [PATCH 0734/1889] Update README --- benchmarks/README.md | 4 ++-- benchmarks/scripts/compare.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index caa4e163f..ebe8eecdf 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -99,8 +99,8 @@ List the available file in the DO Space: ./benchmarks/script/list.sh ``` ```bash -2021-05-31 14:40 279890 s3://milli-benchmarks/critcmp_results/songs_main_09a4321.json -2021-05-31 13:49 279576 s3://milli-benchmarks/critcmp_results/songs_geosearch_24ec456.json +songs_main_09a4321.json +songs_geosearch_24ec456.json ``` Run the comparison script: diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 6f8d0c5af..067772bec 100755 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -21,9 +21,9 @@ if [[ $# -ne 2 ]] then echo 'Need 2 arguments.' echo 'Usage: ' - echo ' $ bash compare.sh file_to_download1 file_to_download2' + echo ' $ ./compare.sh file_to_download1 file_to_download2' echo 'Ex:' - echo ' $ bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' + echo ' $ ./compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' exit 1 fi @@ -38,7 +38,7 @@ file2_local_path="/tmp/$file2" if [[ ! -f "$file1_local_path" ]]; then curl "$file1_s3_url" -O "$file1_local_path" if [[ "$?" -ne 0 ]]; then - echo 'curl command failed. Check your configuration' + echo 'curl command failed.' exit 1 fi else @@ -48,7 +48,7 @@ fi if [[ ! -f "$file2_local_path" ]]; then curl "$file2_s3_url" -O "$file2_local_path" if [[ "$?" -ne 0 ]]; then - echo 'curl command failed. Check your configuration' + echo 'curl command failed.' 
exit 1 fi else From f346805c0c84e7c1b9dbb4f72ef2d51964a60166 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 2 Jun 2021 15:47:03 +0200 Subject: [PATCH 0735/1889] Update benchmarks/Cargo.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- benchmarks/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index f7b66fe3a..6be9c79d1 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2018" publish = false -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] milli = { path = "../milli" } From 2a3f9b32ff2d928d77ea4281e800023e976ae681 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Jun 2021 12:19:55 +0200 Subject: [PATCH 0736/1889] Rename the faceted fields into filterable fields --- milli/src/index.rs | 38 ++++++++++---- milli/src/search/distinct/mod.rs | 2 +- milli/src/search/facet/facet_condition.rs | 52 ++++++++++---------- milli/src/search/facet/facet_distribution.rs | 4 +- milli/src/search/mod.rs | 4 +- milli/src/update/facets.rs | 16 +++--- milli/src/update/index_documents/mod.rs | 4 +- milli/src/update/settings.rs | 34 ++++++------- 8 files changed, 87 insertions(+), 67 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index bd057a02a..9a52188dc 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -25,7 +25,7 @@ pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; -pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; +pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; @@ -324,19 +324,39 @@ impl Index { } } - /* faceted fields */ + /* filterable fields */ - /// Writes the facet fields names in the database. - pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields) + /// Writes the filterable fields names in the database. + pub fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, FILTERABLE_FIELDS_KEY, fields) } - /// Deletes the facet fields ids in the database. - pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY) + /// Deletes the filterable fields ids in the database. + pub fn delete_filterable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, FILTERABLE_FIELDS_KEY) } - /// Returns the facet fields names. + /// Returns the filterable fields names. + pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result> { + Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FILTERABLE_FIELDS_KEY)?.unwrap_or_default()) + } + + /// Same as `filterable_fields`, but returns ids instead. 
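+    /// It panics (`expect`) with a "corrupted data" message if a filterable field name is missing from the fields ids map.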
+ pub fn filterable_fields_ids(&self, rtxn: &RoTxn) -> heed::Result> { + let filterable_fields = self.filterable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + let filterable_fields = filterable_fields + .iter() + .map(|k| { + fields_ids_map + .id(k) + .ok_or_else(|| format!("{:?} should be present in the field id map", k)) + .expect("corrupted data: ") + }) + .collect(); + + Ok(filterable_fields) + } pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 0dd628d5b..eed475863 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -83,7 +83,7 @@ mod test { let mut update = builder.settings(&mut txn, &index); update.set_distinct_attribute(distinct.to_string()); if !facets.is_empty() { - update.set_faceted_fields(facets) + update.set_filterable_fields(facets) } update.execute(|_, _| ()).unwrap(); diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index fd7053269..0c1f9e046 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -57,7 +57,7 @@ pub enum FacetCondition { fn field_id( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashSet, + filterable_fields: &HashSet, items: &mut Pairs, ) -> Result> { @@ -78,13 +78,13 @@ fn field_id( )), }; - if !faceted_fields.contains(&field_id) { + if !filterable_fields.contains(&field_id) { return Err(PestError::new_from_span( ErrorVariant::CustomError { message: format!( - "attribute `{}` is not faceted, available faceted attributes are: {}", + "attribute `{}` is not filterable, available filterable attributes are: {}", key.as_str(), - faceted_fields.iter().flat_map(|id| { + filterable_fields.iter().flat_map(|id| { fields_ids_map.name(*id) }).collect::>().join(", "), ), @@ -163,9 +163,9 @@ impl FacetCondition { ) -> anyhow::Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields_ids(rtxn)?; + let filterable_fields = index.filterable_fields_ids(rtxn)?; let lexed = FilterParser::parse(Rule::prgm, expression)?; - FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed) + FacetCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } fn from_pairs( @@ -212,12 +212,12 @@ impl FacetCondition { fn between( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashSet, + filterable_fields: &HashSet, item: Pair, ) -> anyhow::Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); @@ -230,12 +230,12 @@ impl FacetCondition { fn equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashSet, + filterable_fields: &HashSet, item: Pair, ) -> anyhow::Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); @@ -246,12 +246,12 @@ impl FacetCondition { fn greater_than( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashSet, + filterable_fields: &HashSet, item: Pair, ) -> anyhow::Result { let mut items = item.into_inner(); - let fid = 
field_id(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -261,12 +261,12 @@ impl FacetCondition { fn greater_than_or_equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashSet, + filterable_fields: &HashSet, item: Pair, ) -> anyhow::Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -276,12 +276,12 @@ impl FacetCondition { fn lower_than( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashSet, + filterable_fields: &HashSet, item: Pair, ) -> anyhow::Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -291,12 +291,12 @@ impl FacetCondition { fn lower_than_or_equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashSet, + filterable_fields: &HashSet, item: Pair, ) -> anyhow::Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -484,10 +484,10 @@ mod tests { options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); - // Set the faceted fields to be the channel. + // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashset!{ S("channel") }); + builder.set_filterable_fields(hashset!{ S("channel") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -513,10 +513,10 @@ mod tests { options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); - // Set the faceted fields to be the channel. + // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashset!{ "timestamp".into() }); + builder.set_filterable_fields(hashset!{ "timestamp".into() }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -541,11 +541,11 @@ mod tests { options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); - // Set the faceted fields to be the channel. + // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") }); + builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -588,11 +588,11 @@ mod tests { options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); - // Set the faceted fields to be the channel. + // Set the filterable fields to be the channel. 
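        // (these settings are committed before the filter is parsed below, so `channel` is accepted as filterable)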
let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") }); + builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index c6122cc77..565f4c6dd 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -197,10 +197,10 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> anyhow::Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let faceted_fields = self.index.faceted_fields(self.rtxn)?; + let filterable_fields = self.index.filterable_fields(self.rtxn)?; let mut distribution = BTreeMap::new(); - for name in faceted_fields { + for name in filterable_fields { let fid = fields_ids_map.id(&name).with_context(|| { format!("missing field name {:?} from the fields id map", name) })?; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index fc64d020f..abf19844e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -141,8 +141,8 @@ impl<'a> Search<'a> { Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; let id = field_ids_map.id(name).expect("distinct not present in field map"); - let faceted_fields = self.index.faceted_fields(self.rtxn)?; - if faceted_fields.contains(name) { + let filterable_fields = self.index.filterable_fields(self.rtxn)?; + if filterable_fields.contains(name) { let distinct = FacetDistinct::new(id, self.index, self.rtxn); self.perform_sort(distinct, matching_words, criteria) } else { diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index f0eab6023..1c235a509 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -57,14 +57,14 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { self.index.set_updated_at(self.wtxn, &Utc::now())?; - // We get the faceted fields to be able to create the facet levels. - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + // We get the filterable fields to be able to create the facet levels. + let filterable_fields = self.index.filterable_fields_ids(self.wtxn)?; debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - for field_id in faceted_fields { - // Compute and store the faceted strings documents ids. - let string_documents_ids = compute_faceted_documents_ids( + for field_id in filterable_fields { + // Compute and store the filterable strings documents ids. + let string_documents_ids = compute_filterable_documents_ids( self.wtxn, self.index.facet_id_string_docids.remap_key_type::(), field_id, @@ -77,8 +77,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - // Compute and store the faceted numbers documents ids. - let number_documents_ids = compute_faceted_documents_ids( + // Compute and store the filterable numbers documents ids. 
+ let number_documents_ids = compute_filterable_documents_ids( self.wtxn, self.index.facet_id_f64_docids.remap_key_type::(), field_id, @@ -191,7 +191,7 @@ fn compute_facet_number_levels<'t>( writer_into_reader(writer, shrink_size) } -fn compute_faceted_documents_ids( +fn compute_filterable_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: u8, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 71f281e98..7efd3631c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -417,7 +417,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { FacetLevel0NumbersDocids, } - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + let filterable_fields = self.index.filterable_fields_ids(self.wtxn)?; let searchable_fields: HashSet<_> = match self.index.searchable_fields_ids(self.wtxn)? { Some(fields) => fields.iter().copied().collect(), None => fields_ids_map.iter().map(|(id, _name)| id).collect(), @@ -453,7 +453,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .map(|(i, documents)| { let store = Store::new( searchable_fields.clone(), - faceted_fields.clone(), + filterable_fields.clone(), linked_hash_map_size, max_nb_chunks, max_memory_by_job, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1571f627d..10b6b8cbe 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -66,7 +66,7 @@ pub struct Settings<'a, 't, 'u, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, - faceted_fields: Setting>, + filterable_fields: Setting>, criteria: Setting>, stop_words: Setting>, distinct_attribute: Setting, @@ -92,7 +92,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { thread_pool: None, searchable_fields: Setting::NotSet, displayed_fields: Setting::NotSet, - faceted_fields: Setting::NotSet, + filterable_fields: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, distinct_attribute: Setting::NotSet, @@ -117,12 +117,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.displayed_fields = Setting::Set(names); } - pub fn reset_faceted_fields(&mut self) { - self.faceted_fields = Setting::Reset; + pub fn reset_filterable_fields(&mut self) { + self.filterable_fields = Setting::Reset; } - pub fn set_faceted_fields(&mut self, names_facet_types: HashSet) { - self.faceted_fields = Setting::Set(names_facet_types); + pub fn set_filterable_fields(&mut self, names: HashSet) { + self.filterable_fields = Setting::Set(names); } pub fn reset_criteria(&mut self) { @@ -267,7 +267,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Setting::Set(ref fields) => { // every time the searchable attributes are updated, we need to update the // ids for any settings that uses the facets. 
(displayed_fields, - // faceted_fields) + // filterable_fields) let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_fields_ids_map = FieldsIdsMap::new(); @@ -382,7 +382,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } fn update_facets(&mut self) -> anyhow::Result { - match self.faceted_fields { + match self.filterable_fields { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_facets = HashSet::new(); @@ -390,10 +390,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fields_ids_map.insert(name).context("field id limit exceeded")?; new_facets.insert(name.clone()); } - self.index.put_faceted_fields(self.wtxn, &new_facets)?; + self.index.put_filterable_fields(self.wtxn, &new_facets)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_faceted_fields(self.wtxn)?; } + Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; } Setting::NotSet => return Ok(false) } Ok(true) @@ -402,10 +402,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_criteria(&mut self) -> anyhow::Result<()> { match self.criteria { Setting::Set(ref fields) => { - let faceted_fields = self.index.faceted_fields(&self.wtxn)?; + let filterable_fields = self.index.filterable_fields(&self.wtxn)?; let mut new_criteria = Vec::new(); for name in fields { - let criterion = Criterion::from_str(&faceted_fields, &name)?; + let criterion = Criterion::from_str(&filterable_fields, &name)?; new_criteria.push(criterion); } self.index.put_criteria(self.wtxn, &new_criteria)?; @@ -611,16 +611,16 @@ mod tests { } #[test] - fn set_faceted_fields() { + fn set_filterable_fields() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); - // Set the faceted fields to be the age. + // Set the filterable fields to be the age. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashset!{ S("age") }); + builder.set_filterable_fields(hashset!{ S("age") }); builder.execute(|_, _| ()).unwrap(); // Then index some documents. @@ -637,7 +637,7 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); - let fields_ids = index.faceted_fields(&rtxn).unwrap(); + let fields_ids = index.filterable_fields(&rtxn).unwrap(); assert_eq!(fields_ids, hashset!{ S("age") }); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. 
@@ -833,7 +833,7 @@
         let mut wtxn = index.write_txn().unwrap();
         let mut builder = Settings::new(&mut wtxn, &index, 0);
         builder.set_displayed_fields(vec!["hello".to_string()]);
-        builder.set_faceted_fields(hashset!{ S("age"), S("toto") });
+        builder.set_filterable_fields(hashset!{ S("age"), S("toto") });
         builder.set_criteria(vec!["asc(toto)".to_string()]);
         builder.execute(|_, _| ()).unwrap();
         wtxn.commit().unwrap();

From ff440c1d9d28e60e682f0352bd9c2b2b789a01a7 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 1 Jun 2021 12:20:29 +0200
Subject: [PATCH 0737/1889] Introduce the faceted fields method to retrieve
 those that need faceting

---
 milli/src/index.rs | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 9a52188dc..9cfcd841c 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -357,8 +357,29 @@ impl Index {
         Ok(filterable_fields)
     }
 
+    /* faceted documents ids */
+
+    /// Returns the faceted fields names.
+    ///
+    /// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields.
     pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
-        Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default())
+        let filterable_fields = self.filterable_fields(rtxn)?;
+        let distinct_field = self.distinct_attribute(rtxn)?;
+        let asc_desc_fields = self.criteria(rtxn)?
+            .into_iter()
+            .filter_map(|criterion| match criterion {
+                Criterion::Asc(field) | Criterion::Desc(field) => Some(field),
+                _otherwise => None,
+            });
+
+        let mut faceted_fields = filterable_fields;
+        faceted_fields.extend(asc_desc_fields);
+        if let Some(field) = distinct_field {
+            faceted_fields.insert(field.to_owned());
+        }
+
+        Ok(faceted_fields)
+    }
 
     /// Same as `faceted_fields`, but returns ids instead. 
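The union that PATCH 0737 introduces can be sketched outside of LMDB. Below is a minimal, self-contained illustration (not code from the patch: the trimmed-down `Criterion` enum and the field names are made up for the example) of which fields end up faceted for a given set of settings:

```rust
use std::collections::HashSet;

// Trimmed-down stand-in for milli's Criterion, for illustration only.
enum Criterion {
    Words,
    Asc(String),
    Desc(String),
}

// Mirrors the union performed by `Index::faceted_fields` above: the
// filterable fields, plus the Asc/Desc criteria fields, plus the distinct field.
fn faceted_fields(
    filterable: HashSet<String>,
    distinct: Option<&str>,
    criteria: &[Criterion],
) -> HashSet<String> {
    let mut faceted = filterable;
    faceted.extend(criteria.iter().filter_map(|criterion| match criterion {
        Criterion::Asc(field) | Criterion::Desc(field) => Some(field.clone()),
        _otherwise => None,
    }));
    if let Some(field) = distinct {
        faceted.insert(field.to_owned());
    }
    faceted
}

fn main() {
    // filterable = {genre}, distinct = sku, criteria = [words, asc(price)]
    let faceted = faceted_fields(
        HashSet::from(["genre".to_string()]),
        Some("sku"),
        &[Criterion::Words, Criterion::Asc("price".to_string())],
    );
    // All three fields need a facet database, even though only `genre`
    // was declared filterable.
    assert_eq!(faceted, HashSet::from(["genre", "price", "sku"].map(String::from)));
}
```

Deriving the set this way means a field that is only used by an `asc(...)` criterion or as the distinct field still gets its facet levels built, without the user having to declare it filterable; the following patches adjust the indexer and the search path accordingly.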
From 187c713de53b41a73d2ff696b71b3d32aa93e051 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Jun 2021 12:32:03 +0200 Subject: [PATCH 0738/1889] Remove the MapDistinct struct as now distinct attributes are faceted --- milli/src/search/distinct/facet_distinct.rs | 7 +- milli/src/search/distinct/map_distinct.rs | 138 -------------------- milli/src/search/distinct/mod.rs | 7 +- milli/src/search/mod.rs | 12 +- 4 files changed, 5 insertions(+), 159 deletions(-) delete mode 100644 milli/src/search/distinct/map_distinct.rs diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 9485087d3..322843ee0 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -189,8 +189,6 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { #[cfg(test)] mod test { - use std::collections::HashSet; - use super::super::test::{generate_index, validate_distinct_candidates}; use super::*; @@ -198,10 +196,7 @@ mod test { ($name:ident, $distinct:literal) => { #[test] fn $name() { - use std::iter::FromIterator; - - let facets = HashSet::from_iter(Some(($distinct.to_string()))); - let (index, fid, candidates) = generate_index($distinct, facets); + let (index, fid, candidates) = generate_index($distinct); let txn = index.read_txn().unwrap(); let mut map_distinct = FacetDistinct::new(fid, &index, &txn); let excluded = RoaringBitmap::new(); diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs deleted file mode 100644 index 465af2c3b..000000000 --- a/milli/src/search/distinct/map_distinct.rs +++ /dev/null @@ -1,138 +0,0 @@ -use std::collections::HashMap; - -use roaring::RoaringBitmap; -use serde_json::Value; - -use super::{Distinct, DocIter}; -use crate::{DocumentId, FieldId, Index}; - -/// A distinct implementer that is backed by an `HashMap`. -/// -/// Each time a document is seen, the value -/// for its distinct field is added to the map. If the map already contains an entry for this -/// value, then the document is filtered out, and is added to the excluded set. -pub struct MapDistinct<'a> { - distinct: FieldId, - map: HashMap, - index: &'a Index, - txn: &'a heed::RoTxn<'a>, -} - -impl<'a> MapDistinct<'a> { - pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { - Self { - distinct, - map: HashMap::new(), - index, - txn, - } - } -} - -pub struct MapDistinctIter<'a, 'b> { - distinct: FieldId, - map: &'b mut HashMap, - index: &'a Index, - txn: &'a heed::RoTxn<'a>, - candidates: roaring::bitmap::IntoIter, - excluded: RoaringBitmap, -} - -impl<'a, 'b> MapDistinctIter<'a, 'b> { - /// Performs the next iteration of the mafacetp distinct. This is a convenience method that is - /// called by the Iterator::next implementation that transposes the result. It makes error - /// handling easier. 
- fn next_inner(&mut self) -> anyhow::Result> { - let map = &mut self.map; - let mut filter = |value: Value| { - let entry = map.entry(value.to_string()).or_insert(0); - *entry += 1; - *entry <= 1 - }; - - while let Some(id) = self.candidates.next() { - let document = self.index.documents(&self.txn, Some(id))?[0].1; - let value = document - .get(self.distinct) - .map(serde_json::from_slice::) - .transpose()?; - - let accept = match value { - Some(Value::Array(values)) => { - let mut accept = true; - for value in values { - accept &= filter(value); - } - accept - } - Some(Value::Null) | Some(Value::Object(_)) | None => true, - Some(value) => filter(value), - }; - - if accept { - return Ok(Some(id)); - } else { - self.excluded.insert(id); - } - } - Ok(None) - } -} - -impl Iterator for MapDistinctIter<'_, '_> { - type Item = anyhow::Result; - - fn next(&mut self) -> Option { - self.next_inner().transpose() - } -} - -impl DocIter for MapDistinctIter<'_, '_> { - fn into_excluded(self) -> RoaringBitmap { - self.excluded - } -} - -impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { - type Iter = MapDistinctIter<'a, 'b>; - - fn distinct(&'b mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { - MapDistinctIter { - distinct: self.distinct, - map: &mut self.map, - index: &self.index, - txn: &self.txn, - candidates: candidates.into_iter(), - excluded, - } - } -} - -#[cfg(test)] -mod test { - use std::collections::HashSet; - - use super::*; - use super::super::test::{generate_index, validate_distinct_candidates}; - - macro_rules! test_map_distinct { - ($name:ident, $distinct:literal) => { - #[test] - fn $name() { - let (index, fid, candidates) = generate_index($distinct, HashSet::new()); - let txn = index.read_txn().unwrap(); - let mut map_distinct = MapDistinct::new(fid, &index, &txn); - let excluded = RoaringBitmap::new(); - let mut iter = map_distinct.distinct(candidates.clone(), excluded); - let count = validate_distinct_candidates(iter.by_ref(), fid, &index); - let excluded = iter.into_excluded(); - assert_eq!(count as u64 + excluded.len(), candidates.len()); - } - }; - } - - test_map_distinct!(test_string, "txt"); - test_map_distinct!(test_strings, "txts"); - test_map_distinct!(test_int, "cat-int"); - test_map_distinct!(test_ints, "cat-ints"); -} diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index eed475863..68a94ed48 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -1,12 +1,10 @@ mod facet_distinct; -mod map_distinct; mod noop_distinct; use roaring::RoaringBitmap; use crate::DocumentId; pub use facet_distinct::FacetDistinct; -pub use map_distinct::MapDistinct; pub use noop_distinct::NoopDistinct; /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. @@ -74,7 +72,7 @@ mod test { /// Returns a temporary index populated with random test documents, the FieldId for the /// distinct attribute, and the RoaringBitmap with the document ids. 
- pub(crate) fn generate_index(distinct: &str, facets: HashSet) -> (TempIndex, FieldId, RoaringBitmap) { + pub(crate) fn generate_index(distinct: &str) -> (TempIndex, FieldId, RoaringBitmap) { let index = TempIndex::new(); let mut txn = index.write_txn().unwrap(); @@ -82,9 +80,6 @@ mod test { let builder = UpdateBuilder::new(0); let mut update = builder.settings(&mut txn, &index); update.set_distinct_attribute(distinct.to_string()); - if !facets.is_empty() { - update.set_filterable_fields(facets) - } update.execute(|_, _| ()).unwrap(); // add documents to the index diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index abf19844e..36a155290 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -12,7 +12,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; +use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{Index, DocumentId}; @@ -141,14 +141,8 @@ impl<'a> Search<'a> { Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; let id = field_ids_map.id(name).expect("distinct not present in field map"); - let filterable_fields = self.index.filterable_fields(self.rtxn)?; - if filterable_fields.contains(name) { - let distinct = FacetDistinct::new(id, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) - } else { - let distinct = MapDistinct::new(id, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) - } + let distinct = FacetDistinct::new(id, self.index, self.rtxn); + self.perform_sort(distinct, matching_words, criteria) } } } From 1e366dae3e06094f9e764ef619f49b195001427e Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Tue, 1 Jun 2021 14:43:48 +0200 Subject: [PATCH 0739/1889] remove useless lifetime on Distinct Trait --- milli/src/search/distinct/facet_distinct.rs | 2 +- milli/src/search/distinct/mod.rs | 4 ++-- milli/src/search/distinct/noop_distinct.rs | 2 +- milli/src/search/mod.rs | 7 ++++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 322843ee0..de7b28141 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -172,7 +172,7 @@ impl DocIter for FacetDistinctIter<'_> { } } -impl<'a> Distinct<'_> for FacetDistinct<'a> { +impl<'a> Distinct for FacetDistinct<'a> { type Iter = FacetDistinctIter<'a>; fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 68a94ed48..945beb7e6 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -18,10 +18,10 @@ pub trait DocIter: Iterator> { /// must return an iterator containing only distinct documents, and add the discarded documents to /// the excluded set. The excluded set can later be retrieved by calling `DocIter::excluded` on the /// returned iterator. 
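 ///
 /// (`FacetDistinct`, for instance, keeps the first document seen for each facet
 /// value and adds every later duplicate to the excluded set.)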
-pub trait Distinct<'a> { +pub trait Distinct { type Iter: DocIter; - fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; + fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; } #[cfg(test)] diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 3de9be631..bfaafed85 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -26,7 +26,7 @@ impl DocIter for NoopDistinctIter { } } -impl Distinct<'_> for NoopDistinct { +impl Distinct for NoopDistinct { type Iter = NoopDistinctIter; fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 36a155290..11f56b7a6 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -147,12 +147,13 @@ impl<'a> Search<'a> { } } - fn perform_sort( + fn perform_sort( &self, - mut distinct: impl for<'c> Distinct<'c>, + mut distinct: D, matching_words: MatchingWords, mut criteria: Final, - ) -> anyhow::Result { + ) -> anyhow::Result + { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_candidates = RoaringBitmap::new(); From c10469ddb6e6823b802d001e61a3c1a7bd66e721 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Jun 2021 15:10:34 +0200 Subject: [PATCH 0740/1889] Patch the http-ui crate to support filterable fields --- http-ui/src/main.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index da3b6204c..c232c0620 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -251,7 +251,7 @@ struct Settings { searchable_attributes: Setting>, #[serde(default, skip_serializing_if = "Setting::is_not_set")] - faceted_attributes: Setting>, + filterable_attributes: Setting>, #[serde(default, skip_serializing_if = "Setting::is_not_set")] criteria: Setting>, @@ -420,9 +420,9 @@ async fn main() -> anyhow::Result<()> { } // We transpose the settings JSON struct into a real setting update. - match settings.faceted_attributes { - Setting::Set(faceted_attributes) => builder.set_faceted_fields(faceted_attributes), - Setting::Reset => builder.reset_faceted_fields(), + match settings.filterable_attributes { + Setting::Set(filterable_attributes) => builder.set_filterable_fields(filterable_attributes), + Setting::Reset => builder.reset_filterable_fields(), Setting::NotSet => () } @@ -996,7 +996,7 @@ mod tests { let settings = Settings { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), - faceted_attributes: Setting::Set(hashset!{ "age".to_string() }), + filterable_attributes: Setting::Set(hashset!{ "age".to_string() }), criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! 
{ "and".to_string() }), synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] }) @@ -1047,7 +1047,7 @@ mod tests { let settings = Settings { displayed_attributes: Setting::Reset, searchable_attributes: Setting::Reset, - faceted_attributes: Setting::Reset, + filterable_attributes: Setting::Reset, criteria: Setting::Reset, stop_words: Setting::Reset, synonyms: Setting::Reset, @@ -1076,7 +1076,7 @@ mod tests { let settings = Settings { displayed_attributes: Setting::NotSet, searchable_attributes: Setting::NotSet, - faceted_attributes: Setting::NotSet, + filterable_attributes: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, synonyms: Setting::NotSet, From 6476827d3a576b1f20294d9d08d08a7c8f271598 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Jun 2021 15:22:05 +0200 Subject: [PATCH 0741/1889] Fix the indexer to be sure that distinct and Asc/Desc are also faceted --- milli/src/update/index_documents/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7efd3631c..71f281e98 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -417,7 +417,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { FacetLevel0NumbersDocids, } - let filterable_fields = self.index.filterable_fields_ids(self.wtxn)?; + let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; let searchable_fields: HashSet<_> = match self.index.searchable_fields_ids(self.wtxn)? { Some(fields) => fields.iter().copied().collect(), None => fields_ids_map.iter().map(|(id, _name)| id).collect(), @@ -453,7 +453,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .map(|(i, documents)| { let store = Store::new( searchable_fields.clone(), - filterable_fields.clone(), + faceted_fields.clone(), linked_hash_map_size, max_nb_chunks, max_memory_by_job, From c2afdbb1fb319edd1186081c90262a5cf43323b0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Jun 2021 15:22:58 +0200 Subject: [PATCH 0742/1889] Move and comment some internal facet_condition helper functions --- milli/src/search/facet/facet_condition.rs | 121 ++++++++++++---------- 1 file changed, 65 insertions(+), 56 deletions(-) diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index 0c1f9e046..2ff997270 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -55,62 +55,6 @@ pub enum FacetCondition { And(Box, Box), } -fn field_id( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - items: &mut Pairs, -) -> Result> -{ - // lexing ensures that we at least have a key - let key = items.next().unwrap(); - - let field_id = match fields_ids_map.id(key.as_str()) { - Some(field_id) => field_id, - None => return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` not found, available attributes are: {}", - key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", "), - ), - }, - key.as_span(), - )), - }; - - if !filterable_fields.contains(&field_id) { - return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` is not filterable, available filterable attributes are: {}", - key.as_str(), - filterable_fields.iter().flat_map(|id| { - fields_ids_map.name(*id) - }).collect::>().join(", "), - ), - }, - key.as_span(), - )); - } - - Ok(field_id) -} - -fn 
pest_parse<T>(pair: Pair<Rule>) -> (Result<T, PestError<Rule>>, String)
-where T: FromStr,
-      T::Err: ToString,
-{
-    let result = match pair.as_str().parse::<T>() {
-        Ok(value) => Ok(value),
-        Err(e) => Err(PestError::<Rule>::new_from_span(
-            ErrorVariant::CustomError { message: e.to_string() },
-            pair.as_span(),
-        )),
-    };
-
-    (result, pair.as_str().to_string())
-}
-
 impl FacetCondition {
     pub fn from_array(
         rtxn: &heed::RoTxn,
@@ -469,6 +413,71 @@ impl FacetCondition {
     }
 }
 
+/// Retrieve the field id based on the pest value, returns an error if
+/// the field does not exist or is not filterable.
+///
+/// The pest pair is simply a string associated with a span, a location to highlight in
+/// the error message.
+fn field_id(
+    fields_ids_map: &FieldsIdsMap,
+    filterable_fields: &HashSet<FieldId>,
+    items: &mut Pairs<Rule>,
+) -> Result<FieldId, PestError<Rule>>
+{
+    // lexing ensures that we at least have a key
+    let key = items.next().unwrap();
+
+    let field_id = match fields_ids_map.id(key.as_str()) {
+        Some(field_id) => field_id,
+        None => return Err(PestError::new_from_span(
+            ErrorVariant::CustomError {
+                message: format!(
+                    "attribute `{}` not found, available attributes are: {}",
+                    key.as_str(),
+                    fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "),
+                ),
+            },
+            key.as_span(),
+        )),
+    };
+
+    if !filterable_fields.contains(&field_id) {
+        return Err(PestError::new_from_span(
+            ErrorVariant::CustomError {
+                message: format!(
+                    "attribute `{}` is not filterable, available filterable attributes are: {}",
+                    key.as_str(),
+                    filterable_fields.iter().flat_map(|id| {
+                        fields_ids_map.name(*id)
+                    }).collect::<Vec<_>>().join(", "),
+                ),
+            },
+            key.as_span(),
+        ));
+    }
+
+    Ok(field_id)
+}
+
+/// Tries to parse the pest pair into the type `T` specified, always returns
+/// the original string that we tried to parse.
+///
+/// Returns the parsing error associated with the span if the conversion fails. 
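+/// The raw string is returned in both cases: `"22"` parses as `Ok(22.0)` for `f64`,
+/// while `"abc"` yields the error but can still be used for a string comparison.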
+fn pest_parse(pair: Pair) -> (Result>, String) +where T: FromStr, + T::Err: ToString, +{ + let result = match pair.as_str().parse::() { + Ok(value) => Ok(value), + Err(e) => Err(PestError::::new_from_span( + ErrorVariant::CustomError { message: e.to_string() }, + pair.as_span(), + )), + }; + + (result, pair.as_str().to_string()) +} + #[cfg(test)] mod tests { use super::*; From 3b1cd4c4b437baf1a80625ade4b4f01bae3ed91c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Jun 2021 15:25:17 +0200 Subject: [PATCH 0743/1889] Rename the FacetCondition into FilterCondition --- http-ui/src/main.rs | 10 ++-- milli/src/lib.rs | 2 +- ...facet_condition.rs => filter_condition.rs} | 50 +++++++++---------- milli/src/search/facet/mod.rs | 4 +- milli/src/search/mod.rs | 20 ++++---- 5 files changed, 43 insertions(+), 43 deletions(-) rename milli/src/search/facet/{facet_condition.rs => filter_condition.rs} (93%) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index c232c0620..b6a894373 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -29,7 +29,7 @@ use tokio::sync::broadcast; use warp::{Filter, http::Response}; use warp::filters::ws::Message; -use milli::{FacetCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore}; +use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore}; use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; use milli::update::UpdateIndexingStep::*; @@ -690,7 +690,7 @@ async fn main() -> anyhow::Result<()> { let filters = match query.filters { Some(condition) if !condition.trim().is_empty() => { - Some(FacetCondition::from_str(&rtxn, &index, &condition).unwrap()) + Some(FilterCondition::from_str(&rtxn, &index, &condition).unwrap()) } _otherwise => None, }; @@ -698,21 +698,21 @@ async fn main() -> anyhow::Result<()> { let facet_filters = match query.facet_filters { Some(array) => { let eithers = array.into_iter().map(Into::into); - FacetCondition::from_array(&rtxn, &index, eithers).unwrap() + FilterCondition::from_array(&rtxn, &index, eithers).unwrap() } _otherwise => None, }; let condition = match (filters, facet_filters) { (Some(filters), Some(facet_filters)) => { - Some(FacetCondition::And(Box::new(filters), Box::new(facet_filters))) + Some(FilterCondition::And(Box::new(filters), Box::new(facet_filters))) } (Some(condition), None) | (None, Some(condition)) => Some(condition), _otherwise => None, }; if let Some(condition) = condition { - search.facet_condition(condition); + search.filter(condition); } let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index e4b58765e..39e107073 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -27,7 +27,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; -pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; +pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords}; pub use self::tree_level::TreeLevel; pub use self::update_store::UpdateStore; diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/filter_condition.rs similarity index 93% rename from milli/src/search/facet/facet_condition.rs rename to 
milli/src/search/facet/filter_condition.rs index 2ff997270..f58443b6f 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -18,7 +18,7 @@ use super::FacetRange; use super::parser::Rule; use super::parser::{PREC_CLIMBER, FilterParser}; -use self::FacetCondition::*; +use self::FilterCondition::*; use self::Operator::*; #[derive(Debug, Clone, PartialEq)] @@ -49,18 +49,18 @@ impl Operator { } #[derive(Debug, Clone, PartialEq)] -pub enum FacetCondition { +pub enum FilterCondition { Operator(FieldId, Operator), Or(Box, Box), And(Box, Box), } -impl FacetCondition { +impl FilterCondition { pub fn from_array( rtxn: &heed::RoTxn, index: &Index, array: I, - ) -> anyhow::Result> + ) -> anyhow::Result> where I: IntoIterator>, J: IntoIterator, A: AsRef, @@ -73,7 +73,7 @@ impl FacetCondition { Either::Left(array) => { let mut ors = None; for rule in array { - let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?; + let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; ors = match ors.take() { Some(ors) => Some(Or(Box::new(ors), Box::new(condition))), None => Some(condition), @@ -88,7 +88,7 @@ impl FacetCondition { } }, Either::Right(rule) => { - let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?; + let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; ands = match ands.take() { Some(ands) => Some(And(Box::new(ands), Box::new(condition))), None => Some(condition), @@ -104,12 +104,12 @@ impl FacetCondition { rtxn: &heed::RoTxn, index: &Index, expression: &str, - ) -> anyhow::Result + ) -> anyhow::Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields_ids(rtxn)?; let lexed = FilterParser::parse(Rule::prgm, expression)?; - FacetCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) + FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } fn from_pairs( @@ -143,7 +143,7 @@ impl FacetCondition { ) } - fn negate(self) -> FacetCondition { + fn negate(self) -> FilterCondition { match self { Operator(fid, op) => match op.negate() { (op, None) => Operator(fid, op), @@ -158,7 +158,7 @@ impl FacetCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> anyhow::Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; @@ -176,7 +176,7 @@ impl FacetCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> anyhow::Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; @@ -192,7 +192,7 @@ impl FacetCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> anyhow::Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; @@ -207,7 +207,7 @@ impl FacetCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> anyhow::Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; @@ -222,7 +222,7 @@ impl FacetCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> anyhow::Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; @@ -237,7 +237,7 @@ impl FacetCondition { fields_ids_map: 
&FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> anyhow::Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; @@ -249,7 +249,7 @@ impl FacetCondition { } } -impl FacetCondition { +impl FilterCondition { /// Aggregates the documents ids that are part of the specified range automatically /// going deeper through the levels. fn explore_facet_number_levels( @@ -502,15 +502,15 @@ mod tests { // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); let expected = Operator(0, Operator::Equal(None, S("ponce"))); assert_eq!(condition, expected); - let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); - let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); } @@ -531,11 +531,11 @@ mod tests { // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); let expected = Operator(0, Between(22.0, 44.0)); assert_eq!(condition, expected); - let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); let expected = Or( Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0))), @@ -560,7 +560,7 @@ mod tests { // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str( + let condition = FilterCondition::from_str( &rtxn, &index, "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", ).unwrap(); @@ -573,7 +573,7 @@ mod tests { ); assert_eq!(condition, expected); - let condition = FacetCondition::from_str( + let condition = FilterCondition::from_str( &rtxn, &index, "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", ).unwrap(); @@ -607,11 +607,11 @@ mod tests { // Test that the facet condition is correctly generated. 
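        // (note that the filter parser lowercases string values, hence the expected "ponce")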
let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_array( + let condition = FilterCondition::from_array( &rtxn, &index, vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])], ).unwrap().unwrap(); - let expected = FacetCondition::from_str( + let expected = FilterCondition::from_str( &rtxn, &index, "channel = gotaga AND (timestamp = 44 OR channel != ponce)", ).unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index fff1d14a8..a5e02fc9f 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -9,10 +9,10 @@ use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{Index, FieldId}; -pub use self::facet_condition::{FacetCondition, Operator}; +pub use self::filter_condition::{FilterCondition, Operator}; pub use self::facet_distribution::FacetDistribution; -mod facet_condition; +mod filter_condition; mod facet_distribution; mod parser; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 11f56b7a6..c152d47a4 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -16,7 +16,7 @@ use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{Index, DocumentId}; -pub use self::facet::{FacetCondition, FacetDistribution, FacetIter, Operator}; +pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; @@ -33,7 +33,7 @@ mod matching_words; pub struct Search<'a> { query: Option, - facet_condition: Option, + filter: Option, offset: usize, limit: usize, optional_words: bool, @@ -47,7 +47,7 @@ impl<'a> Search<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { Search { query: None, - facet_condition: None, + filter: None, offset: 0, limit: 20, optional_words: true, @@ -88,8 +88,8 @@ impl<'a> Search<'a> { self } - pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { - self.facet_condition = Some(condition); + pub fn filter(&mut self, condition: FilterCondition) -> &mut Search<'a> { + self.filter = Some(condition); self } @@ -121,12 +121,12 @@ impl<'a> Search<'a> { // We create the original candidates with the facet conditions results. let before = Instant::now(); - let facet_candidates = match &self.facet_condition { + let filtered_candidates = match &self.filter { Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?), None => None, }; - debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); + debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed()); let matching_words = match query_tree.as_ref() { Some(query_tree) => MatchingWords::from_query_tree(&query_tree), @@ -134,7 +134,7 @@ impl<'a> Search<'a> { }; let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build(query_tree, primitive_query, facet_candidates)?; + let criteria = criteria_builder.build(query_tree, primitive_query, filtered_candidates)?; match self.index.distinct_attribute(self.rtxn)? 
{ None => self.perform_sort(NoopDistinct, matching_words, criteria), @@ -188,7 +188,7 @@ impl fmt::Debug for Search<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let Search { query, - facet_condition, + filter, offset, limit, optional_words, @@ -199,7 +199,7 @@ impl fmt::Debug for Search<'_> { } = self; f.debug_struct("Search") .field("query", query) - .field("facet_condition", facet_condition) + .field("filter", filter) .field("offset", offset) .field("limit", limit) .field("optional_words", optional_words) From b0c0490e857be4e9cd36ad74514e000e6c50158f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Jun 2021 15:48:38 +0200 Subject: [PATCH 0744/1889] Make sure that we can add a Asc/Desc field without it being filterable --- milli/src/criterion.rs | 11 +++++------ milli/src/update/settings.rs | 25 ++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 1d7326db7..81a2878b3 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,5 +1,5 @@ -use std::collections::HashSet; use std::fmt; +use std::str::FromStr; use anyhow::{Context, bail}; use regex::Regex; @@ -30,8 +30,10 @@ pub enum Criterion { Desc(String), } -impl Criterion { - pub fn from_str(faceted_attributes: &HashSet, txt: &str) -> anyhow::Result { +impl FromStr for Criterion { + type Err = anyhow::Error; + + fn from_str(txt: &str) -> Result { match txt { "words" => Ok(Criterion::Words), "typo" => Ok(Criterion::Typo), @@ -42,9 +44,6 @@ impl Criterion { let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; let order = caps.get(1).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str(); - faceted_attributes.get(field_name).with_context(|| { - format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name) - })?; match order { "asc" => Ok(Criterion::Asc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())), diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 10b6b8cbe..eba14a17c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -9,7 +9,6 @@ use rayon::ThreadPool; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::{FieldsIdsMap, Index}; -use crate::criterion::Criterion; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; @@ -402,10 +401,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_criteria(&mut self) -> anyhow::Result<()> { match self.criteria { Setting::Set(ref fields) => { - let filterable_fields = self.index.filterable_fields(&self.wtxn)?; let mut new_criteria = Vec::new(); for name in fields { - let criterion = Criterion::from_str(&filterable_fields, &name)?; + let criterion = name.parse()?; new_criteria.push(criterion); } self.index.put_criteria(self.wtxn, &new_criteria)?; @@ -446,6 +444,7 @@ mod tests { use maplit::{btreeset, hashmap, hashset}; use big_s::S; + use crate::{Criterion, FilterCondition}; use crate::update::{IndexDocuments, UpdateFormat}; use super::*; @@ -858,4 +857,24 @@ mod tests { assert!(index.primary_key(&rtxn).unwrap().is_none()); assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap()); } + + #[test] + fn setting_not_filterable_cant_filter() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + 
+    #[test]
+    fn setting_not_filterable_cant_filter() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Set the displayed fields and an Asc(toto) criterion, but do not make toto filterable.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_displayed_fields(vec!["hello".to_string()]);
+        // toto is only used in Asc(toto): a facet database is built for it,
+        // but filtering on it must still be denied.
+        builder.set_criteria(vec!["asc(toto)".to_string()]);
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        FilterCondition::from_str(&rtxn, &index, "toto = 32").unwrap_err();
+    }
 }

From 3c304c89d4885fd3fd0cbef58826bf0fa3aace80 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 1 Jun 2021 16:29:14 +0200
Subject: [PATCH 0745/1889] Make sure that we generate the faceted database
 when required

---
 milli/src/index.rs               |  18 ++--
 milli/src/search/distinct/mod.rs |   2 +-
 milli/src/search/mod.rs          |   2 +-
 milli/src/update/facets.rs       |  16 ++--
 milli/src/update/settings.rs     | 139 ++++++++++++++++++++++++++-----
 5 files changed, 135 insertions(+), 42 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 9cfcd841c..4e32f673a 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -23,7 +23,7 @@ use crate::fields_ids_map::FieldsIdsMap;
 
 pub const CRITERIA_KEY: &str = "criteria";
 pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
-pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key";
+pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
 pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
 pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
 pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
@@ -365,7 +365,7 @@ impl Index {
     /// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields.
     pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> {
         let filterable_fields = self.filterable_fields(rtxn)?;
-        let distinct_field = self.distinct_attribute(rtxn)?;
+        let distinct_field = self.distinct_field(rtxn)?;
         let asc_desc_fields = self.criteria(rtxn)?
             .into_iter()
             .filter_map(|criterion| match criterion {
@@ -465,18 +465,18 @@ impl Index {
         }
     }
 
-    /* Distinct attribute */
+    /* distinct field */
 
-    pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> {
-        self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute)
+    pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> {
+        self.main.put::<_, Str, Str>(wtxn, DISTINCT_FIELD_KEY, distinct_field)
     }
 
-    pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result<Option<&'a str>> {
-        self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY)
+    pub fn distinct_field<'a>(&self, rtxn: &'a RoTxn) -> heed::Result<Option<&'a str>> {
+        self.main.get::<_, Str, Str>(rtxn, DISTINCT_FIELD_KEY)
     }
 
-    pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
-        self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY)
+    pub(crate) fn delete_distinct_field(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, DISTINCT_FIELD_KEY)
     }
 
     /* criteria */
 
diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs
index 945beb7e6..1b7c69c7a 100644
--- a/milli/src/search/distinct/mod.rs
+++ b/milli/src/search/distinct/mod.rs
@@ -79,7 +79,7 @@ mod test {
 
         // set distinct and faceted attributes for the index.
let builder = UpdateBuilder::new(0); let mut update = builder.settings(&mut txn, &index); - update.set_distinct_attribute(distinct.to_string()); + update.set_distinct_field(distinct.to_string()); update.execute(|_, _| ()).unwrap(); // add documents to the index diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index c152d47a4..872ebfca6 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -136,7 +136,7 @@ impl<'a> Search<'a> { let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; let criteria = criteria_builder.build(query_tree, primitive_query, filtered_candidates)?; - match self.index.distinct_attribute(self.rtxn)? { + match self.index.distinct_field(self.rtxn)? { None => self.perform_sort(NoopDistinct, matching_words, criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 1c235a509..f0eab6023 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -57,14 +57,14 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { self.index.set_updated_at(self.wtxn, &Utc::now())?; - // We get the filterable fields to be able to create the facet levels. - let filterable_fields = self.index.filterable_fields_ids(self.wtxn)?; + // We get the faceted fields to be able to create the facet levels. + let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - for field_id in filterable_fields { - // Compute and store the filterable strings documents ids. - let string_documents_ids = compute_filterable_documents_ids( + for field_id in faceted_fields { + // Compute and store the faceted strings documents ids. + let string_documents_ids = compute_faceted_documents_ids( self.wtxn, self.index.facet_id_string_docids.remap_key_type::(), field_id, @@ -77,8 +77,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - // Compute and store the filterable numbers documents ids. - let number_documents_ids = compute_filterable_documents_ids( + // Compute and store the faceted numbers documents ids. 
+ let number_documents_ids = compute_faceted_documents_ids( self.wtxn, self.index.facet_id_f64_docids.remap_key_type::(), field_id, @@ -191,7 +191,7 @@ fn compute_facet_number_levels<'t>( writer_into_reader(writer, shrink_size) } -fn compute_filterable_documents_ids( +fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: u8, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index eba14a17c..ef32c5c44 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -68,7 +68,7 @@ pub struct Settings<'a, 't, 'u, 'i> { filterable_fields: Setting>, criteria: Setting>, stop_words: Setting>, - distinct_attribute: Setting, + distinct_field: Setting, synonyms: Setting>>, } @@ -94,7 +94,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { filterable_fields: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, - distinct_attribute: Setting::NotSet, + distinct_field: Setting::NotSet, synonyms: Setting::NotSet, update_id, } @@ -144,12 +144,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - pub fn reset_distinct_attribute(&mut self) { - self.distinct_attribute = Setting::Reset; + pub fn reset_distinct_field(&mut self) { + self.distinct_field = Setting::Reset; } - pub fn set_distinct_attribute(&mut self, distinct_attribute: String) { - self.distinct_attribute = Setting::Set(distinct_attribute); + pub fn set_distinct_field(&mut self, distinct_field: String) { + self.distinct_field = Setting::Set(distinct_field); } pub fn reset_synonyms(&mut self) { @@ -165,8 +165,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> - where - F: Fn(UpdateIndexingStep, u64) + Sync + where + F: Fn(UpdateIndexingStep, u64) + Sync { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let update_id = self.update_id; @@ -197,7 +197,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let output = transform.remap_index_documents( primary_key.to_string(), old_fields_ids_map, - fields_ids_map.clone())?; + fields_ids_map.clone(), + )?; // We clear the full database (words-fst, documents ids and documents content). 
ClearDocuments::new(self.wtxn, self.index, self.update_id).execute()?;
@@ -214,6 +215,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
         indexing_builder.thread_pool = self.thread_pool;
         indexing_builder.execute_raw(output, &cb)?;
+
         Ok(())
     }
 
@@ -242,18 +244,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         Ok(true)
     }
 
-    fn update_distinct_attribute(&mut self) -> anyhow::Result<bool> {
-        match self.distinct_attribute {
+    fn update_distinct_field(&mut self) -> anyhow::Result<bool> {
+        match self.distinct_field {
             Setting::Set(ref attr) => {
                 let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
                 fields_ids_map
                     .insert(attr)
                     .context("field id limit exceeded")?;
 
-                self.index.put_distinct_attribute(self.wtxn, &attr)?;
+                self.index.put_distinct_field(self.wtxn, &attr)?;
                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
             }
-            Setting::Reset => { self.index.delete_distinct_attribute(self.wtxn)?; },
+            Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; },
             Setting::NotSet => return Ok(false),
         }
         Ok(true)
@@ -380,7 +382,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         }
     }
 
-    fn update_facets(&mut self) -> anyhow::Result<bool> {
+    fn update_filterable(&mut self) -> anyhow::Result<()> {
         match self.filterable_fields {
             Setting::Set(ref fields) => {
                 let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
@@ -393,9 +395,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
             }
             Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; }
-            Setting::NotSet => return Ok(false)
+            Setting::NotSet => (),
         }
-        Ok(true)
+        Ok(())
     }
 
     fn update_criteria(&mut self) -> anyhow::Result<()> {
         match self.criteria {
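The execute() hunk below is the heart of this change: it snapshots the faceted fields (the union of the filterable, distinct, and Asc/Desc fields) before and after applying the settings, and reindexes when that set changed. A minimal sketch with made-up field names, reusing the `hashset!` and `S` helpers already imported by these tests:

    // Only `tag` was filterable, so faceted = {"tag"}; after
    // `set_criteria(vec!["asc(age)"])`, `age` becomes faceted as well.
    let old_faceted_fields = hashset! { S("tag") };
    let new_faceted_fields = hashset! { S("tag"), S("age") };
    assert_ne!(old_faceted_fields, new_faceted_fields); // => triggers a reindex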
@@ -419,20 +421,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         F: Fn(UpdateIndexingStep, u64) + Sync
     {
         self.index.set_updated_at(self.wtxn, &Utc::now())?;
+
+        let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?;
         let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?;
+
         self.update_displayed()?;
-        let stop_words_updated = self.update_stop_words()?;
-        let facets_updated = self.update_facets()?;
-        self.update_distinct_attribute()?;
-        // update_criteria MUST be called after update_facets, since criterion fields must be set
-        // as facets.
+        self.update_filterable()?;
+        self.update_distinct_field()?;
         self.update_criteria()?;
+
+        // If there are new faceted fields, we must reindex, as the new fields
+        // have to be indexed as facets. This happens when a distinct attribute,
+        // an Asc/Desc criterion, or a filterable attribute has been added or removed.
+        let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?;
+        let faceted_updated = old_faceted_fields != new_faceted_fields;
+
+        let stop_words_updated = self.update_stop_words()?;
         let synonyms_updated = self.update_synonyms()?;
         let searchable_updated = self.update_searchable()?;
 
-        if stop_words_updated || facets_updated || synonyms_updated || searchable_updated {
+        if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated {
             self.reindex(&progress_callback, old_fields_ids_map)?;
         }
+
         Ok(())
     }
 }
@@ -444,7 +455,7 @@ mod tests {
     use maplit::{btreeset, hashmap, hashset};
     use big_s::S;
 
-    use crate::{Criterion, FilterCondition};
+    use crate::{Criterion, FilterCondition, SearchResult};
     use crate::update::{IndexDocuments, UpdateFormat};
     use super::*;
 
@@ -669,6 +680,88 @@ mod tests {
         assert_eq!(count, 4);
     }
 
+    #[test]
+    fn set_asc_desc_field() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Order the documents by ascending age.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        // Don't display the generated `id` field.
+        builder.set_displayed_fields(vec![S("name"), S("age")]);
+        builder.set_criteria(vec![S("asc(age)")]);
+        builder.execute(|_, _| ()).unwrap();
+
+        // Then index some documents.
+        let content = &br#"[
+            { "name": "kevin", "age": 23 },
+            { "name": "kevina", "age": 21 },
+            { "name": "benoit", "age": 34 }
+        ]"#[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
+        builder.update_format(UpdateFormat::Json);
+        builder.enable_autogenerate_docids();
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Run an empty query just to ensure that the search results are ordered.
+        let rtxn = index.read_txn().unwrap();
+        let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
+        let documents = index.documents(&rtxn, documents_ids).unwrap();
+
+        // Fetch the documents "age" field in the order in which the documents appear.
+        let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap();
+        let iter = documents.into_iter().map(|(_, doc)| {
+            let bytes = doc.get(age_field_id).unwrap();
+            let string = std::str::from_utf8(bytes).unwrap();
+            string.parse::<u32>().unwrap()
+        });
+
+        assert_eq!(iter.collect::<Vec<_>>(), vec![21, 23, 34]);
+    }
+
+    #[test]
+    fn set_distinct_field() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Make `age` the distinct field.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        // Don't display the generated `id` field.
+        builder.set_displayed_fields(vec![S("name"), S("age")]);
+        builder.set_distinct_field(S("age"));
+        builder.execute(|_, _| ()).unwrap();
+
+        // Then index some documents.
+        let content = &br#"[
+            { "name": "kevin", "age": 23 },
+            { "name": "kevina", "age": 21 },
+            { "name": "benoit", "age": 34 },
+            { "name": "bernard", "age": 34 },
+            { "name": "bertrand", "age": 34 },
+            { "name": "bernie", "age": 34 },
+            { "name": "ben", "age": 34 }
+        ]"#[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
+        builder.update_format(UpdateFormat::Json);
+        builder.enable_autogenerate_docids();
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Run an empty query just to ensure that distinct is applied to the results.
+        let rtxn = index.read_txn().unwrap();
+        let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
+
+        // Only one document per distinct age must remain: ages 21, 23, and 34.
+        assert_eq!(documents_ids.len(), 3);
+    }
+
     #[test]
     fn default_stop_words() {
         let path = tempfile::tempdir().unwrap();

From 3db25153e5e3641c2fbd5e793dce359b6de326e7 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 2 Jun 2021 17:00:58 +0200
Subject: [PATCH 0746/1889] fix the faceted_fields one last time

---
 benchmarks/benches/songs.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs
index dea8cd605..3f2822ca3 100644
--- a/benchmarks/benches/songs.rs
+++ b/benchmarks/benches/songs.rs
@@ -21,14 +21,14 @@ fn base_conf(builder: &mut Settings) {
     builder.set_searchable_fields(searchable_fields);
 
     let faceted_fields = [
-        ("released-timestamp", "number"),
-        ("duration-float", "number"),
-        ("genre", "string"),
-        ("country", "string"),
-        ("artist", "string"),
+        "released-timestamp",
+        "duration-float",
+        "genre",
+        "country",
+        "artist",
     ]
     .iter()
-    .map(|(a, b)| (a.to_string(), b.to_string()))
+    .map(|s| s.to_string())
    .collect();
     builder.set_faceted_fields(faceted_fields);
 }

From 087ae648997e51bad96dff8ffa83991e864f2415 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 2 Jun 2021 17:03:30 +0200
Subject: [PATCH 0747/1889] add a gitignore to avoid pushing the autogenerated
 file

---
 benchmarks/.gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 benchmarks/.gitignore

diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
new file mode 100644
index 000000000..1f259516b
--- /dev/null
+++ b/benchmarks/.gitignore
@@ -0,0 +1 @@
+benches/datasets_paths.rs

From 6dc08bf45e768bdf9ef4a87b7e8dc7b0346a99e1 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 2 Jun 2021 17:09:21 +0200
Subject: [PATCH 0748/1889] remove the nop function

---
 benchmarks/benches/utils.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs
index 6fa5f2d19..83367a7ca 100644
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -29,15 +29,13 @@ pub struct Conf<'a> {
 }
 
 impl Conf<'_> {
-    fn nop(_builder: &mut Settings) {}
-
     pub const BASE: Self = Conf {
         database_name: "benches.mmdb",
         dataset: "",
         group_name: "",
         queries: &[],
         criterion: None,
-        configure: Self::nop,
+        configure: |_| (),
         facet_condition: None,
         optional_words: true,
         primary_key: None,

From 26a9974667fc10cec2e27557f52e2e00c953642b Mon Sep 17 00:00:00 2001
From: many
Date: Wed, 2 Jun 2021 16:30:56 +0200
Subject: [PATCH 0749/1889] Make asc/desc criterion return the remaining
 documents

Fix #161.2
---
 milli/src/search/criteria/asc_desc.rs | 24 +++++++++++++++++++-----
 milli/src/search/criteria/mod.rs      |  4 ++--
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs
index
c80bb38f1..f90f3e421 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -24,6 +24,7 @@ pub struct AscDesc<'t> { ascending: bool, query_tree: Option, candidates: Box> + 't>, + allowed_candidates: RoaringBitmap, bucket_candidates: RoaringBitmap, faceted_candidates: RoaringBitmap, parent: Box, @@ -68,6 +69,7 @@ impl<'t> AscDesc<'t> { ascending, query_tree: None, candidates: Box::new(std::iter::empty()), + allowed_candidates: RoaringBitmap::new(), faceted_candidates: index.number_faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), parent, @@ -78,6 +80,9 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. + self.allowed_candidates -= params.excluded_candidates; + loop { debug!( "Facet {}({}) iteration", @@ -86,18 +91,25 @@ impl<'t> Criterion for AscDesc<'t> { ); match self.candidates.next().transpose()? { + None if !self.allowed_candidates.is_empty() => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(take(&mut self.allowed_candidates)), + filtered_candidates: None, + bucket_candidates: Some(take(&mut self.bucket_candidates)), + })); + }, None => { match self.parent.next(params)? { Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { self.query_tree = query_tree; let mut candidates = match (&self.query_tree, candidates) { - (_, Some(candidates)) => candidates & &self.faceted_candidates, + (_, Some(candidates)) => candidates, (Some(qt), None) => { let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - let candidates = resolve_query_tree(&context, qt, params.wdcache)?; - candidates & &self.faceted_candidates + resolve_query_tree(&context, qt, params.wdcache)? }, - (None, None) => take(&mut self.faceted_candidates), + (None, None) => self.index.documents_ids(self.rtxn)?, }; if let Some(filtered_candidates) = filtered_candidates { @@ -113,12 +125,13 @@ impl<'t> Criterion for AscDesc<'t> { continue; } + self.allowed_candidates = &candidates - params.excluded_candidates; self.candidates = facet_ordered( self.index, self.rtxn, self.field_id, self.ascending, - candidates, + candidates & &self.faceted_candidates, )?; }, None => return Ok(None), @@ -126,6 +139,7 @@ impl<'t> Criterion for AscDesc<'t> { }, Some(mut candidates) => { candidates -= params.excluded_candidates; + self.allowed_candidates -= &candidates; return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), candidates: Some(candidates), diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 456d16e1a..e4ca66b2c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -203,14 +203,14 @@ impl<'t> CriteriaBuilder<'t> { &'t self, query_tree: Option, primitive_query: Option>, - facet_candidates: Option, + filtered_candidates: Option, ) -> anyhow::Result> { use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); - let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box; + let mut criterion = Box::new(Initial::new(query_tree, filtered_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? 
{ criterion = match name { Name::Typo => Box::new(Typo::new(self, criterion)), From 6b7841fefc7cbecab405c6e62d13eb306ef09675 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 3 Jun 2021 10:29:21 +0200 Subject: [PATCH 0750/1889] Make sure that the benchmarks always compile --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ff8342620..a6b0db650 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,6 +37,11 @@ jobs: with: command: test + - uses: actions-rs/cargo@v1 + with: + command: bench + args: --no-run -p benchmarks + # - uses: actions-rs/cargo@v1 # with: # command: bench From 82fb5f0bef9899a28b1510c3b10a5e0cb1fabf30 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 3 Jun 2021 10:33:42 +0200 Subject: [PATCH 0751/1889] Fix the benchmarks compilation --- benchmarks/benches/songs.rs | 8 ++++---- benchmarks/benches/utils.rs | 16 ++++++++-------- benchmarks/benches/wiki.rs | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs index 3f2822ca3..e5da16a99 100644 --- a/benchmarks/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -30,7 +30,7 @@ fn base_conf(builder: &mut Settings) { .iter() .map(|s| s.to_string()) .collect(); - builder.set_faceted_fields(faceted_fields); + builder.set_filterable_fields(faceted_fields); } const BASE_CONF: Conf = Conf { @@ -156,17 +156,17 @@ fn bench_songs(c: &mut criterion::Criterion) { /* we bench the filters with the default request */ utils::Conf { group_name: "basic filter: <=", - facet_condition: Some("released-timestamp <= 946728000"), // year 2000 + filter: Some("released-timestamp <= 946728000"), // year 2000 ..BASE_CONF }, utils::Conf { group_name: "basic filter: TO", - facet_condition: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010 + filter: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010 ..BASE_CONF }, utils::Conf { group_name: "big filter", - facet_condition: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"), + filter: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"), ..BASE_CONF }, diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 83367a7ca..be66d7f84 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -4,7 +4,7 @@ use criterion::BenchmarkId; use heed::EnvOpenOptions; use milli::{ update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, - FacetCondition, Index, + FilterCondition, Index, }; pub struct Conf<'a> { @@ -21,7 +21,7 @@ pub struct Conf<'a> { pub criterion: Option<&'a [&'a str]>, /// the last chance to configure your database as you want pub configure: fn(&mut Settings), - pub facet_condition: Option<&'a str>, + pub filter: Option<&'a str>, /// enable or disable the optional words on the query pub optional_words: bool, /// primary key, if there is None we'll auto-generate docids for every documents @@ -36,7 +36,7 @@ impl Conf<'_> { queries: &[], criterion: None, configure: |_| (), - facet_condition: None, + filter: None, optional_words: true, primary_key: None, }; @@ -64,7 +64,7 @@ pub fn base_setup(conf: &Conf) -> Index { let mut builder = update_builder.settings(&mut wtxn, 
&index); if let Some(criterion) = conf.criterion { - builder.reset_faceted_fields(); + builder.reset_filterable_fields(); builder.reset_criteria(); builder.reset_stop_words(); @@ -105,10 +105,10 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let rtxn = index.read_txn().unwrap(); let mut search = index.search(&rtxn); search.query(query).optional_words(conf.optional_words); - if let Some(facet_condition) = conf.facet_condition { - let facet_condition = - FacetCondition::from_str(&rtxn, &index, facet_condition).unwrap(); - search.facet_condition(facet_condition); + if let Some(filter) = conf.filter { + let filter = + FilterCondition::from_str(&rtxn, &index, filter).unwrap(); + search.filter(filter); } let _ids = search.execute().unwrap(); }); diff --git a/benchmarks/benches/wiki.rs b/benchmarks/benches/wiki.rs index 99ecff2ce..11ffe87d5 100644 --- a/benchmarks/benches/wiki.rs +++ b/benchmarks/benches/wiki.rs @@ -83,7 +83,7 @@ fn bench_songs(c: &mut criterion::Criterion) { group_name: "words", queries: &[ "the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results - "Kameya Tokujirō mingus monk ", // two words to pop, 55 + "Kameya Tokujirō mingus monk ", // two words to pop, 55 "Ulrich Hensel meilisearch milli ", // two words to pop, 306 "Idaho Bellevue pizza ", // one word to pop, 800 "Abraham machin ", // one word to pop, 1141 From 57898d8a907b939f4bef661c3a7e3e9db58745b9 Mon Sep 17 00:00:00 2001 From: marin postma Date: Wed, 2 Jun 2021 19:05:12 +0200 Subject: [PATCH 0752/1889] fix silent deserialize error --- milli/src/update/index_documents/transform.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index ced5fe2c7..fd508d6a4 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -95,7 +95,11 @@ impl Transform<'_, '_> { // We extract the primary key from the first document in // the batch if it hasn't already been defined in the index - let first = documents.peek().and_then(|r| r.as_ref().ok()); + let first = match documents.peek().map(Result::as_ref).transpose() { + Ok(first) => first, + Err(_) => return Err(documents.next().unwrap().unwrap_err().into()), + }; + let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); let (primary_key_id, primary_key) = compute_primary_key_pair( self.index.primary_key(self.rtxn)?, @@ -236,7 +240,7 @@ impl Transform<'_, '_> { // The primary key is known so we must find the position in the CSV headers. 
headers.iter().position(|h| h == primary_key) }, - None => headers.iter().position(|f| is_primary_key(&f)), + None => headers.iter().position(is_primary_key), }; // Returns the field id in the fields ids map, create an "id" field From 99b45d2aa03f578ee46d886d906e830d356aaea2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 3 Jun 2021 10:56:01 +0200 Subject: [PATCH 0753/1889] Make sure that all the workspaces crates compile --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a6b0db650..dc2a7b1c6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -29,6 +29,11 @@ jobs: override: true components: rustfmt, clippy + - uses: actions-rs/cargo@v1 + with: + command: check + args: --all + - uses: actions-rs/cargo@v1 with: command: build From 3b2b3aeea9dff2eba08b31068c259e057b36cd5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 3 Jun 2021 12:24:27 +0200 Subject: [PATCH 0754/1889] Update Cargo.toml for next release v0.3.0 --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 0afd05b13..947bd5149 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.2.1" +version = "0.3.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 36745d567..de95ce3a6 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.2.1" +version = "0.3.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index c94ca63e2..53b1b37bd 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.2.1" +version = "0.3.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2af6a9042..abcded8c9 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.2.1" +version = "0.3.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 1b7cd3a45..d0baca33f 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.2.1" +version = "0.3.0" authors = ["Clément Renault "] edition = "2018" From 10882bcbceb58b706f3bc7ae0b1818f48b02d1db Mon Sep 17 00:00:00 2001 From: many Date: Thu, 3 Jun 2021 14:44:53 +0200 Subject: [PATCH 0755/1889] Introduce integration test on criteria --- milli/tests/assets/test_set.ndjson | 17 + milli/tests/mod.rs | 1 + milli/tests/search/mod.rs | 137 +++++++ milli/tests/search/query_criteria.rs | 531 +++++++++++++++++++++++++++ 4 files changed, 686 insertions(+) create mode 100644 milli/tests/assets/test_set.ndjson create mode 100644 milli/tests/mod.rs create mode 100644 milli/tests/search/mod.rs create mode 100644 milli/tests/search/query_criteria.rs diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson new file mode 100644 index 000000000..f219ab7e9 --- /dev/null +++ b/milli/tests/assets/test_set.ndjson @@ -0,0 +1,17 @@ +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"title":"hell 
o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} +{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"title":"hello world 
america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} +{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} +{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","":""} diff --git a/milli/tests/mod.rs b/milli/tests/mod.rs new file mode 100644 index 000000000..11095a6a9 --- /dev/null +++ b/milli/tests/mod.rs @@ -0,0 +1 @@ +mod search; diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs new file mode 100644 index 000000000..609787712 --- /dev/null +++ b/milli/tests/search/mod.rs @@ -0,0 +1,137 @@ +use milli::{Criterion, Index, DocumentId}; +use milli::update::{IndexDocuments, UpdateFormat, Settings}; + +use big_s::S; +use heed::EnvOpenOptions; +use maplit::{hashmap, hashset}; +use serde::Deserialize; +use slice_group_by::GroupBy; + +mod query_criteria; + +pub const TEST_QUERY: &'static str = "hello world america"; + +pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; + +pub const CONTENT: &str = include_str!("../assets/test_set.ndjson"); + +pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut builder = Settings::new(&mut wtxn, &index, 0); + + let criteria = criteria.iter().map(|c| c.to_string()).collect(); + builder.set_criteria(criteria); + builder.set_filterable_fields(hashset!{ + S("tag"), + S("unexisting_field"), + S("asc_desc_rank"), + S("unexisting_field"), + }); + builder.set_synonyms(hashmap!{ + S("hello") => vec![S("good morning")], + S("world") => vec![S("earth")], + S("america") => vec![S("the united states")], + }); + builder.set_searchable_fields(vec![S("title"),S("description")]); + builder.execute(|_, _| ()).unwrap(); + + // index documents + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::JsonStream); + builder.enable_autogenerate_docids(); + builder.execute(CONTENT.as_bytes(), |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + + index +} + +#[allow(dead_code)] +pub fn external_to_internal_ids(index: &Index, external_ids: &[&str]) -> Vec { + let mut rtxn = index.read_txn().unwrap(); + let docid_map = index.external_documents_ids(&mut rtxn).unwrap(); + external_ids.iter().map(|id| docid_map.get(id).unwrap()).collect() +} + +pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec { + let mut rtxn = index.read_txn().unwrap(); + let docid_map = index.external_documents_ids(&mut rtxn).unwrap(); + let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), 
id)).collect(); + internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() +} + +fn fetch_dataset() -> Vec { + serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect() +} + +pub fn expected_order(criteria: &[Criterion], autorize_typo: bool, optional_words: bool) -> Vec { + let dataset = fetch_dataset(); + let mut groups: Vec> = vec![dataset]; + + for criterion in criteria { + let mut new_groups = Vec::new(); + for group in groups.iter_mut() { + match criterion { + Criterion::Attribute => { + group.sort_by_key(|d| d.attribute_rank); + new_groups.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); + }, + Criterion::Exactness => { + group.sort_by_key(|d| d.exact_rank); + new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from)); + }, + Criterion::Proximity => { + group.sort_by_key(|d| d.proximity_rank); + new_groups.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); + }, + Criterion::Typo => { + group.sort_by_key(|d| d.typo_rank); + new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); + }, + Criterion::Words => { + group.sort_by_key(|d| d.word_rank); + new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from)); + }, + Criterion::Asc(_) => { + group.sort_by_key(|d| d.asc_desc_rank); + new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); + }, + Criterion::Desc(_) => { + group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); + new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); + }, + } + } + groups = std::mem::take(&mut new_groups); + } + + if autorize_typo && optional_words { + groups.into_iter().flatten().collect() + } else if optional_words { + groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect() + } else if autorize_typo { + groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect() + } else { + groups.into_iter().flatten().filter(|d| d.word_rank == 0 && d.typo_rank == 0).collect() + } +} + +#[derive(Debug, Clone, Deserialize)] +pub struct TestDocument { + pub id: String, + pub word_rank: u32, + pub typo_rank: u32, + pub proximity_rank: u32, + pub attribute_rank: u32, + pub exact_rank: u32, + pub asc_desc_rank: u32, + pub title: String, + pub description: String, + pub tag: String, +} diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs new file mode 100644 index 000000000..11d9b9e49 --- /dev/null +++ b/milli/tests/search/query_criteria.rs @@ -0,0 +1,531 @@ +use milli::{Search, SearchResult, Criterion}; +use big_s::S; + +use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; + +#[test] +fn none() { + let criteria = vec![]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn words() { + let criteria = vec![Criterion::Words]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + 
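    // A sketch of what `search::expected_order` computes for [Criterion::Words]
    // (ranks taken from test_set.ndjson): documents are grouped by `word_rank`
    // and the groups are flattened in order; the order inside a group is left
    // to the remaining criteria, if any.
    //
    //     word_rank 0: A, C, D, I, K, L, M, N, O, P
    //     word_rank 1: G, H, J, Q
    //     word_rank 2: B, E, F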
let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn attribute() { + let criteria = vec![Criterion::Attribute]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn exactness() { + let criteria = vec![Criterion::Exactness]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn proximity() { + let criteria = vec![Criterion::Proximity]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn typo() { + let criteria = vec![Criterion::Typo]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn asc() { + let criteria = vec![Criterion::Asc(S("asc_desc_rank"))]; + let 
index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn desc() { + let criteria = vec![Criterion::Desc(S("asc_desc_rank"))]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn none_0_typo() { + let criteria = vec![]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.authorize_typos(false); + search.optional_words(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn attribute_0_typo() { + let criteria = vec![Criterion::Attribute]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + search.authorize_typos(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn exactness_0_typo() { + let criteria = vec![Criterion::Exactness]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + search.authorize_typos(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, 
false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn proximity_0_typo() { + let criteria = vec![Criterion::Proximity]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + search.authorize_typos(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn typo_0_typo() { + let criteria = vec![Criterion::Typo]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + search.authorize_typos(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn asc_0_typo() { + let criteria = vec![Criterion::Asc(S("asc_desc_rank"))]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + search.authorize_typos(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn desc_0_typo() { + let criteria = vec![Criterion::Desc(S("asc_desc_rank"))]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + search.authorize_typos(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn test_desc_on_unexisting_field_should_return_all_1() { + let criteria = vec![Criterion::Desc(S("unexisting_field"))]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + 
+ let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + search.authorize_typos(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let criteria = vec![]; + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn test_asc_on_unexisting_field_should_return_all_1() { + let criteria = vec![Criterion::Asc(S("unexisting_field"))]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(false); + search.authorize_typos(false); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let criteria = vec![]; + let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn test_desc_on_unexisting_field_should_return_all_2() { + let criteria = vec![Criterion::Desc(S("unexisting_field"))]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let criteria = vec![]; + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn test_asc_on_unexisting_field_should_return_all_2() { + let criteria = vec![Criterion::Asc(S("unexisting_field"))]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let criteria = vec![]; + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); +} + +#[test] +fn criteria_mixup() { + use Criterion::*; + let index = search::setup_search_index_with_criteria(&vec![Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo]); + + + let criteria_mix = { + let desc = || Desc(S("asc_desc_rank")); + // all possible criteria order + vec![ + vec![Words, Attribute, desc(), Exactness, Proximity, Typo], + vec![Words, Attribute, desc(), Exactness, Typo, Proximity], + vec![Words, Attribute, desc(), Proximity, Exactness, Typo], + vec![Words, Attribute, 
desc(), Proximity, Typo, Exactness], + vec![Words, Attribute, desc(), Typo, Exactness, Proximity], + vec![Words, Attribute, desc(), Typo, Proximity, Exactness], + vec![Words, Attribute, Exactness, desc(), Proximity, Typo], + vec![Words, Attribute, Exactness, desc(), Typo, Proximity], + vec![Words, Attribute, Exactness, Proximity, desc(), Typo], + vec![Words, Attribute, Exactness, Proximity, Typo, desc()], + vec![Words, Attribute, Exactness, Typo, desc(), Proximity], + vec![Words, Attribute, Exactness, Typo, Proximity, desc()], + vec![Words, Attribute, Proximity, desc(), Exactness, Typo], + vec![Words, Attribute, Proximity, desc(), Typo, Exactness], + vec![Words, Attribute, Proximity, Exactness, desc(), Typo], + vec![Words, Attribute, Proximity, Exactness, Typo, desc()], + vec![Words, Attribute, Proximity, Typo, desc(), Exactness], + vec![Words, Attribute, Proximity, Typo, Exactness, desc()], + vec![Words, Attribute, Typo, desc(), Exactness, Proximity], + vec![Words, Attribute, Typo, desc(), Proximity, Exactness], + vec![Words, Attribute, Typo, Exactness, desc(), Proximity], + vec![Words, Attribute, Typo, Exactness, Proximity, desc()], + vec![Words, Attribute, Typo, Proximity, desc(), Exactness], + vec![Words, Attribute, Typo, Proximity, Exactness, desc()], + vec![Words, desc(), Attribute, Exactness, Proximity, Typo], + vec![Words, desc(), Attribute, Exactness, Typo, Proximity], + vec![Words, desc(), Attribute, Proximity, Exactness, Typo], + vec![Words, desc(), Attribute, Proximity, Typo, Exactness], + vec![Words, desc(), Attribute, Typo, Exactness, Proximity], + vec![Words, desc(), Attribute, Typo, Proximity, Exactness], + vec![Words, desc(), Exactness, Attribute, Proximity, Typo], + vec![Words, desc(), Exactness, Attribute, Typo, Proximity], + vec![Words, desc(), Exactness, Proximity, Attribute, Typo], + vec![Words, desc(), Exactness, Proximity, Typo, Attribute], + vec![Words, desc(), Exactness, Typo, Attribute, Proximity], + vec![Words, desc(), Exactness, Typo, Proximity, Attribute], + vec![Words, desc(), Proximity, Attribute, Exactness, Typo], + vec![Words, desc(), Proximity, Attribute, Typo, Exactness], + vec![Words, desc(), Proximity, Exactness, Attribute, Typo], + vec![Words, desc(), Proximity, Exactness, Typo, Attribute], + vec![Words, desc(), Proximity, Typo, Attribute, Exactness], + vec![Words, desc(), Proximity, Typo, Exactness, Attribute], + vec![Words, desc(), Typo, Attribute, Exactness, Proximity], + vec![Words, desc(), Typo, Attribute, Proximity, Exactness], + vec![Words, desc(), Typo, Exactness, Attribute, Proximity], + vec![Words, desc(), Typo, Exactness, Proximity, Attribute], + vec![Words, desc(), Typo, Proximity, Attribute, Exactness], + vec![Words, desc(), Typo, Proximity, Exactness, Attribute], + vec![Words, Exactness, Attribute, desc(), Proximity, Typo], + vec![Words, Exactness, Attribute, desc(), Typo, Proximity], + vec![Words, Exactness, Attribute, Proximity, desc(), Typo], + vec![Words, Exactness, Attribute, Proximity, Typo, desc()], + vec![Words, Exactness, Attribute, Typo, desc(), Proximity], + vec![Words, Exactness, Attribute, Typo, Proximity, desc()], + vec![Words, Exactness, desc(), Attribute, Proximity, Typo], + vec![Words, Exactness, desc(), Attribute, Typo, Proximity], + vec![Words, Exactness, desc(), Proximity, Attribute, Typo], + vec![Words, Exactness, desc(), Proximity, Typo, Attribute], + vec![Words, Exactness, desc(), Typo, Attribute, Proximity], + vec![Words, Exactness, desc(), Typo, Proximity, Attribute], + vec![Words, Exactness, Proximity, 
Attribute, desc(), Typo],
+ vec![Words, Exactness, Proximity, Attribute, Typo, desc()],
+ vec![Words, Exactness, Proximity, desc(), Attribute, Typo],
+ vec![Words, Exactness, Proximity, desc(), Typo, Attribute],
+ vec![Words, Exactness, Proximity, Typo, Attribute, desc()],
+ vec![Words, Exactness, Proximity, Typo, desc(), Attribute],
+ vec![Words, Exactness, Typo, Attribute, desc(), Proximity],
+ vec![Words, Exactness, Typo, Attribute, Proximity, desc()],
+ vec![Words, Exactness, Typo, desc(), Attribute, Proximity],
+ vec![Words, Exactness, Typo, desc(), Proximity, Attribute],
+ vec![Words, Exactness, Typo, Proximity, Attribute, desc()],
+ vec![Words, Exactness, Typo, Proximity, desc(), Attribute],
+ vec![Words, Proximity, Attribute, desc(), Exactness, Typo],
+ vec![Words, Proximity, Attribute, desc(), Typo, Exactness],
+ vec![Words, Proximity, Attribute, Exactness, desc(), Typo],
+ vec![Words, Proximity, Attribute, Exactness, Typo, desc()],
+ vec![Words, Proximity, Attribute, Typo, desc(), Exactness],
+ vec![Words, Proximity, Attribute, Typo, Exactness, desc()],
+ vec![Words, Proximity, desc(), Attribute, Exactness, Typo],
+ vec![Words, Proximity, desc(), Attribute, Typo, Exactness],
+ vec![Words, Proximity, desc(), Exactness, Attribute, Typo],
+ vec![Words, Proximity, desc(), Exactness, Typo, Attribute],
+ vec![Words, Proximity, desc(), Typo, Attribute, Exactness],
+ vec![Words, Proximity, desc(), Typo, Exactness, Attribute],
+ vec![Words, Proximity, Exactness, Attribute, desc(), Typo],
+ vec![Words, Proximity, Exactness, Attribute, Typo, desc()],
+ vec![Words, Proximity, Exactness, desc(), Attribute, Typo],
+ vec![Words, Proximity, Exactness, desc(), Typo, Attribute],
+ vec![Words, Proximity, Exactness, Typo, Attribute, desc()],
+ vec![Words, Proximity, Exactness, Typo, desc(), Attribute],
+ vec![Words, Proximity, Typo, Attribute, desc(), Exactness],
+ vec![Words, Proximity, Typo, Attribute, Exactness, desc()],
+ vec![Words, Proximity, Typo, desc(), Attribute, Exactness],
+ vec![Words, Proximity, Typo, desc(), Exactness, Attribute],
+ vec![Words, Proximity, Typo, Exactness, Attribute, desc()],
+ vec![Words, Proximity, Typo, Exactness, desc(), Attribute],
+ vec![Words, Typo, Attribute, desc(), Exactness, Proximity],
+ vec![Words, Typo, Attribute, desc(), Proximity, Exactness],
+ vec![Words, Typo, Attribute, Exactness, desc(), Proximity],
+ vec![Words, Typo, Attribute, Exactness, Proximity, desc()],
+ vec![Words, Typo, Attribute, Proximity, desc(), Exactness],
+ vec![Words, Typo, Attribute, Proximity, Exactness, desc()],
+ vec![Words, Typo, desc(), Attribute, Exactness, Proximity],
+ vec![Words, Typo, desc(), Attribute, Proximity, Exactness],
+ vec![Words, Typo, desc(), Exactness, Attribute, Proximity],
+ vec![Words, Typo, desc(), Exactness, Proximity, Attribute],
+ vec![Words, Typo, desc(), Proximity, Attribute, Exactness],
+ vec![Words, Typo, desc(), Proximity, Exactness, Attribute],
+ vec![Words, Typo, Exactness, Attribute, desc(), Proximity],
+ vec![Words, Typo, Exactness, Attribute, Proximity, desc()],
+ vec![Words, Typo, Exactness, desc(), Attribute, Proximity],
+ vec![Words, Typo, Exactness, desc(), Proximity, Attribute],
+ vec![Words, Typo, Exactness, Proximity, Attribute, desc()],
+ vec![Words, Typo, Exactness, Proximity, desc(), Attribute],
+ vec![Words, Typo, Proximity, Attribute, desc(), Exactness],
+ vec![Words, Typo, Proximity, Attribute, Exactness, desc()],
+ vec![Words, Typo, Proximity, desc(), Attribute, Exactness],
+ vec![Words, Typo, Proximity, desc(), Exactness, 
Attribute], + vec![Words, Typo, Proximity, Exactness, Attribute, desc()], + vec![Words, Typo, Proximity, Exactness, desc(), Attribute], + ] + }; + + for criteria in criteria_mix { + eprintln!("Testing with criteria order: {:?}", &criteria); + //update criteria + let mut wtxn = index.write_txn().unwrap(); + index.put_criteria(&mut wtxn, &criteria).unwrap(); + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.optional_words(true); + search.authorize_typos(true); + + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); + } +} From 76a2343639b5f49abeef653ed238680ea4ee20dd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 3 Jun 2021 15:39:52 +0200 Subject: [PATCH 0756/1889] Fix the compare script of the benchmarks --- benchmarks/scripts/compare.sh | 48 +++++++++++++++-------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 067772bec..506c94015 100755 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -11,9 +11,9 @@ # Checking that critcmp is installed command -v critcmp > /dev/null 2>&1 if [[ "$?" -ne 0 ]]; then - echo 'You must install critcmp to make this script working.' - echo '$ cargo install critcmp' + echo 'You must install critcmp to make this script work.' echo 'See: https://github.com/BurntSushi/critcmp' + echo ' $ cargo install critcmp' exit 1 fi @@ -21,38 +21,30 @@ if [[ $# -ne 2 ]] then echo 'Need 2 arguments.' echo 'Usage: ' - echo ' $ ./compare.sh file_to_download1 file_to_download2' + echo ' $ ./compare.sh old new' echo 'Ex:' echo ' $ ./compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' exit 1 fi -file1="$1" -file2="$2" +old_file="$1" +new_file="$2" s3_url='https://milli-benchmarks.fra1.digitaloceanspaces.com/critcmp_results' -file1_s3_url="$s3_url/$file1" -file2_s3_url="$s3_url/$file2" -file1_local_path="/tmp/$file1" -file2_local_path="/tmp/$file2" -if [[ ! -f "$file1_local_path" ]]; then - curl "$file1_s3_url" -O "$file1_local_path" - if [[ "$?" -ne 0 ]]; then - echo 'curl command failed.' - exit 1 +for file in $old_file $new_file +do + file_s3_url="$s3_url/$file" + file_local_path="/tmp/$file" + + if [[ ! -f $file_local_path ]]; then + curl $file_s3_url --output $file_local_path --silent + if [[ "$?" -ne 0 ]]; then + echo 'curl command failed.' + exit 1 + fi fi -else - echo "$file1 already present in /tmp, no need to download." -fi +done -if [[ ! -f "$file2_local_path" ]]; then - curl "$file2_s3_url" -O "$file2_local_path" - if [[ "$?" -ne 0 ]]; then - echo 'curl command failed.' - exit 1 - fi -else - echo "$file2 already present in /tmp, no need to download." -fi - -critcmp --color always "$file1_local_path" "$file2_local_path" +# Print the diff changes between the old and new benchmarks +# by only displaying the lines that have a diff of more than 5%. 
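+# For example, with the two files from the usage example above this expands to:
+#   critcmp --threshold 5 /tmp/songs_main_09a4321.json /tmp/songs_geosearch_24ec456.json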
+critcmp --threshold 5 "/tmp/$old_file" "/tmp/$new_file" From 29824d05ab0a7db8f72ee984b2d22bb2c5bfb807 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 3 Jun 2021 15:59:43 +0200 Subject: [PATCH 0757/1889] Reduce the length of the benchmarks names --- Cargo.lock | 10 +++++----- benchmarks/Cargo.toml | 2 +- benchmarks/benches/utils.rs | 8 +++++--- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 04fd284c6..575f582bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -900,7 +900,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "byte-unit", @@ -954,7 +954,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "askama", @@ -1095,7 +1095,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "byte-unit", @@ -1358,7 +1358,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "big_s", @@ -2215,7 +2215,7 @@ dependencies = [ [[package]] name = "search" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "byte-unit", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 6be9c79d1..ed366022c 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -10,7 +10,7 @@ milli = { path = "../milli" } [dev-dependencies] heed = "*" # we want to use the version milli uses -criterion = "0.3.4" +criterion = { version = "0.3.4", features = ["html_reports"] } [build-dependencies] anyhow = "1.0" diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index be66d7f84..5138de4d2 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,4 +1,5 @@ use std::fs::{create_dir_all, remove_dir_all, File}; +use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; @@ -97,7 +98,9 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { for conf in confs { let index = base_setup(conf); - let mut group = c.benchmark_group(&format!("{}: {}", conf.dataset, conf.group_name)); + let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap(); + let name = format!("{}: {}", file_name, conf.group_name); + let mut group = c.benchmark_group(&name); for &query in conf.queries { group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { @@ -106,8 +109,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let mut search = index.search(&rtxn); search.query(query).optional_words(conf.optional_words); if let Some(filter) = conf.filter { - let filter = - FilterCondition::from_str(&rtxn, &index, filter).unwrap(); + let filter = FilterCondition::from_str(&rtxn, &index, filter).unwrap(); search.filter(filter); } let _ids = search.execute().unwrap(); From 70229f07c8b9c5d4e3b6361c7adb9d58f069434f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 3 Jun 2021 16:22:43 +0200 Subject: [PATCH 0758/1889] Update Cargo.lock --- Cargo.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 04fd284c6..575f582bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -900,7 +900,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "byte-unit", @@ -954,7 +954,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "askama", @@ 
-1095,7 +1095,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "byte-unit", @@ -1358,7 +1358,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "big_s", @@ -2215,7 +2215,7 @@ dependencies = [ [[package]] name = "search" -version = "0.2.1" +version = "0.3.0" dependencies = [ "anyhow", "byte-unit", From e9104a0a32b3fa69f1d323010aebddd101dc1a72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 3 Jun 2021 16:23:59 +0200 Subject: [PATCH 0759/1889] Add --locked in CI tests --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index dc2a7b1c6..4ee17a239 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -32,7 +32,7 @@ jobs: - uses: actions-rs/cargo@v1 with: command: check - args: --all + args: --all --locked - uses: actions-rs/cargo@v1 with: From 38ab541f4a299b71a76113b9ab19974fb0ff316f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Fri, 4 Jun 2021 00:21:39 +0200 Subject: [PATCH 0760/1889] Make the benchmark command more convenient in CI --- .github/workflows/benchmarks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index a2da8e6d5..e110c6be5 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -68,4 +68,4 @@ jobs: echo "${{ steps.file.outputs.basename }}.json has just been pushed." echo 'How to compare this benchmark with another one?' echo ' - Check the available files with: ./benchmarks/scripts/list.sh' - echo " - Run the following command: ./benchmaks/scipts/compare.sh ${{ steps.file.outputs.basename }}.json " + echo " - Run the following command: ./benchmaks/scipts/compare.sh ${{ steps.file.outputs.basename }}.json" From 563492f1e56683d96ea83e32497c4acbca9972e8 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 7 Jun 2021 17:29:22 +0200 Subject: [PATCH 0761/1889] update the TOC order --- benchmarks/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index ebe8eecdf..3486e6d80 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -3,9 +3,9 @@ Benchmarks ## TOC -- [Datasets](#datasets) - [Run the benchmarks](#run-the-benchmarks) - [Comparison between benchmarks](#comparison-between-benchmarks) +- [Datasets](#datasets) ## Datasets From d912c94034929f18a6c32b80b912633bb0870416 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 7 Jun 2021 14:29:20 +0200 Subject: [PATCH 0762/1889] =?UTF-8?q?improve=20the=20benchmark=E2=80=99s?= =?UTF-8?q?=20readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmarks/README.md | 61 ++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 3486e6d80..e3df18db3 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -7,36 +7,6 @@ Benchmarks - [Comparison between benchmarks](#comparison-between-benchmarks) - [Datasets](#datasets) -## Datasets - -The benchmarks are available for the following datasets: -- `songs` -- `wiki` - -### Songs - -`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). 
- -It was generated with this command: - -```bash -xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv -``` - -_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._ - -### Wiki - -`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wikipedia-articles.csv.gz). - -It was generated with the following command: - -```bash -xsv sample --seed 42 500000 wikipedia-articles.csv -o smol-wikipedia-articles.csv -``` - -_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wikipedia-articles.csv.gz)._ - ## Run the benchmarks ### On our private server @@ -108,3 +78,34 @@ Run the comparison script: ```bash ./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json ``` + +## Datasets + +The benchmarks are available for the following datasets: +- `songs` +- `wiki` + +### Songs + +`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). + +It was generated with this command: + +```bash +xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv +``` + +_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._ + +### Wiki + +`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wiki-articles.csv.gz). + +It was generated with the following command: + +```bash +xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv +``` + +_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wiki-articles.csv.gz)._ + From c82a382b0b6f92a19d68719713a689698feb5ba5 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 8 Jun 2021 11:18:49 +0200 Subject: [PATCH 0763/1889] compile every build.rs with optimization --- Cargo.toml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index ff0b2582a..1358dbe56 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,3 +7,13 @@ debug = true [profile.bench] debug = true + +# Set the settings for build scripts and proc-macros. +[profile.dev.build-override] +opt-level = 3 +[profile.release.build-override] +opt-level = 3 +[profile.bench.build-override] +opt-level = 3 +[profile.test.build-override] +opt-level = 3 From e0c327bae2e49fb136b9a14f20ad999e8cbd7340 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 8 Jun 2021 11:39:10 +0200 Subject: [PATCH 0764/1889] Update Cargo.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- Cargo.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 1358dbe56..868199d8a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,8 @@ debug = true [profile.bench] debug = true -# Set the settings for build scripts and proc-macros. +# Make sure that the build scripts and proc-macros are compiled with +# all the optimizations. It speeds up the zip crate that we use in the build.rs. 
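+# Note that build-override settings only apply to build scripts, proc-macros
+# and their dependencies: the crate's regular code keeps each profile's own settings.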
[profile.dev.build-override] opt-level = 3 [profile.release.build-override] From 1fcc5f73acac427234d5c0d425add2cbb5613da3 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 8 Jun 2021 12:33:02 +0200 Subject: [PATCH 0765/1889] Factorize tests using macro_rules --- milli/tests/search/mod.rs | 5 +- milli/tests/search/query_criteria.rs | 403 +++------------------------ 2 files changed, 45 insertions(+), 363 deletions(-) diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 609787712..60ad3d45f 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -98,14 +98,15 @@ pub fn expected_order(criteria: &[Criterion], autorize_typo: bool, optional_word group.sort_by_key(|d| d.word_rank); new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from)); }, - Criterion::Asc(_) => { + Criterion::Asc(field_name) if field_name == "asc_desc_rank" => { group.sort_by_key(|d| d.asc_desc_rank); new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); }, - Criterion::Desc(_) => { + Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); }, + Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), } } groups = std::mem::take(&mut new_groups); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 11d9b9e49..7e398fceb 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -2,376 +2,57 @@ use milli::{Search, SearchResult, Criterion}; use big_s::S; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; +use Criterion::*; -#[test] -fn none() { - let criteria = vec![]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); +const ALLOW_TYPOS: bool = true; +const DISALLOW_TYPOS: bool = false; +const ALLOW_OPTIONAL_WORDS: bool = true; +const DISALLOW_OPTIONAL_WORDS: bool = false; - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); +macro_rules! test_criterion { + ($func:ident, $optional_word:ident, $authorize_typos:ident $(, $criterion:expr)?) 
=> { + #[test] + fn $func() { + let criteria = vec![$($criterion)?]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.authorize_typos($authorize_typos); + search.optional_words($optional_word); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - assert_eq!(documents_ids, expected_external_ids); + let expected_external_ids: Vec<_> = search::expected_order(&criteria, $authorize_typos, $optional_word).into_iter().map(|d| d.id).collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); + + } + } } -#[test] -fn words() { - let criteria = vec![Criterion::Words]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn attribute() { - let criteria = vec![Criterion::Attribute]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn exactness() { - let criteria = vec![Criterion::Exactness]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn proximity() { - let criteria = vec![Criterion::Proximity]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut 
search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn typo() { - let criteria = vec![Criterion::Typo]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn asc() { - let criteria = vec![Criterion::Asc(S("asc_desc_rank"))]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn desc() { - let criteria = vec![Criterion::Desc(S("asc_desc_rank"))]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn none_0_typo() { - let criteria = vec![]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.authorize_typos(false); - search.optional_words(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn 
attribute_0_typo() { - let criteria = vec![Criterion::Attribute]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - search.authorize_typos(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn exactness_0_typo() { - let criteria = vec![Criterion::Exactness]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - search.authorize_typos(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn proximity_0_typo() { - let criteria = vec![Criterion::Proximity]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - search.authorize_typos(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn typo_0_typo() { - let criteria = vec![Criterion::Typo]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - search.authorize_typos(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn asc_0_typo() { - let criteria = vec![Criterion::Asc(S("asc_desc_rank"))]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - search.authorize_typos(false); - - let SearchResult { matching_words: _matching_words, 
candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn desc_0_typo() { - let criteria = vec![Criterion::Desc(S("asc_desc_rank"))]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - search.authorize_typos(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn test_desc_on_unexisting_field_should_return_all_1() { - let criteria = vec![Criterion::Desc(S("unexisting_field"))]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - search.authorize_typos(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let criteria = vec![]; - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn test_asc_on_unexisting_field_should_return_all_1() { - let criteria = vec![Criterion::Asc(S("unexisting_field"))]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(false); - search.authorize_typos(false); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let criteria = vec![]; - let expected_external_ids: Vec<_> = search::expected_order(&criteria, false, false).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); - - assert_eq!(documents_ids, expected_external_ids); -} - -#[test] -fn test_desc_on_unexisting_field_should_return_all_2() { - let criteria = vec![Criterion::Desc(S("unexisting_field"))]; - let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&mut rtxn, &index); - search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); - - let criteria = vec![]; - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect(); - let documents_ids = search::internal_to_external_ids(&index, 
&documents_ids);
-
- assert_eq!(documents_ids, expected_external_ids);
-}
-
-#[test]
-fn test_asc_on_unexisting_field_should_return_all_2() {
- let criteria = vec![Criterion::Asc(S("unexisting_field"))];
- let index = search::setup_search_index_with_criteria(&criteria);
- let mut rtxn = index.read_txn().unwrap();
-
- let mut search = Search::new(&mut rtxn, &index);
- search.query(search::TEST_QUERY);
- search.limit(EXTERNAL_DOCUMENTS_IDS.len());
-
- let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap();
-
- let criteria = vec![];
- let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect();
- let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
-
- assert_eq!(documents_ids, expected_external_ids);
-}
+test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS);
+test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS);
+test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words);
+test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute);
+test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute);
+test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness);
+test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness);
+test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity);
+test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity);
+test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank")));
+test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank")));
+test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank")));
+test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank")));
+test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field")));
+test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field")));
+test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field")));
+test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field")));
 
 #[test]
 fn criteria_mixup() {
From b64cd2a3e361d43c67d612d045b22b6feeba56b5 Mon Sep 17 00:00:00 2001
From: many
Date: Tue, 8 Jun 2021 14:11:00 +0200
Subject: [PATCH 0766/1889] Resolve PR comments

---
 milli/tests/search/mod.rs            | 13 ++-----
 milli/tests/search/query_criteria.rs | 53 ++++++++++++++--------------
 2 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs
index 60ad3d45f..98102f9e9 100644
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@@ -52,13 +52,6 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
 index
 }
 
-#[allow(dead_code)]
-pub fn external_to_internal_ids(index: &Index, external_ids: &[&str]) -> Vec<u32> {
- let mut rtxn = index.read_txn().unwrap();
- let docid_map = index.external_documents_ids(&mut rtxn).unwrap();
- external_ids.iter().map(|id| docid_map.get(id).unwrap()).collect()
-}
-
 pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> {
 let mut rtxn = index.read_txn().unwrap();
 let docid_map = index.external_documents_ids(&mut rtxn).unwrap();
@@ -70,7 +63,7 @@ fn fetch_dataset() -> Vec<TestDocument> {
 serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect()
 }
 
-pub fn expected_order(criteria: &[Criterion], autorize_typo: bool, optional_words: bool) -> Vec<TestDocument> {
+pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_words: bool) -> Vec<TestDocument> {
 let dataset = fetch_dataset();
 let mut groups: Vec<Vec<TestDocument>> = vec![dataset];
 
@@ -112,11 +105,11 @@ pub fn expected_order(criteria: &[Criterion], autorize_typo: bool, optional_word
 groups = std::mem::take(&mut new_groups);
 }
 
- if autorize_typo && optional_words {
+ if authorize_typo && optional_words {
 groups.into_iter().flatten().collect()
 } else if optional_words {
 groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect()
- } else if autorize_typo {
+ } else if authorize_typo {
 groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect()
 } else {
 groups.into_iter().flatten().filter(|d| d.word_rank == 0 && d.typo_rank == 0).collect()
diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs
index 7e398fceb..160b77c5b 100644
--- a/milli/tests/search/query_criteria.rs
+++ b/milli/tests/search/query_criteria.rs
@@ -23,36 +23,34 @@ macro_rules! test_criterion {
 search.authorize_typos($authorize_typos);
 search.optional_words($optional_word);
 
- let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap();
+ let SearchResult { documents_ids, .. } = search.execute().unwrap();
 
- let expected_external_ids: Vec<_> = search::expected_order(&criteria, $authorize_typos, $optional_word).into_iter().map(|d| d.id).collect();
+ let expected_external_ids: Vec<_> = search::expected_order(&criteria, $authorize_typos, $optional_word)
+ .into_iter()
+ .map(|d| d.id).collect();
 let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
- assert_eq!(documents_ids, expected_external_ids);
+ assert_eq!(documents_ids, expected_external_ids);
 }
 }
}
-
-
-test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS);
-test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS);
-test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words);
-test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute);
-test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute);
-test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness);
-test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness);
-test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity);
-test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity);
-test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank")));
-test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank")));
-test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank")));
-test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank")));
-test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field")));
-test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field")));
-test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, 
ALLOW_TYPOS, Desc(S("unexisting_field"))); -test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field"))); +test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS); +test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS); +test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words); +test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute); +test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute); +test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness); +test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness); +test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity); +test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity); +test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank"))); +test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank"))); +test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank"))); +test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank"))); +test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field"))); +test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field"))); +test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field"))); +test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field"))); #[test] fn criteria_mixup() { @@ -61,6 +59,7 @@ fn criteria_mixup() { let criteria_mix = { + // Criterion doesn't implement Copy, we create a new Criterion using a closure let desc = || Desc(S("asc_desc_rank")); // all possible criteria order vec![ @@ -199,12 +198,12 @@ fn criteria_mixup() { let mut search = Search::new(&mut rtxn, &index); search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(true); - search.authorize_typos(true); + search.optional_words(ALLOW_OPTIONAL_WORDS); + search.authorize_typos(ALLOW_TYPOS); - let SearchResult { matching_words: _matching_words, candidates: _candidates, documents_ids } = search.execute().unwrap(); + let SearchResult { documents_ids, .. 
} = search.execute().unwrap();
 
- let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true).into_iter().map(|d| d.id).collect();
+ let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS).into_iter().map(|d| d.id).collect();
 let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
 
 assert_eq!(documents_ids, expected_external_ids);
 }
}
From afb09c914df05a3579d139b919a98bfa4c851d29 Mon Sep 17 00:00:00 2001
From: Many
Date: Tue, 8 Jun 2021 14:24:17 +0200
Subject: [PATCH 0767/1889] Update milli/tests/search/query_criteria.rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clément Renault
---
 milli/tests/search/mod.rs            | 2 --
 milli/tests/search/query_criteria.rs | 5 ++++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs
index 98102f9e9..20cf13034 100644
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@@ -29,9 +29,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
 builder.set_criteria(criteria);
 builder.set_filterable_fields(hashset!{
 S("tag"),
- S("unexisting_field"),
 S("asc_desc_rank"),
- S("unexisting_field"),
 });
 builder.set_synonyms(hashmap!{
 S("hello") => vec![S("good morning")],
diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs
index 160b77c5b..b6f692674 100644
--- a/milli/tests/search/query_criteria.rs
+++ b/milli/tests/search/query_criteria.rs
@@ -203,7 +203,10 @@ fn criteria_mixup() {
 
 let SearchResult { documents_ids, .. } = search.execute().unwrap();
 
- let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS).into_iter().map(|d| d.id).collect();
+ let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS)
+ .into_iter()
+ .map(|d| d.id)
+ .collect();
 let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
 
 assert_eq!(documents_ids, expected_external_ids);
From b489d699ce647edfa6ccd0ad59aca8b84eb31efb Mon Sep 17 00:00:00 2001
From: many
Date: Tue, 8 Jun 2021 17:29:38 +0200
Subject: [PATCH 0768/1889] Make hard separators split phrase query

Hard separators now split a phrase query, just as double quotes do.

Fix #208
---
 milli/src/search/query_tree.rs | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 03305943b..33708fe76 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,7 +1,9 @@
 use std::{fmt, cmp, mem};
 
 use fst::Set;
-use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
+use meilisearch_tokenizer::token::SeparatorKind;
+use meilisearch_tokenizer::tokenizer::TokenStream;
+use meilisearch_tokenizer::TokenKind;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
 
@@ -467,13 +469,14 @@ fn create_primitive_query(query: TokenStream, stop_words: Option>, wo
 primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
 }
 },
- TokenKind::Separator(_) => {
+ TokenKind::Separator(separator_kind) => {
 let quote_count = token.word.chars().filter(|&s| s == '"').count();
 // swap quoted state if we encounter a double quote
 if quote_count % 2 != 0 {
 quoted = !quoted;
 }
- if !phrase.is_empty() && quote_count > 0 {
+ // if there are any quote or any hard separator we close the phrase.
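+ // e.g. `"hey friends. wooop wooop"` now yields two phrases, ["hey", "friends"]
+ // and ["wooop", "wooop"], because the `.` is a hard separator
+ // (see the phrase_with_hard_separator test below).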
+ if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) {
 primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
 }
 },
@@ -798,6 +801,29 @@ mod test {
 assert_eq!(expected, query_tree);
 }
 
+ #[test]
+ fn phrase_with_hard_separator() {
+ let query = "\"hey friends. wooop wooop\"";
+ let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
+ let result = analyzer.analyze(query);
+ let tokens = result.tokens();
+
+ let expected = Operation::And(vec![
+ Operation::Consecutive(vec![
+ Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
+ Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
+ ]),
+ Operation::Consecutive(vec![
+ Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
+ Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
+ ]),
+ ]);
+
+ let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap();
+
+ assert_eq!(expected, query_tree);
+ }
+
 #[test]
 fn optional_word() {
 let query = "hey my friend ";
From 133ab98260c3f00ee7d8b2c1f2f71de0af6ed783 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 8 Jun 2021 17:33:29 +0200
Subject: [PATCH 0769/1889] Use the index primary key when deleting documents

---
 milli/src/update/delete_documents.rs | 32 +++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index f0f4788fb..c4cf132bb 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -1,7 +1,7 @@
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 
-use anyhow::anyhow;
+use anyhow::{anyhow, Context};
 use chrono::Utc;
 use fst::IntoStreamer;
 use heed::types::{ByteSlice, Unit};
@@ -77,7 +77,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 }
 
 let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
- let id_field = fields_ids_map.id("id").expect(r#"the field "id" to be present"#);
+ let primary_key = self.index.primary_key(self.wtxn)?.context("missing primary key")?;
+ let id_field = fields_ids_map.id(primary_key).expect(r#"the field "id" to be present"#);
 
 let Index {
 env: _env,
@@ -439,7 +440,6 @@ mod tests {
 options.map_size(10 * 1024 * 1024); // 10 MB
 let index = Index::new(options, &path).unwrap();
 
- // First we send 3 documents with an id for only one of them.
 let mut wtxn = index.write_txn().unwrap();
 let content = &br#"[
 { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
@@ -463,4 +463,30 @@ mod tests {
 
 assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
 }
+
+ #[test]
+ fn delete_documents_with_strange_primary_key() {
+ let path = tempfile::tempdir().unwrap();
+ let mut options = EnvOpenOptions::new();
+ options.map_size(10 * 1024 * 1024); // 10 MB
+ let index = Index::new(options, &path).unwrap();
+
+ let mut wtxn = index.write_txn().unwrap();
+ let content = &br#"[
+ { "mysuperid": 0, "name": "kevin" },
+ { "mysuperid": 1, "name": "kevina" },
+ { "mysuperid": 2, "name": "benoit" }
+ ]"#[..];
+ let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+ builder.update_format(UpdateFormat::Json);
+ builder.execute(content, |_, _| ()).unwrap();
+
+ // Delete not all of the documents but some of them.
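+ // The primary key of this index is "mysuperid", not "id": before this
+ // fix, the deletion below panicked while looking up the hardcoded "id" field.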
+ let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap();
+ builder.delete_external_id("0");
+ builder.delete_external_id("1");
+ builder.execute().unwrap();
+
+ wtxn.commit().unwrap();
+ }
 }
From faf148d297b6d692dadce10908c72b3e9f00183d Mon Sep 17 00:00:00 2001
From: Many
Date: Tue, 8 Jun 2021 17:52:37 +0200
Subject: [PATCH 0770/1889] Update milli/src/search/query_tree.rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clément Renault
---
 milli/src/search/query_tree.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 33708fe76..c87ccfe9b 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -475,7 +475,7 @@ fn create_primitive_query(query: TokenStream, stop_words: Option>, wo
 if quote_count % 2 != 0 {
 quoted = !quoted;
 }
- // if there are any quote or any hard separator we close the phrase.
+ // if there is a quote or a hard separator we close the phrase.
 if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) {
 primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
 }
From 103dddba2fab450bd38ca398a37696564d065cd2 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 8 Jun 2021 17:44:25 +0200
Subject: [PATCH 0771/1889] Move the UpdateStore into the http-ui crate

---
 Cargo.lock                             | 2 +-
 http-ui/Cargo.toml                     | 1 +
 http-ui/src/main.rs                    | 6 +++++-
 {milli => http-ui}/src/update_store.rs | 4 +++-
 milli/Cargo.toml                       | 1 -
 milli/src/lib.rs                       | 2 --
 6 files changed, 10 insertions(+), 6 deletions(-)
 rename {milli => http-ui}/src/update_store.rs (99%)

diff --git a/Cargo.lock b/Cargo.lock
index 575f582bc..e3c00257d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -961,6 +961,7 @@ dependencies = [
 "askama_warp",
 "byte-unit",
 "bytes 0.5.6",
+ "crossbeam-channel",
 "either",
 "flate2",
 "fst",
@@ -1365,7 +1366,6 @@ dependencies = [
 "bstr",
 "byteorder",
 "chrono",
- "crossbeam-channel",
 "csv",
 "either",
 "flate2",
diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index de95ce3a6..970d1d9bf 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -8,6 +8,7 @@ edition = "2018"
 [dependencies]
 anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
+crossbeam-channel = "0.5.0"
 grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" }
 heed = "0.10.6"
 meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.2" }
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index b6a894373..1f91e6370 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -1,3 +1,5 @@
+mod update_store;
+
 use std::{io, mem};
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fmt::Display;
@@ -29,10 +31,12 @@
 use tokio::sync::broadcast;
 use warp::{Filter, http::Response};
 use warp::filters::ws::Message;
 
-use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore};
+use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult};
 use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat};
 use milli::update::UpdateIndexingStep::*;
 
+use self::update_store::UpdateStore;
+
 static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
 
 #[derive(Debug, StructOpt)]
diff --git a/milli/src/update_store.rs b/http-ui/src/update_store.rs
similarity index 99%
rename from milli/src/update_store.rs
rename to http-ui/src/update_store.rs
index 7211a6293..122ee6031 100644
--- a/milli/src/update_store.rs
+++ b/http-ui/src/update_store.rs
@@ -1,3 +1,5 @@
+#![allow(unused)]
+
 use std::path::Path;
 use std::sync::Arc;
 
@@ -6,7 +8,7 @@
 use heed::types::{OwnedType, DecodeIgnore, SerdeJson, ByteSlice};
 use heed::{EnvOpenOptions, Env, Database};
 use serde::{Serialize, Deserialize};
 
-use crate::BEU64;
+pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>;
 
 #[derive(Clone)]
 pub struct UpdateStore {
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index abcded8c9..3e755d088 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -9,7 +9,6 @@ anyhow = "1.0.38"
 bstr = "0.2.15"
 byteorder = "1.4.2"
 chrono = { version = "0.4.19", features = ["serde"] }
-crossbeam-channel = "0.5.0"
 csv = "1.1.5"
 either = "1.6.1"
 flate2 = "1.0.20"
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 39e107073..8c1ed514c 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -4,7 +4,6 @@
 mod criterion;
 mod external_documents_ids;
 mod fields_ids_map;
 mod search;
-mod update_store;
 pub mod facet;
 pub mod heed_codec;
 pub mod index;
@@ -29,7 +28,6 @@ pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRo
 pub use self::index::Index;
 pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords};
 pub use self::tree_level::TreeLevel;
-pub use self::update_store::UpdateStore;
 
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
 pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
From 82df524e091aa016626a63c0fc73e532e09b3967 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 8 Jun 2021 17:03:27 +0200
Subject: [PATCH 0772/1889] Make sure that we register the field when setting
 criteria

---
 milli/src/criterion.rs       | 10 ++++++++++
 milli/src/update/settings.rs | 12 +++++++++---
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index 81a2878b3..c2205613d 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -30,6 +30,16 @@ pub enum Criterion {
 Desc(String),
 }
 
+impl Criterion {
+ /// Returns the field name parameter of this criterion.
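+ ///
+ /// e.g. `Asc("asc_desc_rank".to_string())` returns `Some("asc_desc_rank")`,
+ /// while `Words` returns `None`.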
+ pub fn field_name(&self) -> Option<&str> { + match self { + Criterion::Asc(name) | Criterion::Desc(name) => Some(name), + _otherwise => None, + } + } +} + impl FromStr for Criterion { type Err = anyhow::Error; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ef32c5c44..ec4618158 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -8,9 +8,10 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rayon::ThreadPool; use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use crate::{FieldsIdsMap, Index}; -use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; +use crate::criterion::Criterion; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; +use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; +use crate::{FieldsIdsMap, Index}; #[derive(Debug, Clone, PartialEq)] pub enum Setting { @@ -403,12 +404,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_criteria(&mut self) -> anyhow::Result<()> { match self.criteria { Setting::Set(ref fields) => { + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_criteria = Vec::new(); for name in fields { - let criterion = name.parse()?; + let criterion: Criterion = name.parse()?; + if let Some(name) = criterion.field_name() { + fields_ids_map.insert(name).context("field id limit exceeded")?; + } new_criteria.push(criterion); } self.index.put_criteria(self.wtxn, &new_criteria)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } Setting::Reset => { self.index.delete_criteria(self.wtxn)?; } Setting::NotSet => (), From 0bf4f3f48a917b1225f9c5ea574b8a0cadf7db7e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 8 Jun 2021 17:55:08 +0200 Subject: [PATCH 0773/1889] Modify a test to check that criteria additions change the fields ids map --- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ec4618158..1c687e089 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -697,7 +697,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); // Don't display the generated `id` field. - builder.set_displayed_fields(vec![S("name"), S("age")]); + builder.set_displayed_fields(vec![S("name")]); builder.set_criteria(vec![S("asc(age)")]); builder.execute(|_, _| ()).unwrap(); From 7e93811fbcb311cce5085f92bf0e6a507d182577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 8 Jun 2021 18:18:54 +0200 Subject: [PATCH 0774/1889] Update dataset links --- benchmarks/README.md | 8 ++++---- benchmarks/build.rs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index e3df18db3..843ea9b29 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -87,7 +87,7 @@ The benchmarks are available for the following datasets: ### Songs -`songs` is a subset of the [`songs.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/songs.csv.gz). +`songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz). 
It was generated with this command: @@ -95,11 +95,11 @@ It was generated with this command: xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv ``` -_[Download the generated `songs` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-songs.csv.gz)._ +_[Download the generated `songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._ ### Wiki -`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://meili-datasets.s3.fr-par.scw.cloud/wiki-articles.csv.gz). +`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz). It was generated with the following command: @@ -107,5 +107,5 @@ It was generated with the following command: xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv ``` -_[Download the generated `wiki` dataset](https://meili-datasets.s3.fr-par.scw.cloud/benchmarks/smol-wiki-articles.csv.gz)._ +_[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._ diff --git a/benchmarks/build.rs b/benchmarks/build.rs index dc92a1a4c..58300bab9 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -10,7 +10,7 @@ use convert_case::{Case, Casing}; use flate2::read::GzDecoder; use reqwest::IntoUrl; -const BASE_URL: &str = "https://meili-datasets.s3.fr-par.scw.cloud/benchmarks"; +const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets"; const DATASET_SONGS: &str = "smol-songs"; const DATASET_WIKI: &str = "smol-wiki-articles"; From ab696f6a2326e069b372d507dab52678409252a6 Mon Sep 17 00:00:00 2001 From: Many Date: Wed, 9 Jun 2021 10:12:17 +0200 Subject: [PATCH 0775/1889] Update milli/tests/search/query_criteria.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/tests/search/query_criteria.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index b6f692674..f0eecfaba 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -57,7 +57,6 @@ fn criteria_mixup() { use Criterion::*; let index = search::setup_search_index_with_criteria(&vec![Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo]); - let criteria_mix = { // Criterion doesn't implement Copy, we create a new Criterion using a closure let desc = || Desc(S("asc_desc_rank")); From f4ff30e99d09438715605085fade28ef1822b807 Mon Sep 17 00:00:00 2001 From: Many Date: Wed, 9 Jun 2021 10:12:24 +0200 Subject: [PATCH 0776/1889] Update milli/tests/search/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/tests/search/mod.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 20cf13034..8c63e5e08 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -57,12 +57,8 @@ pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> V internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() } -fn fetch_dataset() -> Vec { - serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect() -} - pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_words: bool) -> Vec { - let dataset = fetch_dataset(); + 
let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); let mut groups: Vec> = vec![dataset]; for criterion in criteria { From 86b916b008bdebd5e649353275604d174bdaa57c Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 8 Jun 2021 11:59:44 +0200 Subject: [PATCH 0777/1889] enable optimization in every profile --- Cargo.toml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 868199d8a..c0fa64635 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,11 +2,14 @@ members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"] default-members = ["milli"] +[profile.dev] +opt-level = 3 + [profile.release] debug = true -[profile.bench] -debug = true +[profile.test] +opt-level = 3 # Make sure that the build scripts and proc-macros are compiled with # all the optimizations. It speeds up the zip crate that we use in the build.rs. From dc64e139b9f84b616db639c4d29c8aa5ba22979f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 9 Jun 2021 14:39:21 +0200 Subject: [PATCH 0778/1889] Update version for the next release (v0.3.1) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e3c00257d..ba4a4c60c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -900,7 +900,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.3.0" +version = "0.3.1" dependencies = [ "anyhow", "byte-unit", @@ -954,7 +954,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.3.0" +version = "0.3.1" dependencies = [ "anyhow", "askama", @@ -1096,7 +1096,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.3.0" +version = "0.3.1" dependencies = [ "anyhow", "byte-unit", @@ -1359,7 +1359,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.3.0" +version = "0.3.1" dependencies = [ "anyhow", "big_s", @@ -2215,7 +2215,7 @@ dependencies = [ [[package]] name = "search" -version = "0.3.0" +version = "0.3.1" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 947bd5149..86c027e41 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.3.0" +version = "0.3.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 970d1d9bf..146404772 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.3.0" +version = "0.3.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 53b1b37bd..41c161c07 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.3.0" +version = "0.3.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3e755d088..9fe1ce3d3 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.3.0" +version = "0.3.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index d0baca33f..dbc129bf6 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.3.0" +version = "0.3.1" authors = ["Clément Renault "] edition = "2018" From 
e923a3ed6af5554728a52f05f74ab936b875a143 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 9 Jun 2021 17:28:12 +0200 Subject: [PATCH 0779/1889] Replace Consecutive by Phrase in query tree Replace Consecutive by Phrase in the query tree in order to remove theoretical bugs due to the Consecutive enum type. --- milli/src/search/criteria/attribute.rs | 11 +++- milli/src/search/criteria/mod.rs | 32 +++++------ milli/src/search/criteria/proximity.rs | 69 ++++++++++++++++------- milli/src/search/criteria/typo.rs | 39 ++++++------- milli/src/search/matching_words.rs | 7 ++- milli/src/search/query_tree.rs | 76 ++++++++++++-------------- 6 files changed, 130 insertions(+), 104 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 6818e02fd..f825623f6 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -578,7 +578,6 @@ fn linear_compute_candidates( fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { - let branch_len = branch.len(); let mut branch_rank = Vec::with_capacity(branch_len); for derivates in branch { @@ -661,7 +660,7 @@ fn linear_compute_candidates( // TODO can we keep refs of Query fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { - use crate::search::criteria::Operation::{And, Or, Consecutive}; + use crate::search::criteria::Operation::{And, Or, Phrase}; fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { match tail.split_first() { @@ -683,7 +682,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { fn recurse(op: &Operation) -> FlattenedQueryTree { match op { - And(ops) | Consecutive(ops) => { + And(ops) => { ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) }, Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { @@ -691,6 +690,12 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { } else { ops.iter().map(recurse).flatten().collect() }, + Phrase(words) => { + let queries = words.iter().map(|word| { + vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}] + }).collect(); + vec![queries] + } Operation::Query(query) => vec![vec![vec![query.clone()]]], } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index e4ca66b2c..b14d75ddb 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::borrow::Cow; -use anyhow::bail; use roaring::RoaringBitmap; use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; @@ -239,7 +238,7 @@ pub fn resolve_query_tree<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result<RoaringBitmap> { - use Operation::{And, Consecutive, Or, Query}; + use Operation::{And, Phrase, Or, Query}; match query_tree { And(ops) => { @@ -261,26 +260,23 @@ } Ok(candidates) }, - Consecutive(ops) => { + Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for slice in ops.windows(2) { - match (&slice[0], &slice[1]) { - (Operation::Query(left), Operation::Query(right)) => { - match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? 
{ - pair_docids if pair_docids.is_empty() => { - return Ok(RoaringBitmap::new()) - }, - pair_docids if first_loop => { - candidates = pair_docids; - first_loop = false; - }, - pair_docids => { - candidates.intersect_with(&pair_docids); - }, + for slice in words.windows(2) { + let (left, right) = (&slice[0], &slice[1]); + match ctx.word_pair_proximity_docids(left, right, 1)? { + Some(pair_docids) => { + if pair_docids.is_empty() { + return Ok(RoaringBitmap::new()); + } else if first_loop { + candidates = pair_docids; + first_loop = false; + } else { + candidates &= pair_docids; } }, - _ => bail!("invalid consecutive query type"), + None => return Ok(RoaringBitmap::new()) } } Ok(candidates) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index d190ef031..5b33b8fd9 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -171,12 +171,33 @@ fn resolve_candidates<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { - use Operation::{And, Consecutive, Or, Query}; + use Operation::{And, Phrase, Or}; let result = match query_tree { And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, - Consecutive(ops) => if proximity == 0 { - mdfs(ctx, ops, 0, cache, wdcache)? + Phrase(words) => if proximity == 0 { + let most_left = words.first().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())}); + let most_right = words.last().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())}); + let mut candidates = None; + for slice in words.windows(2) { + let (left, right) = (&slice[0], &slice[1]); + match ctx.word_pair_proximity_docids(left, right, 1)? { + Some(pair_docids) => { + match candidates.as_mut() { + Some(candidates) => *candidates &= pair_docids, + None => candidates = Some(pair_docids), + } + }, + None => { + candidates = None; + break; + } + } + } + match (most_left, most_right, candidates) { + (Some(l), Some(r), Some(c)) => vec![(l, r, c)], + _otherwise => Default::default(), + } } else { Default::default() }, @@ -188,7 +209,7 @@ fn resolve_candidates<'t>( } output }, - Query(q) => if proximity == 0 { + Operation::Query(q) => if proximity == 0 { let candidates = query_docids(ctx, q, wdcache)?; vec![(q.clone(), q.clone(), candidates)] } else { @@ -306,14 +327,9 @@ fn resolve_plane_sweep_candidates( ) -> anyhow::Result> { /// FIXME may be buggy with query like "new new york" - fn plane_sweep<'a>( - ctx: &dyn Context, - operations: &'a [Operation], - docid: DocumentId, + fn plane_sweep( + groups_positions: Vec>, consecutive: bool, - rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, - words_positions: &HashMap, - wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { fn compute_groups_proximity( @@ -362,13 +378,9 @@ fn resolve_plane_sweep_candidates( } } - let groups_len = operations.len(); - let mut groups_positions = Vec::with_capacity(groups_len); + let groups_len = groups_positions.len(); - for operation in operations { - let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; - groups_positions.push(positions.into_iter()); - } + let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); // Pop top elements of each list. 
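// (The sweep that follows repeatedly advances the group whose current position is the smallest, so every group's sorted positions are consumed in ascending order.)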
let mut current = Vec::with_capacity(groups_len); @@ -441,15 +453,32 @@ fn resolve_plane_sweep_candidates( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { - use Operation::{And, Consecutive, Or}; + use Operation::{And, Phrase, Or}; if let Some(result) = rocache.get(query_tree) { return Ok(result.clone()); } let result = match query_tree { - And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?, - Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?, + And(ops) => { + let mut groups_positions = Vec::with_capacity(ops.len()); + for operation in ops { + let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; + groups_positions.push(positions); + } + plane_sweep(groups_positions, false)? + }, + Phrase(words) => { + let mut groups_positions = Vec::with_capacity(words.len()); + for word in words { + let positions = match words_positions.get(word) { + Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), + None => vec![], + }; + groups_positions.push(positions); + } + plane_sweep(groups_positions, true)? + }, Or(_, ops) => { let mut result = Vec::new(); for op in ops { diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index a844417eb..d075b6bca 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,6 +1,5 @@ use std::{borrow::Cow, collections::HashMap, mem::take}; -use anyhow::bail; use log::debug; use roaring::RoaringBitmap; @@ -13,7 +12,6 @@ use super::{ CriterionParameters, CriterionResult, query_docids, - query_pair_proximity_docids, resolve_query_tree, }; @@ -174,12 +172,14 @@ fn alterate_query_tree( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result<()> { - use Operation::{And, Consecutive, Or}; + use Operation::{And, Phrase, Or}; match operation { - And(ops) | Consecutive(ops) | Or(_, ops) => { + And(ops) | Or(_, ops) => { ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) }, + // Because Phrases don't allow typos, no alteration can be done. + Phrase(_words) => return Ok(()), Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, @@ -228,32 +228,29 @@ fn resolve_candidates<'t>( wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { - use Operation::{And, Consecutive, Or, Query}; + use Operation::{And, Phrase, Or, Query}; match query_tree { And(ops) => { mdfs(ctx, ops, number_typos, cache, wdcache) }, - Consecutive(ops) => { + Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for slice in ops.windows(2) { - match (&slice[0], &slice[1]) { - (Operation::Query(left), Operation::Query(right)) => { - match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? { - pair_docids if pair_docids.is_empty() => { - return Ok(RoaringBitmap::new()) - }, - pair_docids if first_loop => { - candidates = pair_docids; - first_loop = false; - }, - pair_docids => { - candidates.intersect_with(&pair_docids); - }, + for slice in words.windows(2) { + let (left, right) = (&slice[0], &slice[1]); + match ctx.word_pair_proximity_docids(left, right, 1)? 
{ + Some(pair_docids) => { + if pair_docids.is_empty() { + return Ok(RoaringBitmap::new()); + } else if first_loop { + candidates = pair_docids; + first_loop = false; + } else { + candidates &= pair_docids; } }, - _ => bail!("invalid consecutive query type"), + None => return Ok(RoaringBitmap::new()) } } Ok(candidates) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 17649849d..c56db4e96 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -52,13 +52,18 @@ impl MatchingWords { fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { match tree { - Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { + Operation::Or(_, ops) | Operation::And(ops) => { ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); }, Operation::Query(Query { prefix, kind }) => { let typo = if kind.is_exact() { 0 } else { kind.typo() }; out.insert((kind.word(), typo, *prefix)); }, + Operation::Phrase(words) => { + for word in words { + out.insert((word, 0, false)); + } + } } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index c87ccfe9b..234fd3266 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -15,7 +15,8 @@ type IsPrefix = bool; #[derive(Clone, PartialEq, Eq, Hash)] pub enum Operation { And(Vec<Operation>), - Consecutive(Vec<Operation>), + // series of consecutive non-prefix and exact words + Phrase(Vec<String>), Or(IsOptionalWord, Vec<Operation>), Query(Query), } @@ -28,9 +29,8 @@ impl fmt::Debug for Operation { writeln!(f, "{:1$}AND", "", depth * 2)?; children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) }, - Operation::Consecutive(children) => { - writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?; - children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + Operation::Phrase(children) => { + writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2) }, Operation::Or(true, children) => { writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; @@ -49,14 +49,6 @@ } impl Operation { - fn phrase(words: Vec<String>) -> Operation { - Operation::consecutive( - words.into_iter().map(|s| { - Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) }) - }).collect() - ) - } - fn and(mut ops: Vec<Operation>) -> Self { if ops.len() == 1 { ops.pop().unwrap() @@ -73,11 +65,11 @@ impl Operation { } } - fn consecutive(mut ops: Vec<Operation>) -> Self { - if ops.len() == 1 { - ops.pop().unwrap() + fn phrase(mut words: Vec<String>) -> Self { + if words.len() == 1 { + Self::Query(Query {prefix: false, kind: QueryKind::exact(words.pop().unwrap())}) } else { - Self::Consecutive(ops) + Self::Phrase(words) } } @@ -256,10 +248,10 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result>, wo /// Returns the maximum number of typos that this Operation allows. pub fn maximum_typo(operation: &Operation) -> usize { - use Operation::{Or, And, Query, Consecutive}; + use Operation::{Or, And, Query, Phrase}; match operation { Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), - And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::<usize>(), + And(ops) => ops.iter().map(maximum_typo).sum::<usize>(), Query(q) => q.kind.typo() as usize, + // no typo allowed in phrases + Phrase(_) => 0, } } /// Returns the maximum proximity that this Operation allows. 
pub fn maximum_proximity(operation: &Operation) -> usize { - use Operation::{Or, And, Query, Consecutive}; + use Operation::{Or, And, Query, Phrase}; match operation { Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), And(ops) => { ops.iter().map(maximum_proximity).sum::() + ops.len().saturating_sub(1) * 7 }, - Query(_) | Consecutive(_) => 0, + Query(_) | Phrase(_) => 0, } } @@ -765,9 +759,9 @@ mod test { let expected = Operation::Or(false, vec![ Operation::And(vec![ Operation::Or(false, vec![ - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Phrase(vec![ + "word".to_string(), + "split".to_string(), ]), Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }), ]), @@ -789,9 +783,9 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), + Operation::Phrase(vec![ + "hey".to_string(), + "friends".to_string(), ]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), ]); @@ -809,13 +803,13 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), + Operation::Phrase(vec![ + "hey".to_string(), + "friends".to_string(), ]), - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), + Operation::Phrase(vec![ + "wooop".to_string(), + "wooop".to_string(), ]), ]); @@ -870,9 +864,9 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + let expected = Operation::Phrase(vec![ + "hey".to_string(), + "my".to_string(), ]); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); @@ -940,9 +934,9 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Consecutive(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), + Operation::Phrase(vec![ + "hey".to_string(), + "my".to_string(), ]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), ]); From 36715f571cf1d14c0a91f245a0080130ff352fa4 Mon Sep 17 00:00:00 2001 From: Many Date: Thu, 10 Jun 2021 11:30:33 +0200 Subject: [PATCH 0780/1889] Update milli/src/search/criteria/proximity.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/proximity.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 
5b33b8fd9..4da6fd1eb 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -176,8 +176,8 @@ fn resolve_candidates<'t>( let result = match query_tree { And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, Phrase(words) => if proximity == 0 { - let most_left = words.first().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())}); - let most_right = words.last().map(|w| Query {prefix: false, kind: QueryKind::exact(w.clone())}); + let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); + let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); let mut candidates = None; for slice in words.windows(2) { let (left, right) = (&slice[0], &slice[1]); From f4cab080a6bb0c1f348777220b224a999d53d1b7 Mon Sep 17 00:00:00 2001 From: Many Date: Thu, 10 Jun 2021 11:30:51 +0200 Subject: [PATCH 0781/1889] Update milli/src/search/query_tree.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/query_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 234fd3266..3c3420db4 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -67,7 +67,7 @@ impl Operation { fn phrase(mut words: Vec) -> Self { if words.len() == 1 { - Self::Query(Query {prefix: false, kind: QueryKind::exact(words.pop().unwrap())}) + Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) }) } else { Self::Phrase(words) } From 7d5395c12b9d8e664789340c0bfdd60e4d50fe62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 10 Jun 2021 17:00:04 +0200 Subject: [PATCH 0782/1889] Update Tokenizer version to v0.2.3 --- Cargo.lock | 20 ++++++++++++++++++-- http-ui/Cargo.toml | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba4a4c60c..c719f6f03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -971,7 +971,7 @@ dependencies = [ "heed", "log", "maplit", - "meilisearch-tokenizer", + "meilisearch-tokenizer 0.2.3", "memmap", "milli", "once_cell", @@ -1332,6 +1332,22 @@ dependencies = [ "whatlang", ] +[[package]] +name = "meilisearch-tokenizer" +version = "0.2.3" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.3#c2399c3f879144ad92e20ae057e14984dfd22781" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + [[package]] name = "memchr" version = "2.3.4" @@ -1380,7 +1396,7 @@ dependencies = [ "log", "logging_timer", "maplit", - "meilisearch-tokenizer", + "meilisearch-tokenizer 0.2.2", "memmap", "obkv", "once_cell", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 146404772..a11307fbe 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -11,7 +11,7 @@ byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = "0.10.6" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.2" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } memmap = "0.7.0" milli = { path = "../milli" } once_cell = "1.5.2" From ff9414a6baca2411958d7c634c71fb8deaf52a65 Mon Sep 17 
00:00:00 2001 From: Kerollmops Date: Wed, 9 Jun 2021 10:57:32 +0200 Subject: [PATCH 0783/1889] Use the output of the compute_primary_key_pair function --- milli/src/update/index_documents/transform.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index fd508d6a4..cfc2530b4 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -246,7 +246,7 @@ impl Transform<'_, '_> { // Returns the field id in the fields ids map, create an "id" field // in case it is not in the current headers. let alternative_name = primary_key_pos.map(|pos| headers[pos].to_string()); - let (primary_key_id, _) = compute_primary_key_pair( + let (primary_key_id, primary_key_name) = compute_primary_key_pair( self.index.primary_key(self.rtxn)?, &mut fields_ids_map, alternative_name, @@ -330,10 +330,6 @@ impl Transform<'_, '_> { // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. - let primary_key_name = fields_ids_map - .name(primary_key_id) - .map(String::from) - .expect("Primary key must be present in fields id map"); self.output_from_sorter( sorter, primary_key_name, From 93978ec38a345cd0acbc068cde59f7e739ecf1bb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 9 Jun 2021 11:28:38 +0200 Subject: [PATCH 0784/1889] Serializing a RoaringBitmap into a Vec cannot fail --- .../roaring_bitmap/cbo_roaring_bitmap_codec.rs | 9 ++++----- milli/src/update/index_documents/merge_function.rs | 2 +- milli/src/update/index_documents/store.rs | 7 +++---- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 8ccf831e3..325effa73 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -23,18 +23,17 @@ impl CboRoaringBitmapCodec { } } - pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) -> io::Result<()> { + pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) { if roaring.len() <= THRESHOLD as u64 { // If the number of items (u32s) to encode is less than or equal to the threshold // it means that it would weigh the same or less than the RoaringBitmap // header, so we directly encode them using ByteOrder instead. for integer in roaring { - vec.write_u32::<NativeEndian>(integer)?; + vec.write_u32::<NativeEndian>(integer).unwrap(); } - Ok(()) } else { // Otherwise, we use the classic RoaringBitmapCodec that writes a header. 
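// (Writing into an in-memory Vec<u8> can never return an I/O error, which is what makes the unwraps introduced by this patch sound.)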
- roaring.serialize_into(vec) + roaring.serialize_into(vec).unwrap(); } } @@ -68,7 +67,7 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { fn bytes_encode(item: &Self::EItem) -> Option> { let mut vec = Vec::with_capacity(Self::serialized_size(item)); - Self::serialize_into(item, &mut vec).ok()?; + Self::serialize_into(item, &mut vec); Some(Cow::Owned(vec)) } } diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 230116e99..6da19bc84 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -112,6 +112,6 @@ fn cbo_roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { } let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec)?; + CboRoaringBitmapCodec::serialize_into(&head, &mut vec); Ok(vec) } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 08050092e..69263e5a0 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -407,7 +407,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // We serialize the document ids into a buffer buffer.clear(); buffer.reserve(CboRoaringBitmapCodec::serialized_size(&docids)); - CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer)?; + CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer); // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { sorter.insert(&key, &buffer)?; @@ -469,8 +469,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { data_buffer.clear(); let positions = RoaringBitmap::from_iter(Some(document_id)); // We serialize the positions into a buffer. - CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer) - .with_context(|| "could not serialize positions")?; + CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer); // that we write under the generated key into MTBL if lmdb_key_valid_size(&key_buffer) { @@ -706,7 +705,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut docids_buffer = Vec::new(); for ((fid, count), docids) in self.field_id_word_count_docids { docids_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer)?; + CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer); self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?; } From cfc7314bd16576b745a806ab125f023fa1afd5a2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 9 Jun 2021 12:17:11 +0200 Subject: [PATCH 0785/1889] Prefer using an explicit merge function name --- .../update/index_documents/merge_function.rs | 70 ++++--------------- milli/src/update/index_documents/mod.rs | 35 +++++----- milli/src/update/index_documents/store.rs | 24 +++---- milli/src/update/word_prefix_docids.rs | 8 ++- .../word_prefix_pair_proximity_docids.rs | 6 +- milli/src/update/words_level_positions.rs | 6 +- 6 files changed, 51 insertions(+), 98 deletions(-) diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 6da19bc84..c8424dc8c 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -1,71 +1,29 @@ use std::borrow::Cow; -use anyhow::{bail, ensure, Context}; +use anyhow::bail; use bstr::ByteSlice as _; use fst::IntoStreamer; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; -const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); -const 
FIELDS_IDS_MAP_KEY: &[u8] = crate::index::FIELDS_IDS_MAP_KEY.as_bytes(); -const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); +// Union of multiple FSTs +pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + let fsts = values.iter().map(fst::Set::new).collect::, _>>()?; + let op_builder: fst::set::OpBuilder = fsts.iter().map(|fst| fst.into_stream()).collect(); + let op = op_builder.r#union(); -pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - match key { - WORDS_FST_KEY => { - let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect(); - - // Union of the FSTs - let mut op = fst::set::OpBuilder::new(); - fsts.iter().for_each(|fst| op.push(fst.into_stream())); - let op = op.r#union(); - - let mut build = fst::SetBuilder::memory(); - build.extend_stream(op.into_stream()).unwrap(); - Ok(build.into_inner().unwrap()) - }, - FIELDS_IDS_MAP_KEY => { - ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match"); - Ok(values[0].to_vec()) - }, - DOCUMENTS_IDS_KEY => roaring_bitmap_merge(values), - otherwise => bail!("wut {:?}", otherwise), - } -} - -pub fn word_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - roaring_bitmap_merge(values) + let mut build = fst::SetBuilder::memory(); + build.extend_stream(op.into_stream()).unwrap(); + Ok(build.into_inner().unwrap()) } pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { - bail!("merging docid word positions is an error ({:?})", key.as_bstr()) + panic!("merging docid word positions is an error ({:?})", key.as_bstr()) } -pub fn field_id_docid_facet_values_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - let first = values.first().context("no value to merge")?; - ensure!(values.iter().all(|v| v == first), "invalid field id docid facet value merging"); - Ok(first.to_vec()) -} - -pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn field_id_word_count_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) -} - -pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - cbo_roaring_bitmap_merge(values) +pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + Ok(values.first().unwrap().to_vec()) } pub fn documents_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { @@ -88,7 +46,7 @@ pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mu writer.finish().unwrap(); } -fn roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { let (head, tail) = values.split_first().unwrap(); let mut head = RoaringBitmap::deserialize_from(&head[..])?; @@ -102,7 +60,7 @@ fn roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { Ok(vec) } -fn cbo_roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { let (head, tail) = values.split_first().unwrap(); let mut head = 
CboRoaringBitmapCodec::deserialize_from(&head[..])?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4cf56b563..2b790ce06 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -25,11 +25,8 @@ use crate::update::{ }; use self::store::{Store, Readers}; pub use self::merge_function::{ - main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - docid_word_positions_merge, documents_merge, - word_level_position_docids_merge, word_prefix_level_positions_docids_merge, - facet_field_value_docids_merge, field_id_docid_facet_values_merge, - field_id_word_count_docids_merge, + fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, + docid_word_positions_merge, documents_merge, keep_first }; pub use self::transform::{Transform, TransformOutput}; @@ -539,22 +536,22 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { debug!("Merging the main, word docids and words pairs proximity docids in parallel..."); rayon::spawn(move || { vec![ - (DatabaseType::Main, main_readers, main_merge as MergeFn), - (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), + (DatabaseType::Main, main_readers, fst_merge as MergeFn), + (DatabaseType::WordDocids, word_docids_readers, roaring_bitmap_merge), ( DatabaseType::FacetLevel0NumbersDocids, facet_field_numbers_docids_readers, - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, ), ( DatabaseType::WordLevel0PositionDocids, word_level_position_docids_readers, - word_level_position_docids_merge, + cbo_roaring_bitmap_merge, ), ( DatabaseType::FieldIdWordCountDocids, field_id_word_count_docids_readers, - field_id_word_count_docids_merge, + cbo_roaring_bitmap_merge, ), ] .into_par_iter() @@ -657,7 +654,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_field_strings_docids_readers, - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; @@ -672,7 +669,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.field_id_docid_facet_f64s.as_polymorph(), field_id_docid_facet_numbers_readers, - field_id_docid_facet_values_merge, + keep_first, write_method, )?; @@ -687,7 +684,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.field_id_docid_facet_strings.as_polymorph(), field_id_docid_facet_strings_readers, - field_id_docid_facet_values_merge, + keep_first, write_method, )?; @@ -702,7 +699,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.word_pair_proximity_docids.as_polymorph(), words_pairs_proximities_docids_readers, - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; @@ -721,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, self.index.main, content, - main_merge, + fst_merge, WriteMethod::GetMergePut, )?; }, @@ -732,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, db, content, - word_docids_merge, + roaring_bitmap_merge, write_method, )?; }, @@ -743,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, db, content, - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; }, @@ -754,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, db, content, - field_id_word_count_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; }, @@ -765,7 +762,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> 
{ self.wtxn, db, content, - word_level_position_docids_merge, + cbo_roaring_bitmap_merge, write_method, )?; } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 69263e5a0..16837ca7b 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -26,11 +26,7 @@ use crate::update::UpdateIndexingStep; use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; -use super::merge_function::{ - main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - word_level_position_docids_merge, facet_field_value_docids_merge, - field_id_docid_facet_values_merge, field_id_word_count_docids_merge, -}; +use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge}; const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_KILOBYTE: usize = 1024 * 1024; @@ -104,7 +100,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let main_sorter = create_sorter( - main_merge, + fst_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -112,7 +108,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let word_docids_sorter = create_sorter( - word_docids_merge, + roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -120,7 +116,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let words_pairs_proximities_docids_sorter = create_sorter( - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -128,7 +124,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let word_level_position_docids_sorter = create_sorter( - word_level_position_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -136,7 +132,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let field_id_word_count_docids_sorter = create_sorter( - field_id_word_count_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -144,7 +140,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let facet_field_numbers_docids_sorter = create_sorter( - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -152,7 +148,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let facet_field_strings_docids_sorter = create_sorter( - facet_field_value_docids_merge, + cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -160,7 +156,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let field_id_docid_facet_numbers_sorter = create_sorter( - field_id_docid_facet_values_merge, + keep_first, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -168,7 +164,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Some(1024 * 1024 * 1024), // 1MB ); let field_id_docid_facet_strings_sorter = create_sorter( - field_id_docid_facet_values_merge, + keep_first, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 58c984212..0544f8789 100644 --- a/milli/src/update/word_prefix_docids.rs +++ 
b/milli/src/update/word_prefix_docids.rs @@ -6,7 +6,9 @@ use grenad::CompressionType; use heed::types::ByteSlice; use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database}; +use crate::update::index_documents::{ + create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, +}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -40,7 +42,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( - word_docids_merge, + roaring_bitmap_merge, self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, @@ -66,7 +68,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_docids.as_polymorph(), prefix_docids_sorter, - word_docids_merge, + roaring_bitmap_merge, WriteMethod::Append, )?; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index c972efc4f..c6b935e54 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -11,7 +11,7 @@ use crate::Index; use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ WriteMethod, create_sorter, sorter_into_lmdb_database, - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, }; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { @@ -50,7 +50,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // Here we create a sorter akin to the previous one. let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, @@ -80,7 +80,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), word_prefix_pair_proximity_docids_sorter, - words_pairs_proximities_docids_merge, + cbo_roaring_bitmap_merge, WriteMethod::Append, )?; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 1b772c37d..f94507aab 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -15,7 +15,7 @@ use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{ create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, - word_prefix_level_positions_docids_merge, sorter_into_lmdb_database + cbo_roaring_bitmap_merge, sorter_into_lmdb_database }; use crate::{Index, TreeLevel}; @@ -86,7 +86,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.index.word_prefix_level_position_docids.clear(self.wtxn)?; let mut word_prefix_level_positions_docids_sorter = create_sorter( - word_prefix_level_positions_docids_merge, + cbo_roaring_bitmap_merge, self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, @@ -119,7 +119,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_level_position_docids.as_polymorph(), word_prefix_level_positions_docids_sorter, - word_prefix_level_positions_docids_merge, + cbo_roaring_bitmap_merge, WriteMethod::Append, )?; 
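The merge functions swapped in above differ only in how they resolve several values written under the same key: `keep_first` keeps the first value untouched, while `roaring_bitmap_merge` and `cbo_roaring_bitmap_merge` union all the posting lists. A minimal, self-contained sketch of both behaviours (using the `roaring` and `anyhow` crates directly; the buffers and assertions are illustrative and not code from this patch):

```
use std::borrow::Cow;

use roaring::RoaringBitmap;

fn main() -> anyhow::Result<()> {
    // Two serialized posting lists that collide under the same key.
    let a: RoaringBitmap = (1u32..=3).collect();
    let b: RoaringBitmap = (3u32..=4).collect();
    let (mut buf_a, mut buf_b) = (Vec::new(), Vec::new());
    a.serialize_into(&mut buf_a)?;
    b.serialize_into(&mut buf_b)?;
    let values: Vec<Cow<[u8]>> = vec![Cow::Owned(buf_a), Cow::Owned(buf_b)];

    // Union semantics, as in `roaring_bitmap_merge`: deserialize the first
    // value and fold every remaining one into it.
    let (head, tail) = values.split_first().unwrap();
    let mut merged = RoaringBitmap::deserialize_from(&head[..])?;
    for value in tail {
        merged |= RoaringBitmap::deserialize_from(&value[..])?;
    }
    assert_eq!(merged.len(), 4); // {1, 2, 3, 4}

    // `keep_first` semantics: every value but the first is simply discarded.
    let kept = values.first().unwrap().to_vec();
    assert_eq!(kept, &values[0][..]);
    Ok(())
}
```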
From 93a8633f188150e2d13c7a2d5a3d6773f0c9d204 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 9 Jun 2021 14:45:56 +0200 Subject: [PATCH 0786/1889] Remove the documents_merge method that must never be called --- milli/src/update/index_documents/merge_function.rs | 4 ---- milli/src/update/index_documents/mod.rs | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index c8424dc8c..904368fb0 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -26,10 +26,6 @@ pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> Ok(values.first().unwrap().to_vec()) } -pub fn documents_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { - bail!("merging documents is an error ({:?})", key.as_bstr()) -} - pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mut Vec) { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2b790ce06..7fb47fa4a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -26,7 +26,7 @@ use crate::update::{ use self::store::{Store, Readers}; pub use self::merge_function::{ fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, - docid_word_positions_merge, documents_merge, keep_first + docid_word_positions_merge, keep_first }; pub use self::transform::{Transform, TransformOutput}; @@ -149,7 +149,7 @@ pub fn write_into_lmdb_database( let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; match iter.next().transpose()? 
{ Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; + let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; let val = merge(k, &vals)?; iter.put_current(k, &val)?; }, @@ -634,12 +634,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); - debug!("Writing the documents into LMDB on disk..."); + debug!("Inserting the documents into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, *self.index.documents.as_polymorph(), documents_readers, - documents_merge, + keep_first, write_method )?; From ab727e428bbacb0f1117ce9140f972d15833d990 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 9 Jun 2021 14:49:06 +0200 Subject: [PATCH 0787/1889] Remove the docid_word_positions_merge method that must never be called --- milli/src/update/index_documents/merge_function.rs | 4 ---- milli/src/update/index_documents/mod.rs | 7 +++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 904368fb0..0a32603b5 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -18,10 +18,6 @@ pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { Ok(build.into_inner().unwrap()) } -pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { - panic!("merging docid word positions is an error ({:?})", key.as_bstr()) -} - pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { Ok(values.first().unwrap().to_vec()) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7fb47fa4a..1d31cba85 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -25,8 +25,7 @@ use crate::update::{ }; use self::store::{Store, Readers}; pub use self::merge_function::{ - fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, - docid_word_positions_merge, keep_first + fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first }; pub use self::transform::{Transform, TransformOutput}; @@ -619,12 +618,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); - debug!("Writing the docid word positions into LMDB on disk..."); + debug!("Inserting the docid word positions into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, *self.index.docid_word_positions.as_polymorph(), docid_word_positions_readers, - docid_word_positions_merge, + keep_first, write_method )?; From 65b1d09d55400349b00219afaa62f92ab3233acb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 9 Jun 2021 14:57:03 +0200 Subject: [PATCH 0788/1889] Move the obkv merging functions into the merge_function module --- .../update/index_documents/merge_function.rs | 20 ++++++++++++++++-- milli/src/update/index_documents/transform.rs | 21 ++----------------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 0a32603b5..8c93773ce 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -1,12 +1,28 @@ use std::borrow::Cow; -use anyhow::bail; -use bstr::ByteSlice as _; use fst::IntoStreamer; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; +/// Only the last value associated with an id is kept. 
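+/// (This is the behaviour used when documents are replaced rather than updated:
+/// earlier versions of the same document id are simply discarded.)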
+pub fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { + Ok(obkvs.last().unwrap().clone().into_owned()) +} + +/// Merge all the obkvs in the order we see them. +pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { + let mut iter = obkvs.iter(); + let first = iter.next().map(|b| b.clone().into_owned()).unwrap(); + Ok(iter.fold(first, |acc, current| { + let first = obkv::KvReader::new(&acc); + let second = obkv::KvReader::new(current); + let mut buffer = Vec::new(); + merge_two_obkvs(first, second, &mut buffer); + buffer + })) +} + // Union of multiple FSTs pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { let fsts = values.iter().map(fst::Set::new).collect::<Result<Vec<_>, _>>()?; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index cfc2530b4..5fbd24bb1 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -10,8 +10,9 @@ use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; +use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; @@ -552,24 +553,6 @@ fn compute_primary_key_pair( } } -/// Only the last value associated with an id is kept. -fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { - obkvs.last().context("no last value").map(|last| last.clone().into_owned()) -} - -/// Merge all the obkvs in the order we see them. 
-fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result> { - let mut iter = obkvs.iter(); - let first = iter.next().map(|b| b.clone().into_owned()).context("no first value")?; - Ok(iter.fold(first, |acc, current| { - let first = obkv::KvReader::new(&acc); - let second = obkv::KvReader::new(current); - let mut buffer = Vec::new(); - merge_two_obkvs(first, second, &mut buffer); - buffer - })) -} - fn validate_document_id(document_id: &str) -> Option<&str> { let document_id = document_id.trim(); Some(document_id).filter(|id| { From d2b1ecc88565c93fd66d3ab6d963a5213e86b30c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 9 Jun 2021 15:26:40 +0200 Subject: [PATCH 0789/1889] Remove a lot of serialization unreachable errors --- .../facet/facet_value_string_codec.rs | 13 +++-- .../facet/field_doc_id_facet_string_codec.rs | 15 ++++-- .../roaring_bitmap/bo_roaring_bitmap_codec.rs | 15 +++--- milli/src/update/index_documents/store.rs | 52 ++++++++++++------- 4 files changed, 62 insertions(+), 33 deletions(-) diff --git a/milli/src/heed_codec/facet/facet_value_string_codec.rs b/milli/src/heed_codec/facet/facet_value_string_codec.rs index 350efc450..259dab972 100644 --- a/milli/src/heed_codec/facet/facet_value_string_codec.rs +++ b/milli/src/heed_codec/facet/facet_value_string_codec.rs @@ -5,6 +5,14 @@ use crate::FieldId; pub struct FacetValueStringCodec; +impl FacetValueStringCodec { + pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { + out.reserve(value.len() + 1); + out.push(field_id); + out.extend_from_slice(value.as_bytes()); + } +} + impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { type DItem = (FieldId, &'a str); @@ -19,9 +27,8 @@ impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { type EItem = (FieldId, &'a str); fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(value.len() + 1); - bytes.push(*field_id); - bytes.extend_from_slice(value.as_bytes()); + let mut bytes = Vec::new(); + FacetValueStringCodec::serialize_into(*field_id, value, &mut bytes); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index 2e282b2a0..b002346e9 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -6,6 +6,15 @@ use crate::{FieldId, DocumentId}; pub struct FieldDocIdFacetStringCodec; +impl FieldDocIdFacetStringCodec { + pub fn serialize_into(field_id: FieldId, document_id: DocumentId, value: &str, out: &mut Vec) { + out.reserve(1 + 4 + value.len()); + out.push(field_id); + out.extend_from_slice(&document_id.to_be_bytes()); + out.extend_from_slice(value.as_bytes()); + } +} + impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { type DItem = (FieldId, DocumentId, &'a str); @@ -22,10 +31,8 @@ impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { type EItem = (FieldId, DocumentId, &'a str); fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(1 + 4 + value.len()); - bytes.push(*field_id); - bytes.extend_from_slice(&document_id.to_be_bytes()); - bytes.extend_from_slice(value.as_bytes()); + let mut bytes = Vec::new(); + FieldDocIdFacetStringCodec::serialize_into(*field_id, *document_id, value, &mut bytes); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs 
b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs index 8d1eb79dd..994e23b39 100644 --- a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs @@ -6,6 +6,13 @@ use roaring::RoaringBitmap; pub struct BoRoaringBitmapCodec; +impl BoRoaringBitmapCodec { + pub fn serialize_into(bitmap: &RoaringBitmap, out: &mut Vec) { + out.reserve(bitmap.len() as usize * size_of::()); + bitmap.iter().map(u32::to_ne_bytes).for_each(|bytes| out.extend_from_slice(&bytes)); + } +} + impl heed::BytesDecode<'_> for BoRoaringBitmapCodec { type DItem = RoaringBitmap; @@ -25,12 +32,8 @@ impl heed::BytesEncode<'_> for BoRoaringBitmapCodec { type EItem = RoaringBitmap; fn bytes_encode(item: &Self::EItem) -> Option> { - let mut out = Vec::with_capacity(item.len() as usize * size_of::()); - - item.iter() - .map(|i| i.to_ne_bytes()) - .for_each(|bytes| out.extend_from_slice(&bytes)); - + let mut out = Vec::new(); + BoRoaringBitmapCodec::serialize_into(item, &mut out); Some(Cow::Owned(out)) } } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 16837ca7b..4662cd609 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -421,6 +421,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { { // We prefix the words by the document id. let mut key = id.to_be_bytes().to_vec(); + let mut buffer = Vec::new(); let base_size = key.len(); // We order the words lexicographically, this way we avoid passing by a sorter. @@ -429,13 +430,15 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for (word, positions) in words_positions { key.truncate(base_size); key.extend_from_slice(word.as_bytes()); + buffer.clear(); + // We serialize the positions into a buffer. 
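A stand-alone sketch of the allocation pattern this commit introduces across the store (all names here are illustrative, not from the patch): the caller owns a single buffer, clears it on every iteration, and the `serialize_into` helpers only append, so the per-entry `Vec::with_capacity` allocations disappear:

    use std::mem::size_of;

    // Plays the role of the serialize_into helpers: append, never allocate.
    fn encode_u32s(values: &[u32], out: &mut Vec<u8>) {
        out.reserve(values.len() * size_of::<u32>());
        values.iter().for_each(|v| out.extend_from_slice(&v.to_ne_bytes()));
    }

    fn write_all(groups: &[Vec<u32>]) -> usize {
        let mut buffer = Vec::new(); // allocated once, outside the loop
        let mut written = 0;
        for group in groups {
            buffer.clear(); // keeps the capacity, drops the old bytes
            encode_u32s(group, &mut buffer);
            written += buffer.len(); // a real sorter copies the bytes on insert
        }
        written
    }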
let positions = RoaringBitmap::from_iter(positions.iter().cloned()); - let bytes = BoRoaringBitmapCodec::bytes_encode(&positions) - .with_context(|| "could not serialize positions")?; + BoRoaringBitmapCodec::serialize_into(&positions, &mut buffer); + // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { - writer.insert(&key, &bytes)?; + writer.insert(&key, &buffer)?; } } @@ -483,14 +486,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { ) -> anyhow::Result<()> where I: IntoIterator { + let mut key_buffer = Vec::new(); + let mut data_buffer = Vec::new(); + for ((field_id, value), docids) in iter { - let key = FacetValueStringCodec::bytes_encode(&(field_id, &value)) - .map(Cow::into_owned) - .context("could not serialize facet key")?; - let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) - .context("could not serialize docids")?; - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &bytes)?; + key_buffer.clear(); + data_buffer.clear(); + + FacetValueStringCodec::serialize_into(field_id, &value, &mut key_buffer); + CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); + + if lmdb_key_valid_size(&key_buffer) { + sorter.insert(&key_buffer, &data_buffer)?; } } @@ -503,14 +510,19 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { ) -> anyhow::Result<()> where I: IntoIterator), RoaringBitmap)> { + let mut data_buffer = Vec::new(); + for ((field_id, value), docids) in iter { + data_buffer.clear(); + let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) .map(Cow::into_owned) - .context("could not serialize facet key")?; - let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) - .context("could not serialize docids")?; + .context("could not serialize facet level value key")?; + + CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); + if lmdb_key_valid_size(&key) { - sorter.insert(&key, &bytes)?; + sorter.insert(&key, &data_buffer)?; } } @@ -526,7 +538,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { { let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) .map(Cow::into_owned) - .context("could not serialize facet key")?; + .context("could not serialize facet level value key")?; if lmdb_key_valid_size(&key) { sorter.insert(&key, &[])?; @@ -542,12 +554,12 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { value: &str, ) -> anyhow::Result<()> { - let key = FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, value)) - .map(Cow::into_owned) - .context("could not serialize facet key")?; + let mut buffer = Vec::new(); - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &[])?; + FieldDocIdFacetStringCodec::serialize_into(field_id, document_id, value, &mut buffer); + + if lmdb_key_valid_size(&buffer) { + sorter.insert(&buffer, &[])?; } Ok(()) From 23fcf7920ea67672f3c16e50d3ca4c797f851219 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 9 Jun 2021 17:05:46 +0200 Subject: [PATCH 0790/1889] Introduce a basic version of the InternalError struct --- milli/src/error.rs | 42 ++++++++++++++++++++++++++++++++++++++++++ milli/src/lib.rs | 1 + 2 files changed, 43 insertions(+) create mode 100644 milli/src/error.rs diff --git a/milli/src/error.rs b/milli/src/error.rs new file mode 100644 index 000000000..02740377c --- /dev/null +++ b/milli/src/error.rs @@ -0,0 +1,42 @@ +use std::io; + +use crate::{DocumentId, FieldId}; + +pub enum Error { + InternalError(InternalError), + IoError(io::Error), + UserError(UserError), +} + +pub enum InternalError { + DatabaseMissingEntry(DatabaseMissingEntry), + 
FieldIdMapMissingEntry(FieldIdMapMissingEntry), + IndexingMergingKeys(IndexingMergingKeys), +} + +pub enum IndexingMergingKeys { + DocIdWordPosition, + Document, + MainFstDeserialization, + WordLevelPositionDocids, + WordPrefixLevelPositionDocids, +} + +pub enum FieldIdMapMissingEntry { + DisplayedFieldId { field_id: FieldId }, + DisplayedFieldName { field_name: String }, + FacetedFieldName { field_name: String }, + FilterableFieldName { field_name: String }, + SearchableFieldName { field_name: String }, +} + +pub enum DatabaseMissingEntry { + DocumentId { internal_id: DocumentId }, + FacetValuesDocids, + IndexCreationTime, + IndexUpdateTime, +} + +pub enum UserError { + +} diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 8c1ed514c..b7401330a 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,6 +1,7 @@ #[macro_use] extern crate pest_derive; mod criterion; +mod error; mod external_documents_ids; mod fields_ids_map; mod search; From 44c353fafd97217e74a182b3bbb802e33d16f1a6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 10 Jun 2021 15:55:22 +0200 Subject: [PATCH 0791/1889] Introduce some way to construct an Error --- milli/src/error.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 02740377c..c6dc85e9c 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -1,6 +1,9 @@ use std::io; use crate::{DocumentId, FieldId}; +use heed::{MdbError, Error as HeedError}; + +pub type Object = serde_json::Map; pub enum Error { InternalError(InternalError), @@ -12,6 +15,16 @@ pub enum InternalError { DatabaseMissingEntry(DatabaseMissingEntry), FieldIdMapMissingEntry(FieldIdMapMissingEntry), IndexingMergingKeys(IndexingMergingKeys), + SerializationError(SerializationError), + StoreError(MdbError), + InvalidDatabaseTyping, + DatabaseClosing, +} + +pub enum SerializationError { + Decoding { db_name: Option<&'static str> }, + Encoding { db_name: Option<&'static str> }, + InvalidNumberSerialization, } pub enum IndexingMergingKeys { @@ -38,5 +51,40 @@ pub enum DatabaseMissingEntry { } pub enum UserError { - + AttributeLimitReached, + DocumentLimitReached, + InvalidCriterionName { name: String }, + InvalidDocumentId { document_id: DocumentId }, + MissingDocumentId { document: Object }, + MissingPrimaryKey, + DatabaseSizeReached, + NoSpaceLeftOnDevice, + InvalidStoreFile, +} + +impl From for Error { + fn from(error: io::Error) -> Error { + // TODO must be improved and more precise + Error::IoError(error) + } +} + +impl From for Error { + fn from(error: HeedError) -> Error { + use self::Error::*; + use self::InternalError::*; + use self::SerializationError::*; + use self::UserError::*; + + match error { + HeedError::Io(error) => Error::from(error), + HeedError::Mdb(MdbError::MapFull) => UserError(DatabaseSizeReached), + HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile), + HeedError::Mdb(error) => InternalError(StoreError(error)), + HeedError::Encoding => InternalError(SerializationError(Encoding { db_name: None })), + HeedError::Decoding => InternalError(SerializationError(Decoding { db_name: None })), + HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), + HeedError::DatabaseClosing => InternalError(DatabaseClosing), + } + } } From 456541e921b9bebd5e6a8e66167da6cdee32aae6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 10 Jun 2021 17:31:08 +0200 Subject: [PATCH 0792/1889] Implement the Display trait on the Error type --- milli/src/error.rs | 
132 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 106 insertions(+), 26 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index c6dc85e9c..ce22a5512 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -1,60 +1,50 @@ -use std::io; +use std::error::Error as StdError; +use std::{fmt, io}; + +use heed::{MdbError, Error as HeedError}; +use serde_json::{Map, Value}; use crate::{DocumentId, FieldId}; -use heed::{MdbError, Error as HeedError}; -pub type Object = serde_json::Map; +pub type Object = Map; +#[derive(Debug)] pub enum Error { InternalError(InternalError), IoError(io::Error), UserError(UserError), } +#[derive(Debug)] pub enum InternalError { - DatabaseMissingEntry(DatabaseMissingEntry), + DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, FieldIdMapMissingEntry(FieldIdMapMissingEntry), - IndexingMergingKeys(IndexingMergingKeys), + IndexingMergingKeys { process: &'static str }, SerializationError(SerializationError), StoreError(MdbError), InvalidDatabaseTyping, DatabaseClosing, } +#[derive(Debug)] pub enum SerializationError { Decoding { db_name: Option<&'static str> }, Encoding { db_name: Option<&'static str> }, InvalidNumberSerialization, } -pub enum IndexingMergingKeys { - DocIdWordPosition, - Document, - MainFstDeserialization, - WordLevelPositionDocids, - WordPrefixLevelPositionDocids, -} - +#[derive(Debug)] pub enum FieldIdMapMissingEntry { - DisplayedFieldId { field_id: FieldId }, - DisplayedFieldName { field_name: String }, - FacetedFieldName { field_name: String }, - FilterableFieldName { field_name: String }, - SearchableFieldName { field_name: String }, -} - -pub enum DatabaseMissingEntry { - DocumentId { internal_id: DocumentId }, - FacetValuesDocids, - IndexCreationTime, - IndexUpdateTime, + FieldId { field_id: FieldId, from_db_name: &'static str }, + FieldName { field_name: String, from_db_name: &'static str }, } +#[derive(Debug)] pub enum UserError { AttributeLimitReached, DocumentLimitReached, InvalidCriterionName { name: String }, - InvalidDocumentId { document_id: DocumentId }, + InvalidDocumentId { document_id: Value }, MissingDocumentId { document: Object }, MissingPrimaryKey, DatabaseSizeReached, @@ -88,3 +78,93 @@ impl From for Error { } } } + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::InternalError(error) => write!(f, "internal: {}", error), + Self::IoError(error) => error.fmt(f), + Self::UserError(error) => error.fmt(f), + } + } +} + +impl StdError for Error {} + +impl fmt::Display for InternalError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::DatabaseMissingEntry { db_name, key } => { + write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) + }, + Self::FieldIdMapMissingEntry(error) => error.fmt(f), + Self::IndexingMergingKeys { process } => { + write!(f, "invalid merge while processing {}", process) + }, + Self::SerializationError(error) => error.fmt(f), + Self::StoreError(error) => error.fmt(f), + Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), + Self::DatabaseClosing => HeedError::DatabaseClosing.fmt(f), + } + } +} + +impl StdError for InternalError {} + +impl fmt::Display for UserError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), + Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), + 
Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), + Self::InvalidDocumentId { document_id } => { + let json = serde_json::to_string(document_id).unwrap(); + write!(f, "document identifier is invalid {}", json) + }, + Self::MissingDocumentId { document } => { + let json = serde_json::to_string(document).unwrap(); + write!(f, "document doesn't have an identifier {}", json) + }, + Self::MissingPrimaryKey => f.write_str("missing primary key"), + Self::DatabaseSizeReached => f.write_str("database size reached"), + // TODO where can we find it instead of writing the text ourselves? + Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), + Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), + } + } +} + +impl StdError for UserError {} + +impl fmt::Display for FieldIdMapMissingEntry { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::FieldId { field_id, from_db_name } => { + write!(f, "unknown field id {} coming from {} database", field_id, from_db_name) + }, + Self::FieldName { field_name, from_db_name } => { + write!(f, "unknown field name {} coming from {} database", field_name, from_db_name) + }, + } + } +} + +impl StdError for FieldIdMapMissingEntry {} + +impl fmt::Display for SerializationError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Decoding { db_name: Some(name) } => { + write!(f, "decoding from the {} database failed", name) + }, + Self::Decoding { db_name: None } => f.write_str("decoding failed"), + Self::Encoding { db_name: Some(name) } => { + write!(f, "encoding into the {} database failed", name) + }, + Self::Encoding { db_name: None } => f.write_str("encoding failed"), + Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"), + } + } +} + +impl StdError for SerializationError {} From ca78cb5aca8c91a6efdc9e4d1ba37080d46c5e00 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 14 Jun 2021 16:58:38 +0200 Subject: [PATCH 0793/1889] Introduce more variants to the error module enums --- milli/src/error.rs | 107 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 96 insertions(+), 11 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index ce22a5512..096851f09 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -1,9 +1,12 @@ +use std::convert::Infallible; use std::error::Error as StdError; -use std::{fmt, io}; +use std::{fmt, io, str}; use heed::{MdbError, Error as HeedError}; +use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; +use crate::search::ParserRule; use crate::{DocumentId, FieldId}; pub type Object = Map; @@ -17,13 +20,18 @@ pub enum Error { #[derive(Debug)] pub enum InternalError { + DatabaseClosing, DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, FieldIdMapMissingEntry(FieldIdMapMissingEntry), + Fst(fst::Error), + GrenadInvalidCompressionType, IndexingMergingKeys { process: &'static str }, - SerializationError(SerializationError), - StoreError(MdbError), InvalidDatabaseTyping, - DatabaseClosing, + RayonThreadPool(ThreadPoolBuildError), + SerdeJson(serde_json::Error), + Serialization(SerializationError), + Store(MdbError), + Utf8(str::Utf8Error), } #[derive(Debug)] @@ -42,14 +50,18 @@ pub enum FieldIdMapMissingEntry { #[derive(Debug)] pub enum UserError { AttributeLimitReached, + Csv(csv::Error), + DatabaseSizeReached, DocumentLimitReached, + FilterParsing(pest::error::Error), InvalidCriterionName { name: String }, InvalidDocumentId 
{ document_id: Value },
+    InvalidStoreFile,
     MissingDocumentId { document: Object },
     MissingPrimaryKey,
-    DatabaseSizeReached,
     NoSpaceLeftOnDevice,
-    InvalidStoreFile,
+    SerdeJson(serde_json::Error),
+    UnknownInternalDocumentId { document_id: DocumentId },
 }
 
 impl From<io::Error> for Error {
@@ -59,6 +71,36 @@
 }
 
+impl From<fst::Error> for Error {
+    fn from(error: fst::Error) -> Error {
+        Error::InternalError(InternalError::Fst(error))
+    }
+}
+
+impl<E> From<grenad::Error<E>> for Error where Error: From<E> {
+    fn from(error: grenad::Error<E>) -> Error {
+        match error {
+            grenad::Error::Io(error) => Error::IoError(error),
+            grenad::Error::Merge(error) => Error::from(error),
+            grenad::Error::InvalidCompressionType => {
+                Error::InternalError(InternalError::GrenadInvalidCompressionType)
+            },
+        }
+    }
+}
+
+impl From<str::Utf8Error> for Error {
+    fn from(error: str::Utf8Error) -> Error {
+        Error::InternalError(InternalError::Utf8(error))
+    }
+}
+
+impl From<Infallible> for Error {
+    fn from(_error: Infallible) -> Error {
+        unreachable!()
+    }
+}
+
 impl From<HeedError> for Error {
     fn from(error: HeedError) -> Error {
         use self::Error::*;
@@ -70,15 +112,45 @@ impl From<HeedError> for Error {
             HeedError::Io(error) => Error::from(error),
             HeedError::Mdb(MdbError::MapFull) => UserError(DatabaseSizeReached),
             HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile),
-            HeedError::Mdb(error) => InternalError(StoreError(error)),
-            HeedError::Encoding => InternalError(SerializationError(Encoding { db_name: None })),
-            HeedError::Decoding => InternalError(SerializationError(Decoding { db_name: None })),
+            HeedError::Mdb(error) => InternalError(Store(error)),
+            HeedError::Encoding => InternalError(Serialization(Encoding { db_name: None })),
+            HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })),
             HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
             HeedError::DatabaseClosing => InternalError(DatabaseClosing),
         }
     }
 }
 
+impl From<ThreadPoolBuildError> for Error {
+    fn from(error: ThreadPoolBuildError) -> Error {
+        Error::InternalError(InternalError::RayonThreadPool(error))
+    }
+}
+
+impl From<FieldIdMapMissingEntry> for Error {
+    fn from(error: FieldIdMapMissingEntry) -> Error {
+        Error::InternalError(InternalError::FieldIdMapMissingEntry(error))
+    }
+}
+
+impl From<InternalError> for Error {
+    fn from(error: InternalError) -> Error {
+        Error::InternalError(error)
+    }
+}
+
+impl From<UserError> for Error {
+    fn from(error: UserError) -> Error {
+        Error::UserError(error)
+    }
+}
+
+impl From<SerializationError> for Error {
+    fn from(error: SerializationError) -> Error {
+        Error::InternalError(InternalError::Serialization(error))
+    }
+}
+
 impl fmt::Display for Error {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
@@ -98,13 +170,20 @@ impl fmt::Display for InternalError {
                 write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name)
             },
             Self::FieldIdMapMissingEntry(error) => error.fmt(f),
+            Self::Fst(error) => error.fmt(f),
+            Self::GrenadInvalidCompressionType => {
+                f.write_str("invalid compression type has been specified to grenad")
+            },
             Self::IndexingMergingKeys { process } => {
                 write!(f, "invalid merge while processing {}", process)
             },
-            Self::SerializationError(error) => error.fmt(f),
-            Self::StoreError(error) => error.fmt(f),
+            Self::Serialization(error) => error.fmt(f),
             Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f),
+            Self::RayonThreadPool(error) => error.fmt(f),
+            Self::SerdeJson(error) => error.fmt(f),
             Self::DatabaseClosing => HeedError::DatabaseClosing.fmt(f),
+            Self::Store(error) => error.fmt(f),
+            Self::Utf8(error) => error.fmt(f),
         }
     }
 }
@@ -115,7 +194,9 @@ impl
fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), + Self::Csv(error) => error.fmt(f), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), + Self::FilterParsing(error) => error.fmt(f), Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); @@ -130,6 +211,10 @@ impl fmt::Display for UserError { // TODO where can we find it instead of writing the text ourselves? Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), + Self::SerdeJson(error) => error.fmt(f), + Self::UnknownInternalDocumentId { document_id } => { + write!(f, "an unknown internal document id have been used ({})", document_id) + }, } } } From 312c2d1d8ef12d3b7e50a39f909978f73fd0459e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 14 Jun 2021 16:46:19 +0200 Subject: [PATCH 0794/1889] Use the Error enum everywhere in the project --- Cargo.lock | 1 - milli/Cargo.toml | 1 - milli/src/criterion.rs | 11 +- milli/src/index.rs | 22 ++-- milli/src/lib.rs | 16 ++- milli/src/search/criteria/asc_desc.rs | 21 ++-- milli/src/search/criteria/attribute.rs | 17 ++- milli/src/search/criteria/exactness.rs | 6 +- milli/src/search/criteria/final.rs | 3 +- milli/src/search/criteria/initial.rs | 4 +- milli/src/search/criteria/mod.rs | 27 ++--- milli/src/search/criteria/proximity.rs | 19 ++-- milli/src/search/criteria/typo.rs | 13 ++- milli/src/search/criteria/words.rs | 3 +- milli/src/search/distinct/facet_distinct.rs | 14 +-- milli/src/search/distinct/mod.rs | 6 +- milli/src/search/distinct/noop_distinct.rs | 4 +- milli/src/search/facet/facet_distribution.rs | 11 +- milli/src/search/facet/filter_condition.rs | 70 +++++++----- milli/src/search/facet/mod.rs | 3 +- milli/src/search/mod.rs | 10 +- milli/src/search/query_tree.rs | 14 +-- milli/src/update/clear_documents.rs | 5 +- milli/src/update/delete_documents.rs | 14 ++- milli/src/update/facets.rs | 13 ++- .../update/index_documents/merge_function.rs | 16 +-- milli/src/update/index_documents/mod.rs | 92 +++++++++------- milli/src/update/index_documents/store.rs | 104 ++++++++++-------- milli/src/update/index_documents/transform.rs | 77 +++++++------ milli/src/update/settings.rs | 41 +++---- milli/src/update/update_builder.rs | 4 +- milli/src/update/word_prefix_docids.rs | 3 +- .../word_prefix_pair_proximity_docids.rs | 4 +- milli/src/update/words_level_positions.rs | 12 +- milli/src/update/words_prefixes_fst.rs | 4 +- 35 files changed, 385 insertions(+), 300 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c719f6f03..8e6794fb6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1377,7 +1377,6 @@ dependencies = [ name = "milli" version = "0.3.1" dependencies = [ - "anyhow", "big_s", "bstr", "byteorder", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 9fe1ce3d3..ac7a977a2 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -5,7 +5,6 @@ authors = ["Kerollmops "] edition = "2018" [dependencies] -anyhow = "1.0.38" bstr = "0.2.15" byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index c2205613d..931cf8588 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,11 +1,12 @@ use std::fmt; use std::str::FromStr; -use 
anyhow::{Context, bail}; use regex::Regex; use serde::{Serialize, Deserialize}; use once_cell::sync::Lazy; +use crate::error::{Error, UserError}; + static ASC_DESC_REGEX: Lazy = Lazy::new(|| { Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() }); @@ -41,7 +42,7 @@ impl Criterion { } impl FromStr for Criterion { - type Err = anyhow::Error; + type Err = Error; fn from_str(txt: &str) -> Result { match txt { @@ -51,13 +52,15 @@ impl FromStr for Criterion { "attribute" => Ok(Criterion::Attribute), "exactness" => Ok(Criterion::Exactness), text => { - let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; + let caps = ASC_DESC_REGEX.captures(text).ok_or_else(|| { + UserError::InvalidCriterionName { name: text.to_string() } + })?; let order = caps.get(1).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str(); match order { "asc" => Ok(Criterion::Asc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())), - otherwise => bail!("unknown criterion name: {}", otherwise), + text => return Err(UserError::InvalidCriterionName { name: text.to_string() }.into()), } }, } diff --git a/milli/src/index.rs b/milli/src/index.rs index 4e32f673a..9ebe34a2e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2,14 +2,14 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::path::Path; -use anyhow::Context; use chrono::{DateTime, Utc}; use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use heed::types::*; use roaring::RoaringBitmap; +use crate::error::UserError; use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search}; -use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; +use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result}; use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, @@ -84,7 +84,7 @@ pub struct Index { } impl Index { - pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { + pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { options.max_dbs(14); let env = options.open(path)?; @@ -173,7 +173,7 @@ impl Index { } /// Returns the number of documents indexed in the database. - pub fn number_of_documents(&self, rtxn: &RoTxn) -> anyhow::Result { + pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result { let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, DOCUMENTS_IDS_KEY)?; Ok(count.unwrap_or_default()) } @@ -215,7 +215,7 @@ impl Index { /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). - pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result> { + pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { let hard = self.main.get::<_, Str, ByteSlice>(rtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; let soft = self.main.get::<_, Str, ByteSlice>(rtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; let hard = match hard { @@ -504,7 +504,7 @@ impl Index { } /// Returns the FST which is the words dictionary of the engine. - pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result>> { + pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? 
{
             Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
             None => Ok(fst::Set::default().map_data(Cow::Owned)?),
@@ -521,7 +521,7 @@ impl Index {
         self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY)
     }
 
-    pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<Option<fst::Set<&'t [u8]>>> {
+    pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> Result<Option<fst::Set<&'t [u8]>>> {
         match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? {
             Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
             None => Ok(None),
@@ -555,7 +555,7 @@ impl Index {
     }
 
     /// Returns the FST which is the words prefixes dictionary of the engine.
-    pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> anyhow::Result<fst::Set<Cow<'t, [u8]>>> {
+    pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
         match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_PREFIXES_FST_KEY)? {
             Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?),
             None => Ok(fst::Set::default().map_data(Cow::Owned)?),
@@ -577,13 +577,13 @@ impl Index {
         &self,
         rtxn: &'t RoTxn,
         ids: impl IntoIterator<Item = DocumentId>,
-    ) -> anyhow::Result<Vec<(DocumentId, obkv::KvReader<'t>)>>
+    ) -> Result<Vec<(DocumentId, obkv::KvReader<'t>)>>
     {
         let mut documents = Vec::new();
 
         for id in ids {
             let kv = self.documents.get(rtxn, &BEU32::new(id))?
-                .with_context(|| format!("Could not find document {}", id))?;
+                .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?;
             documents.push((id, kv));
         }
 
@@ -594,7 +594,7 @@ pub fn all_documents<'t>(
         &self,
         rtxn: &'t RoTxn,
-    ) -> anyhow::Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReader<'t>)>>> {
+    ) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReader<'t>)>>> {
         Ok(self
             .documents
             .iter(rtxn)?
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index b7401330a..6fa88ad64 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -15,12 +15,13 @@ pub mod update;
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::hash::BuildHasherDefault;
+use std::result::Result as StdResult;
 
-use anyhow::Context;
 use fxhash::{FxHasher32, FxHasher64};
 use serde_json::{Map, Value};
 
 pub use self::criterion::{Criterion, default_criteria};
+pub use self::error::Error;
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec};
@@ -30,6 +31,8 @@ pub use self::index::Index;
 pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords};
 pub use self::tree_level::TreeLevel;
 
+pub type Result<T> = std::result::Result<T, Error>;
+
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
 pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
 pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
@@ -44,21 +47,24 @@ pub type FieldId = u8;
 pub type Position = u32;
 pub type FieldsDistribution = HashMap<String, u64>;
 
-type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result<Vec<u8>>;
+type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
 
 /// Transform a raw obkv store into a JSON Object.
pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReader, -) -> anyhow::Result> +) -> Result> { displayed_fields.iter() .copied() .flat_map(|id| obkv.get(id).map(|value| (id, value))) .map(|(id, value)| { - let name = fields_ids_map.name(id).context("unknown obkv field id")?; - let value = serde_json::from_slice(value)?; + let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId { + field_id: id, + from_db_name: "documents", + })?; + let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?; Ok((name.to_owned(), value)) }) .collect() diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index f90f3e421..c72781629 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,15 +1,15 @@ use std::mem::take; -use anyhow::Context; use itertools::Itertools; use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; +use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; -use crate::{FieldId, Index}; +use crate::{FieldId, Index, Result}; use super::{Criterion, CriterionParameters, CriterionResult}; /// Threshold on the number of candidates that will make @@ -36,7 +36,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result { + ) -> Result { Self::new(index, rtxn, parent, field_name, true) } @@ -45,7 +45,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result { + ) -> Result { Self::new(index, rtxn, parent, field_name, false) } @@ -55,11 +55,14 @@ impl<'t> AscDesc<'t> { parent: Box, field_name: String, ascending: bool, - ) -> anyhow::Result { + ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map .id(&field_name) - .with_context(|| format!("field {:?} isn't registered", field_name))?; + .ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: field_name.clone(), + from_db_name: "asc-desc", + })?; Ok(AscDesc { index, @@ -79,7 +82,7 @@ impl<'t> AscDesc<'t> { impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. 
self.allowed_candidates -= params.excluded_candidates; @@ -162,7 +165,7 @@ fn facet_ordered<'t>( field_id: FieldId, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result> + 't>> { +) -> Result> + 't>> { if candidates.len() <= CANDIDATES_THRESHOLD { let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) @@ -186,7 +189,7 @@ fn iterative_facet_ordered_iter<'t>( field_id: FieldId, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result + 't> { +) -> Result + 't> { let mut docids_values = Vec::with_capacity(candidates.len() as usize); for docid in candidates.iter() { let left = (field_id, docid, f64::MIN); diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index f825623f6..f191defe1 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -5,7 +5,7 @@ use std::mem::take; use roaring::RoaringBitmap; -use crate::{TreeLevel, search::build_dfa}; +use crate::{TreeLevel, Result, search::build_dfa}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; @@ -48,7 +48,7 @@ impl<'t> Attribute<'t> { impl<'t> Criterion for Attribute<'t> { #[logging_timer::time("Attribute::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. if let Some((_, _, allowed_candidates)) = self.state.as_mut() { *allowed_candidates -= params.excluded_candidates; @@ -224,7 +224,12 @@ struct QueryLevelIterator<'t, 'q> { } impl<'t, 'q> QueryLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, queries: &'q [Query], wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + fn new( + ctx: &'t dyn Context<'t>, + queries: &'q [Query], + wdcache: &mut WordDerivationsCache, + ) -> Result> + { let mut inner = Vec::with_capacity(queries.len()); for query in queries { match &query.kind { @@ -471,7 +476,7 @@ fn initialize_query_level_iterators<'t, 'q>( branches: &'q FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result>> { +) -> Result>> { let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { @@ -521,7 +526,7 @@ fn set_compute_candidates<'t>( branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result> +) -> Result> { let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; let lowest_level = TreeLevel::min_value(); @@ -573,7 +578,7 @@ fn linear_compute_candidates( ctx: &dyn Context, branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, -) -> anyhow::Result> +) -> Result> { fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap) -> u64 { let mut min_rank = u64::max_value(); diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 4d9e54f6e..eb44b7b8e 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -14,7 +14,7 @@ use crate::search::criteria::{ CriterionResult, resolve_query_tree, }; -use crate::TreeLevel; +use crate::{TreeLevel, Result}; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -45,7 +45,7 @@ impl<'t> Exactness<'t> { impl<'t> Criterion for 
Exactness<'t> { #[logging_timer::time("Exactness::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. if let Some(state) = self.state.as_mut() { state.difference_with(params.excluded_candidates); @@ -158,7 +158,7 @@ fn resolve_state( ctx: &dyn Context, state: State, query: &[ExactQueryPart], -) -> anyhow::Result<(RoaringBitmap, Option)> +) -> Result<(RoaringBitmap, Option)> { use State::*; match state { diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 860362f51..645a3a5d7 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -1,6 +1,7 @@ use log::debug; use roaring::RoaringBitmap; +use crate::Result; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context}; @@ -29,7 +30,7 @@ impl<'t> Final<'t> { } #[logging_timer::time("Final::{}")] - pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> anyhow::Result> { + pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> Result> { debug!("Final iteration"); let excluded_candidates = &self.returned_candidates | excluded_candidates; let mut criterion_parameters = CriterionParameters { diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 5d242a0eb..e6d0a17f7 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,7 +1,7 @@ use roaring::RoaringBitmap; +use crate::Result; use crate::search::query_tree::Operation; - use super::{Criterion, CriterionResult, CriterionParameters}; pub struct Initial { @@ -22,7 +22,7 @@ impl Initial { impl Criterion for Initial { #[logging_timer::time("Initial::{}")] - fn next(&mut self, _: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, _: &mut CriterionParameters) -> Result> { Ok(self.answer.take()) } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b14d75ddb..981fc3ef2 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -4,7 +4,7 @@ use std::borrow::Cow; use roaring::RoaringBitmap; use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; -use crate::{Index, DocumentId}; +use crate::{Index, DocumentId, Result}; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use self::asc_desc::AscDesc; @@ -26,7 +26,7 @@ mod words; pub mod r#final; pub trait Criterion { - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result>; + fn next(&mut self, params: &mut CriterionParameters) -> Result>; } /// The result of a call to the parent criterion. 
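As a sketch of what this trait asks of an implementor (hypothetical code, mirroring the `Initial` criterion above): a criterion is a fallible iterator over candidate sets, and the simplest possible one just hands back a precomputed answer once:

    // A one-shot criterion in the spirit of Initial: the first call returns
    // the stored answer, every later call returns Ok(None) to end the chain.
    struct OneShot {
        answer: Option<CriterionResult>,
    }

    impl Criterion for OneShot {
        fn next(&mut self, _params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
            Ok(self.answer.take()) // take() leaves None behind
        }
    }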
@@ -78,8 +78,9 @@ pub trait Context<'c> { fn synonyms(&self, word: &str) -> heed::Result>>>; fn searchable_fields_ids(&self) -> heed::Result>; fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result>; - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error>; + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result>; } + pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, index: &'t Index, @@ -185,14 +186,14 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.field_id_word_count_docids.get(self.rtxn, &key) } - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result> { let key = (word, level, left, right); self.index.word_level_position_docids.get(self.rtxn, &key) } } impl<'t> CriteriaBuilder<'t> { - pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result { + pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> Result { let words_fst = index.words_fst(rtxn)?; let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) @@ -203,7 +204,7 @@ impl<'t> CriteriaBuilder<'t> { query_tree: Option, primitive_query: Option>, filtered_candidates: Option, - ) -> anyhow::Result> + ) -> Result> { use crate::criterion::Criterion as Name; @@ -230,13 +231,13 @@ pub fn resolve_query_tree<'t>( ctx: &'t dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result + ) -> Result { use Operation::{And, Phrase, Or, Query}; @@ -244,7 +245,7 @@ pub fn resolve_query_tree<'t>( And(ops) => { let mut ops = ops.iter().map(|op| { resolve_operation(ctx, op, wdcache) - }).collect::>>()?; + }).collect::>>()?; ops.sort_unstable_by_key(|cds| cds.len()); @@ -302,7 +303,7 @@ fn all_word_pair_proximity_docids, U: AsRef>( left_words: &[(T, u8)], right_words: &[(U, u8)], proximity: u8 -) -> anyhow::Result +) -> Result { let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { @@ -318,7 +319,7 @@ fn query_docids( ctx: &dyn Context, query: &Query, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { match &query.kind { QueryKind::Exact { word, .. 
} => { @@ -354,7 +355,7 @@ fn query_pair_proximity_docids( right: &Query, proximity: u8, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { if proximity >= 8 { let mut candidates = query_docids(ctx, left, wdcache)?; @@ -481,7 +482,7 @@ pub mod test { todo!() } - fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> Result, heed::Error> { + fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> heed::Result> { todo!() } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 4da6fd1eb..c3c8027cb 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -5,9 +5,10 @@ use std::mem::take; use roaring::RoaringBitmap; use log::debug; -use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::{build_dfa, WordDerivationsCache}; +use crate::search::{query_tree::QueryKind}; +use crate::{DocumentId, Position, Result}; use super::{ Context, Criterion, @@ -55,7 +56,7 @@ impl<'t> Proximity<'t> { impl<'t> Criterion for Proximity<'t> { #[logging_timer::time("Proximity::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. if let Some((_, _, allowed_candidates)) = self.state.as_mut() { *allowed_candidates -= params.excluded_candidates; @@ -161,7 +162,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, @@ -169,7 +170,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> + ) -> Result> { use Operation::{And, Phrase, Or}; @@ -227,7 +228,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> + ) -> Result> { fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { (0..=mana.min(left_max)).map(move |m| (m, mana - m)) @@ -281,7 +282,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> + ) -> Result> { // Extract the first two elements but gives the tail // that is just after the first element. 
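One helper in the hunk above deserves a worked example: `pair_combinations` enumerates every way to split a proximity budget (`mana`) between the left and right subtrees, capped by what the left subtree can absorb. Restated on its own, with the elided iterator item type spelled out:

    // pair_combinations(3, 2) yields (0, 3), (1, 2), (2, 1):
    // each pair sums to mana, and the left share never exceeds left_max.
    fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
        (0..=mana.min(left_max)).map(move |m| (m, mana - m))
    }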
@@ -324,13 +325,13 @@ fn resolve_plane_sweep_candidates( query_tree: &Operation, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result> +) -> Result> { /// FIXME may be buggy with query like "new new york" fn plane_sweep( groups_positions: Vec>, consecutive: bool, - ) -> anyhow::Result> + ) -> Result> { fn compute_groups_proximity( groups: &[(usize, (Position, u8, Position))], @@ -451,7 +452,7 @@ fn resolve_plane_sweep_candidates( rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, words_positions: &HashMap, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result> + ) -> Result> { use Operation::{And, Phrase, Or}; diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index d075b6bca..436f4affd 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -5,6 +5,7 @@ use roaring::RoaringBitmap; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; +use crate::Result; use super::{ Candidates, Context, @@ -43,7 +44,7 @@ impl<'t> Typo<'t> { impl<'t> Criterion for Typo<'t> { #[logging_timer::time("Typo::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { use Candidates::{Allowed, Forbidden}; // remove excluded candidates when next is called, instead of doing it in the loop. match self.state.as_mut() { @@ -163,14 +164,14 @@ fn alterate_query_tree( mut query_tree: Operation, number_typos: u8, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { fn recurse( words_fst: &fst::Set>, operation: &mut Operation, number_typos: u8, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result<()> + ) -> Result<()> { use Operation::{And, Phrase, Or}; @@ -218,7 +219,7 @@ fn resolve_candidates<'t>( number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, -) -> anyhow::Result +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, @@ -226,7 +227,7 @@ fn resolve_candidates<'t>( number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result + ) -> Result { use Operation::{And, Phrase, Or, Query}; @@ -277,7 +278,7 @@ fn resolve_candidates<'t>( mana: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, - ) -> anyhow::Result + ) -> Result { match branches.split_first() { Some((head, [])) => { diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 8730fa331..add90d80d 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -4,6 +4,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; +use crate::Result; use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree}; pub struct Words<'t> { @@ -30,7 +31,7 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { // remove excluded candidates when next is called, instead of doing it in the loop. 
if let Some(candidates) = self.candidates.as_mut() {
             *candidates -= params.excluded_candidates;
diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs
index de7b28141..f86d6b8ed 100644
--- a/milli/src/search/distinct/facet_distinct.rs
+++ b/milli/src/search/distinct/facet_distinct.rs
@@ -5,7 +5,7 @@ use roaring::RoaringBitmap;
 
 use super::{Distinct, DocIter};
 use crate::heed_codec::facet::*;
-use crate::{DocumentId, FieldId, Index};
+use crate::{DocumentId, FieldId, Index, Result};
 
 const FID_SIZE: usize = size_of::<FieldId>();
 const DOCID_SIZE: usize = size_of::<DocumentId>();
@@ -57,7 +57,7 @@ impl<'a> FacetDistinctIter<'a> {
             .get(self.txn, &(self.distinct, 0, key, key))
     }
 
-    fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> {
+    fn distinct_string(&mut self, id: DocumentId) -> Result<()> {
         let iter = facet_string_values(id, self.distinct, self.index, self.txn)?;
 
         for item in iter {
@@ -73,7 +73,7 @@ impl<'a> FacetDistinctIter<'a> {
         Ok(())
     }
 
-    fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> {
+    fn distinct_number(&mut self, id: DocumentId) -> Result<()> {
         let iter = facet_number_values(id, self.distinct, self.index, self.txn)?;
 
         for item in iter {
@@ -92,7 +92,7 @@ impl<'a> FacetDistinctIter<'a> {
     /// Performs the next iteration of the facet distinct. This is a convenience method that is
     /// called by the Iterator::next implementation that transposes the result. It makes error
     /// handling easier.
-    fn next_inner(&mut self) -> anyhow::Result<Option<DocumentId>> {
+    fn next_inner(&mut self) -> Result<Option<DocumentId>> {
         // The first step is to remove all the excluded documents from our candidates
         self.candidates.difference_with(&self.excluded);
 
@@ -129,7 +129,7 @@ fn facet_number_values<'a>(
     distinct: FieldId,
     index: &Index,
     txn: &'a heed::RoTxn,
-) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, heed::types::Unit>> {
+) -> Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, heed::types::Unit>> {
     let key = facet_values_prefix_key(distinct, id);
 
     let iter = index
@@ -146,7 +146,7 @@ fn facet_string_values<'a>(
     distinct: FieldId,
     index: &Index,
     txn: &'a heed::RoTxn,
-) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, heed::types::Unit>> {
+) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, heed::types::Unit>> {
     let key = facet_values_prefix_key(distinct, id);
 
     let iter = index
@@ -159,7 +159,7 @@ fn facet_string_values<'a>(
 }
 
 impl Iterator for FacetDistinctIter<'_> {
-    type Item = anyhow::Result<DocumentId>;
+    type Item = Result<DocumentId>;
 
     fn next(&mut self) -> Option<Self::Item> {
         self.next_inner().transpose()
diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs
index 1b7c69c7a..99bc74be0 100644
--- a/milli/src/search/distinct/mod.rs
+++ b/milli/src/search/distinct/mod.rs
@@ -3,13 +3,13 @@ mod noop_distinct;
 
 use roaring::RoaringBitmap;
 
-use crate::DocumentId;
+use crate::{DocumentId, Result};
 
 pub use facet_distinct::FacetDistinct;
 pub use noop_distinct::NoopDistinct;
 
 /// A trait implemented by document iterators that are returned by calls to `Distinct::distinct`.
 /// It provides a way to get back the ownership to the excluded set.
-pub trait DocIter: Iterator<Item = anyhow::Result<DocumentId>> {
+pub trait DocIter: Iterator<Item = Result<DocumentId>> {
     /// Returns ownership on the internal excluded set.
     fn into_excluded(self) -> RoaringBitmap;
 }
@@ -106,7 +106,7 @@ mod test {
     /// Checks that all the candidates are distinct, and returns the candidates number.
pub(crate) fn validate_distinct_candidates( - candidates: impl Iterator>, + candidates: impl Iterator>, distinct: FieldId, index: &Index, ) -> usize { diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index bfaafed85..812701794 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -1,6 +1,6 @@ use roaring::{RoaringBitmap, bitmap::IntoIter}; -use crate::DocumentId; +use crate::{DocumentId, Result}; use super::{DocIter, Distinct}; /// A distinct implementer that does not perform any distinct, @@ -13,7 +13,7 @@ pub struct NoopDistinctIter { } impl Iterator for NoopDistinctIter { - type Item = anyhow::Result; + type Item = Result; fn next(&mut self) -> Option { self.candidates.next().map(Ok) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 565f4c6dd..917314b25 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -2,15 +2,15 @@ use std::collections::{HashSet, BTreeMap}; use std::ops::Bound::Unbounded; use std::{cmp, fmt}; -use anyhow::Context; use heed::{Database, BytesDecode}; use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; +use crate::error::FieldIdMapMissingEntry; use crate::facet::FacetType; use crate::heed_codec::facet::FacetValueStringCodec; use crate::search::facet::{FacetIter, FacetRange}; -use crate::{Index, FieldId, DocumentId}; +use crate::{Index, FieldId, DocumentId, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -195,14 +195,15 @@ impl<'a> FacetDistribution<'a> { } } - pub fn execute(&self) -> anyhow::Result>> { + pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let filterable_fields = self.index.filterable_fields(self.rtxn)?; let mut distribution = BTreeMap::new(); for name in filterable_fields { - let fid = fields_ids_map.id(&name).with_context(|| { - format!("missing field name {:?} from the fields id map", name) + let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: name.clone(), + from_db_name: "filterable-fields", })?; let values = self.facet_values(fid)?; distribution.insert(name, values); diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f58443b6f..98d638574 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded}; +use std::result::Result as StdResult; use std::str::FromStr; use either::Either; @@ -11,8 +12,9 @@ use pest::iterators::{Pair, Pairs}; use pest::Parser; use roaring::RoaringBitmap; +use crate::error::UserError; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; -use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec}; +use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec, Result}; use super::FacetRange; use super::parser::Rule; @@ -60,7 +62,7 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, array: I, - ) -> anyhow::Result> + ) -> Result> where I: IntoIterator>, J: IntoIterator, A: AsRef, @@ -104,11 +106,11 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, expression: &str, - ) -> anyhow::Result + ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = 
index.filterable_fields_ids(rtxn)?; - let lexed = FilterParser::parse(Rule::prgm, expression)?; + let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::FilterParsing)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } @@ -116,7 +118,7 @@ impl FilterCondition { fim: &FieldsIdsMap, ff: &HashSet, expression: Pairs, - ) -> anyhow::Result + ) -> Result { PREC_CLIMBER.climb( expression, @@ -133,7 +135,7 @@ impl FilterCondition { Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), _ => unreachable!(), }, - |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { + |lhs: Result, op: Pair, rhs: Result| { match op.as_rule() { Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), @@ -158,16 +160,17 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::FilterParsing)?; let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); - let lvalue = lresult?; - let rvalue = rresult?; + let lvalue = lresult.map_err(UserError::FilterParsing)?; + let rvalue = rresult.map_err(UserError::FilterParsing)?; Ok(Operator(fid, Between(lvalue, rvalue))) } @@ -176,10 +179,11 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::FilterParsing)?; let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); @@ -192,60 +196,68 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::FilterParsing)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::FilterParsing)?; - Ok(Operator(fid, GreaterThan(result?))) + Ok(Operator(fid, GreaterThan(value))) } fn greater_than_or_equal( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::FilterParsing)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::FilterParsing)?; - Ok(Operator(fid, GreaterThanOrEqual(result?))) + Ok(Operator(fid, GreaterThanOrEqual(value))) } fn lower_than( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::FilterParsing)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::FilterParsing)?; - 
Ok(Operator(fid, LowerThan(result?))) + Ok(Operator(fid, LowerThan(value))) } fn lower_than_or_equal( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> anyhow::Result + ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items)?; + let fid = field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::FilterParsing)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::FilterParsing)?; - Ok(Operator(fid, LowerThanOrEqual(result?))) + Ok(Operator(fid, LowerThanOrEqual(value))) } } @@ -260,7 +272,7 @@ impl FilterCondition { left: Bound, right: Bound, output: &mut RoaringBitmap, - ) -> anyhow::Result<()> + ) -> Result<()> { match (left, right) { // If the request is an exact value we must go directly to the deepest level. @@ -332,7 +344,7 @@ impl FilterCondition { strings_db: heed::Database, field_id: FieldId, operator: &Operator, - ) -> anyhow::Result + ) -> Result { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the @@ -390,7 +402,7 @@ impl FilterCondition { &self, rtxn: &heed::RoTxn, index: &Index, - ) -> anyhow::Result + ) -> Result { let numbers_db = index.facet_id_f64_docids; let strings_db = index.facet_id_string_docids; @@ -422,7 +434,7 @@ fn field_id( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, items: &mut Pairs, -) -> Result> +) -> StdResult> { // lexing ensures that we at least have a key let key = items.next().unwrap(); @@ -463,7 +475,7 @@ fn field_id( /// the original string that we tried to parse. /// /// Returns the parsing error associated with the span if the conversion fails. -fn pest_parse(pair: Pair) -> (Result>, String) +fn pest_parse(pair: Pair) -> (StdResult>, String) where T: FromStr, T::Err: ToString, { diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index a5e02fc9f..a1a03dba3 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -9,8 +9,9 @@ use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{Index, FieldId}; -pub use self::filter_condition::{FilterCondition, Operator}; pub use self::facet_distribution::FacetDistribution; +pub use self::filter_condition::{FilterCondition, Operator}; +pub(crate) use self::parser::Rule as ParserRule; mod filter_condition; mod facet_distribution; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 872ebfca6..f8c7b5d9b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::hash_map::{Entry, HashMap}; use std::fmt; use std::mem::take; +use std::result::Result as StdResult; use std::str::Utf8Error; use std::time::Instant; @@ -14,10 +15,11 @@ use roaring::bitmap::RoaringBitmap; use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; use crate::search::criteria::r#final::{Final, FinalResult}; -use crate::{Index, DocumentId}; +use crate::{Index, DocumentId, Result}; pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator}; pub use self::matching_words::MatchingWords; +pub(crate) use self::facet::ParserRule; use self::query_tree::QueryTreeBuilder; // Building these factories is not free. 
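
The `use std::result::Result as StdResult;` line added above is what keeps this refactor readable: the crate exposes its own single-parameter `Result<T>`, while signatures that surface a foreign error, like `word_derivations` in the next hunk, keep the explicit two-parameter form under the `StdResult` name. A minimal sketch of the pattern, with an illustrative `Error` enum standing in for milli's real one:

use std::result::Result as StdResult;
use std::str::Utf8Error;

#[derive(Debug)]
pub enum Error {
    Utf8(Utf8Error),
}

// The crate-wide alias: one type parameter, the error is always `Error`.
pub type Result<T> = StdResult<T, Error>;

// Functions returning the crate error use the short alias...
pub fn decoded_len(bytes: &[u8]) -> Result<usize> {
    decode(bytes).map(str::len).map_err(Error::Utf8)
}

// ...while functions exposing a caller-relevant foreign error keep the
// explicit two-parameter form, spelled `StdResult` to avoid the clash.
pub fn decode(bytes: &[u8]) -> StdResult<&str, Utf8Error> {
    std::str::from_utf8(bytes)
}

Call sites opt in with `use crate::Result;`, which is exactly what the hunks above add to each touched module.
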
@@ -93,7 +95,7 @@ impl<'a> Search<'a> { self } - pub fn execute(&self) -> anyhow::Result { + pub fn execute(&self) -> Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); let (query_tree, primitive_query) = match self.query.as_ref() { @@ -152,7 +154,7 @@ impl<'a> Search<'a> { mut distinct: D, matching_words: MatchingWords, mut criteria: Final, - ) -> anyhow::Result + ) -> Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); @@ -225,7 +227,7 @@ pub fn word_derivations<'c>( max_typo: u8, fst: &fst::Set>, cache: &'c mut WordDerivationsCache, -) -> Result<&'c [(String, u8)], Utf8Error> { +) -> StdResult<&'c [(String, u8)], Utf8Error> { match cache.entry((word.to_string(), is_prefix, max_typo)) { Entry::Occupied(entry) => Ok(entry.into_mut()), Entry::Vacant(entry) => { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 3c3420db4..c371b07d4 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -7,7 +7,7 @@ use meilisearch_tokenizer::TokenKind; use roaring::RoaringBitmap; use slice_group_by::GroupBy; -use crate::Index; +use crate::{Index, Result}; type IsOptionalWord = bool; type IsPrefix = bool; @@ -219,7 +219,7 @@ impl<'a> QueryTreeBuilder<'a> { /// - if `authorize_typos` is set to `false` the query tree will be generated /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) - pub fn build(&self, query: TokenStream) -> anyhow::Result> { + pub fn build(&self, query: TokenStream) -> Result> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { @@ -291,14 +291,14 @@ fn create_query_tree( optional_words: bool, authorize_typos: bool, query: &[PrimitiveQueryPart], -) -> anyhow::Result +) -> Result { /// Matches on the `PrimitiveQueryPart` and create an operation from it. fn resolve_primitive_part( ctx: &impl Context, authorize_typos: bool, part: PrimitiveQueryPart, - ) -> anyhow::Result + ) -> Result { match part { // 1. 
try to split word in 2 @@ -325,7 +325,7 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - ) -> anyhow::Result + ) -> Result { const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); @@ -379,7 +379,7 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: PrimitiveQuery, - ) -> anyhow::Result + ) -> Result { let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); let mut operation_children = Vec::new(); @@ -532,7 +532,7 @@ mod test { authorize_typos: bool, words_limit: Option, query: TokenStream, - ) -> anyhow::Result> + ) -> Result> { let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index f4c13e8f8..6e26bf027 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,6 +1,7 @@ use chrono::Utc; use roaring::RoaringBitmap; -use crate::{ExternalDocumentsIds, Index, FieldsDistribution}; + +use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -18,7 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { ClearDocuments { wtxn, index, _update_id: update_id } } - pub fn execute(self) -> anyhow::Result { + pub fn execute(self) -> Result { self.index.set_updated_at(self.wtxn, &Utc::now())?; let Index { env: _env, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index c4cf132bb..6792d6278 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,15 +1,15 @@ use std::collections::HashMap; use std::collections::hash_map::Entry; -use anyhow::{anyhow, Context}; use chrono::Utc; use fst::IntoStreamer; use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; use serde_json::Value; +use crate::error::{InternalError, UserError}; use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds}; +use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -25,7 +25,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> anyhow::Result> + ) -> Result> { let external_documents_ids = index .external_documents_ids(wtxn)? @@ -54,7 +54,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { Some(docid) } - pub fn execute(self) -> anyhow::Result { + pub fn execute(self) -> Result { self.index.set_updated_at(self.wtxn, &Utc::now())?; // We retrieve the current documents ids that are in the database. 
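
The `use crate::error::{InternalError, UserError};` import added to delete_documents.rs above comes from the new two-level error split that replaces `anyhow` throughout this patch. A hedged sketch of its shape, reduced to the variants visible in these hunks (the real enum carries more cases, and `serde_json` is assumed as a dependency since the diff stores a `Value` in `InvalidDocumentId`):

#[derive(Debug)]
pub enum Error {
    InternalError(InternalError),
    UserError(UserError),
}

#[derive(Debug)]
pub enum InternalError {
    DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> },
}

#[derive(Debug)]
pub enum UserError {
    AttributeLimitReached,
    MissingPrimaryKey,
    InvalidDocumentId { document_id: serde_json::Value },
}

impl From<InternalError> for Error {
    fn from(error: InternalError) -> Error {
        Error::InternalError(error)
    }
}

impl From<UserError> for Error {
    fn from(error: UserError) -> Error {
        Error::UserError(error)
    }
}

With those `From` impls in place, `Err(UserError::InvalidDocumentId { document_id }.into())` and the `?` operator both lift the specific error into the top-level `Error`, which is what the rewritten arms in this file rely on.
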
let mut documents_ids = self.index.documents_ids(self.wtxn)?; @@ -77,7 +77,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let primary_key = self.index.primary_key(self.wtxn)?.context("missing primary key")?; + let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| { + InternalError::DatabaseMissingEntry { db_name: "main", key: Some("primary-key") } + })?; let id_field = fields_ids_map.id(primary_key).expect(r#"the field "id" to be present"#); let Index { @@ -119,7 +121,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let external_id = match serde_json::from_slice(content).unwrap() { Value::String(string) => SmallString32::from(string.as_str()), Value::Number(number) => SmallString32::from(number.to_string()), - _ => return Err(anyhow!("documents ids must be either strings or numbers")), + document_id => return Err(UserError::InvalidDocumentId { document_id }.into()), }; external_ids.push(external_id); } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index f0eab6023..757cbe810 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -9,11 +9,12 @@ use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; +use crate::error::InternalError; use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::Index; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::{Index, Result}; pub struct Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -55,7 +56,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { self.index.set_updated_at(self.wtxn, &Utc::now())?; // We get the faceted fields to be able to create the facet levels. let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; @@ -102,7 +103,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), content, - |_, _| anyhow::bail!("invalid facet number level merging"), + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number level" }), WriteMethod::GetMergePut, )?; } @@ -132,7 +133,7 @@ fn compute_facet_number_levels<'t>( level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, field_id: u8, -) -> anyhow::Result> +) -> Result> { let first_level_size = db .remap_key_type::() @@ -195,7 +196,7 @@ fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: u8, -) -> anyhow::Result +) -> Result { let mut documents_ids = RoaringBitmap::new(); @@ -214,7 +215,7 @@ fn write_number_entry( left: f64, right: f64, ids: &RoaringBitmap, -) -> anyhow::Result<()> +) -> Result<()> { let key = (field_id, level, left, right); let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 8c93773ce..3d9ffda6a 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -1,17 +1,19 @@ use std::borrow::Cow; +use std::result::Result as StdResult; use fst::IntoStreamer; use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; +use crate::Result; /// Only the last value associated with an id is kept. 
-pub fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result> { Ok(obkvs.last().unwrap().clone().into_owned()) } /// Merge all the obks in the order we see them. -pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result> { let mut iter = obkvs.iter(); let first = iter.next().map(|b| b.clone().into_owned()).unwrap(); Ok(iter.fold(first, |acc, current| { @@ -24,8 +26,8 @@ pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> anyhow::Result> } // Union of multiple FSTs -pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - let fsts = values.iter().map(fst::Set::new).collect::, _>>()?; +pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { + let fsts = values.iter().map(fst::Set::new).collect::, _>>()?; let op_builder: fst::set::OpBuilder = fsts.iter().map(|fst| fst.into_stream()).collect(); let op = op_builder.r#union(); @@ -34,7 +36,7 @@ pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { Ok(build.into_inner().unwrap()) } -pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { Ok(values.first().unwrap().to_vec()) } @@ -54,7 +56,7 @@ pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mu writer.finish().unwrap(); } -pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { let (head, tail) = values.split_first().unwrap(); let mut head = RoaringBitmap::deserialize_from(&head[..])?; @@ -68,7 +70,7 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result Ok(vec) } -pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { +pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { let (head, tail) = values.split_first().unwrap(); let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 1d31cba85..51c8b948a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -3,11 +3,11 @@ use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom, BufReader, BufRead}; use std::num::{NonZeroU32, NonZeroUsize}; +use std::result::Result as StdResult; use std::str; use std::sync::mpsc::sync_channel; use std::time::Instant; -use anyhow::Context; use bstr::ByteSlice as _; use chrono::Utc; use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; @@ -18,7 +18,8 @@ use rayon::prelude::*; use rayon::ThreadPool; use serde::{Serialize, Deserialize}; -use crate::index::Index; +use crate::error::{Error, InternalError}; +use crate::{Index, Result}; use crate::update::{ Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, WordPrefixPairProximityDocids, @@ -56,14 +57,14 @@ pub fn create_writer(typ: CompressionType, level: Option, file: File) -> io builder.build(file) } -pub fn create_sorter( - merge: MergeFn, +pub fn create_sorter( + merge: MergeFn, chunk_compression_type: CompressionType, chunk_compression_level: Option, chunk_fusing_shrink_size: Option, max_nb_chunks: Option, max_memory: Option, -) -> Sorter +) -> Sorter> { let mut builder = Sorter::builder(merge); if let 
Some(shrink_size) = chunk_fusing_shrink_size { @@ -82,7 +83,7 @@ pub fn create_sorter( builder.build() } -pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> anyhow::Result> { +pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> Result> { let mut file = writer.into_inner()?; file.seek(SeekFrom::Start(0))?; let file = if let Some(shrink_size) = shrink_size { @@ -93,19 +94,25 @@ pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> any Reader::new(file).map_err(Into::into) } -pub fn merge_readers(sources: Vec>, merge: MergeFn) -> Merger { +pub fn merge_readers( + sources: Vec>, + merge: MergeFn, +) -> Merger> +{ let mut builder = Merger::builder(merge); builder.extend(sources); builder.build() } -pub fn merge_into_lmdb_database( +pub fn merge_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, sources: Vec>, - merge: MergeFn, + merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> +) -> Result<()> +where + Error: From, { debug!("Merging {} MTBL stores...", sources.len()); let before = Instant::now(); @@ -123,13 +130,15 @@ pub fn merge_into_lmdb_database( Ok(()) } -pub fn write_into_lmdb_database( +pub fn write_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, mut reader: Reader, - merge: MergeFn, + merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> +) -> Result<()> +where + Error: From, { debug!("Writing MTBL stores..."); let before = Instant::now(); @@ -138,9 +147,7 @@ pub fn write_into_lmdb_database( WriteMethod::Append => { let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; while let Some((k, v)) = reader.next()? { - out_iter.append(k, v).with_context(|| { - format!("writing {:?} into LMDB", k.as_bstr()) - })?; + out_iter.append(k, v)?; } }, WriteMethod::GetMergePut => { @@ -165,13 +172,16 @@ pub fn write_into_lmdb_database( Ok(()) } -pub fn sorter_into_lmdb_database( +pub fn sorter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, - sorter: Sorter, - merge: MergeFn, + sorter: Sorter>, + merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> +) -> Result<()> +where + Error: From, + Error: From> { debug!("Writing MTBL sorter..."); let before = Instant::now(); @@ -188,21 +198,21 @@ pub fn sorter_into_lmdb_database( Ok(()) } -fn merger_iter_into_lmdb_database( +fn merger_iter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, - mut sorter: MergerIter, - merge: MergeFn, + mut sorter: MergerIter>, + merge: MergeFn, method: WriteMethod, -) -> anyhow::Result<()> +) -> Result<()> +where + Error: From, { match method { WriteMethod::Append => { let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; while let Some((k, v)) = sorter.next()? { - out_iter.append(k, v).with_context(|| { - format!("writing {:?} into LMDB", k.as_bstr()) - })?; + out_iter.append(k, v)?; } }, WriteMethod::GetMergePut => { @@ -211,7 +221,10 @@ fn merger_iter_into_lmdb_database( match iter.next().transpose()? { Some((key, old_val)) if key == k => { let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).expect("merge failed"); + let val = merge(k, &vals).map_err(|_| { + // TODO just wrap this error? 
+ InternalError::IndexingMergingKeys { process: "get-put-merge" } + })?; iter.put_current(k, &val)?; }, _ => { @@ -318,7 +331,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.autogenerate_docids = false; } - pub fn execute(self, reader: R, progress_callback: F) -> anyhow::Result + pub fn execute(self, reader: R, progress_callback: F) -> Result where R: io::Read, F: Fn(UpdateIndexingStep, u64) + Sync, @@ -365,7 +378,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { Ok(DocumentAdditionResult { nb_documents }) } - pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()> + pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync { @@ -403,15 +416,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { debug!("{} documents actually deleted", deleted_documents_count); } - let mmap; - let bytes = if documents_count == 0 { - &[][..] - } else { - mmap = unsafe { Mmap::map(&documents_file).context("mmaping the transform documents file")? }; - &mmap - }; + if documents_count == 0 { + return Ok(()); + } - let documents = grenad::Reader::new(bytes).unwrap(); + let bytes = unsafe { Mmap::map(&documents_file)? }; + let documents = grenad::Reader::new(bytes.as_bytes()).unwrap(); // The enum which indicates the type of the readers // merges that are potentially done on different threads. @@ -477,7 +487,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { &progress_callback, ) }) - .collect::, _>>()?; + .collect::, _>>()?; let mut main_readers = Vec::with_capacity(readers.len()); let mut word_docids_readers = Vec::with_capacity(readers.len()); @@ -535,7 +545,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { debug!("Merging the main, word docids and words pairs proximity docids in parallel..."); rayon::spawn(move || { vec![ - (DatabaseType::Main, main_readers, fst_merge as MergeFn), + (DatabaseType::Main, main_readers, fst_merge as MergeFn<_>), (DatabaseType::WordDocids, word_docids_readers, roaring_bitmap_merge), ( DatabaseType::FacetLevel0NumbersDocids, @@ -570,7 +580,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { facet_field_strings_docids_readers, field_id_docid_facet_numbers_readers, field_id_docid_facet_strings_readers, - )) as anyhow::Result<_> + )) as Result<_> })?; let ( diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 4662cd609..e5e55682e 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -6,7 +6,6 @@ use std::iter::FromIterator; use std::time::Instant; use std::{cmp, iter}; -use anyhow::Context; use bstr::ByteSlice as _; use fst::Set; use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; @@ -19,11 +18,12 @@ use roaring::RoaringBitmap; use serde_json::Value; use tempfile::tempfile; +use crate::error::{Error, InternalError, SerializationError}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId}; +use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, 
cbo_roaring_bitmap_merge}; @@ -66,15 +66,15 @@ pub struct Store<'s, A> { chunk_compression_level: Option, chunk_fusing_shrink_size: Option, // MTBL sorters - main_sorter: Sorter, - word_docids_sorter: Sorter, - words_pairs_proximities_docids_sorter: Sorter, - word_level_position_docids_sorter: Sorter, - field_id_word_count_docids_sorter: Sorter, - facet_field_numbers_docids_sorter: Sorter, - facet_field_strings_docids_sorter: Sorter, - field_id_docid_facet_numbers_sorter: Sorter, - field_id_docid_facet_strings_sorter: Sorter, + main_sorter: Sorter>, + word_docids_sorter: Sorter>, + words_pairs_proximities_docids_sorter: Sorter>, + word_level_position_docids_sorter: Sorter>, + field_id_word_count_docids_sorter: Sorter>, + facet_field_numbers_docids_sorter: Sorter>, + facet_field_strings_docids_sorter: Sorter>, + field_id_docid_facet_numbers_sorter: Sorter>, + field_id_docid_facet_strings_sorter: Sorter>, // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, @@ -93,7 +93,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { chunk_compression_level: Option, chunk_fusing_shrink_size: Option, stop_words: Option<&'s Set
>, - ) -> anyhow::Result + ) -> Result { // We divide the max memory by the number of sorter the Store have. let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); @@ -221,7 +221,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } // Save the documents ids under the position and word we have seen it. - fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> { + fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.word_docids.get_refresh(word.as_bytes()) { Some(old) => { old.insert(id); }, @@ -246,7 +246,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { field_id: FieldId, value: OrderedFloat, id: DocumentId, - ) -> anyhow::Result<()> + ) -> Result<()> { let sorter = &mut self.field_id_docid_facet_numbers_sorter; Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; @@ -279,7 +279,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { field_id: FieldId, value: String, id: DocumentId, - ) -> anyhow::Result<()> + ) -> Result<()> { let sorter = &mut self.field_id_docid_facet_strings_sorter; Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; @@ -311,7 +311,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { &mut self, words_pairs_proximities: impl IntoIterator, id: DocumentId, - ) -> anyhow::Result<()> + ) -> Result<()> { for ((w1, w2), prox) in words_pairs_proximities { let w1 = SmallVec32::from(w1.as_bytes()); @@ -350,7 +350,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { facet_numbers_values: &mut HashMap>, facet_strings_values: &mut HashMap>, record: &[u8], - ) -> anyhow::Result<()> + ) -> Result<()> { // We compute the list of words pairs proximities (self-join) and write it directly to disk. let words_pair_proximities = compute_words_pair_proximities(&words_positions); @@ -385,10 +385,12 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_words_pairs_proximities( - sorter: &mut Sorter, + fn write_words_pairs_proximities( + sorter: &mut Sorter>, iter: impl IntoIterator, SmallVec32, u8), RoaringBitmap)>, - ) -> anyhow::Result<()> + ) -> Result<()> + where + Error: From, { let mut key = Vec::new(); let mut buffer = Vec::new(); @@ -417,7 +419,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { writer: &mut Writer, id: DocumentId, words_positions: &HashMap>, - ) -> anyhow::Result<()> + ) -> Result<()> { // We prefix the words by the document id. 
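
All of the `Sorter<MergeFn<Error>>` fields rewritten above depend on the merge-function alias becoming generic over its error type, so a grenad sorter can carry either the crate error or a library error. A rough sketch of that alias and a merge function behind it; the alias shape follows the diff, the surrounding types are invented for the example:

use std::borrow::Cow;

// The generalized alias: callers pick the error the merge can fail with.
pub type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Vec<u8>, E>;

#[derive(Debug, PartialEq)]
pub enum Error {
    MergeFailed(&'static str),
}

// A merge strategy: keep the first value seen for a key.
fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>, Error> {
    values
        .first()
        .map(|value| value.to_vec())
        .ok_or(Error::MergeFailed("no value to merge"))
}

fn main() {
    // A sorter parameterized as `Sorter<MergeFn<Error>>` would store this.
    let merge: MergeFn<Error> = keep_first;
    let values = [Cow::Borrowed(&b"first"[..]), Cow::Borrowed(&b"second"[..])];
    assert_eq!(merge(b"some-key", &values).unwrap(), b"first");
}
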
let mut key = id.to_be_bytes().to_vec(); @@ -445,11 +447,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_word_position_docids( - writer: &mut Sorter, + fn write_word_position_docids( + writer: &mut Sorter>, document_id: DocumentId, words_positions: &HashMap>, - ) -> anyhow::Result<()> + ) -> Result<()> + where + Error: From, { let mut key_buffer = Vec::new(); let mut data_buffer = Vec::new(); @@ -480,11 +484,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_string_docids( - sorter: &mut Sorter, + fn write_facet_field_string_docids( + sorter: &mut Sorter>, iter: I, - ) -> anyhow::Result<()> - where I: IntoIterator + ) -> Result<()> + where + I: IntoIterator, + Error: From, { let mut key_buffer = Vec::new(); let mut data_buffer = Vec::new(); @@ -504,11 +510,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_number_docids( - sorter: &mut Sorter, + fn write_facet_field_number_docids( + sorter: &mut Sorter>, iter: I, - ) -> anyhow::Result<()> - where I: IntoIterator), RoaringBitmap)> + ) -> Result<()> + where + I: IntoIterator), RoaringBitmap)>, + Error: From, { let mut data_buffer = Vec::new(); @@ -517,7 +525,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) .map(Cow::into_owned) - .context("could not serialize facet level value key")?; + .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); @@ -529,16 +537,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_field_id_docid_facet_number_value( - sorter: &mut Sorter, + fn write_field_id_docid_facet_number_value( + sorter: &mut Sorter>, field_id: FieldId, document_id: DocumentId, value: OrderedFloat, - ) -> anyhow::Result<()> + ) -> Result<()> + where + Error: From, { let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) .map(Cow::into_owned) - .context("could not serialize facet level value key")?; + .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; if lmdb_key_valid_size(&key) { sorter.insert(&key, &[])?; @@ -547,12 +557,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_field_id_docid_facet_string_value( - sorter: &mut Sorter, + fn write_field_id_docid_facet_string_value( + sorter: &mut Sorter>, field_id: FieldId, document_id: DocumentId, value: &str, - ) -> anyhow::Result<()> + ) -> Result<()> + where + Error: From, { let mut buffer = Vec::new(); @@ -565,8 +577,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_word_docids(sorter: &mut Sorter, iter: I) -> anyhow::Result<()> - where I: IntoIterator, RoaringBitmap)> + fn write_word_docids(sorter: &mut Sorter>, iter: I) -> Result<()> + where + I: IntoIterator, RoaringBitmap)>, + Error: From, { let mut key = Vec::new(); let mut buffer = Vec::new(); @@ -596,7 +610,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { num_threads: usize, log_every_n: Option, mut progress_callback: F, - ) -> anyhow::Result + ) -> Result where F: FnMut(UpdateIndexingStep), { debug!("{:?}: Indexing in a Store...", thread_index); @@ -625,7 +639,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for (attr, content) in document.iter() { if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { - let value = serde_json::from_slice(content)?; + let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; let (facet_numbers, facet_strings) = 
extract_facet_values(&value); facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); @@ -679,7 +693,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(readers) } - fn finish(mut self) -> anyhow::Result { + fn finish(mut self) -> Result { let comp_type = self.chunk_compression_type; let comp_level = self.chunk_compression_level; let shrink_size = self.chunk_fusing_shrink_size; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 5fbd24bb1..82003eddc 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -2,17 +2,19 @@ use std::borrow::Cow; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::iter::Peekable; +use std::result::Result as StdResult; use std::time::Instant; -use anyhow::{anyhow, Context}; use grenad::CompressionType; use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; +use crate::error::{Error, UserError, InternalError}; use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; +use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; +use crate::{Index, Result}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; @@ -53,7 +55,7 @@ fn is_primary_key(field: impl AsRef) -> bool { } impl Transform<'_, '_> { - pub fn output_from_json(self, reader: R, progress_callback: F) -> anyhow::Result + pub fn output_from_json(self, reader: R, progress_callback: F) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -61,7 +63,7 @@ impl Transform<'_, '_> { self.output_from_generic_json(reader, false, progress_callback) } - pub fn output_from_json_stream(self, reader: R, progress_callback: F) -> anyhow::Result + pub fn output_from_json_stream(self, reader: R, progress_callback: F) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -74,7 +76,7 @@ impl Transform<'_, '_> { reader: R, is_stream: bool, progress_callback: F, - ) -> anyhow::Result + ) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -88,7 +90,7 @@ impl Transform<'_, '_> { let iter = Box::new(iter) as Box>; iter.peekable() } else { - let vec: Vec<_> = serde_json::from_reader(reader)?; + let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?; let iter = vec.into_iter().map(Ok); let iter = Box::new(iter) as Box>; iter.peekable() @@ -96,9 +98,12 @@ impl Transform<'_, '_> { // We extract the primary key from the first document in // the batch if it hasn't already been defined in the index - let first = match documents.peek().map(Result::as_ref).transpose() { + let first = match documents.peek().map(StdResult::as_ref).transpose() { Ok(first) => first, - Err(_) => return Err(documents.next().unwrap().unwrap_err().into()), + Err(_) => { + let error = documents.next().unwrap().unwrap_err(); + return Err(UserError::SerdeJson(error).into()); + }, }; let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); @@ -145,7 +150,7 @@ impl Transform<'_, '_> { let mut documents_count = 0; for result in documents { - let document = result?; + let document = result.map_err(UserError::SerdeJson)?; if self.log_every_n.map_or(false, |len| documents_count % len == 0) { 
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { @@ -158,7 +163,7 @@ impl Transform<'_, '_> { // We prepare the fields ids map with the documents keys. for (key, _value) in &document { - fields_ids_map.insert(&key).context("field id limit reached")?; + fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; } // We retrieve the user id from the document based on the primary key name, @@ -167,11 +172,13 @@ impl Transform<'_, '_> { Some(value) => match value { Value::String(string) => Cow::Borrowed(string.as_str()), Value::Number(number) => Cow::Owned(number.to_string()), - _ => return Err(anyhow!("documents ids must be either strings or numbers")), + content => return Err(UserError::InvalidDocumentId { + document_id: content.clone(), + }.into()), }, None => { if !self.autogenerate_docids { - return Err(anyhow!("missing primary key")); + return Err(UserError::MissingPrimaryKey.into()); } let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); Cow::Borrowed(uuid) @@ -186,13 +193,15 @@ impl Transform<'_, '_> { // and this should be the document id we return the one we generated. if let Some(value) = document.get(name) { // We serialize the attribute values. - serde_json::to_writer(&mut json_buffer, value)?; + serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?; writer.insert(field_id, &json_buffer)?; } // We validate the document id [a-zA-Z0-9\-_]. if field_id == primary_key_id && validate_document_id(&external_id).is_none() { - return Err(anyhow!("invalid document id: {:?}", external_id)); + return Err(UserError::InvalidDocumentId { + document_id: Value::from(external_id), + }.into()); } } @@ -217,7 +226,7 @@ impl Transform<'_, '_> { ) } - pub fn output_from_csv(self, reader: R, progress_callback: F) -> anyhow::Result + pub fn output_from_csv(self, reader: R, progress_callback: F) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -226,12 +235,12 @@ impl Transform<'_, '_> { let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let mut csv = csv::Reader::from_reader(reader); - let headers = csv.headers()?; + let headers = csv.headers().map_err(UserError::Csv)?; let mut fields_ids = Vec::new(); // Generate the new fields ids based on the current fields ids and this CSV headers. for (i, header) in headers.iter().enumerate() { - let id = fields_ids_map.insert(header).context("field id limit reached)")?; + let id = fields_ids_map.insert(header).ok_or(UserError::AttributeLimitReached)?; fields_ids.push((id, i)); } @@ -281,7 +290,7 @@ impl Transform<'_, '_> { let mut documents_count = 0; let mut record = csv::StringRecord::new(); - while csv.read_record(&mut record)? { + while csv.read_record(&mut record).map_err(UserError::Csv)? { obkv_buffer.clear(); let mut writer = obkv::KvWriter::new(&mut obkv_buffer); @@ -298,7 +307,9 @@ impl Transform<'_, '_> { // We validate the document id [a-zA-Z0-9\-_]. match validate_document_id(&external_id) { Some(valid) => valid, - None => return Err(anyhow!("invalid document id: {:?}", external_id)), + None => return Err(UserError::InvalidDocumentId { + document_id: Value::from(external_id), + }.into()), } }, None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), @@ -316,7 +327,7 @@ impl Transform<'_, '_> { for (field_id, field) in iter { // We serialize the attribute values as JSON strings. 
json_buffer.clear(); - serde_json::to_writer(&mut json_buffer, &field)?; + serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?; writer.insert(*field_id, &json_buffer)?; } @@ -344,17 +355,18 @@ impl Transform<'_, '_> { /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. - fn output_from_sorter( + fn output_from_sorter( self, - sorter: grenad::Sorter, + sorter: grenad::Sorter>, primary_key: String, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, mut external_documents_ids: ExternalDocumentsIds<'_>, progress_callback: F, - ) -> anyhow::Result + ) -> Result where F: Fn(UpdateIndexingStep) + Sync, + Error: From, { let documents_ids = self.index.documents_ids(self.rtxn)?; let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; @@ -362,7 +374,7 @@ impl Transform<'_, '_> { // Once we have sort and deduplicated the documents we write them into a final file. let mut final_sorter = create_sorter( - |_docid, _obkvs| Err(anyhow!("cannot merge two documents")), + |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "merging documents" }), self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, @@ -398,7 +410,10 @@ impl Transform<'_, '_> { IndexDocumentsMethod::UpdateDocuments => { let key = BEU32::new(docid); let base_obkv = self.index.documents.get(&self.rtxn, &key)? - .context("document not found")?; + .ok_or(InternalError::DatabaseMissingEntry { + db_name: "documents", + key: None, + })?; let update_obkv = obkv::KvReader::new(update_obkv); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); (docid, obkv_buffer.as_slice()) @@ -409,7 +424,7 @@ impl Transform<'_, '_> { // If this user id is new we add it to the external documents ids map // for new ids and into the list of new documents. let new_docid = available_documents_ids.next() - .context("no more available documents ids")?; + .ok_or(UserError::DocumentLimitReached)?; new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; new_documents_ids.insert(new_docid); (new_docid, update_obkv) @@ -469,7 +484,7 @@ impl Transform<'_, '_> { primary_key: String, old_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, - ) -> anyhow::Result + ) -> Result { let fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; @@ -529,10 +544,10 @@ fn compute_primary_key_pair( fields_ids_map: &mut FieldsIdsMap, alternative_name: Option, autogenerate_docids: bool, -) -> anyhow::Result<(FieldId, String)> { +) -> Result<(FieldId, String)> { match primary_key { Some(primary_key) => { - let id = fields_ids_map.insert(primary_key).ok_or(anyhow!("Maximum number of fields exceeded"))?; + let id = fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; Ok((id, primary_key.to_string())) } None => { @@ -542,12 +557,12 @@ fn compute_primary_key_pair( if !autogenerate_docids { // If there is no primary key in the current document batch, we must // return an error and not automatically generate any document id. 
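
Both the JSON and CSV paths above funnel rejected identifiers into `UserError::InvalidDocumentId` after the `validate_document_id` helper they call has vetted the string against `[a-zA-Z0-9\-_]`. A minimal stand-in with that behaviour; the real helper may differ in details:

fn validate_document_id(document_id: &str) -> Option<&str> {
    if !document_id.is_empty()
        && document_id
            .chars()
            .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
    {
        Some(document_id)
    } else {
        None
    }
}

// validate_document_id("invoice_42-a") == Some("invoice_42-a")
// validate_document_id("invoice 42")   == None (spaces are rejected)
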
- anyhow::bail!("missing primary key") + return Err(UserError::MissingPrimaryKey.into()); } DEFAULT_PRIMARY_KEY_NAME.to_string() }, }; - let id = fields_ids_map.insert(&name).context("field id limit reached")?; + let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; Ok((id, name)) }, } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1c687e089..1756a21c9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeSet, HashMap, HashSet}; +use std::result::Result as StdResult; -use anyhow::Context; use chrono::Utc; use grenad::CompressionType; use itertools::Itertools; @@ -9,9 +9,10 @@ use rayon::ThreadPool; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::criterion::Criterion; +use crate::error::UserError; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; -use crate::{FieldsIdsMap, Index}; +use crate::{FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] pub enum Setting { @@ -33,7 +34,7 @@ impl Setting { } impl Serialize for Setting { - fn serialize(&self, serializer: S) -> Result where S: Serializer { + fn serialize(&self, serializer: S) -> StdResult where S: Serializer { match self { Self::Set(value) => Some(value), // Usually not_set isn't serialized by setting skip_serializing_if field attribute @@ -43,7 +44,7 @@ impl Serialize for Setting { } impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting { - fn deserialize(deserializer: D) -> Result where D: Deserializer<'de> { + fn deserialize(deserializer: D) -> StdResult where D: Deserializer<'de> { Deserialize::deserialize(deserializer).map(|x| match x { Some(x) => Self::Set(x), None => Self::Reset, // Reset is forced by sending null value @@ -165,7 +166,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync { @@ -192,7 +193,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }; // There already has been a document addition, the primary key should be set by now. - let primary_key = self.index.primary_key(&self.wtxn)?.context("Index must have a primary key")?; + let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; // We remap the documents fields based on the new `FieldsIdsMap`. 
let output = transform.remap_index_documents( @@ -220,7 +221,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - fn update_displayed(&mut self) -> anyhow::Result { + fn update_displayed(&mut self) -> Result { match self.displayed_fields { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -234,7 +235,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { for name in names.iter() { fields_ids_map .insert(name) - .context("field id limit exceeded")?; + .ok_or(UserError::AttributeLimitReached)?; } self.index.put_displayed_fields(self.wtxn, &names)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -245,13 +246,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } - fn update_distinct_field(&mut self) -> anyhow::Result { + fn update_distinct_field(&mut self) -> Result { match self.distinct_field { Setting::Set(ref attr) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; fields_ids_map .insert(attr) - .context("field id limit exceeded")?; + .ok_or(UserError::AttributeLimitReached)?; self.index.put_distinct_field(self.wtxn, &attr)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -264,7 +265,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { /// Updates the index's searchable attributes. This causes the field map to be recomputed to /// reflect the order of the searchable attributes. - fn update_searchable(&mut self) -> anyhow::Result { + fn update_searchable(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { // every time the searchable attributes are updated, we need to update the @@ -285,13 +286,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { for name in names.iter() { new_fields_ids_map .insert(&name) - .context("field id limit exceeded")?; + .ok_or(UserError::AttributeLimitReached)?; } for (_, name) in old_fields_ids_map.iter() { new_fields_ids_map .insert(&name) - .context("field id limit exceeded")?; + .ok_or(UserError::AttributeLimitReached)?; } self.index.put_searchable_fields(self.wtxn, &names)?; @@ -303,7 +304,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } - fn update_stop_words(&mut self) -> anyhow::Result { + fn update_stop_words(&mut self) -> Result { match self.stop_words { Setting::Set(ref stop_words) => { let current = self.index.stop_words(self.wtxn)?; @@ -325,7 +326,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - fn update_synonyms(&mut self) -> anyhow::Result { + fn update_synonyms(&mut self) -> Result { match self.synonyms { Setting::Set(ref synonyms) => { fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec { @@ -383,13 +384,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - fn update_filterable(&mut self) -> anyhow::Result<()> { + fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_facets = HashSet::new(); for name in fields { - fields_ids_map.insert(name).context("field id limit exceeded")?; + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; new_facets.insert(name.clone()); } self.index.put_filterable_fields(self.wtxn, &new_facets)?; @@ -401,7 +402,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - fn update_criteria(&mut self) -> anyhow::Result<()> { + fn update_criteria(&mut self) -> Result<()> { match self.criteria { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; 
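
The `Setting` changes at the top of this settings.rs diff encode a tri-state update: a value means "set", an explicit JSON `null` means "reset to default", and an absent field means "leave untouched". A self-contained sketch of that round-trip; the `Deserialize` impl mirrors the one in the hunk above, while the container struct and its field are invented for the example:

use serde::Deserialize;

#[derive(Debug, Clone, PartialEq)]
enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

impl<T> Default for Setting<T> {
    fn default() -> Self {
        Setting::NotSet
    }
}

impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // A JSON `null` deserializes to `None`, which forces a reset.
        Deserialize::deserialize(deserializer).map(|x| match x {
            Some(x) => Self::Set(x),
            None => Self::Reset,
        })
    }
}

#[derive(Debug, Deserialize)]
struct SettingsUpdate {
    #[serde(default)] // an absent field falls back to `NotSet`
    distinct_field: Setting<String>,
}

fn main() {
    let set: SettingsUpdate = serde_json::from_str(r#"{ "distinct_field": "sku" }"#).unwrap();
    let reset: SettingsUpdate = serde_json::from_str(r#"{ "distinct_field": null }"#).unwrap();
    let untouched: SettingsUpdate = serde_json::from_str("{}").unwrap();

    assert_eq!(set.distinct_field, Setting::Set("sku".to_string()));
    assert_eq!(reset.distinct_field, Setting::Reset);
    assert_eq!(untouched.distinct_field, Setting::NotSet);
}
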
@@ -409,7 +410,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { for name in fields { let criterion: Criterion = name.parse()?; if let Some(name) = criterion.field_name() { - fields_ids_map.insert(name).context("field id limit exceeded")?; + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } new_criteria.push(criterion); } @@ -422,7 +423,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - pub fn execute(mut self, progress_callback: F) -> anyhow::Result<()> + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync { diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 8d6eb034d..1d0e776b1 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -1,7 +1,7 @@ use grenad::CompressionType; use rayon::ThreadPool; -use crate::Index; +use crate::{Index, Result}; use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { @@ -76,7 +76,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> anyhow::Result> + ) -> Result> { DeleteDocuments::new(wtxn, index, self.update_id) } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 0544f8789..a2197b28c 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -5,6 +5,7 @@ use fst::Streamer; use grenad::CompressionType; use heed::types::ByteSlice; +use crate::Result; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{ create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, @@ -33,7 +34,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { // Clear the word prefix docids database. 
self.index.word_prefix_docids.clear(self.wtxn)?; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index c6b935e54..9019b26e5 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -7,7 +7,7 @@ use heed::BytesEncode; use heed::types::ByteSlice; use log::debug; -use crate::Index; +use crate::{Index, Result}; use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ WriteMethod, create_sorter, sorter_into_lmdb_database, @@ -41,7 +41,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index f94507aab..e2e3f7b4c 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -11,7 +11,9 @@ use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; +use crate::error::InternalError; use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; +use crate::Result; use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{ create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, @@ -56,7 +58,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); let entries = compute_positions_levels( @@ -78,7 +80,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_level_position_docids.as_polymorph(), entries, - |_, _| anyhow::bail!("invalid word level position merging"), + |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" }), WriteMethod::Append, )?; @@ -142,7 +144,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_level_position_docids.as_polymorph(), entries, - |_, _| anyhow::bail!("invalid word prefix level position merging"), + |_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }), WriteMethod::Append, )?; @@ -174,7 +176,7 @@ fn compute_positions_levels( shrink_size: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, -) -> anyhow::Result> +) -> Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transfering them. 
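
The two trailing comments above describe a real LMDB constraint: within one transaction you cannot iterate a database with a cursor while also writing to it, which is why `compute_positions_levels` stages its entries in a grenad file first. The sketch below mimics that two-phase shape, with plain std types standing in for the LMDB database and the temporary file:

use std::collections::BTreeMap;

/// Derives level-1 entries by grouping level-0 entries four by four.
/// Keys are `(level, position)`, values are document counts.
fn rebuild_levels(db: &mut BTreeMap<(u8, u32), u64>) {
    // Phase 1: read with an iterator (the stand-in for an LMDB cursor)
    // and buffer the computed entries instead of writing them back.
    let buffered: Vec<((u8, u32), u64)> = db
        .iter()
        .filter(|((level, _), _)| *level == 0)
        .map(|(&(_, position), &count)| ((1u8, position / 4), count))
        .collect(); // the iterator ends here, releasing the "cursor"

    // Phase 2: no cursor is alive anymore, so writing back is safe.
    for (key, count) in buffered {
        *db.entry(key).or_insert(0) += count;
    }
}

In the patched code the buffer is a grenad `Writer` that is later drained through `write_into_lmdb_database` with `WriteMethod::Append`; only the cursor/write separation matters for this sketch.
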
@@ -251,7 +253,7 @@ fn write_level_entry( left: u32, right: u32, ids: &RoaringBitmap, -) -> anyhow::Result<()> +) -> Result<()> { let key = (word, level, left, right); let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index f53b0ee00..d1aa267b8 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -2,7 +2,7 @@ use std::iter::FromIterator; use std::str; use fst::Streamer; -use crate::{Index, SmallString32}; +use crate::{Index, SmallString32, Result}; pub struct WordsPrefixesFst<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -48,7 +48,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { self } - pub fn execute(self) -> anyhow::Result<()> { + pub fn execute(self) -> Result<()> { let words_fst = self.index.words_fst(&self.wtxn)?; let number_of_words = words_fst.len(); let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; From 78fe4259a900afc2fdac78e5ae1068c28cd40d6b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 14 Jun 2021 18:06:23 +0200 Subject: [PATCH 0795/1889] Fix the http-ui crate --- http-ui/src/main.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 1f91e6370..e23dddd4c 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -391,7 +391,7 @@ async fn main() -> anyhow::Result<()> { match result { Ok(_) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) + Err(e) => Err(e.into()), } } UpdateMeta::ClearDocuments => { @@ -401,7 +401,7 @@ async fn main() -> anyhow::Result<()> { match builder.execute() { Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) + Err(e) => Err(e.into()), } } UpdateMeta::Settings(settings) => { @@ -471,7 +471,7 @@ async fn main() -> anyhow::Result<()> { match result { Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) + Err(e) => Err(e.into()), } } UpdateMeta::Facets(levels) => { @@ -486,7 +486,7 @@ async fn main() -> anyhow::Result<()> { } match builder.execute() { Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e) + Err(e) => Err(e.into()), } } }; From 28c004aa2cd8cb16610aa322e449955c5cf523ce Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Jun 2021 11:06:42 +0200 Subject: [PATCH 0796/1889] Prefer using constant for the database names --- infos/src/main.rs | 100 ++++---- milli/src/index.rs | 224 +++++++++++------- milli/src/update/delete_documents.rs | 6 +- milli/src/update/index_documents/store.rs | 2 +- milli/src/update/index_documents/transform.rs | 3 +- 5 files changed, 183 insertions(+), 152 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index d6aa1f854..b0c304de0 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -5,55 +5,41 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; -use milli::facet::FacetType; -use milli::{Index, TreeLevel}; use structopt::StructOpt; +use milli::facet::FacetType; +use milli::index::db_name::*; +use milli::{Index, TreeLevel}; + use Command::*; #[cfg(target_os = "linux")] #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; -const MAIN_DB_NAME: &str = "main"; -const WORD_DOCIDS_DB_NAME: &str = "word-docids"; -const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; -const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; -const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = 
"word-pair-proximity-docids"; -const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; -const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; -const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; -const FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME: &str = "field-id-word-count-docids"; -const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids"; -const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids"; -const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s"; -const FIELD_ID_DOCID_FACET_STRINGS_DB_NAME: &str = "field-id-docid-facet-strings"; - -const DOCUMENTS_DB_NAME: &str = "documents"; - const ALL_DATABASE_NAMES: &[&str] = &[ - MAIN_DB_NAME, - WORD_DOCIDS_DB_NAME, - WORD_PREFIX_DOCIDS_DB_NAME, - DOCID_WORD_POSITIONS_DB_NAME, - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, - WORD_LEVEL_POSITION_DOCIDS_DB_NAME, - WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, - FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME, - FACET_ID_F64_DOCIDS_DB_NAME, - FACET_ID_STRING_DOCIDS_DB_NAME, - FIELD_ID_DOCID_FACET_F64S_DB_NAME, - FIELD_ID_DOCID_FACET_STRINGS_DB_NAME, - DOCUMENTS_DB_NAME, + MAIN, + WORD_DOCIDS, + WORD_PREFIX_DOCIDS, + DOCID_WORD_POSITIONS, + WORD_PAIR_PROXIMITY_DOCIDS, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS, + WORD_LEVEL_POSITION_DOCIDS, + WORD_PREFIX_LEVEL_POSITION_DOCIDS, + FIELD_ID_WORD_COUNT_DOCIDS, + FACET_ID_F64_DOCIDS, + FACET_ID_STRING_DOCIDS, + FIELD_ID_DOCID_FACET_F64S, + FIELD_ID_DOCID_FACET_STRINGS, + DOCUMENTS, ]; const POSTINGS_DATABASE_NAMES: &[&str] = &[ - WORD_DOCIDS_DB_NAME, - WORD_PREFIX_DOCIDS_DB_NAME, - DOCID_WORD_POSITIONS_DB_NAME, - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, + WORD_DOCIDS, + WORD_PREFIX_DOCIDS, + DOCID_WORD_POSITIONS, + WORD_PAIR_PROXIMITY_DOCIDS, + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS, ]; #[derive(Debug, StructOpt)] @@ -944,21 +930,21 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a for name in names { let database = match name.as_str() { - MAIN_DB_NAME => &main, - WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(), - WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(), - DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), - WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), - WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), - FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => field_id_word_count_docids.as_polymorph(), - FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(), - FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(), - FIELD_ID_DOCID_FACET_STRINGS_DB_NAME => field_id_docid_facet_strings.as_polymorph(), + MAIN => &main, + WORD_PREFIX_DOCIDS => word_prefix_docids.as_polymorph(), + WORD_DOCIDS => word_docids.as_polymorph(), + DOCID_WORD_POSITIONS => docid_word_positions.as_polymorph(), + WORD_PAIR_PROXIMITY_DOCIDS => word_pair_proximity_docids.as_polymorph(), + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => word_prefix_pair_proximity_docids.as_polymorph(), + WORD_LEVEL_POSITION_DOCIDS => word_level_position_docids.as_polymorph(), + WORD_PREFIX_LEVEL_POSITION_DOCIDS => 
word_prefix_level_position_docids.as_polymorph(), + FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(), + FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(), + FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), + FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), - DOCUMENTS_DB_NAME => documents.as_polymorph(), + DOCUMENTS => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; @@ -1039,27 +1025,27 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu } match name { - WORD_DOCIDS_DB_NAME => { + WORD_DOCIDS => { let db = index.word_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, - WORD_PREFIX_DOCIDS_DB_NAME => { + WORD_PREFIX_DOCIDS => { let db = index.word_prefix_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, - DOCID_WORD_POSITIONS_DB_NAME => { + DOCID_WORD_POSITIONS => { let db = index.docid_word_positions.as_polymorph(); compute_stats::(*db, rtxn, name) }, - WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => { + WORD_PAIR_PROXIMITY_DOCIDS => { let db = index.word_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => { + WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => { let db = index.word_prefix_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, - FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => { + FIELD_ID_WORD_COUNT_DOCIDS => { let db = index.field_id_word_count_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, diff --git a/milli/src/index.rs b/milli/src/index.rs index 9ebe34a2e..f3411564b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -21,25 +21,44 @@ use crate::heed_codec::facet::{ }; use crate::fields_ids_map::FieldsIdsMap; -pub const CRITERIA_KEY: &str = "criteria"; -pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; -pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; -pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; -pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; -pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; -pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; -pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; -pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; -pub const PRIMARY_KEY_KEY: &str = "primary-key"; -pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; -pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; -pub const STOP_WORDS_KEY: &str = "stop-words"; -pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; -pub const SYNONYMS_KEY: &str = "synonyms"; -pub const WORDS_FST_KEY: &str = "words-fst"; -pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; -const CREATED_AT_KEY: &str = "created-at"; -const UPDATED_AT_KEY: &str = "updated-at"; +pub mod main_key { + pub const CRITERIA_KEY: &str = "criteria"; + pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; + pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; + pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; + pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; + pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; + pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; + pub const 
NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; + pub const PRIMARY_KEY_KEY: &str = "primary-key"; + pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; + pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; + pub const STOP_WORDS_KEY: &str = "stop-words"; + pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; + pub const SYNONYMS_KEY: &str = "synonyms"; + pub const WORDS_FST_KEY: &str = "words-fst"; + pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; + pub const CREATED_AT_KEY: &str = "created-at"; + pub const UPDATED_AT_KEY: &str = "updated-at"; +} + +pub mod db_name { + pub const MAIN: &str = "main"; + pub const WORD_DOCIDS: &str = "word-docids"; + pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; + pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; + pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; + pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; + pub const WORD_LEVEL_POSITION_DOCIDS: &str = "word-level-position-docids"; + pub const WORD_PREFIX_LEVEL_POSITION_DOCIDS: &str = "word-prefix-level-position-docids"; + pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; + pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; + pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; + pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; + pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; + pub const DOCUMENTS: &str = "documents"; +} #[derive(Clone)] pub struct Index { @@ -85,23 +104,25 @@ pub struct Index { impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { + use db_name::*; + options.max_dbs(14); let env = options.open(path)?; - let main = env.create_poly_database(Some("main"))?; - let word_docids = env.create_database(Some("word-docids"))?; - let word_prefix_docids = env.create_database(Some("word-prefix-docids"))?; - let docid_word_positions = env.create_database(Some("docid-word-positions"))?; - let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; - let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; - let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; - let field_id_word_count_docids = env.create_database(Some("field-id-word-count-docids"))?; - let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; - let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?; - let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?; - let field_id_docid_facet_f64s = env.create_database(Some("field-id-docid-facet-f64s"))?; - let field_id_docid_facet_strings = env.create_database(Some("field-id-docid-facet-strings"))?; - let documents = env.create_database(Some("documents"))?; + let main = env.create_poly_database(Some(MAIN))?; + let word_docids = env.create_database(Some(WORD_DOCIDS))?; + let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; + let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; + let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; + let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let word_level_position_docids = 
env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; + let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; + let word_prefix_level_position_docids = env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; + let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; + let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; + let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; + let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; + let documents = env.create_database(Some(DOCUMENTS))?; Index::initialize_creation_dates(&env, main)?; @@ -127,10 +148,10 @@ impl Index { fn initialize_creation_dates(env: &heed::Env, main: PolyDatabase) -> heed::Result<()> { let mut txn = env.write_txn()?; // The db was just created, we update its metadata with the relevant information. - if main.get::<_, Str, SerdeJson>>(&txn, CREATED_AT_KEY)?.is_none() { + if main.get::<_, Str, SerdeJson>>(&txn, main_key::CREATED_AT_KEY)?.is_none() { let now = Utc::now(); - main.put::<_, Str, SerdeJson>>(&mut txn, UPDATED_AT_KEY, &now)?; - main.put::<_, Str, SerdeJson>>(&mut txn, CREATED_AT_KEY, &now)?; + main.put::<_, Str, SerdeJson>>(&mut txn, main_key::UPDATED_AT_KEY, &now)?; + main.put::<_, Str, SerdeJson>>(&mut txn, main_key::CREATED_AT_KEY, &now)?; txn.commit()?; } Ok(()) @@ -164,17 +185,17 @@ impl Index { /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. pub fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { - self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, DOCUMENTS_IDS_KEY, docids) + self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) } /// Returns the internal documents ids. pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?.unwrap_or_default()) } /// Returns the number of documents indexed in the database. pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result { - let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, DOCUMENTS_IDS_KEY)?; + let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; Ok(count.unwrap_or_default()) } @@ -183,17 +204,17 @@ impl Index { /// Writes the documents primary key, this is the field name that is used to store the id. pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { self.set_updated_at(wtxn, &Utc::now())?; - self.main.put::<_, Str, Str>(wtxn, PRIMARY_KEY_KEY, &primary_key) + self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key) } /// Deletes the primary key of the documents, this can be done to reset indexes settings. pub fn delete_primary_key(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, PRIMARY_KEY_KEY) + self.main.delete::<_, Str>(wtxn, main_key::PRIMARY_KEY_KEY) } /// Returns the documents primary key, `None` if it hasn't been defined. 
pub fn primary_key<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, PRIMARY_KEY_KEY) + self.main.get::<_, Str, Str>(rtxn, main_key::PRIMARY_KEY_KEY) } /* external documents ids */ @@ -208,16 +229,16 @@ impl Index { let ExternalDocumentsIds { hard, soft } = external_documents_ids; let hard = hard.as_fst().as_bytes(); let soft = soft.as_fst().as_bytes(); - self.main.put::<_, Str, ByteSlice>(wtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; - self.main.put::<_, Str, ByteSlice>(wtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; Ok(()) } /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { - let hard = self.main.get::<_, Str, ByteSlice>(rtxn, HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let soft = self.main.get::<_, Str, ByteSlice>(rtxn, SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let hard = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let soft = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; let hard = match hard { Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, None => fst::Map::default().map_data(Cow::Owned)?, @@ -234,13 +255,16 @@ impl Index { /// Writes the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. pub fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, FIELDS_IDS_MAP_KEY, map) + self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) } /// Returns the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, SerdeJson>( + rtxn, + main_key::FIELDS_IDS_MAP_KEY, + )?.unwrap_or_default()) } /* fields distribution */ @@ -248,13 +272,16 @@ impl Index { /// Writes the fields distribution which associates every field name with /// the number of times it occurs in the documents. pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, FIELDS_DISTRIBUTION_KEY, distribution) + self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) } /// Returns the fields distribution which associates every field name with /// the number of times it occurs in the documents. pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, SerdeJson>( + rtxn, + main_key::FIELDS_DISTRIBUTION_KEY, + )?.unwrap_or_default()) } /* displayed fields */ @@ -262,19 +289,19 @@ impl Index { /// Writes the fields that must be displayed in the defined order. /// There must be not be any duplicate field id. 
pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, DISPLAYED_FIELDS_KEY, &fields) + self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) } /// Deletes the displayed fields ids, this will make the engine to display /// all the documents attributes in the order of the `FieldsIdsMap`. pub fn delete_displayed_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, DISPLAYED_FIELDS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::DISPLAYED_FIELDS_KEY) } /// Returns the displayed fields in the order they were set by the user. If it returns /// `None` it means that all the attributes are set as displayed in the order of the `FieldsIdsMap`. pub fn displayed_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { - self.main.get::<_, Str, SerdeBincode>>(rtxn, DISPLAYED_FIELDS_KEY) + self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::DISPLAYED_FIELDS_KEY) } pub fn displayed_fields_ids(&self, rtxn: &RoTxn) -> heed::Result>> { @@ -291,18 +318,18 @@ impl Index { /// Writes the searchable fields, when this list is specified, only these are indexed. pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, SEARCHABLE_FIELDS_KEY, &fields) + self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) } /// Deletes the searchable fields, when no fields are specified, all fields are indexed. pub fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, SEARCHABLE_FIELDS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::SEARCHABLE_FIELDS_KEY) } /// Returns the searchable fields, those are the fields that are indexed, /// if the searchable fields aren't there it means that **all** the fields are indexed. pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { - self.main.get::<_, Str, SerdeBincode>>(rtxn, SEARCHABLE_FIELDS_KEY) + self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::SEARCHABLE_FIELDS_KEY) } /// Identical to `searchable_fields`, but returns the ids instead. @@ -328,17 +355,20 @@ impl Index { /// Writes the filterable fields names in the database. pub fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, FILTERABLE_FIELDS_KEY, fields) + self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) } /// Deletes the filterable fields ids in the database. pub fn delete_filterable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, FILTERABLE_FIELDS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::FILTERABLE_FIELDS_KEY) } /// Returns the filterable fields names. pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result> { - Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FILTERABLE_FIELDS_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, SerdeJson<_>>( + rtxn, + main_key::FILTERABLE_FIELDS_KEY, + )?.unwrap_or_default()) } /// Same as `filterable_fields`, but returns ids instead. 
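
The hunks above replace bare string literals with constants from the new `main_key` and `db_name` modules, so every reader and writer of the polymorphic main database names its keys through a single definition. A minimal standalone sketch of that pattern follows; the `HashMap` standing in for the LMDB main database and the `primary_key` helper are illustrative assumptions, not the crate's actual API:

    use std::collections::HashMap;

    // Single source of truth for the keys stored in the "main" database.
    pub mod main_key {
        pub const PRIMARY_KEY_KEY: &str = "primary-key";
        pub const CREATED_AT_KEY: &str = "created-at";
    }

    // A HashMap stands in for the LMDB polymorphic main database here.
    fn primary_key(main: &HashMap<&'static str, String>) -> Option<&String> {
        // A typo in the constant name is a compile error, whereas a typo
        // in a bare "primary-key" literal silently reads a different key.
        main.get(main_key::PRIMARY_KEY_KEY)
    }

    fn main() {
        let mut db: HashMap<&'static str, String> = HashMap::new();
        db.insert(main_key::PRIMARY_KEY_KEY, "id".to_string());
        db.insert(main_key::CREATED_AT_KEY, "2021-06-15T11:06:42Z".to_string());
        assert_eq!(primary_key(&db).map(String::as_str), Some("id"));
    }

The same reasoning drives the `db_name` module: the `infos` crate now imports `milli::index::db_name::*` instead of redeclaring each database name string, so the two crates cannot drift apart.
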
@@ -409,9 +439,9 @@ impl Index { docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -423,9 +453,9 @@ impl Index { field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), @@ -441,9 +471,9 @@ impl Index { docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -455,9 +485,9 @@ impl Index { field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? 
{ Some(docids) => Ok(docids), @@ -468,29 +498,29 @@ impl Index { /* distinct field */ pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> { - self.main.put::<_, Str, Str>(wtxn, DISTINCT_FIELD_KEY, distinct_field) + self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) } pub fn distinct_field<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, DISTINCT_FIELD_KEY) + self.main.get::<_, Str, Str>(rtxn, main_key::DISTINCT_FIELD_KEY) } pub(crate) fn delete_distinct_field(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, DISTINCT_FIELD_KEY) + self.main.delete::<_, Str>(wtxn, main_key::DISTINCT_FIELD_KEY) } /* criteria */ pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, CRITERIA_KEY, &criteria) + self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) } pub fn delete_criteria(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, CRITERIA_KEY) + self.main.delete::<_, Str>(wtxn, main_key::CRITERIA_KEY) } pub fn criteria(&self, rtxn: &RoTxn) -> heed::Result> { - match self.main.get::<_, Str, SerdeJson>>(rtxn, CRITERIA_KEY)? { + match self.main.get::<_, Str, SerdeJson>>(rtxn, main_key::CRITERIA_KEY)? { Some(criteria) => Ok(criteria), None => Ok(default_criteria()), } @@ -500,12 +530,12 @@ impl Index { /// Writes the FST which is the words dictionary of the engine. pub fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes()) + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) } /// Returns the FST which is the words dictionary of the engine. pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::WORDS_FST_KEY)? { Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), None => Ok(fst::Set::default().map_data(Cow::Owned)?), } @@ -514,15 +544,15 @@ impl Index { /* stop words */ pub fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, STOP_WORDS_KEY, fst.as_fst().as_bytes()) + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) } pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, STOP_WORDS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::STOP_WORDS_KEY) } pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, STOP_WORDS_KEY)? { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::STOP_WORDS_KEY)? 
{ Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), None => Ok(None), } @@ -530,19 +560,29 @@ impl Index { /* synonyms */ - pub fn put_synonyms(&self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<_>>(wtxn, SYNONYMS_KEY, synonyms) + pub fn put_synonyms( + &self, + wtxn: &mut RwTxn, + synonyms: &HashMap, Vec>>, + ) -> heed::Result<()> + { + self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) } pub fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, SYNONYMS_KEY) + self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY) } pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result, Vec>>> { - Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, SYNONYMS_KEY)?.unwrap_or_default()) + Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?.unwrap_or_default()) } - pub fn words_synonyms>(&self, rtxn: &RoTxn, words: &[S]) -> heed::Result>>> { + pub fn words_synonyms>( + &self, + rtxn: &RoTxn, + words: &[S], + ) -> heed::Result>>> + { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms(rtxn)?.remove(&words)) } @@ -551,12 +591,12 @@ impl Index { /// Writes the FST which is the words prefixes dictionnary of the engine. pub fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) } /// Returns the FST which is the words prefixes dictionnary of the engine. pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_PREFIXES_FST_KEY)? { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), None => Ok(fst::Set::default().map_data(Cow::Owned)?), } @@ -613,7 +653,7 @@ impl Index { /// Returns the index creation time. pub fn created_at(&self, rtxn: &RoTxn) -> heed::Result> { let time = self.main - .get::<_, Str, SerdeJson>>(rtxn, CREATED_AT_KEY)? + .get::<_, Str, SerdeJson>>(rtxn, main_key::CREATED_AT_KEY)? .expect("Index without creation time"); Ok(time) } @@ -621,13 +661,13 @@ impl Index { /// Returns the index last updated time. pub fn updated_at(&self, rtxn: &RoTxn) -> heed::Result> { let time = self.main - .get::<_, Str, SerdeJson>>(rtxn, UPDATED_AT_KEY)? + .get::<_, Str, SerdeJson>>(rtxn, main_key::UPDATED_AT_KEY)? 
.expect("Index without update time"); Ok(time) } pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time) + self.main.put::<_, Str, SerdeJson>>(wtxn, main_key::UPDATED_AT_KEY, &time) } } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 6792d6278..ceba7bf01 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -9,6 +9,7 @@ use serde_json::Value; use crate::error::{InternalError, UserError}; use crate::heed_codec::CboRoaringBitmapCodec; +use crate::index::{db_name, main_key}; use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result}; use super::ClearDocuments; @@ -78,7 +79,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| { - InternalError::DatabaseMissingEntry { db_name: "main", key: Some("primary-key") } + InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + } })?; let id_field = fields_ids_map.id(primary_key).expect(r#"the field "id" to be present"#); diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index e5e55682e..94ae12108 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -32,7 +32,7 @@ const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_KILOBYTE: usize = 1024 * 1024; const MAX_POSITION: usize = 1000; -const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); +const WORDS_FST_KEY: &[u8] = crate::index::main_key::WORDS_FST_KEY.as_bytes(); pub struct Readers { pub main: Reader, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 82003eddc..c44130d7e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -11,6 +11,7 @@ use roaring::RoaringBitmap; use serde_json::{Map, Value}; use crate::error::{Error, UserError, InternalError}; +use crate::index::db_name; use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; @@ -411,7 +412,7 @@ impl Transform<'_, '_> { let key = BEU32::new(docid); let base_obkv = self.index.documents.get(&self.rtxn, &key)? 
.ok_or(InternalError::DatabaseMissingEntry { - db_name: "documents", + db_name: db_name::DOCUMENTS, key: None, })?; let update_obkv = obkv::KvReader::new(update_obkv); From f0e804afd5687b2c074a83eb73c68d6eb674cd43 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Jun 2021 11:10:50 +0200 Subject: [PATCH 0797/1889] Rename the FieldIdMapMissingEntry from_db_name field into process --- milli/src/error.rs | 12 ++++++------ milli/src/lib.rs | 2 +- milli/src/search/criteria/asc_desc.rs | 2 +- milli/src/search/facet/facet_distribution.rs | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 096851f09..5a8dfc90b 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -43,8 +43,8 @@ pub enum SerializationError { #[derive(Debug)] pub enum FieldIdMapMissingEntry { - FieldId { field_id: FieldId, from_db_name: &'static str }, - FieldName { field_name: String, from_db_name: &'static str }, + FieldId { field_id: FieldId, process: &'static str }, + FieldName { field_name: String, process: &'static str }, } #[derive(Debug)] @@ -224,11 +224,11 @@ impl StdError for UserError {} impl fmt::Display for FieldIdMapMissingEntry { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::FieldId { field_id, from_db_name } => { - write!(f, "unknown field id {} coming from {} database", field_id, from_db_name) + Self::FieldId { field_id, process } => { + write!(f, "unknown field id {} coming from the {} process", field_id, process) }, - Self::FieldName { field_name, from_db_name } => { - write!(f, "unknown field name {} coming from {} database", field_name, from_db_name) + Self::FieldName { field_name, process } => { + write!(f, "unknown field name {} coming from the {} process", field_name, process) }, } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6fa88ad64..f37244114 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -62,7 +62,7 @@ pub fn obkv_to_json( .map(|(id, value)| { let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId { field_id: id, - from_db_name: "documents", + process: "obkv_to_json", })?; let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?; Ok((name.to_owned(), value)) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index c72781629..95f77fd78 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -61,7 +61,7 @@ impl<'t> AscDesc<'t> { .id(&field_name) .ok_or_else(|| FieldIdMapMissingEntry::FieldName { field_name: field_name.clone(), - from_db_name: "asc-desc", + process: "AscDesc::new", })?; Ok(AscDesc { diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 917314b25..265a8ffeb 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -203,7 +203,7 @@ impl<'a> FacetDistribution<'a> { for name in filterable_fields { let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { field_name: name.clone(), - from_db_name: "filterable-fields", + process: "FacetDistribution::execute", })?; let values = self.facet_values(fid)?; distribution.insert(name, values); From a7d6930905d423d7a007abb610315f0e7ec65965 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Jun 2021 11:51:32 +0200 Subject: [PATCH 0798/1889] Replace the panicking expect by tracked Errors --- milli/src/index.rs | 137 ++++++++++++-------- 
milli/src/search/criteria/mod.rs | 6 +- milli/src/search/distinct/facet_distinct.rs | 14 +- milli/src/search/mod.rs | 9 +- milli/src/update/delete_documents.rs | 9 +- 5 files changed, 109 insertions(+), 66 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f3411564b..02a1f9d58 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -7,7 +7,7 @@ use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use heed::types::*; use roaring::RoaringBitmap; -use crate::error::UserError; +use crate::error::{UserError, FieldIdMapMissingEntry, InternalError}; use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search}; use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result}; use crate::{ @@ -304,14 +304,25 @@ impl Index { self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::DISPLAYED_FIELDS_KEY) } - pub fn displayed_fields_ids(&self, rtxn: &RoTxn) -> heed::Result>> { - let fields_ids_map = self.fields_ids_map(rtxn)?; - let ids = self.displayed_fields(rtxn)? - .map(|fields| fields - .into_iter() - .map(|name| fields_ids_map.id(name).expect("Field not found")) - .collect::>()); - Ok(ids) + /// Identical to `displayed_fields`, but returns the ids instead. + pub fn displayed_fields_ids(&self, rtxn: &RoTxn) -> Result>> { + match self.displayed_fields(rtxn)? { + Some(fields) => { + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields.into_iter() { + match fields_ids_map.id(name) { + Some(field_id) => fields_ids.push(field_id), + None => return Err(FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "Index::displayed_fields_ids", + }.into()), + } + } + Ok(Some(fields_ids)) + }, + None => Ok(None), + } } /* searchable fields */ @@ -333,20 +344,22 @@ impl Index { } /// Identical to `searchable_fields`, but returns the ids instead. - pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> heed::Result>> { + pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result>> { match self.searchable_fields(rtxn)? { - Some(names) => { - let fields_map = self.fields_ids_map(rtxn)?; - let mut ids = Vec::new(); - for name in names { - let id = fields_map - .id(name) - .ok_or_else(|| format!("field id map must contain {:?}", name)) - .expect("corrupted data: "); - ids.push(id); + Some(fields) => { + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields { + match fields_ids_map.id(name) { + Some(field_id) => fields_ids.push(field_id), + None => return Err(FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "Index::searchable_fields_ids", + }.into()), + } } - Ok(Some(ids)) - } + Ok(Some(fields_ids)) + }, None => Ok(None), } } @@ -371,21 +384,25 @@ impl Index { )?.unwrap_or_default()) } - /// Same as `filterable_fields`, but returns ids instead. - pub fn filterable_fields_ids(&self, rtxn: &RoTxn) -> heed::Result> { - let filterable_fields = self.filterable_fields(rtxn)?; + /// Identical to `filterable_fields`, but returns ids instead. 
+ pub fn filterable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.filterable_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; - let filterable_fields = filterable_fields - .iter() - .map(|k| { - fields_ids_map - .id(k) - .ok_or_else(|| format!("{:?} should be present in the field id map", k)) - .expect("corrupted data: ") - }) - .collect(); - Ok(filterable_fields) + let mut fields_ids = HashSet::new(); + for name in fields { + match fields_ids_map.id(&name) { + Some(field_id) => { + fields_ids.insert(field_id); + }, + None => return Err(FieldIdMapMissingEntry::FieldName { + field_name: name, + process: "Index::filterable_fields_ids", + }.into()), + } + } + + Ok(fields_ids) } /* faceted documents ids */ @@ -393,7 +410,7 @@ impl Index { /// Returns the faceted fields names. /// /// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields. - pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { + pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result> { let filterable_fields = self.filterable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; let asc_desc_fields = self.criteria(rtxn)? @@ -412,21 +429,25 @@ impl Index { Ok(faceted_fields) } - /// Same as `faceted_fields`, but returns ids instead. - pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result> { - let faceted_fields = self.faceted_fields(rtxn)?; + /// Identical to `faceted_fields`, but returns ids instead. + pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.faceted_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; - let faceted_fields = faceted_fields - .iter() - .map(|k| { - fields_ids_map - .id(k) - .ok_or_else(|| format!("{:?} should be present in the field id map", k)) - .expect("corrupted data: ") - }) - .collect(); - Ok(faceted_fields) + let mut fields_ids = HashSet::new(); + for name in fields.into_iter() { + match fields_ids_map.id(&name) { + Some(field_id) => { + fields_ids.insert(field_id); + }, + None => return Err(FieldIdMapMissingEntry::FieldName { + field_name: name, + process: "Index::faceted_fields_ids", + }.into()), + } + } + + Ok(fields_ids) } /* faceted documents ids */ @@ -651,19 +672,23 @@ impl Index { } /// Returns the index creation time. - pub fn created_at(&self, rtxn: &RoTxn) -> heed::Result> { - let time = self.main + pub fn created_at(&self, rtxn: &RoTxn) -> Result> { + Ok(self.main .get::<_, Str, SerdeJson>>(rtxn, main_key::CREATED_AT_KEY)? - .expect("Index without creation time"); - Ok(time) + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::CREATED_AT_KEY), + })?) } /// Returns the index last updated time. - pub fn updated_at(&self, rtxn: &RoTxn) -> heed::Result> { - let time = self.main + pub fn updated_at(&self, rtxn: &RoTxn) -> Result> { + Ok(self.main .get::<_, Str, SerdeJson>>(rtxn, main_key::UPDATED_AT_KEY)? - .expect("Index without update time"); - Ok(time) + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::UPDATED_AT_KEY), + })?) 
} pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime) -> heed::Result<()> { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 981fc3ef2..48af0b8aa 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -76,7 +76,7 @@ pub trait Context<'c> { fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; fn synonyms(&self, word: &str) -> heed::Result>>>; - fn searchable_fields_ids(&self) -> heed::Result>; + fn searchable_fields_ids(&self) -> Result>; fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result>; fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result>; } @@ -174,7 +174,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.words_synonyms(self.rtxn, &[word]) } - fn searchable_fields_ids(&self) -> heed::Result> { + fn searchable_fields_ids(&self) -> Result> { match self.index.searchable_fields_ids(self.rtxn)? { Some(searchable_fields_ids) => Ok(searchable_fields_ids), None => Ok(self.index.fields_ids_map(self.rtxn)?.ids().collect()), @@ -478,7 +478,7 @@ pub mod test { todo!() } - fn searchable_fields_ids(&self) -> heed::Result> { + fn searchable_fields_ids(&self) -> Result> { todo!() } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index f86d6b8ed..b9ffd9d90 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -3,9 +3,11 @@ use std::mem::size_of; use heed::types::ByteSlice; use roaring::RoaringBitmap; -use super::{Distinct, DocIter}; +use crate::error::InternalError; use crate::heed_codec::facet::*; +use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; +use super::{Distinct, DocIter}; const FID_SIZE: usize = size_of::(); const DOCID_SIZE: usize = size_of::(); @@ -64,7 +66,10 @@ impl<'a> FacetDistinctIter<'a> { let ((_, _, value), _) = item?; let facet_docids = self .facet_string_docids(value)? - .expect("Corrupted data: Facet values must exist"); + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::FACET_ID_STRING_DOCIDS, + key: None, + })?; self.excluded.union_with(&facet_docids); } @@ -80,7 +85,10 @@ impl<'a> FacetDistinctIter<'a> { let ((_, _, value), _) = item?; let facet_docids = self .facet_number_docids(value)? - .expect("Corrupted data: Facet values must exist"); + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::FACET_ID_F64_DOCIDS, + key: None, + })?; self.excluded.union_with(&facet_docids); } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f8c7b5d9b..9deb541e3 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -13,7 +13,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; +use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{Index, DocumentId, Result}; @@ -22,6 +22,8 @@ pub use self::matching_words::MatchingWords; pub(crate) use self::facet::ParserRule; use self::query_tree::QueryTreeBuilder; +use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; + // Building these factories is not free. 
static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); @@ -142,7 +144,10 @@ impl<'a> Search<'a> { None => self.perform_sort(NoopDistinct, matching_words, criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; - let id = field_ids_map.id(name).expect("distinct not present in field map"); + let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "fetching distint attribute", + })?; let distinct = FacetDistinct::new(id, self.index, self.rtxn); self.perform_sort(distinct, matching_words, criteria) } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index ceba7bf01..7fc7e5d77 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -7,7 +7,7 @@ use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; use serde_json::Value; -use crate::error::{InternalError, UserError}; +use crate::error::{InternalError, FieldIdMapMissingEntry, UserError}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result}; @@ -84,7 +84,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { key: Some(main_key::PRIMARY_KEY_KEY), } })?; - let id_field = fields_ids_map.id(primary_key).expect(r#"the field "id" to be present"#); + let id_field = fields_ids_map.id(primary_key).ok_or_else(|| { + FieldIdMapMissingEntry::FieldName { + field_name: primary_key.to_string(), + process: "DeleteDocuments::execute", + } + })?; let Index { env: _env, From 713acc408be998c4d6aa33e5c2a2114bfbc90514 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Jun 2021 13:45:20 +0200 Subject: [PATCH 0799/1889] Introduce the primary key to the Settings builder structure --- benchmarks/benches/utils.rs | 8 +- milli/src/error.rs | 8 ++ milli/src/index.rs | 30 +++---- milli/src/update/index_documents/transform.rs | 2 +- milli/src/update/settings.rs | 89 ++++++++++++++++++- milli/tests/search/query_criteria.rs | 7 +- 6 files changed, 121 insertions(+), 23 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 5138de4d2..d5181849f 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -55,15 +55,15 @@ pub fn base_setup(conf: &Conf) -> Index { options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(10); let index = Index::new(options, conf.database_name).unwrap(); - if let Some(primary_key) = conf.primary_key { - let mut wtxn = index.write_txn().unwrap(); - index.put_primary_key(&mut wtxn, primary_key).unwrap(); - } let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); + if let Some(primary_key) = conf.primary_key { + builder.set_primary_key(primary_key.to_string()); + } + if let Some(criterion) = conf.criterion { builder.reset_filterable_fields(); builder.reset_criteria(); diff --git a/milli/src/error.rs b/milli/src/error.rs index 5a8dfc90b..19f9c364c 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -60,6 +60,8 @@ pub enum UserError { MissingDocumentId { document: Object }, MissingPrimaryKey, NoSpaceLeftOnDevice, + PrimaryKeyCannotBeChanged, + PrimaryKeyCannotBeReset, SerdeJson(serde_json::Error), UnknownInternalDocumentId { document_id: DocumentId }, } @@ -211,6 +213,12 @@ impl fmt::Display for UserError { 
// TODO where can we find it instead of writing the text ourselves? Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), + Self::PrimaryKeyCannotBeChanged => { + f.write_str("primary key cannot be changed if the database contains documents") + }, + Self::PrimaryKeyCannotBeReset => { + f.write_str("primary key cannot be reset if the database contains documents") + }, Self::SerdeJson(error) => error.fmt(f), Self::UnknownInternalDocumentId { document_id } => { write!(f, "an unknown internal document id have been used ({})", document_id) diff --git a/milli/src/index.rs b/milli/src/index.rs index 02a1f9d58..bf4b3e023 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -184,7 +184,7 @@ impl Index { /* documents ids */ /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. - pub fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { + pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) } @@ -202,7 +202,7 @@ impl Index { /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. - pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { + pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { self.set_updated_at(wtxn, &Utc::now())?; self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key) } @@ -220,7 +220,7 @@ impl Index { /* external documents ids */ /// Writes the external documents ids and internal ids (i.e. `u32`). - pub fn put_external_documents_ids<'a>( + pub(crate) fn put_external_documents_ids<'a>( &self, wtxn: &mut RwTxn, external_documents_ids: &ExternalDocumentsIds<'a>, @@ -254,7 +254,7 @@ impl Index { /// Writes the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. - pub fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { + pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) } @@ -271,7 +271,7 @@ impl Index { /// Writes the fields distribution which associates every field name with /// the number of times it occurs in the documents. - pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { + pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) } @@ -288,7 +288,7 @@ impl Index { /// Writes the fields that must be displayed in the defined order. /// There must be not be any duplicate field id. - pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { + pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) } @@ -328,7 +328,7 @@ impl Index { /* searchable fields */ /// Writes the searchable fields, when this list is specified, only these are indexed. 
- pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { + pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) } @@ -367,7 +367,7 @@ impl Index { /* filterable fields */ /// Writes the filterable fields names in the database. - pub fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { + pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) } @@ -453,7 +453,7 @@ impl Index { /* faceted documents ids */ /// Writes the documents ids that are faceted with numbers under this field id. - pub fn put_number_faceted_documents_ids( + pub(crate) fn put_number_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, @@ -485,7 +485,7 @@ impl Index { } /// Writes the documents ids that are faceted with strings under this field id. - pub fn put_string_faceted_documents_ids( + pub(crate) fn put_string_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, @@ -532,7 +532,7 @@ impl Index { /* criteria */ - pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { + pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) } @@ -550,7 +550,7 @@ impl Index { /* words fst */ /// Writes the FST which is the words dictionary of the engine. - pub fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) } @@ -564,7 +564,7 @@ impl Index { /* stop words */ - pub fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) } @@ -581,7 +581,7 @@ impl Index { /* synonyms */ - pub fn put_synonyms( + pub(crate) fn put_synonyms( &self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>, @@ -611,7 +611,7 @@ impl Index { /* words prefixes fst */ /// Writes the FST which is the words prefixes dictionnary of the engine. - pub fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c44130d7e..9e88559d0 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -375,7 +375,7 @@ impl Transform<'_, '_> { // Once we have sort and deduplicated the documents we write them into a final file. 
let mut final_sorter = create_sorter( - |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "merging documents" }), + |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "documents" }), self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1756a21c9..39cb27c00 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -72,6 +72,7 @@ pub struct Settings<'a, 't, 'u, 'i> { stop_words: Setting>, distinct_field: Setting, synonyms: Setting>>, + primary_key: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -98,6 +99,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { stop_words: Setting::NotSet, distinct_field: Setting::NotSet, synonyms: Setting::NotSet, + primary_key: Setting::NotSet, update_id, } } @@ -166,6 +168,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn reset_primary_key(&mut self) { + self.primary_key = Setting::Reset; + } + + pub fn set_primary_key(&mut self, primary_key: String) { + self.primary_key = Setting::Set(primary_key); + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync @@ -423,6 +433,31 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_primary_key(&mut self) -> Result<()> { + match self.primary_key { + Setting::Set(ref primary_key) => { + if self.index.number_of_documents(&self.wtxn)? == 0 { + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + self.index.put_primary_key(self.wtxn, primary_key)?; + Ok(()) + } else { + Err(UserError::PrimaryKeyCannotBeChanged.into()) + } + }, + Setting::Reset => { + if self.index.number_of_documents(&self.wtxn)? == 0 { + self.index.delete_primary_key(self.wtxn)?; + Ok(()) + } else { + Err(UserError::PrimaryKeyCannotBeReset.into()) + } + }, + Setting::NotSet => Ok(()), + } + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync @@ -436,6 +471,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_filterable()?; self.update_distinct_field()?; self.update_criteria()?; + self.update_primary_key()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, @@ -462,8 +498,9 @@ mod tests { use maplit::{btreeset, hashmap, hashset}; use big_s::S; - use crate::{Criterion, FilterCondition, SearchResult}; + use crate::error::Error; use crate::update::{IndexDocuments, UpdateFormat}; + use crate::{Criterion, FilterCondition, SearchResult}; use super::*; @@ -977,4 +1014,54 @@ mod tests { let rtxn = index.read_txn().unwrap(); FilterCondition::from_str(&rtxn, &index, "toto = 32").unwrap_err(); } + + #[test] + fn setting_primary_key() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the primary key settings + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("mykey")); + + builder.execute(|_, _| ()).unwrap(); + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); + + // Then index some documents with the "mykey" primary key. 
+ let content = &br#"[ + { "mykey": 1, "name": "kevin", "age": 23 }, + { "mykey": 2, "name": "kevina", "age": 21 }, + { "mykey": 3, "name": "benoit", "age": 34 }, + { "mykey": 4, "name": "bernard", "age": 34 }, + { "mykey": 5, "name": "bertrand", "age": 34 }, + { "mykey": 6, "name": "bernie", "age": 34 }, + { "mykey": 7, "name": "ben", "age": 34 } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); + builder.disable_autogenerate_docids(); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // We now try to reset the primary key + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.reset_primary_key(); + + let err = builder.execute(|_, _| ()).unwrap_err(); + assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset))); + + // But if we clear the database... + let mut wtxn = index.write_txn().unwrap(); + let builder = ClearDocuments::new(&mut wtxn, &index, 0); + builder.execute().unwrap(); + + // ...we can change the primary key + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("myid")); + builder.execute(|_, _| ()).unwrap(); + } } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f0eecfaba..2b9c5ae5e 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,5 +1,6 @@ -use milli::{Search, SearchResult, Criterion}; use big_s::S; +use milli::update::Settings; +use milli::{Search, SearchResult, Criterion}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; use Criterion::*; @@ -189,7 +190,9 @@ fn criteria_mixup() { eprintln!("Testing with criteria order: {:?}", &criteria); //update criteria let mut wtxn = index.write_txn().unwrap(); - index.put_criteria(&mut wtxn, &criteria).unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_criteria(criteria.iter().map(ToString::to_string).collect()); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); let mut rtxn = index.read_txn().unwrap(); From 4eda438f6f22b113666210e3ce3797c2df61041b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Jun 2021 14:34:04 +0200 Subject: [PATCH 0800/1889] Add a new Error for when a user use a non-filtered attribute in a filter --- milli/src/error.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/milli/src/error.rs b/milli/src/error.rs index 19f9c364c..a9c2add24 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -56,6 +56,7 @@ pub enum UserError { FilterParsing(pest::error::Error), InvalidCriterionName { name: String }, InvalidDocumentId { document_id: Value }, + InvalidFilterAttribute(pest::error::Error), InvalidStoreFile, MissingDocumentId { document: Object }, MissingPrimaryKey, @@ -204,6 +205,7 @@ impl fmt::Display for UserError { let json = serde_json::to_string(document_id).unwrap(); write!(f, "document identifier is invalid {}", json) }, + Self::InvalidFilterAttribute(error) => error.fmt(f), Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); write!(f, "document doesn't have an identifier {}", json) From 8cfe3e1ec0656d5b4d1f5dfd410d06b05e334e51 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Jun 2021 17:20:33 +0200 Subject: [PATCH 0801/1889] Rename DatabaseSizeReached into MaxDatabaseSizeReached --- milli/src/error.rs | 6 +++--- milli/src/search/facet/filter_condition.rs | 12 ++++++------ 2 files changed, 9 insertions(+), 9 
deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index a9c2add24..294b2aa57 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -51,7 +51,7 @@ pub enum FieldIdMapMissingEntry { pub enum UserError { AttributeLimitReached, Csv(csv::Error), - DatabaseSizeReached, + MaxDatabaseSizeReached, DocumentLimitReached, FilterParsing(pest::error::Error), InvalidCriterionName { name: String }, @@ -113,7 +113,7 @@ impl From for Error { match error { HeedError::Io(error) => Error::from(error), - HeedError::Mdb(MdbError::MapFull) => UserError(DatabaseSizeReached), + HeedError::Mdb(MdbError::MapFull) => UserError(MaxDatabaseSizeReached), HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile), HeedError::Mdb(error) => InternalError(Store(error)), HeedError::Encoding => InternalError(Serialization(Encoding { db_name: None })), @@ -211,7 +211,7 @@ impl fmt::Display for UserError { write!(f, "document doesn't have an identifier {}", json) }, Self::MissingPrimaryKey => f.write_str("missing primary key"), - Self::DatabaseSizeReached => f.write_str("database size reached"), + Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), // TODO where can we find it instead of writing the text ourselves? Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 98d638574..6d99bb977 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -164,7 +164,7 @@ impl FilterCondition { { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::FilterParsing)?; + .map_err(UserError::InvalidFilterAttribute)?; let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); @@ -183,7 +183,7 @@ impl FilterCondition { { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::FilterParsing)?; + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); @@ -200,7 +200,7 @@ impl FilterCondition { { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::FilterParsing)?; + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -217,7 +217,7 @@ impl FilterCondition { { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::FilterParsing)?; + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -234,7 +234,7 @@ impl FilterCondition { { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::FilterParsing)?; + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -251,7 +251,7 @@ impl FilterCondition { { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::FilterParsing)?; + .map_err(UserError::InvalidFilterAttribute)?; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); From 
adf0c389c5c284b12f3fae9e19870650f1b99b7a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Jun 2021 17:22:04 +0200 Subject: [PATCH 0802/1889] Rename FilterParsing into InvalidFilter --- milli/src/error.rs | 4 ++-- milli/src/search/facet/filter_condition.rs | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 294b2aa57..78a1b1c59 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -53,7 +53,7 @@ pub enum UserError { Csv(csv::Error), MaxDatabaseSizeReached, DocumentLimitReached, - FilterParsing(pest::error::Error), + InvalidFilter(pest::error::Error), InvalidCriterionName { name: String }, InvalidDocumentId { document_id: Value }, InvalidFilterAttribute(pest::error::Error), @@ -199,7 +199,7 @@ impl fmt::Display for UserError { Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::Csv(error) => error.fmt(f), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), - Self::FilterParsing(error) => error.fmt(f), + Self::InvalidFilter(error) => error.fmt(f), Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 6d99bb977..424118f77 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -110,7 +110,7 @@ impl FilterCondition { { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields_ids(rtxn)?; - let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::FilterParsing)?; + let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } @@ -169,8 +169,8 @@ impl FilterCondition { let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); - let lvalue = lresult.map_err(UserError::FilterParsing)?; - let rvalue = rresult.map_err(UserError::FilterParsing)?; + let lvalue = lresult.map_err(UserError::InvalidFilter)?; + let rvalue = rresult.map_err(UserError::InvalidFilter)?; Ok(Operator(fid, Between(lvalue, rvalue))) } @@ -204,7 +204,7 @@ impl FilterCondition { let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::FilterParsing)?; + let value = result.map_err(UserError::InvalidFilter)?; Ok(Operator(fid, GreaterThan(value))) } @@ -221,7 +221,7 @@ impl FilterCondition { let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::FilterParsing)?; + let value = result.map_err(UserError::InvalidFilter)?; Ok(Operator(fid, GreaterThanOrEqual(value))) } @@ -238,7 +238,7 @@ impl FilterCondition { let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::FilterParsing)?; + let value = result.map_err(UserError::InvalidFilter)?; Ok(Operator(fid, LowerThan(value))) } @@ -255,7 +255,7 @@ impl FilterCondition { let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::FilterParsing)?; + let value = result.map_err(UserError::InvalidFilter)?; Ok(Operator(fid, LowerThanOrEqual(value))) } From 7ac441e4739b62e8430dc513d8e77e023c1decf9 Mon Sep 17 00:00:00 
2001 From: Kerollmops Date: Tue, 15 Jun 2021 17:26:08 +0200 Subject: [PATCH 0803/1889] Fix small typos --- milli/src/search/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 9deb541e3..3c85796bc 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -146,7 +146,7 @@ impl<'a> Search<'a> { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { field_name: name.to_string(), - process: "fetching distint attribute", + process: "distinct attribute", })?; let distinct = FacetDistinct::new(id, self.index, self.rtxn); self.perform_sort(distinct, matching_words, criteria) From ce0315a10f2052fd3c30cd89b89131836912f33d Mon Sep 17 00:00:00 2001 From: many Date: Tue, 15 Jun 2021 17:49:15 +0200 Subject: [PATCH 0804/1889] Close write transaction in test --- milli/src/update/settings.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 39cb27c00..8f4fe48c9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1053,6 +1053,7 @@ mod tests { let err = builder.execute(|_, _| ()).unwrap_err(); assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset))); + wtxn.abort().unwrap(); // But if we clear the database... let mut wtxn = index.write_txn().unwrap(); @@ -1063,5 +1064,6 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_primary_key(S("myid")); builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); } } From f5ff3e8e1953a44cd0d768942a7783f4f8a9caf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 16 Jun 2021 14:01:05 +0200 Subject: [PATCH 0805/1889] Update version for the next release (v0.4.0) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8e6794fb6..dae36eb0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -900,7 +900,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.3.1" +version = "0.4.0" dependencies = [ "anyhow", "byte-unit", @@ -954,7 +954,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.3.1" +version = "0.4.0" dependencies = [ "anyhow", "askama", @@ -1096,7 +1096,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.3.1" +version = "0.4.0" dependencies = [ "anyhow", "byte-unit", @@ -1375,7 +1375,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.3.1" +version = "0.4.0" dependencies = [ "big_s", "bstr", @@ -2230,7 +2230,7 @@ dependencies = [ [[package]] name = "search" -version = "0.3.1" +version = "0.4.0" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 86c027e41..4aa208d5e 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.3.1" +version = "0.4.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index a11307fbe..29c6d2b63 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.3.1" +version = "0.4.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 
41c161c07..1f1affa2f 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.3.1" +version = "0.4.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ac7a977a2..7fb14a287 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.3.1" +version = "0.4.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index dbc129bf6..ebf9b491b 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.3.1" +version = "0.4.0" authors = ["Clément Renault "] edition = "2018" From 41bdc90f4613970aeb194cb848cee3970f7b89ac Mon Sep 17 00:00:00 2001 From: Many Date: Wed, 16 Jun 2021 12:16:03 +0200 Subject: [PATCH 0806/1889] Revert "Enable optimization in every profile" --- Cargo.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c0fa64635..822907ca8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,9 +8,6 @@ opt-level = 3 [profile.release] debug = true -[profile.test] -opt-level = 3 - # Make sure that the build scripts and proc-macros are compiled with # all the optimizations. It speeds up the zip crate that we use in the build.rs. [profile.dev.build-override] From 9716fb3b361eb76ece836c19fec9589abb650427 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 16 Jun 2021 18:33:33 +0200 Subject: [PATCH 0807/1889] format the whole project --- .rustfmt.toml | 5 + README.md | 15 + benchmarks/benches/songs.rs | 58 +- benchmarks/benches/utils.rs | 6 +- benchmarks/benches/wiki.rs | 19 +- benchmarks/build.rs | 19 +- helpers/src/main.rs | 5 +- http-ui/src/main.rs | 717 ++++++++--------- http-ui/src/update_store.rs | 69 +- infos/src/main.rs | 203 +++-- milli/src/criterion.rs | 37 +- milli/src/error.rs | 33 +- milli/src/external_documents_ids.rs | 10 +- milli/src/facet/facet_type.rs | 7 +- milli/src/facet/facet_value.rs | 2 +- milli/src/facet/value_encoding.rs | 5 +- milli/src/fields_ids_map.rs | 16 +- .../facet/facet_level_value_f64_codec.rs | 3 +- .../facet/field_doc_id_facet_f64_codec.rs | 2 +- .../facet/field_doc_id_facet_string_codec.rs | 9 +- .../heed_codec/field_id_word_count_codec.rs | 3 +- milli/src/heed_codec/mod.rs | 10 +- milli/src/heed_codec/obkv_codec.rs | 1 + .../cbo_roaring_bitmap_codec.rs | 4 +- .../roaring_bitmap/roaring_bitmap_codec.rs | 1 + .../roaring_bitmap_len_codec.rs | 20 +- .../heed_codec/str_level_position_codec.rs | 4 +- milli/src/index.rs | 298 ++++--- milli/src/lib.rs | 33 +- milli/src/proximity.rs | 8 +- milli/src/search/criteria/asc_desc.rs | 87 +-- milli/src/search/criteria/attribute.rs | 422 ++++++---- milli/src/search/criteria/exactness.rs | 163 ++-- milli/src/search/criteria/final.rs | 24 +- milli/src/search/criteria/initial.rs | 11 +- milli/src/search/criteria/mod.rs | 314 +++++--- milli/src/search/criteria/proximity.rs | 283 ++++--- milli/src/search/criteria/typo.rs | 369 +++++---- milli/src/search/criteria/words.rs | 52 +- milli/src/search/distinct/facet_distinct.rs | 26 +- milli/src/search/distinct/mod.rs | 21 +- milli/src/search/distinct/noop_distinct.rs | 10 +- milli/src/search/facet/facet_distribution.rs | 71 +- milli/src/search/facet/filter_condition.rs | 279 ++++--- milli/src/search/facet/mod.rs | 57 +- milli/src/search/facet/parser.rs | 2 +- milli/src/search/matching_words.rs | 53 +- milli/src/search/mod.rs | 40 +- milli/src/search/query_tree.rs | 735 ++++++++++++------ 
milli/src/update/available_documents_ids.rs | 15 +- milli/src/update/clear_documents.rs | 7 +- milli/src/update/delete_documents.rs | 59 +- milli/src/update/facets.rs | 49 +- milli/src/update/index_documents/mod.rs | 114 ++- milli/src/update/index_documents/store.rs | 260 ++++--- milli/src/update/index_documents/transform.rs | 143 ++-- milli/src/update/mod.rs | 4 +- milli/src/update/settings.rs | 149 ++-- milli/src/update/update_builder.rs | 17 +- milli/src/update/word_prefix_docids.rs | 11 +- .../word_prefix_pair_proximity_docids.rs | 12 +- milli/src/update/words_level_positions.rs | 39 +- milli/src/update/words_prefixes_fst.rs | 7 +- milli/tests/search/mod.rs | 54 +- milli/tests/search/query_criteria.rs | 40 +- script/pre-commit | 36 + qc_loop.sh => script/qc_loop.sh | 0 search/src/main.rs | 6 +- 68 files changed, 3327 insertions(+), 2336 deletions(-) create mode 100644 .rustfmt.toml create mode 100755 script/pre-commit rename qc_loop.sh => script/qc_loop.sh (100%) diff --git a/.rustfmt.toml b/.rustfmt.toml new file mode 100644 index 000000000..250124b77 --- /dev/null +++ b/.rustfmt.toml @@ -0,0 +1,5 @@ +unstable_features = true + +use_small_heuristics = "max" +imports_granularity = "Module" +group_imports = "StdExternalCrate" diff --git a/README.md b/README.md index 13d35380a..b1498d0f5 100644 --- a/README.md +++ b/README.md @@ -41,3 +41,18 @@ the `content-type:application/json` and `content-type:application/x-ndjson` head ### Querying the engine via the website You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700). + + +## Contributing + +You can setup a `git-hook` to stop you from making a commit too fast. It'll stop you if: +- Any of the workspaces does not build +- Your code is not well-formatted + +These two things are also checked in the CI, so ignoring the hook won't help you merge your code. +But if you need to, you can still add `--no-verify` when creating your commit to ignore the hook. 
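As an aside on the `.rustfmt.toml` added above (whose rules this hook and the CI enforce): roughly speaking, `imports_granularity = "Module"` merges imports from the same module into one `use` statement, and `group_imports = "StdExternalCrate"` arranges them into three blocks, std first, then external crates, then the current crate. A minimal layout sketch, using crates already in this workspace (the exact paths are only an example):

```
use std::collections::HashMap;
use std::path::PathBuf;

use heed::EnvOpenOptions;
use serde::{Deserialize, Serialize};

use crate::update::Settings;
```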
+ +To enable the hook, run the following command from the root of the project: +``` +cp script/pre-commit .git/hooks/pre-commit +``` diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs index e5da16a99..726190f77 100644 --- a/benchmarks/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -6,33 +6,24 @@ use milli::update::Settings; use utils::Conf; fn base_conf(builder: &mut Settings) { - let displayed_fields = [ - "id", "title", "album", "artist", "genre", "country", "released", "duration", - ] - .iter() - .map(|s| s.to_string()) - .collect(); + let displayed_fields = + ["id", "title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); builder.set_displayed_fields(displayed_fields); - let searchable_fields = ["title", "album", "artist"] + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"] .iter() .map(|s| s.to_string()) .collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = [ - "released-timestamp", - "duration-float", - "genre", - "country", - "artist", - ] - .iter() - .map(|s| s.to_string()) - .collect(); builder.set_filterable_fields(faceted_fields); } +#[rustfmt::skip] const BASE_CONF: Conf = Conf { dataset: datasets_paths::SMOL_SONGS, queries: &[ @@ -53,34 +44,25 @@ const BASE_CONF: Conf = Conf { }; fn bench_songs(c: &mut criterion::Criterion) { - let default_criterion: Vec = milli::default_criteria() - .iter() - .map(|criteria| criteria.to_string()) - .collect(); + let default_criterion: Vec = + milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); let default_criterion = default_criterion.iter().map(|s| s.as_str()); - let asc_default: Vec<&str> = std::iter::once("asc(released-timestamp)") - .chain(default_criterion.clone()) - .collect(); - let desc_default: Vec<&str> = std::iter::once("desc(released-timestamp)") - .chain(default_criterion.clone()) - .collect(); + let asc_default: Vec<&str> = + std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); + let desc_default: Vec<&str> = + std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); let basic_with_quote: Vec = BASE_CONF .queries .iter() .map(|s| { - s.trim() - .split(' ') - .map(|s| format!(r#""{}""#, s)) - .collect::>() - .join(" ") + s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::>().join(" ") }) .collect(); - let basic_with_quote: &[&str] = &basic_with_quote - .iter() - .map(|s| s.as_str()) - .collect::>(); + let basic_with_quote: &[&str] = + &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + #[rustfmt::skip] let confs = &[ /* first we bench each criterion alone */ utils::Conf { diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index d5181849f..fd1df0a90 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -3,10 +3,8 @@ use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; -use milli::{ - update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}, - FilterCondition, Index, -}; +use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat}; +use milli::{FilterCondition, Index}; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory diff --git a/benchmarks/benches/wiki.rs 
b/benchmarks/benches/wiki.rs index 11ffe87d5..3d8b6f1d4 100644 --- a/benchmarks/benches/wiki.rs +++ b/benchmarks/benches/wiki.rs @@ -6,16 +6,14 @@ use milli::update::Settings; use utils::Conf; fn base_conf(builder: &mut Settings) { - let displayed_fields = ["title", "body", "url"] - .iter() - .map(|s| s.to_string()) - .collect(); + let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); builder.set_displayed_fields(displayed_fields); let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); builder.set_searchable_fields(searchable_fields); } +#[rustfmt::skip] const BASE_CONF: Conf = Conf { dataset: datasets_paths::SMOL_WIKI_ARTICLES, queries: &[ @@ -37,18 +35,13 @@ fn bench_songs(c: &mut criterion::Criterion) { .queries .iter() .map(|s| { - s.trim() - .split(' ') - .map(|s| format!(r#""{}""#, s)) - .collect::>() - .join(" ") + s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::>().join(" ") }) .collect(); - let basic_with_quote: &[&str] = &basic_with_quote - .iter() - .map(|s| s.as_str()) - .collect::>(); + let basic_with_quote: &[&str] = + &basic_with_quote.iter().map(|s| s.as_str()).collect::>(); + #[rustfmt::skip] let confs = &[ /* first we bench each criterion alone */ utils::Conf { diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 58300bab9..b1edd5499 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -1,9 +1,7 @@ +use std::fs::File; +use std::io::{Cursor, Read, Seek, Write}; use std::path::{Path, PathBuf}; use std::{env, fs}; -use std::{ - fs::File, - io::{Cursor, Read, Seek, Write}, -}; use bytes::Bytes; use convert_case::{Case, Casing}; @@ -45,7 +43,10 @@ fn main() -> anyhow::Result<()> { )?; if out_file.exists() { - eprintln!("The dataset {} already exists on the file system and will not be downloaded again", dataset); + eprintln!( + "The dataset {} already exists on the file system and will not be downloaded again", + dataset + ); continue; } let url = format!("{}/{}.csv.gz", BASE_URL, dataset); @@ -60,12 +61,8 @@ fn main() -> anyhow::Result<()> { } fn download_dataset(url: U) -> anyhow::Result> { - let bytes = reqwest::blocking::Client::builder() - .timeout(None) - .build()? - .get(url) - .send()? 
- .bytes()?; + let bytes = + reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?; Ok(Cursor::new(bytes)) } diff --git a/helpers/src/main.rs b/helpers/src/main.rs index c916d0448..b325aef89 100644 --- a/helpers/src/main.rs +++ b/helpers/src/main.rs @@ -1,9 +1,8 @@ use std::path::PathBuf; use byte_unit::Byte; -use heed::{Env, EnvOpenOptions, CompactionOption}; +use heed::{CompactionOption, Env, EnvOpenOptions}; use structopt::StructOpt; - use Command::*; #[cfg(target_os = "linux")] @@ -65,7 +64,7 @@ fn main() -> anyhow::Result<()> { use CompactionOption::*; let compaction = if enable_compaction { Enabled } else { Disabled }; copy_main_database_to_stdout(env, compaction) - }, + } } } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index e23dddd4c..703861058 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,6 +1,5 @@ mod update_store; -use std::{io, mem}; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; @@ -10,16 +9,19 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use std::time::Instant; +use std::{io, mem}; use askama_warp::Template; use byte_unit::Byte; use either::Either; use flate2::read::GzDecoder; -use futures::{FutureExt, StreamExt}; -use futures::stream; +use futures::{stream, FutureExt, StreamExt}; use grenad::CompressionType; use heed::EnvOpenOptions; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; +use milli::update::UpdateIndexingStep::*; +use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; +use milli::{obkv_to_json, FilterCondition, Index, MatchingWords, SearchResult}; use once_cell::sync::OnceCell; use rayon::ThreadPool; use serde::{Deserialize, Serialize}; @@ -28,12 +30,9 @@ use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; use tokio::sync::broadcast; -use warp::{Filter, http::Response}; use warp::filters::ws::Message; - -use milli::{FilterCondition, Index, MatchingWords, obkv_to_json, SearchResult}; -use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; -use milli::update::UpdateIndexingStep::*; +use warp::http::Response; +use warp::Filter; use self::update_store::UpdateStore; @@ -149,25 +148,28 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { for (word, token) in analyzed.reconstruct() { if token.is_word() { let to_highlight = matching_words.matching_bytes(token.text()).is_some(); - if to_highlight { string.push_str("") } + if to_highlight { + string.push_str("") + } string.push_str(word); - if to_highlight { string.push_str("") } + if to_highlight { + string.push_str("") + } } else { string.push_str(word); } } Value::String(string) } - Value::Array(values) => { - Value::Array(values.into_iter() - .map(|v| self.highlight_value(v, matching_words)) - .collect()) - } - Value::Object(object) => { - Value::Object(object.into_iter() + Value::Array(values) => Value::Array( + values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(), + ), + Value::Object(object) => Value::Object( + object + .into_iter() .map(|(k, v)| (k, self.highlight_value(v, matching_words))) - .collect()) - } + .collect(), + ), } } @@ -236,12 +238,7 @@ enum UpdateMeta { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type")] enum UpdateMetaProgress { - DocumentsAddition { - step: usize, - total_steps: usize, - current: usize, - total: Option, - }, + DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option 
}, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -342,157 +339,185 @@ async fn main() -> anyhow::Result<()> { update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size); update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type); - update_builder.chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); + update_builder + .chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); let before_update = Instant::now(); // we extract the update type and execute the update itself. - let result: anyhow::Result<()> = match meta { - UpdateMeta::DocumentsAddition { method, format, encoding } => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned); + let result: anyhow::Result<()> = + match meta { + UpdateMeta::DocumentsAddition { method, format, encoding } => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned); - match format.as_str() { - "csv" => builder.update_format(UpdateFormat::Csv), - "json" => builder.update_format(UpdateFormat::Json), - "json-stream" => builder.update_format(UpdateFormat::JsonStream), - otherwise => panic!("invalid update format {:?}", otherwise), - }; - - match method.as_str() { - "replace" => builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments), - "update" => builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments), - otherwise => panic!("invalid indexing method {:?}", otherwise), - }; - - let reader = match encoding.as_deref() { - Some("gzip") => Box::new(GzDecoder::new(content)), - None => Box::new(content) as Box, - otherwise => panic!("invalid encoding format {:?}", otherwise), - }; - - let result = builder.execute(reader, |indexing_step, update_id| { - let (current, total) = match indexing_step { - TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), - ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), - IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), - MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), + match format.as_str() { + "csv" => builder.update_format(UpdateFormat::Csv), + "json" => builder.update_format(UpdateFormat::Json), + "json-stream" => builder.update_format(UpdateFormat::JsonStream), + otherwise => panic!("invalid update format {:?}", otherwise), }; - let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { - update_id, - meta: UpdateMetaProgress::DocumentsAddition { - step: indexing_step.step(), - total_steps: indexing_step.number_of_steps(), - current, - total, - }, - }); - }); - match result { - Ok(_) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), - } - } - UpdateMeta::ClearDocuments => { - // We must use the write transaction of the update here. 
- let mut wtxn = index_cloned.write_txn()?; - let builder = update_builder.clear_documents(&mut wtxn, &index_cloned); - - match builder.execute() { - Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), - } - } - UpdateMeta::Settings(settings) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.settings(&mut wtxn, &index_cloned); - - // We transpose the settings JSON struct into a real setting update. - match settings.searchable_attributes { - Setting::Set(searchable_attributes) => builder.set_searchable_fields(searchable_attributes), - Setting::Reset => builder.reset_searchable_fields(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. - match settings.displayed_attributes { - Setting::Set(displayed_attributes) => builder.set_displayed_fields(displayed_attributes), - Setting::Reset => builder.reset_displayed_fields(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. - match settings.filterable_attributes { - Setting::Set(filterable_attributes) => builder.set_filterable_fields(filterable_attributes), - Setting::Reset => builder.reset_filterable_fields(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. - match settings.criteria { - Setting::Set(criteria) => builder.set_criteria(criteria), - Setting::Reset => builder.reset_criteria(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. - match settings.stop_words { - Setting::Set(stop_words) => builder.set_stop_words(stop_words), - Setting::Reset => builder.reset_stop_words(), - Setting::NotSet => () - } - - // We transpose the settings JSON struct into a real setting update. 
- match settings.synonyms { - Setting::Set(synonyms) => builder.set_synonyms(synonyms), - Setting::Reset => builder.reset_synonyms(), - Setting::NotSet => () - } - - let result = builder.execute(|indexing_step, update_id| { - let (current, total) = match indexing_step { - TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), - ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), - IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)), - MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)), + match method.as_str() { + "replace" => builder + .index_documents_method(IndexDocumentsMethod::ReplaceDocuments), + "update" => builder + .index_documents_method(IndexDocumentsMethod::UpdateDocuments), + otherwise => panic!("invalid indexing method {:?}", otherwise), }; - let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { - update_id, - meta: UpdateMetaProgress::DocumentsAddition { - step: indexing_step.step(), - total_steps: indexing_step.number_of_steps(), - current, - total, - }, - }); - }); - match result { - Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), + let reader = match encoding.as_deref() { + Some("gzip") => Box::new(GzDecoder::new(content)), + None => Box::new(content) as Box, + otherwise => panic!("invalid encoding format {:?}", otherwise), + }; + + let result = builder.execute(reader, |indexing_step, update_id| { + let (current, total) = match indexing_step { + TransformFromUserIntoGenericFormat { documents_seen } => { + (documents_seen, None) + } + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + IndexDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + MergeDataIntoFinalDatabase { databases_seen, total_databases } => { + (databases_seen, Some(total_databases)) + } + }; + let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { + update_id, + meta: UpdateMetaProgress::DocumentsAddition { + step: indexing_step.step(), + total_steps: indexing_step.number_of_steps(), + current, + total, + }, + }); + }); + + match result { + Ok(_) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } } - } - UpdateMeta::Facets(levels) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.facets(&mut wtxn, &index_cloned); - if let Some(value) = levels.level_group_size { - builder.level_group_size(value); + UpdateMeta::ClearDocuments => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let builder = update_builder.clear_documents(&mut wtxn, &index_cloned); + + match builder.execute() { + Ok(_count) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } } - if let Some(value) = levels.min_level_size { - builder.min_level_size(value); + UpdateMeta::Settings(settings) => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.settings(&mut wtxn, &index_cloned); + + // We transpose the settings JSON struct into a real setting update. 
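+                    // (`Setting` is a three-state update: `Set(v)` stores a new
+                    // value, `Reset` clears it back to the default, and `NotSet`
+                    // leaves the stored value untouched, hence the same match on
+                    // every settings field below.)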
+ match settings.searchable_attributes { + Setting::Set(searchable_attributes) => { + builder.set_searchable_fields(searchable_attributes) + } + Setting::Reset => builder.reset_searchable_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.displayed_attributes { + Setting::Set(displayed_attributes) => { + builder.set_displayed_fields(displayed_attributes) + } + Setting::Reset => builder.reset_displayed_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.filterable_attributes { + Setting::Set(filterable_attributes) => { + builder.set_filterable_fields(filterable_attributes) + } + Setting::Reset => builder.reset_filterable_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.criteria { + Setting::Set(criteria) => builder.set_criteria(criteria), + Setting::Reset => builder.reset_criteria(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.stop_words { + Setting::Set(stop_words) => builder.set_stop_words(stop_words), + Setting::Reset => builder.reset_stop_words(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.synonyms { + Setting::Set(synonyms) => builder.set_synonyms(synonyms), + Setting::Reset => builder.reset_synonyms(), + Setting::NotSet => (), + } + + let result = builder.execute(|indexing_step, update_id| { + let (current, total) = match indexing_step { + TransformFromUserIntoGenericFormat { documents_seen } => { + (documents_seen, None) + } + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + IndexDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + MergeDataIntoFinalDatabase { databases_seen, total_databases } => { + (databases_seen, Some(total_databases)) + } + }; + let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { + update_id, + meta: UpdateMetaProgress::DocumentsAddition { + step: indexing_step.step(), + total_steps: indexing_step.number_of_steps(), + current, + total, + }, + }); + }); + + match result { + Ok(_count) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), + UpdateMeta::Facets(levels) => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = update_builder.facets(&mut wtxn, &index_cloned); + if let Some(value) = levels.level_group_size { + builder.level_group_size(value); + } + if let Some(value) = levels.min_level_size { + builder.min_level_size(value); + } + match builder.execute() { + Ok(()) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } } - } - }; + }; let meta = match result { - Ok(()) => format!("valid update content processed in {:.02?}", before_update.elapsed()), + Ok(()) => { + format!("valid update content processed in {:.02?}", before_update.elapsed()) + } Err(e) => format!("error while processing update content: {:?}", e), }; @@ -500,7 +525,8 @@ async fn main() -> anyhow::Result<()> { let _ = update_status_sender_cloned.send(processed); Ok(meta) - })?; + }, + )?; // The database name will not change. 
let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string(); @@ -512,15 +538,11 @@ async fn main() -> anyhow::Result<()> { let db_name_cloned = db_name.clone(); let lmdb_path_cloned = lmdb_path.clone(); let index_cloned = index.clone(); - let dash_html_route = warp::filters::method::get() - .and(warp::filters::path::end()) - .map(move || { + let dash_html_route = + warp::filters::method::get().and(warp::filters::path::end()).map(move || { // We retrieve the database size. - let db_size = File::open(lmdb_path_cloned.clone()) - .unwrap() - .metadata() - .unwrap() - .len() as usize; + let db_size = + File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() as usize; // And the number of documents in the database. let rtxn = index_cloned.read_txn().unwrap(); @@ -537,111 +559,105 @@ async fn main() -> anyhow::Result<()> { .and(warp::path!("updates")) .map(move |header: String| { let update_store = update_store_cloned.clone(); - let mut updates = update_store.iter_metas(|processed, aborted, pending| { - let mut updates = Vec::>::new(); - for result in processed { - let (uid, meta) = result?; - updates.push(UpdateStatus::Processed { update_id: uid.get(), meta }); - } - for result in aborted { - let (uid, meta) = result?; - updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta }); - } - for result in pending { - let (uid, meta) = result?; - updates.push(UpdateStatus::Pending { update_id: uid.get(), meta }); - } - Ok(updates) - }).unwrap(); + let mut updates = update_store + .iter_metas(|processed, aborted, pending| { + let mut updates = Vec::>::new(); + for result in processed { + let (uid, meta) = result?; + updates.push(UpdateStatus::Processed { update_id: uid.get(), meta }); + } + for result in aborted { + let (uid, meta) = result?; + updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta }); + } + for result in pending { + let (uid, meta) = result?; + updates.push(UpdateStatus::Pending { update_id: uid.get(), meta }); + } + Ok(updates) + }) + .unwrap(); updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse()); if header.contains("text/html") { // We retrieve the database size. - let db_size = File::open(lmdb_path_cloned.clone()) - .unwrap() - .metadata() - .unwrap() - .len() as usize; + let db_size = + File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() + as usize; // And the number of documents in the database. 
let rtxn = index_cloned.read_txn().unwrap(); let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize; - let template = UpdatesTemplate { - db_name: db_name.clone(), - db_size, - docs_count, - updates, - }; + let template = + UpdatesTemplate { db_name: db_name.clone(), db_size, docs_count, updates }; Box::new(template) as Box } else { Box::new(warp::reply::json(&updates)) } }); - let dash_bulma_route = warp::filters::method::get() - .and(warp::path!("bulma.min.css")) - .map(|| Response::builder() - .header("content-type", "text/css; charset=utf-8") - .body(include_str!("../public/bulma.min.css")) - ); + let dash_bulma_route = + warp::filters::method::get().and(warp::path!("bulma.min.css")).map(|| { + Response::builder() + .header("content-type", "text/css; charset=utf-8") + .body(include_str!("../public/bulma.min.css")) + }); - let dash_bulma_dark_route = warp::filters::method::get() - .and(warp::path!("bulma-prefers-dark.min.css")) - .map(|| Response::builder() - .header("content-type", "text/css; charset=utf-8") - .body(include_str!("../public/bulma-prefers-dark.min.css")) - ); + let dash_bulma_dark_route = + warp::filters::method::get().and(warp::path!("bulma-prefers-dark.min.css")).map(|| { + Response::builder() + .header("content-type", "text/css; charset=utf-8") + .body(include_str!("../public/bulma-prefers-dark.min.css")) + }); - let dash_style_route = warp::filters::method::get() - .and(warp::path!("style.css")) - .map(|| Response::builder() + let dash_style_route = warp::filters::method::get().and(warp::path!("style.css")).map(|| { + Response::builder() .header("content-type", "text/css; charset=utf-8") .body(include_str!("../public/style.css")) - ); + }); - let dash_jquery_route = warp::filters::method::get() - .and(warp::path!("jquery-3.4.1.min.js")) - .map(|| Response::builder() - .header("content-type", "application/javascript; charset=utf-8") - .body(include_str!("../public/jquery-3.4.1.min.js")) - ); + let dash_jquery_route = + warp::filters::method::get().and(warp::path!("jquery-3.4.1.min.js")).map(|| { + Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + .body(include_str!("../public/jquery-3.4.1.min.js")) + }); - let dash_filesize_route = warp::filters::method::get() - .and(warp::path!("filesize.min.js")) - .map(|| Response::builder() - .header("content-type", "application/javascript; charset=utf-8") - .body(include_str!("../public/filesize.min.js")) - ); + let dash_filesize_route = + warp::filters::method::get().and(warp::path!("filesize.min.js")).map(|| { + Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + .body(include_str!("../public/filesize.min.js")) + }); - let dash_script_route = warp::filters::method::get() - .and(warp::path!("script.js")) - .map(|| Response::builder() + let dash_script_route = warp::filters::method::get().and(warp::path!("script.js")).map(|| { + Response::builder() .header("content-type", "application/javascript; charset=utf-8") .body(include_str!("../public/script.js")) - ); + }); - let updates_script_route = warp::filters::method::get() - .and(warp::path!("updates-script.js")) - .map(|| Response::builder() - .header("content-type", "application/javascript; charset=utf-8") - .body(include_str!("../public/updates-script.js")) - ); + let updates_script_route = + warp::filters::method::get().and(warp::path!("updates-script.js")).map(|| { + Response::builder() + .header("content-type", "application/javascript; charset=utf-8") + 
.body(include_str!("../public/updates-script.js")) + }); - let dash_logo_white_route = warp::filters::method::get() - .and(warp::path!("logo-white.svg")) - .map(|| Response::builder() - .header("content-type", "image/svg+xml") - .body(include_str!("../public/logo-white.svg")) - ); + let dash_logo_white_route = + warp::filters::method::get().and(warp::path!("logo-white.svg")).map(|| { + Response::builder() + .header("content-type", "image/svg+xml") + .body(include_str!("../public/logo-white.svg")) + }); - let dash_logo_black_route = warp::filters::method::get() - .and(warp::path!("logo-black.svg")) - .map(|| Response::builder() - .header("content-type", "image/svg+xml") - .body(include_str!("../public/logo-black.svg")) - ); + let dash_logo_black_route = + warp::filters::method::get().and(warp::path!("logo-black.svg")).map(|| { + Response::builder() + .header("content-type", "image/svg+xml") + .body(include_str!("../public/logo-black.svg")) + }); #[derive(Debug, Deserialize)] #[serde(untagged)] @@ -719,7 +735,8 @@ async fn main() -> anyhow::Result<()> { search.filter(condition); } - let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); + let SearchResult { matching_words, candidates, documents_ids } = + search.execute().unwrap(); let number_of_candidates = candidates.len(); let facets = if query.facet_distribution == Some(true) { @@ -745,17 +762,18 @@ async fn main() -> anyhow::Result<()> { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { - highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight); + highlighter.highlight_record( + &mut object, + &matching_words, + &attributes_to_highlight, + ); } documents.push(object); } - let answer = Answer { - documents, - number_of_candidates, - facets: facets.unwrap_or_default(), - }; + let answer = + Answer { documents, number_of_candidates, facets: facets.unwrap_or_default() }; Response::builder() .header("Content-Type", "application/json") @@ -764,9 +782,8 @@ async fn main() -> anyhow::Result<()> { }); let index_cloned = index.clone(); - let document_route = warp::filters::method::get() - .and(warp::path!("document" / String)) - .map(move |id: String| { + let document_route = warp::filters::method::get().and(warp::path!("document" / String)).map( + move |id: String| { let index = index_cloned.clone(); let rtxn = index.read_txn().unwrap(); @@ -780,30 +797,31 @@ async fn main() -> anyhow::Result<()> { match external_documents_ids.get(&id) { Some(document_id) => { let document_id = document_id as u32; - let (_, obkv) = index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); + let (_, obkv) = + index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); Response::builder() .header("Content-Type", "application/json") .body(serde_json::to_string(&document).unwrap()) } - None => { - Response::builder() - .status(404) - .body(format!("Document with id {:?} not found.", id)) - } + None => Response::builder() + .status(404) + .body(format!("Document with id {:?} not found.", id)), } - }); + }, + ); async fn buf_stream( update_store: Arc>, - update_status_sender: broadcast::Sender>, + update_status_sender: broadcast::Sender< + UpdateStatus, + >, update_method: Option, update_format: UpdateFormat, encoding: Option, - mut stream: impl futures::Stream> + Unpin, - ) -> Result - 
{ + mut stream: impl futures::Stream> + Unpin, + ) -> Result { let file = tokio::task::block_in_place(tempfile::tempfile).unwrap(); let mut file = TFile::from_std(file); @@ -869,9 +887,8 @@ async fn main() -> anyhow::Result<()> { let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); - let clearing_route = warp::filters::method::post() - .and(warp::path!("clear-documents")) - .map(move || { + let clearing_route = + warp::filters::method::post().and(warp::path!("clear-documents")).map(move || { let meta = UpdateMeta::ClearDocuments; let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); @@ -919,9 +936,8 @@ async fn main() -> anyhow::Result<()> { let update_store_cloned = update_store.clone(); let update_status_sender_cloned = update_status_sender.clone(); - let abort_pending_updates_route = warp::filters::method::delete() - .and(warp::path!("updates")) - .map(move || { + let abort_pending_updates_route = + warp::filters::method::delete().and(warp::path!("updates")).map(move || { let updates = update_store_cloned.abort_pendings().unwrap(); for (update_id, meta) in updates { let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta }); @@ -930,25 +946,22 @@ async fn main() -> anyhow::Result<()> { warp::reply() }); - let update_ws_route = warp::ws() - .and(warp::path!("updates" / "ws")) - .map(move |ws: warp::ws::Ws| { + let update_ws_route = + warp::ws().and(warp::path!("updates" / "ws")).map(move |ws: warp::ws::Ws| { // And then our closure will be called when it completes... let update_status_receiver = update_status_sender.subscribe(); ws.on_upgrade(|websocket| { // Just echo all updates messages... update_status_receiver .into_stream() - .flat_map(|result| { - match result { - Ok(status) => { - let msg = serde_json::to_string(&status).unwrap(); - stream::iter(Some(Ok(Message::text(msg)))) - } - Err(e) => { - eprintln!("channel error: {:?}", e); - stream::iter(None) - } + .flat_map(|result| match result { + Ok(status) => { + let msg = serde_json::to_string(&status).unwrap(); + stream::iter(Some(Ok(Message::text(msg)))) + } + Err(e) => { + eprintln!("channel error: {:?}", e); + stream::iter(None) } }) .forward(websocket) @@ -988,10 +1001,9 @@ async fn main() -> anyhow::Result<()> { #[cfg(test)] mod tests { - use maplit::{btreeset,hashmap, hashset}; - use serde_test::{assert_tokens, Token}; - + use maplit::{btreeset, hashmap, hashset}; use milli::update::Setting; + use serde_test::{assert_tokens, Token}; use crate::Settings; @@ -1000,50 +1012,53 @@ mod tests { let settings = Settings { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), - filterable_attributes: Setting::Set(hashset!{ "age".to_string() }), + filterable_attributes: Setting::Set(hashset! { "age".to_string() }), criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), - synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] }) + synonyms: Setting::Set(hashmap! 
{ "alex".to_string() => vec!["alexey".to_string()] }), }; - assert_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 6 }, - Token::Str("displayedAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("name"), - Token::SeqEnd, - Token::Str("searchableAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("age"), - Token::SeqEnd, - Token::Str("facetedAttributes"), - Token::Some, - Token::Map { len: Some(1) }, - Token::Str("age"), - Token::Str("integer"), - Token::MapEnd, - Token::Str("criteria"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("asc(age)"), - Token::SeqEnd, - Token::Str("stopWords"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("and"), - Token::SeqEnd, - Token::Str("synonyms"), - Token::Some, - Token::Map { len: Some(1) }, - Token::Str("alex"), - Token::Seq {len: Some(1) }, - Token::Str("alexey"), - Token::SeqEnd, - Token::MapEnd, - Token::StructEnd, - ]); + assert_tokens( + &settings, + &[ + Token::Struct { name: "Settings", len: 6 }, + Token::Str("displayedAttributes"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("name"), + Token::SeqEnd, + Token::Str("searchableAttributes"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("age"), + Token::SeqEnd, + Token::Str("facetedAttributes"), + Token::Some, + Token::Map { len: Some(1) }, + Token::Str("age"), + Token::Str("integer"), + Token::MapEnd, + Token::Str("criteria"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("asc(age)"), + Token::SeqEnd, + Token::Str("stopWords"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("and"), + Token::SeqEnd, + Token::Str("synonyms"), + Token::Some, + Token::Map { len: Some(1) }, + Token::Str("alex"), + Token::Seq { len: Some(1) }, + Token::Str("alexey"), + Token::SeqEnd, + Token::MapEnd, + Token::StructEnd, + ], + ); } #[test] @@ -1057,22 +1072,25 @@ mod tests { synonyms: Setting::Reset, }; - assert_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 6 }, - Token::Str("displayedAttributes"), - Token::None, - Token::Str("searchableAttributes"), - Token::None, - Token::Str("facetedAttributes"), - Token::None, - Token::Str("criteria"), - Token::None, - Token::Str("stopWords"), - Token::None, - Token::Str("synonyms"), - Token::None, - Token::StructEnd, - ]); + assert_tokens( + &settings, + &[ + Token::Struct { name: "Settings", len: 6 }, + Token::Str("displayedAttributes"), + Token::None, + Token::Str("searchableAttributes"), + Token::None, + Token::Str("facetedAttributes"), + Token::None, + Token::Str("criteria"), + Token::None, + Token::Str("stopWords"), + Token::None, + Token::Str("synonyms"), + Token::None, + Token::StructEnd, + ], + ); } #[test] @@ -1086,9 +1104,6 @@ mod tests { synonyms: Setting::NotSet, }; - assert_tokens(&settings, &[ - Token::Struct { name: "Settings", len: 0 }, - Token::StructEnd, - ]); + assert_tokens(&settings, &[Token::Struct { name: "Settings", len: 0 }, Token::StructEnd]); } } diff --git a/http-ui/src/update_store.rs b/http-ui/src/update_store.rs index 122ee6031..b77057fda 100644 --- a/http-ui/src/update_store.rs +++ b/http-ui/src/update_store.rs @@ -4,9 +4,9 @@ use std::path::Path; use std::sync::Arc; use crossbeam_channel::Sender; -use heed::types::{OwnedType, DecodeIgnore, SerdeJson, ByteSlice}; -use heed::{EnvOpenOptions, Env, Database}; -use serde::{Serialize, Deserialize}; +use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson}; +use heed::{Database, Env, EnvOpenOptions}; +use serde::{Deserialize, Serialize}; pub type 
BEU64 = heed::zerocopy::U64; @@ -25,7 +25,9 @@ pub trait UpdateHandler { } impl UpdateHandler for F -where F: FnMut(u64, M, &[u8]) -> heed::Result + Send + 'static { +where + F: FnMut(u64, M, &[u8]) -> heed::Result + Send + 'static, +{ fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result { self(update_id, meta, content) } @@ -82,26 +84,17 @@ impl UpdateStore { /// Returns the new biggest id to use to store the new update. fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result { - let last_pending = self.pending_meta - .remap_data_type::() - .last(txn)? - .map(|(k, _)| k.get()); + let last_pending = + self.pending_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - let last_processed = self.processed_meta - .remap_data_type::() - .last(txn)? - .map(|(k, _)| k.get()); + let last_processed = + self.processed_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - let last_aborted = self.aborted_meta - .remap_data_type::() - .last(txn)? - .map(|(k, _)| k.get()); + let last_aborted = + self.aborted_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - let last_update_id = [last_pending, last_processed, last_aborted] - .iter() - .copied() - .flatten() - .max(); + let last_update_id = + [last_pending, last_processed, last_aborted].iter().copied().flatten().max(); match last_update_id { Some(last_id) => Ok(last_id + 1), @@ -112,7 +105,8 @@ impl UpdateStore { /// Registers the update content in the pending store and the meta /// into the pending-meta store. Returns the new unique update id. pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result - where M: Serialize, + where + M: Serialize, { let mut wtxn = self.env.write_txn()?; @@ -152,9 +146,8 @@ impl UpdateStore { // a reader while processing it, not a writer. match first_meta { Some((first_id, first_meta)) => { - let first_content = self.pending - .get(&rtxn, &first_id)? - .expect("associated update content"); + let first_content = + self.pending.get(&rtxn, &first_id)?.expect("associated update content"); // Process the pending update using the provided user function. let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?; @@ -170,15 +163,16 @@ impl UpdateStore { wtxn.commit()?; Ok(Some((first_id.get(), new_meta))) - }, - None => Ok(None) + } + None => Ok(None), } } /// The id and metadata of the update that is currently being processed, /// `None` if no update is being processed. pub fn processing_update(&self) -> heed::Result> - where M: for<'a> Deserialize<'a>, + where + M: for<'a> Deserialize<'a>, { let rtxn = self.env.read_txn()?; match self.pending_meta.first(&rtxn)? { @@ -242,7 +236,8 @@ impl UpdateStore { /// that as already been processed or which doesn't actually exist, will /// return `None`. pub fn abort_update(&self, update_id: u64) -> heed::Result> - where M: Serialize + for<'a> Deserialize<'a>, + where + M: Serialize + for<'a> Deserialize<'a>, { let mut wtxn = self.env.write_txn()?; let key = BEU64::new(update_id); @@ -269,7 +264,8 @@ impl UpdateStore { /// Aborts all the pending updates, and not the one being currently processed. /// Returns the update metas and ids that were successfully aborted. 
pub fn abort_pendings(&self) -> heed::Result> - where M: Serialize + for<'a> Deserialize<'a>, + where + M: Serialize + for<'a> Deserialize<'a>, { let mut wtxn = self.env.write_txn()?; let mut aborted_updates = Vec::new(); @@ -303,17 +299,19 @@ pub enum UpdateStatusMeta { #[cfg(test)] mod tests { - use super::*; use std::thread; use std::time::{Duration, Instant}; + use super::*; + #[test] fn simple() { let dir = tempfile::tempdir().unwrap(); let options = EnvOpenOptions::new(); - let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| { + let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| { Ok(meta + " processed") - }).unwrap(); + }) + .unwrap(); let meta = String::from("kiki"); let update_id = update_store.register_update(&meta, &[]).unwrap(); @@ -329,10 +327,11 @@ mod tests { fn long_running_update() { let dir = tempfile::tempdir().unwrap(); let options = EnvOpenOptions::new(); - let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content:&_| { + let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| { thread::sleep(Duration::from_millis(400)); Ok(meta + " processed") - }).unwrap(); + }) + .unwrap(); let before_register = Instant::now(); diff --git a/infos/src/main.rs b/infos/src/main.rs index b0c304de0..151e8c664 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -1,16 +1,14 @@ use std::fmt::Write as _; use std::path::PathBuf; -use std::{str, io, fmt}; +use std::{fmt, io, str}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; -use structopt::StructOpt; - use milli::facet::FacetType; use milli::index::db_name::*; use milli::{Index, TreeLevel}; - +use structopt::StructOpt; use Command::*; #[cfg(target_os = "linux")] @@ -257,53 +255,55 @@ fn main() -> anyhow::Result<()> { WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), WordsPrefixesDocids { full_display, prefixes } => { words_prefixes_docids(&index, &rtxn, !full_display, prefixes) - }, + } FacetNumbersDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name) - }, + } FacetStringsDocids { full_display, field_name } => { facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) - }, + } WordsLevelPositionsDocids { full_display, words } => { words_level_positions_docids(&index, &rtxn, !full_display, words) - }, + } WordPrefixesLevelPositionsDocids { full_display, prefixes } => { word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) - }, + } FieldIdWordCountDocids { full_display, field_name } => { field_id_word_count_docids(&index, &rtxn, !full_display, field_name) - }, + } DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) - }, + } FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), - AverageNumberOfPositionsByWord => { - average_number_of_positions_by_word(&index, &rtxn) - }, + AverageNumberOfPositionsByWord => average_number_of_positions_by_word(&index, &rtxn), SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases), DatabaseStats { database } => database_stats(&index, &rtxn, &database), WordPairProximitiesDocids { full_display, word1, word2 } => { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) - }, + } 
ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportDocuments { internal_documents_ids } => { export_documents(&index, &rtxn, internal_documents_ids) - }, + } } } fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { - use std::collections::BinaryHeap; use std::cmp::Reverse; + use std::cmp::Reverse; + use std::collections::BinaryHeap; let mut heap = BinaryHeap::with_capacity(limit + 1); for result in index.word_docids.iter(rtxn)? { - if limit == 0 { break } + if limit == 0 { + break; + } let (word, docids) = result?; heap.push((Reverse(docids.len()), word)); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } let stdout = io::stdout(); @@ -323,7 +323,7 @@ fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( rtxn: &'txn heed::RoTxn, db: heed::Database<KC, DC>, field_id: u8, -) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(KC::DItem, DC::DItem)>> + 'txn>> +) -> heed::Result<Box<dyn Iterator<Item = heed::Result<(KC::DItem, DC::DItem)>> + 'txn>> where KC: heed::BytesDecode<'txn>, DC: heed::BytesDecode<'txn>, @@ -347,7 +348,8 @@ fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { use std::cmp::Reverse; use std::collections::BinaryHeap; - use heed::types::{Str, ByteSlice}; + + use heed::types::{ByteSlice, Str}; let Index { env: _env, @@ -387,71 +388,93 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let words_fst = index.words_fst(rtxn)?; let length = words_fst.as_fst().as_bytes().len(); heap.push(Reverse((length, format!("words-fst"), main_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } // Fetch the word prefix FST let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; let length = words_prefixes_fst.as_fst().as_bytes().len(); heap.push(Reverse((length, format!("words-prefixes-fst"), main_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_prefix_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); heap.push(Reverse((value.len(), key, docid_word_positions_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { let ((word1, word2, prox), value) = result?; let key = format!("{} {} {}", word1, word2, prox); heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)?
{ let ((word, prefix, prox), value) = result?; let key = format!("{} {} {}", word, prefix, prox); heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_level_position_docids.remap_data_type::().iter(rtxn)? { let ((word, level, left, right), value) = result?; let key = format!("{} {} {:?}", word, level, left..=right); heap.push(Reverse((value.len(), key, word_level_position_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in word_prefix_level_position_docids.remap_data_type::().iter(rtxn)? { let ((word, level, left, right), value) = result?; let key = format!("{} {} {:?}", word, level, left..=right); heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } for result in field_id_word_count_docids.remap_data_type::().iter(rtxn)? { let ((field_id, word_count), docids) = result?; let key = format!("{} {}", field_id, word_count); heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } let faceted_fields = index.faceted_fields_ids(rtxn)?; @@ -468,7 +491,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho write!(&mut output, " (level {})", level)?; let key = format!("{} {}", facet_name, output); heap.push(Reverse((value.len(), key, facet_id_f64_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } // List the facet strings of this facet id. @@ -477,14 +502,18 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let ((_fid, fvalue), value) = result?; let key = format!("{} {}", facet_name, fvalue); heap.push(Reverse((value.len(), key, facet_id_string_docids_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } } for result in documents.remap_data_type::().iter(rtxn)? { let (id, value) = result?; heap.push(Reverse((value.len(), id.to_string(), documents_name))); - if heap.len() > limit { heap.pop(); } + if heap.len() > limit { + heap.pop(); + } } } @@ -499,7 +528,12 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho Ok(wtr.flush()?) 
} -fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> { +fn words_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + words: Vec<String>, +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["word", "documents_ids"])?; @@ -523,8 +557,7 @@ fn words_prefixes_docids( rtxn: &heed::RoTxn, debug: bool, prefixes: Vec<String>, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["prefix", "documents_ids"])?; @@ -561,12 +594,12 @@ fn facet_values_docids( debug: bool, facet_type: FacetType, field_name: String, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; - let field_id = fields_ids_map.id(&field_name) + let field_id = fields_ids_map + .id(&field_name) .with_context(|| format!("field {} not found", field_name))?; if !faceted_fields.contains(&field_id) { @@ -590,7 +623,7 @@ fn facet_values_docids( }; wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; } - }, + } FacetType::String => { wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { @@ -614,8 +647,7 @@ fn words_level_positions_docids( rtxn: &heed::RoTxn, debug: bool, words: Vec<String>, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; @@ -653,8 +685,7 @@ fn word_prefixes_level_positions_docids( rtxn: &heed::RoTxn, debug: bool, prefixes: Vec<String>, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; @@ -691,21 +722,20 @@ fn field_id_word_count_docids( index: &Index, rtxn: &heed::RoTxn, debug: bool, - field_name: String -) -> anyhow::Result<()> -{ + field_name: String, +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["field_name", "word_count", "docids"])?; - let field_id = index.fields_ids_map(rtxn)? + let field_id = index + .fields_ids_map(rtxn)? .id(&field_name) .with_context(|| format!("unknown field name: {}", &field_name))?; let left = (field_id, 0); let right = (field_id, u8::max_value()); - let iter = index.field_id_word_count_docids - .range(rtxn, &(left..=right))?; + let iter = index.field_id_word_count_docids.range(rtxn, &(left..=right))?; for result in iter { let ((_, word_count), docids) = result?; @@ -725,8 +755,7 @@ fn docids_words_positions( rtxn: &heed::RoTxn, debug: bool, internal_ids: Vec<u32>, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); wtr.write_record(&["document_id", "word", "positions"])?; @@ -734,9 +763,10 @@ fn docids_words_positions( let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() { Box::new(index.docid_word_positions.iter(rtxn)?)
} else { - let vec: heed::Result> = internal_ids.into_iter().map(|id| { - index.docid_word_positions.prefix_iter(rtxn, &(id, "")) - }).collect(); + let vec: heed::Result> = internal_ids + .into_iter() + .map(|id| index.docid_word_positions.prefix_iter(rtxn, &(id, ""))) + .collect(); Box::new(vec?.into_iter().flatten()) }; @@ -757,7 +787,8 @@ fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; - let field_id = fields_ids_map.id(&field_name) + let field_id = fields_ids_map + .id(&field_name) .with_context(|| format!("field {} not found", field_name))?; if !faceted_fields.contains(&field_id) { @@ -808,9 +839,14 @@ fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result< Ok(()) } -fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) -> anyhow::Result<()> { +fn export_documents( + index: &Index, + rtxn: &heed::RoTxn, + internal_ids: Vec, +) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _}; - use milli::{BEU32, obkv_to_json}; + + use milli::{obkv_to_json, BEU32}; let stdout = io::stdout(); let mut out = BufWriter::new(stdout); @@ -819,13 +855,13 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); let iter: Box> = if internal_ids.is_empty() { - Box::new(index.documents.iter(rtxn)?.map(|result| { - result.map(|(_id, obkv)| obkv) - })) + Box::new(index.documents.iter(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv))) } else { - Box::new(internal_ids.into_iter().flat_map(|id| { - index.documents.get(rtxn, &BEU32::new(id)).transpose() - })) + Box::new( + internal_ids + .into_iter() + .flat_map(|id| index.documents.get(rtxn, &BEU32::new(id)).transpose()), + ) }; for result in iter { @@ -842,26 +878,27 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { use heed::types::DecodeIgnore; - use milli::{DocumentId, BEU32StrCodec}; + use milli::{BEU32StrCodec, DocumentId}; let mut words_counts = Vec::new(); let mut count = 0; let mut prev = None as Option<(DocumentId, u32)>; - let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; + let iter = + index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; for result in iter { let ((docid, _word), ()) = result?; match prev.as_mut() { Some((prev_docid, prev_count)) if docid == *prev_docid => { *prev_count += 1; - }, + } Some((prev_docid, prev_count)) => { words_counts.push(*prev_count); *prev_docid = docid; *prev_count = 0; count += 1; - }, + } None => prev = Some((docid, 1)), } } @@ -970,16 +1007,15 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> { use heed::types::ByteSlice; - use heed::{Error, BytesDecode}; - use roaring::RoaringBitmap; + use heed::{BytesDecode, Error}; use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; + use roaring::RoaringBitmap; fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>( db: heed::PolyDatabase, rtxn: &'a heed::RoTxn, name: &str, - ) -> anyhow::Result<()> - { + ) -> anyhow::Result<()> { let mut key_size = 0u64; let mut val_size = 0u64; let mut values_length = Vec::new(); @@ -1028,27 
+1064,27 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu WORD_DOCIDS => { let db = index.word_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } WORD_PREFIX_DOCIDS => { let db = index.word_prefix_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } DOCID_WORD_POSITIONS => { let db = index.docid_word_positions.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } WORD_PAIR_PROXIMITY_DOCIDS => { let db = index.word_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => { let db = index.word_prefix_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } FIELD_ID_WORD_COUNT_DOCIDS => { let db = index.field_id_word_count_docids.as_polymorph(); compute_stats::(*db, rtxn, name) - }, + } unknown => anyhow::bail!("unknown database {:?}", unknown), } } @@ -1059,8 +1095,7 @@ fn word_pair_proximities_docids( debug: bool, word1: String, word2: String, -) -> anyhow::Result<()> -{ +) -> anyhow::Result<()> { use heed::types::ByteSlice; use milli::RoaringBitmapCodec; @@ -1081,7 +1116,9 @@ fn word_pair_proximities_docids( // Skip keys that are longer than the requested one, // a longer key means that the second word is a prefix of the request word. - if key.len() != prefix.len() + 1 { continue; } + if key.len() != prefix.len() + 1 { + continue; + } let proximity = key.last().unwrap(); let docids = if debug { diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 931cf8588..cc1fca01f 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,15 +1,14 @@ use std::fmt; use std::str::FromStr; -use regex::Regex; -use serde::{Serialize, Deserialize}; use once_cell::sync::Lazy; +use regex::Regex; +use serde::{Deserialize, Serialize}; use crate::error::{Error, UserError}; -static ASC_DESC_REGEX: Lazy = Lazy::new(|| { - Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() -}); +static ASC_DESC_REGEX: Lazy = + Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()); #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { @@ -52,17 +51,21 @@ impl FromStr for Criterion { "attribute" => Ok(Criterion::Attribute), "exactness" => Ok(Criterion::Exactness), text => { - let caps = ASC_DESC_REGEX.captures(text).ok_or_else(|| { - UserError::InvalidCriterionName { name: text.to_string() } - })?; + let caps = ASC_DESC_REGEX + .captures(text) + .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; let order = caps.get(1).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str(); match order { "asc" => Ok(Criterion::Asc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())), - text => return Err(UserError::InvalidCriterionName { name: text.to_string() }.into()), + text => { + return Err( + UserError::InvalidCriterionName { name: text.to_string() }.into() + ) + } } - }, + } } } } @@ -82,13 +85,13 @@ impl fmt::Display for Criterion { use Criterion::*; match self { - Words => f.write_str("words"), - Typo => f.write_str("typo"), - Proximity => f.write_str("proximity"), - Attribute => f.write_str("attribute"), - Exactness => f.write_str("exactness"), - Asc(attr) => write!(f, "asc({})", attr), - Desc(attr) => write!(f, "desc({})", attr), + Words => f.write_str("words"), + Typo => f.write_str("typo"), + Proximity => f.write_str("proximity"), + Attribute => f.write_str("attribute"), + Exactness => f.write_str("exactness"), + Asc(attr) => write!(f, "asc({})", attr), + Desc(attr) 
=> write!(f, "desc({})", attr), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 78a1b1c59..31012c690 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -2,7 +2,7 @@ use std::convert::Infallible; use std::error::Error as StdError; use std::{fmt, io, str}; -use heed::{MdbError, Error as HeedError}; +use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; @@ -80,14 +80,17 @@ impl From for Error { } } -impl<E> From<grenad::Error<E>> for Error where Error: From<E> { +impl<E> From<grenad::Error<E>> for Error +where + Error: From<E>, +{ fn from(error: grenad::Error<E>) -> Error { match error { grenad::Error::Io(error) => Error::IoError(error), grenad::Error::Merge(error) => Error::from(error), grenad::Error::InvalidCompressionType => { Error::InternalError(InternalError::GrenadInvalidCompressionType) - }, + } } } } @@ -171,15 +174,15 @@ impl fmt::Display for InternalError { match self { Self::DatabaseMissingEntry { db_name, key } => { write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) - }, + } Self::FieldIdMapMissingEntry(error) => error.fmt(f), Self::Fst(error) => error.fmt(f), Self::GrenadInvalidCompressionType => { f.write_str("invalid compression type have been specified to grenad") - }, + } Self::IndexingMergingKeys { process } => { write!(f, "invalid merge while processing {}", process) - }, + } Self::Serialization(error) => error.fmt(f), Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), Self::RayonThreadPool(error) => error.fmt(f), @@ -204,12 +207,12 @@ impl fmt::Display for UserError { Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); write!(f, "document identifier is invalid {}", json) - }, + } Self::InvalidFilterAttribute(error) => error.fmt(f), Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); write!(f, "document doesn't have an identifier {}", json) - }, + } Self::MissingPrimaryKey => f.write_str("missing primary key"), Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), // TODO where can we find it instead of writing the text ourselves?
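The grenad conversion above leans on a small generic trick: the `where Error: From<E>` bound makes a single impl cover every merge-error type the crate already knows how to absorb. A self-contained sketch of the same pattern, using hypothetical `Outer` and `Inner<E>` types in place of milli's `Error` and `grenad::Error<E>`:

    enum Inner<E> { Merge(E), Io(std::io::Error) }
    enum Outer { Merge(String), Io(std::io::Error) }

    impl From<String> for Outer {
        fn from(msg: String) -> Outer { Outer::Merge(msg) }
    }

    // One impl for every E that Outer can already be built from,
    // mirroring `impl<E> From<grenad::Error<E>> for Error`.
    impl<E> From<Inner<E>> for Outer
    where
        Outer: From<E>,
    {
        fn from(error: Inner<E>) -> Outer {
            match error {
                Inner::Merge(error) => Outer::from(error),
                Inner::Io(error) => Outer::Io(error),
            }
        }
    }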
@@ -217,14 +220,14 @@ impl fmt::Display for UserError { Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), Self::PrimaryKeyCannotBeChanged => { f.write_str("primary key cannot be changed if the database contains documents") - }, + } Self::PrimaryKeyCannotBeReset => { f.write_str("primary key cannot be reset if the database contains documents") - }, + } Self::SerdeJson(error) => error.fmt(f), Self::UnknownInternalDocumentId { document_id } => { write!(f, "an unknown internal document id have been used ({})", document_id) - }, + } } } } @@ -236,10 +239,10 @@ impl fmt::Display for FieldIdMapMissingEntry { match self { Self::FieldId { field_id, process } => { write!(f, "unknown field id {} coming from the {} process", field_id, process) - }, + } Self::FieldName { field_name, process } => { write!(f, "unknown field name {} coming from the {} process", field_name, process) - }, + } } } } @@ -251,11 +254,11 @@ impl fmt::Display for SerializationError { match self { Self::Decoding { db_name: Some(name) } => { write!(f, "decoding from the {} database failed", name) - }, + } Self::Decoding { db_name: None } => f.write_str("decoding failed"), Self::Encoding { db_name: Some(name) } => { write!(f, "encoding into the {} database failed", name) - }, + } Self::Encoding { db_name: None } => f.write_str("encoding failed"), Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"), } diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index ee2a6c7bb..3dec638da 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::convert::TryInto; -use fst::{Streamer, IntoStreamer}; + +use fst::{IntoStreamer, Streamer}; pub struct ExternalDocumentsIds<'a> { pub(crate) hard: fst::Map<Cow<'a, [u8]>>, @@ -8,7 +9,10 @@ pub struct ExternalDocumentsIds<'a> { } impl<'a> ExternalDocumentsIds<'a> { - pub fn new(hard: fst::Map<Cow<'a, [u8]>>, soft: fst::Map<Cow<'a, [u8]>>) -> ExternalDocumentsIds<'a> { + pub fn new( + hard: fst::Map<Cow<'a, [u8]>>, + soft: fst::Map<Cow<'a, [u8]>>, + ) -> ExternalDocumentsIds<'a> { ExternalDocumentsIds { hard, soft } } @@ -29,7 +33,7 @@ impl<'a> ExternalDocumentsIds<'a> { match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { // u64 MAX means deleted in the soft fst map Some(id) if id != u64::MAX => Some(id.try_into().unwrap()), - _otherwise => None + _otherwise => None, } } diff --git a/milli/src/facet/facet_type.rs b/milli/src/facet/facet_type.rs index 09f29bc00..51dd448e2 100644 --- a/milli/src/facet/facet_type.rs +++ b/milli/src/facet/facet_type.rs @@ -2,10 +2,9 @@ use std::error::Error; use std::fmt; use std::str::FromStr; -use serde::{Serialize, Deserialize}; +use serde::{Deserialize, Serialize}; -#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] -#[derive(Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum FacetType { String, Number, @@ -43,4 +42,4 @@ impl fmt::Display for InvalidFacetType { } } -impl Error for InvalidFacetType { } +impl Error for InvalidFacetType {} diff --git a/milli/src/facet/facet_value.rs b/milli/src/facet/facet_value.rs index 99455fa27..eb7fb3c5e 100644 --- a/milli/src/facet/facet_value.rs +++ b/milli/src/facet/facet_value.rs @@ -50,7 +50,7 @@ impl Serialize for FacetValue { FacetValue::Number(number) => { let string = number.to_string(); serializer.serialize_str(&string) - }, + } } } } diff --git a/milli/src/facet/value_encoding.rs
b/milli/src/facet/value_encoding.rs index 7259243e5..31c00bd2d 100644 --- a/milli/src/facet/value_encoding.rs +++ b/milli/src/facet/value_encoding.rs @@ -28,6 +28,7 @@ fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] { #[cfg(test)] mod tests { use std::cmp::Ordering::Less; + use super::*; fn is_sorted(x: &[T]) -> bool { @@ -39,8 +40,8 @@ mod tests { let a = -13_f64; let b = -10.0; let c = -0.0; - let d = 1.0; - let e = 43.0; + let d = 1.0; + let e = 43.0; let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect(); assert!(is_sorted(&vec), "{:?}", vec); diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 76ff2d281..b0a084c3c 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -1,5 +1,7 @@ use std::collections::BTreeMap; -use serde::{Serialize, Deserialize}; + +use serde::{Deserialize, Serialize}; + use crate::FieldId; #[derive(Debug, Clone, Serialize, Deserialize)] @@ -11,11 +13,7 @@ pub struct FieldsIdsMap { impl FieldsIdsMap { pub fn new() -> FieldsIdsMap { - FieldsIdsMap { - names_ids: BTreeMap::new(), - ids_names: BTreeMap::new(), - next_id: Some(0), - } + FieldsIdsMap { names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0) } } /// Returns the number of fields ids in the map. @@ -62,17 +60,17 @@ impl FieldsIdsMap { } /// Iterate over the ids and names in the ids order. - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> impl Iterator { self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) } /// Iterate over the ids in the order of the ids. - pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + pub fn ids<'a>(&'a self) -> impl Iterator + 'a { self.ids_names.keys().copied() } /// Iterate over the names in the order of the ids. - pub fn names(&self) -> impl Iterator { + pub fn names(&self) -> impl Iterator { self.ids_names.values().map(AsRef::as_ref) } } diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs index a4642f961..b23dcb269 100644 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -71,7 +71,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { #[cfg(test)] mod tests { - use heed::{BytesEncode, BytesDecode}; + use heed::{BytesDecode, BytesEncode}; + use super::*; #[test] diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs index e9b5abeb8..b3c0fa381 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs @@ -1,8 +1,8 @@ use std::borrow::Cow; use std::convert::TryInto; -use crate::{FieldId, DocumentId}; use crate::facet::value_encoding::f64_into_bytes; +use crate::{DocumentId, FieldId}; pub struct FieldDocIdFacetF64Codec; diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index b002346e9..fd3f1143d 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -2,12 +2,17 @@ use std::borrow::Cow; use std::convert::TryInto; use std::str; -use crate::{FieldId, DocumentId}; +use crate::{DocumentId, FieldId}; pub struct FieldDocIdFacetStringCodec; impl FieldDocIdFacetStringCodec { - pub fn serialize_into(field_id: FieldId, document_id: DocumentId, value: &str, out: &mut Vec) { + pub fn 
serialize_into( + field_id: FieldId, + document_id: DocumentId, + value: &str, + out: &mut Vec, + ) { out.reserve(1 + 4 + value.len()); out.push(field_id); out.extend_from_slice(&document_id.to_be_bytes()); diff --git a/milli/src/heed_codec/field_id_word_count_codec.rs b/milli/src/heed_codec/field_id_word_count_codec.rs index 5796e5020..64f0e1db6 100644 --- a/milli/src/heed_codec/field_id_word_count_codec.rs +++ b/milli/src/heed_codec/field_id_word_count_codec.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, convert::TryInto}; +use std::borrow::Cow; +use std::convert::TryInto; use crate::FieldId; diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 65a06573e..7bd7dff2d 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -1,16 +1,18 @@ mod beu32_str_codec; +pub mod facet; +mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; mod str_level_position_codec; mod str_str_u8_codec; -mod field_id_word_count_codec; -pub mod facet; pub use self::beu32_str_codec::BEU32StrCodec; +pub use self::field_id_word_count_codec::FieldIdWordCountCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; -pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; +pub use self::roaring_bitmap_length::{ + BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, +}; pub use self::str_level_position_codec::StrLevelPositionCodec; pub use self::str_str_u8_codec::StrStrU8Codec; -pub use self::field_id_word_count_codec::FieldIdWordCountCodec; diff --git a/milli/src/heed_codec/obkv_codec.rs b/milli/src/heed_codec/obkv_codec.rs index 94a230e05..b7414b693 100644 --- a/milli/src/heed_codec/obkv_codec.rs +++ b/milli/src/heed_codec/obkv_codec.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; + use obkv::{KvReader, KvWriter}; pub struct ObkvCodec; diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 325effa73..53f64d648 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -75,7 +75,9 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { #[cfg(test)] mod tests { use std::iter::FromIterator; - use heed::{BytesEncode, BytesDecode}; + + use heed::{BytesDecode, BytesEncode}; + use super::*; #[test] diff --git a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs index 755296704..8fae9b8fd 100644 --- a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; + use roaring::RoaringBitmap; pub struct RoaringBitmapCodec; diff --git a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs index 042b5cf6b..4d266e413 100644 --- a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs @@ -1,7 +1,7 @@ -use std::io::{self, Read, BufRead}; +use std::io::{self, BufRead, Read}; use std::mem; -use byteorder::{ReadBytesExt, LittleEndian}; +use byteorder::{LittleEndian, ReadBytesExt}; const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; const SERIAL_COOKIE: u16 = 12347; @@ -16,20 
+16,14 @@ impl RoaringBitmapLenCodec { if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { (bytes.read_u32::()? as usize, true) } else if (cookie as u16) == SERIAL_COOKIE { - return Err(io::Error::new( - io::ErrorKind::Other, - "run containers are unsupported", - )); + return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported")); } else { return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); } }; if size > u16::max_value() as usize + 1 { - return Err(io::Error::new( - io::ErrorKind::Other, - "size is greater than supported", - )); + return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported")); } let mut description_bytes = vec![0u8; size * 4]; @@ -67,12 +61,12 @@ impl heed::BytesDecode<'_> for RoaringBitmapLenCodec { #[cfg(test)] mod tests { - use super::*; - - use crate::heed_codec::RoaringBitmapCodec; use heed::BytesEncode; use roaring::RoaringBitmap; + use super::*; + use crate::heed_codec::RoaringBitmapCodec; + #[test] fn deserialize_roaring_bitmap_length() { let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect(); diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs index 810e91940..5be45bbeb 100644 --- a/milli/src/heed_codec/str_level_position_codec.rs +++ b/milli/src/heed_codec/str_level_position_codec.rs @@ -13,7 +13,9 @@ impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { fn bytes_decode(bytes: &'a [u8]) -> Option { let footer_len = size_of::() + size_of::() * 2; - if bytes.len() < footer_len { return None } + if bytes.len() < footer_len { + return None; + } let (word, bytes) = bytes.split_at(bytes.len() - footer_len); let word = str::from_utf8(word).ok()?; diff --git a/milli/src/index.rs b/milli/src/index.rs index bf4b3e023..a6c09f3d3 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -3,23 +3,22 @@ use std::collections::{HashMap, HashSet}; use std::path::Path; use chrono::{DateTime, Utc}; -use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use heed::types::*; +use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use crate::error::{UserError, FieldIdMapMissingEntry, InternalError}; -use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, Search}; -use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId, Result}; -use crate::{ - BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, - ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, - FieldIdWordCountCodec, -}; -use crate::heed_codec::facet::{ - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, - FacetValueStringCodec, FacetLevelValueF64Codec, -}; +use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, +}; +use crate::{ + default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, + DocumentId, ExternalDocumentsIds, FacetDistribution, FieldId, FieldIdWordCountCodec, + FieldsDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, + StrLevelPositionCodec, StrStrU8Codec, BEU32, +}; pub mod main_key { pub const CRITERIA_KEY: &str = "criteria"; @@ -114,14 +113,17 @@ impl Index { let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; let docid_word_positions = 
env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; - let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let word_prefix_pair_proximity_docids = + env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; - let word_prefix_level_position_docids = env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; + let word_prefix_level_position_docids = + env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; - let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; + let field_id_docid_facet_strings = + env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; let documents = env.create_database(Some(DOCUMENTS))?; Index::initialize_creation_dates(&env, main)?; @@ -184,18 +186,26 @@ impl Index { /* documents ids */ /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. - pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { + pub(crate) fn put_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) } /// Returns the internal documents ids. pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)? + .unwrap_or_default()) } /// Returns the number of documents indexed in the database. pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result { - let count = self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; + let count = + self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; Ok(count.unwrap_or_default()) } @@ -224,21 +234,30 @@ impl Index { &self, wtxn: &mut RwTxn, external_documents_ids: &ExternalDocumentsIds<'a>, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { let ExternalDocumentsIds { hard, soft } = external_documents_ids; let hard = hard.as_fst().as_bytes(); let soft = soft.as_fst().as_bytes(); - self.main.put::<_, Str, ByteSlice>(wtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, hard)?; - self.main.put::<_, Str, ByteSlice>(wtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, soft)?; + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, + hard, + )?; + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, + soft, + )?; Ok(()) } /// Returns the external documents ids map which associate the external ids /// with the internal ids (i.e. `u32`). 
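// How the two halves of this map interact, following the
// ExternalDocumentsIds::get code earlier in this patch: the soft map is
// consulted before the hard one, and a soft entry equal to u64::MAX acts as a
// deletion tombstone. A rough standalone equivalent of that lookup (the real
// code converts with try_into rather than a plain cast):
//
//     fn resolve(soft: &fst::Map<Vec<u8>>, hard: &fst::Map<Vec<u8>>, key: &str) -> Option<u32> {
//         match soft.get(key).or_else(|| hard.get(key)) {
//             Some(id) if id != u64::MAX => Some(id as u32),
//             _otherwise => None,
//         }
//     }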
pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { - let hard = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; - let soft = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let hard = + self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let soft = + self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; let hard = match hard { Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, None => fst::Map::default().map_data(Cow::Owned)?, @@ -254,42 +273,62 @@ impl Index { /// Writes the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. - pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { + pub(crate) fn put_fields_ids_map( + &self, + wtxn: &mut RwTxn, + map: &FieldsIdsMap, + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) } /// Returns the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, SerdeJson>( - rtxn, - main_key::FIELDS_IDS_MAP_KEY, - )?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, SerdeJson>(rtxn, main_key::FIELDS_IDS_MAP_KEY)? + .unwrap_or_default()) } /* fields distribution */ /// Writes the fields distribution which associates every field name with /// the number of times it occurs in the documents. - pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) + pub(crate) fn put_fields_distribution( + &self, + wtxn: &mut RwTxn, + distribution: &FieldsDistribution, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>( + wtxn, + main_key::FIELDS_DISTRIBUTION_KEY, + distribution, + ) } /// Returns the fields distribution which associates every field name with /// the number of times it occurs in the documents. pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result { - Ok(self.main.get::<_, Str, SerdeJson>( - rtxn, - main_key::FIELDS_DISTRIBUTION_KEY, - )?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, SerdeJson>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)? + .unwrap_or_default()) } /* displayed fields */ /// Writes the fields that must be displayed in the defined order. /// There must be not be any duplicate field id. 
- pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) + pub(crate) fn put_displayed_fields( + &self, + wtxn: &mut RwTxn, + fields: &[&str], + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>( + wtxn, + main_key::DISPLAYED_FIELDS_KEY, + &fields, + ) } /// Deletes the displayed fields ids, this will make the engine to display @@ -313,14 +352,17 @@ impl Index { for name in fields.into_iter() { match fields_ids_map.id(name) { Some(field_id) => fields_ids.push(field_id), - None => return Err(FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "Index::displayed_fields_ids", - }.into()), + None => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "Index::displayed_fields_ids", + } + .into()) + } } } Ok(Some(fields_ids)) - }, + } None => Ok(None), } } @@ -328,8 +370,16 @@ impl Index { /* searchable fields */ /// Writes the searchable fields, when this list is specified, only these are indexed. - pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { - self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) + pub(crate) fn put_searchable_fields( + &self, + wtxn: &mut RwTxn, + fields: &[&str], + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>( + wtxn, + main_key::SEARCHABLE_FIELDS_KEY, + &fields, + ) } /// Deletes the searchable fields, when no fields are specified, all fields are indexed. @@ -352,14 +402,17 @@ impl Index { for name in fields { match fields_ids_map.id(name) { Some(field_id) => fields_ids.push(field_id), - None => return Err(FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "Index::searchable_fields_ids", - }.into()), + None => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "Index::searchable_fields_ids", + } + .into()) + } } } Ok(Some(fields_ids)) - }, + } None => Ok(None), } } @@ -367,7 +420,11 @@ impl Index { /* filterable fields */ /// Writes the filterable fields names in the database. - pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { + pub(crate) fn put_filterable_fields( + &self, + wtxn: &mut RwTxn, + fields: &HashSet, + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) } @@ -378,10 +435,10 @@ impl Index { /// Returns the filterable fields names. pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result> { - Ok(self.main.get::<_, Str, SerdeJson<_>>( - rtxn, - main_key::FILTERABLE_FIELDS_KEY, - )?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, SerdeJson<_>>(rtxn, main_key::FILTERABLE_FIELDS_KEY)? + .unwrap_or_default()) } /// Identical to `filterable_fields`, but returns ids instead. 
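All of these settings accessors share one storage pattern: a constant key from `main_key`, a typed `put` into the polymorphic `main` database, and a typed `get` whose missing-key case falls back to the default value instead of erroring. A hedged sketch of the reader side, assuming an open `Index` named `index` and a read transaction `rtxn`:

    // A never-written setting decodes to its default (here, an empty set),
    // so callers don't have to distinguish "unset" from "empty".
    let filterable: HashSet<String> = index.filterable_fields(&rtxn)?;
    if filterable.contains("price") {
        // the "price" field may be used in filter expressions
    }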
@@ -394,11 +451,14 @@ impl Index { match fields_ids_map.id(&name) { Some(field_id) => { fields_ids.insert(field_id); - }, - None => return Err(FieldIdMapMissingEntry::FieldName { - field_name: name, - process: "Index::filterable_fields_ids", - }.into()), + } + None => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: name, + process: "Index::filterable_fields_ids", + } + .into()) + } } } @@ -413,9 +473,8 @@ impl Index { pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result> { let filterable_fields = self.filterable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; - let asc_desc_fields = self.criteria(rtxn)? - .into_iter() - .filter_map(|criterion| match criterion { + let asc_desc_fields = + self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { Criterion::Asc(field) | Criterion::Desc(field) => Some(field), _otherwise => None, }); @@ -439,11 +498,14 @@ impl Index { match fields_ids_map.id(&name) { Some(field_id) => { fields_ids.insert(field_id); - }, - None => return Err(FieldIdMapMissingEntry::FieldName { - field_name: name, - process: "Index::faceted_fields_ids", - }.into()), + } + None => { + return Err(FieldIdMapMissingEntry::FieldName { + field_name: name, + process: "Index::faceted_fields_ids", + } + .into()) + } } } @@ -458,8 +520,7 @@ impl Index { wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); @@ -472,8 +533,7 @@ impl Index { &self, rtxn: &RoTxn, field_id: FieldId, - ) -> heed::Result - { + ) -> heed::Result { let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); @@ -490,8 +550,7 @@ impl Index { wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); @@ -504,8 +563,7 @@ impl Index { &self, rtxn: &RoTxn, field_id: FieldId, - ) -> heed::Result - { + ) -> heed::Result { let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); @@ -518,7 +576,11 @@ impl Index { /* distinct field */ - pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> { + pub(crate) fn put_distinct_field( + &self, + wtxn: &mut RwTxn, + distinct_field: &str, + ) -> heed::Result<()> { self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) } @@ -532,7 +594,11 @@ impl Index { /* criteria */ - pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { + pub(crate) fn put_criteria( + &self, + wtxn: &mut RwTxn, + criteria: &[Criterion], + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) } @@ -550,7 +616,11 @@ impl Index { /* words fst */ /// Writes the FST which is the words dictionary of the engine. 
- pub(crate) fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_words_fst>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) } @@ -564,7 +634,11 @@ impl Index { /* stop words */ - pub(crate) fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_stop_words>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) } @@ -585,8 +659,7 @@ impl Index { &self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) } @@ -595,15 +668,17 @@ impl Index { } pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result, Vec>>> { - Ok(self.main.get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)?.unwrap_or_default()) + Ok(self + .main + .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)? + .unwrap_or_default()) } pub fn words_synonyms>( &self, rtxn: &RoTxn, words: &[S], - ) -> heed::Result>>> - { + ) -> heed::Result>>> { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms(rtxn)?.remove(&words)) } @@ -611,8 +686,16 @@ impl Index { /* words prefixes fst */ /// Writes the FST which is the words prefixes dictionnary of the engine. - pub(crate) fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { - self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) + pub(crate) fn put_words_prefixes_fst>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::WORDS_PREFIXES_FST_KEY, + fst.as_fst().as_bytes(), + ) } /// Returns the FST which is the words prefixes dictionnary of the engine. @@ -637,13 +720,14 @@ impl Index { pub fn documents<'t>( &self, rtxn: &'t RoTxn, - ids: impl IntoIterator, - ) -> Result)>> - { + ids: impl IntoIterator, + ) -> Result)>> { let mut documents = Vec::new(); for id in ids { - let kv = self.documents.get(rtxn, &BEU32::new(id))? + let kv = self + .documents + .get(rtxn, &BEU32::new(id))? .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?; documents.push((id, kv)); } @@ -673,7 +757,8 @@ impl Index { /// Returns the index creation time. pub fn created_at(&self, rtxn: &RoTxn) -> Result> { - Ok(self.main + Ok(self + .main .get::<_, Str, SerdeJson>>(rtxn, main_key::CREATED_AT_KEY)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, @@ -683,7 +768,8 @@ impl Index { /// Returns the index last updated time. pub fn updated_at(&self, rtxn: &RoTxn) -> Result> { - Ok(self.main + Ok(self + .main .get::<_, Str, SerdeJson>>(rtxn, main_key::UPDATED_AT_KEY)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, @@ -691,7 +777,11 @@ impl Index { })?) 
} - pub(crate) fn set_updated_at(&self, wtxn: &mut RwTxn, time: &DateTime<Utc>) -> heed::Result<()> { + pub(crate) fn set_updated_at( + &self, + wtxn: &mut RwTxn, + time: &DateTime<Utc>, + ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<DateTime<Utc>>>(wtxn, main_key::UPDATED_AT_KEY, &time) } } @@ -704,8 +794,8 @@ pub(crate) mod tests { use maplit::hashmap; use tempfile::TempDir; - use crate::Index; use crate::update::{IndexDocuments, UpdateFormat}; + use crate::Index; pub(crate) struct TempIndex { inner: Index, @@ -728,10 +818,7 @@ pub(crate) mod tests { options.map_size(100 * 4096); let _tempdir = TempDir::new_in(".").unwrap(); let inner = Index::new(options, _tempdir.path()).unwrap(); - Self { - inner, - _tempdir - } + Self { inner, _tempdir } } } @@ -756,10 +843,13 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); let fields_distribution = index.fields_distribution(&rtxn).unwrap(); - assert_eq!(fields_distribution, hashmap! { - "id".to_string() => 2, - "name".to_string() => 2, - "age".to_string() => 1, - }); + assert_eq!( + fields_distribution, + hashmap! { + "id".to_string() => 2, + "name".to_string() => 2, + "age".to_string() => 1, + } + ); } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index f37244114..201035a8a 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,14 +1,15 @@ -#[macro_use] extern crate pest_derive; +#[macro_use] +extern crate pest_derive; mod criterion; mod error; mod external_documents_ids; -mod fields_ids_map; -mod search; pub mod facet; +mod fields_ids_map; pub mod heed_codec; pub mod index; pub mod proximity; +mod search; pub mod tree_level; pub mod update; @@ -20,15 +21,17 @@ use std::result::Result as StdResult; use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; -pub use self::criterion::{Criterion, default_criteria}; +pub use self::criterion::{default_criteria, Criterion}; pub use self::error::Error; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; -pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec}; -pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; -pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; +pub use self::heed_codec::{ + BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, + CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, + RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, +}; pub use self::index::Index; -pub use self::search::{Search, FacetDistribution, FilterCondition, SearchResult, MatchingWords}; +pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult}; pub use self::tree_level::TreeLevel; pub type Result<T> = std::result::Result<T, Error>; @@ -54,9 +57,9 @@ pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReader, -) -> Result<Map<String, Value>> -{ - displayed_fields.iter() +) -> Result<Map<String, Value>> { + displayed_fields + .iter() .copied() .flat_map(|id| obkv.get(id).map(|value| (id, value))) .map(|(id, value)| { @@ -72,7 +75,6 @@ pub fn obkv_to_json( /// Transform a JSON value into a string that can be indexed.
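// A hedged illustration of what this helper should produce, judging from the
// recursive `inner` function below: scalars are stringified directly, arrays
// and objects are flattened into space-separated pieces, and None is returned
// when nothing indexable was written (for example, a bare null):
//
//     assert_eq!(json_to_string(&json!(["foo", 42])), Some("foo 42".to_string()));
//     assert_eq!(json_to_string(&json!(null)), None);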
pub fn json_to_string(value: &Value) -> Option { - fn inner(value: &Value, output: &mut String) -> bool { use std::fmt::Write; match value { @@ -90,7 +92,7 @@ pub fn json_to_string(value: &Value) -> Option { } // check that at least one value was written count != 0 - }, + } Value::Object(object) => { let mut buffer = String::new(); let mut count = 0; @@ -107,7 +109,7 @@ pub fn json_to_string(value: &Value) -> Option { } // check that at least one value was written count != 0 - }, + } } } @@ -121,9 +123,10 @@ pub fn json_to_string(value: &Value) -> Option { #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + #[test] fn json_to_string_object() { let value = json!({ diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs index 0186eb3d0..db98426a5 100644 --- a/milli/src/proximity.rs +++ b/milli/src/proximity.rs @@ -1,4 +1,5 @@ use std::cmp; + use crate::{Attribute, Position}; const ONE_ATTRIBUTE: u32 = 1000; @@ -15,8 +16,11 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { let (lhs_attr, lhs_index) = extract_position(lhs); let (rhs_attr, rhs_index) = extract_position(rhs); - if lhs_attr != rhs_attr { MAX_DISTANCE } - else { index_proximity(lhs_index, rhs_index) } + if lhs_attr != rhs_attr { + MAX_DISTANCE + } else { + index_proximity(lhs_index, rhs_index) + } } pub fn extract_position(position: Position) -> (Attribute, Position) { diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 95f77fd78..ccee2c393 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -5,12 +5,12 @@ use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; +use super::{Criterion, CriterionParameters, CriterionResult}; use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; -use super::{Criterion, CriterionParameters, CriterionResult}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. @@ -57,9 +57,8 @@ impl<'t> AscDesc<'t> { ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = fields_ids_map - .id(&field_name) - .ok_or_else(|| FieldIdMapMissingEntry::FieldName { + let field_id = + fields_ids_map.id(&field_name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { field_name: field_name.clone(), process: "AscDesc::new", })?; @@ -101,44 +100,47 @@ impl<'t> Criterion for AscDesc<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { - self.query_tree = query_tree; - let mut candidates = match (&self.query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - resolve_query_tree(&context, qt, params.wdcache)? - }, - (None, None) => self.index.documents_ids(self.rtxn)?, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + } + None => match self.parent.next(params)? 
{ + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + self.query_tree = query_tree; + let mut candidates = match (&self.query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + resolve_query_tree(&context, qt, params.wdcache)? } + (None, None) => self.index.documents_ids(self.rtxn)?, + }; - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, - } + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } - if candidates.is_empty() { - continue; - } + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } - self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = facet_ordered( - self.index, - self.rtxn, - self.field_id, - self.ascending, - candidates & &self.faceted_candidates, - )?; - }, - None => return Ok(None), + if candidates.is_empty() { + continue; + } + + self.allowed_candidates = &candidates - params.excluded_candidates; + self.candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.ascending, + candidates & &self.faceted_candidates, + )?; } + None => return Ok(None), }, Some(mut candidates) => { candidates -= params.excluded_candidates; @@ -170,11 +172,8 @@ fn facet_ordered<'t>( let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) } else { - let facet_fn = if ascending { - FacetIter::new_reducing - } else { - FacetIter::new_reverse_reducing - }; + let facet_fn = + if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing }; let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } @@ -194,9 +193,7 @@ fn iterative_facet_ordered_iter<'t>( for docid in candidates.iter() { let left = (field_id, docid, f64::MIN); let right = (field_id, docid, f64::MAX); - let mut iter = index - .field_id_docid_facet_f64s - .range(rtxn, &(left..=right))?; + let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; let entry = if ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? 
{ docids_values.push((docid, OrderedFloat(value))); diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index f191defe1..6e0bb40d5 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,15 +1,16 @@ -use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; -use std::collections::{BTreeMap, HashMap, btree_map}; +use std::borrow::Cow; +use std::cmp::{self, Ordering}; use std::collections::binary_heap::PeekMut; +use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap}; use std::mem::take; use roaring::RoaringBitmap; -use crate::{TreeLevel, Result, search::build_dfa}; +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::{word_derivations, WordDerivationsCache}; -use super::{Criterion, CriterionParameters, CriterionResult, Context, resolve_query_tree}; +use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; +use crate::{Result, TreeLevel}; /// To be able to divide integers by the number of words in the query /// we want to find a multiplier that allow us to divide by any number between 1 and 10. @@ -63,15 +64,19 @@ impl<'t> Criterion for Attribute<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { let current_buckets = match self.current_buckets.as_mut() { Some(current_buckets) => current_buckets, None => { - let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?; + let new_buckets = linear_compute_candidates( + self.ctx, + &flattened_query_tree, + &allowed_candidates, + )?; self.current_buckets.get_or_insert(new_buckets.into_iter()) - }, + } }; match current_buckets.next() { @@ -83,10 +88,15 @@ impl<'t> Criterion for Attribute<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } } } else { - match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? { + match set_compute_candidates( + self.ctx, + &flattened_query_tree, + &allowed_candidates, + params.wdcache, + )? { Some(candidates) => candidates, None => { return Ok(Some(CriterionResult { @@ -95,13 +105,14 @@ impl<'t> Criterion for Attribute<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } } }; allowed_candidates -= &found_candidates; - self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); + self.state = + Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); return Ok(Some(CriterionResult { query_tree: Some(query_tree), @@ -109,39 +120,50 @@ impl<'t> Criterion for Attribute<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- params.excluded_candidates, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? + - params.excluded_candidates } + }; - let flattened_query_tree = flatten_query_tree(&query_tree); + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, - } + let flattened_query_tree = flatten_query_tree(&query_tree); - self.state = Some((query_tree, flattened_query_tree, candidates)); - self.current_buckets = None; - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + self.state = Some((query_tree, flattened_query_tree, candidates)); + self.current_buckets = None; } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } } @@ -152,7 +174,9 @@ impl<'t> Criterion for Attribute<'t> { /// it will begin at the first non-empty interval and will return every interval without /// jumping over empty intervals. struct WordLevelIterator<'t, 'q> { - inner: Box> + 't>, + inner: Box< + dyn Iterator> + 't, + >, level: TreeLevel, interval_size: u32, word: Cow<'q, str>, @@ -162,49 +186,80 @@ struct WordLevelIterator<'t, 'q> { } impl<'t, 'q> WordLevelIterator<'t, 'q> { - fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result> { + fn new( + ctx: &'t dyn Context<'t>, + word: Cow<'q, str>, + in_prefix_cache: bool, + ) -> heed::Result> { match ctx.word_position_last_level(&word, in_prefix_cache)? 
{ - Some(level) => { + Some(level) => { let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); - let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; - Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) - }, + let inner = + ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; + Ok(Some(Self { + inner, + level, + interval_size, + word, + in_prefix_cache, + inner_next: None, + current_interval: None, + })) + } None => Ok(None), } } - fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option) -> heed::Result { + fn dig( + &self, + ctx: &'t dyn Context<'t>, + level: &TreeLevel, + left_interval: Option, + ) -> heed::Result { let level = *level.min(&self.level); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let word = self.word.clone(); let in_prefix_cache = self.in_prefix_cache; - let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; + let inner = + ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; - Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) + Ok(Self { + inner, + level, + interval_size, + word, + in_prefix_cache, + inner_next: None, + current_interval: None, + }) } fn next(&mut self) -> heed::Result> { - fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left } + fn is_next_interval(last_right: u32, next_left: u32) -> bool { + last_right + 1 == next_left + } let inner_next = match self.inner_next.take() { Some(inner_next) => Some(inner_next), - None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)), + None => self + .inner + .next() + .transpose()? + .map(|((_, _, left, right), docids)| (left, right, docids)), }; match inner_next { - Some((left, right, docids)) => { - match self.current_interval { - Some((last_left, last_right)) if !is_next_interval(last_right, left) => { - let blank_left = last_left + self.interval_size; - let blank_right = last_right + self.interval_size; - self.current_interval = Some((blank_left, blank_right)); - self.inner_next = Some((left, right, docids)); - Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) - }, - _ => { - self.current_interval = Some((left, right)); - Ok(Some((left, right, docids))) - } + Some((left, right, docids)) => match self.current_interval { + Some((last_left, last_right)) if !is_next_interval(last_right, left) => { + let blank_left = last_left + self.interval_size; + let blank_right = last_right + self.interval_size; + self.current_interval = Some((blank_left, blank_right)); + self.inner_next = Some((left, right, docids)); + Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) + } + _ => { + self.current_interval = Some((left, right)); + Ok(Some((left, right, docids))) } }, None => Ok(None), @@ -228,30 +283,37 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { ctx: &'t dyn Context<'t>, queries: &'q [Query], wdcache: &mut WordDerivationsCache, - ) -> Result> - { + ) -> Result> { let mut inner = Vec::with_capacity(queries.len()); for query in queries { match &query.kind { QueryKind::Exact { word, .. } => { if !query.prefix || ctx.in_prefix_cache(&word) { let word = Cow::Borrowed(query.kind.word()); - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? 
{ + if let Some(word_level_iterator) = + WordLevelIterator::new(ctx, word, query.prefix)? + { inner.push(word_level_iterator); } } else { - for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { + for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? + { let word = Cow::Owned(word.to_owned()); - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + if let Some(word_level_iterator) = + WordLevelIterator::new(ctx, word, false)? + { inner.push(word_level_iterator); } } } - }, + } QueryKind::Tolerant { typo, word } => { - for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { + for (word, _) in + word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? + { let word = Cow::Owned(word.to_owned()); - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { + if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? + { inner.push(word_level_iterator); } } } @@ -284,17 +346,28 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { Some(parent) => { let parent = parent.dig(ctx)?; (parent.level.min(self.level), Some(Box::new(parent))) - }, + } None => (self.level.saturating_sub(1), None), }; - let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten(); + let left_interval = self + .accumulator + .get(self.interval_to_skip) + .map(|opt| opt.as_ref().map(|(left, _, _)| *left)) + .flatten(); let mut inner = Vec::with_capacity(self.inner.len()); for word_level_iterator in self.inner.iter() { inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); } - Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0}) + Ok(Self { + parent, + inner, + level, + accumulator: vec![], + parent_accumulator: vec![], + interval_to_skip: 0, + }) } fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { @@ -305,12 +378,12 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { let wli_u8_level = Into::<u8>::into(wli.level); let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); for _ in 0..accumulated_count { - if let Some((next_left, _, next_docids)) = wli.next()? { - accumulated = match accumulated.take(){ + if let Some((next_left, _, next_docids)) = wli.next()? { + accumulated = match accumulated.take() { Some((acc_left, acc_right, mut acc_docids)) => { acc_docids |= next_docids; Some((acc_left, acc_right, acc_docids)) - }, + } None => Some((next_left, next_left + interval_size, next_docids)), }; } @@ -322,7 +395,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { /// Return the next meta-interval created from the inner WordLevelIterators, /// and from any chained QueryLevelIterator.
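/// Editorial sketch (not part of the patch), following the merge rule visible in the body below: if the inner accumulator holds the interval `(0, 4, {1, 2, 3})` and the parent accumulator holds `(4, 8, {2, 3, 5})`, the merged meta-interval is `(0 + 4, 4 + 8, {1, 2, 3} & {2, 3, 5}) = (4, 12, {2, 3})`; interval bounds are summed and docids are intersected.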
- fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { + fn next( + &mut self, + allowed_candidates: &RoaringBitmap, + tree_level: TreeLevel, + ) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { let parent_result = match self.parent.as_mut() { Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), None => None, }; @@ -335,22 +412,30 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { &self.parent_accumulator, &self.accumulator, self.interval_to_skip, - allowed_candidates + allowed_candidates, ); self.accumulator.push(inner_next); self.parent_accumulator.push(parent_next); let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; - for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) { + for current in self + .accumulator + .iter() + .rev() + .zip(self.parent_accumulator.iter()) + .skip(self.interval_to_skip) + { if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { match merged_interval.as_mut() { Some((_, _, merged_docids)) => *merged_docids |= a & b, - None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)), + None => { + merged_interval = Some((left_a + left_b, right_a + right_b, a & b)) + } } } } Ok(merged_interval) - }, + } None => { let level = self.level; match self.inner_next(level)? { @@ -358,12 +443,11 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; candidates &= allowed_candidates; Ok(Some((left, right, candidates))) - - }, + } None => { self.accumulator = vec![None]; Ok(None) - }, + } } } } @@ -379,16 +463,18 @@ fn interval_to_skip( already_skiped: usize, allowed_candidates: &RoaringBitmap, ) -> usize { - parent_accumulator.iter() + parent_accumulator + .iter() .zip(current_accumulator.iter()) .skip(already_skiped) .take_while(|(parent, current)| { let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); - let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); + let skip_current = current + .as_ref() + .map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); skip_parent && skip_current }) .count() - } /// A Branch represents a possible alternative of the original query and is built from the Query Tree, @@ -410,7 +496,7 @@ impl<'t, 'q> Branch<'t, 'q> { self.last_result = last_result; self.tree_level = tree_level; Ok(true) - }, + } None => Ok(false), } } @@ -429,7 +515,7 @@ impl<'t, 'q> Branch<'t, 'q> { let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); let (left, right, _) = self.last_result; - self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); + self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); } /// Return the score of the current inner interval. @@ -477,31 +563,31 @@ fn initialize_query_level_iterators<'t, 'q>( allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, ) -> Result<BinaryHeap<Branch<'t, 'q>>> { - let mut positions = BinaryHeap::with_capacity(branches.len()); for branch in branches { let mut branch_positions = Vec::with_capacity(branch.len()); - for queries in branch { + for queries in branch { match QueryLevelIterator::new(ctx, queries, wdcache)? { Some(qli) => branch_positions.push(qli), None => { // the branch seems to be invalid, so we skip it. branch_positions.clear(); break; - }, + } } } // QueryLevelIterators need to be sorted by level and folded in descending order.
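// Editorial sketch (not part of the patch): with `branch_positions` sorted by ascending level as [q0, q1, q2], the fold below returns q2 carrying the parent chain q2 -> q1 -> q0, so calling `next` on the head recursively pulls and merges intervals from the whole chain.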
branch_positions.sort_unstable_by_key(|qli| qli.level); - let folded_query_level_iterators = branch_positions - .into_iter() - .fold(None, |fold: Option, mut qli| match fold { - Some(fold) => { - qli.parent(fold); - Some(qli) - }, - None => Some(qli), - }); + let folded_query_level_iterators = + branch_positions.into_iter().fold(None, |fold: Option, mut qli| { + match fold { + Some(fold) => { + qli.parent(fold); + Some(qli) + } + None => Some(qli), + } + }); if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { let tree_level = folded_query_level_iterators.level; @@ -526,9 +612,9 @@ fn set_compute_candidates<'t>( branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> Result> -{ - let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; +) -> Result> { + let mut branches_heap = + initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; let lowest_level = TreeLevel::min_value(); let mut final_candidates: Option<(u32, RoaringBitmap)> = None; let mut allowed_candidates = allowed_candidates.clone(); @@ -539,15 +625,18 @@ fn set_compute_candidates<'t>( // if current is worst than best we break to return // candidates that correspond to the best rank if let Some((best_rank, _)) = final_candidates { - if branch_rank > best_rank { break } + if branch_rank > best_rank { + break; + } } let _left = branch.last_result.0; let candidates = take(&mut branch.last_result.2); if candidates.is_empty() { // we don't have candidates, get next interval. - if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } - } - else if is_lowest_level { + if !branch.next(&allowed_candidates)? { + PeekMut::pop(branch); + } + } else if is_lowest_level { // we have candidates, but we can't dig deeper. allowed_candidates -= &candidates; final_candidates = match final_candidates.take() { @@ -556,19 +645,20 @@ fn set_compute_candidates<'t>( best_candidates |= candidates; branch.lazy_next(); Some((best_rank, best_candidates)) - }, + } // we take current candidates as best candidates None => { branch.lazy_next(); Some((branch_rank, candidates)) - }, + } }; } else { // we have candidates, lets dig deeper in levels. branch.dig(ctx)?; - if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } + if !branch.next(&allowed_candidates)? { + PeekMut::pop(branch); + } } - } Ok(final_candidates.map(|(_rank, candidates)| candidates)) @@ -578,9 +668,11 @@ fn linear_compute_candidates( ctx: &dyn Context, branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, -) -> Result> -{ - fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap) -> u64 { +) -> Result> { + fn compute_candidate_rank( + branches: &FlattenedQueryTree, + words_positions: HashMap, + ) -> u64 { let mut min_rank = u64::max_value(); for branch in branches { let branch_len = branch.len(); @@ -593,17 +685,20 @@ fn linear_compute_candidates( QueryKind::Exact { word, .. 
} => { if *prefix { word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()).min() + .flat_map(|positions| positions.iter().next()) + .min() } else { - words_positions.get(word) + words_positions + .get(word) .map(|positions| positions.iter().next()) .flatten() } - }, + } QueryKind::Tolerant { typo, word } => { word_derivations(word, *prefix, *typo, &words_positions) - .flat_map(|positions| positions.iter().next()).min() - }, + .flat_map(|positions| positions.iter().next()) + .min() + } }; match (position, current_position) { @@ -627,9 +722,11 @@ fn linear_compute_candidates( branch_rank.sort_unstable(); // because several words in same query can't match all a the position 0, // we substract the word index to the position. - let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); + let branch_rank: u64 = + branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); // here we do the means of the words of the branch - min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); + min_rank = + min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); } } @@ -641,8 +738,7 @@ fn linear_compute_candidates( is_prefix: bool, max_typo: u8, words_positions: &'a HashMap, - ) -> impl Iterator - { + ) -> impl Iterator { let dfa = build_dfa(word, max_typo, is_prefix); words_positions.iter().filter_map(move |(document_word, positions)| { use levenshtein_automata::Distance; @@ -680,25 +776,26 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { } } out - }, + } None => recurse(head), } } fn recurse(op: &Operation) -> FlattenedQueryTree { match op { - And(ops) => { - ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) - }, - Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { - vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] - } else { - ops.iter().map(recurse).flatten().collect() - }, + And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)), + Or(_, ops) => { + if ops.iter().all(|op| op.query().is_some()) { + vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] + } else { + ops.iter().map(recurse).flatten().collect() + } + } Phrase(words) => { - let queries = words.iter().map(|word| { - vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}] - }).collect(); + let queries = words + .iter() + .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) + .collect(); vec![queries] } Operation::Query(query) => vec![vec![vec![query.clone()]]], @@ -712,28 +809,43 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { mod tests { use big_s::S; - use crate::search::criteria::QueryKind; use super::*; + use crate::search::criteria::QueryKind; #[test] fn simple_flatten_query_tree() { - let query_tree = Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }), - 
Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), - ]), + let query_tree = Operation::Or( + false, + vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), ]), - ]), - ]); + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("thefish")), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("the")), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("fish")), + }), + ]), + ], + ), + ]), + ], + ); let expected = vec![ vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index eb44b7b8e..1e4d4e7a2 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -2,19 +2,15 @@ use std::convert::TryFrom; use std::mem::take; use std::ops::BitOr; +use itertools::Itertools; use log::debug; use roaring::RoaringBitmap; -use itertools::Itertools; -use crate::search::query_tree::{Operation, PrimitiveQueryPart}; use crate::search::criteria::{ - Context, - Criterion, - CriterionParameters, - CriterionResult, - resolve_query_tree, + resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; -use crate::{TreeLevel, Result}; +use crate::search::query_tree::{Operation, PrimitiveQueryPart}; +use crate::{Result, TreeLevel}; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -26,7 +22,11 @@ pub struct Exactness<'t> { } impl<'t> Exactness<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box, primitive_query: &[PrimitiveQueryPart]) -> heed::Result { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + primitive_query: &[PrimitiveQueryPart], + ) -> heed::Result { let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); for part in primitive_query { query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); @@ -59,7 +59,7 @@ impl<'t> Criterion for Exactness<'t> { // reset state self.state = None; self.query_tree = None; - }, + } Some(state) => { let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; self.state = state; @@ -70,40 +70,51 @@ impl<'t> Criterion for Exactness<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
+ - params.excluded_candidates } + }; - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, - } + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } - self.state = Some(State::new(candidates)); - self.query_tree = Some(query_tree); - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + self.state = Some(State::new(candidates)); + self.query_tree = Some(query_tree); } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } - } + } } } @@ -125,9 +136,9 @@ impl State { fn difference_with(&mut self, lhs: &RoaringBitmap) { match self { - Self::ExactAttribute(candidates) | - Self::AttributeStartsWith(candidates) | - Self::ExactWords(candidates) => *candidates -= lhs, + Self::ExactAttribute(candidates) + | Self::AttributeStartsWith(candidates) + | Self::ExactWords(candidates) => *candidates -= lhs, Self::Remainings(candidates_array) => { candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); candidates_array.retain(|candidates| !candidates.is_empty()); @@ -137,9 +148,9 @@ impl State { fn is_empty(&self) -> bool { match self { - Self::ExactAttribute(candidates) | - Self::AttributeStartsWith(candidates) | - Self::ExactWords(candidates) => candidates.is_empty(), + Self::ExactAttribute(candidates) + | Self::AttributeStartsWith(candidates) + | Self::ExactWords(candidates) => candidates.is_empty(), Self::Remainings(candidates_array) => { candidates_array.iter().all(RoaringBitmap::is_empty) } @@ -158,8 +169,7 @@ fn resolve_state( ctx: &dyn Context, state: State, query: &[ExactQueryPart], -) -> Result<(RoaringBitmap, Option)> -{ +) -> Result<(RoaringBitmap, Option)> { use State::*; match state { ExactAttribute(mut allowed_candidates) => { @@ -167,8 +177,11 @@ fn resolve_state( if let Ok(query_len) = u8::try_from(query.len()) { let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { - if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { - let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; + if let Some(attribute_allowed_docids) = + ctx.field_id_word_count_docids(id, query_len)? 
+ { + let mut attribute_candidates_array = + attribute_start_with_docids(ctx, id as u32, query)?; attribute_candidates_array.push(attribute_allowed_docids); candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -181,12 +194,13 @@ fn resolve_state( } Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) - }, + } AttributeStartsWith(mut allowed_candidates) => { let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { - let attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; + let attribute_candidates_array = + attribute_start_with_docids(ctx, id as u32, query)?; candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -195,7 +209,7 @@ fn resolve_state( // remove current candidates from allowed candidates allowed_candidates -= &candidates; Ok((candidates, Some(ExactWords(allowed_candidates)))) - }, + } ExactWords(mut allowed_candidates) => { let number_of_part = query.len(); let mut parts_candidates_array = Vec::with_capacity(number_of_part); @@ -210,7 +224,7 @@ fn resolve_state( candidates |= synonym_candidates; } } - }, + } // compute intersection on pair of words with a proximity of 0. Phrase(phrase) => { let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); @@ -220,8 +234,8 @@ fn resolve_state( Some(docids) => bitmaps.push(docids), None => { bitmaps.clear(); - break - }, + break; + } } } } @@ -247,7 +261,7 @@ fn resolve_state( // intersect each word candidates in combinations .map(intersection_of) // union combinations of `c_count` exact words - .fold(RoaringBitmap::new(), RoaringBitmap::bitor); + .fold(RoaringBitmap::new(), RoaringBitmap::bitor); // only keep allowed candidates combinations_candidates &= &allowed_candidates; // remove current candidates from allowed candidates @@ -261,7 +275,7 @@ fn resolve_state( candidates_array.reverse(); Ok((all_exact_candidates, Some(Remainings(candidates_array)))) - }, + } // pop remainings candidates until the emptiness Remainings(mut candidates_array) => { let candidates = candidates_array.pop().unwrap_or_default(); @@ -270,12 +284,15 @@ fn resolve_state( } else { Ok((candidates, None)) } - }, - + } } } -fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[ExactQueryPart]) -> heed::Result> { +fn attribute_start_with_docids( + ctx: &dyn Context, + attribute_id: u32, + query: &[ExactQueryPart], +) -> heed::Result> { let lowest_level = TreeLevel::min_value(); let mut attribute_candidates_array = Vec::new(); // start from attribute first position @@ -293,7 +310,7 @@ fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[Ex } attribute_candidates_array.push(synonyms_candidates); pos += 1; - }, + } Phrase(phrase) => { for word in phrase { let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; @@ -325,24 +342,30 @@ pub enum ExactQueryPart { } impl ExactQueryPart { - fn from_primitive_query_part(ctx: &dyn Context, part: &PrimitiveQueryPart) -> heed::Result { + fn from_primitive_query_part( + ctx: &dyn Context, + part: &PrimitiveQueryPart, + ) -> heed::Result { let part = match part { PrimitiveQueryPart::Word(word, _) => { match ctx.synonyms(word)? { Some(synonyms) => { - let mut synonyms: Vec<_> = synonyms.into_iter().filter_map(|mut array| { - // keep 1 word synonyms only. 
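// Editorial sketch (not part of the patch): given the synonyms [["new", "york"], ["ny"]] for the word "nyc", the closure below drops the multi-word entry, keeps "ny", and the word itself is appended afterwards, yielding ExactQueryPart::Synonyms(["ny", "nyc"]).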
- match array.pop() { - Some(word) if array.is_empty() => Some(word), - _ => None, - } - }).collect(); + let mut synonyms: Vec<_> = synonyms + .into_iter() + .filter_map(|mut array| { + // keep 1 word synonyms only. + match array.pop() { + Some(word) if array.is_empty() => Some(word), + _ => None, + } + }) + .collect(); synonyms.push(word.clone()); ExactQueryPart::Synonyms(synonyms) - }, + } None => ExactQueryPart::Synonyms(vec![word.clone()]), } - }, + } PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), }; diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index 645a3a5d7..bd3244143 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -1,10 +1,10 @@ use log::debug; use roaring::RoaringBitmap; -use crate::Result; +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Criterion, CriterionResult, CriterionParameters, Context}; +use crate::Result; /// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] @@ -26,7 +26,12 @@ pub struct Final<'t> { impl<'t> Final<'t> { pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { - Final { ctx, parent, wdcache: WordDerivationsCache::new(), returned_candidates: RoaringBitmap::new() } + Final { + ctx, + parent, + wdcache: WordDerivationsCache::new(), + returned_candidates: RoaringBitmap::new(), + } } #[logging_timer::time("Final::{}")] @@ -40,10 +45,17 @@ impl<'t> Final<'t> { }; match self.parent.next(&mut criterion_parameters)? { - Some(CriterionResult { query_tree, candidates, filtered_candidates, bucket_candidates }) => { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + bucket_candidates, + }) => { let mut candidates = match (candidates, query_tree.as_ref()) { (Some(candidates), _) => candidates, - (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates, + (None, Some(qt)) => { + resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates + } (None, None) => self.ctx.documents_ids()? 
- excluded_candidates, }; @@ -56,7 +68,7 @@ impl<'t> Final<'t> { self.returned_candidates |= &candidates; Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) - }, + } None => Ok(None), } } diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index e6d0a17f7..514dbff96 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,15 +1,18 @@ use roaring::RoaringBitmap; -use crate::Result; +use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::query_tree::Operation; -use super::{Criterion, CriterionResult, CriterionParameters}; +use crate::Result; pub struct Initial { - answer: Option + answer: Option, } impl Initial { - pub fn new(query_tree: Option, filtered_candidates: Option) -> Initial { + pub fn new( + query_tree: Option, + filtered_candidates: Option, + ) -> Initial { let answer = CriterionResult { query_tree, candidates: None, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 48af0b8aa..228d48bd7 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,29 +1,28 @@ -use std::collections::HashMap; use std::borrow::Cow; +use std::collections::HashMap; use roaring::RoaringBitmap; -use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}}; -use crate::{Index, DocumentId, Result}; - -use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use self::asc_desc::AscDesc; use self::attribute::Attribute; use self::exactness::Exactness; -use self::r#final::Final; use self::initial::Initial; use self::proximity::Proximity; +use self::r#final::Final; use self::typo::Typo; use self::words::Words; +use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; +use crate::search::{word_derivations, WordDerivationsCache}; +use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; mod asc_desc; mod attribute; mod exactness; +pub mod r#final; mod initial; mod proximity; mod typo; mod words; -pub mod r#final; pub trait Criterion { fn next(&mut self, params: &mut CriterionParameters) -> Result>; @@ -55,7 +54,7 @@ pub struct CriterionParameters<'a> { #[derive(Debug)] enum Candidates { Allowed(RoaringBitmap), - Forbidden(RoaringBitmap) + Forbidden(RoaringBitmap), } impl Default for Candidates { @@ -68,17 +67,55 @@ pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; - fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; - fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; + fn word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; + fn word_prefix_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; - fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; - fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; - fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; + fn docid_words_positions( + &self, + docid: DocumentId, + ) -> heed::Result>; + fn word_position_iterator( + &self, + word: &str, + level: TreeLevel, + in_prefix_cache: bool, + 
left: Option, + right: Option, + ) -> heed::Result< + Box< + dyn Iterator> + 'c, + >, + >; + fn word_position_last_level( + &self, + word: &str, + in_prefix_cache: bool, + ) -> heed::Result>; fn synonyms(&self, word: &str) -> heed::Result>>>; - fn searchable_fields_ids(&self) -> Result>; - fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result>; - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result>; + fn searchable_fields_ids(&self) -> Result>; + fn field_id_word_count_docids( + &self, + field_id: FieldId, + word_count: u8, + ) -> heed::Result>; + fn word_level_position_docids( + &self, + word: &str, + level: TreeLevel, + left: u32, + right: u32, + ) -> heed::Result>; } pub struct CriteriaBuilder<'t> { @@ -101,12 +138,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.word_prefix_docids.get(self.rtxn, &word) } - fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + fn word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { let key = (left, right, proximity); self.index.word_pair_proximity_docids.get(self.rtxn, &key) } - fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + fn word_prefix_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { let key = (left, right, proximity); self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) } @@ -119,7 +166,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.words_prefixes_fst.contains(word) } - fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + fn docid_words_positions( + &self, + docid: DocumentId, + ) -> heed::Result> { let mut words_positions = HashMap::new(); for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { let ((_, word), positions) = result?; @@ -134,9 +184,12 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { level: TreeLevel, in_prefix_cache: bool, left: Option, - right: Option - ) -> heed::Result> + 'c>> - { + right: Option, + ) -> heed::Result< + Box< + dyn Iterator> + 'c, + >, + > { let range = { let left = left.unwrap_or(u32::min_value()); let right = right.unwrap_or(u32::max_value()); @@ -152,7 +205,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { Ok(Box::new(db.range(self.rtxn, &range)?)) } - fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result> { + fn word_position_last_level( + &self, + word: &str, + in_prefix_cache: bool, + ) -> heed::Result> { let range = { let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); @@ -164,7 +221,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { }; let last_level = db .remap_data_type::() - .range(self.rtxn, &range)?.last().transpose()? + .range(self.rtxn, &range)? + .last() + .transpose()? 
.map(|((_, level, _, _), _)| level); Ok(last_level) @@ -181,12 +240,22 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { } } - fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result> { + fn field_id_word_count_docids( + &self, + field_id: FieldId, + word_count: u8, + ) -> heed::Result> { let key = (field_id, word_count); self.index.field_id_word_count_docids.get(self.rtxn, &key) } - fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> heed::Result> { + fn word_level_position_docids( + &self, + word: &str, + level: TreeLevel, + left: u32, + right: u32, + ) -> heed::Result> { let key = (word, level, left, right); self.index.word_level_position_docids.get(self.rtxn, &key) } @@ -204,13 +273,13 @@ impl<'t> CriteriaBuilder<'t> { query_tree: Option, primitive_query: Option>, filtered_candidates: Option, - ) -> Result> - { + ) -> Result> { use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); - let mut criterion = Box::new(Initial::new(query_tree, filtered_candidates)) as Box; + let mut criterion = + Box::new(Initial::new(query_tree, filtered_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { Name::Typo => Box::new(Typo::new(self, criterion)), @@ -218,8 +287,12 @@ impl<'t> CriteriaBuilder<'t> { Name::Proximity => Box::new(Proximity::new(self, criterion)), Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), - Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), - Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), + Name::Asc(field) => { + Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) + } + Name::Desc(field) => { + Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) 
+ } }; } @@ -231,21 +304,20 @@ pub fn resolve_query_tree<'t>( ctx: &'t dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, - ) -> Result - { - use Operation::{And, Phrase, Or, Query}; + ) -> Result { + use Operation::{And, Or, Phrase, Query}; match query_tree { And(ops) => { - let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, wdcache) - }).collect::>>()?; + let mut ops = ops + .iter() + .map(|op| resolve_operation(ctx, op, wdcache)) + .collect::>>()?; ops.sort_unstable_by_key(|cds| cds.len()); @@ -260,7 +332,7 @@ pub fn resolve_query_tree<'t>( } } Ok(candidates) - }, + } Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; @@ -276,12 +348,12 @@ pub fn resolve_query_tree<'t>( } else { candidates &= pair_docids; } - }, - None => return Ok(RoaringBitmap::new()) + } + None => return Ok(RoaringBitmap::new()), } } Ok(candidates) - }, + } Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { @@ -289,7 +361,7 @@ pub fn resolve_query_tree<'t>( candidates.union_with(&docids); } Ok(candidates) - }, + } Query(q) => Ok(query_docids(ctx, q, wdcache)?), } } @@ -297,18 +369,18 @@ pub fn resolve_query_tree<'t>( resolve_operation(ctx, query_tree, wdcache) } - fn all_word_pair_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], right_words: &[(U, u8)], - proximity: u8 -) -> Result -{ + proximity: u8, +) -> Result { let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { - let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + let current_docids = ctx + .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? + .unwrap_or_default(); docids.union_with(¤t_docids); } } @@ -319,8 +391,7 @@ fn query_docids( ctx: &dyn Context, query: &Query, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { match &query.kind { QueryKind::Exact { word, .. } => { if query.prefix && ctx.in_prefix_cache(&word) { @@ -336,7 +407,7 @@ fn query_docids( } else { Ok(ctx.word_docids(&word)?.unwrap_or_default()) } - }, + } QueryKind::Tolerant { typo, word } => { let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); @@ -345,7 +416,7 @@ fn query_docids( docids.union_with(¤t_docids); } Ok(docids) - }, + } } } @@ -355,8 +426,7 @@ fn query_pair_proximity_docids( right: &Query, proximity: u8, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { if proximity >= 8 { let mut candidates = query_docids(ctx, left, wdcache)?; let right_candidates = query_docids(ctx, right, wdcache)?; @@ -368,20 +438,31 @@ fn query_pair_proximity_docids( match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix && ctx.in_prefix_cache(&right) { - Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + Ok(ctx + .word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? 
+ .unwrap_or_default()) } else if prefix { let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } else { - Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + Ok(ctx + .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? + .unwrap_or_default()) } - }, + } (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); + let l_words = + word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix && ctx.in_prefix_cache(&right) { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + let current_docids = ctx + .word_prefix_pair_proximity_docids( + left.as_ref(), + right.as_ref(), + proximity, + )? + .unwrap_or_default(); docids.union_with(¤t_docids); } Ok(docids) @@ -391,28 +472,36 @@ fn query_pair_proximity_docids( } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } - }, + } (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) - }, - (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { - let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); + } + ( + QueryKind::Tolerant { typo: l_typo, word: left }, + QueryKind::Tolerant { typo: r_typo, word: right }, + ) => { + let l_words = + word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) - }, + } } } #[cfg(test)] pub mod test { - use maplit::hashmap; - use rand::{Rng, SeedableRng, rngs::StdRng}; - - use super::*; use std::collections::HashMap; - fn s(s: &str) -> String { s.to_string() } + use maplit::hashmap; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + + use super::*; + + fn s(s: &str) -> String { + s.to_string() + } pub struct TestContext<'t> { words_fst: fst::Set>, word_docids: HashMap, @@ -435,12 +524,22 @@ pub mod test { Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) } - fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + fn word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { let key = (left.to_string(), right.to_string(), proximity.into()); Ok(self.word_pair_proximity_docids.get(&key).cloned()) } - fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + fn word_prefix_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { let key = (left.to_string(), right.to_string(), proximity.into()); Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) } @@ -453,24 +552,44 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + fn docid_words_positions( + &self, + docid: DocumentId, + ) -> heed::Result> { if let 
Some(docid_words) = self.docid_words.get(&docid) { Ok(docid_words .iter() .enumerate() - .map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) - .collect() - ) + .map(|(i, w)| { + (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))) + }) + .collect()) } else { Ok(HashMap::new()) } } - fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option, _right: Option) -> heed::Result> + 'c>> { + fn word_position_iterator( + &self, + _word: &str, + _level: TreeLevel, + _in_prefix_cache: bool, + _left: Option, + _right: Option, + ) -> heed::Result< + Box< + dyn Iterator> + + 'c, + >, + > { todo!() } - fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result> { + fn word_position_last_level( + &self, + _word: &str, + _in_prefix_cache: bool, + ) -> heed::Result> { todo!() } @@ -478,15 +597,25 @@ pub mod test { todo!() } - fn searchable_fields_ids(&self) -> Result> { + fn searchable_fields_ids(&self) -> Result> { todo!() } - fn word_level_position_docids(&self, _word: &str, _level: TreeLevel, _left: u32, _right: u32) -> heed::Result> { + fn word_level_position_docids( + &self, + _word: &str, + _level: TreeLevel, + _left: u32, + _right: u32, + ) -> heed::Result> { todo!() } - fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result> { + fn field_id_word_count_docids( + &self, + _field_id: FieldId, + _word_count: u8, + ) -> heed::Result> { todo!() } } @@ -506,7 +635,7 @@ pub mod test { RoaringBitmap::from_sorted_iter(values.into_iter()) } - let word_docids = hashmap!{ + let word_docids = hashmap! { s("hello") => random_postings(rng, 1500), s("hi") => random_postings(rng, 4000), s("word") => random_postings(rng, 2500), @@ -530,7 +659,7 @@ pub mod test { } } - let word_prefix_docids = hashmap!{ + let word_prefix_docids = hashmap! 
{ s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], @@ -540,7 +669,9 @@ pub mod test { let mut word_prefix_pair_proximity_docids = HashMap::new(); for (lword, lcandidates) in &word_docids { for (rword, rcandidates) in &word_docids { - if lword == rword { continue } + if lword == rword { + continue; + } let candidates = lcandidates & rcandidates; for candidate in candidates { if let Some(docid_words) = docid_words.get(&candidate) { @@ -551,24 +682,31 @@ pub mod test { } else { (s(lword), s(rword), (lposition - rposition + 1) as i32) }; - let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + let docids = word_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); docids.push(candidate); } } } for (pword, pcandidates) in &word_prefix_docids { - if lword.starts_with(pword) { continue } + if lword.starts_with(pword) { + continue; + } let candidates = lcandidates & pcandidates; for candidate in candidates { if let Some(docid_words) = docid_words.get(&candidate) { let lposition = docid_words.iter().position(|w| w == lword).unwrap(); - let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); + let rposition = + docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); let key = if lposition < rposition { (s(lword), s(pword), (rposition - lposition) as i32) } else { (s(lword), s(pword), (lposition - rposition + 1) as i32) }; - let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); + let docids = word_prefix_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); docids.push(candidate); } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index c3c8027cb..3e8196e93 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -2,22 +2,16 @@ use std::collections::btree_map::{self, BTreeMap}; use std::collections::hash_map::HashMap; use std::mem::take; -use roaring::RoaringBitmap; use log::debug; +use roaring::RoaringBitmap; -use crate::search::query_tree::{maximum_proximity, Operation, Query}; -use crate::search::{build_dfa, WordDerivationsCache}; -use crate::search::{query_tree::QueryKind}; -use crate::{DocumentId, Position, Result}; use super::{ - Context, - Criterion, - CriterionParameters, - CriterionResult, - query_docids, - query_pair_proximity_docids, - resolve_query_tree, + query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion, + CriterionParameters, CriterionResult, }; +use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; +use crate::search::{build_dfa, WordDerivationsCache}; +use crate::{DocumentId, Position, Result}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; @@ -63,28 +57,33 @@ impl<'t> Criterion for Proximity<'t> { } loop { - debug!("Proximity at iteration {} (max prox {:?}) ({:?})", + debug!( + "Proximity at iteration {} (max prox {:?}) ({:?})", self.proximity, self.state.as_ref().map(|(mp, _, _)| mp), self.state.as_ref().map(|(_, _, cd)| cd), ); match &mut self.state { - Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => { + Some((max_prox, _, allowed_candidates)) + if allowed_candidates.is_empty() || self.proximity > *max_prox => + { self.state = None; // reset state - }, + } Some((_, query_tree, 
allowed_candidates)) => { - let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { + let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD + && self.proximity > PROXIMITY_THRESHOLD + { if let Some(cache) = self.plane_sweep_cache.as_mut() { match cache.next() { Some((p, candidates)) => { self.proximity = p; candidates - }, + } None => { self.state = None; // reset state - continue - }, + continue; + } } } else { let cache = resolve_plane_sweep_candidates( @@ -95,9 +94,10 @@ impl<'t> Criterion for Proximity<'t> { )?; self.plane_sweep_cache = Some(cache.into_iter()); - continue + continue; } - } else { // use set theory based algorithm + } else { + // use set theory based algorithm resolve_candidates( self.ctx, &query_tree, @@ -117,39 +117,50 @@ impl<'t> Criterion for Proximity<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? + - params.excluded_candidates } + }; - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, - } + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } - let maximum_proximity = maximum_proximity(&query_tree); - self.state = Some((maximum_proximity as u8, query_tree, candidates)); - self.proximity = 0; - self.plane_sweep_cache = None; - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + let maximum_proximity = maximum_proximity(&query_tree); + self.state = Some((maximum_proximity as u8, query_tree, candidates)); + self.proximity = 0; + self.plane_sweep_cache = None; } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } } @@ -162,46 +173,48 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> Result> - { - use Operation::{And, Phrase, Or}; + ) -> Result> { + use Operation::{And, Or, Phrase}; let result = match query_tree { And(ops) => mdfs(ctx, ops, 
proximity, cache, wdcache)?, - Phrase(words) => if proximity == 0 { - let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let mut candidates = None; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); - match ctx.word_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => { - match candidates.as_mut() { + Phrase(words) => { + if proximity == 0 { + let most_left = words + .first() + .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); + let most_right = words + .last() + .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); + let mut candidates = None; + for slice in words.windows(2) { + let (left, right) = (&slice[0], &slice[1]); + match ctx.word_pair_proximity_docids(left, right, 1)? { + Some(pair_docids) => match candidates.as_mut() { Some(candidates) => *candidates &= pair_docids, None => candidates = Some(pair_docids), + }, + None => { + candidates = None; + break; } - }, - None => { - candidates = None; - break; } } + match (most_left, most_right, candidates) { + (Some(l), Some(r), Some(c)) => vec![(l, r, c)], + _otherwise => Default::default(), + } + } else { + Default::default() } - match (most_left, most_right, candidates) { - (Some(l), Some(r), Some(c)) => vec![(l, r, c)], - _otherwise => Default::default(), - } - } else { - Default::default() - }, + } Or(_, ops) => { let mut output = Vec::new(); for op in ops { @@ -209,13 +222,15 @@ fn resolve_candidates<'t>( output.extend(result); } output - }, - Operation::Query(q) => if proximity == 0 { - let candidates = query_docids(ctx, q, wdcache)?; - vec![(q.clone(), q.clone(), candidates)] - } else { - Default::default() - }, + } + Operation::Query(q) => { + if proximity == 0 { + let candidates = query_docids(ctx, q, wdcache)?; + vec![(q.clone(), q.clone(), candidates)] + } else { + Default::default() + } + } }; Ok(result) @@ -228,8 +243,7 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> Result> - { + ) -> Result> { fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { (0..=mana.min(left_max)).map(move |m| (m, mana - m)) } @@ -257,7 +271,8 @@ fn resolve_candidates<'t>( for (ll, lr, lcandidates) in lefts { for (rl, rr, rcandidates) in rights { - let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; + let mut candidates = + query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; if lcandidates.len() < rcandidates.len() { candidates.intersect_with(lcandidates); candidates.intersect_with(rcandidates); @@ -282,22 +297,26 @@ fn resolve_candidates<'t>( proximity: u8, cache: &mut Cache, wdcache: &mut WordDerivationsCache, - ) -> Result> - { + ) -> Result> { // Extract the first two elements but gives the tail // that is just after the first element. 
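The `pair_combinations` helper above enumerates every way to split a proximity budget (the "mana") between the left and right operands of an AND. A minimal stand-alone sketch of that split, with a tiny driver; in the real code the helper is private to `resolve_candidates`:

    // Yields every (left, right) split of `mana`, capping the left share.
    fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
        (0..=mana.min(left_max)).map(move |m| (m, mana - m))
    }

    fn main() {
        // A budget of 3 with a left maximum of 2 yields (0, 3), (1, 2), (2, 1).
        for (left, right) in pair_combinations(3, 2) {
            println!("left gets {left}, right gets {right}");
        }
    }

The surrounding function then tries each split, intersecting the left-hand results with the right-hand ones fetched through `query_pair_proximity_docids`.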
- let next = branches.split_first().map(|(h1, t)| { - (h1, t.split_first().map(|(h2, _)| (h2, t))) - }); + let next = + branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t)))); match next { - Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache, wdcache), + Some((head1, Some((head2, [_])))) => { + mdfs_pair(ctx, head1, head2, proximity, cache, wdcache) + } Some((head1, Some((head2, tail)))) => { let mut output = Vec::new(); for p in 0..=proximity { - for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache, wdcache)? { + for (lhead, _, head_candidates) in + mdfs_pair(ctx, head1, head2, p, cache, wdcache)? + { if !head_candidates.is_empty() { - for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? { + for (_, rtail, mut candidates) in + mdfs(ctx, tail, proximity - p, cache, wdcache)? + { candidates.intersect_with(&head_candidates); if !candidates.is_empty() { output.push((lhead.clone(), rtail, candidates)); @@ -307,7 +326,7 @@ fn resolve_candidates<'t>( } } Ok(output) - }, + } Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache), None => Ok(Default::default()), } @@ -325,47 +344,48 @@ fn resolve_plane_sweep_candidates( query_tree: &Operation, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> Result> -{ +) -> Result> { /// FIXME may be buggy with query like "new new york" fn plane_sweep( groups_positions: Vec>, consecutive: bool, - ) -> Result> - { + ) -> Result> { fn compute_groups_proximity( groups: &[(usize, (Position, u8, Position))], consecutive: bool, - ) -> Option<(Position, u8, Position)> - { + ) -> Option<(Position, u8, Position)> { // take the inner proximity of the first group as initial let (_, (_, mut proximity, _)) = groups.first()?; let (_, (left_most_pos, _, _)) = groups.first()?; - let (_, (_, _, right_most_pos)) = groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; + let (_, (_, _, right_most_pos)) = + groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; for pair in groups.windows(2) { if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair { // if two positions are equal, meaning that they share at least a word, we return None if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 { - return None + return None; } let pair_proximity = { // if intervals are disjoint [..].(..) - if lpos2 > rpos1 { lpos2 - rpos1 } + if lpos2 > rpos1 { + lpos2 - rpos1 + } // if the second interval is a subset of the first [.(..).] - else if rpos2 < rpos1 { (lpos2 - lpos1).min(rpos1 - rpos2) } + else if rpos2 < rpos1 { + (lpos2 - lpos1).min(rpos1 - rpos2) + } // if intervals overlaps [.(..].) 
- else { (lpos2 - lpos1).min(rpos2 - rpos1) } + else { + (lpos2 - lpos1).min(rpos2 - rpos1) + } }; // if groups are in the good order (query order) we remove 1 to the proximity // the proximity is clamped to 7 - let pair_proximity = if i1 < i2 { - (pair_proximity - 1).min(7) - } else { - pair_proximity.min(7) - }; + let pair_proximity = + if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) }; proximity += pair_proximity as u8 + prox2; } @@ -381,7 +401,8 @@ fn resolve_plane_sweep_candidates( let groups_len = groups_positions.len(); - let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); + let mut groups_positions: Vec<_> = + groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); // Pop top elements of each list. let mut current = Vec::with_capacity(groups_len); @@ -452,9 +473,8 @@ fn resolve_plane_sweep_candidates( rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, words_positions: &HashMap, wdcache: &mut WordDerivationsCache, - ) -> Result> - { - use Operation::{And, Phrase, Or}; + ) -> Result> { + use Operation::{And, Or, Phrase}; if let Some(result) = rocache.get(query_tree) { return Ok(result.clone()); @@ -462,13 +482,20 @@ fn resolve_plane_sweep_candidates( let result = match query_tree { And(ops) => { - let mut groups_positions = Vec::with_capacity(ops.len()); + let mut groups_positions = Vec::with_capacity(ops.len()); for operation in ops { - let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; + let positions = resolve_operation( + ctx, + operation, + docid, + rocache, + words_positions, + wdcache, + )?; groups_positions.push(positions); } plane_sweep(groups_positions, false)? - }, + } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); for word in words { @@ -479,16 +506,23 @@ fn resolve_plane_sweep_candidates( groups_positions.push(positions); } plane_sweep(groups_positions, true)? - }, + } Or(_, ops) => { let mut result = Vec::new(); for op in ops { - result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?) + result.extend(resolve_operation( + ctx, + op, + docid, + rocache, + words_positions, + wdcache, + )?) 
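The interval arithmetic in `compute_groups_proximity` above condenses to one small function. A self-contained sketch, under the same assumption the original enforces (the two groups never share a position, which its early `return None` guarantees):

    // Proximity between two word-position intervals, following the three
    // cases above: disjoint, nested, overlapping. Clamped to 7, with a
    // one-step discount when the groups appear in query order.
    fn interval_proximity(
        (lpos1, rpos1): (u32, u32),
        (lpos2, rpos2): (u32, u32),
        in_query_order: bool,
    ) -> u32 {
        let raw = if lpos2 > rpos1 {
            lpos2 - rpos1 // disjoint: [..].(..)
        } else if rpos2 < rpos1 {
            (lpos2 - lpos1).min(rpos1 - rpos2) // nested: [.(..).]
        } else {
            (lpos2 - lpos1).min(rpos2 - rpos1) // overlapping: [.(..].)
        };
        if in_query_order { raw.saturating_sub(1).min(7) } else { raw.min(7) }
    }

    fn main() {
        // One word at position 0, the next at position 2, in query order:
        // disjoint intervals at raw distance 2, discounted to proximity 1.
        assert_eq!(interval_proximity((0, 0), (2, 2), true), 1);
    }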
} result.sort_unstable(); result - }, + } Operation::Query(Query { prefix, kind }) => { let mut result = Vec::new(); match kind { @@ -498,9 +532,9 @@ fn resolve_plane_sweep_candidates( .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } else if let Some(positions) = words_positions.get(word) { - result.extend(positions.iter().map(|p| (p, 0, p))); + result.extend(positions.iter().map(|p| (p, 0, p))); } - }, + } QueryKind::Tolerant { typo, word } => { let iter = word_derivations(word, *prefix, *typo, &words_positions) .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); @@ -522,8 +556,7 @@ fn resolve_plane_sweep_candidates( is_prefix: bool, max_typo: u8, words_positions: &'a HashMap, - ) -> impl Iterator - { + ) -> impl Iterator { let dfa = build_dfa(word, max_typo, is_prefix); words_positions.iter().filter_map(move |(document_word, positions)| { use levenshtein_automata::Distance; @@ -539,7 +572,7 @@ fn resolve_plane_sweep_candidates( for docid in allowed_candidates { let words_positions = ctx.docid_words_positions(docid)?; resolve_operation_cache.clear(); - let positions = resolve_operation( + let positions = resolve_operation( ctx, query_tree, docid, diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 436f4affd..f4ae15f0a 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -1,20 +1,17 @@ -use std::{borrow::Cow, collections::HashMap, mem::take}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::mem::take; use log::debug; use roaring::RoaringBitmap; +use super::{ + query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, + CriterionResult, +}; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::Result; -use super::{ - Candidates, - Context, - Criterion, - CriterionParameters, - CriterionResult, - query_docids, - resolve_query_tree, -}; /// Maximum number of typo for a word of any length. const MAX_TYPOS_PER_WORD: u8 = 2; @@ -54,7 +51,8 @@ impl<'t> Criterion for Typo<'t> { } loop { - debug!("Typo at iteration {} (max typos {:?}) ({:?})", + debug!( + "Typo at iteration {} (max typos {:?}) ({:?})", self.typos, self.state.as_ref().map(|(mt, _, _)| mt), self.state.as_ref().map(|(_, _, cd)| cd), @@ -63,29 +61,42 @@ impl<'t> Criterion for Typo<'t> { match self.state.as_mut() { Some((max_typos, _, _)) if self.typos > *max_typos => { self.state = None; // reset state - }, + } Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { self.state = None; // reset state - }, + } Some((_, query_tree, candidates_authorization)) => { let fst = self.ctx.words_fst(); let new_query_tree = match self.typos { - typos if typos < MAX_TYPOS_PER_WORD => { - alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)? 
- }, + typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree( + &fst, + query_tree.clone(), + self.typos, + params.wdcache, + )?, MAX_TYPOS_PER_WORD => { // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, // we keep the altered query tree - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?; + *query_tree = alterate_query_tree( + &fst, + query_tree.clone(), + self.typos, + params.wdcache, + )?; // we compute the allowed candidates - let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?; + let query_tree_allowed_candidates = + resolve_query_tree(self.ctx, query_tree, params.wdcache)?; // we assign the allowed candidates to the candidates authorization. *candidates_authorization = match take(candidates_authorization) { - Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates), - Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates), + Allowed(allowed_candidates) => { + Allowed(query_tree_allowed_candidates & allowed_candidates) + } + Forbidden(forbidden_candidates) => { + Allowed(query_tree_allowed_candidates - forbidden_candidates) + } }; query_tree.clone() - }, + } _otherwise => query_tree.clone(), }; @@ -101,11 +112,11 @@ impl<'t> Criterion for Typo<'t> { Allowed(allowed_candidates) => { candidates &= &*allowed_candidates; *allowed_candidates -= &candidates; - }, + } Forbidden(forbidden_candidates) => { candidates -= &*forbidden_candidates; *forbidden_candidates |= &candidates; - }, + } } let bucket_candidates = match self.bucket_candidates.as_mut() { @@ -121,35 +132,45 @@ impl<'t> Criterion for Typo<'t> { filtered_candidates: None, bucket_candidates: Some(bucket_candidates), })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + } + None => match self.parent.next(params)? 
{ + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + self.bucket_candidates = + match (self.bucket_candidates.take(), bucket_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; - let candidates = match candidates.or(filtered_candidates) { - Some(candidates) => Candidates::Allowed(candidates - params.excluded_candidates), - None => Candidates::Forbidden(params.excluded_candidates.clone()), - }; + let candidates = match candidates.or(filtered_candidates) { + Some(candidates) => { + Candidates::Allowed(candidates - params.excluded_candidates) + } + None => Candidates::Forbidden(params.excluded_candidates.clone()), + }; - let maximum_typos = maximum_typo(&query_tree) as u8; - self.state = Some((maximum_typos, query_tree, candidates)); - self.typos = 0; - - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), + let maximum_typos = maximum_typo(&query_tree) as u8; + self.state = Some((maximum_typos, query_tree, candidates)); + self.typos = 0; } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } } @@ -164,21 +185,19 @@ fn alterate_query_tree( mut query_tree: Operation, number_typos: u8, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { fn recurse( words_fst: &fst::Set>, operation: &mut Operation, number_typos: u8, wdcache: &mut WordDerivationsCache, - ) -> Result<()> - { - use Operation::{And, Phrase, Or}; + ) -> Result<()> { + use Operation::{And, Or, Phrase}; match operation { And(ops) | Or(_, ops) => { ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) - }, + } // Because Phrases don't allow typos, no alteration can be done. 
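The `Operation::Query` arm below performs the actual rewrite: a tolerant leaf becomes an OR over the exact derivations returned by `word_derivations`. A reduced model of that step, using hypothetical stand-in types rather than the crate's own:

    // Stand-ins for the real Operation/Query types, for illustration only.
    #[derive(Debug)]
    enum Op {
        Or(Vec<Op>),
        Exact { word: String, typos: u8 },
        Tolerant { word: String, max_typos: u8 },
    }

    // Replace every tolerant leaf with an OR over the (word, typo-count)
    // pairs produced by `derive`; exact leaves pass through untouched.
    fn alterate(op: Op, derive: &dyn Fn(&str, u8) -> Vec<(String, u8)>) -> Op {
        match op {
            Op::Tolerant { word, max_typos } => Op::Or(
                derive(&word, max_typos)
                    .into_iter()
                    .map(|(word, typos)| Op::Exact { word, typos })
                    .collect(),
            ),
            Op::Or(ops) => Op::Or(ops.into_iter().map(|op| alterate(op, derive)).collect()),
            exact => exact,
        }
    }

    fn main() {
        let derive = |w: &str, _max: u8| -> Vec<(String, u8)> {
            vec![(w.to_string(), 0), ("word".to_string(), 1)]
        };
        let tolerant = Op::Tolerant { word: "world".to_string(), max_typos: 1 };
        // Prints an OR of two exact leaves: "world" (0 typos), "word" (1 typo).
        println!("{:?}", alterate(tolerant, &derive));
    }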
Phrase(_words) => return Ok(()), Operation::Query(q) => { @@ -193,19 +212,25 @@ fn alterate_query_tree( } else { let typo = *typo.min(&number_typos); let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; - let queries = words.iter().map(|(word, typo)| { - Operation::Query(Query { - prefix: false, - kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() }, + let queries = words + .iter() + .map(|(word, typo)| { + Operation::Query(Query { + prefix: false, + kind: QueryKind::Exact { + original_typo: *typo, + word: word.to_string(), + }, + }) }) - }).collect(); + .collect(); *operation = Operation::or(false, queries); } } Ok(()) - }, + } } } @@ -219,22 +244,18 @@ fn resolve_candidates<'t>( number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, -) -> Result -{ +) -> Result { fn resolve_operation<'t>( ctx: &'t dyn Context, query_tree: &Operation, number_typos: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, - ) -> Result - { - use Operation::{And, Phrase, Or, Query}; + ) -> Result { + use Operation::{And, Or, Phrase, Query}; match query_tree { - And(ops) => { - mdfs(ctx, ops, number_typos, cache, wdcache) - }, + And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; @@ -250,12 +271,12 @@ fn resolve_candidates<'t>( } else { candidates &= pair_docids; } - }, - None => return Ok(RoaringBitmap::new()) + } + None => return Ok(RoaringBitmap::new()), } } Ok(candidates) - }, + } Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { @@ -263,12 +284,14 @@ fn resolve_candidates<'t>( candidates.union_with(&docids); } Ok(candidates) - }, - Query(q) => if q.kind.typo() == number_typos { - Ok(query_docids(ctx, q, wdcache)?) - } else { - Ok(RoaringBitmap::new()) - }, + } + Query(q) => { + if q.kind.typo() == number_typos { + Ok(query_docids(ctx, q, wdcache)?) 
+ } else { + Ok(RoaringBitmap::new()) + } + } } } @@ -278,8 +301,7 @@ fn resolve_candidates<'t>( mana: u8, cache: &mut HashMap<(Operation, u8), RoaringBitmap>, wdcache: &mut WordDerivationsCache, - ) -> Result - { + ) -> Result { match branches.split_first() { Some((head, [])) => { let cache_key = (head.clone(), mana); @@ -290,7 +312,7 @@ fn resolve_candidates<'t>( cache.insert(cache_key, candidates.clone()); Ok(candidates) } - }, + } Some((head, tail)) => { let mut candidates = RoaringBitmap::new(); @@ -313,7 +335,7 @@ fn resolve_candidates<'t>( } Ok(candidates) - }, + } None => Ok(RoaringBitmap::new()), } } @@ -323,9 +345,9 @@ fn resolve_candidates<'t>( #[cfg(test)] mod test { - use super::*; use super::super::initial::Initial; use super::super::test::TestContext; + use super::*; #[test] fn initial_placeholder_no_facets() { @@ -348,13 +370,23 @@ mod test { #[test] fn initial_query_tree_no_facets() { let context = TestContext::default(); - let query_tree = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), - ]) - ]); + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); let facet_candidates = None; @@ -369,13 +401,23 @@ mod test { & context.word_docids("this").unwrap().unwrap() & context.word_docids("world").unwrap().unwrap(); let expected_1 = CriterionResult { - query_tree: Some(Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), - ]), - ])), + query_tree: Some(Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("world".to_string()), + }), + ])], + )), candidates: Some(candidates_1.clone()), bucket_candidates: Some(candidates_1), filtered_candidates: None, @@ -383,22 +425,37 @@ mod test { assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); - let candidates_2 = ( - context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("word").unwrap().unwrap() - ) - context.word_docids("world").unwrap().unwrap(); + let candidates_2 = (context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap()) + - context.word_docids("world").unwrap().unwrap(); let expected_2 = CriterionResult { - query_tree: Some(Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) 
}), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), - ]), - ]), - ])), + query_tree: Some(Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact_with_typo(1, "word".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("world".to_string()), + }), + ], + ), + ])], + )), candidates: Some(candidates_2.clone()), bucket_candidates: Some(candidates_2), filtered_candidates: None, @@ -437,17 +494,26 @@ mod test { #[test] fn initial_query_tree_with_facets() { let context = TestContext::default(); - let query_tree = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), - ]) - ]); + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), @@ -459,13 +525,23 @@ mod test { & context.word_docids("this").unwrap().unwrap() & context.word_docids("world").unwrap().unwrap(); let expected_1 = CriterionResult { - query_tree: Some(Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), - ]), - ])), + query_tree: Some(Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("world".to_string()), + }), + ])], + )), candidates: Some(&candidates_1 & &facet_candidates), bucket_candidates: Some(&candidates_1 & &facet_candidates), filtered_candidates: None, @@ -473,22 +549,37 @@ mod test { assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); - let candidates_2 = ( - context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("word").unwrap().unwrap() - ) - context.word_docids("world").unwrap().unwrap(); + let candidates_2 = (context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap()) + - context.word_docids("world").unwrap().unwrap(); let expected_2 = 
CriterionResult { - query_tree: Some(Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), - ]), - ]), - ])), + query_tree: Some(Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact_with_typo(1, "word".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("world".to_string()), + }), + ], + ), + ])], + )), candidates: Some(&candidates_2 & &facet_candidates), bucket_candidates: Some(&candidates_2 & &facet_candidates), filtered_candidates: None, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index add90d80d..ccc6c0617 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -3,9 +3,9 @@ use std::mem::take; use log::debug; use roaring::RoaringBitmap; +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; use crate::search::query_tree::Operation; use crate::Result; -use super::{Context, Criterion, CriterionParameters, CriterionResult, resolve_query_tree}; pub struct Words<'t> { ctx: &'t dyn Context<'t>, @@ -44,11 +44,12 @@ impl<'t> Criterion for Words<'t> { Some(query_tree) => { let candidates = match self.candidates.as_mut() { Some(allowed_candidates) => { - let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; + let mut candidates = + resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; candidates &= &*allowed_candidates; *allowed_candidates -= &candidates; Some(candidates) - }, + } None => None, }; @@ -63,29 +64,38 @@ impl<'t> Criterion for Words<'t> { filtered_candidates: self.filtered_candidates.clone(), bucket_candidates, })); - }, - None => { - match self.parent.next(params)? { - Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates, bucket_candidates }) => { - self.query_trees = explode_query_tree(query_tree); - self.candidates = candidates; - self.filtered_candidates = filtered_candidates; + } + None => match self.parent.next(params)? 
{ + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + bucket_candidates, + }) => { + self.query_trees = explode_query_tree(query_tree); + self.candidates = candidates; + self.filtered_candidates = filtered_candidates; - self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) { + self.bucket_candidates = + match (self.bucket_candidates.take(), bucket_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; - }, - Some(CriterionResult { query_tree: None, candidates, filtered_candidates, bucket_candidates }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - bucket_candidates, - })); - }, - None => return Ok(None), } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + bucket_candidates, + })); + } + None => return Ok(None), }, } } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index b9ffd9d90..290a7602f 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -3,11 +3,11 @@ use std::mem::size_of; use heed::types::ByteSlice; use roaring::RoaringBitmap; +use super::{Distinct, DocIter}; use crate::error::InternalError; use crate::heed_codec::facet::*; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; -use super::{Distinct, DocIter}; const FID_SIZE: usize = size_of::(); const DOCID_SIZE: usize = size_of::(); @@ -28,11 +28,7 @@ pub struct FacetDistinct<'a> { impl<'a> FacetDistinct<'a> { pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { - Self { - distinct, - index, - txn, - } + Self { distinct, index, txn } } } @@ -47,16 +43,12 @@ pub struct FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { - self.index - .facet_id_string_docids - .get(self.txn, &(self.distinct, key)) + self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { // get facet docids on level 0 - self.index - .facet_id_f64_docids - .get(self.txn, &(self.distinct, 0, key, key)) + self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key)) } fn distinct_string(&mut self, id: DocumentId) -> Result<()> { @@ -64,9 +56,8 @@ impl<'a> FacetDistinctIter<'a> { for item in iter { let ((_, _, value), _) = item?; - let facet_docids = self - .facet_string_docids(value)? - .ok_or(InternalError::DatabaseMissingEntry { + let facet_docids = + self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::FACET_ID_STRING_DOCIDS, key: None, })?; @@ -83,9 +74,8 @@ impl<'a> FacetDistinctIter<'a> { for item in iter { let ((_, _, value), _) = item?; - let facet_docids = self - .facet_number_docids(value)? 
- .ok_or(InternalError::DatabaseMissingEntry { + let facet_docids = + self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::FACET_ID_F64_DOCIDS, key: None, })?; diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 99bc74be0..ae3fdb91e 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -1,11 +1,11 @@ mod facet_distinct; mod noop_distinct; +pub use facet_distinct::FacetDistinct; +pub use noop_distinct::NoopDistinct; use roaring::RoaringBitmap; use crate::{DocumentId, Result}; -pub use facet_distinct::FacetDistinct; -pub use noop_distinct::NoopDistinct; /// A trait implemented by document interators that are returned by calls to `Distinct::distinct`. /// It provides a way to get back the ownership to the excluded set. @@ -29,13 +29,15 @@ mod test { use std::collections::HashSet; use once_cell::sync::Lazy; - use rand::{seq::SliceRandom, Rng}; + use rand::seq::SliceRandom; + use rand::Rng; use roaring::RoaringBitmap; use serde_json::{json, Value}; - use crate::index::{Index, tests::TempIndex}; + use crate::index::tests::TempIndex; + use crate::index::Index; use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; - use crate::{BEU32, FieldId, DocumentId}; + use crate::{DocumentId, FieldId, BEU32}; static JSON: Lazy = Lazy::new(generate_json); @@ -89,9 +91,7 @@ mod test { addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); addition.update_format(UpdateFormat::Json); - addition - .execute(JSON.to_string().as_bytes(), |_, _| ()) - .unwrap(); + addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); @@ -103,13 +103,12 @@ mod test { (index, fid, map) } - /// Checks that all the candidates are distinct, and returns the candidates number. pub(crate) fn validate_distinct_candidates( candidates: impl Iterator>, distinct: FieldId, index: &Index, - ) -> usize { + ) -> usize { fn test(seen: &mut HashSet, value: &Value) { match value { Value::Null | Value::Object(_) | Value::Bool(_) => (), @@ -117,7 +116,7 @@ mod test { let s = value.to_string(); assert!(seen.insert(s)); } - Value::Array(values) => {values.into_iter().for_each(|value| test(seen, value))} + Value::Array(values) => values.into_iter().for_each(|value| test(seen, value)), } } diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs index 812701794..96a1f7d5d 100644 --- a/milli/src/search/distinct/noop_distinct.rs +++ b/milli/src/search/distinct/noop_distinct.rs @@ -1,7 +1,8 @@ -use roaring::{RoaringBitmap, bitmap::IntoIter}; +use roaring::bitmap::IntoIter; +use roaring::RoaringBitmap; +use super::{Distinct, DocIter}; use crate::{DocumentId, Result}; -use super::{DocIter, Distinct}; /// A distinct implementer that does not perform any distinct, /// and simply returns an iterator to the candidates. 
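Both implementers answer the same contract. A hedged model of it for the no-op case that follows (these are not the crate's exact signatures):

    use roaring::bitmap::IntoIter;
    use roaring::RoaringBitmap;

    // Iterates the candidates untouched while holding the excluded set,
    // so ownership of that set can be handed back to the caller afterwards.
    struct NoopIter {
        candidates: IntoIter,
        excluded: RoaringBitmap,
    }

    impl NoopIter {
        fn into_excluded(self) -> RoaringBitmap {
            self.excluded
        }
    }

    impl Iterator for NoopIter {
        type Item = u32;
        fn next(&mut self) -> Option<u32> {
            self.candidates.next() // no deduplication in the no-op case
        }
    }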
@@ -30,10 +31,7 @@ impl Distinct for NoopDistinct { type Iter = NoopDistinctIter; fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { - NoopDistinctIter { - candidates: candidates.into_iter(), - excluded, - } + NoopDistinctIter { candidates: candidates.into_iter(), excluded } } } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 265a8ffeb..0a2036494 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,16 +1,16 @@ -use std::collections::{HashSet, BTreeMap}; +use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; use std::{cmp, fmt}; -use heed::{Database, BytesDecode}; use heed::types::{ByteSlice, Unit}; +use heed::{BytesDecode, Database}; use roaring::RoaringBitmap; use crate::error::FieldIdMapMissingEntry; use crate::facet::FacetType; use crate::heed_codec::facet::FacetValueStringCodec; use crate::search::facet::{FacetIter, FacetRange}; -use crate::{Index, FieldId, DocumentId, Result}; +use crate::{DocumentId, FieldId, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -43,7 +43,7 @@ impl<'a> FacetDistribution<'a> { } } - pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { + pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect()); self } @@ -66,8 +66,7 @@ impl<'a> FacetDistribution<'a> { facet_type: FacetType, candidates: &RoaringBitmap, distribution: &mut BTreeMap, - ) -> heed::Result<()> - { + ) -> heed::Result<()> { fn fetch_facet_values<'t, KC, K: 't>( rtxn: &'t heed::RoTxn, db: Database, @@ -102,7 +101,7 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let db = self.index.field_id_docid_facet_f64s; fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) - }, + } FacetType::String => { let db = self.index.field_id_docid_facet_strings; fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) @@ -117,11 +116,9 @@ impl<'a> FacetDistribution<'a> { field_id: FieldId, candidates: &RoaringBitmap, distribution: &mut BTreeMap, - ) -> heed::Result<()> - { - let iter = FacetIter::new_non_reducing( - self.rtxn, self.index, field_id, candidates.clone(), - )?; + ) -> heed::Result<()> { + let iter = + FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; for result in iter { let (value, mut docids) = result?; @@ -142,8 +139,7 @@ impl<'a> FacetDistribution<'a> { fn facet_values_from_raw_facet_database( &self, field_id: FieldId, - ) -> heed::Result> - { + ) -> heed::Result> { let mut distribution = BTreeMap::new(); let db = self.index.facet_id_f64_docids; @@ -157,7 +153,8 @@ impl<'a> FacetDistribution<'a> { } } - let iter = self.index + let iter = self + .index .facet_id_string_docids .remap_key_type::() .prefix_iter(self.rtxn, &[field_id])? @@ -182,11 +179,30 @@ impl<'a> FacetDistribution<'a> { // to those candidates. We also enter here for facet strings for performance reasons. 
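A condensed restatement of the branch below, sketching only the control flow (the real `CANDIDATES_THRESHOLD` constant is defined earlier in this file and its value is not visible in this hunk):

    // Returns which code path handles (numbers, strings). Strings always
    // go through the documents themselves; numbers switch to the
    // precomputed facet levels once the candidate set grows large.
    fn facet_values_paths(candidate_count: u64, threshold: u64) -> (&'static str, &'static str) {
        if candidate_count <= threshold {
            ("from_documents", "from_documents")
        } else {
            ("from_facet_levels", "from_documents")
        }
    }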
let mut distribution = BTreeMap::new(); if candidates.len() <= CANDIDATES_THRESHOLD { - self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?; - self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; + self.facet_distribution_from_documents( + field_id, + Number, + candidates, + &mut distribution, + )?; + self.facet_distribution_from_documents( + field_id, + String, + candidates, + &mut distribution, + )?; } else { - self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?; - self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; + self.facet_numbers_distribution_from_facet_levels( + field_id, + candidates, + &mut distribution, + )?; + self.facet_distribution_from_documents( + field_id, + String, + candidates, + &mut distribution, + )?; } Ok(distribution) @@ -201,10 +217,11 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); for name in filterable_fields { - let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { - field_name: name.clone(), - process: "FacetDistribution::execute", - })?; + let fid = + fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: name.clone(), + process: "FacetDistribution::execute", + })?; let values = self.facet_values(fid)?; distribution.insert(name, values); } @@ -215,13 +232,7 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { - facets, - candidates, - max_values_by_facet, - rtxn: _, - index: _, - } = self; + let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; f.debug_struct("FacetDistribution") .field("facets", facets) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 424118f77..31fc6018c 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,6 +1,6 @@ use std::collections::HashSet; use std::fmt::Debug; -use std::ops::Bound::{self, Included, Excluded}; +use std::ops::Bound::{self, Excluded, Included}; use std::result::Result as StdResult; use std::str::FromStr; @@ -12,16 +12,13 @@ use pest::iterators::{Pair, Pairs}; use pest::Parser; use roaring::RoaringBitmap; -use crate::error::UserError; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; -use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec, Result}; - -use super::FacetRange; -use super::parser::Rule; -use super::parser::{PREC_CLIMBER, FilterParser}; - use self::FilterCondition::*; use self::Operator::*; +use super::parser::{FilterParser, Rule, PREC_CLIMBER}; +use super::FacetRange; +use crate::error::UserError; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec}; +use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] pub enum Operator { @@ -39,13 +36,13 @@ impl Operator { /// an OR operation for the between case (i.e. `TO`). 
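A usage sketch of the function defined just below; it assumes this module's `Operator` enum is in scope, since the method is private. The behaviour matches the `NOT timestamp 22 TO 44` test later in this file, which expects `Or(LowerThan(22.0), GreaterThan(44.0))`:

    // Negating a range splits it into two complementary half-lines.
    let (first, second) = Operator::Between(22.0, 44.0).negate();
    assert_eq!(first, Operator::LowerThan(22.0));
    assert_eq!(second, Some(Operator::GreaterThan(44.0)));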
fn negate(self) -> (Self, Option) { match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThan(n) => (LowerThanOrEqual(n), None), GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(n, s) => (NotEqual(n, s), None), - NotEqual(n, s) => (Equal(n, s), None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + Equal(n, s) => (NotEqual(n, s), None), + NotEqual(n, s) => (Equal(n, s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), } } } @@ -63,10 +60,11 @@ impl FilterCondition { index: &Index, array: I, ) -> Result> - where I: IntoIterator>, - J: IntoIterator, - A: AsRef, - B: AsRef, + where + I: IntoIterator>, + J: IntoIterator, + A: AsRef, + B: AsRef, { let mut ands = None; @@ -88,7 +86,7 @@ impl FilterCondition { None => Some(rule), }; } - }, + } Either::Right(rule) => { let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; ands = match ands.take() { @@ -106,11 +104,11 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, expression: &str, - ) -> Result - { + ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields_ids(rtxn)?; - let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; + let lexed = + FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } @@ -118,8 +116,7 @@ impl FilterCondition { fim: &FieldsIdsMap, ff: &HashSet, expression: Pairs, - ) -> Result - { + ) -> Result { PREC_CLIMBER.climb( expression, |pair: Pair| match pair.as_rule() { @@ -135,12 +132,10 @@ impl FilterCondition { Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), _ => unreachable!(), }, - |lhs: Result, op: Pair, rhs: Result| { - match op.as_rule() { - Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), - Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), - _ => unreachable!(), - } + |lhs: Result, op: Pair, rhs: Result| match op.as_rule() { + Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), + Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), + _ => unreachable!(), }, ) } @@ -160,8 +155,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -179,8 +173,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -196,8 +189,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -213,8 +205,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -230,8 +221,7 @@ 
impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -247,8 +237,7 @@ impl FilterCondition { fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, item: Pair, - ) -> Result - { + ) -> Result { let mut items = item.into_inner(); let fid = field_id(fields_ids_map, filterable_fields, &mut items) .map_err(UserError::InvalidFilterAttribute)?; @@ -272,13 +261,14 @@ impl FilterCondition { left: Bound, right: Bound, output: &mut RoaringBitmap, - ) -> Result<()> - { + ) -> Result<()> { match (left, right) { // If the request is an exact value we must go directly to the deepest level. (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_number_levels(rtxn, db, field_id, 0, left, right, output); - }, + return Self::explore_facet_number_levels( + rtxn, db, field_id, 0, left, right, output, + ); + } // lower TO upper when lower > upper must return no result (Included(l), Included(r)) if l > r => return Ok(()), (Included(l), Excluded(r)) if l >= r => return Ok(()), @@ -301,7 +291,9 @@ impl FilterCondition { debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); output.union_with(&docids); // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { left_found = Some(l); } + if i == 0 { + left_found = Some(l); + } right_found = Some(r); } @@ -318,20 +310,50 @@ impl FilterCondition { // If the bound is satisfied we avoid calling this function again. if !matches!(left, Included(l) if l == left_found) { let sub_right = Excluded(left_found); - debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); - Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, sub_right, output)?; + debug!( + "calling left with {:?} to {:?} (level {})", + left, sub_right, deeper_level + ); + Self::explore_facet_number_levels( + rtxn, + db, + field_id, + deeper_level, + left, + sub_right, + output, + )?; } if !matches!(right, Included(r) if r == right_found) { let sub_left = Excluded(right_found); - debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); - Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + debug!( + "calling right with {:?} to {:?} (level {})", + sub_left, right, deeper_level + ); + Self::explore_facet_number_levels( + rtxn, + db, + field_id, + deeper_level, + sub_left, + right, + output, + )?; } - }, + } None => { // If we found nothing at this level it means that we must find // the same bounds but at a deeper, more precise level. - Self::explore_facet_number_levels(rtxn, db, field_id, deeper_level, left, right, output)?; - }, + Self::explore_facet_number_levels( + rtxn, + db, + field_id, + deeper_level, + left, + right, + output, + )?; + } } Ok(()) @@ -344,27 +366,34 @@ impl FilterCondition { strings_db: heed::Database, field_id: FieldId, operator: &Operator, - ) -> Result - { + ) -> Result { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. 
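Condensed, the comparison arms of the match below build these f64 ranges (Equal and NotEqual return early instead, and Between carries two bounds of its own):

    use std::ops::Bound::{self, Excluded, Included};

    // Stand-alone restatement of the operator-to-range mapping; only the
    // single-valued comparison operators are shown here.
    fn comparison_bounds(op: &str, val: f64) -> (Bound<f64>, Bound<f64>) {
        match op {
            ">" => (Excluded(val), Included(f64::MAX)),
            ">=" => (Included(val), Included(f64::MAX)),
            "<" => (Included(f64::MIN), Excluded(val)),
            "<=" => (Included(f64::MIN), Included(val)),
            _ => unreachable!("Equal, NotEqual and Between are handled separately"),
        }
    }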
let (left, right) = match operator { - GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), + GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), - Equal(number, string) => { + Equal(number, string) => { let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { Some(n) => { let n = Included(*n); let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + 0, + n, + n, + &mut output, + )?; output - }, + } None => RoaringBitmap::new(), }; return Ok(string_docids | number_docids); - }, + } NotEqual(number, string) => { let all_numbers_ids = if number.is_some() { index.number_faceted_documents_ids(rtxn, field_id)? @@ -373,12 +402,14 @@ impl FilterCondition { }; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; let operator = Equal(*number, string.clone()); - let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?; + let docids = Self::evaluate_operator( + rtxn, index, numbers_db, strings_db, field_id, &operator, + )?; return Ok((all_numbers_ids | all_strings_ids) - docids); - }, - LowerThan(val) => (Included(f64::MIN), Excluded(*val)), + } + LowerThan(val) => (Included(f64::MIN), Excluded(*val)), LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), - Between(left, right) => (Included(*left), Included(*right)), + Between(left, right) => (Included(*left), Included(*right)), }; // Ask for the biggest value that can exist for this specific field, if it exists @@ -391,36 +422,39 @@ impl FilterCondition { match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + level, + left, + right, + &mut output, + )?; Ok(output) - }, + } None => Ok(RoaringBitmap::new()), } } - pub fn evaluate( - &self, - rtxn: &heed::RoTxn, - index: &Index, - ) -> Result - { + pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { let numbers_db = index.facet_id_f64_docids; let strings_db = index.facet_id_string_docids; match self { Operator(fid, op) => { Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op) - }, + } Or(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; let rhs = rhs.evaluate(rtxn, index)?; Ok(lhs | rhs) - }, + } And(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; let rhs = rhs.evaluate(rtxn, index)?; Ok(lhs & rhs) - }, + } } } } @@ -434,23 +468,24 @@ fn field_id( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, items: &mut Pairs, -) -> StdResult> -{ +) -> StdResult> { // lexing ensures that we at least have a key let key = items.next().unwrap(); let field_id = match fields_ids_map.id(key.as_str()) { Some(field_id) => field_id, - None => return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` not found, available attributes are: {}", - key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", "), - ), - }, - key.as_span(), - )), + None => { + return Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` not found, available attributes are: {}", + key.as_str(), + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", 
"), + ), + }, + key.as_span(), + )) + } }; if !filterable_fields.contains(&field_id) { @@ -459,9 +494,11 @@ fn field_id( message: format!( "attribute `{}` is not filterable, available filterable attributes are: {}", key.as_str(), - filterable_fields.iter().flat_map(|id| { - fields_ids_map.name(*id) - }).collect::>().join(", "), + filterable_fields + .iter() + .flat_map(|id| { fields_ids_map.name(*id) }) + .collect::>() + .join(", "), ), }, key.as_span(), @@ -476,8 +513,9 @@ fn field_id( /// /// Returns the parsing error associated with the span if the conversion fails. fn pest_parse(pair: Pair) -> (StdResult>, String) -where T: FromStr, - T::Err: ToString, +where + T: FromStr, + T::Err: ToString, { let result = match pair.as_str().parse::() { Ok(value) => Ok(value), @@ -492,11 +530,12 @@ where T: FromStr, #[cfg(test)] mod tests { - use super::*; - use crate::update::Settings; + use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; - use big_s::S; + + use super::*; + use crate::update::Settings; #[test] fn string() { @@ -508,7 +547,7 @@ mod tests { // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset!{ S("channel") }); + builder.set_filterable_fields(hashset! { S("channel") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -537,7 +576,7 @@ mod tests { // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset!{ "timestamp".into() }); + builder.set_filterable_fields(hashset! { "timestamp".into() }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -548,10 +587,8 @@ mod tests { assert_eq!(condition, expected); let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = Or( - Box::new(Operator(0, LowerThan(22.0))), - Box::new(Operator(0, GreaterThan(44.0))), - ); + let expected = + Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0)))); assert_eq!(condition, expected); } @@ -566,29 +603,33 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") }); + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. 
let rtxn = index.read_txn().unwrap(); let condition = FilterCondition::from_str( - &rtxn, &index, + &rtxn, + &index, "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ).unwrap(); + ) + .unwrap(); let expected = Or( Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(And( Box::new(Operator(1, Between(22.0, 44.0))), Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), - )) + )), ); assert_eq!(condition, expected); let condition = FilterCondition::from_str( - &rtxn, &index, + &rtxn, + &index, "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ).unwrap(); + ) + .unwrap(); let expected = Or( Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(Or( @@ -613,20 +654,28 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset!{ S("channel"), S("timestamp") }); + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); let condition = FilterCondition::from_array( - &rtxn, &index, - vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])], - ).unwrap().unwrap(); + &rtxn, + &index, + vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ], + ) + .unwrap() + .unwrap(); let expected = FilterCondition::from_str( - &rtxn, &index, + &rtxn, + &index, "channel = gotaga AND (timestamp = 44 OR channel != ponce)", - ).unwrap(); + ) + .unwrap(); assert_eq!(condition, expected); } } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index a1a03dba3..240d99ccc 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,20 +1,19 @@ -use std::ops::Bound::{self, Included, Excluded, Unbounded}; +use std::ops::Bound::{self, Excluded, Included, Unbounded}; use either::Either::{self, Left, Right}; -use heed::types::{DecodeIgnore, ByteSlice}; -use heed::{Database, RoRange, RoRevRange, LazyDecode}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{Database, LazyDecode, RoRange, RoRevRange}; use roaring::RoaringBitmap; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::{Index, FieldId}; - pub use self::facet_distribution::FacetDistribution; pub use self::filter_condition::{FilterCondition, Operator}; pub(crate) use self::parser::Rule as ParserRule; +use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{FieldId, Index}; -mod filter_condition; mod facet_distribution; +mod filter_condition; mod parser; pub struct FacetRange<'t> { @@ -30,8 +29,7 @@ impl<'t> FacetRange<'t> { level: u8, left: Bound, right: Bound, - ) -> heed::Result> - { + ) -> heed::Result> { let left_bound = match left { Included(left) => Included((field_id, level, left, f64::MIN)), Excluded(left) => Excluded((field_id, level, left, f64::MIN)), @@ -62,7 +60,7 @@ impl<'t> Iterator for FacetRange<'t> { } else { None } - }, + } Some(Err(e)) => Some(Err(e)), None => None, } @@ -82,8 +80,7 @@ impl<'t> FacetRevRange<'t> { level: u8, left: Bound, right: Bound, - ) -> heed::Result> - { + ) -> heed::Result> { let left_bound = match left { Included(left) => 
Included((field_id, level, left, f64::MIN)), Excluded(left) => Excluded((field_id, level, left, f64::MIN)), @@ -114,7 +111,7 @@ impl<'t> Iterator for FacetRevRange<'t> { } } continue; - }, + } Some(Err(e)) => return Some(Err(e)), None => return None, } @@ -139,11 +136,11 @@ impl<'t> FacetIter<'t> { index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> - { + ) -> heed::Result> { let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let highest_iter = + FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) } @@ -156,11 +153,11 @@ impl<'t> FacetIter<'t> { index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> - { + ) -> heed::Result> { let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let highest_iter = + FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Right(highest_iter))]; Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) } @@ -174,11 +171,11 @@ impl<'t> FacetIter<'t> { index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> - { + ) -> heed::Result> { let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let highest_iter = + FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) } @@ -187,12 +184,13 @@ impl<'t> FacetIter<'t> { rtxn: &'t heed::RoTxn, db: Database, fid: FieldId, - ) -> heed::Result> - { - let level = db.remap_types::() + ) -> heed::Result> { + let level = db + .remap_types::() .prefix_iter(rtxn, &[fid][..])? .remap_key_type::() - .last().transpose()? + .last() + .transpose()? 
.map(|((_, level, _, _), _)| level); Ok(level) } @@ -215,7 +213,6 @@ impl<'t> Iterator for FacetIter<'t> { match result { Ok(((_fid, level, left, right), mut docids)) => { - docids.intersect_with(&documents_ids); if !docids.is_empty() { if self.must_reduce { @@ -242,11 +239,11 @@ impl<'t> Iterator for FacetIter<'t> { Ok(iter) => { self.level_iters.push((docids, iter)); continue 'outer; - }, + } Err(e) => return Some(Err(e)), } } - }, + } Err(e) => return Some(Err(e)), } } diff --git a/milli/src/search/facet/parser.rs b/milli/src/search/facet/parser.rs index 0e8bd23ac..1bff27cfb 100644 --- a/milli/src/search/facet/parser.rs +++ b/milli/src/search/facet/parser.rs @@ -1,5 +1,5 @@ use once_cell::sync::Lazy; -use pest::prec_climber::{Operator, Assoc, PrecClimber}; +use pest::prec_climber::{Assoc, Operator, PrecClimber}; pub static PREC_CLIMBER: Lazy> = Lazy::new(|| { use Assoc::*; diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index c56db4e96..cd8e404b8 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -1,13 +1,11 @@ -use std::collections::HashSet; use std::cmp::{min, Reverse}; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashSet}; use std::ops::{Index, IndexMut}; -use levenshtein_automata::{DFA, Distance}; - -use crate::search::query_tree::{Operation, Query}; +use levenshtein_automata::{Distance, DFA}; use super::build_dfa; +use crate::search::query_tree::{Operation, Query}; type IsPrefix = bool; @@ -28,7 +26,9 @@ impl MatchingWords { .collect(); // Sort word by len in DESC order prioritizing the longuest word, // in order to highlight the longuest part of the matched word. - dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| Reverse(query_word.len())); + dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| { + Reverse(query_word.len()) + }); Self { dfas } } @@ -37,12 +37,13 @@ impl MatchingWords { self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { Distance::Exact(t) if t <= *typo => { if *is_prefix { - let (_dist, len) = prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); + let (_dist, len) = + prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); Some(len) } else { Some(word.len()) } - }, + } _otherwise => None, }) } @@ -54,11 +55,11 @@ fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { match tree { Operation::Or(_, ops) | Operation::And(ops) => { ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); - }, + } Operation::Query(Query { prefix, kind }) => { let typo = if kind.is_exact() { 0 } else { kind.typo() }; out.insert((kind.word(), typo, *prefix)); - }, + } Operation::Phrase(words) => { for word in words { out.insert((word, 0, false)); @@ -80,10 +81,7 @@ struct N2Array { impl N2Array { fn new(x: usize, y: usize, value: T) -> N2Array { - N2Array { - y_size: y, - buf: vec![value; x * y], - } + N2Array { y_size: y, buf: vec![value; x * y] } } } @@ -178,9 +176,8 @@ fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) { #[cfg(test)] mod tests { use super::*; - - use crate::MatchingWords; use crate::search::query_tree::{Operation, Query, QueryKind}; + use crate::MatchingWords; #[test] fn matched_length() { @@ -194,13 +191,23 @@ mod tests { #[test] fn matching_words() { - let query_tree = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: true, kind: QueryKind::exact("split".to_string()) }), - Operation::Query(Query { 
prefix: false, kind: QueryKind::exact("this".to_string()) }), - Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) }), - ]), - ]); + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: true, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); let matching_words = MatchingWords::from_query_tree(&query_tree); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 3c85796bc..f692df173 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -6,6 +6,7 @@ use std::result::Result as StdResult; use std::str::Utf8Error; use std::time::Instant; +use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use log::debug; @@ -13,16 +14,13 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; +pub(crate) use self::facet::ParserRule; +pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator}; +pub use self::matching_words::MatchingWords; +use self::query_tree::QueryTreeBuilder; use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::r#final::{Final, FinalResult}; -use crate::{Index, DocumentId, Result}; - -pub use self::facet::{FilterCondition, FacetDistribution, FacetIter, Operator}; -pub use self::matching_words::MatchingWords; -pub(crate) use self::facet::ParserRule; -use self::query_tree::QueryTreeBuilder; - -use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; +use crate::{DocumentId, Index, Result}; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -32,8 +30,8 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); mod criteria; mod distinct; mod facet; -mod query_tree; mod matching_words; +mod query_tree; pub struct Search<'a> { query: Option, @@ -117,7 +115,7 @@ impl<'a> Search<'a> { let result = analyzer.analyze(query); let tokens = result.tokens(); builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) - }, + } None => (None, None), }; @@ -144,10 +142,11 @@ impl<'a> Search<'a> { None => self.perform_sort(NoopDistinct, matching_words, criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; - let id = field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "distinct attribute", - })?; + let id = + field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { + field_name: name.to_string(), + process: "distinct attribute", + })?; let distinct = FacetDistinct::new(id, self.index, self.rtxn); self.perform_sort(distinct, matching_words, criteria) } @@ -159,14 +158,15 @@ impl<'a> Search<'a> { mut distinct: D, matching_words: MatchingWords, mut criteria: Final, - ) -> Result - { + ) -> Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_candidates = RoaringBitmap::new(); let mut documents_ids = Vec::with_capacity(self.limit); - while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_candidates)? { + while let Some(FinalResult { candidates, bucket_candidates, .. 
}) = + criteria.next(&excluded_candidates)? + { debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_candidates); @@ -183,7 +183,9 @@ impl<'a> Search<'a> { for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { documents_ids.push(candidate?); } - if documents_ids.len() == self.limit { break } + if documents_ids.len() == self.limit { + break; + } excluded_candidates = candidates.into_excluded(); } @@ -247,7 +249,7 @@ pub fn word_derivations<'c>( } Ok(entry.insert(derived_words)) - }, + } } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index c371b07d4..8fa24b9d3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,4 +1,4 @@ -use std::{fmt, cmp, mem}; +use std::{cmp, fmt, mem}; use fst::Set; use meilisearch_tokenizer::token::SeparatorKind; @@ -28,18 +28,18 @@ impl fmt::Debug for Operation { Operation::And(children) => { writeln!(f, "{:1$}AND", "", depth * 2)?; children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - }, + } Operation::Phrase(children) => { writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2) - }, + } Operation::Or(true, children) => { writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - }, + } Operation::Or(false, children) => { writeln!(f, "{:1$}OR", "", depth * 2)?; children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - }, + } Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2), } } @@ -136,10 +136,12 @@ impl fmt::Debug for Query { match kind { QueryKind::Exact { word, .. } => { f.debug_struct(&(prefix + "Exact")).field("word", &word).finish() - }, - QueryKind::Tolerant { typo, word } => { - f.debug_struct(&(prefix + "Tolerant")).field("word", &word).field("max typo", &typo).finish() - }, + } + QueryKind::Tolerant { typo, word } => f + .debug_struct(&(prefix + "Tolerant")) + .field("word", &word) + .field("max typo", &typo) + .finish(), } } } @@ -223,7 +225,12 @@ impl<'a> QueryTreeBuilder<'a> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { - let qt = create_query_tree(self, self.optional_words, self.authorize_typos, &primitive_query)?; + let qt = create_query_tree( + self, + self.optional_words, + self.authorize_typos, + &primitive_query, + )?; Ok(Some((qt, primitive_query))) } else { Ok(None) @@ -248,12 +255,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result QueryKind { match word.len() { 0..=4 => QueryKind::exact(word), 5..=8 => QueryKind::tolerant(1, word), - _ => QueryKind::tolerant(2, word), + _ => QueryKind::tolerant(2, word), } } else { QueryKind::exact(word) @@ -276,12 +278,18 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result Result -{ +) -> Result { /// Matches on the `PrimitiveQueryPart` and create an operation from it. fn resolve_primitive_part( ctx: &impl Context, authorize_typos: bool, part: PrimitiveQueryPart, - ) -> Result - { + ) -> Result { match part { // 1. try to split word in 2 // 2. try to fetch synonyms @@ -310,13 +316,12 @@ fn create_query_tree( if let Some(child) = split_best_frequency(ctx, &word)? 
{ children.push(child); } - children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); + children + .push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); Ok(Operation::or(false, children)) - }, + } // create a CONSECUTIVE operation wrapping all word in the phrase - PrimitiveQueryPart::Phrase(words) => { - Ok(Operation::phrase(words)) - }, + PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)), } } @@ -325,8 +330,7 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - ) -> Result - { + ) -> Result { const MAX_NGRAM: usize = 3; let mut op_children = Vec::new(); @@ -341,21 +345,26 @@ fn create_query_tree( match group { [part] => { - let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; + let operation = + resolve_primitive_part(ctx, authorize_typos, part.clone())?; and_op_children.push(operation); - }, + } words => { let is_prefix = words.last().map_or(false, |part| part.is_prefix()); - let words: Vec<_> = words.iter().filter_map(|part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }).collect(); + let words: Vec<_> = words + .iter() + .filter_map(|part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }) + .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); - let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; + let query = + Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; operations.push(Operation::Query(query)); and_op_children.push(Operation::or(false, operations)); } @@ -379,26 +388,27 @@ fn create_query_tree( ctx: &impl Context, authorize_typos: bool, query: PrimitiveQuery, - ) -> Result - { + ) -> Result { let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); let mut operation_children = Vec::new(); let start = number_phrases + (number_phrases == 0) as usize; for len in start..=query.len() { let mut word_count = len - number_phrases; - let query: Vec<_> = query.iter().filter(|p| { - if p.is_phrase() { - true - } else if word_count != 0 { - word_count -= 1; - true - } else { - false - } - }) - .cloned() - .collect(); + let query: Vec<_> = query + .iter() + .filter(|p| { + if p.is_phrase() { + true + } else if word_count != 0 { + word_count -= 1; + true + } else { + false + } + }) + .cloned() + .collect(); let ngrams = ngrams(ctx, authorize_typos, &query)?; operation_children.push(ngrams); @@ -434,7 +444,11 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. -fn create_primitive_query(query: TokenStream, stop_words: Option>, words_limit: Option) -> PrimitiveQuery { +fn create_primitive_query( + query: TokenStream, + stop_words: Option>, + words_limit: Option, +) -> PrimitiveQuery { let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); let mut quoted = false; @@ -444,23 +458,29 @@ fn create_primitive_query(query: TokenStream, stop_words: Option>, wo let mut peekable = query.peekable(); while let Some(token) = peekable.next() { // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { return primitive_query } + if primitive_query.len() >= parts_limit { + return primitive_query; + } match token.kind { - TokenKind::Word | TokenKind::StopWord => { + TokenKind::Word | TokenKind::StopWord => { // 1. 
if the word is quoted we push it in a phrase-buffer waiting for the ending quote, // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { phrase.push(token.word.to_string()); } else if peekable.peek().is_some() { - if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.word.as_ref())) { - primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); - } + if !stop_words + .as_ref() + .map_or(false, |swords| swords.contains(token.word.as_ref())) + { + primitive_query + .push(PrimitiveQueryPart::Word(token.word.to_string(), false)); + } } else { primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); } - }, + } TokenKind::Separator(separator_kind) => { let quote_count = token.word.chars().filter(|&s| s == '"').count(); // swap quoted state if we encounter a double quote @@ -468,10 +488,11 @@ fn create_primitive_query(query: TokenStream, stop_words: Option>, wo quoted = !quoted; } // if there is a quote or a hard separator we close the phrase. - if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) { + if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) + { primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); } - }, + } _ => (), } } @@ -486,7 +507,7 @@ fn create_primitive_query(query: TokenStream, stop_words: Option>, wo /// Returns the maximum number of typos that this Operation allows. pub fn maximum_typo(operation: &Operation) -> usize { - use Operation::{Or, And, Query, Phrase}; + use Operation::{And, Or, Phrase, Query}; match operation { Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), And(ops) => ops.iter().map(maximum_typo).sum::(), @@ -498,13 +519,12 @@ pub fn maximum_typo(operation: &Operation) -> usize { /// Returns the maximum proximity that this Operation allows. pub fn maximum_proximity(operation: &Operation) -> usize { - use Operation::{Or, And, Query, Phrase}; + use Operation::{And, Or, Phrase, Query}; match operation { Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), And(ops) => { - ops.iter().map(maximum_proximity).sum::() - + ops.len().saturating_sub(1) * 7 - }, + ops.iter().map(maximum_proximity).sum::() + ops.len().saturating_sub(1) * 7 + } Query(_) | Phrase(_) => 0, } } @@ -515,7 +535,8 @@ mod test { use maplit::hashmap; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; - use rand::{Rng, SeedableRng, rngs::StdRng}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use super::*; @@ -532,11 +553,11 @@ mod test { authorize_typos: bool, words_limit: Option, query: TokenStream, - ) -> Result> - { + ) -> Result> { let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { - let qt = create_query_tree(self, optional_words, authorize_typos, &primitive_query)?; + let qt = + create_query_tree(self, optional_words, authorize_typos, &primitive_query)?; Ok(Some((qt, primitive_query))) } else { Ok(None) @@ -571,7 +592,7 @@ mod test { } TestContext { - synonyms: hashmap!{ + synonyms: hashmap! { vec![String::from("hello")] => vec![ vec![String::from("hi")], vec![String::from("good"), String::from("morning")], @@ -594,7 +615,7 @@ mod test { vec![String::from("new"), String::from("york")], ], }, - postings: hashmap!{ + postings: hashmap! 
{ String::from("hello") => random_postings(rng, 1500), String::from("hi") => random_postings(rng, 4000), String::from("word") => random_postings(rng, 2500), @@ -620,15 +641,28 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(1, "friends".to_string()) }), - ]), - Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), - ]); + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "friends".to_string()), + }), + ]), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(2, "heyfriends".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -640,15 +674,28 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friends".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), - ]); + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "friends".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "heyfriends".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -660,26 +707,60 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hi".to_string()) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("morning".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "hello".to_string()) }), + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hi".to_string()), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("good".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("morning".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "hello".to_string()), + }), + ], + ), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: 
false, + kind: QueryKind::exact("earth".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("nature".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ], + ), ]), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("earth".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("nature".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), - ]), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }), - ]); + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "helloworld".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -691,40 +772,95 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "yorkcity".to_string()) }), - ]), - ]), - Operation::And(vec![ - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "newyork".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("city".to_string()) }), - ]), - Operation::Or(false, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("nyc".to_string()) }), + let expected = Operation::Or( + false, + vec![ Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("new".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("york".to_string()) }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("new".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("york".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("city".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "yorkcity".to_string()), + }), + ], + ), ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "newyorkcity".to_string()) }), - ]), - ]); + Operation::And(vec![ + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("nyc".to_string()), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("new".to_string()), + }), + Operation::Query(Query { + prefix: false, + 
kind: QueryKind::exact("york".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("city".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "newyork".to_string()), + }), + ], + ), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("city".to_string()), + }), + ]), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("nyc".to_string()), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("new".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("york".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "newyorkcity".to_string()), + }), + ], + ), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -736,15 +872,28 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("n".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "grams".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }), - ]); + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("n".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "grams".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "ngrams".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -756,21 +905,34 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Or(false, vec![ - Operation::Phrase(vec![ - "word".to_string(), - "split".to_string(), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }), + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Or( + false, + vec![ + Operation::Phrase(vec!["word".to_string(), "split".to_string()]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "wordsplit".to_string()), + }), + ], + ), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("fish".to_string()), + }), ]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("fish".to_string()) }) - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }), - ]); + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "wordsplitfish".to_string()), + }), + ], + ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -783,14 +945,12 @@ mod test { let tokens = result.tokens(); 
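// Editor's sketch, not part of the patch series: the Phrase(["word", "split"])
// derivation tested above comes from `split_best_frequency`, whose idea is to
// try every split point of a word and keep the pair of halves that co-occur
// most often. `pair_frequency` is a hypothetical stand-in for the
// word_pair_proximity_docids lookup the real implementation performs.
fn split_best_frequency<'a>(
    word: &'a str,
    pair_frequency: impl Fn(&str, &str) -> u64,
) -> Option<(&'a str, &'a str)> {
    let mut best: Option<(u64, &str, &str)> = None;
    // skip(1) so that both halves are always non-empty
    for (i, _) in word.char_indices().skip(1) {
        let (left, right) = word.split_at(i);
        let frequency = pair_frequency(left, right);
        if frequency != 0 && best.map_or(true, |(f, _, _)| frequency > f) {
            best = Some((frequency, left, right));
        }
    }
    best.map(|(_, left, right)| (left, right))
}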
let expected = Operation::And(vec![ - Operation::Phrase(vec![ - "hey".to_string(), - "friends".to_string(), - ]), + Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), ]); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -803,17 +963,12 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Phrase(vec![ - "hey".to_string(), - "friends".to_string(), - ]), - Operation::Phrase(vec![ - "wooop".to_string(), - "wooop".to_string(), - ]), + Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), + Operation::Phrase(vec!["wooop".to_string(), "wooop".to_string()]), ]); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -825,34 +980,80 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(true, vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), - ]), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Or(false, vec![ + let expected = Operation::Or( + true, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Or( + false, + vec![ Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("my".to_string()), + }), ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "myfriend".to_string()) }) - ]) - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "heymy".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "friend".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }), - ]), - ]); - let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "heymy".to_string()), + }), + ], + ), + Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("my".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "friend".to_string()), + }), + ]), + Operation::Query(Query { + 
prefix: false, + kind: QueryKind::tolerant(1, "myfriend".to_string()), + }), + ], + ), + ]), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "heymy".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "friend".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(2, "heymyfriend".to_string()), + }), + ], + ), + ], + ); + let (query_tree, _) = + TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -864,11 +1065,9 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Phrase(vec![ - "hey".to_string(), - "my".to_string(), - ]); - let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]); + let (query_tree, _) = + TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -880,29 +1079,66 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(true, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "mygood".to_string()) }), + let expected = Operation::Or( + true, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("friend".to_string()), + }), ]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), - ]), - ]); - let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("my".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("friend".to_string()), + }), + ]), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("my".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("good".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "mygood".to_string()), + }), + ], + ), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("friend".to_string()), + }), 
+ ]), + ], + ); + let (query_tree, _) = + TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -914,14 +1150,27 @@ mod test { let result = analyzer.analyze(query); let tokens = result.tokens(); - let expected = Operation::Or(false, vec![ - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }), - ]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }), - ]); - let (query_tree, _) = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); + let expected = Operation::Or( + false, + vec![ + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("hey".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("friends".to_string()), + }), + ]), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("heyfriends".to_string()), + }), + ], + ); + let (query_tree, _) = + TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -934,14 +1183,12 @@ mod test { let tokens = result.tokens(); let expected = Operation::And(vec![ - Operation::Phrase(vec![ - "hey".to_string(), - "my".to_string(), - ]), + Operation::Phrase(vec!["hey".to_string(), "my".to_string()]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), ]); - let (query_tree, _) = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); + let (query_tree, _) = + TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 34ff743f0..9e3fce75d 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -1,6 +1,7 @@ use std::iter::{Chain, FromIterator}; use std::ops::RangeInclusive; -use roaring::bitmap::{RoaringBitmap, IntoIter}; + +use roaring::bitmap::{IntoIter, RoaringBitmap}; pub struct AvailableDocumentsIds { iter: Chain>, @@ -18,16 +19,12 @@ impl AvailableDocumentsIds { None => 1..=0, // empty range iterator }; - AvailableDocumentsIds { - iter: available.into_iter().chain(iter), - } - }, + AvailableDocumentsIds { iter: available.into_iter().chain(iter) } + } None => { let empty = RoaringBitmap::new().into_iter(); - AvailableDocumentsIds { - iter: empty.chain(0..=u32::max_value()), - } - }, + AvailableDocumentsIds { iter: empty.chain(0..=u32::max_value()) } + } } } } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 6e26bf027..42dd55443 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,7 @@ use chrono::Utc; use roaring::RoaringBitmap; -use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result}; +use crate::{ExternalDocumentsIds, FieldsDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -13,9 +13,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - update_id: u64 + update_id: u64, ) -> ClearDocuments<'t, 'u, 'i> { - ClearDocuments { wtxn, index, _update_id: update_id } } @@ -80,8 +79,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { mod tests { use 
heed::EnvOpenOptions; - use crate::update::{IndexDocuments, UpdateFormat}; use super::*; + use crate::update::{IndexDocuments, UpdateFormat}; #[test] fn clear_documents() { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 7fc7e5d77..dfb48dc58 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,5 +1,5 @@ -use std::collections::HashMap; use std::collections::hash_map::Entry; +use std::collections::HashMap; use chrono::Utc; use fst::IntoStreamer; @@ -7,11 +7,11 @@ use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; use serde_json::Value; -use crate::error::{InternalError, FieldIdMapMissingEntry, UserError}; +use super::ClearDocuments; +use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; -use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result}; -use super::ClearDocuments; +use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -26,11 +26,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> Result> - { - let external_documents_ids = index - .external_documents_ids(wtxn)? - .into_static(); + ) -> Result> { + let external_documents_ids = index.external_documents_ids(wtxn)?.into_static(); Ok(DeleteDocuments { wtxn, @@ -84,12 +81,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { key: Some(main_key::PRIMARY_KEY_KEY), } })?; - let id_field = fields_ids_map.id(primary_key).ok_or_else(|| { - FieldIdMapMissingEntry::FieldName { + let id_field = + fields_ids_map.id(primary_key).ok_or_else(|| FieldIdMapMissingEntry::FieldName { field_name: primary_key.to_string(), process: "DeleteDocuments::execute", - } - })?; + })?; let Index { env: _env, @@ -130,7 +126,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let external_id = match serde_json::from_slice(content).unwrap() { Value::String(string) => SmallString32::from(string.as_str()), Value::Number(number) => SmallString32::from(number.to_string()), - document_id => return Err(UserError::InvalidDocumentId { document_id }.into()), + document_id => { + return Err(UserError::InvalidDocumentId { document_id }.into()) + } }; external_ids.push(external_id); } @@ -160,7 +158,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { match entry.get().checked_sub(count_diff) { Some(0) | None => entry.remove(), - Some(count) => entry.insert(count) + Some(count) => entry.insert(count), }; } } @@ -206,9 +204,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We construct an FST set that contains the words to delete from the words FST. 
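// Editor's sketch, not part of the patch series: the FST manipulation used
// just below builds a set of the words to delete, then streams the set
// difference against the existing words FST to produce the new one. This uses
// the public `fst` 0.4 API; note that `Set::from_iter` requires its keys in
// lexicographic order.
use fst::{Set, SetBuilder};

fn remove_words(
    words_fst: &Set<Vec<u8>>,
    sorted_words_to_delete: &[&str],
) -> Result<Set<Vec<u8>>, fst::Error> {
    let to_delete = Set::from_iter(sorted_words_to_delete)?;
    // stream every word of the original FST except the deleted ones
    let difference = words_fst.op().add(&to_delete).difference();
    let mut builder = SetBuilder::memory();
    builder.extend_stream(difference)?;
    Ok(builder.into_set())
}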
- let words_to_delete = words.iter().filter_map(|(word, must_remove)| { - if *must_remove { Some(word.as_ref()) } else { None } - }); + let words_to_delete = + words.iter().filter_map( + |(word, must_remove)| { + if *must_remove { + Some(word.as_ref()) + } else { + None + } + }, + ); let words_to_delete = fst::Set::from_iter(words_to_delete)?; let new_words_fst = { @@ -285,7 +290,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We delete the documents ids that are under the pairs of words, // it is faster and use no memory to iterate over all the words pairs than // to compute the cartesian product of every words of the deleted documents. - let mut iter = word_pair_proximity_docids.remap_key_type::().iter_mut(self.wtxn)?; + let mut iter = + word_pair_proximity_docids.remap_key_type::().iter_mut(self.wtxn)?; while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); @@ -300,7 +306,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the word level position docids. - let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + let mut iter = + word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); @@ -315,7 +322,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the word prefix level position docids. - let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + let mut iter = + word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); @@ -397,12 +405,11 @@ fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( convert: F, ) -> heed::Result<()> where - C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>, + C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, F: Fn(K) -> DocumentId, { - let mut iter = db.remap_key_type::() - .prefix_iter_mut(wtxn, &[field_id])? 
- .remap_key_type::(); + let mut iter = + db.remap_key_type::().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::(); while let Some(result) = iter.next() { let (key, ()) = result?; @@ -441,8 +448,8 @@ where mod tests { use heed::EnvOpenOptions; - use crate::update::{IndexDocuments, UpdateFormat}; use super::*; + use crate::update::{IndexDocuments, UpdateFormat}; #[test] fn delete_documents_with_numbers_as_primary_key() { diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 757cbe810..09f962bbc 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -3,17 +3,18 @@ use std::fs::File; use std::num::NonZeroUsize; use chrono::Utc; -use grenad::{CompressionType, Reader, Writer, FileFuse}; +use grenad::{CompressionType, FileFuse, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use crate::error::InternalError; -use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::update::index_documents::WriteMethod; -use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::update::index_documents::{ + create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, +}; use crate::{Index, Result}; pub struct Facets<'t, 'u, 'i> { @@ -32,8 +33,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> Facets<'t, 'u, 'i> - { + ) -> Facets<'t, 'u, 'i> { Facets { wtxn, index, @@ -72,11 +72,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; // Clear the facet number levels. - clear_field_number_levels( - self.wtxn, - self.index.facet_id_f64_docids, - field_id, - )?; + clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; // Compute and store the faceted numbers documents ids. let number_documents_ids = compute_faceted_documents_ids( @@ -96,8 +92,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?; - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?; + self.index.put_string_faceted_documents_ids( + self.wtxn, + field_id, + &string_documents_ids, + )?; + self.index.put_number_faceted_documents_ids( + self.wtxn, + field_id, + &number_documents_ids, + )?; write_into_lmdb_database( self.wtxn, @@ -112,12 +116,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { } } -fn clear_field_number_levels<'t, >( +fn clear_field_number_levels<'t>( wtxn: &'t mut heed::RwTxn, db: heed::Database, field_id: u8, -) -> heed::Result<()> -{ +) -> heed::Result<()> { let left = (field_id, 1, f64::MIN, f64::MIN); let right = (field_id, u8::MAX, f64::MAX, f64::MAX); let range = left..=right; @@ -133,8 +136,7 @@ fn compute_facet_number_levels<'t>( level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, field_id: u8, -) -> Result> -{ +) -> Result> { let first_level_size = db .remap_key_type::() .prefix_iter(rtxn, &[field_id])? @@ -143,9 +145,8 @@ fn compute_facet_number_levels<'t>( // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transfering them. 
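// Editor's sketch, not part of the patch series: an in-memory version of the
// level computation done by `compute_facet_number_levels` here. Each level
// groups `group_size` consecutive entries of the previous level into one
// (left, right) range whose docids are the union of the grouped entries; the
// real code streams level 0 out of LMDB and writes results through a grenad
// writer instead of collecting into a Vec.
use roaring::RoaringBitmap;

fn compute_level(
    previous: &[(f64, f64, RoaringBitmap)],
    group_size: usize,
) -> Vec<(f64, f64, RoaringBitmap)> {
    previous
        .chunks(group_size)
        .map(|group| {
            let left = group.first().unwrap().0; // lowest bound of the group
            let right = group.last().unwrap().1; // highest bound of the group
            let mut docids = RoaringBitmap::new();
            for (_, _, ids) in group {
                docids |= ids; // union of the grouped documents ids
            }
            (left, right, docids)
        })
        .collect()
}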
- let mut writer = tempfile::tempfile().and_then(|file| { - create_writer(compression_type, compression_level, file) - })?; + let mut writer = tempfile::tempfile() + .and_then(|file| create_writer(compression_type, compression_level, file))?; let level_0_range = { let left = (field_id, 0, f64::MIN, f64::MIN); @@ -196,8 +197,7 @@ fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: u8, -) -> Result -{ +) -> Result { let mut documents_ids = RoaringBitmap::new(); for result in db.prefix_iter(rtxn, &[field_id])? { @@ -215,8 +215,7 @@ fn write_number_entry( left: f64, right: f64, ids: &RoaringBitmap, -) -> Result<()> -{ +) -> Result<()> { let key = (field_id, level, left, right); let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 51c8b948a..05242f540 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::fs::File; -use std::io::{self, Seek, SeekFrom, BufReader, BufRead}; +use std::io::{self, BufRead, BufReader, Seek, SeekFrom}; use std::num::{NonZeroU32, NonZeroUsize}; use std::result::Result as StdResult; use std::str; @@ -10,28 +10,26 @@ use std::time::Instant; use bstr::ByteSlice as _; use chrono::Utc; -use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType}; +use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer}; use heed::types::ByteSlice; -use log::{debug, info, error}; +use log::{debug, error, info}; use memmap::Mmap; use rayon::prelude::*; use rayon::ThreadPool; -use serde::{Serialize, Deserialize}; +use serde::{Deserialize, Serialize}; -use crate::error::{Error, InternalError}; -use crate::{Index, Result}; -use crate::update::{ - Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, - WordPrefixPairProximityDocids, -}; -use self::store::{Store, Readers}; pub use self::merge_function::{ - fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first + cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, }; +use self::store::{Readers, Store}; pub use self::transform::{Transform, TransformOutput}; - -use crate::MergeFn; use super::UpdateBuilder; +use crate::error::{Error, InternalError}; +use crate::update::{ + Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, + WordsLevelPositions, WordsPrefixesFst, +}; +use crate::{Index, MergeFn, Result}; mod merge_function; mod store; @@ -48,7 +46,11 @@ pub enum WriteMethod { GetMergePut, } -pub fn create_writer(typ: CompressionType, level: Option, file: File) -> io::Result> { +pub fn create_writer( + typ: CompressionType, + level: Option, + file: File, +) -> io::Result> { let mut builder = Writer::builder(); builder.compression_type(typ); if let Some(level) = level { @@ -64,8 +66,7 @@ pub fn create_sorter( chunk_fusing_shrink_size: Option, max_nb_chunks: Option, max_memory: Option, -) -> Sorter> -{ +) -> Sorter> { let mut builder = Sorter::builder(merge); if let Some(shrink_size) = chunk_fusing_shrink_size { builder.file_fusing_shrink_size(shrink_size); @@ -83,7 +84,10 @@ pub fn create_sorter( builder.build() } -pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> Result> { +pub fn writer_into_reader( + writer: Writer, + shrink_size: Option, +) 
-> Result> { let mut file = writer.into_inner()?; file.seek(SeekFrom::Start(0))?; let file = if let Some(shrink_size) = shrink_size { @@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer, shrink_size: Option) -> Res pub fn merge_readers( sources: Vec>, merge: MergeFn, -) -> Merger> -{ +) -> Merger> { let mut builder = Merger::builder(merge); builder.extend(sources); builder.build() @@ -118,13 +121,7 @@ where let before = Instant::now(); let merger = merge_readers(sources, merge); - merger_iter_into_lmdb_database( - wtxn, - database, - merger.into_merge_iter()?, - merge, - method, - )?; + merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?; debug!("MTBL stores merged in {:.02?}!", before.elapsed()); Ok(()) @@ -149,7 +146,7 @@ where while let Some((k, v)) = reader.next()? { out_iter.append(k, v)?; } - }, + } WriteMethod::GetMergePut => { while let Some((k, v)) = reader.next()? { let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; @@ -158,11 +155,11 @@ where let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; let val = merge(k, &vals)?; iter.put_current(k, &val)?; - }, + } _ => { drop(iter); database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - }, + } } } } @@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database( ) -> Result<()> where Error: From, - Error: From> + Error: From>, { debug!("Writing MTBL sorter..."); let before = Instant::now(); - merger_iter_into_lmdb_database( - wtxn, - database, - sorter.into_iter()?, - merge, - method, - )?; + merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?; debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) @@ -214,7 +205,7 @@ where while let Some((k, v)) = sorter.next()? { out_iter.append(k, v)?; } - }, + } WriteMethod::GetMergePut => { while let Some((k, v)) = sorter.next()? { let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; @@ -226,14 +217,14 @@ where InternalError::IndexingMergingKeys { process: "get-put-merge" } })?; iter.put_current(k, &val)?; - }, + } _ => { drop(iter); database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - }, + } } } - }, + } } Ok(()) @@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // Early return when there is no document to add if reader.buffer().is_empty() { - return Ok(DocumentAdditionResult { - nb_documents: 0, - }) + return Ok(DocumentAdditionResult { nb_documents: 0 }); } self.index.set_updated_at(self.wtxn, &Utc::now())?; @@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let output = match self.update_format { UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?, UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?, - UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?, + UpdateFormat::JsonStream => { + transform.output_from_json_stream(reader, &progress_callback)? + } }; let nb_documents = output.documents_count; @@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result<()> where - F: Fn(UpdateIndexingStep) + Sync + F: Fn(UpdateIndexingStep) + Sync, { let before_indexing = Instant::now(); @@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // settings if none have already been set. 
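// Editor's sketch, not part of the patch series: the two write strategies
// named by `WriteMethod` above, reduced to their essence. `Append` streams
// already-sorted entries into an empty database; `GetMergePut` must first look
// for an existing value and merge with it, which is what the
// prefix_iter_mut/put_current dance does over LMDB. A BTreeMap stands in for
// the database and bitmap union for the merge function.
use std::collections::BTreeMap;
use roaring::RoaringBitmap;

fn get_merge_put(
    db: &mut BTreeMap<Vec<u8>, RoaringBitmap>,
    key: &[u8],
    value: RoaringBitmap,
) {
    match db.get_mut(key) {
        // an entry already exists under this key: merge the new value into it
        Some(previous) => *previous |= value,
        // no existing entry: a plain put is enough
        None => {
            db.insert(key.to_vec(), value);
        }
    }
}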
backup_pool = rayon::ThreadPoolBuilder::new().build()?; &backup_pool - }, + } }; let readers = pool.install(|| { @@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut documents_ids = self.index.documents_ids(self.wtxn)?; let contains_documents = !documents_ids.is_empty(); - let write_method = if contains_documents { - WriteMethod::GetMergePut - } else { - WriteMethod::Append - }; + let write_method = + if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append }; debug!("Writing using the write method: {:?}", write_method); @@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { *self.index.docid_word_positions.as_polymorph(), docid_word_positions_readers, keep_first, - write_method + write_method, )?; database_count += 1; @@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { *self.index.documents.as_polymorph(), documents_readers, keep_first, - write_method + write_method, )?; database_count += 1; @@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { fst_merge, WriteMethod::GetMergePut, )?; - }, + } DatabaseType::WordDocids => { debug!("Writing the words docids into LMDB on disk..."); let db = *self.index.word_docids.as_polymorph(); @@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { roaring_bitmap_merge, write_method, )?; - }, + } DatabaseType::FacetLevel0NumbersDocids => { debug!("Writing the facet numbers docids into LMDB on disk..."); let db = *self.index.facet_id_f64_docids.as_polymorph(); @@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { cbo_roaring_bitmap_merge, write_method, )?; - }, + } DatabaseType::FieldIdWordCountDocids => { debug!("Writing the field id word count docids into LMDB on disk..."); let db = *self.index.field_id_word_count_docids.as_polymorph(); @@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { cbo_roaring_bitmap_merge, write_method, )?; - }, + } DatabaseType::WordLevel0PositionDocids => { debug!("Writing the word level 0 positions docids into LMDB on disk..."); let db = *self.index.word_level_position_docids.as_polymorph(); @@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { #[cfg(test)] mod tests { - use super::*; use heed::EnvOpenOptions; + use super::*; + #[test] fn simple_document_replacement() { let path = tempfile::tempdir().unwrap(); @@ -1053,9 +1042,8 @@ mod tests { assert_eq!(count, 3); let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); - let (kevin_id, _) = docs.iter().find(|(_, d)| { - d.get(0).unwrap() == br#""updated kevin""# - }).unwrap(); + let (kevin_id, _) = + docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); let (id, doc) = docs[*kevin_id as usize]; assert_eq!(id, *kevin_id); diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 94ae12108..7318c5bd0 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -8,25 +8,29 @@ use std::{cmp, iter}; use bstr::ByteSlice as _; use fst::Set; -use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; +use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; use log::{debug, info}; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; +use meilisearch_tokenizer::token::SeparatorKind; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; use 
ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; use tempfile::tempfile; +use super::merge_function::{ + cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, +}; +use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; use crate::error::{Error, InternalError, SerializationError}; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, +}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result}; - -use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; -use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge}; +use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32}; const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_KILOBYTE: usize = 1024 * 1024; @@ -56,7 +60,8 @@ pub struct Store<'s, A> { word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, - words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, + words_pairs_proximities_docids: + LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, @@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { chunk_compression_level: Option, chunk_fusing_shrink_size: Option, stop_words: Option<&'s Set>, - ) -> Result - { + ) -> Result { // We divide the max memory by the number of sorters the Store has. let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); @@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Some(1024 * 1024 * 1024), // 1 GiB ); - let documents_writer = tempfile().and_then(|f| { - create_writer(chunk_compression_type, chunk_compression_level, f) - })?; - let docid_word_positions_writer = tempfile().and_then(|f| { - create_writer(chunk_compression_type, chunk_compression_level, f) - })?; + let documents_writer = tempfile() + .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; + let docid_word_positions_writer = tempfile() + .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; let mut config = AnalyzerConfig::default(); if let Some(stop_words) = stop_words { @@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.word_docids.get_refresh(word.as_bytes()) { - Some(old) => { old.insert(id); }, + Some(old) => { + old.insert(id); + } None => { let word_vec = SmallVec32::from(word.as_bytes()); // A newly inserted element is appended at the end of the linked hash map.
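The `LinkedHashMap` caches above act as bounded LRU maps: `get_refresh` moves an accessed entry to the back, so the front of the map always holds the least recently used entries, which the store later evicts into an on-disk sorter. A rough sketch of that idea (the simplified key and value types and the eviction helper are assumptions of this example, not the store's code):

use linked_hash_map::LinkedHashMap;

// Record that `word` appears in document `id`, evicting the least recently
// used entries once the cache grows past `limit`.
fn insert_with_lru_evict(
    cache: &mut LinkedHashMap<String, Vec<u32>>,
    word: &str,
    id: u32,
    limit: usize,
) -> Vec<(String, Vec<u32>)> {
    match cache.get_refresh(word) {
        // The entry exists: it is refreshed, i.e. moved to the back of the map.
        Some(ids) => ids.push(id),
        // A newly inserted element is appended at the end of the linked hash map.
        None => {
            cache.insert(word.to_string(), vec![id]);
        }
    }
    // Popping from the front removes the least recently used entries; the
    // store writes these to its sorter instead of discarding them.
    let mut evicted = Vec::new();
    while cache.len() > limit {
        evicted.push(cache.pop_front().unwrap());
    }
    evicted
}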
@@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { field_id: FieldId, value: OrderedFloat, id: DocumentId, - ) -> Result<()> - { + ) -> Result<()> { let sorter = &mut self.field_id_docid_facet_numbers_sorter; Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; let key = (field_id, value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_number_docids.get_refresh(&key) { - Some(old) => { old.insert(id); }, + Some(old) => { + old.insert(id); + } None => { // A newly inserted element is appended at the end of the linked hash map. self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); @@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { field_id: FieldId, value: String, id: DocumentId, - ) -> Result<()> - { + ) -> Result<()> { let sorter = &mut self.field_id_docid_facet_strings_sorter; Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; let key = (field_id, value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_string_docids.get_refresh(&key) { - Some(old) => { old.insert(id); }, + Some(old) => { + old.insert(id); + } None => { // A newly inserted element is appended at the end of the linked hash map. self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); @@ -309,10 +315,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // Save the documents ids under the words pairs proximities that it contains. fn insert_words_pairs_proximities_docids<'a>( &mut self, - words_pairs_proximities: impl IntoIterator, + words_pairs_proximities: impl IntoIterator, id: DocumentId, - ) -> Result<()> - { + ) -> Result<()> { for ((w1, w2), prox) in words_pairs_proximities { let w1 = SmallVec32::from(w1.as_bytes()); let w2 = SmallVec32::from(w2.as_bytes()); @@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // if get_refresh finds the element it is assured // to be at the end of the linked hash map. match self.words_pairs_proximities_docids.get_refresh(&key) { - Some(old) => { old.insert(id); }, + Some(old) => { + old.insert(id); + } None => { // A newly inserted element is appended at the end of the linked hash map. let ids = RoaringBitmap::from_iter(Some(id)); @@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // Removing front elements is equivalent to removing the LRUs. let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front()); iter.take(overflow).for_each(|x| lrus.push(x)); - Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?; + Self::write_words_pairs_proximities( + &mut self.words_pairs_proximities_docids_sorter, + lrus, + )?; } Ok(()) @@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { facet_numbers_values: &mut HashMap>, facet_strings_values: &mut HashMap>, record: &[u8], - ) -> Result<()> - { + ) -> Result<()> { // We compute the list of words pairs proximities (self-join) and write it directly to disk.
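The self-join mentioned in this comment pairs every word of the document with every other word and keeps the smallest positional distance between their occurrences, capped at a maximum. A simplified, symmetric version for illustration only (the real `compute_words_pair_proximities`, shown further below, applies its own capping and ordering rules):

use std::collections::HashMap;

// For each ordered pair of distinct words, keep the smallest distance seen
// between any of their positions, capped at MAX_DISTANCE.
fn words_pair_proximities(
    word_positions: &HashMap<String, Vec<u32>>,
) -> HashMap<(String, String), u8> {
    const MAX_DISTANCE: u32 = 8;
    let mut proximities: HashMap<(String, String), u8> = HashMap::new();
    for (w1, positions1) in word_positions {
        for (w2, positions2) in word_positions {
            if w1 == w2 {
                continue;
            }
            for &p1 in positions1 {
                for &p2 in positions2 {
                    let dist = if p1 > p2 { p1 - p2 } else { p2 - p1 };
                    let prox = dist.min(MAX_DISTANCE) as u8;
                    let entry = proximities.entry((w1.clone(), w2.clone())).or_insert(u8::MAX);
                    *entry = (*entry).min(prox);
                }
            }
        }
    }
    proximities
}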
let words_pair_proximities = compute_words_pair_proximities(&words_positions); self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; @@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } self.documents_writer.insert(document_id.to_be_bytes(), record)?; - Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; - Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; + Self::write_docid_word_positions( + &mut self.docid_word_positions_writer, + document_id, + words_positions, + )?; + Self::write_word_position_docids( + &mut self.word_level_position_docids_sorter, + document_id, + words_positions, + )?; words_positions.clear(); @@ -387,7 +404,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn write_words_pairs_proximities( sorter: &mut Sorter>, - iter: impl IntoIterator, SmallVec32, u8), RoaringBitmap)>, + iter: impl IntoIterator, SmallVec32, u8), RoaringBitmap)>, ) -> Result<()> where Error: From, @@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { writer: &mut Writer, id: DocumentId, words_positions: &HashMap>, - ) -> Result<()> - { + ) -> Result<()> { // We prefix the words by the document id. let mut key = id.to_be_bytes().to_vec(); let mut buffer = Vec::new(); @@ -484,12 +500,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_string_docids( - sorter: &mut Sorter>, - iter: I, - ) -> Result<()> + fn write_facet_field_string_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator, + I: IntoIterator, Error: From, { let mut key_buffer = Vec::new(); @@ -510,12 +523,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_number_docids( - sorter: &mut Sorter>, - iter: I, - ) -> Result<()> + fn write_facet_field_number_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator), RoaringBitmap)>, + I: IntoIterator), RoaringBitmap)>, Error: From, { let mut data_buffer = Vec::new(); @@ -579,7 +589,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn write_word_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator, RoaringBitmap)>, + I: IntoIterator, RoaringBitmap)>, Error: From, { let mut key = Vec::new(); @@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { log_every_n: Option, mut progress_callback: F, ) -> Result - where F: FnMut(UpdateIndexingStep), + where + F: FnMut(UpdateIndexingStep), { debug!("{:?}: Indexing in a Store...", thread_index); @@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { if count % num_threads == thread_index { // This is a log routine that we do every `log_every_n` documents. 
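Note how the surrounding loop shards the document stream: every `Store` thread scans all documents but only processes those whose ordinal maps to it, so each document is handled by exactly one thread. The predicate in miniature:

// Round-robin sharding: document `count` belongs to thread `count % num_threads`.
fn is_mine(count: usize, num_threads: usize, thread_index: usize) -> bool {
    count % num_threads == thread_index
}

fn main() {
    let num_threads = 4;
    // Document 6 is handled by thread 2 and by no other thread.
    assert!(is_mine(6, num_threads, 2));
    assert_eq!((0..num_threads).filter(|&t| is_mine(6, num_threads, t)).count(), 1);
}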
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { - info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); + info!( + "We have seen {} documents so far ({:.02?}).", + format_count(count), + before.elapsed() + ); progress_callback(UpdateIndexingStep::IndexDocuments { documents_seen: count, total_documents: documents_count, @@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } for (attr, content) in document.iter() { - if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { - let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; + if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) + { + let value = + serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; let (facet_numbers, facet_strings) = extract_facet_values(&value); - facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); - facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings); + facet_numbers_values + .entry(attr) + .or_insert_with(Vec::new) + .extend(facet_numbers); + facet_strings_values + .entry(attr) + .or_insert_with(Vec::new) + .extend(facet_strings); if self.searchable_fields.contains(&attr) { let content = match json_to_string(&value) { @@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { last_pos = Some(pos); let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position); + words_positions + .entry(token.text().to_string()) + .or_insert_with(SmallVec32::new) + .push(position); } if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { let key = (attr, last_pos as u8 + 1); - self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id); + self.field_id_word_count_docids + .entry(key) + .or_insert_with(RoaringBitmap::new) + .insert(document_id); } } } @@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { self.facet_field_string_docids, )?; - let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut word_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut builder = fst::SetBuilder::memory(); let mut iter = self.word_docids_sorter.into_iter()?; @@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.main_sorter.write_into(&mut main_wtr)?; - let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; + let mut words_pairs_proximities_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.words_pairs_proximities_docids_sorter + .write_into(&mut words_pairs_proximities_docids_wtr)?; - let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut word_level_position_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; - let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut field_id_word_count_docids_wtr = + 
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; - let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut facet_field_numbers_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; - let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + let mut facet_field_strings_docids_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; - let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?; + let mut field_id_docid_facet_numbers_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_numbers_sorter + .write_into(&mut field_id_docid_facet_numbers_wtr)?; - let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?; + let mut field_id_docid_facet_strings_wtr = + tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_strings_sorter + .write_into(&mut field_id_docid_facet_strings_wtr)?; let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; - let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; - let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; - let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; - let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; - let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; - let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; - let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; - let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; + let words_pairs_proximities_docids = + writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; + let word_level_position_docids = + writer_into_reader(word_level_position_docids_wtr, shrink_size)?; + let field_id_word_count_docids = + writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; + let facet_field_numbers_docids = + writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; + let facet_field_strings_docids = + writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; + let field_id_docid_facet_numbers = + writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; + let field_id_docid_facet_strings = + writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; + let docid_word_positions = + writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?; Ok(Readers { @@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { /// close to 
each other. fn compute_words_pair_proximities( word_positions: &HashMap>, -) -> HashMap<(&str, &str), u8> -{ +) -> HashMap<(&str, &str), u8> { use itertools::Itertools; let mut words_pair_proximities = HashMap::new(); @@ -828,31 +875,34 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool { /// take an iterator on tokens and compute their relative position depending on separator kinds /// if it's a `Hard` separator we add an additional relative proximity of 8 between words, /// else we keep the standard proximity of 1 between words. -fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator)> { +fn process_tokens<'a>( + tokens: impl Iterator>, +) -> impl Iterator)> { tokens .skip_while(|token| token.is_separator().is_some()) .scan((0, None), |(offset, prev_kind), token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { - *offset += match *prev_kind { - Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, - Some(_) => 1, - None => 0, - }; - *prev_kind = Some(token.kind) - } - TokenKind::Separator(SeparatorKind::Hard) => { - *prev_kind = Some(token.kind); - } - TokenKind::Separator(SeparatorKind::Soft) - if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => { - *prev_kind = Some(token.kind); - } - _ => (), + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => + { + *prev_kind = Some(token.kind); + } + _ => (), + } Some((*offset, token)) }) - .filter(|(_, t)| t.is_word()) + .filter(|(_, t)| t.is_word()) } fn extract_facet_values(value: &Value) -> (Vec, Vec) { @@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec, Vec) { match value { Value::Null => (), Value::Bool(b) => output_strings.push(b.to_string()), - Value::Number(number) => if let Some(float) = number.as_f64() { - output_numbers.push(float); - }, + Value::Number(number) => { + if let Some(float) = number.as_f64() { + output_numbers.push(float); + } + } Value::String(string) => { let string = string.trim().to_lowercase(); output_strings.push(string); - }, + } Value::Array(values) => if can_recurse { - for value in values { - inner_extract_facet_values(value, false, output_numbers, output_strings); + } + Value::Array(values) => { + if can_recurse { + for value in values { + inner_extract_facet_values(value, false, output_numbers, output_strings); + } } - }, + } Value::Object(_) => (), } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 9e88559d0..756ff492e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -10,14 +10,15 @@ use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use crate::error::{Error, UserError, InternalError}; -use crate::index::db_name; -use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv}; -use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution}; -use crate::{Index, Result}; use super::merge_function::merge_two_obkvs; -use super::{create_writer, create_sorter, IndexDocumentsMethod}; +use
super::{create_sorter, create_writer, IndexDocumentsMethod}; +use crate::error::{Error, InternalError, UserError}; +use crate::index::db_name; +use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs}; +use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::{ + ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32, +}; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -64,7 +65,11 @@ impl Transform<'_, '_> { self.output_from_generic_json(reader, false, progress_callback) } - pub fn output_from_json_stream(self, reader: R, progress_callback: F) -> Result + pub fn output_from_json_stream( + self, + reader: R, + progress_callback: F, + ) -> Result where R: Read, F: Fn(UpdateIndexingStep) + Sync, @@ -86,14 +91,16 @@ impl Transform<'_, '_> { let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); // Deserialize the whole batch of documents in memory. - let mut documents: Peekable>>>> = if is_stream { + let mut documents: Peekable< + Box>>>, + > = if is_stream { let iter = serde_json::Deserializer::from_reader(reader).into_iter(); - let iter = Box::new(iter) as Box>; + let iter = Box::new(iter) as Box>; iter.peekable() } else { let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?; let iter = vec.into_iter().map(Ok); - let iter = Box::new(iter) as Box>; + let iter = Box::new(iter) as Box>; iter.peekable() }; @@ -104,15 +111,16 @@ impl Transform<'_, '_> { Err(_) => { let error = documents.next().unwrap().unwrap_err(); return Err(UserError::SerdeJson(error).into()); - }, + } }; - let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); + let alternative_name = + first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned()); let (primary_key_id, primary_key) = compute_primary_key_pair( self.index.primary_key(self.rtxn)?, &mut fields_ids_map, alternative_name, - self.autogenerate_docids + self.autogenerate_docids, )?; if documents.peek().is_none() { @@ -173,9 +181,11 @@ impl Transform<'_, '_> { Some(value) => match value { Value::String(string) => Cow::Borrowed(string.as_str()), Value::Number(number) => Cow::Owned(number.to_string()), - content => return Err(UserError::InvalidDocumentId { - document_id: content.clone(), - }.into()), + content => { + return Err( + UserError::InvalidDocumentId { document_id: content.clone() }.into() + ) + } }, None => { if !self.autogenerate_docids { @@ -183,7 +193,7 @@ impl Transform<'_, '_> { } let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); Cow::Borrowed(uuid) - }, + } }; // We iterate in the fields ids ordered. @@ -194,7 +204,8 @@ impl Transform<'_, '_> { // and this should be the document id we return the one we generated. if let Some(value) = document.get(name) { // We serialize the attribute values. - serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?; + serde_json::to_writer(&mut json_buffer, value) + .map_err(InternalError::SerdeJson)?; writer.insert(field_id, &json_buffer)?; } @@ -202,7 +213,8 @@ impl Transform<'_, '_> { if field_id == primary_key_id && validate_document_id(&external_id).is_none() { return Err(UserError::InvalidDocumentId { document_id: Value::from(external_id), - }.into()); + } + .into()); } } @@ -248,9 +260,9 @@ impl Transform<'_, '_> { // Extract the position of the primary key in the current headers, None if not found. let primary_key_pos = match self.index.primary_key(self.rtxn)? 
{ Some(primary_key) => { - // The primary key is known so we must find the position in the CSV headers. - headers.iter().position(|h| h == primary_key) - }, + // The primary key is known so we must find the position in the CSV headers. + headers.iter().position(|h| h == primary_key) + } None => headers.iter().position(is_primary_key), }; @@ -261,7 +273,7 @@ impl Transform<'_, '_> { self.index.primary_key(self.rtxn)?, &mut fields_ids_map, alternative_name, - self.autogenerate_docids + self.autogenerate_docids, )?; // The primary key field is not present in the header, so we need to create it. @@ -308,27 +320,30 @@ impl Transform<'_, '_> { // We validate the document id [a-zA-Z0-9\-_]. match validate_document_id(&external_id) { Some(valid) => valid, - None => return Err(UserError::InvalidDocumentId { - document_id: Value::from(external_id), - }.into()), + None => { + return Err(UserError::InvalidDocumentId { + document_id: Value::from(external_id), + } + .into()) + } } - }, + } None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), }; // When the primary_key_field_id is found in the fields ids list // we return the generated document id instead of the record field. - let iter = fields_ids.iter() - .map(|(fi, i)| { - let field = if *fi == primary_key_id { external_id } else { &record[*i] }; - (fi, field) - }); + let iter = fields_ids.iter().map(|(fi, i)| { + let field = if *fi == primary_key_id { external_id } else { &record[*i] }; + (fi, field) + }); // We retrieve the field id based on the fields ids map fields ids order. for (field_id, field) in iter { // We serialize the attribute values as JSON strings. json_buffer.clear(); - serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?; + serde_json::to_writer(&mut json_buffer, &field) + .map_err(InternalError::SerdeJson)?; writer.insert(*field_id, &json_buffer)?; } @@ -410,26 +425,27 @@ impl Transform<'_, '_> { IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), IndexDocumentsMethod::UpdateDocuments => { let key = BEU32::new(docid); - let base_obkv = self.index.documents.get(&self.rtxn, &key)? - .ok_or(InternalError::DatabaseMissingEntry { + let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None, - })?; + }, + )?; let update_obkv = obkv::KvReader::new(update_obkv); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); (docid, obkv_buffer.as_slice()) } } - }, + } None => { // If this user id is new we add it to the external documents ids map // for new ids and into the list of new documents. - let new_docid = available_documents_ids.next() - .ok_or(UserError::DocumentLimitReached)?; + let new_docid = + available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?; new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; new_documents_ids.insert(new_docid); (new_docid, update_obkv) - }, + } }; // We insert the document under the documents ids map into the final file. @@ -450,7 +466,8 @@ impl Transform<'_, '_> { // We create a final writer to write the new documents in order from the sorter. 
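The hunk above is where each incoming document is resolved to an internal id: a known external id reuses its internal id (replace or update), while an unknown one draws a fresh id from the free id space, failing with `DocumentLimitReached` once that space is exhausted. A compressed sketch of the decision, with a plain `HashMap` and an iterator standing in for `ExternalDocumentsIds` and `AvailableDocumentsIds` (both stand-ins are assumptions of this example):

use std::collections::HashMap;

// Returns the internal docid for `external_id` and whether the document is new.
fn resolve_docid(
    external_id: &str,
    external_ids: &mut HashMap<String, u32>,
    available_ids: &mut impl Iterator<Item = u32>,
) -> Result<(u32, bool), &'static str> {
    match external_ids.get(external_id) {
        // The document already exists: reuse its internal id.
        Some(&docid) => Ok((docid, false)),
        None => {
            // New document: take the next free internal id, or fail once the
            // id space is exhausted (the `DocumentLimitReached` case above).
            let docid = available_ids.next().ok_or("document limit reached")?;
            external_ids.insert(external_id.to_string(), docid);
            Ok((docid, true))
        }
    }
}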
let file = tempfile::tempfile()?; - let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; + let mut writer = + create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; // Once we have written all the documents into the final sorter, we write the documents // into this writer, extract the file and reset the seek to be able to read it again. @@ -485,8 +502,7 @@ impl Transform<'_, '_> { primary_key: String, old_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, - ) -> Result - { + ) -> Result { let fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?; @@ -494,7 +510,8 @@ impl Transform<'_, '_> { // We create a final writer to write the new documents in order from the sorter. let file = tempfile::tempfile()?; - let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; + let mut writer = + create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; let mut obkv_buffer = Vec::new(); for result in self.index.documents.iter(self.rtxn)? { @@ -561,20 +578,19 @@ fn compute_primary_key_pair( return Err(UserError::MissingPrimaryKey.into()); } DEFAULT_PRIMARY_KEY_NAME.to_string() - }, + } }; let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; Ok((id, name)) - }, + } } } fn validate_document_id(document_id: &str) -> Option<&str> { let document_id = document_id.trim(); Some(document_id).filter(|id| { - !id.is_empty() && id.chars().all(|c| { - matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_') - }) + !id.is_empty() + && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) }) } @@ -583,8 +599,7 @@ mod test { use super::*; mod compute_primary_key { - use super::compute_primary_key_pair; - use super::FieldsIdsMap; + use super::{compute_primary_key_pair, FieldsIdsMap}; #[test] fn should_return_primary_key_if_is_some() { @@ -594,7 +609,8 @@ mod test { Some("toto"), &mut fields_map, Some("tata".to_string()), - false); + false, + ); assert_eq!(result.unwrap(), (0u8, "toto".to_string())); assert_eq!(fields_map.len(), 1); } @@ -602,11 +618,8 @@ mod test { #[test] fn should_return_alternative_if_primary_is_none() { let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair( - None, - &mut fields_map, - Some("tata".to_string()), - false); + let result = + compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); assert_eq!(result.unwrap(), (0u8, "tata".to_string())); assert_eq!(fields_map.len(), 1); } @@ -614,23 +627,15 @@ mod test { #[test] fn should_return_default_if_both_are_none() { let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair( - None, - &mut fields_map, - None, - true); + let result = compute_primary_key_pair(None, &mut fields_map, None, true); assert_eq!(result.unwrap(), (0u8, "id".to_string())); assert_eq!(fields_map.len(), 1); } #[test] - fn should_return_err_if_both_are_none_and_recompute_is_false(){ + fn should_return_err_if_both_are_none_and_recompute_is_false() { let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair( - None, - &mut fields_map, - None, - false); + let result = compute_primary_key_pair(None, &mut fields_map, None, false); assert!(result.is_err()); assert_eq!(fields_map.len(), 0); } diff --git a/milli/src/update/mod.rs 
b/milli/src/update/mod.rs index 203937e2f..36ed7d8fa 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,7 +2,9 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; pub use self::facets::Facets; -pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat}; +pub use self::index_documents::{ + DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat, +}; pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 8f4fe48c9..c6540b33a 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -34,17 +34,24 @@ impl Setting { } impl Serialize for Setting { - fn serialize(&self, serializer: S) -> StdResult where S: Serializer { + fn serialize(&self, serializer: S) -> StdResult + where + S: Serializer, + { match self { Self::Set(value) => Some(value), // Usually not_set isn't serialized by setting skip_serializing_if field attribute Self::NotSet | Self::Reset => None, - }.serialize(serializer) + } + .serialize(serializer) } } impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting { - fn deserialize(deserializer: D) -> StdResult where D: Deserializer<'de> { + fn deserialize(deserializer: D) -> StdResult + where + D: Deserializer<'de>, + { Deserialize::deserialize(deserializer).map(|x| match x { Some(x) => Self::Set(x), None => Self::Reset, // Reset is forced by sending null value @@ -141,11 +148,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } pub fn set_stop_words(&mut self, stop_words: BTreeSet) { - self.stop_words = if stop_words.is_empty() { - Setting::Reset - } else { - Setting::Set(stop_words) - } + self.stop_words = + if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) } } pub fn reset_distinct_field(&mut self) { @@ -161,11 +165,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } pub fn set_synonyms(&mut self, synonyms: HashMap>) { - self.synonyms = if synonyms.is_empty() { - Setting::Reset - } else { - Setting::Set(synonyms) - } + self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) } } pub fn reset_primary_key(&mut self) { @@ -178,7 +178,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where - F: Fn(UpdateIndexingStep, u64) + Sync + F: Fn(UpdateIndexingStep, u64) + Sync, { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let update_id = self.update_id; @@ -203,7 +203,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }; // There already has been a document addition, the primary key should be set by now. - let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; + let primary_key = + self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; // We remap the documents fields based on the new `FieldsIdsMap`. 
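The `Setting<T>` (de)serialization reshaped above encodes a three-state value on the wire: an absent field means `NotSet`, an explicit `null` means `Reset`, and any other value means `Set`. The mapping spelled out with a double `Option`, a didactic stand-in for what serde's field `default` combined with `Option` deserialization achieves:

#[derive(Debug, PartialEq)]
enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

// Outer Option: was the field present at all? Inner Option: was it null?
fn from_wire<T>(field: Option<Option<T>>) -> Setting<T> {
    match field {
        None => Setting::NotSet,
        Some(None) => Setting::Reset,
        Some(Some(value)) => Setting::Set(value),
    }
}

fn main() {
    assert_eq!(from_wire::<u32>(None), Setting::NotSet);
    assert_eq!(from_wire::<u32>(Some(None)), Setting::Reset);
    assert_eq!(from_wire(Some(Some(42))), Setting::Set(42));
}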
let output = transform.remap_index_documents( @@ -236,21 +237,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account - let names: Vec<_> = fields - .iter() - .unique() - .map(String::as_str) - .collect(); + let names: Vec<_> = fields.iter().unique().map(String::as_str).collect(); for name in names.iter() { - fields_ids_map - .insert(name) - .ok_or(UserError::AttributeLimitReached)?; + fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_displayed_fields(self.wtxn, &names)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; } + Setting::Reset => { + self.index.delete_displayed_fields(self.wtxn)?; + } Setting::NotSet => return Ok(false), } Ok(true) @@ -260,14 +257,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match self.distinct_field { Setting::Set(ref attr) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - fields_ids_map - .insert(attr) - .ok_or(UserError::AttributeLimitReached)?; + fields_ids_map.insert(attr).ok_or(UserError::AttributeLimitReached)?; self.index.put_distinct_field(self.wtxn, &attr)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; }, + Setting::Reset => { + self.index.delete_distinct_field(self.wtxn)?; + } Setting::NotSet => return Ok(false), } Ok(true) @@ -285,30 +282,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let mut new_fields_ids_map = FieldsIdsMap::new(); // fields are deduplicated, only the first occurrence is taken into account - let names = fields - .iter() - .unique() - .map(String::as_str) - .collect::>(); + let names = fields.iter().unique().map(String::as_str).collect::>(); // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { - new_fields_ids_map - .insert(&name) - .ok_or(UserError::AttributeLimitReached)?; + new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; } for (_, name) in old_fields_ids_map.iter() { - new_fields_ids_map - .insert(&name) - .ok_or(UserError::AttributeLimitReached)?; + new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_searchable_fields(self.wtxn, &names)?; self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; } - Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; } + Setting::Reset => { + self.index.delete_searchable_fields(self.wtxn)?; + } Setting::NotSet => return Ok(false), } Ok(true) @@ -323,7 +314,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let fst = fst::Set::from_iter(stop_words)?; // Does the new FST differ from the previous one? - if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) { + if current + .map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) + { // we want to re-create our FST. 
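Stop words are stored as an FST, and the update above only rewrites it when the freshly built set differs byte-for-byte from the stored one. A sketch of that check, assuming the same `fst` crate API the hunk uses (error plumbing simplified):

use std::collections::BTreeSet;

use fst::Set;

// Rebuild the stop-words FST and report whether it differs from the stored
// one; `Set::from_iter` needs sorted, deduplicated input, which the BTreeSet
// guarantees.
fn stop_words_changed(
    current: Option<&Set<Vec<u8>>>,
    words: &BTreeSet<String>,
) -> Result<bool, fst::Error> {
    let new = Set::from_iter(words)?;
    Ok(current.map_or(true, |cur| cur.as_fst().as_bytes() != new.as_fst().as_bytes()))
}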
self.index.put_stop_words(self.wtxn, &fst)?; Ok(true) @@ -343,9 +336,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { analyzer .analyze(text) .tokens() - .filter_map(|token| - if token.is_word() { Some(token.text().to_string()) } else { None } - ) + .filter_map(|token| { + if token.is_word() { + Some(token.text().to_string()) + } else { + None + } + }) .collect::>() } @@ -360,25 +357,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { for (word, synonyms) in synonyms { // Normalize both the word and associated synonyms. let normalized_word = normalize(&analyzer, word); - let normalized_synonyms = synonyms - .iter() - .map(|synonym| normalize(&analyzer, synonym)); + let normalized_synonyms = + synonyms.iter().map(|synonym| normalize(&analyzer, synonym)); // Store the normalized synonyms under the normalized word, // merging the possible duplicate words. - let entry = new_synonyms - .entry(normalized_word) - .or_insert_with(Vec::new); + let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new); entry.extend(normalized_synonyms); } // Make sure that we don't have duplicate synonyms. - new_synonyms - .iter_mut() - .for_each(|(_, synonyms)| { - synonyms.sort_unstable(); - synonyms.dedup(); - }); + new_synonyms.iter_mut().for_each(|(_, synonyms)| { + synonyms.sort_unstable(); + synonyms.dedup(); + }); let old_synonyms = self.index.synonyms(self.wtxn)?; @@ -406,7 +398,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_filterable_fields(self.wtxn, &new_facets)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; } + Setting::Reset => { + self.index.delete_filterable_fields(self.wtxn)?; + } Setting::NotSet => (), } Ok(()) @@ -427,7 +421,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_criteria(self.wtxn, &new_criteria)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_criteria(self.wtxn)?; } + Setting::Reset => { + self.index.delete_criteria(self.wtxn)?; + } Setting::NotSet => (), } Ok(()) @@ -445,7 +441,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } else { Err(UserError::PrimaryKeyCannotBeChanged.into()) } - }, + } Setting::Reset => { if self.index.number_of_documents(&self.wtxn)? == 0 { self.index.delete_primary_key(self.wtxn)?; @@ -453,14 +449,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } else { Err(UserError::PrimaryKeyCannotBeReset.into()) } - }, + } Setting::NotSet => Ok(()), } } pub fn execute(mut self, progress_callback: F) -> Result<()> - where - F: Fn(UpdateIndexingStep, u64) + Sync + where + F: Fn(UpdateIndexingStep, u64) + Sync, { self.index.set_updated_at(self.wtxn, &Utc::now())?; @@ -493,17 +489,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { #[cfg(test)] mod tests { - use heed::EnvOpenOptions; - use heed::types::ByteSlice; - use maplit::{btreeset, hashmap, hashset}; use big_s::S; + use heed::types::ByteSlice; + use heed::EnvOpenOptions; + use maplit::{btreeset, hashmap, hashset}; + use super::*; use crate::error::Error; use crate::update::{IndexDocuments, UpdateFormat}; use crate::{Criterion, FilterCondition, SearchResult}; - use super::*; - #[test] fn set_and_reset_searchable_fields() { let path = tempfile::tempdir().unwrap(); @@ -674,7 +669,7 @@ mod tests { // Set the filterable fields to be the age. 
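Synonym updates normalize both sides of every mapping through the analyzer, then sort and deduplicate each synonym list, as the hunk above shows. A reduced version where a lowercasing, whitespace-collapsing `normalize` stands in for the analyzer-based one:

use std::collections::HashMap;

fn normalize(text: &str) -> String {
    text.to_lowercase().split_whitespace().collect::<Vec<_>>().join(" ")
}

// Group normalized synonyms under their normalized word, then sort and
// deduplicate every synonym list.
fn normalize_synonyms(synonyms: HashMap<String, Vec<String>>) -> HashMap<String, Vec<String>> {
    let mut new_synonyms: HashMap<String, Vec<String>> = HashMap::new();
    for (word, syns) in synonyms {
        let entry = new_synonyms.entry(normalize(&word)).or_insert_with(Vec::new);
        entry.extend(syns.iter().map(|s| normalize(s)));
    }
    for syns in new_synonyms.values_mut() {
        syns.sort_unstable();
        syns.dedup();
    }
    new_synonyms
}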
let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset!{ S("age") }); + builder.set_filterable_fields(hashset! { S("age") }); builder.execute(|_, _| ()).unwrap(); // Then index some documents. @@ -692,12 +687,15 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.filterable_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashset!{ S("age") }); + assert_eq!(fields_ids, hashset! { S("age") }); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. - let count = index.facet_id_f64_docids + let count = index + .facet_id_f64_docids .remap_key_type::() - .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + .prefix_iter(&rtxn, &[0, 0]) + .unwrap() + .count(); assert_eq!(count, 3); drop(rtxn); @@ -718,9 +716,12 @@ mod tests { let rtxn = index.read_txn().unwrap(); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. - let count = index.facet_id_f64_docids + let count = index + .facet_id_f64_docids .remap_key_type::() - .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + .prefix_iter(&rtxn, &[0, 0]) + .unwrap() + .count(); assert_eq!(count, 4); } @@ -969,7 +970,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); - builder.set_filterable_fields(hashset!{ S("age"), S("toto") }); + builder.set_filterable_fields(hashset! { S("age"), S("toto") }); builder.set_criteria(vec!["asc(toto)".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 1d0e776b1..2816ebca0 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -1,8 +1,8 @@ use grenad::CompressionType; use rayon::ThreadPool; +use super::{ClearDocuments, DeleteDocuments, Facets, IndexDocuments, Settings}; use crate::{Index, Result}; -use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, @@ -67,8 +67,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> ClearDocuments<'t, 'u, 'i> - { + ) -> ClearDocuments<'t, 'u, 'i> { ClearDocuments::new(wtxn, index, self.update_id) } @@ -76,8 +75,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> Result> - { + ) -> Result> { DeleteDocuments::new(wtxn, index, self.update_id) } @@ -85,8 +83,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> IndexDocuments<'t, 'u, 'i, 'a> - { + ) -> IndexDocuments<'t, 'u, 'i, 'a> { let mut builder = IndexDocuments::new(wtxn, index, self.update_id); builder.log_every_n = self.log_every_n; @@ -105,8 +102,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> Settings<'a, 't, 'u, 'i> - { + ) -> Settings<'a, 't, 'u, 'i> { let mut builder = Settings::new(wtxn, index, self.update_id); builder.log_every_n = self.log_every_n; @@ -125,8 +121,7 @@ impl<'a> UpdateBuilder<'a> { self, wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> Facets<'t, 'u, 'i> - { + ) -> Facets<'t, 'u, 'i> { let mut builder = Facets::new(wtxn, index, self.update_id); builder.chunk_compression_type 
= self.chunk_compression_type; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index a2197b28c..ffc359719 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -1,15 +1,13 @@ use std::str; -use crate::Index; use fst::Streamer; use grenad::CompressionType; use heed::types::ByteSlice; -use crate::Result; -use crate::update::index_documents::WriteMethod; use crate::update::index_documents::{ - create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, + create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod, }; +use crate::{Index, Result}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -22,7 +20,10 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { } impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, index, diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 9019b26e5..9b876321e 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,18 +1,17 @@ use std::str; use fst::automaton::{Automaton, Str}; -use fst::{Streamer, IntoStreamer}; +use fst::{IntoStreamer, Streamer}; use grenad::CompressionType; -use heed::BytesEncode; use heed::types::ByteSlice; +use heed::BytesEncode; use log::debug; -use crate::{Index, Result}; use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ - WriteMethod, create_sorter, sorter_into_lmdb_database, - cbo_roaring_bitmap_merge, + cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod, }; +use crate::{Index, Result}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -28,8 +27,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> - { + ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { WordPrefixPairProximityDocids { wtxn, index, diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index e2e3f7b4c..d43cd19b8 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -1,25 +1,23 @@ -use std::{cmp, str}; use std::convert::TryFrom; use std::fs::File; use std::num::NonZeroU32; +use std::{cmp, str}; use fst::automaton::{self, Automaton}; -use fst::{Streamer, IntoStreamer}; -use grenad::{CompressionType, Reader, Writer, FileFuse}; +use fst::{IntoStreamer, Streamer}; +use grenad::{CompressionType, FileFuse, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use crate::error::InternalError; -use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; -use crate::Result; -use crate::update::index_documents::WriteMethod; +use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; use crate::update::index_documents::{ - create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, - cbo_roaring_bitmap_merge, sorter_into_lmdb_database + cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, + 
write_into_lmdb_database, writer_into_reader, WriteMethod, }; -use crate::{Index, TreeLevel}; +use crate::{Index, Result, TreeLevel}; pub struct WordsLevelPositions<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -34,7 +32,10 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { } impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordsLevelPositions<'t, 'u, 'i> { WordsLevelPositions { wtxn, index, @@ -144,7 +145,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_level_position_docids.as_polymorph(), entries, - |_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }), + |_, _| { + Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }) + }, WriteMethod::Append, )?; @@ -176,13 +179,11 @@ fn compute_positions_levels( shrink_size: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, -) -> Result> -{ +) -> Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the level entries into a grenad file before transferring them. - let mut writer = tempfile::tempfile().and_then(|file| { - create_writer(compression_type, compression_level, file) - })?; + let mut writer = tempfile::tempfile() + .and_then(|file| create_writer(compression_type, compression_level, file))?; for result in words_db.iter(rtxn)? { let (word, ()) = result?; @@ -193,7 +194,8 @@ fn compute_positions_levels( left..=right }; - let first_level_size = words_positions_db.remap_data_type::() + let first_level_size = words_positions_db + .remap_data_type::() .range(rtxn, &level_0_range)?
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; @@ -253,8 +255,7 @@ fn write_level_entry( left: u32, right: u32, ids: &RoaringBitmap, -) -> Result<()> -{ +) -> Result<()> { let key = (word, level, left, right); let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index d1aa267b8..f35dea10d 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -2,7 +2,8 @@ use std::iter::FromIterator; use std::str; use fst::Streamer; -use crate::{Index, SmallString32, Result}; + +use crate::{Index, Result, SmallString32}; pub struct WordsPrefixesFst<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -17,8 +18,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, update_id: u64, - ) -> WordsPrefixesFst<'t, 'u, 'i> - { + ) -> WordsPrefixesFst<'t, 'u, 'i> { WordsPrefixesFst { wtxn, index, @@ -55,7 +55,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); for n in 1..=self.max_prefix_length { - let mut current_prefix = SmallString32::new(); let mut current_prefix_count = 0; let mut builder = fst::SetBuilder::memory(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 8c63e5e08..7842b6c13 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,9 +1,8 @@ -use milli::{Criterion, Index, DocumentId}; -use milli::update::{IndexDocuments, UpdateFormat, Settings}; - use big_s::S; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; +use milli::update::{IndexDocuments, Settings, UpdateFormat}; +use milli::{Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -11,7 +10,8 @@ mod query_criteria; pub const TEST_QUERY: &'static str = "hello world america"; -pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; +pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = + &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; pub const CONTENT: &str = include_str!("../assets/test_set.ndjson"); @@ -27,16 +27,16 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let criteria = criteria.iter().map(|c| c.to_string()).collect(); builder.set_criteria(criteria); - builder.set_filterable_fields(hashset!{ + builder.set_filterable_fields(hashset! { S("tag"), S("asc_desc_rank"), }); - builder.set_synonyms(hashmap!{ + builder.set_synonyms(hashmap! 
{ S("hello") => vec![S("good morning")], S("world") => vec![S("earth")], S("america") => vec![S("the united states")], }); - builder.set_searchable_fields(vec![S("title"),S("description")]); + builder.set_searchable_fields(vec![S("title"), S("description")]); builder.execute(|_, _| ()).unwrap(); // index documents @@ -53,12 +53,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec { let mut rtxn = index.read_txn().unwrap(); let docid_map = index.external_documents_ids(&mut rtxn).unwrap(); - let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); + let docid_map: std::collections::HashMap<_, _> = + EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() } -pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_words: bool) -> Vec { - let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); +pub fn expected_order( + criteria: &[Criterion], + authorize_typo: bool, + optional_words: bool, +) -> Vec { + let dataset = + serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); let mut groups: Vec> = vec![dataset]; for criterion in criteria { @@ -67,32 +73,36 @@ pub fn expected_order(criteria: &[Criterion], authorize_typo: bool, optional_wor match criterion { Criterion::Attribute => { group.sort_by_key(|d| d.attribute_rank); - new_groups.extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); - }, + new_groups + .extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); + } Criterion::Exactness => { group.sort_by_key(|d| d.exact_rank); new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from)); - }, + } Criterion::Proximity => { group.sort_by_key(|d| d.proximity_rank); - new_groups.extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); - }, + new_groups + .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); + } Criterion::Typo => { group.sort_by_key(|d| d.typo_rank); new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); - }, + } Criterion::Words => { group.sort_by_key(|d| d.word_rank); new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from)); - }, + } Criterion::Asc(field_name) if field_name == "asc_desc_rank" => { group.sort_by_key(|d| d.asc_desc_rank); - new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); - }, - Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { + new_groups + .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); + } + Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); - new_groups.extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); - }, + new_groups + .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); + } Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), } } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 2b9c5ae5e..19173bc72 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,9 +1,9 @@ use big_s::S; use milli::update::Settings; -use milli::{Search, SearchResult, Criterion}; 
+use milli::{Criterion, Search, SearchResult}; +use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; -use Criterion::*; const ALLOW_TYPOS: bool = true; const DISALLOW_TYPOS: bool = false; @@ -35,29 +35,54 @@ macro_rules! test_criterion { } } +#[rustfmt::skip] test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS); +#[rustfmt::skip] test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS); +#[rustfmt::skip] test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words); +#[rustfmt::skip] test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute); +#[rustfmt::skip] test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute); +#[rustfmt::skip] test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness); +#[rustfmt::skip] test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness); +#[rustfmt::skip] test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity); +#[rustfmt::skip] test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity); +#[rustfmt::skip] test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank"))); +#[rustfmt::skip] test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank"))); +#[rustfmt::skip] test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank"))); +#[rustfmt::skip] test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank"))); +#[rustfmt::skip] test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field"))); +#[rustfmt::skip] test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field"))); +#[rustfmt::skip] test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field"))); +#[rustfmt::skip] test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field"))); #[test] fn criteria_mixup() { use Criterion::*; - let index = search::setup_search_index_with_criteria(&vec![Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, Typo]); + let index = search::setup_search_index_with_criteria(&vec![ + Words, + Attribute, + Desc(S("asc_desc_rank")), + Exactness, + Proximity, + Typo, + ]); + #[rustfmt::skip] let criteria_mix = { // Criterion doesn't implement Copy, we create a new Criterion using a closure let desc = || Desc(S("asc_desc_rank")); @@ -205,10 +230,11 @@ fn criteria_mixup() { let SearchResult { documents_ids, .. } = search.execute().unwrap(); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) - .into_iter() - .map(|d| d.id) - .collect(); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) + .into_iter() + .map(|d| d.id) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); diff --git a/script/pre-commit b/script/pre-commit new file mode 100755 index 000000000..4819a3b52 --- /dev/null +++ b/script/pre-commit @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +cargo check --workspace --all-targets &>/dev/null +result=$? 
+ +if [[ ${result} -ne 0 ]] ; then + cat <<\EOF +The project does not compile. You might want to fix your errors before committing. + +If you still want to commit you can do it by appending +--no-verify +at the end of your previous command. + +If you are running a variant of bash, you can directly paste this command in your terminal: +!! --no-verify +EOF + exit 1 +fi + +cargo fmt --all -- --check &>/dev/null +result=$? + +if [[ ${result} -ne 0 ]] ; then + cat <<\EOF +The project is badly formatted. Please run: +cargo fmt --all + +If you want to create your commit without proper formatting you can add +--no-verify +at the end of your commit command. + +If you are running a variant of bash, you can directly paste this command in your terminal: +!! --no-verify +EOF + exit 1 +fi diff --git a/qc_loop.sh b/script/qc_loop.sh similarity index 100% rename from qc_loop.sh rename to script/qc_loop.sh diff --git a/search/src/main.rs b/search/src/main.rs index f7f95b730..fba714dab 100644 --- a/search/src/main.rs +++ b/search/src/main.rs @@ -6,10 +6,9 @@ use std::time::Instant; use byte_unit::Byte; use heed::EnvOpenOptions; use log::debug; +use milli::{obkv_to_json, Index}; use structopt::StructOpt; -use milli::{Index, obkv_to_json}; - #[cfg(target_os = "linux")] #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; @@ -86,7 +85,8 @@ fn main() -> anyhow::Result<()> { } if opt.print_facet_distribution { - let facets = index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; + let facets = + index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; serde_json::to_writer(&mut stdout, &facets)?; let _ = writeln!(&mut stdout); } From abbebad669a14e070aed127fe5deda14b926019d Mon Sep 17 00:00:00 2001 From: marin postma Date: Thu, 17 Jun 2021 11:44:01 +0200 Subject: [PATCH 0808/1889] change sub errors visibility --- milli/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 201035a8a..8db608561 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; pub use self::criterion::{default_criteria, Criterion}; -pub use self::error::Error; +pub use self::error::{Error, InternalError, UserError}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ From 70bee7d405711d5e6d24b62710e92671be5ac67a Mon Sep 17 00:00:00 2001 From: marin Date: Thu, 17 Jun 2021 11:49:03 +0200 Subject: [PATCH 0809/1889] re-export remaining error types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 8db608561..7571d8a53 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; pub use self::criterion::{default_criteria, Criterion}; -pub use self::error::{Error, InternalError, UserError}; +pub use self::error::{Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ From 9f4184208eb9f70f5d52a98ac1ddd7a75da9da17 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 17 Jun 2021 13:56:09 +0200 Subject: [PATCH 0810/1889] Add test on filters ---
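Note on the convention these tests rely on: a filter is handed to the test macro as a Vec<Either<Vec<&str>, &str>>, where the elements of the outer Vec are ANDed together, a Right("...") element is a single condition, and a Left(vec!["...", "..."]) element is an OR over its conditions — the same semantics that the expected_filtered_ids helper below reproduces with set unions and intersections. A minimal sketch of the shape (illustrative only; the filter strings here are examples, not taken from the patch):

    use either::{Either, Left, Right};

    // Reads as: tag = red AND (asc_desc_rank = 1 OR asc_desc_rank = 3)
    let filter: Vec<Either<Vec<&str>, &str>> = vec![
        Right("tag=red"),
        Left(vec!["asc_desc_rank=1", "asc_desc_rank=3"]),
    ];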
milli/tests/assets/test_set.ndjson | 2 +- milli/tests/search/filters.rs | 84 ++++++++++++++++++++++++++++ milli/tests/search/mod.rs | 60 +++++++++++++++++++- milli/tests/search/query_criteria.rs | 50 +++++++++-------- 4 files changed, 171 insertions(+), 25 deletions(-) create mode 100644 milli/tests/search/filters.rs diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index f219ab7e9..599d479ed 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -2,7 +2,7 @@ {"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} {"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} {"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} {"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} {"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} {"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs new file mode 100644 index 000000000..790bd4a58 --- /dev/null +++ b/milli/tests/search/filters.rs @@ -0,0 +1,84 @@ +use either::{Either, Left, Right}; +use milli::{Criterion, FilterCondition, Search, SearchResult}; +use Criterion::*; + +use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; + +macro_rules! 
test_filter { + ($func:ident, $filter:expr) => { + #[test] + fn $func() { + let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; + let index = search::setup_search_index_with_criteria(&criteria); + let mut rtxn = index.read_txn().unwrap(); + + let filter_conditions = + FilterCondition::from_array::, &str>>, _, _, _>( + &rtxn, &index, $filter, + ) + .unwrap() + .unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.authorize_typos(true); + search.optional_words(true); + search.filter(filter_conditions); + + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + let filtered_ids = search::expected_filtered_ids($filter); + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true) + .into_iter() + .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) + .collect(); + + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + assert_eq!(documents_ids, expected_external_ids); + } + }; +} + +#[rustfmt::skip] +test_filter!(eq_simple_string_filter, vec![Right("tag=red")]); +#[rustfmt::skip] +test_filter!(eq_simple_number_filter, vec![Right("asc_desc_rank=1")]); +#[rustfmt::skip] +test_filter!(eq_string_and_filter_return_empty, vec![Right("tag=red"), Right("tag=green")]); +#[rustfmt::skip] +test_filter!(eq_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank=1")]); +#[rustfmt::skip] +test_filter!(eq_string_or_filter, vec![Left(vec!["tag=red", "tag=green"])]); +#[rustfmt::skip] +test_filter!(eq_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank=1"])]); +#[rustfmt::skip] +test_filter!(eq_number_or_filter, vec![Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); +#[rustfmt::skip] +test_filter!(eq_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank=3")]); +#[rustfmt::skip] +test_filter!(eq_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); +#[rustfmt::skip] +test_filter!(greater_simple_number_filter, vec![Right("asc_desc_rank>1")]); +#[rustfmt::skip] +test_filter!(greater_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank>1")]); +#[rustfmt::skip] +test_filter!(greater_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank>1"])]); +#[rustfmt::skip] +test_filter!(greater_number_or_filter, vec![Left(vec!["asc_desc_rank>3", "asc_desc_rank>1"])]); +#[rustfmt::skip] +test_filter!(greater_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank>3")]); +#[rustfmt::skip] +test_filter!(greater_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank>3", "asc_desc_rank>1"])]); +#[rustfmt::skip] +test_filter!(lower_simple_number_filter, vec![Right("asc_desc_rank<1")]); +#[rustfmt::skip] +test_filter!(lower_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank<1")]); +#[rustfmt::skip] +test_filter!(lower_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank<1"])]); +#[rustfmt::skip] +test_filter!(lower_number_or_filter, vec![Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])]); +#[rustfmt::skip] +test_filter!(lower_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank<3")]); +#[rustfmt::skip] +test_filter!(lower_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 7842b6c13..e48d7704d 100644 --- 
a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,4 +1,7 @@ +use std::collections::HashSet; + use big_s::S; +use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::update::{IndexDocuments, Settings, UpdateFormat}; @@ -6,6 +9,7 @@ use milli::{Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; +mod filters; mod query_criteria; pub const TEST_QUERY: &'static str = "hello world america"; @@ -120,7 +124,61 @@ pub fn expected_order( } } -#[derive(Debug, Clone, Deserialize)] +fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> { + let mut id = None; + if let Some((field, filter)) = filter.split_once("=") { + println!("eq on field {} with filter {}", field, filter); + if field == "tag" && document.tag == filter { + id = Some(document.id.clone()) + } else if field == "asc_desc_rank" + && document.asc_desc_rank == filter.parse::<u32>().unwrap() + { + id = Some(document.id.clone()) + } + } else if let Some(("asc_desc_rank", filter)) = filter.split_once("<") { + println!("lower on field asc_desc_rank with filter {}", filter); + if document.asc_desc_rank < filter.parse().unwrap() { + id = Some(document.id.clone()) + } + } else if let Some(("asc_desc_rank", filter)) = filter.split_once(">") { + println!("higher on field asc_desc_rank with filter {}", filter); + if document.asc_desc_rank > filter.parse().unwrap() { + id = Some(document.id.clone()) + } + } + id +} + +pub fn expected_filtered_ids(filters: Vec<Either<Vec<&str>, &str>>) -> HashSet<String> { + let dataset: HashSet<TestDocument> = + serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); + + let mut filtered_ids: HashSet<_> = dataset.iter().map(|d| d.id.clone()).collect(); + for either in filters { + let ids = match either { + Left(array) => array + .into_iter() + .map(|f| { + let ids: HashSet<String> = + dataset.iter().filter_map(|d| execute_filter(f, d)).collect(); + ids + }) + .reduce(|a, b| a.union(&b).cloned().collect()) + .unwrap(), + Right(filter) => { + let ids: HashSet<String> = + dataset.iter().filter_map(|d| execute_filter(filter, d)).collect(); + ids + } + }; + + filtered_ids = filtered_ids.intersection(&ids).cloned().collect(); + } + + filtered_ids +} + +#[derive(Debug, Clone, Deserialize, PartialEq, Eq, Hash)] pub struct TestDocument { pub id: String, pub word_rank: u32, diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 19173bc72..7ab9897b3 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -11,10 +11,10 @@ const ALLOW_OPTIONAL_WORDS: bool = true; const DISALLOW_OPTIONAL_WORDS: bool = false; macro_rules! test_criterion { - ($func:ident, $optional_word:ident, $authorize_typos:ident $(, $criterion:expr)?) => { + ($func:ident, $optional_word:ident, $authorize_typos:ident , $criterion:expr) => { #[test] fn $func() { - let criteria = vec![$($criterion)?]; + let criteria = $criterion; let index = search::setup_search_index_with_criteria(&criteria); let mut rtxn = index.read_txn().unwrap(); @@ -26,49 +26,53 @@ macro_rules! test_criterion { let SearchResult { documents_ids, ..
} = search.execute().unwrap(); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, $authorize_typos, $optional_word) - .into_iter() - .map(|d| d.id).collect(); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, $authorize_typos, $optional_word) + .into_iter() + .map(|d| d.id) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); } - } + }; } #[rustfmt::skip] -test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS); +test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![]); #[rustfmt::skip] -test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS); +test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![]); #[rustfmt::skip] -test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Words); +test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words]); #[rustfmt::skip] -test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Attribute); +test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Attribute]); #[rustfmt::skip] -test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Attribute); +test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Attribute]); #[rustfmt::skip] -test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Exactness); +test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Exactness]); #[rustfmt::skip] -test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Exactness); +test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Exactness]); #[rustfmt::skip] -test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Proximity); +test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Proximity]); #[rustfmt::skip] -test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Proximity); +test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Proximity]); #[rustfmt::skip] -test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("asc_desc_rank"))); +test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Asc(S("asc_desc_rank"))]); #[rustfmt::skip] -test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("asc_desc_rank"))); +test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Asc(S("asc_desc_rank"))]); #[rustfmt::skip] -test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("asc_desc_rank"))); +test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Desc(S("asc_desc_rank"))]); #[rustfmt::skip] -test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("asc_desc_rank"))); +test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Desc(S("asc_desc_rank"))]); #[rustfmt::skip] -test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Asc(S("unexisting_field"))); +test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Asc(S("unexisting_field"))]); #[rustfmt::skip] -test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Asc(S("unexisting_field"))); 
+test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Asc(S("unexisting_field"))]); #[rustfmt::skip] -test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, Desc(S("unexisting_field"))); +test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Desc(S("unexisting_field"))]); #[rustfmt::skip] -test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, Desc(S("unexisting_field"))); +test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Desc(S("unexisting_field"))]); +#[rustfmt::skip] +test_criterion!(default_criteria_order, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words, Typo, Proximity, Attribute, Exactness]); #[test] fn criteria_mixup() { From f496cd320d39c7219cc1679c3086cdb002561589 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 17 Jun 2021 14:24:59 +0200 Subject: [PATCH 0811/1889] Add distinct integration tests --- milli/tests/search/distinct.rs | 76 ++++++++++++++++++++++++++++ milli/tests/search/mod.rs | 1 + milli/tests/search/query_criteria.rs | 4 +- 3 files changed, 79 insertions(+), 2 deletions(-) create mode 100644 milli/tests/search/distinct.rs diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs new file mode 100644 index 000000000..1204a6bfb --- /dev/null +++ b/milli/tests/search/distinct.rs @@ -0,0 +1,76 @@ +use std::collections::HashSet; + +use big_s::S; +use milli::update::Settings; +use milli::{Criterion, Search, SearchResult}; +use Criterion::*; + +use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; + +macro_rules! test_distinct { + ($func:ident, $distinct:ident, $criteria:expr) => { + #[test] + fn $func() { + let criteria = $criteria; + let index = search::setup_search_index_with_criteria(&criteria); + + // update distinct attribute + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_distinct_field(S(stringify!($distinct))); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.authorize_typos(true); + search.optional_words(true); + + let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + + let mut distinct_values = HashSet::new(); + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true) + .into_iter() + .filter_map(|d| { + if distinct_values.contains(&d.$distinct) { + None + } else { + distinct_values.insert(d.$distinct.to_owned()); + Some(d.id) + } + }) + .collect(); + + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + assert_eq!(documents_ids, expected_external_ids); + } + }; +} + +#[rustfmt::skip] +test_distinct!(distinct_string_default_criteria, tag, vec![Words, Typo, Proximity, Attribute, Exactness]); +#[rustfmt::skip] +test_distinct!(distinct_number_default_criteria, asc_desc_rank, vec![Words, Typo, Proximity, Attribute, Exactness]); +#[rustfmt::skip] +test_distinct!(distinct_string_criterion_words, tag, vec![Words]); +#[rustfmt::skip] +test_distinct!(distinct_number_criterion_words, asc_desc_rank, vec![Words]); +#[rustfmt::skip] +test_distinct!(distinct_string_criterion_words_typo, tag, vec![Words, Typo]); +#[rustfmt::skip] +test_distinct!(distinct_number_criterion_words_typo, asc_desc_rank, vec![Words, Typo]); +#[rustfmt::skip] +test_distinct!(distinct_string_criterion_words_proximity, tag, vec![Words, Proximity]); +#[rustfmt::skip] +test_distinct!(distinct_number_criterion_words_proximity, asc_desc_rank, vec![Words, Proximity]); +#[rustfmt::skip] +test_distinct!(distinct_string_criterion_words_attribute, tag, vec![Words, Attribute]); +#[rustfmt::skip] +test_distinct!(distinct_number_criterion_words_attribute, asc_desc_rank, vec![Words, Attribute]); +#[rustfmt::skip] +test_distinct!(distinct_string_criterion_words_exactness, tag, vec![Words, Exactness]); +#[rustfmt::skip] +test_distinct!(distinct_number_criterion_words_exactness, asc_desc_rank, vec![Words, Exactness]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index e48d7704d..d37904942 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -9,6 +9,7 @@ use milli::{Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; +mod distinct; mod filters; mod query_criteria; diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 7ab9897b3..486768228 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -11,10 +11,10 @@ const ALLOW_OPTIONAL_WORDS: bool = true; const DISALLOW_OPTIONAL_WORDS: bool = false; macro_rules! 
test_criterion { - ($func:ident, $optional_word:ident, $authorize_typos:ident , $criterion:expr) => { + ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr) => { #[test] fn $func() { - let criteria = $criterion; + let criteria = $criteria; let index = search::setup_search_index_with_criteria(&criteria); let mut rtxn = index.read_txn().unwrap(); From ccd6f137934c3f60e702a1f4618aa46bac381724 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 17 Jun 2021 15:01:20 +0200 Subject: [PATCH 0812/1889] Update version to the next release (0.4.1) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dae36eb0b..c6cc7b477 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -900,7 +900,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.4.0" +version = "0.4.1" dependencies = [ "anyhow", "byte-unit", @@ -954,7 +954,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.4.0" +version = "0.4.1" dependencies = [ "anyhow", "askama", @@ -1096,7 +1096,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.4.0" +version = "0.4.1" dependencies = [ "anyhow", "byte-unit", @@ -1375,7 +1375,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.4.0" +version = "0.4.1" dependencies = [ "big_s", "bstr", @@ -2230,7 +2230,7 @@ dependencies = [ [[package]] name = "search" -version = "0.4.0" +version = "0.4.1" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 4aa208d5e..ffe881da1 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.4.0" +version = "0.4.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 29c6d2b63..3ae49f01e 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.4.0" +version = "0.4.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 1f1affa2f..2e6690341 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.4.0" +version = "0.4.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7fb14a287..27cea3236 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.4.0" +version = "0.4.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index ebf9b491b..d54def886 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.4.0" +version = "0.4.1" authors = ["Clément Renault "] edition = "2018" From 969adaefdf743a43920132cf1e89897916caaf27 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Jun 2021 15:16:20 +0200 Subject: [PATCH 0813/1889] rename fields_distribution in field_distribution --- milli/src/index.rs | 22 +++++++++---------- milli/src/lib.rs | 4 +++- milli/src/update/clear_documents.rs | 4 ++-- milli/src/update/delete_documents.rs | 8 +++---- milli/src/update/index_documents/mod.rs | 6 ++--- milli/src/update/index_documents/transform.rs | 14 ++++++------ 6 files changed, 30 insertions(+), 28 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 
a6c09f3d3..cba9b134f 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -26,7 +26,7 @@ pub mod main_key { pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; - pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; + pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; @@ -290,28 +290,28 @@ impl Index { .unwrap_or_default()) } - /* fields distribution */ + /* field distribution */ - /// Writes the fields distribution which associates every field name with + /// Writes the field distribution which associates every field name with /// the number of times it occurs in the documents. - pub(crate) fn put_fields_distribution( + pub(crate) fn put_field_distribution( &self, wtxn: &mut RwTxn, distribution: &FieldsDistribution, ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<FieldsDistribution>>( wtxn, - main_key::FIELDS_DISTRIBUTION_KEY, + main_key::FIELD_DISTRIBUTION_KEY, distribution, ) } - /// Returns the fields distribution which associates every field name with + /// Returns the field distribution which associates every field name with /// the number of times it occurs in the documents. - pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> { + pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> { Ok(self .main - .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)? + .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)? .unwrap_or_default()) } @@ -823,7 +823,7 @@ pub(crate) mod tests { } #[test] - fn initial_fields_distribution() { + fn initial_field_distribution() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -842,9 +842,9 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); - let fields_distribution = index.fields_distribution(&rtxn).unwrap(); + let field_distribution = index.field_distribution(&rtxn).unwrap(); assert_eq!( - fields_distribution, + field_distribution, hashmap!
{ "id".to_string() => 2, "name".to_string() => 2, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 7571d8a53..a92e87e05 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,9 @@ use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; pub use self::criterion::{default_criteria, Criterion}; -pub use self::error::{Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError}; +pub use self::error::{ + Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, +}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 42dd55443..dbb932bfe 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -47,7 +47,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; - self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?; + self.index.put_field_distribution(self.wtxn, &FieldsDistribution::default())?; // We clean all the faceted documents ids. let empty = RoaringBitmap::default(); @@ -113,7 +113,7 @@ mod tests { assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); assert!(index.documents_ids(&rtxn).unwrap().is_empty()); - assert!(index.fields_distribution(&rtxn).unwrap().is_empty()); + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index dfb48dc58..4276de672 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -147,7 +147,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } - let mut fields_distribution = self.index.fields_distribution(self.wtxn)?; + let mut field_distribution = self.index.field_distribution(self.wtxn)?; // We use pre-calculated number of fields occurrences that needs to be deleted // to reflect deleted documents. @@ -155,7 +155,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // Otherwise, insert new number of occurrences (current_count - count_diff). for (field_id, count_diff) in fields_ids_distribution_diff { let field_name = fields_ids_map.name(field_id).unwrap(); - if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { + if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) { match entry.get().checked_sub(count_diff) { Some(0) | None => entry.remove(), Some(count) => entry.insert(count), @@ -163,7 +163,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } - self.index.put_fields_distribution(self.wtxn, &fields_distribution)?; + self.index.put_field_distribution(self.wtxn, &field_distribution)?; // We create the FST map of the external ids that we must delete. 
external_ids.sort_unstable(); @@ -479,7 +479,7 @@ mod tests { let rtxn = index.read_txn().unwrap(); - assert!(index.fields_distribution(&rtxn).unwrap().is_empty()); + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); } #[test] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 05242f540..a25b0f3a7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -378,7 +378,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let TransformOutput { primary_key, fields_ids_map, - fields_distribution, + field_distribution, external_documents_ids, new_documents_ids, replaced_documents_ids, @@ -594,8 +594,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the fields ids map into the main database self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; - // We write the fields distribution into the main database - self.index.put_fields_distribution(self.wtxn, &fields_distribution)?; + // We write the field distribution into the main database + self.index.put_field_distribution(self.wtxn, &field_distribution)?; // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 756ff492e..0ff068ebb 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -25,7 +25,7 @@ const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, - pub fields_distribution: FieldsDistribution, + pub field_distribution: FieldsDistribution, pub external_documents_ids: ExternalDocumentsIds<'static>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, @@ -127,7 +127,7 @@ impl Transform<'_, '_> { return Ok(TransformOutput { primary_key, fields_ids_map, - fields_distribution: self.index.fields_distribution(self.rtxn)?, + field_distribution: self.index.field_distribution(self.rtxn)?, external_documents_ids: ExternalDocumentsIds::default(), new_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(), @@ -385,7 +385,7 @@ impl Transform<'_, '_> { Error: From, { let documents_ids = self.index.documents_ids(self.rtxn)?; - let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; + let mut field_distribution = self.index.field_distribution(self.rtxn)?; let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); // Once we have sort and deduplicated the documents we write them into a final file. 
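// An illustrative aside (not part of the patch): the bookkeeping this series
// keeps renaming and refining. The distribution counts, per field name, how
// many times the field occurs across documents. Indexing a document increments
// the counter of every field present in its obkv (see the hunk just below),
// while deleting or replacing a document decrements it, dropping entries that
// reach zero. A self-contained sketch of those two moves, using the
// HashMap<String, u64> alias in effect at this point in the series (a later
// patch converts it to a BTreeMap):
use std::collections::hash_map::Entry;
use std::collections::HashMap;

fn count_fields(field_distribution: &mut HashMap<String, u64>, field_names: &[String]) {
    for field_name in field_names {
        // every occurrence of the field in an indexed document adds one
        *field_distribution.entry(field_name.clone()).or_default() += 1;
    }
}

fn uncount_fields(field_distribution: &mut HashMap<String, u64>, field_names: &[String]) {
    for field_name in field_names {
        if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.clone()) {
            match entry.get().checked_sub(1) {
                // the field no longer appears anywhere: forget it entirely
                Some(0) | None => entry.remove(),
                Some(count) => entry.insert(count),
            };
        }
    }
}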
@@ -455,7 +455,7 @@ impl Transform<'_, '_> { let reader = obkv::KvReader::new(obkv); for (field_id, _) in reader.iter() { let field_name = fields_ids_map.name(field_id).unwrap(); - *fields_distribution.entry(field_name.to_string()).or_default() += 1; + *field_distribution.entry(field_name.to_string()).or_default() += 1; } } @@ -485,7 +485,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map, - fields_distribution, + field_distribution, external_documents_ids: external_documents_ids.into_static(), new_documents_ids, replaced_documents_ids, @@ -503,7 +503,7 @@ impl Transform<'_, '_> { old_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, ) -> Result<TransformOutput> { - let fields_distribution = self.index.fields_distribution(self.rtxn)?; + let field_distribution = self.index.field_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_count = documents_ids.len() as usize; @@ -540,7 +540,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, - fields_distribution, + field_distribution, external_documents_ids: external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), From 6cb1102bdb6c1508bee51b0fc78d2038701ccee4 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 17 Jun 2021 15:19:03 +0200 Subject: [PATCH 0814/1889] Fix PR comments --- milli/tests/search/distinct.rs | 42 +++++------ milli/tests/search/filters.rs | 80 ++++++++++----------- milli/tests/search/mod.rs | 3 - milli/tests/search/query_criteria.rs | 103 +++++++++++++++++---------- 4 files changed, 121 insertions(+), 107 deletions(-) diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index 1204a6bfb..ef5af3272 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -21,9 +21,9 @@ macro_rules! test_distinct { builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - let mut rtxn = index.read_txn().unwrap(); + let rtxn = index.read_txn().unwrap(); - let mut search = Search::new(&mut rtxn, &index); + let mut search = Search::new(&rtxn, &index); search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); @@ -50,27 +50,23 @@ macro_rules!
test_distinct { }; } -#[rustfmt::skip] -test_distinct!(distinct_string_default_criteria, tag, vec![Words, Typo, Proximity, Attribute, Exactness]); -#[rustfmt::skip] -test_distinct!(distinct_number_default_criteria, asc_desc_rank, vec![Words, Typo, Proximity, Attribute, Exactness]); -#[rustfmt::skip] -test_distinct!(distinct_string_criterion_words, tag, vec![Words]); -#[rustfmt::skip] -test_distinct!(distinct_number_criterion_words, asc_desc_rank, vec![Words]); -#[rustfmt::skip] -test_distinct!(distinct_string_criterion_words_typo, tag, vec![Words, Typo]); -#[rustfmt::skip] -test_distinct!(distinct_number_criterion_words_typo, asc_desc_rank, vec![Words, Typo]); -#[rustfmt::skip] -test_distinct!(distinct_string_criterion_words_proximity, tag, vec![Words, Proximity]); -#[rustfmt::skip] +test_distinct!( + distinct_string_default_criteria, + tag, + vec![Words, Typo, Proximity, Attribute, Exactness] +); +test_distinct!( + distinct_number_default_criteria, + asc_desc_rank, + vec![Words, Typo, Proximity, Attribute, Exactness] +); +test_distinct!(distinct_string_criterion_words, tag, vec![Words]); +test_distinct!(distinct_number_criterion_words, asc_desc_rank, vec![Words]); +test_distinct!(distinct_string_criterion_words_typo, tag, vec![Words, Typo]); +test_distinct!(distinct_number_criterion_words_typo, asc_desc_rank, vec![Words, Typo]); +test_distinct!(distinct_string_criterion_words_proximity, tag, vec![Words, Proximity]); test_distinct!(distinct_number_criterion_words_proximity, asc_desc_rank, vec![Words, Proximity]); -#[rustfmt::skip] -test_distinct!(distinct_string_criterion_words_attribute, tag, vec![Words, Attribute]); -#[rustfmt::skip] +test_distinct!(distinct_string_criterion_words_attribute, tag, vec![Words, Attribute]); test_distinct!(distinct_number_criterion_words_attribute, asc_desc_rank, vec![Words, Attribute]); -#[rustfmt::skip] -test_distinct!(distinct_string_criterion_words_exactness, tag, vec![Words, Exactness]); -#[rustfmt::skip] +test_distinct!(distinct_string_criterion_words_exactness, tag, vec![Words, Exactness]); test_distinct!(distinct_number_criterion_words_exactness, asc_desc_rank, vec![Words, Exactness]); diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 790bd4a58..318197ea3 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -10,7 +10,7 @@ macro_rules! test_filter { fn $func() { let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); + let rtxn = index.read_txn().unwrap(); let filter_conditions = FilterCondition::from_array::, &str>>, _, _, _>( @@ -19,7 +19,7 @@ macro_rules! test_filter { .unwrap() .unwrap(); - let mut search = Search::new(&mut rtxn, &index); + let mut search = Search::new(&rtxn, &index); search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); @@ -40,45 +40,39 @@ macro_rules! 
test_filter { }; } -#[rustfmt::skip] -test_filter!(eq_simple_string_filter, vec![Right("tag=red")]); -#[rustfmt::skip] -test_filter!(eq_simple_number_filter, vec![Right("asc_desc_rank=1")]); -#[rustfmt::skip] +test_filter!(eq_simple_string_filter, vec![Right("tag=red")]); +test_filter!(eq_simple_number_filter, vec![Right("asc_desc_rank=1")]); test_filter!(eq_string_and_filter_return_empty, vec![Right("tag=red"), Right("tag=green")]); -#[rustfmt::skip] -test_filter!(eq_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank=1")]); -#[rustfmt::skip] -test_filter!(eq_string_or_filter, vec![Left(vec!["tag=red", "tag=green"])]); -#[rustfmt::skip] -test_filter!(eq_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank=1"])]); -#[rustfmt::skip] -test_filter!(eq_number_or_filter, vec![Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); -#[rustfmt::skip] -test_filter!(eq_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank=3")]); -#[rustfmt::skip] -test_filter!(eq_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); -#[rustfmt::skip] -test_filter!(greater_simple_number_filter, vec![Right("asc_desc_rank>1")]); -#[rustfmt::skip] -test_filter!(greater_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank>1")]); -#[rustfmt::skip] -test_filter!(greater_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank>1"])]); -#[rustfmt::skip] -test_filter!(greater_number_or_filter, vec![Left(vec!["asc_desc_rank>3", "asc_desc_rank>1"])]); -#[rustfmt::skip] -test_filter!(greater_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank>3")]); -#[rustfmt::skip] -test_filter!(greater_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank>3", "asc_desc_rank>1"])]); -#[rustfmt::skip] -test_filter!(lower_simple_number_filter, vec![Right("asc_desc_rank<1")]); -#[rustfmt::skip] -test_filter!(lower_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank<1")]); -#[rustfmt::skip] -test_filter!(lower_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank<1"])]); -#[rustfmt::skip] -test_filter!(lower_number_or_filter, vec![Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])]); -#[rustfmt::skip] -test_filter!(lower_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank<3")]); -#[rustfmt::skip] -test_filter!(lower_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])]); +test_filter!(eq_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank=1")]); +test_filter!(eq_string_or_filter, vec![Left(vec!["tag=red", "tag=green"])]); +test_filter!(eq_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank=1"])]); +test_filter!(eq_number_or_filter, vec![Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); +test_filter!(eq_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank=3")]); +test_filter!( + eq_complex_filter_2, + vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])] +); +test_filter!(greater_simple_number_filter, vec![Right("asc_desc_rank>1")]); +test_filter!(greater_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank>1")]); +test_filter!(greater_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank>1"])]); +test_filter!(greater_number_or_filter, vec![Left(vec!["asc_desc_rank>3", "asc_desc_rank>1"])]); +test_filter!( + greater_complex_filter, + vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank>3")] +); +test_filter!( + 
greater_complex_filter_2, + vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank>3", "asc_desc_rank>1"])] +); +test_filter!(lower_simple_number_filter, vec![Right("asc_desc_rank<1")]); +test_filter!(lower_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank<1")]); +test_filter!(lower_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank<1"])]); +test_filter!(lower_number_or_filter, vec![Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])]); +test_filter!( + lower_complex_filter, + vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank<3")] +); +test_filter!( + lower_complex_filter_2, + vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] +); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index d37904942..c5724a921 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -128,7 +128,6 @@ pub fn expected_order( fn execute_filter(filter: &str, document: &TestDocument) -> Option { let mut id = None; if let Some((field, filter)) = filter.split_once("=") { - println!("eq on field {} with filter {}", field, filter); if field == "tag" && document.tag == filter { id = Some(document.id.clone()) } else if field == "asc_desc_rank" @@ -137,12 +136,10 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { id = Some(document.id.clone()) } } else if let Some(("asc_desc_rank", filter)) = filter.split_once("<") { - println!("lower on field asc_desc_rank with filter {}", filter); if document.asc_desc_rank < filter.parse().unwrap() { id = Some(document.id.clone()) } } else if let Some(("asc_desc_rank", filter)) = filter.split_once(">") { - println!("higher on field asc_desc_rank with filter {}", filter); if document.asc_desc_rank > filter.parse().unwrap() { id = Some(document.id.clone()) } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 486768228..f814508f5 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -16,9 +16,9 @@ macro_rules! test_criterion { fn $func() { let criteria = $criteria; let index = search::setup_search_index_with_criteria(&criteria); - let mut rtxn = index.read_txn().unwrap(); + let rtxn = index.read_txn().unwrap(); - let mut search = Search::new(&mut rtxn, &index); + let mut search = Search::new(&rtxn, &index); search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos($authorize_typos); @@ -37,42 +37,69 @@ macro_rules! 
test_criterion { }; } -#[rustfmt::skip] -test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![]); -#[rustfmt::skip] -test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![]); -#[rustfmt::skip] -test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words]); -#[rustfmt::skip] -test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Attribute]); -#[rustfmt::skip] -test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Attribute]); -#[rustfmt::skip] -test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Exactness]); -#[rustfmt::skip] -test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Exactness]); -#[rustfmt::skip] -test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Proximity]); -#[rustfmt::skip] -test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Proximity]); -#[rustfmt::skip] -test_criterion!(asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Asc(S("asc_desc_rank"))]); -#[rustfmt::skip] -test_criterion!(asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Asc(S("asc_desc_rank"))]); -#[rustfmt::skip] -test_criterion!(desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Desc(S("asc_desc_rank"))]); -#[rustfmt::skip] -test_criterion!(desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Desc(S("asc_desc_rank"))]); -#[rustfmt::skip] -test_criterion!(asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Asc(S("unexisting_field"))]); -#[rustfmt::skip] -test_criterion!(asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Asc(S("unexisting_field"))]); -#[rustfmt::skip] -test_criterion!(desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Desc(S("unexisting_field"))]); -#[rustfmt::skip] -test_criterion!(desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Desc(S("unexisting_field"))]); -#[rustfmt::skip] -test_criterion!(default_criteria_order, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words, Typo, Proximity, Attribute, Exactness]); +test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![]); +test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![]); +test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words]); +test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Attribute]); +test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Attribute]); +test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Exactness]); +test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Exactness]); +test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Proximity]); +test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Proximity]); +test_criterion!( + asc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Asc(S("asc_desc_rank"))] +); +test_criterion!( + asc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Asc(S("asc_desc_rank"))] +); +test_criterion!( + desc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Desc(S("asc_desc_rank"))] +); +test_criterion!( + desc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Desc(S("asc_desc_rank"))] 
+); +test_criterion!( + asc_unexisting_field_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Asc(S("unexisting_field"))] +); +test_criterion!( + asc_unexisting_field_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Asc(S("unexisting_field"))] +); +test_criterion!( + desc_unexisting_field_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Desc(S("unexisting_field"))] +); +test_criterion!( + desc_unexisting_field_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Desc(S("unexisting_field"))] +); +test_criterion!( + default_criteria_order, + ALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Words, Typo, Proximity, Attribute, Exactness] +); #[test] fn criteria_mixup() { From d08cfda7968e443117d3a4f9dbf169be7e0e51e0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Jun 2021 17:05:34 +0200 Subject: [PATCH 0815/1889] convert the field_distribution to a BTreeMap and avoid counting the same documents twice --- milli/src/index.rs | 47 ++++++++++++++++++- milli/src/lib.rs | 4 +- milli/src/update/delete_documents.rs | 2 +- milli/src/update/index_documents/transform.rs | 29 +++++++++--- 4 files changed, 70 insertions(+), 12 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index cba9b134f..2faf8d1f8 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -791,7 +791,7 @@ pub(crate) mod tests { use std::ops::Deref; use heed::EnvOpenOptions; - use maplit::hashmap; + use maplit::btreemap; use tempfile::TempDir; use crate::update::{IndexDocuments, UpdateFormat}; @@ -845,11 +845,54 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); let field_distribution = index.field_distribution(&rtxn).unwrap(); assert_eq!( field_distribution, - hashmap! { + btreemap!
{ + "id".to_string() => 2, + "name".to_string() => 2, + "has_dog".to_string() => 1, + } + ); } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a92e87e05..e88ac62d5 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -14,7 +14,7 @@ pub mod tree_level; pub mod update; use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::hash::BuildHasherDefault; use std::result::Result as StdResult; @@ -50,7 +50,7 @@ pub type Attribute = u32; pub type DocumentId = u32; pub type FieldId = u8; pub type Position = u32; -pub type FieldsDistribution = HashMap; +pub type FieldsDistribution = BTreeMap; type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 4276de672..e291eb106 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,4 +1,4 @@ -use std::collections::hash_map::Entry; +use std::collections::btree_map::Entry; use std::collections::HashMap; use chrono::Utc; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 0ff068ebb..074d281ba 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::btree_map::Entry; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::iter::Peekable; @@ -419,18 +420,32 @@ impl Transform<'_, '_> { // we use it and insert it in the list of replaced documents. replaced_documents_ids.insert(docid); + let key = BEU32::new(docid); + let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + }, + )?; + + // we remove all the fields that were already counted + for (field_id, _) in base_obkv.iter() { + let field_name = fields_ids_map.name(field_id).unwrap(); + if let Entry::Occupied(mut entry) = + field_distribution.entry(field_name.to_string()) + { + match entry.get().checked_sub(1) { + Some(0) | None => entry.remove(), + Some(count) => entry.insert(count), + }; + } + } + // Depending on the update indexing method we will merge // the document update with the current document or not. 
match self.index_documents_method { IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), IndexDocumentsMethod::UpdateDocuments => { - let key = BEU32::new(docid); - let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( - InternalError::DatabaseMissingEntry { - db_name: db_name::DOCUMENTS, - key: None, - }, - )?; let update_obkv = obkv::KvReader::new(update_obkv); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); (docid, obkv_buffer.as_slice()) From 35fcc351a0bda692812278d00ac2196a1e1738d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Sun, 20 Jun 2021 17:37:24 +0200 Subject: [PATCH 0816/1889] Update version for the next release (v0.4.2) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c6cc7b477..94940fd1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -900,7 +900,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anyhow", "byte-unit", @@ -954,7 +954,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anyhow", "askama", @@ -1096,7 +1096,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anyhow", "byte-unit", @@ -1375,7 +1375,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.4.1" +version = "0.4.2" dependencies = [ "big_s", "bstr", @@ -2230,7 +2230,7 @@ dependencies = [ [[package]] name = "search" -version = "0.4.1" +version = "0.4.2" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index ffe881da1..9ac26e5fa 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.4.1" +version = "0.4.2" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 3ae49f01e..199e0723f 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.4.1" +version = "0.4.2" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 2e6690341..174b1149b 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.4.1" +version = "0.4.2" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 27cea3236..dcca5b902 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.4.1" +version = "0.4.2" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index d54def886..b511c8ddc 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.4.1" +version = "0.4.2" authors = ["Clément Renault "] edition = "2018" From daef43f504ccbe1efa3f03f6f2480aa0bf39fbcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 21 Jun 2021 15:57:41 +0200 Subject: [PATCH 0817/1889] Rename FieldsDistribution into FieldDistribution --- milli/src/index.rs | 10 +++++----- milli/src/lib.rs | 2 +- milli/src/update/clear_documents.rs | 4 ++-- milli/src/update/index_documents/transform.rs | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git 
a/milli/src/index.rs b/milli/src/index.rs index 2faf8d1f8..8982154e5 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -16,7 +16,7 @@ use crate::heed_codec::facet::{ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldId, FieldIdWordCountCodec, - FieldsDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, + FieldDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, StrLevelPositionCodec, StrStrU8Codec, BEU32, }; @@ -297,9 +297,9 @@ impl Index { pub(crate) fn put_field_distribution( &self, wtxn: &mut RwTxn, - distribution: &FieldsDistribution, + distribution: &FieldDistribution, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>( + self.main.put::<_, Str, SerdeJson>( wtxn, main_key::FIELD_DISTRIBUTION_KEY, distribution, @@ -308,10 +308,10 @@ impl Index { /// Returns the field distribution which associates every field name with /// the number of times it occurs in the documents. - pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result { + pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result { Ok(self .main - .get::<_, Str, SerdeJson>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)? + .get::<_, Str, SerdeJson>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)? .unwrap_or_default()) } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index e88ac62d5..ec9bc32c6 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -50,7 +50,7 @@ pub type Attribute = u32; pub type DocumentId = u32; pub type FieldId = u8; pub type Position = u32; -pub type FieldsDistribution = BTreeMap; +pub type FieldDistribution = BTreeMap; type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index dbb932bfe..789970a8e 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,7 @@ use chrono::Utc; use roaring::RoaringBitmap; -use crate::{ExternalDocumentsIds, FieldsDistribution, Index, Result}; +use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -47,7 +47,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; - self.index.put_field_distribution(self.wtxn, &FieldsDistribution::default())?; + self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; // We clean all the faceted documents ids. 
let empty = RoaringBitmap::default(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 074d281ba..d4a730fcc 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -18,7 +18,7 @@ use crate::index::db_name; use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ - ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32, + ExternalDocumentsIds, FieldId, FieldDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32, }; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -26,7 +26,7 @@ const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, - pub field_distribution: FieldsDistribution, + pub field_distribution: FieldDistribution, pub external_documents_ids: ExternalDocumentsIds<'static>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, From 320670f8fe1ecc34d2c1600e4fc1ef97b1bbcde4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 21 Jun 2021 15:59:17 +0200 Subject: [PATCH 0818/1889] Update version for the next release (v0.5.0) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 94940fd1b..43371f72c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -900,7 +900,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.4.2" +version = "0.5.0" dependencies = [ "anyhow", "byte-unit", @@ -954,7 +954,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.4.2" +version = "0.5.0" dependencies = [ "anyhow", "askama", @@ -1096,7 +1096,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.4.2" +version = "0.5.0" dependencies = [ "anyhow", "byte-unit", @@ -1375,7 +1375,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.4.2" +version = "0.5.0" dependencies = [ "big_s", "bstr", @@ -2230,7 +2230,7 @@ dependencies = [ [[package]] name = "search" -version = "0.4.2" +version = "0.5.0" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 9ac26e5fa..73c776db0 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.4.2" +version = "0.5.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 199e0723f..180d1ff29 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.4.2" +version = "0.5.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 174b1149b..d526afa1f 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.4.2" +version = "0.5.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index dcca5b902..3c4e528ee 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.4.2" +version = "0.5.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index b511c8ddc..b9e9731e6 
100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.4.2" +version = "0.5.0" authors = ["Clément Renault "] edition = "2018" From 481b0bf277f8a77eda23fde224d428fe885130f0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 22 Jun 2021 10:57:46 +0200 Subject: [PATCH 0819/1889] Warn for when a facet key is too large for LMDB --- milli/src/update/index_documents/store.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 7318c5bd0..f6f1756dd 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -11,7 +11,7 @@ use fst::Set; use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; -use log::{debug, info}; +use log::{debug, info, warn}; use meilisearch_tokenizer::token::SeparatorKind; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; use ordered_float::OrderedFloat; @@ -517,6 +517,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { if lmdb_key_valid_size(&key_buffer) { sorter.insert(&key_buffer, &data_buffer)?; + } else { + warn!("facet value {:?} is too large to be saved", value); } } @@ -582,6 +584,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { if lmdb_key_valid_size(&buffer) { sorter.insert(&buffer, &[])?; + } else { + warn!("facet value {:?} is too large to be saved", value); } Ok(()) From 0cca2ea24fccb701d8897e7430f20f455984cac9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 22 Jun 2021 11:22:33 +0200 Subject: [PATCH 0820/1889] Return a MissingDocumentId when a document doesn't have one --- milli/src/update/index_documents/transform.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index d4a730fcc..24ab276d0 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -18,7 +18,7 @@ use crate::index::db_name; use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ - ExternalDocumentsIds, FieldId, FieldDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32, + ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, MergeFn, Result, BEU32, }; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -190,7 +190,7 @@ impl Transform<'_, '_> { }, None => { if !self.autogenerate_docids { - return Err(UserError::MissingPrimaryKey.into()); + return Err(UserError::MissingDocumentId { document }.into()); } let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); Cow::Borrowed(uuid) From aecbd1476138fbf18e072b911509c294d96434d7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 22 Jun 2021 11:31:58 +0200 Subject: [PATCH 0821/1889] Improve the error message for InvalidDocumentId --- milli/src/error.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 31012c690..d927407f0 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -206,7 +206,13 @@ impl fmt::Display for UserError { Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); - write!(f, "document identifier is invalid {}", json) + write!( + f, + 
"document identifier is invalid {}, \ +a document id can be of type integer or string \ +only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)", + json + ) } Self::InvalidFilterAttribute(error) => error.fmt(f), Self::MissingDocumentId { document } => { From 51dbb2e06d5e2fba4a91b36d8f8ac64b877d0ad1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 22 Jun 2021 11:51:36 +0200 Subject: [PATCH 0822/1889] Warn for when a key is too large for LMDB --- milli/src/update/index_documents/store.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index f6f1756dd..766ed82b2 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -426,6 +426,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { sorter.insert(&key, &buffer)?; + } else { + warn!( + "words pairs proximity ({:?} - {:?}, {:?}) is too large to be saved", + w1, w2, min_prox + ); } } @@ -457,6 +462,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { writer.insert(&key, &buffer)?; + } else { + warn!("word {:?} is too large to be saved", word); } } @@ -493,6 +500,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // that we write under the generated key into MTBL if lmdb_key_valid_size(&key_buffer) { writer.insert(&key_buffer, &data_buffer)?; + } else { + warn!("word {:?} is too large to be saved", word); } } } @@ -610,6 +619,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // that we write under the generated key into MTBL if lmdb_key_valid_size(&key) { sorter.insert(&key, &buffer)?; + } else { + warn!("word {:?} is too large to be saved", word); } } From d53df8a002fe2a7eda7a6ac5c36276da0d0d9292 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 22 Jun 2021 14:04:16 +0200 Subject: [PATCH 0823/1889] enable the jemallocator dependencies only when we are running on linux --- helpers/Cargo.toml | 4 +++- infos/Cargo.toml | 4 +++- search/Cargo.toml | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 73c776db0..3a778cdab 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -8,7 +8,9 @@ edition = "2018" anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } heed = "0.10.6" -jemallocator = "0.3.2" milli = { path = "../milli" } stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } + +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = "0.3.2" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index d526afa1f..8043b9fe8 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -9,9 +9,11 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } csv = "1.1.5" heed = "0.10.6" -jemallocator = "0.3.2" milli = { path = "../milli" } roaring = "0.6.6" serde_json = "1.0.62" stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } + +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = "0.3.2" diff --git a/search/Cargo.toml b/search/Cargo.toml index b9e9731e6..364900b05 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -8,9 +8,11 @@ edition = "2018" anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } heed = "0.10.6" -jemallocator = "0.3.2" log = "0.4.14" milli = { path = 
"../milli" } serde_json = "1.0.62" stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } + +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = "0.3.2" From 77eb37934f81d8852f53af8a67fda14387b35fd8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 22 Jun 2021 14:17:56 +0200 Subject: [PATCH 0824/1889] add jemalloc to http-ui and the benchmarks --- Cargo.lock | 2 ++ benchmarks/Cargo.toml | 3 +++ benchmarks/benches/songs.rs | 4 ++++ benchmarks/benches/wiki.rs | 4 ++++ http-ui/Cargo.toml | 3 +++ http-ui/src/main.rs | 4 ++++ 6 files changed, 20 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 43371f72c..428aae05f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -132,6 +132,7 @@ dependencies = [ "criterion", "flate2", "heed", + "jemallocator", "milli", "reqwest", ] @@ -969,6 +970,7 @@ dependencies = [ "futures", "grenad", "heed", + "jemallocator", "log", "maplit", "meilisearch-tokenizer 0.2.3", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index ed366022c..d211b0b6e 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -8,6 +8,9 @@ publish = false [dependencies] milli = { path = "../milli" } +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = "0.3.2" + [dev-dependencies] heed = "*" # we want to use the version milli uses criterion = { version = "0.3.4", features = ["html_reports"] } diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/songs.rs index 726190f77..726040692 100644 --- a/benchmarks/benches/songs.rs +++ b/benchmarks/benches/songs.rs @@ -5,6 +5,10 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + fn base_conf(builder: &mut Settings) { let displayed_fields = ["id", "title", "album", "artist", "genre", "country", "released", "duration"] diff --git a/benchmarks/benches/wiki.rs b/benchmarks/benches/wiki.rs index 3d8b6f1d4..9ef75efeb 100644 --- a/benchmarks/benches/wiki.rs +++ b/benchmarks/benches/wiki.rs @@ -5,6 +5,10 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + fn base_conf(builder: &mut Settings) { let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); builder.set_displayed_fields(displayed_fields); diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 180d1ff29..0319e4e50 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -42,3 +42,6 @@ funty = "=1.1" [dev-dependencies] maplit = "1.0.2" serde_test = "1.0.125" + +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = "0.3.2" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 703861058..6a79e77ca 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -36,6 +36,10 @@ use warp::Filter; use self::update_store::UpdateStore; +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); #[derive(Debug, StructOpt)] From 81643e6d70704d7675f9b50fcc41b764ecf78c15 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 22 Jun 2021 14:47:23 +0200 Subject: [PATCH 0825/1889] add the limit field to http-ui --- http-ui/src/main.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 703861058..8035cf789 100644 --- 
a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -683,6 +683,7 @@ async fn main() -> anyhow::Result<()> { filters: Option, facet_filters: Option, String>>>, facet_distribution: Option, + limit: Option, } #[derive(Debug, Serialize)] @@ -735,6 +736,10 @@ async fn main() -> anyhow::Result<()> { search.filter(condition); } + if let Some(limit) = query.limit { + search.limit(limit); + } + let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); From 3d90b03d7b0ec91db66c179dd82c3acdf487dc8e Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 22 Jun 2021 14:52:13 +0200 Subject: [PATCH 0826/1889] fix the limit There was no check on the limit and thus, if a user specified a very large number this line could cause a panic --- milli/src/search/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f692df173..71d200e0c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -162,7 +162,7 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); let mut excluded_candidates = RoaringBitmap::new(); - let mut documents_ids = Vec::with_capacity(self.limit); + let mut documents_ids = Vec::new(); while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next(&excluded_candidates)? From 8d2a0b43fff87fe5ff5f375f0cb6c5acacf83a98 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 22 Jun 2021 15:36:22 +0200 Subject: [PATCH 0827/1889] run the formatter on the whole project a second time --- milli/src/index.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 8982154e5..c8e5ab089 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -15,8 +15,8 @@ use crate::heed_codec::facet::{ }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, - DocumentId, ExternalDocumentsIds, FacetDistribution, FieldId, FieldIdWordCountCodec, - FieldDistribution, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, + DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, + FieldIdWordCountCodec, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, StrLevelPositionCodec, StrStrU8Codec, BEU32, }; From d8695da1d17032d785f9e43502757dc5aa9aa109 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 21 Jun 2021 19:54:43 +0200 Subject: [PATCH 0828/1889] improve the ci --- .github/workflows/rust.yml | 75 ++++++++++++++++++++++++++++++++++++++ .github/workflows/test.yml | 63 -------------------------------- 2 files changed, 75 insertions(+), 63 deletions(-) create mode 100644 .github/workflows/rust.yml delete mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 000000000..02efb7cd5 --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,75 @@ +name: Rust + +on: + push: + branches: [ staging, trying ] + pull_request: + branches: [ main ] + +env: + CARGO_TERM_COLOR: always + +jobs: + tests: + name: Tests on ${{ matrix.os }} with ${{ matrix.rust }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-18.04, macos-latest] + rust: + - stable + - beta + - nightly + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: ${{ matrix.rust }} + override: true + - name: Run cargo check + uses: actions-rs/cargo@v1 + with: + command: check +
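The panic fixed by patch 0826 above is easy to reproduce in isolation: Vec::with_capacity allocates eagerly, so a limit taken straight from a query becomes a caller-controlled allocation. An illustrative sketch, not milli code:

fn main() {
    // `Vec::with_capacity(usize::MAX)` panics with "capacity overflow" for a
    // non-zero-sized element type, which is what a hostile limit parameter
    // could trigger before the fix.
    let untrusted_limit = usize::MAX;
    let result = std::panic::catch_unwind(|| Vec::<u32>::with_capacity(untrusted_limit));
    assert!(result.is_err());

    // After the fix the vector starts empty and grows only with the results
    // actually collected, so the limit merely caps the loop.
    let mut documents_ids: Vec<u32> = Vec::new();
    for candidate in 0..100 {
        if documents_ids.len() == 20 {
            break;
        }
        documents_ids.push(candidate);
    }
    assert_eq!(documents_ids.len(), 20);
}
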
args: --all --locked + - name: Run cargo test + uses: actions-rs/cargo@v1 + with: + command: test + args: --locked --release + + # We don't run test on Windows since we get the following error: There is not enough space on the disk. + check-on-windows: + name: Cargo check on Windows + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + - name: Run cargo check without any default features + uses: actions-rs/cargo@v1 + with: + command: check + args: --all --locked + - name: Run cargo check with all default features + uses: actions-rs/cargo@v1 + with: + command: check + + fmt: + name: Run Rustfmt + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: nightly + override: true + components: rustfmt + - name: Run cargo fmt + run: | + # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. + # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate + # we are going to create an empty file where rustfmt expect it. + echo -ne "\n" > benchmarks/benches/datasets_paths.rs + cargo fmt --all -- --check diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 4ee17a239..000000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,63 +0,0 @@ -name: Continuous integration - -on: - push: - branches: [ staging, trying ] - pull_request: - branches: [ main ] - -jobs: - ci: - runs-on: ubuntu-latest - strategy: - matrix: - rust: - - stable - - beta - # We temporarily stop building on nightly just to fix this issue - # https://github.com/bheisler/TinyTemplate/pull/17 - # Reenable it when the fix has been merged. - # - nightly - - steps: - - uses: actions/checkout@v2 - - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: ${{ matrix.rust }} - override: true - components: rustfmt, clippy - - - uses: actions-rs/cargo@v1 - with: - command: check - args: --all --locked - - - uses: actions-rs/cargo@v1 - with: - command: build - - - uses: actions-rs/cargo@v1 - with: - command: test - - - uses: actions-rs/cargo@v1 - with: - command: bench - args: --no-run -p benchmarks - - # - uses: actions-rs/cargo@v1 - # with: - # command: bench - # args: --no-run - - # - uses: actions-rs/cargo@v1 - # with: - # command: fmt - # args: -- --check - - # - uses: actions-rs/cargo@v1 - # with: - # command: clippy - # args: --all-targets -- -D warnings From 5099192c4499d57736a650813a17c1bf5474ad47 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 23 Jun 2021 10:21:16 +0200 Subject: [PATCH 0829/1889] update bors.toml --- .github/workflows/rust.yml | 6 +++--- bors.toml | 9 ++++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 02efb7cd5..e87848f94 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -67,9 +67,9 @@ jobs: override: true components: rustfmt - name: Run cargo fmt + # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. + # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate + # we are going to create an empty file where rustfmt expects it. run: | - # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. 
- # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate - # we are going to create an empty file where rustfmt expect it. echo -ne "\n" > benchmarks/benches/datasets_paths.rs cargo fmt --all -- --check diff --git a/bors.toml b/bors.toml index 3fbc6159e..b4deacf9a 100644 --- a/bors.toml +++ b/bors.toml @@ -1,5 +1,12 @@ status = [ - 'ci (stable)' + 'Tests on ubuntu-18.04 with stable', + 'Tests on ubuntu-18.04 with beta', + 'Tests on ubuntu-18.04 with nightly', + 'Tests on macos-latest with stable', + 'Tests on macos-latest with beta', + 'Tests on macos-latest with nightly', + 'Cargo check on Windows', + 'Run Rustfmt', ] # 3 hours timeout timeout-sec = 10800 From aeaac743ff08d9599084c0375f694a842b698f03 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 11:33:30 +0200 Subject: [PATCH 0830/1889] Replace an if let some by a match --- milli/src/search/facet/facet_distribution.rs | 67 ++++++++++---------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 0a2036494..1a66acbfa 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -174,40 +174,41 @@ impl<'a> FacetDistribution<'a> { fn facet_values(&self, field_id: FieldId) -> heed::Result> { use FacetType::{Number, String}; - if let Some(candidates) = self.candidates.as_ref() { - // Classic search, candidates were specified, we must return facet values only related - // to those candidates. We also enter here for facet strings for performance reasons. - let mut distribution = BTreeMap::new(); - if candidates.len() <= CANDIDATES_THRESHOLD { - self.facet_distribution_from_documents( - field_id, - Number, - candidates, - &mut distribution, - )?; - self.facet_distribution_from_documents( - field_id, - String, - candidates, - &mut distribution, - )?; - } else { - self.facet_numbers_distribution_from_facet_levels( - field_id, - candidates, - &mut distribution, - )?; - self.facet_distribution_from_documents( - field_id, - String, - candidates, - &mut distribution, - )?; - } + match self.candidates { + Some(ref candidates) => { + // Classic search, candidates were specified, we must return facet values only related + // to those candidates. We also enter here for facet strings for performance reasons. 
+ let mut distribution = BTreeMap::new(); + if candidates.len() <= CANDIDATES_THRESHOLD { + self.facet_distribution_from_documents( + field_id, + Number, + candidates, + &mut distribution, + )?; + self.facet_distribution_from_documents( + field_id, + String, + candidates, + &mut distribution, + )?; + } else { + self.facet_numbers_distribution_from_facet_levels( + field_id, + candidates, + &mut distribution, + )?; + self.facet_distribution_from_documents( + field_id, + String, + candidates, + &mut distribution, + )?; + } - Ok(distribution) - } else { - self.facet_values_from_raw_facet_database(field_id) + Ok(distribution) + } + None => self.facet_values_from_raw_facet_database(field_id), } } From 2364777838c03cd393d3b407efbef6ecb19d4684 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 11:50:49 +0200 Subject: [PATCH 0831/1889] Return an error for when a field distribution cannot be done --- milli/src/search/facet/facet_distribution.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 1a66acbfa..4d077b8f1 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -215,9 +215,20 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let filterable_fields = self.index.filterable_fields(self.rtxn)?; + let fields = match self.facets { + Some(ref facets) => { + let invalid_fields: HashSet<_> = facets.difference(&filterable_fields).collect(); + if !invalid_fields.is_empty() { + todo!("return an error specifying that these fields are not filterable"); + } else { + facets.clone() + } + } + None => filterable_fields, + }; let mut distribution = BTreeMap::new(); - for name in filterable_fields { + for name in fields { let fid = fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { field_name: name.clone(), From a6218a20ae3eedbdd0927e3dc850cf6f960e6b2c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 13:56:13 +0200 Subject: [PATCH 0832/1889] Introduce a new InvalidFacetsDistribution user error --- milli/src/error.rs | 15 +++++++++++++-- milli/src/search/facet/facet_distribution.rs | 7 +++++-- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index d927407f0..713935869 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::convert::Infallible; use std::error::Error as StdError; use std::{fmt, io, str}; @@ -51,13 +52,14 @@ pub enum FieldIdMapMissingEntry { pub enum UserError { AttributeLimitReached, Csv(csv::Error), - MaxDatabaseSizeReached, DocumentLimitReached, - InvalidFilter(pest::error::Error), InvalidCriterionName { name: String }, InvalidDocumentId { document_id: Value }, + InvalidFacetsDistribution { invalid_facets_name: HashSet }, + InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), InvalidStoreFile, + MaxDatabaseSizeReached, MissingDocumentId { document: Object }, MissingPrimaryKey, NoSpaceLeftOnDevice, @@ -202,6 +204,15 @@ impl fmt::Display for UserError { Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::Csv(error) => error.fmt(f), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), + Self::InvalidFacetsDistribution { invalid_facets_name } => { + let name_list = + 
invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", "); + write!( + f, + "invalid facet distribution, the fields {} are not set as filterable", + name_list + ) + } Self::InvalidFilter(error) => error.fmt(f), Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), Self::InvalidDocumentId { document_id } => { diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 4d077b8f1..71816cf5d 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -6,7 +6,7 @@ use heed::types::{ByteSlice, Unit}; use heed::{BytesDecode, Database}; use roaring::RoaringBitmap; -use crate::error::FieldIdMapMissingEntry; +use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; use crate::heed_codec::facet::FacetValueStringCodec; use crate::search::facet::{FacetIter, FacetRange}; @@ -219,7 +219,10 @@ impl<'a> FacetDistribution<'a> { Some(ref facets) => { let invalid_fields: HashSet<_> = facets.difference(&filterable_fields).collect(); if !invalid_fields.is_empty() { - todo!("return an error specifying that these fields are not filterable"); + return Err(UserError::InvalidFacetsDistribution { + invalid_facets_name: invalid_fields.into_iter().cloned().collect(), + } + .into()); } else { facets.clone() } From 9885fb415963995dd7890ec217926b0cfc6d3afe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 23 Jun 2021 14:05:20 +0200 Subject: [PATCH 0833/1889] Update version for the next release (v0.5.1) --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 428aae05f..ff600110e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -901,7 +901,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.5.0" +version = "0.5.1" dependencies = [ "anyhow", "byte-unit", @@ -955,7 +955,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.5.0" +version = "0.5.1" dependencies = [ "anyhow", "askama", @@ -1098,7 +1098,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.5.0" +version = "0.5.1" dependencies = [ "anyhow", "byte-unit", @@ -1377,7 +1377,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.5.0" +version = "0.5.1" dependencies = [ "big_s", "bstr", @@ -2232,7 +2232,7 @@ dependencies = [ [[package]] name = "search" -version = "0.5.0" +version = "0.5.1" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 3a778cdab..eeabf6971 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.5.0" +version = "0.5.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 0319e4e50..d12b4363d 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.5.0" +version = "0.5.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 8043b9fe8..bb548743c 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.5.0" +version = "0.5.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3c4e528ee..d708442dc 100644 
--- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.5.0" +version = "0.5.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 364900b05..82368af92 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.5.0" +version = "0.5.1" authors = ["Clément Renault "] edition = "2018" From faa3cd3b717a924869509ecfcd94b26f0432e36d Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 23 Jun 2021 14:30:33 +0200 Subject: [PATCH 0834/1889] Update bors.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't check nightly and beta channel Co-authored-by: Clément Renault --- bors.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/bors.toml b/bors.toml index b4deacf9a..bc33262b7 100644 --- a/bors.toml +++ b/bors.toml @@ -1,10 +1,6 @@ status = [ 'Tests on ubuntu-18.04 with stable', - 'Tests on ubuntu-18.04 with beta', - 'Tests on ubuntu-18.04 with nightly', 'Tests on macos-latest with stable', - 'Tests on macos-latest with beta', - 'Tests on macos-latest with nightly', 'Cargo check on Windows', 'Run Rustfmt', ] From c31cadb54f7bbee47b95c2c0204ff449f549ad68 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 17:23:43 +0200 Subject: [PATCH 0835/1889] Do not consider the searchable field as filterable --- milli/src/update/index_documents/store.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 766ed82b2..b187b642c 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -673,15 +673,17 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; - let (facet_numbers, facet_strings) = extract_facet_values(&value); - facet_numbers_values - .entry(attr) - .or_insert_with(Vec::new) - .extend(facet_numbers); - facet_strings_values - .entry(attr) - .or_insert_with(Vec::new) - .extend(facet_strings); + if self.faceted_fields.contains(&attr) { + let (facet_numbers, facet_strings) = extract_facet_values(&value); + facet_numbers_values + .entry(attr) + .or_insert_with(Vec::new) + .extend(facet_numbers); + facet_strings_values + .entry(attr) + .or_insert_with(Vec::new) + .extend(facet_strings); + } if self.searchable_fields.contains(&attr) { let content = match json_to_string(&value) { From 4fc8f06791ae511d49f2819a9ec806a8d76ca30b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 17:25:39 +0200 Subject: [PATCH 0836/1889] Rename faceted_fields into filterable_fields --- milli/src/update/index_documents/store.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index b187b642c..9ac97c255 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -55,7 +55,7 @@ pub struct Readers { pub struct Store<'s, A> { // Indexing parameters searchable_fields: HashSet, - faceted_fields: HashSet, + filterable_fields: HashSet, // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, @@ -90,7 +90,7 @@ pub struct Store<'s, A> { impl<'s, A: AsRef<[u8]>> Store<'s, A> { pub fn new( searchable_fields: HashSet, - faceted_fields: HashSet, + filterable_fields: HashSet, linked_hash_map_size: Option, 
max_nb_chunks: Option, max_memory: Option, @@ -190,7 +190,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(Store { // Indexing parameters. searchable_fields, - faceted_fields, + filterable_fields, // Caches word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), field_id_word_count_docids: HashMap::new(), @@ -668,12 +668,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } for (attr, content) in document.iter() { - if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) + if self.filterable_fields.contains(&attr) + || self.searchable_fields.contains(&attr) { let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; - if self.faceted_fields.contains(&attr) { + if self.filterable_fields.contains(&attr) { let (facet_numbers, facet_strings) = extract_facet_values(&value); facet_numbers_values .entry(attr) From 98285b4b187188bd0186087fbf948f8ba52ef81c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 17:30:26 +0200 Subject: [PATCH 0837/1889] Bump milli to 0.6.0 --- Cargo.lock | 10 +++++----- benchmarks/Cargo.toml | 1 - helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 7 files changed, 10 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ff600110e..73114c7c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -901,7 +901,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.5.1" +version = "0.6.0" dependencies = [ "anyhow", "byte-unit", @@ -955,7 +955,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.5.1" +version = "0.6.0" dependencies = [ "anyhow", "askama", @@ -1098,7 +1098,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.5.1" +version = "0.6.0" dependencies = [ "anyhow", "byte-unit", @@ -1377,7 +1377,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.5.1" +version = "0.6.0" dependencies = [ "big_s", "bstr", @@ -2232,7 +2232,7 @@ dependencies = [ [[package]] name = "search" -version = "0.5.1" +version = "0.6.0" dependencies = [ "anyhow", "byte-unit", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index d211b0b6e..6d5c99950 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -4,7 +4,6 @@ version = "0.1.0" edition = "2018" publish = false - [dependencies] milli = { path = "../milli" } diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index eeabf6971..224b0e5eb 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.5.1" +version = "0.6.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index d12b4363d..5e6ab9d09 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.5.1" +version = "0.6.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index bb548743c..d12036a07 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.5.1" +version = "0.6.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d708442dc..872c339ff 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.5.1" +version = "0.6.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 82368af92..bd1aba2dd 100644 --- 
a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.5.1" +version = "0.6.0" authors = ["Clément Renault "] edition = "2018" From 9e5f9a8a1051b839b54572aac685bbde68538a8c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 18:35:44 +0200 Subject: [PATCH 0838/1889] Add a test for the words level positions generation bug --- milli/src/update/index_documents/mod.rs | 41 +++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a25b0f3a7..316b0eb81 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -836,6 +836,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { #[cfg(test)] mod tests { + use std::io::Cursor; + use heed::EnvOpenOptions; use super::*; @@ -1258,4 +1260,43 @@ mod tests { drop(rtxn); } + + #[test] + fn simple_documents_replace() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // First we send 3 documents with an id for only one of them. + let mut wtxn = index.write_txn().unwrap(); + let documents = &r#"[ + { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5 }, + { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 }, + { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 }, + { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" }, + { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams" } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); builder.execute(Cursor::new(documents), |_, _| ()).unwrap(); wtxn.commit().unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); + builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); + let documents = &r#"[ + { + "id": 2, + "author": "J. Austen", + "date": "1813" + } + ]"#[..]; + + builder.execute(Cursor::new(documents), |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + } } From 0013236e5db378c68d72dedeaf0594e6a851cdd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 28 Jun 2021 16:19:02 +0200 Subject: [PATCH 0839/1889] Fix the LMDB and heed invalid interactions. It is undefined behavior to keep a reference to the database while modifying it; we were keeping references into the database and also feeding the heed put_current methods with keys referenced inside the database itself.
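This is the aliasing rule Rust's borrow checker enforces statically for in-memory collections; LMDB exposes the same hazard through its memory map but cannot check it, hence the key.to_owned() copies this patch adds before every put_current. A minimal sketch of the same copy-before-write discipline, using a plain BTreeMap as a stand-in for heed's cursor API:

use std::collections::BTreeMap;

fn main() {
    let mut db = BTreeMap::from([("word".to_string(), 10u64)]);

    // Copy the key out of the map before writing: keeping a borrow of it
    // across the insert would alias memory the write is allowed to move,
    // which is exactly what `let key = key.to_owned();` prevents above.
    let key = db.keys().next().unwrap().to_owned();
    db.insert(key.clone(), 11);

    assert_eq!(db[&key], 11);
}
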
https://github.com/Kerollmops/heed/pull/108 --- milli/src/update/delete_documents.rs | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e291eb106..30ae55e62 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -197,7 +197,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { iter.del_current()?; *must_remove = true; } else if docids.len() != previous_len { - iter.put_current(key, &docids)?; + let key = key.to_owned(); + iter.put_current(&key, &docids)?; } } } @@ -238,13 +239,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let mut iter = word_prefix_docids.iter_mut(self.wtxn)?; while let Some(result) = iter.next() { let (prefix, mut docids) = result?; + let prefix = prefix.to_owned(); let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { iter.del_current()?; prefixes_to_delete.insert(prefix)?; } else if docids.len() != previous_len { - iter.put_current(prefix, &docids)?; + iter.put_current(&prefix, &docids)?; } } @@ -281,7 +283,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if docids.is_empty() { iter.del_current()?; } else if docids.len() != previous_len { - iter.put_current(key, &docids)?; + let key = key.to_owned(); + iter.put_current(&key, &docids)?; } } @@ -299,7 +302,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if docids.is_empty() { iter.del_current()?; } else if docids.len() != previous_len { - iter.put_current(bytes, &docids)?; + let bytes = bytes.to_owned(); + iter.put_current(&bytes, &docids)?; } } @@ -315,7 +319,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if docids.is_empty() { iter.del_current()?; } else if docids.len() != previous_len { - iter.put_current(bytes, &docids)?; + let bytes = bytes.to_owned(); + iter.put_current(&bytes, &docids)?; } } @@ -331,7 +336,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if docids.is_empty() { iter.del_current()?; } else if docids.len() != previous_len { - iter.put_current(bytes, &docids)?; + let bytes = bytes.to_owned(); + iter.put_current(&bytes, &docids)?; } } @@ -437,7 +443,8 @@ where if docids.is_empty() { iter.del_current()?; } else if docids.len() != previous_len { - iter.put_current(bytes, &docids)?; + let bytes = bytes.to_owned(); + iter.put_current(&bytes, &docids)?; } } From bdc5599b73d44984e8012763ca4a1e1f64d39cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 28 Jun 2021 18:26:20 +0200 Subject: [PATCH 0840/1889] Bump heed to use the git repo with v0.12.0 --- Cargo.lock | 65 +++++++++++++++++++++---- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- milli/src/update/delete_documents.rs | 57 ++++++++++++++-------- milli/src/update/index_documents/mod.rs | 12 +++-- search/Cargo.toml | 2 +- 8 files changed, 106 insertions(+), 38 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 73114c7c1..70c21b542 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -131,7 +131,7 @@ dependencies = [ "convert_case", "criterion", "flate2", - "heed", + "heed 0.10.6", "jemallocator", "milli", "reqwest", @@ -868,10 +868,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "afcc6c911acaadad3ebe9f1ef1707d80bd71c92037566f47b6238a03b60adf1a" dependencies = [ "byteorder", - "heed-traits", - "heed-types", + "heed-traits 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "heed-types 0.7.2 
(registry+https://github.com/rust-lang/crates.io-index)", "libc", - "lmdb-rkv-sys", + "lmdb-rkv-sys 0.11.0", + "once_cell", + "page_size", + "serde", + "synchronoise", + "url", + "zerocopy", +] + +[[package]] +name = "heed" +version = "0.12.0" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" +dependencies = [ + "byteorder", + "heed-traits 0.7.0 (git+https://github.com/Kerollmops/heed?tag=v0.12.0)", + "heed-types 0.7.2 (git+https://github.com/Kerollmops/heed?tag=v0.12.0)", + "libc", + "lmdb-rkv-sys 0.15.0", "once_cell", "page_size", "serde", @@ -886,6 +904,11 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b328f6260a7e51bdb0ca6b68e6ea27ee3d11fba5dee930896ee7ff6ad5fc072c" +[[package]] +name = "heed-traits" +version = "0.7.0" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" + [[package]] name = "heed-types" version = "0.7.2" @@ -893,7 +916,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e628efb08beaee58355f80dc4adba79d644940ea9eef60175ea17dc218aab405" dependencies = [ "bincode", - "heed-traits", + "heed-traits 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde", + "serde_json", + "zerocopy", +] + +[[package]] +name = "heed-types" +version = "0.7.2" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" +dependencies = [ + "bincode", + "heed-traits 0.7.0 (git+https://github.com/Kerollmops/heed?tag=v0.12.0)", "serde", "serde_json", "zerocopy", @@ -905,7 +940,7 @@ version = "0.6.0" dependencies = [ "anyhow", "byte-unit", - "heed", + "heed 0.10.6", "jemallocator", "milli", "stderrlog", @@ -969,7 +1004,7 @@ dependencies = [ "funty", "futures", "grenad", - "heed", + "heed 0.12.0", "jemallocator", "log", "maplit", @@ -1103,7 +1138,7 @@ dependencies = [ "anyhow", "byte-unit", "csv", - "heed", + "heed 0.10.6", "jemallocator", "milli", "roaring", @@ -1275,6 +1310,16 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "lmdb-rkv-sys" +version = "0.15.0" +source = "git+https://github.com/meilisearch/lmdb-rs#d0b50d02938ee84e4e4372697ea991fe2a4cae3b" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "log" version = "0.4.14" @@ -1389,7 +1434,7 @@ dependencies = [ "fst", "fxhash", "grenad", - "heed", + "heed 0.12.0", "human_format", "itertools 0.10.0", "levenshtein_automata", @@ -2236,7 +2281,7 @@ version = "0.6.0" dependencies = [ "anyhow", "byte-unit", - "heed", + "heed 0.10.6", "jemallocator", "log", "milli", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 224b0e5eb..bd509904d 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [dependencies] anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = "0.10.6" +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } milli = { path = "../milli" } stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 5e6ab9d09..e9525dd98 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } -heed = "0.10.6" +heed = { git = "https://github.com/Kerollmops/heed", tag = 
"v0.12.0" } meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } memmap = "0.7.0" milli = { path = "../milli" } diff --git a/infos/Cargo.toml b/infos/Cargo.toml index d12036a07..3c7bef2b8 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } csv = "1.1.5" -heed = "0.10.6" +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } milli = { path = "../milli" } roaring = "0.6.6" serde_json = "1.0.62" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 872c339ff..1e6c2a9a0 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -14,7 +14,7 @@ flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } -heed = { version = "0.10.6", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 30ae55e62..a0c1f48f5 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -132,7 +132,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { }; external_ids.push(external_id); } - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; } drop(iter); @@ -143,7 +144,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let ((_docid, word), _positions) = result?; // This boolean will indicate if we must remove this word from the words FST. words.push((SmallString32::from(word), false)); - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; } } @@ -194,11 +196,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; *must_remove = true; } else if docids.len() != previous_len { let key = key.to_owned(); - iter.put_current(&key, &docids)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; } } } @@ -243,10 +247,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; prefixes_to_delete.insert(prefix)?; } else if docids.len() != previous_len { - iter.put_current(&prefix, &docids)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&prefix, &docids)? }; } } @@ -281,10 +287,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? 
}; } else if docids.len() != previous_len { let key = key.to_owned(); - iter.put_current(&key, &docids)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; } } @@ -300,10 +308,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; } else if docids.len() != previous_len { let bytes = bytes.to_owned(); - iter.put_current(&bytes, &docids)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; } } @@ -317,10 +327,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; } else if docids.len() != previous_len { let bytes = bytes.to_owned(); - iter.put_current(&bytes, &docids)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; } } @@ -334,10 +346,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; } else if docids.len() != previous_len { let bytes = bytes.to_owned(); - iter.put_current(&bytes, &docids)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; } } @@ -349,9 +363,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let previous_len = docids.len(); docids.difference_with(&self.documents_ids); if docids.is_empty() { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; } else if docids.len() != previous_len { - iter.put_current(&key, &docids)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; } } @@ -420,7 +436,8 @@ where while let Some(result) = iter.next() { let (key, ()) = result?; if to_remove.contains(convert(key)) { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; } } @@ -441,10 +458,12 @@ where let previous_len = docids.len(); docids.difference_with(to_remove); if docids.is_empty() { - iter.del_current()?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; } else if docids.len() != previous_len { let bytes = bytes.to_owned(); - iter.put_current(&bytes, &docids)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 316b0eb81..7faa27588 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -144,7 +144,8 @@ where WriteMethod::Append => { let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; while let Some((k, v)) = reader.next()? { - out_iter.append(k, v)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? 
}; } } WriteMethod::GetMergePut => { @@ -154,7 +155,8 @@ where Some((key, old_val)) if key == k => { let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; let val = merge(k, &vals)?; - iter.put_current(k, &val)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(k, &val)? }; } _ => { drop(iter); @@ -203,7 +205,8 @@ where WriteMethod::Append => { let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; while let Some((k, v)) = sorter.next()? { - out_iter.append(k, v)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; } } WriteMethod::GetMergePut => { @@ -216,7 +219,8 @@ where // TODO just wrap this error? InternalError::IndexingMergingKeys { process: "get-put-merge" } })?; - iter.put_current(k, &val)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(k, &val)? }; } _ => { drop(iter); diff --git a/search/Cargo.toml b/search/Cargo.toml index bd1aba2dd..16c82a93f 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [dependencies] anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = "0.10.6" +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } log = "0.4.14" milli = { path = "../milli" } serde_json = "1.0.62" From 80c6aaf1fd5492f2f3147deecfad64964c1f1dc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 28 Jun 2021 17:45:43 +0200 Subject: [PATCH 0841/1889] Bump milli to 0.7.0 --- Cargo.lock | 617 +++++++++++++++++++++++---------------------- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 322 insertions(+), 305 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 70c21b542..a49424e88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -16,18 +16,18 @@ checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" [[package]] name = "aho-corasick" -version = "0.7.15" +version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] [[package]] name = "anyhow" -version = "1.0.39" +version = "1.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cddc5f91628367664cc7c69714ff08deee8a3efc54623011c772544d7b2767" +checksum = "15af2628f6890fe2609a3b91bef4c83450512802e59489f9c1cb1fa5df064a61" [[package]] name = "arrayvec" @@ -55,8 +55,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522" dependencies = [ "askama_shared", - "proc-macro2 1.0.24", - "syn 1.0.64", + "proc-macro2 1.0.27", + "syn 1.0.73", ] [[package]] @@ -76,10 +76,10 @@ dependencies = [ "nom", "num-traits", "percent-encoding", - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", "serde", - "syn 1.0.64", + "syn 1.0.73", "toml", ] @@ -131,7 +131,7 @@ dependencies = [ "convert_case", "criterion", "flate2", - "heed 0.10.6", + "heed 0.11.0", "jemallocator", "milli", "reqwest", @@ -145,11 +145,10 @@ checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" [[package]] name = "bincode" -version = "1.3.1" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "f30d3a39baa26f9651f17b375061f3233dde33424a8b72b0dbe93a68a0bc896d" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" dependencies = [ - "byteorder", "serde", ] @@ -203,9 +202,9 @@ dependencies = [ [[package]] name = "bstr" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a40b47ad93e1a5404e6c18dec46b628214fee441c70f4ab5d6942142cc268a3d" +checksum = "90682c8d613ad3373e66de8c6411e0ae2ab2571e879d2efbf73558cc66f21279" dependencies = [ "lazy_static", "memchr", @@ -225,9 +224,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.6.1" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe" +checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" [[package]] name = "byte-tools" @@ -237,18 +236,18 @@ checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" [[package]] name = "byte-unit" -version = "4.0.10" +version = "4.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9520900471c3a9bbcfe0fd4c7b6bcfeff41b20a76cf91c59b7474b09be1ee27" +checksum = "063197e6eb4b775b64160dedde7a0986bb2836cce140e9492e9e96f28e18bcd8" dependencies = [ "utf8-width", ] [[package]] name = "bytemuck" -version = "1.5.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bed57e2090563b83ba8f83366628ce535a7584c9afa4c9fc0612a03925c6df58" +checksum = "9966d2ab714d0f785dbac0a0396251a35280aeb42413281617d0209ab4898435" [[package]] name = "byteorder" @@ -270,18 +269,18 @@ checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" [[package]] name = "cast" -version = "0.2.3" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0" +checksum = "57cdfa5d50aad6cb4d44dcab6101a7f79925bd59d82ca42f38a9856a28865374" dependencies = [ "rustc_version", ] [[package]] name = "cc" -version = "1.0.67" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" +checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787" dependencies = [ "jobserver", ] @@ -355,10 +354,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" [[package]] -name = "cpuid-bool" -version = "0.1.2" +name = "cpufeatures" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aebca1129a03dc6dc2b127edd729435bbc4a37e1d5f4d7513165089ceb02634" +checksum = "66c99696f6c9dd7f35d486b9d04d7e6e202aa3e8c40d553f2fdf5e7e0c6a71ef" +dependencies = [ + "libc", +] [[package]] name = "crc32fast" @@ -380,7 +382,7 @@ dependencies = [ "clap", "criterion-plot", "csv", - "itertools 0.10.0", + "itertools 0.10.1", "lazy_static", "num-traits", "oorandom", @@ -407,12 +409,12 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.3", + "crossbeam-utils 0.8.5", ] 
[[package]] @@ -423,17 +425,17 @@ checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", - "crossbeam-utils 0.8.3", + "crossbeam-utils 0.8.5", ] [[package]] name = "crossbeam-epoch" -version = "0.9.3" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" dependencies = [ "cfg-if 1.0.0", - "crossbeam-utils 0.8.3", + "crossbeam-utils 0.8.5", "lazy_static", "memoffset", "scopeguard", @@ -460,11 +462,10 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" dependencies = [ - "autocfg", "cfg-if 1.0.0", "lazy_static", ] @@ -493,9 +494,9 @@ dependencies = [ [[package]] name = "deunicode" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0b7756d6eb729250618a3693b34b3311b282e12aeeee7970ae2a70997c03eb6" +checksum = "7f37775d639f64aa16389eede0cbe6a70f56df4609d50d8b6858690d5d7bf8f2" [[package]] name = "digest" @@ -517,9 +518,9 @@ dependencies = [ [[package]] name = "dtoa" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d7ed2934d741c6b37e33e3832298e8850b53fd2d2bea03873375596c7cea4e" +checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0" [[package]] name = "either" @@ -578,9 +579,9 @@ checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" [[package]] name = "fst" -version = "0.4.5" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d79238883cf0307100b90aba4a755d8051a3182305dfe7f649a1e9dc0517006f" +checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" [[package]] name = "fuchsia-zircon" @@ -606,9 +607,9 @@ checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" [[package]] name = "futures" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f55667319111d593ba876406af7c409c0ebb44dc4be6132a783ccf163ea14c1" +checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" dependencies = [ "futures-channel", "futures-core", @@ -621,9 +622,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c2dd2df839b57db9ab69c2c9d8f3e8c81984781937fe2807dc6dcf3b2ad2939" +checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" dependencies = [ "futures-core", "futures-sink", @@ -631,15 +632,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15496a72fabf0e62bdc3df11a59a3787429221dd0710ba8ef163d6f7a9112c94" +checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" [[package]] name = "futures-executor" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"891a4b7b96d84d5940084b2a37632dd65deeae662c114ceaa2c879629c9c0ad1" +checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" dependencies = [ "futures-core", "futures-task", @@ -648,40 +649,42 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71c2c65c57704c32f5241c1223167c2c3294fd34ac020c807ddbe6db287ba59" +checksum = "acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" [[package]] name = "futures-macro" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea405816a5139fb39af82c2beb921d52143f556038378d6db21183a5c37fbfb7" +checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" dependencies = [ + "autocfg", "proc-macro-hack", - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", ] [[package]] name = "futures-sink" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85754d98985841b7d4f5e8e6fbfa4a4ac847916893ec511a2917ccd8525b8bb3" +checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" [[package]] name = "futures-task" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa189ef211c15ee602667a6fcfe1c1fd9e07d42250d2156382820fba33c9df80" +checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" [[package]] name = "futures-util" -version = "0.3.13" +version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1812c7ab8aedf8d6f2701a43e1243acdbcc2b36ab26e2ad421eb99ac963d96d1" +checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" dependencies = [ + "autocfg", "futures-channel", "futures-core", "futures-io", @@ -689,7 +692,7 @@ dependencies = [ "futures-sink", "futures-task", "memchr", - "pin-project-lite 0.2.6", + "pin-project-lite 0.2.7", "pin-utils", "proc-macro-hack", "proc-macro-nested", @@ -737,9 +740,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9495705279e7140bf035dde1f6e750c162df8b625267cd52cc44e0b156732c8" +checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ "cfg-if 1.0.0", "libc", @@ -800,7 +803,7 @@ dependencies = [ "http", "indexmap", "slab", - "tokio 1.6.0", + "tokio 1.7.1", "tokio-util 0.6.7", "tracing", ] @@ -827,6 +830,12 @@ version = "0.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" + [[package]] name = "headers" version = "0.3.4" @@ -839,7 +848,7 @@ dependencies = [ "headers-core", "http", "mime", - "sha-1 0.9.4", + "sha-1 0.9.6", "time", ] @@ -854,22 +863,23 @@ dependencies = [ [[package]] name = "heck" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" dependencies = [ "unicode-segmentation", ] 
[[package]] name = "heed" -version = "0.10.6" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afcc6c911acaadad3ebe9f1ef1707d80bd71c92037566f47b6238a03b60adf1a" +checksum = "269c7486ed6def5d7b59a427cec3e87b4d4dd4381d01e21c8c9f2d3985688392" dependencies = [ + "bytemuck", "byteorder", - "heed-traits 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", - "heed-types 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", + "heed-traits 0.8.0", + "heed-types 0.8.0", "libc", "lmdb-rkv-sys 0.11.0", "once_cell", @@ -877,7 +887,6 @@ dependencies = [ "serde", "synchronoise", "url", - "zerocopy", ] [[package]] @@ -886,8 +895,8 @@ version = "0.12.0" source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" dependencies = [ "byteorder", - "heed-traits 0.7.0 (git+https://github.com/Kerollmops/heed?tag=v0.12.0)", - "heed-types 0.7.2 (git+https://github.com/Kerollmops/heed?tag=v0.12.0)", + "heed-traits 0.7.0", + "heed-types 0.7.2", "libc", "lmdb-rkv-sys 0.15.0", "once_cell", @@ -901,22 +910,21 @@ dependencies = [ [[package]] name = "heed-traits" version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b328f6260a7e51bdb0ca6b68e6ea27ee3d11fba5dee930896ee7ff6ad5fc072c" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" [[package]] name = "heed-traits" -version = "0.7.0" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a53a94e5b2fd60417e83ffdfe136c39afacff0d4ac1d8d01cd66928ac610e1a2" [[package]] name = "heed-types" version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e628efb08beaee58355f80dc4adba79d644940ea9eef60175ea17dc218aab405" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" dependencies = [ "bincode", - "heed-traits 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "heed-traits 0.7.0", "serde", "serde_json", "zerocopy", @@ -924,23 +932,25 @@ dependencies = [ [[package]] name = "heed-types" -version = "0.7.2" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a6cf0a6952fcedc992602d5cddd1e3fff091fbe87d38636e3ec23a31f32acbd" dependencies = [ "bincode", - "heed-traits 0.7.0 (git+https://github.com/Kerollmops/heed?tag=v0.12.0)", + "bytemuck", + "byteorder", + "heed-traits 0.8.0", "serde", "serde_json", - "zerocopy", ] [[package]] name = "helpers" -version = "0.6.0" +version = "0.7.0" dependencies = [ "anyhow", "byte-unit", - "heed 0.10.6", + "heed 0.12.0", "jemallocator", "milli", "stderrlog", @@ -949,18 +959,18 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.1.18" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "322f4de77956e22ed0e5032c359a0f1273f1f7f0d79bfa3b8ffbc730d7fbcc5c" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "http" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7245cd7449cc792608c3c8a9eaf69bd4eabbabf802713748fd739c98b82f0747" +checksum = 
"527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" dependencies = [ "bytes 1.0.1", "fnv", @@ -985,12 +995,12 @@ checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" dependencies = [ "bytes 1.0.1", "http", - "pin-project-lite 0.2.6", + "pin-project-lite 0.2.7", ] [[package]] name = "http-ui" -version = "0.6.0" +version = "0.7.0" dependencies = [ "anyhow", "askama", @@ -1025,9 +1035,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.3.5" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "615caabe2c3160b313d52ccc905335f4ed5f10881dd63dc5699d47e90be85691" +checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68" [[package]] name = "httpdate" @@ -1035,6 +1045,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" +[[package]] +name = "httpdate" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" + [[package]] name = "human_format" version = "1.0.3" @@ -1043,9 +1059,9 @@ checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0" [[package]] name = "humansize" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6cab2627acfc432780848602f3f558f7e9dd427352224b0d9324025796d2a5e" +checksum = "02296996cb8796d7c6e3bc2d9211b7802812d36999a51bb754123ead7d37d026" [[package]] name = "hyper" @@ -1061,9 +1077,9 @@ dependencies = [ "http", "http-body 0.3.1", "httparse", - "httpdate", + "httpdate 0.3.2", "itoa", - "pin-project 1.0.5", + "pin-project 1.0.7", "socket2 0.3.19", "tokio 0.2.25", "tower-service", @@ -1073,9 +1089,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.5" +version = "0.14.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bf09f61b52cfcf4c00de50df88ae423d6c02354e385a86341133b5338630ad1" +checksum = "07d6baa1b441335f3ce5098ac421fb6547c46dda735ca1bc6d0153c838f9dd83" dependencies = [ "bytes 1.0.1", "futures-channel", @@ -1085,11 +1101,11 @@ dependencies = [ "http", "http-body 0.4.2", "httparse", - "httpdate", + "httpdate 1.0.1", "itoa", - "pin-project 1.0.5", + "pin-project-lite 0.2.7", "socket2 0.4.0", - "tokio 1.6.0", + "tokio 1.7.1", "tower-service", "tracing", "want", @@ -1102,19 +1118,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64" dependencies = [ "futures-util", - "hyper 0.14.5", + "hyper 0.14.9", "log", "rustls", - "tokio 1.6.0", + "tokio 1.7.1", "tokio-rustls", "webpki", ] [[package]] name = "idna" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89829a5d69c23d348314a7ac337fe39173b61149a9864deabd260983aed48c21" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" dependencies = [ "matches", "unicode-bidi", @@ -1133,12 +1149,12 @@ dependencies = [ [[package]] name = "infos" -version = "0.6.0" +version = "0.7.0" dependencies = [ "anyhow", "byte-unit", "csv", - "heed 0.10.6", + "heed 0.12.0", "jemallocator", "milli", "roaring", @@ -1167,9 +1183,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.3.0" +version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"47be2f14c678be2fdcab04ab1171db51b2762ce6f0a8ee87c8dd4a04ed216135" +checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" [[package]] name = "itertools" @@ -1182,9 +1198,9 @@ dependencies = [ [[package]] name = "itertools" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" dependencies = [ "either", ] @@ -1218,13 +1234,13 @@ dependencies = [ [[package]] name = "jieba-rs" -version = "0.6.2" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a" +checksum = "fea3b3172a80f9958abc3b9a637e4e311cd696dc6813440e5cc929b8a5311055" dependencies = [ "cedarwood", "fxhash", - "hashbrown 0.9.1", + "hashbrown 0.11.2", "lazy_static", "phf", "phf_codegen", @@ -1233,18 +1249,18 @@ dependencies = [ [[package]] name = "jobserver" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c71313ebb9439f74b00d9d2dcec36440beaf57a6aa0623068441dd7cd81a7f2" +checksum = "972f5ae5d1cb9c6ae417789196c803205313edde988685da5e3aae0827b9e7fd" dependencies = [ "libc", ] [[package]] name = "js-sys" -version = "0.3.49" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc15e39392125075f60c95ba416f5381ff6c3a948ff02ab12464715adf56c821" +checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062" dependencies = [ "wasm-bindgen", ] @@ -1267,18 +1283,18 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "levenshtein_automata" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f44db4199cdb049b494a92d105acbfa43c25b3925e33803923ba9580b7bc9e1a" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" dependencies = [ "fst", ] [[package]] name = "lexical-core" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21f866863575d0e1d654fbeeabdc927292fdf862873dc3c96c6f753357e13374" +checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" dependencies = [ "arrayvec", "bitflags", @@ -1289,9 +1305,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.91" +version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8916b1f6ca17130ec6568feccee27c156ad12037880833a3b842a823236502e7" +checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6" [[package]] name = "linked-hash-map" @@ -1397,9 +1413,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.3.4" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "memmap" @@ -1413,16 +1429,16 @@ dependencies = [ [[package]] name = "memoffset" -version = "0.6.1" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" dependencies = [ "autocfg", ] [[package]] name = 
"milli" -version = "0.6.0" +version = "0.7.0" dependencies = [ "big_s", "bstr", @@ -1436,7 +1452,7 @@ dependencies = [ "grenad", "heed 0.12.0", "human_format", - "itertools 0.10.0", + "itertools 0.10.1", "levenshtein_automata", "linked-hash-map", "log", @@ -1449,7 +1465,7 @@ dependencies = [ "ordered-float", "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", "pest_derive", - "rand 0.8.3", + "rand 0.8.4", "rayon", "regex", "roaring", @@ -1510,9 +1526,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.7.11" +version = "0.7.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf80d3e903b34e0bd7282b218398aec54e082c840d9baf8339e0080a0c542956" +checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" dependencies = [ "libc", "log", @@ -1665,9 +1681,9 @@ checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8" [[package]] name = "once_cell" -version = "1.7.2" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" [[package]] name = "oorandom" @@ -1689,9 +1705,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" [[package]] name = "ordered-float" -version = "2.1.1" +version = "2.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "766f840da25490628d8e63e529cd21c014f6600c6b8517add12a6fa6167a6218" +checksum = "f100fcfb41e5385e0991f74981732049f9b896821542a219420491046baafdc2" dependencies = [ "num-traits", ] @@ -1747,9 +1763,9 @@ checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" dependencies = [ "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", "pest_meta", - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", ] [[package]] @@ -1803,42 +1819,42 @@ dependencies = [ [[package]] name = "pin-project" -version = "0.4.27" +version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffbc8e94b38ea3d2d8ba92aea2983b503cd75d0888d75b86bb37970b5698e15" +checksum = "918192b5c59119d51e0cd221f4d49dde9112824ba717369e903c97d076083d0f" dependencies = [ - "pin-project-internal 0.4.27", + "pin-project-internal 0.4.28", ] [[package]] name = "pin-project" -version = "1.0.5" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96fa8ebb90271c4477f144354485b8068bd8f6b78b428b01ba892ca26caf0b63" +checksum = "c7509cc106041c40a4518d2af7a61530e1eed0e6285296a3d8c5472806ccc4a4" dependencies = [ - "pin-project-internal 1.0.5", + "pin-project-internal 1.0.7", ] [[package]] name = "pin-project-internal" -version = "0.4.27" +version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65ad2ae56b6abe3a1ee25f15ee605bacadb9a764edaba9c2bf4103800d4a1895" +checksum = "3be26700300be6d9d23264c73211d8190e755b6b5ca7a1b28230025511b52a5e" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", ] [[package]] name = "pin-project-internal" -version = "1.0.5" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758669ae3558c6f74bd2a18b41f7ac0b5a195aea6639d6a9b5e5d1ad5ba24c0b" +checksum = "48c950132583b500556b1efd71d45b319029f2b71518d979fcc208e16b42426f" dependencies = [ - 
"proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", ] [[package]] @@ -1849,9 +1865,9 @@ checksum = "257b64915a082f7811703966789728173279bdebb956b143dbcd23f6f970a777" [[package]] name = "pin-project-lite" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc0e1f259c92177c30a4c9d177246edd0a3568b25756a977d0632cf8fa37e905" +checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" [[package]] name = "pin-utils" @@ -1867,9 +1883,9 @@ checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" [[package]] name = "plotters" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45ca0ae5f169d0917a7c7f5a9c1a3d3d9598f18f529dd2b8373ed988efea307a" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" dependencies = [ "num-traits", "plotters-backend", @@ -1906,9 +1922,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ "proc-macro-error-attr", - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", "version_check", ] @@ -1918,7 +1934,7 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", "version_check", ] @@ -1946,11 +1962,11 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.24" +version = "1.0.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" +checksum = "f0d8caf72986c1a598726adc988bb5984792ef84f5ee5aa50209145ee8077038" dependencies = [ - "unicode-xid 0.2.1", + "unicode-xid 0.2.2", ] [[package]] @@ -1974,7 +1990,7 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", ] [[package]] @@ -1999,14 +2015,14 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ef9e7e66b4468674bfcb0c81af8b7fa0bb154fa9f28eb840da5c447baeb8d7e" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" dependencies = [ "libc", - "rand_chacha 0.3.0", - "rand_core 0.6.2", - "rand_hc 0.3.0", + "rand_chacha 0.3.1", + "rand_core 0.6.3", + "rand_hc 0.3.1", ] [[package]] @@ -2021,12 +2037,12 @@ dependencies = [ [[package]] name = "rand_chacha" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e12735cf05c9e10bf21534da50a147b924d555dc7a547c42e6bb2d5b6017ae0d" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core 0.6.2", + "rand_core 0.6.3", ] [[package]] @@ -2040,11 +2056,11 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34cf66eb183df1c5876e2dcf6b13d57340741e8dc255b48e40a26de954d06ae7" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" dependencies = [ - "getrandom 0.2.2", + 
"getrandom 0.2.3", ] [[package]] @@ -2058,11 +2074,11 @@ dependencies = [ [[package]] name = "rand_hc" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3190ef7066a446f2e7f42e239d161e905420ccab01eb967c9eb27d21b2322a73" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" dependencies = [ - "rand_core 0.6.2", + "rand_core 0.6.3", ] [[package]] @@ -2076,9 +2092,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b0d8e0819fadc20c74ea8373106ead0600e3a67ef1fe8da56e39b9ae7275674" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" dependencies = [ "autocfg", "crossbeam-deque", @@ -2088,31 +2104,31 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.9.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ab346ac5921dc62ffa9f89b7a773907511cdfa5490c572ae9be1be33e8afa4a" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" dependencies = [ "crossbeam-channel", "crossbeam-deque", - "crossbeam-utils 0.8.3", + "crossbeam-utils 0.8.5", "lazy_static", "num_cpus", ] [[package]] name = "redox_syscall" -version = "0.2.5" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9" +checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.4.5" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "957056ecddbeba1b26965114e191d2e8589ce74db242b6ea25fc4062427a5c19" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", @@ -2121,18 +2137,15 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4" -dependencies = [ - "byteorder", -] +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-syntax" -version = "0.6.23" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "remove_dir_all" @@ -2145,9 +2158,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.3" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2296f2fac53979e8ccbc4a1136b25dcefd37be9ed7e4a1f6b05a6029c84ff124" +checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22" dependencies = [ "base64 0.13.0", "bytes 1.0.1", @@ -2156,7 +2169,7 @@ dependencies = [ "futures-util", "http", "http-body 0.4.2", - "hyper 0.14.5", + "hyper 0.14.9", "hyper-rustls", "ipnet", "js-sys", @@ -2164,11 +2177,11 @@ dependencies = [ "log", "mime", "percent-encoding", - "pin-project-lite 0.2.6", + "pin-project-lite 0.2.7", "rustls", "serde", "serde_urlencoded 0.7.0", - "tokio 1.6.0", + "tokio 1.7.1", "tokio-rustls", "url", "wasm-bindgen", @@ -2180,9 +2193,9 @@ dependencies = [ [[package]] name = "retain_mut" -version = "0.1.2" +version = "0.1.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" +checksum = "e9c17925a9027d298a4603d286befe3f9dc0e8ed02523141914eb628798d6e5b" [[package]] name = "ring" @@ -2201,9 +2214,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.6.6" +version = "0.6.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4b2e7ab0bbb2d144558ae3f4761a0db06d21463b45756fc64c3393cdba3d447" +checksum = "536cfa885fc388b8ae69edf96d7970849b7d9c1395da1b8330f17715babf8a09" dependencies = [ "bytemuck", "byteorder", @@ -2212,9 +2225,9 @@ dependencies = [ [[package]] name = "rustc_version" -version = "0.2.3" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" dependencies = [ "semver", ] @@ -2277,11 +2290,11 @@ dependencies = [ [[package]] name = "search" -version = "0.6.0" +version = "0.7.0" dependencies = [ "anyhow", "byte-unit", - "heed 0.10.6", + "heed 0.12.0", "jemallocator", "log", "milli", @@ -2292,24 +2305,27 @@ dependencies = [ [[package]] name = "semver" -version = "0.9.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" dependencies = [ "semver-parser", ] [[package]] name = "semver-parser" -version = "0.7.0" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" +checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" +dependencies = [ + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] [[package]] name = "serde" -version = "1.0.125" +version = "1.0.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171" +checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" dependencies = [ "serde_derive", ] @@ -2326,13 +2342,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.125" +version = "1.0.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d" +checksum = "963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", ] [[package]] @@ -2349,9 +2365,9 @@ dependencies = [ [[package]] name = "serde_test" -version = "1.0.125" +version = "1.0.126" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4bb5fef7eaf5a97917567183607ac4224c5b451c15023930f23b937cce879fe" +checksum = "bd1055d1c20532080b9da5040ec8e27425f4d4573d8e29eb19ba4ff1e4b9da2d" dependencies = [ "serde", ] @@ -2394,22 +2410,22 @@ dependencies = [ [[package]] name = "sha-1" -version = "0.9.4" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfebf75d25bd900fd1e7d11501efab59bc846dbc76196839663e6637bba9f25f" +checksum = "8c4cfa741c5832d0ef7fab46cabed29c2aae926db0b11bb2069edd8db5e64e16" dependencies = [ "block-buffer 0.9.0", "cfg-if 1.0.0", - "cpuid-bool", + "cpufeatures", "digest 0.9.0", "opaque-debug 
0.3.0", ] [[package]] name = "signal-hook-registry" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f1d0fef1604ba8f7a073c7e701f213e056707210e9020af4528e0101ce11a6" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" dependencies = [ "libc", ] @@ -2422,9 +2438,9 @@ checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27" [[package]] name = "slab" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" +checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" [[package]] name = "slice-group-by" @@ -2450,9 +2466,9 @@ checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" [[package]] name = "snap" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc725476a1398f0480d56cd0ad381f6f32acf2642704456f8f59a35df464b59a" +checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" [[package]] name = "socket2" @@ -2519,9 +2535,9 @@ checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90" dependencies = [ "heck", "proc-macro-error", - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", ] [[package]] @@ -2537,13 +2553,13 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.64" +version = "1.0.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fd9d1e9976102a03c542daa2eff1b43f9d72306342f3f8b3ed5fb8908195d6f" +checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "unicode-xid 0.2.1", + "unicode-xid 0.2.2", ] [[package]] @@ -2561,10 +2577,10 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", - "unicode-xid 0.2.1", + "syn 1.0.73", + "unicode-xid 0.2.2", ] [[package]] @@ -2581,7 +2597,7 @@ checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" dependencies = [ "cfg-if 1.0.0", "libc", - "rand 0.8.3", + "rand 0.8.4", "redox_syscall", "remove_dir_all", "winapi 0.3.9", @@ -2648,9 +2664,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "317cca572a0e89c3ce0ca1f1bdc9369547fe318a683418e42ac8f59d14701023" +checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342" dependencies = [ "tinyvec_macros", ] @@ -2687,17 +2703,18 @@ dependencies = [ [[package]] name = "tokio" -version = "1.6.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" +checksum = "5fb2ed024293bb19f7a5dc54fe83bf86532a44c12a2bb8ba40d64a4509395ca2" dependencies = [ "autocfg", "bytes 1.0.1", "libc", "memchr", - "mio 0.7.11", + "mio 0.7.13", "num_cpus", - "pin-project-lite 0.2.6", + "pin-project-lite 0.2.7", + "winapi 0.3.9", ] [[package]] @@ -2706,9 +2723,9 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" 
dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", ] [[package]] @@ -2718,7 +2735,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" dependencies = [ "rustls", - "tokio 1.6.0", + "tokio 1.7.1", "webpki", ] @@ -2730,7 +2747,7 @@ checksum = "6d9e878ad426ca286e4dcae09cbd4e1973a7f8987d97570e2469703dd7f5720c" dependencies = [ "futures-util", "log", - "pin-project 0.4.27", + "pin-project 0.4.28", "tokio 0.2.25", "tungstenite", ] @@ -2759,8 +2776,8 @@ dependencies = [ "futures-core", "futures-sink", "log", - "pin-project-lite 0.2.6", - "tokio 1.6.0", + "pin-project-lite 0.2.7", + "tokio 1.7.1", ] [[package]] @@ -2780,21 +2797,21 @@ checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" [[package]] name = "tracing" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ebdc2bb4498ab1ab5f5b73c5803825e60199229ccba0698170e3be0e7f959f" +checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" dependencies = [ "cfg-if 1.0.0", "log", - "pin-project-lite 0.2.6", + "pin-project-lite 0.2.7", "tracing-core", ] [[package]] name = "tracing-core" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f50de3927f93d202783f4513cda820ab47ef17f624b03c096e86ef00c67e6b5f" +checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052" dependencies = [ "lazy_static", ] @@ -2805,7 +2822,7 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" dependencies = [ - "pin-project 1.0.5", + "pin-project 1.0.7", "tracing", ] @@ -2829,7 +2846,7 @@ dependencies = [ "input_buffer", "log", "rand 0.7.3", - "sha-1 0.9.4", + "sha-1 0.9.6", "url", "utf-8", ] @@ -2866,18 +2883,18 @@ dependencies = [ [[package]] name = "unicode-bidi" -version = "0.3.4" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" +checksum = "eeb8be209bb1c96b7c177c7420d26e04eccacb0eeae6b980e35fcb74678107e0" dependencies = [ "matches", ] [[package]] name = "unicode-normalization" -version = "0.1.17" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07fbfce1c8a97d547e8b5334978438d9d6ec8c20e38f56d4a4374d181493eaef" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" dependencies = [ "tinyvec", ] @@ -2902,9 +2919,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" [[package]] name = "unicode-xid" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" [[package]] name = "untrusted" @@ -2914,9 +2931,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.2.1" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ccd964113622c8e9322cfac19eb1004a07e636c545f325da085d5cdde6f1f8b" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" dependencies = [ "form_urlencoded", "idna", @@ 
-2926,21 +2943,21 @@ dependencies = [ [[package]] name = "urlencoding" -version = "1.1.1" +version = "1.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9232eb53352b4442e40d7900465dfc534e8cb2dc8f18656fcb2ac16112b5593" +checksum = "5a1f0175e03a0973cf4afd476bef05c26e228520400eb1fd473ad417b1c00ffb" [[package]] name = "utf-8" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05e42f7c18b8f902290b009cde6d651262f956c98bc51bca4cd1d511c9cd85c7" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" [[package]] name = "utf8-width" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9071ac216321a4470a69fb2b28cfc68dcd1a39acd877c8be8e014df6772d8efa" +checksum = "7cf7d77f457ef8dfa11e4cd5933c5ddb5dc52a94664071951219a97710f0a32b" [[package]] name = "uuid" @@ -2948,7 +2965,7 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" dependencies = [ - "getrandom 0.2.2", + "getrandom 0.2.3", ] [[package]] @@ -2993,7 +3010,7 @@ dependencies = [ "mime", "mime_guess", "multipart", - "pin-project 0.4.27", + "pin-project 0.4.28", "scoped-tls", "serde", "serde_json", @@ -3020,9 +3037,9 @@ checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" [[package]] name = "wasm-bindgen" -version = "0.2.72" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fe8f61dba8e5d645a4d8132dc7a0a66861ed5e1045d2c0ed940fab33bac0fbe" +checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd" dependencies = [ "cfg-if 1.0.0", "serde", @@ -3032,24 +3049,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.72" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "046ceba58ff062da072c7cb4ba5b22a37f00a302483f7e2a6cdc18fedbdc1fd3" +checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900" dependencies = [ "bumpalo", "lazy_static", "log", - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.22" +version = "0.4.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73157efb9af26fb564bb59a009afd1c7c334a44db171d280690d0c3faaec3468" +checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1" dependencies = [ "cfg-if 1.0.0", "js-sys", @@ -3059,9 +3076,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.72" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ef9aa01d36cda046f797c57959ff5f3c615c9cc63997a8d545831ec7976819b" +checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4" dependencies = [ "quote 1.0.9", "wasm-bindgen-macro-support", @@ -3069,28 +3086,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.72" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96eb45c1b2ee33545a813a92dbb53856418bf7eb54ab34f7f7ff1448a5b3735d" +checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.27", "quote 1.0.9", - "syn 1.0.64", + "syn 1.0.73", "wasm-bindgen-backend", 
"wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.72" +version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7148f4696fb4960a346eaa60bbfb42a1ac4ebba21f750f75fc1375b098d5ffa" +checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f" [[package]] name = "web-sys" -version = "0.3.49" +version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59fe19d70f5dacc03f6e46777213facae5ac3801575d56ca6cbd4c93dcd12310" +checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582" dependencies = [ "js-sys", "wasm-bindgen", @@ -3208,8 +3225,8 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" dependencies = [ - "proc-macro2 1.0.24", - "syn 1.0.64", + "proc-macro2 1.0.27", + "syn 1.0.73", "synstructure", ] diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index bd509904d..86fecf5a1 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.6.0" +version = "0.7.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index e9525dd98..1e76559a0 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.6.0" +version = "0.7.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 3c7bef2b8..0d68055c2 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.6.0" +version = "0.7.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1e6c2a9a0..17c3a578c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.6.0" +version = "0.7.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 16c82a93f..7dbfcc52a 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.6.0" +version = "0.7.0" authors = ["Clément Renault "] edition = "2018" From 56fceb19287436391a85106674686dd772db35ce Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 29 Jun 2021 15:06:03 +0200 Subject: [PATCH 0842/1889] re-implement the Damerau-Levenshtein used for the highlighting --- milli/src/search/matching_words.rs | 123 ++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 38 deletions(-) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index cd8e404b8..22e63edfa 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -33,18 +33,19 @@ impl MatchingWords { } /// Returns the number of matching bytes if the word matches one of the query words. 
- pub fn matching_bytes(&self, word: &str) -> Option<usize> { - self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(word) { - Distance::Exact(t) if t <= *typo => { - if *is_prefix { - let (_dist, len) = - prefix_damerau_levenshtein(query_word.as_bytes(), word.as_bytes()); - Some(len) - } else { - Some(word.len()) + pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> { + self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| { + match dfa.eval(word_to_highlight) { + Distance::Exact(t) if t <= *typo => { + if *is_prefix { + let len = bytes_to_highlight(word_to_highlight, query_word); + Some(len) + } else { + Some(word_to_highlight.len()) + } } + _otherwise => None, } - _otherwise => None, }) } } @@ -101,20 +102,23 @@ impl<T> IndexMut<(usize, usize)> for N2Array<T> { } } -/// Returns the distance between the source word and the target word, -/// and the number of byte matching in the target word. -fn prefix_damerau_levenshtein(source: &[u8], target: &[u8]) -> (u32, usize) { - let (n, m) = (source.len(), target.len()); +/// Returns the number of **bytes** we want to highlight in the `source` word. +/// Basically we want to highlight as many characters as possible in the source until it has too many +/// typos (= 2). +/// The algorithm is a modified +/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) +fn bytes_to_highlight(source: &str, target: &str) -> usize { + let (n, m) = (source.chars().count(), target.chars().count()); if n == 0 { - return (m as u32, 0); + return 0; } - if m == 0 { - return (n as u32, 0); + // since we allow two typos we can return two characters even if they're completely wrong + if m < 3 { + return source.chars().take(m).map(|c| c.len_utf8()).sum(); } - if n == m && source == target { - return (0, m); + return source.len(); } let inf = n + m; @@ -132,11 +136,11 @@ let mut last_row = BTreeMap::new(); - for (row, char_s) in source.iter().enumerate() { + for (row, char_s) in source.chars().enumerate() { let mut last_match_col = 0; let row = row + 1; - for (col, char_t) in target.iter().enumerate() { + for (col, char_t) in target.chars().enumerate() { let col = col + 1; let last_match_row = *last_row.get(&char_t).unwrap_or(&0); let cost = if char_s == char_t { 0 } else { 1 }; @@ -148,9 +152,7 @@ + (row - last_match_row - 1) + 1 + (col - last_match_col - 1); - let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans)); - matrix[(row + 1, col + 1)] = dist; if cost == 0 { @@ -161,32 +163,77 @@ last_row.insert(char_s, row); } - let mut minimum = (u32::max_value(), 0); - - for x in 0..=m { - let dist = matrix[(n + 1, x + 1)] as u32; - if dist < minimum.0 { - minimum = (dist, x) + let mut minimum = 2; + for x in 0..=n { + // let dist = matrix[(x + 1, m + 1)]; + let min_dist = (0..=m).map(|y| matrix[(x + 1, y + 1)]).min().unwrap(); + if min_dist <= 2 { + minimum = x; } } - minimum + // everything was done character-wise and now we want to return a number of bytes + source.chars().take(minimum).map(|c| c.len_utf8()).sum() } #[cfg(test)] mod tests { + use std::str::from_utf8; + use super::*; use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::MatchingWords; #[test] - fn matched_length() { - let query = "Levenste"; - let text = "Levenshtein"; + fn 
test_bytes_to_highlight() { struct TestBytesToHighlight { query: &'static str, text: &'static str, length: usize, } let tests = [ TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() }, TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() }, TestBytesToHighlight { query: "Levenshtein", text: "Levenshtein", length: "Levenshtein".len(), }, // we get to the end of our word with only one typo TestBytesToHighlight { query: "Levenste", text: "Levenshtein", length: "Levenste".len(), }, // we get our third and last authorized typo right on the last character TestBytesToHighlight { query: "Levenstein", text: "Levenshte", length: "Levenstei".len(), }, // we get to the end of our word with only two typos at the beginning TestBytesToHighlight { query: "Bavenshtein", text: "Levenshtein", length: "Bavenshtein".len(), }, // Since we calculate a distance char by char we are supposed to have only two mistakes // here. That wouldn't be the case if we were computing the distance byte by byte TestBytesToHighlight { query: "Båve", text: "Chiøt", length: "Bå".len() }, TestBytesToHighlight { query: "💪🙂🍤", text: "plouf", length: "💪🙂".len() }, TestBytesToHighlight { query: "clôu¿i", text: "bloubi", length: "clôu".len() }, ]; for test in &tests { let length = bytes_to_highlight(test.query, test.text); assert_eq!(length, test.length, r#"length between: "{}" "{}""#, test.query, test.text); assert!( from_utf8(&test.query.as_bytes()[..length]).is_ok(), r#"converting {}[..{}] to a utf8 str failed"#, test.query, length ); } } #[test] @@ -214,9 +261,9 @@ mod tests { assert_eq!(matching_words.matching_bytes("word"), Some(4)); assert_eq!(matching_words.matching_bytes("nyc"), None); assert_eq!(matching_words.matching_bytes("world"), Some(5)); - assert_eq!(matching_words.matching_bytes("splitted"), Some(5)); + assert_eq!(matching_words.matching_bytes("splitted"), Some(7)); assert_eq!(matching_words.matching_bytes("thisnew"), None); assert_eq!(matching_words.matching_bytes("borld"), Some(5)); - assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4)); + assert_eq!(matching_words.matching_bytes("wordsplit"), Some(5)); } } From be75e738b1bd4d28099892a89723c21cd7d2758e Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 29 Jun 2021 16:18:53 +0200 Subject: [PATCH 0843/1889] add more tests --- milli/src/search/matching_words.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 22e63edfa..9aad52110 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -222,6 +222,22 @@ mod tests { TestBytesToHighlight { query: "Båve", text: "Chiøt", length: "Bå".len() }, TestBytesToHighlight { query: "💪🙂🍤", text: "plouf", length: "💪🙂".len() }, TestBytesToHighlight { query: "clôu¿i", text: "bloubi", length: "clôu".len() }, + TestBytesToHighlight { + query: "Альфа", text: "Альфой", length: "Альфа".len() + }, + TestBytesToHighlight { + query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() + }, + TestBytesToHighlight { + query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len() + }, + TestBytesToHighlight { + query: "chäräcters", + text: "chäräcters", + length: "chäräcters".len(), + }, + TestBytesToHighlight { query: "ch", text: 
"chäräcters", length: "ch".len() }, + TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() }, ]; for test in &tests { From 6044b8036243e33255522489f6ec66076ebfdc73 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 30 Jun 2021 00:35:26 +0200 Subject: [PATCH 0844/1889] Update milli/src/search/matching_words.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/matching_words.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 9aad52110..291378b43 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -165,7 +165,6 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize { let mut minimum = 2; for x in 0..=n { - // let dist = matrix[(x + 1, m + 1)]; let min_dist = (0..=m).map(|y| matrix[(x + 1, y + 1)]).min().unwrap(); if min_dist <= 2 { minimum = x; From 4bce66d5ff99a39d5f6eb9ce15b154ef6faf3a35 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 30 Jun 2021 10:07:31 +0200 Subject: [PATCH 0845/1889] Make the Index::delete_* method private --- milli/src/index.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index c8e5ab089..6bcb0aebd 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -218,7 +218,7 @@ impl Index { } /// Deletes the primary key of the documents, this can be done to reset indexes settings. - pub fn delete_primary_key(&self, wtxn: &mut RwTxn) -> heed::Result { + pub(crate) fn delete_primary_key(&self, wtxn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(wtxn, main_key::PRIMARY_KEY_KEY) } @@ -333,7 +333,7 @@ impl Index { /// Deletes the displayed fields ids, this will make the engine to display /// all the documents attributes in the order of the `FieldsIdsMap`. - pub fn delete_displayed_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + pub(crate) fn delete_displayed_fields(&self, wtxn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(wtxn, main_key::DISPLAYED_FIELDS_KEY) } @@ -383,7 +383,7 @@ impl Index { } /// Deletes the searchable fields, when no fields are specified, all fields are indexed. - pub fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + pub(crate) fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(wtxn, main_key::SEARCHABLE_FIELDS_KEY) } @@ -429,7 +429,7 @@ impl Index { } /// Deletes the filterable fields ids in the database. 
- pub fn delete_filterable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { + pub(crate) fn delete_filterable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { self.main.delete::<_, Str>(wtxn, main_key::FILTERABLE_FIELDS_KEY) } @@ -602,7 +602,7 @@ impl Index { self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) } - pub fn delete_criteria(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { + pub(crate) fn delete_criteria(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { self.main.delete::<_, Str>(wtxn, main_key::CRITERIA_KEY) } @@ -642,7 +642,7 @@ impl Index { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) } - pub fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { + pub(crate) fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { self.main.delete::<_, Str>(wtxn, main_key::STOP_WORDS_KEY) } @@ -663,7 +663,7 @@ impl Index { self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) } - pub fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { + pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY) } From 54889813cefe4f0978ecc87aea2c47551edbb3c5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 30 Jun 2021 10:43:12 +0200 Subject: [PATCH 0846/1889] Implement some debug functions on the ExternalDocumentsIds struct --- milli/src/external_documents_ids.rs | 34 ++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 3dec638da..419105bd5 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -1,8 +1,12 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::convert::TryInto; +use std::{fmt, str}; use fst::{IntoStreamer, Streamer}; +const DELETED_ID: u64 = u64::MAX; + pub struct ExternalDocumentsIds<'a> { pub(crate) hard: fst::Map<Cow<'a, [u8]>>, pub(crate) soft: fst::Map<Cow<'a, [u8]>>, @@ -32,7 +36,7 @@ impl<'a> ExternalDocumentsIds<'a> { let external_id = external_id.as_ref(); match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { // u64 MAX means deleted in the soft fst map - Some(id) if id != u64::MAX => Some(id.try_into().unwrap()), + Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()), _otherwise => None, } } @@ -47,7 +51,7 @@ impl<'a> ExternalDocumentsIds<'a> { if docids.iter().any(|v| v.index == 1) { // If the `other` set returns a value here it means // that it must be marked as deleted. - new_soft_builder.insert(external_id, u64::MAX)?; + new_soft_builder.insert(external_id, DELETED_ID)?; } else { new_soft_builder.insert(external_id, docids[0].value)?; } @@ -77,6 +81,24 @@ impl<'a> ExternalDocumentsIds<'a> { self.merge_soft_into_hard() } + /// A helper function to debug this type; returns a `HashMap` of both, + /// soft and hard fst maps, combined. 
+ pub fn to_hash_map(&self) -> HashMap { + let mut map = HashMap::new(); + + let union_op = self.hard.op().add(&self.soft).r#union(); + let mut iter = union_op.into_stream(); + while let Some((external_id, marked_docids)) = iter.next() { + let id = marked_docids.last().unwrap().value; + if id != DELETED_ID { + let external_id = str::from_utf8(external_id).unwrap(); + map.insert(external_id.to_owned(), id.try_into().unwrap()); + } + } + + map + } + fn merge_soft_into_hard(&mut self) -> fst::Result<()> { if self.soft.len() >= self.hard.len() / 2 { let union_op = self.hard.op().add(&self.soft).r#union(); @@ -85,7 +107,7 @@ impl<'a> ExternalDocumentsIds<'a> { let mut new_hard_builder = fst::MapBuilder::memory(); while let Some((external_id, docids)) = iter.next() { if docids.len() == 2 { - if docids[1].value != u64::MAX { + if docids[1].value != DELETED_ID { new_hard_builder.insert(external_id, docids[1].value)?; } } else { @@ -103,6 +125,12 @@ impl<'a> ExternalDocumentsIds<'a> { } } +impl fmt::Debug for ExternalDocumentsIds<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish() + } +} + impl Default for ExternalDocumentsIds<'static> { fn default() -> Self { ExternalDocumentsIds { From b489515f4d4252ccc0709dee073132ec5e787491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 30 Jun 2021 13:52:46 +0200 Subject: [PATCH 0847/1889] Update milli version to v0.7.1 --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a49424e88..055243c32 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -946,7 +946,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.7.0" +version = "0.7.1" dependencies = [ "anyhow", "byte-unit", @@ -1000,7 +1000,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.7.0" +version = "0.7.1" dependencies = [ "anyhow", "askama", @@ -1149,7 +1149,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.7.0" +version = "0.7.1" dependencies = [ "anyhow", "byte-unit", @@ -1438,7 +1438,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.7.0" +version = "0.7.1" dependencies = [ "big_s", "bstr", @@ -2290,7 +2290,7 @@ dependencies = [ [[package]] name = "search" -version = "0.7.0" +version = "0.7.1" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 86fecf5a1..6997d0d3a 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.7.0" +version = "0.7.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 1e76559a0..f09406cce 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.7.0" +version = "0.7.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 0d68055c2..9a9d39744 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.7.0" +version = "0.7.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 17c3a578c..3b025a805 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = 
"0.7.0" +version = "0.7.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 7dbfcc52a..be972cf23 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.7.0" +version = "0.7.1" authors = ["Clément Renault "] edition = "2018" From 28782ff99d73e1f4632063971d8b03899d3f8e99 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 30 Jun 2021 11:22:57 +0200 Subject: [PATCH 0848/1889] Fix ExternalDocumentsIds struct when inserting previously deleted ids --- milli/src/external_documents_ids.rs | 48 +++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 419105bd5..3dce18b00 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use std::convert::TryInto; use std::{fmt, str}; +use fst::map::IndexedValue; use fst::{IntoStreamer, Streamer}; const DELETED_ID: u64 = u64::MAX; @@ -35,7 +36,6 @@ impl<'a> ExternalDocumentsIds<'a> { pub fn get>(&self, external_id: A) -> Option { let external_id = external_id.as_ref(); match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { - // u64 MAX means deleted in the soft fst map Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()), _otherwise => None, } @@ -53,7 +53,8 @@ impl<'a> ExternalDocumentsIds<'a> { // that it must be marked as deleted. new_soft_builder.insert(external_id, DELETED_ID)?; } else { - new_soft_builder.insert(external_id, docids[0].value)?; + let value = docids.iter().find(|v| v.index == 0).unwrap().value; + new_soft_builder.insert(external_id, value)?; } } @@ -69,8 +70,8 @@ impl<'a> ExternalDocumentsIds<'a> { let mut new_soft_builder = fst::MapBuilder::memory(); let mut iter = union_op.into_stream(); - while let Some((external_id, docids)) = iter.next() { - let id = docids.last().unwrap().value; + while let Some((external_id, marked_docids)) = iter.next() { + let id = indexed_last_value(marked_docids).unwrap(); new_soft_builder.insert(external_id, id)?; } @@ -89,7 +90,7 @@ impl<'a> ExternalDocumentsIds<'a> { let union_op = self.hard.op().add(&self.soft).r#union(); let mut iter = union_op.into_stream(); while let Some((external_id, marked_docids)) = iter.next() { - let id = marked_docids.last().unwrap().value; + let id = indexed_last_value(marked_docids).unwrap(); if id != DELETED_ID { let external_id = str::from_utf8(external_id).unwrap(); map.insert(external_id.to_owned(), id.try_into().unwrap()); @@ -105,13 +106,10 @@ impl<'a> ExternalDocumentsIds<'a> { let mut iter = union_op.into_stream(); let mut new_hard_builder = fst::MapBuilder::memory(); - while let Some((external_id, docids)) = iter.next() { - if docids.len() == 2 { - if docids[1].value != DELETED_ID { - new_hard_builder.insert(external_id, docids[1].value)?; - } - } else { - new_hard_builder.insert(external_id, docids[0].value)?; + while let Some((external_id, marked_docids)) = iter.next() { + let value = indexed_last_value(marked_docids).unwrap(); + if value != DELETED_ID { + new_hard_builder.insert(external_id, value)?; } } @@ -140,6 +138,11 @@ impl Default for ExternalDocumentsIds<'static> { } } +/// Returns the value of the `IndexedValue` with the highest _index_. 
+fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> { + indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) +} + #[cfg(test)] mod tests { use super::*; @@ -190,4 +193,25 @@ mod tests { assert_eq!(external_documents_ids.get("g"), Some(7)); assert_eq!(external_documents_ids.get("h"), Some(8)); } + + #[test] + fn strange_delete_insert_ids() { + let mut external_documents_ids = ExternalDocumentsIds::default(); + + let new_ids = + fst::Map::from_iter(vec![("1", 0), ("123", 1), ("30", 2), ("456", 3)]).unwrap(); + external_documents_ids.insert_ids(&new_ids).unwrap(); + assert_eq!(external_documents_ids.get("1"), Some(0)); + assert_eq!(external_documents_ids.get("123"), Some(1)); + assert_eq!(external_documents_ids.get("30"), Some(2)); + assert_eq!(external_documents_ids.get("456"), Some(3)); + + let deleted_ids = fst::Set::from_iter(vec!["30"]).unwrap(); + external_documents_ids.delete_ids(deleted_ids).unwrap(); + assert_eq!(external_documents_ids.get("30"), None); + + let new_ids = fst::Map::from_iter(vec![("30", 2)]).unwrap(); + external_documents_ids.insert_ids(&new_ids).unwrap(); + assert_eq!(external_documents_ids.get("30"), Some(2)); + } } From c92ef54466dd702e421eeac45171b35471e5fbe7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 30 Jun 2021 11:23:29 +0200 Subject: [PATCH 0849/1889] Add a test for when we insert a previously deleted document --- milli/src/update/index_documents/mod.rs | 49 +++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7faa27588..9d88fb5b6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -845,6 +845,7 @@ mod tests { use heed::EnvOpenOptions; use super::*; + use crate::update::DeleteDocuments; #[test] fn simple_document_replacement() { @@ -1303,4 +1304,52 @@ mod tests { builder.execute(Cursor::new(documents), |_, _| ()).unwrap(); wtxn.commit().unwrap(); } + + #[test] + fn delete_documents_then_insert() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = &br#"[ + { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, + { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, + { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, + { "objectId": 30, "title": "Hamlet" } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); + + // Delete some of the documents, but not all of them. 
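Before the rest of this test below, it may help to see the soft/hard fst bookkeeping it exercises, reduced to a minimal sketch. It uses only the fst APIs already visible in patches 0846 and 0848 (from_iter, op().add().r#union(), streaming); the keys and ids are toy values:

    use fst::{IntoStreamer, Streamer};

    const DELETED_ID: u64 = u64::MAX;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // the hard map knows "30" => 2; the soft map tombstones it with u64::MAX
        let hard = fst::Map::from_iter(vec![("30", 2u64), ("456", 3)])?;
        let soft = fst::Map::from_iter(vec![("30", DELETED_ID)])?;

        let union_op = hard.op().add(&soft).r#union();
        let mut iter = union_op.into_stream();
        while let Some((external_id, marked_docids)) = iter.next() {
            // the highest index belongs to the most recently added map (the soft
            // one), so its value wins: the same rule as indexed_last_value above
            let id =
                marked_docids.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value).unwrap();
            if id != DELETED_ID {
                println!("{} => {}", std::str::from_utf8(external_id)?, id);
            }
        }
        // only "456 => 3" is printed: "30" stays masked by its tombstone until a
        // later insert_ids overwrites the soft entry, which is what this test checks
        Ok(())
    }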
+ let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + builder.delete_external_id("30"); + builder.execute().unwrap(); + + let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); + assert!(external_documents_ids.get("30").is_none()); + + let content = &br#"[ + { "objectId": 30, "title": "Hamlet" } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + + let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); + assert!(external_documents_ids.get("30").is_some()); + + let content = &br#"[ + { "objectId": 30, "title": "Hamlet" } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + } } From 32b7bd366f87f64932671d86701244a2a8f67db8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 30 Jun 2021 14:12:56 +0200 Subject: [PATCH 0850/1889] Remove the roaring operation functions warnings --- milli/src/search/criteria/mod.rs | 14 +++++------ milli/src/search/criteria/proximity.rs | 12 +++++----- milli/src/search/criteria/typo.rs | 6 ++--- milli/src/search/distinct/facet_distinct.rs | 6 ++--- milli/src/search/facet/facet_distribution.rs | 2 +- milli/src/search/facet/filter_condition.rs | 2 +- milli/src/search/facet/mod.rs | 4 ++-- milli/src/search/mod.rs | 2 +- milli/src/update/available_documents_ids.rs | 2 +- milli/src/update/delete_documents.rs | 24 +++++++++---------- milli/src/update/facets.rs | 2 +- .../update/index_documents/merge_function.rs | 6 ++--- milli/src/update/index_documents/mod.rs | 4 ++-- milli/src/update/words_level_positions.rs | 2 +- 14 files changed, 43 insertions(+), 45 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 228d48bd7..2ba3b388f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -328,7 +328,7 @@ pub fn resolve_query_tree<'t>( candidates = docids; first_loop = false; } else { - candidates.intersect_with(&docids); + candidates &= &docids; } } Ok(candidates) @@ -358,7 +358,7 @@ pub fn resolve_query_tree<'t>( let mut candidates = RoaringBitmap::new(); for op in ops { let docids = resolve_operation(ctx, op, wdcache)?; - candidates.union_with(&docids); + candidates |= docids; } Ok(candidates) } @@ -381,7 +381,7 @@ fn all_word_pair_proximity_docids, U: AsRef>( let current_docids = ctx .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? 
.unwrap_or_default(); - docids.union_with(¤t_docids); + docids |= current_docids; } } Ok(docids) @@ -401,7 +401,7 @@ fn query_docids( let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - docids.union_with(¤t_docids); + docids |= current_docids; } Ok(docids) } else { @@ -413,7 +413,7 @@ fn query_docids( let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - docids.union_with(¤t_docids); + docids |= current_docids; } Ok(docids) } @@ -430,7 +430,7 @@ fn query_pair_proximity_docids( if proximity >= 8 { let mut candidates = query_docids(ctx, left, wdcache)?; let right_candidates = query_docids(ctx, right, wdcache)?; - candidates.intersect_with(&right_candidates); + candidates &= right_candidates; return Ok(candidates); } @@ -463,7 +463,7 @@ fn query_pair_proximity_docids( proximity, )? .unwrap_or_default(); - docids.union_with(¤t_docids); + docids |= current_docids; } Ok(docids) } else if prefix { diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 3e8196e93..f884de160 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -274,11 +274,11 @@ fn resolve_candidates<'t>( let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; if lcandidates.len() < rcandidates.len() { - candidates.intersect_with(lcandidates); - candidates.intersect_with(rcandidates); + candidates &= lcandidates; + candidates &= rcandidates; } else { - candidates.intersect_with(rcandidates); - candidates.intersect_with(lcandidates); + candidates &= rcandidates; + candidates &= lcandidates; } if !candidates.is_empty() { output.push((ll.clone(), rr.clone(), candidates)); @@ -317,7 +317,7 @@ fn resolve_candidates<'t>( for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache, wdcache)? { - candidates.intersect_with(&head_candidates); + candidates &= &head_candidates; if !candidates.is_empty() { output.push((lhead.clone(), rtail, candidates)); } @@ -334,7 +334,7 @@ fn resolve_candidates<'t>( let mut candidates = RoaringBitmap::new(); for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? 
{ - candidates.union_with(&cds); + candidates |= cds; } Ok(candidates) } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index f4ae15f0a..97a9b4e4b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -281,7 +281,7 @@ fn resolve_candidates<'t>( let mut candidates = RoaringBitmap::new(); for op in ops { let docids = resolve_operation(ctx, op, number_typos, cache, wdcache)?; - candidates.union_with(&docids); + candidates |= docids; } Ok(candidates) } @@ -329,8 +329,8 @@ fn resolve_candidates<'t>( }; if !head_candidates.is_empty() { let tail_candidates = mdfs(ctx, tail, mana - m, cache, wdcache)?; - head_candidates.intersect_with(&tail_candidates); - candidates.union_with(&head_candidates); + head_candidates &= tail_candidates; + candidates |= head_candidates; } } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 290a7602f..91620da2a 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -61,7 +61,7 @@ impl<'a> FacetDistinctIter<'a> { db_name: db_name::FACET_ID_STRING_DOCIDS, key: None, })?; - self.excluded.union_with(&facet_docids); + self.excluded |= facet_docids; } self.excluded.remove(id); @@ -79,7 +79,7 @@ impl<'a> FacetDistinctIter<'a> { db_name: db_name::FACET_ID_F64_DOCIDS, key: None, })?; - self.excluded.union_with(&facet_docids); + self.excluded |= facet_docids; } self.excluded.remove(id); @@ -92,7 +92,7 @@ impl<'a> FacetDistinctIter<'a> { /// handling easier. fn next_inner(&mut self) -> Result> { // The first step is to remove all the excluded documents from our candidates - self.candidates.difference_with(&self.excluded); + self.candidates -= &self.excluded; let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); match candidates_iter.next() { diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 71816cf5d..3f55006f2 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -122,7 +122,7 @@ impl<'a> FacetDistribution<'a> { for result in iter { let (value, mut docids) = result?; - docids.intersect_with(candidates); + docids &= candidates; if !docids.is_empty() { distribution.insert(value.to_string(), docids.len()); } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 31fc6018c..1b1eafcab 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -289,7 +289,7 @@ impl FilterCondition { for (i, result) in iter.enumerate() { let ((_fid, level, l, r), docids) = result?; debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - output.union_with(&docids); + *output |= docids; // We save the leftest and rightest bounds we actually found at this level. 
if i == 0 { left_found = Some(l); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 240d99ccc..4e900bff4 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -213,10 +213,10 @@ impl<'t> Iterator for FacetIter<'t> { match result { Ok(((_fid, level, left, right), mut docids)) => { - docids.intersect_with(&documents_ids); + docids &= &*documents_ids; if !docids.is_empty() { if self.must_reduce { - documents_ids.difference_with(&docids); + *documents_ids -= &docids; } if level == 0 { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 71d200e0c..f40a6aed6 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -173,7 +173,7 @@ impl<'a> Search<'a> { let mut candidates = distinct.distinct(candidates, excluded); - initial_candidates.union_with(&bucket_candidates); + initial_candidates |= bucket_candidates; if offset != 0 { let discarded = candidates.by_ref().take(offset).count(); diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 9e3fce75d..653bc7dd2 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -12,7 +12,7 @@ impl AvailableDocumentsIds { match docids.max() { Some(last_id) => { let mut available = RoaringBitmap::from_iter(0..last_id); - available.difference_with(&docids); + available -= docids; let iter = match last_id.checked_add(1) { Some(id) => id..=u32::max_value(), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index a0c1f48f5..313f8a909 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -43,7 +43,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } pub fn delete_documents(&mut self, docids: &RoaringBitmap) { - self.documents_ids.union_with(docids); + self.documents_ids |= docids; } pub fn delete_external_id(&mut self, external_id: &str) -> Option { @@ -65,7 +65,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We remove the documents ids that we want to delete // from the documents in the database and write them back. let current_documents_ids_len = documents_ids.len(); - documents_ids.difference_with(&self.documents_ids); + documents_ids -= &self.documents_ids; self.index.put_documents_ids(self.wtxn, &documents_ids)?; // We can execute a ClearDocuments operation when the number of documents @@ -194,7 +194,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if let Some((key, mut docids)) = iter.next().transpose()? { if key == word.as_ref() { let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -245,7 +245,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let (prefix, mut docids) = result?; let prefix = prefix.to_owned(); let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -285,7 +285,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (key, mut docids) = result?; let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. 
unsafe { iter.del_current()? }; @@ -306,7 +306,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -325,7 +325,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -344,7 +344,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -361,7 +361,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; while let Some((key, mut docids)) = iter.next().transpose()? { let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -390,7 +390,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { for field_id in self.index.faceted_fields_ids(self.wtxn)? { // Remove docids from the number faceted documents ids let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; remove_docids_from_field_id_docid_facet_value( @@ -403,7 +403,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // Remove docids from the string faceted documents ids let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; - docids.difference_with(&self.documents_ids); + docids -= &self.documents_ids; self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; remove_docids_from_field_id_docid_facet_value( @@ -456,7 +456,7 @@ where while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids.difference_with(to_remove); + docids -= to_remove; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 09f962bbc..0e2cad69d 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -181,7 +181,7 @@ fn compute_facet_number_levels<'t>( } // The right bound is always the bound we run through. 
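The remaining hunks below continue the same mechanical substitution, so for reference: the *_with methods and the operator-assign forms are equivalent, and the operators are provided by the roaring crate itself. A small self-contained sketch with toy values:

    use roaring::RoaringBitmap;

    fn main() {
        let mut a: RoaringBitmap = (0..10_u32).collect();
        let b: RoaringBitmap = (5..15_u32).collect();

        a &= &b; // was a.intersect_with(&b); a is now {5, 6, 7, 8, 9}
        assert_eq!(a.len(), 5);

        a |= &b; // was a.union_with(&b); a is now {5, ..., 14}
        assert_eq!(a.len(), 10);

        a -= &b; // was a.difference_with(&b); a is now empty
        assert!(a.is_empty());
    }

The operator impls accept both owned bitmaps and references, which is why the patch can write candidates &= rcandidates in one place and docids -= &self.documents_ids in another.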
- group_docids.union_with(&docids); + group_docids |= docids; right = value; } diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 3d9ffda6a..17283b232 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -61,8 +61,7 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result let mut head = RoaringBitmap::deserialize_from(&head[..])?; for value in tail { - let bitmap = RoaringBitmap::deserialize_from(&value[..])?; - head.union_with(&bitmap); + head |= RoaringBitmap::deserialize_from(&value[..])?; } let mut vec = Vec::with_capacity(head.serialized_size()); @@ -75,8 +74,7 @@ pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; // We merge the new documents ids with the existing ones. - documents_ids.union_with(&new_documents_ids); - documents_ids.union_with(&replaced_documents_ids); + documents_ids |= new_documents_ids; + documents_ids |= replaced_documents_ids; self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index d43cd19b8..c656d7105 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -236,7 +236,7 @@ fn compute_positions_levels( } // The right bound is always the bound we run through. - group_docids.union_with(&docids); + group_docids |= docids; } if !group_docids.is_empty() { From 3c149d8a437f8bc92190717185faa6bd68ca7c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 30 Jun 2021 18:41:35 +0200 Subject: [PATCH 0851/1889] Update tokenizer version to v0.2.3 --- Cargo.lock | 20 ++------------------ milli/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 055243c32..839dfd29e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1018,7 +1018,7 @@ dependencies = [ "jemallocator", "log", "maplit", - "meilisearch-tokenizer 0.2.3", + "meilisearch-tokenizer", "memmap", "milli", "once_cell", @@ -1379,22 +1379,6 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" -[[package]] -name = "meilisearch-tokenizer" -version = "0.2.2" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.2#eda4ed4968c8ac973cf1707ef89bd7012bb2722f" -dependencies = [ - "character_converter", - "cow-utils", - "deunicode", - "fst", - "jieba-rs", - "once_cell", - "slice-group-by", - "unicode-segmentation", - "whatlang", -] - [[package]] name = "meilisearch-tokenizer" version = "0.2.3" @@ -1458,7 +1442,7 @@ dependencies = [ "log", "logging_timer", "maplit", - "meilisearch-tokenizer 0.2.2", + "meilisearch-tokenizer", "memmap", "obkv", "once_cell", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3b025a805..b0af95557 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,7 +18,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0", default-fe human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.2" } +meilisearch-tokenizer = { git = 
"https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } memmap = "0.7.0" obkv = "0.1.1" once_cell = "1.5.2" From fc09d77e892c082b79340c7e477d78d1108c8a06 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 1 Jul 2021 11:38:30 +0200 Subject: [PATCH 0852/1889] fix the benchmarks dependcies --- Cargo.lock | 70 +++++++------------------------------------ benchmarks/Cargo.toml | 2 +- 2 files changed, 11 insertions(+), 61 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 839dfd29e..d2b7b9a5d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -131,7 +131,7 @@ dependencies = [ "convert_case", "criterion", "flate2", - "heed 0.11.0", + "heed", "jemallocator", "milli", "reqwest", @@ -870,35 +870,16 @@ dependencies = [ "unicode-segmentation", ] -[[package]] -name = "heed" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "269c7486ed6def5d7b59a427cec3e87b4d4dd4381d01e21c8c9f2d3985688392" -dependencies = [ - "bytemuck", - "byteorder", - "heed-traits 0.8.0", - "heed-types 0.8.0", - "libc", - "lmdb-rkv-sys 0.11.0", - "once_cell", - "page_size", - "serde", - "synchronoise", - "url", -] - [[package]] name = "heed" version = "0.12.0" source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" dependencies = [ "byteorder", - "heed-traits 0.7.0", - "heed-types 0.7.2", + "heed-traits", + "heed-types", "libc", - "lmdb-rkv-sys 0.15.0", + "lmdb-rkv-sys", "once_cell", "page_size", "serde", @@ -912,45 +893,25 @@ name = "heed-traits" version = "0.7.0" source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" -[[package]] -name = "heed-traits" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a53a94e5b2fd60417e83ffdfe136c39afacff0d4ac1d8d01cd66928ac610e1a2" - [[package]] name = "heed-types" version = "0.7.2" source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" dependencies = [ "bincode", - "heed-traits 0.7.0", + "heed-traits", "serde", "serde_json", "zerocopy", ] -[[package]] -name = "heed-types" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a6cf0a6952fcedc992602d5cddd1e3fff091fbe87d38636e3ec23a31f32acbd" -dependencies = [ - "bincode", - "bytemuck", - "byteorder", - "heed-traits 0.8.0", - "serde", - "serde_json", -] - [[package]] name = "helpers" version = "0.7.1" dependencies = [ "anyhow", "byte-unit", - "heed 0.12.0", + "heed", "jemallocator", "milli", "stderrlog", @@ -1014,7 +975,7 @@ dependencies = [ "funty", "futures", "grenad", - "heed 0.12.0", + "heed", "jemallocator", "log", "maplit", @@ -1154,7 +1115,7 @@ dependencies = [ "anyhow", "byte-unit", "csv", - "heed 0.12.0", + "heed", "jemallocator", "milli", "roaring", @@ -1315,17 +1276,6 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" -[[package]] -name = "lmdb-rkv-sys" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b27470ac25167b3afdfb6af8fcd3bc1be67de50ffbdaf4073378cfded6ae24a5" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "lmdb-rkv-sys" version = "0.15.0" @@ -1434,7 +1384,7 @@ dependencies = [ "fst", "fxhash", "grenad", - "heed 0.12.0", + "heed", "human_format", "itertools 0.10.1", "levenshtein_automata", @@ -2278,7 +2228,7 @@ version = "0.7.1" dependencies = [ "anyhow", 
"byte-unit", - "heed 0.12.0", + "heed", "jemallocator", "log", "milli", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 6d5c99950..d47becca6 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -11,7 +11,7 @@ milli = { path = "../milli" } jemallocator = "0.3.2" [dev-dependencies] -heed = "*" # we want to use the version milli uses +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0", default-features = false, features = ["lmdb", "sync-read-txn"] } criterion = { version = "0.3.4", features = ["html_reports"] } [build-dependencies] From ef965aa3f30781369cf9545770ce65cc33ca92ed Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 1 Jul 2021 11:43:09 +0200 Subject: [PATCH 0853/1889] fix the fmt of the auto-generated file --- benchmarks/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/build.rs b/benchmarks/build.rs index b1edd5499..827c2c2a3 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -22,7 +22,7 @@ fn main() -> anyhow::Result<()> { let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches"); let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?; - writeln!( + write!( manifest_paths_file, r#"//! This file is generated by the build script. //! Do not modify by hand, use the build.rs file. From ec87bf3dd5dc58f2a0d474972dae4f633ba4af05 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 1 Jul 2021 11:45:05 +0200 Subject: [PATCH 0854/1889] Update benchmarks/Cargo.toml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- benchmarks/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index d47becca6..4e0b0d00c 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -11,7 +11,7 @@ milli = { path = "../milli" } jemallocator = "0.3.2" [dev-dependencies] -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } criterion = { version = "0.3.4", features = ["html_reports"] } [build-dependencies] From 9f62149b94a307de19510abd77e302aee6f9b149 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 1 Jul 2021 19:03:28 +0200 Subject: [PATCH 0855/1889] Fix matching lenghth in matching_words --- milli/src/search/matching_words.rs | 36 +++++++++++++----------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 291378b43..37754a782 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -108,7 +108,8 @@ impl IndexMut<(usize, usize)> for N2Array { /// The algorithm is a modified /// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) fn bytes_to_highlight(source: &str, target: &str) -> usize { - let (n, m) = (source.chars().count(), target.chars().count()); + let n = source.chars().count(); + let m = target.chars().count(); if n == 0 { return 0; @@ -125,11 +126,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize { let mut matrix = N2Array::new(n + 2, m + 2, 0); matrix[(0, 0)] = inf; - for i in 0..n + 1 { + for i in 0..=n { matrix[(i + 1, 0)] = inf; matrix[(i + 1, 1)] = i; } - for j in 0..m + 1 { + for j in 0..=m { matrix[(0, j + 1)] = inf; matrix[(1, j + 1)] = j; } @@ -163,16 +164,16 @@ fn 
bytes_to_highlight(source: &str, target: &str) -> usize { last_row.insert(char_s, row); } - let mut minimum = 2; - for x in 0..=n { - let min_dist = (0..=m).map(|y| matrix[(x + 1, y + 1)]).min().unwrap(); - if min_dist <= 2 { - minimum = x; + let mut minimum = (u32::max_value(), 0); + for x in 0..=m { + let dist = matrix[(n + 1, x + 1)] as u32; + if dist < minimum.0 { + minimum = (dist, x); } } // everything was done character-wise and now we want to return a number of bytes - source.chars().take(minimum).map(|c| c.len_utf8()).sum() + source.chars().take(minimum.1).map(|c| c.len_utf8()).sum() } #[cfg(test)] @@ -208,7 +209,7 @@ mod tests { TestBytesToHighlight { query: "Levenstein", text: "Levenshte", - length: "Levenstei".len(), + length: "Levenste".len(), }, // we get to the end of our word with only two typos at the beginning TestBytesToHighlight { @@ -216,13 +217,8 @@ mod tests { text: "Levenshtein", length: "Bavenshtein".len(), }, - // Since we calculate a distance char by char we are supposed to have only two mistakes - // here. That wouldn't have been the case if we were computing the distance byte by byte - TestBytesToHighlight { query: "Båve", text: "Chiøt", length: "Bå".len() }, - TestBytesToHighlight { query: "💪🙂🍤", text: "plouf", length: "💪🙂".len() }, - TestBytesToHighlight { query: "clôu¿i", text: "bloubi", length: "clôu".len() }, TestBytesToHighlight { - query: "Альфа", text: "Альфой", length: "Альфа".len() + query: "Альфа", text: "Альфой", length: "Альф".len() }, TestBytesToHighlight { query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() @@ -240,7 +236,7 @@ mod tests { ]; for test in &tests { - let length = bytes_to_highlight(test.query, test.text); + let length = bytes_to_highlight(test.text, test.query); assert_eq!(length, test.length, r#"length between: "{}" "{}""#, test.query, test.text); assert!( from_utf8(&test.query.as_bytes()[..length]).is_ok(), @@ -273,12 +269,12 @@ mod tests { let matching_words = MatchingWords::from_query_tree(&query_tree); - assert_eq!(matching_words.matching_bytes("word"), Some(4)); + assert_eq!(matching_words.matching_bytes("word"), Some(3)); assert_eq!(matching_words.matching_bytes("nyc"), None); assert_eq!(matching_words.matching_bytes("world"), Some(5)); - assert_eq!(matching_words.matching_bytes("splitted"), Some(7)); + assert_eq!(matching_words.matching_bytes("splitted"), Some(5)); assert_eq!(matching_words.matching_bytes("thisnew"), None); assert_eq!(matching_words.matching_bytes("borld"), Some(5)); - assert_eq!(matching_words.matching_bytes("wordsplit"), Some(5)); + assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4)); } } From a6b4069172ffd232c4339e35ee1ca974b797ad46 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 5 Jul 2021 10:54:53 +0200 Subject: [PATCH 0856/1889] Bump to v0.7.2 --- Cargo.lock | 10 +++++----- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d2b7b9a5d..4aba285b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -907,7 +907,7 @@ dependencies = [ [[package]] name = "helpers" -version = "0.7.1" +version = "0.7.2" dependencies = [ "anyhow", "byte-unit", @@ -961,7 +961,7 @@ dependencies = [ [[package]] name = "http-ui" -version = "0.7.1" +version = "0.7.2" dependencies = [ "anyhow", "askama", @@ -1110,7 +1110,7 @@ dependencies = [ [[package]] name = "infos" -version = "0.7.1" +version = "0.7.2" dependencies = [ "anyhow", "byte-unit", @@ -1372,7 
+1372,7 @@ dependencies = [ [[package]] name = "milli" -version = "0.7.1" +version = "0.7.2" dependencies = [ "big_s", "bstr", @@ -2224,7 +2224,7 @@ dependencies = [ [[package]] name = "search" -version = "0.7.1" +version = "0.7.2" dependencies = [ "anyhow", "byte-unit", diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 6997d0d3a..af56b5eb7 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.7.1" +version = "0.7.2" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index f09406cce..208917e54 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.7.1" +version = "0.7.2" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 9a9d39744..3377b141d 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.7.1" +version = "0.7.2" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b0af95557..83f956b62 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.7.1" +version = "0.7.2" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index be972cf23..223516721 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.7.1" +version = "0.7.2" authors = ["Clément Renault "] edition = "2018" From 91c5d0c042d21208dce81d5715a237a61827a3a6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 5 Jul 2021 16:36:13 +0200 Subject: [PATCH 0857/1889] Use the AlwaysFreePages flag when opening an index --- Cargo.lock | 6 +++--- benchmarks/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- milli/src/index.rs | 2 ++ search/Cargo.toml | 2 +- 8 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4aba285b7..050ba7c88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -873,7 +873,7 @@ dependencies = [ [[package]] name = "heed" version = "0.12.0" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#8e5dc6d71c8166a8d7d0db059e6e51478942b551" dependencies = [ "byteorder", "heed-traits", @@ -891,12 +891,12 @@ dependencies = [ [[package]] name = "heed-traits" version = "0.7.0" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#8e5dc6d71c8166a8d7d0db059e6e51478942b551" [[package]] name = "heed-types" version = "0.7.2" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.0#6c0b95793a805dc598f05c119494e6c069de0326" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#8e5dc6d71c8166a8d7d0db059e6e51478942b551" dependencies = [ "bincode", "heed-traits", diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 4e0b0d00c..b5ba9bf4f 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -11,7 +11,7 @@ milli = { path = "../milli" } jemallocator = "0.3.2" [dev-dependencies] -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } criterion = { version = "0.3.4", 
features = ["html_reports"] } [build-dependencies] diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index af56b5eb7..6fad00a22 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [dependencies] anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } milli = { path = "../milli" } stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 208917e54..9f425af3f 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } memmap = "0.7.0" milli = { path = "../milli" } diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 3377b141d..b257e6010 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } csv = "1.1.5" -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } milli = { path = "../milli" } roaring = "0.6.6" serde_json = "1.0.62" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 83f956b62..dfa02f89d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -14,7 +14,7 @@ flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" diff --git a/milli/src/index.rs b/milli/src/index.rs index 6bcb0aebd..247e67d52 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -3,6 +3,7 @@ use std::collections::{HashMap, HashSet}; use std::path::Path; use chrono::{DateTime, Utc}; +use heed::flags::Flags; use heed::types::*; use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -106,6 +107,7 @@ impl Index { use db_name::*; options.max_dbs(14); + unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; let main = env.create_poly_database(Some(MAIN))?; diff --git a/search/Cargo.toml b/search/Cargo.toml index 223516721..83722c516 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [dependencies] anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.0" } +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } log = "0.4.14" milli = { path = "../milli" } serde_json = "1.0.62" From a57e522a6749658d1268beca4019640d4dbcdda5 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 5 
Jul 2021 17:31:41 +0200 Subject: [PATCH 0858/1889] introduce a die route let the program exit itself alone --- http-ui/src/main.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index d0fc29573..f9f3a3c52 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -982,6 +982,11 @@ async fn main() -> anyhow::Result<()> { }) }); + let die_route = warp::filters::method::get().and(warp::path!("die")).map(move || { + std::process::exit(0); + warp::reply() + }); + let routes = dash_html_route .or(updates_list_or_html_route) .or(dash_bulma_route) @@ -1001,7 +1006,8 @@ async fn main() -> anyhow::Result<()> { .or(clearing_route) .or(change_settings_route) .or(change_facet_levels_route) - .or(update_ws_route); + .or(update_ws_route) + .or(die_route); let addr = SocketAddr::from_str(&opt.http_listen_addr)?; warp::serve(routes).run(addr).await; From 4562b278a8fa3d10601a541f25ce0eaa3f0b1997 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 5 Jul 2021 17:43:28 +0200 Subject: [PATCH 0859/1889] remove a warning and add a log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- http-ui/src/main.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index f9f3a3c52..ee32882c0 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -983,7 +983,9 @@ async fn main() -> anyhow::Result<()> { }); let die_route = warp::filters::method::get().and(warp::path!("die")).map(move || { + eprintln!("Killed by an HTTP request received on the die route"); std::process::exit(0); + #[allow(unreachable_code)] warp::reply() }); From 838ed1cd32379959ca5cb1f66384b22dfd0f769b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 6 Jul 2021 11:31:24 +0200 Subject: [PATCH 0860/1889] Use an u16 field id instead of one byte --- Cargo.lock | 16 ++++++++++-- milli/Cargo.toml | 3 ++- .../facet/facet_level_value_f64_codec.rs | 11 ++++---- .../facet/facet_value_string_codec.rs | 11 ++++---- .../facet/field_doc_id_facet_f64_codec.rs | 15 +++++------ .../facet/field_doc_id_facet_string_codec.rs | 18 +++++++------ .../heed_codec/field_id_word_count_codec.rs | 12 ++++++--- milli/src/heed_codec/obkv_codec.rs | 8 +++--- milli/src/index.rs | 22 +++++++++------- milli/src/lib.rs | 25 +++++++++++++++++-- milli/src/search/facet/facet_distribution.rs | 4 +-- milli/src/search/facet/mod.rs | 2 +- milli/src/update/delete_documents.rs | 6 +++-- milli/src/update/facets.rs | 14 +++++------ .../update/index_documents/merge_function.rs | 2 +- milli/src/update/index_documents/store.rs | 4 ++- milli/src/update/index_documents/transform.rs | 6 ++--- 17 files changed, 115 insertions(+), 64 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 050ba7c88..18d42029f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -341,6 +341,17 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "concat-arrays" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747" +dependencies = [ + "proc-macro2 1.0.27", + "quote 1.0.9", + "syn 1.0.73", +] + [[package]] name = "convert_case" version = "0.4.0" @@ -1378,6 +1389,7 @@ dependencies = [ "bstr", "byteorder", "chrono", + "concat-arrays", "csv", "either", "flate2", @@ -1609,9 +1621,9 @@ dependencies = [ [[package]] name = "obkv" -version = "0.1.1" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8" +checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385" [[package]] name = "once_cell" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index dfa02f89d..6af928041 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" bstr = "0.2.15" byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } +concat-arrays = "0.1.2" csv = "1.1.5" either = "1.6.1" flate2 = "1.0.20" @@ -20,7 +21,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } memmap = "0.7.0" -obkv = "0.1.1" +obkv = "0.2.0" once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs index b23dcb269..1e66427ca 100644 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::convert::TryInto; use crate::facet::value_encoding::f64_into_bytes; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; // TODO do not de/serialize right bound when level = 0 pub struct FacetLevelValueF64Codec; @@ -11,7 +11,8 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { type DItem = (FieldId, u8, f64, f64); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); let (level, bytes) = bytes.split_first()?; let (left, right) = if *level != 0 { @@ -23,7 +24,7 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { (left, left) }; - Some((*field_id, *level, left, right)) + Some((field_id, *level, left, right)) } } @@ -61,8 +62,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { 16 // length }; - let mut bytes = Vec::with_capacity(len + 2); - bytes.push(*field_id); + let mut bytes = Vec::with_capacity(len + 3); + bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.push(*level); bytes.extend_from_slice(&buffer[..len]); Some(Cow::Owned(bytes)) diff --git a/milli/src/heed_codec/facet/facet_value_string_codec.rs b/milli/src/heed_codec/facet/facet_value_string_codec.rs index 259dab972..54abb7886 100644 --- a/milli/src/heed_codec/facet/facet_value_string_codec.rs +++ b/milli/src/heed_codec/facet/facet_value_string_codec.rs @@ -1,14 +1,14 @@ use std::borrow::Cow; use std::str; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; pub struct FacetValueStringCodec; impl FacetValueStringCodec { pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 1); - out.push(field_id); + out.reserve(value.len() + 2); + out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(value.as_bytes()); } } @@ -17,9 +17,10 @@ impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { type DItem = (FieldId, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); let value = str::from_utf8(bytes).ok()?; - Some((*field_id, value)) + Some((field_id, value)) } } diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs 
b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs index b3c0fa381..22159601c 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::convert::TryInto; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId}; +use crate::{try_split_array_at, DocumentId, FieldId}; pub struct FieldDocIdFacetF64Codec; @@ -10,14 +10,15 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { type DItem = (FieldId, DocumentId, f64); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); - let (document_id_bytes, bytes) = bytes.split_at(4); - let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + let (document_id_bytes, bytes) = try_split_array_at(bytes)?; + let document_id = u32::from_be_bytes(document_id_bytes); let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; - Some((*field_id, document_id, value)) + Some((field_id, document_id, value)) } } @@ -25,8 +26,8 @@ impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec { type EItem = (FieldId, DocumentId, f64); fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8); - bytes.push(*field_id); + let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8); + bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.extend_from_slice(&document_id.to_be_bytes()); let value_bytes = f64_into_bytes(*value)?; bytes.extend_from_slice(&value_bytes); diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index fd3f1143d..36408f578 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -1,8 +1,7 @@ use std::borrow::Cow; -use std::convert::TryInto; use std::str; -use crate::{DocumentId, FieldId}; +use crate::{try_split_array_at, DocumentId, FieldId}; pub struct FieldDocIdFacetStringCodec; @@ -13,8 +12,8 @@ impl FieldDocIdFacetStringCodec { value: &str, out: &mut Vec, ) { - out.reserve(1 + 4 + value.len()); - out.push(field_id); + out.reserve(2 + 4 + value.len()); + out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes()); out.extend_from_slice(value.as_bytes()); } @@ -24,11 +23,14 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { type DItem = (FieldId, DocumentId, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; - let (document_id_bytes, bytes) = bytes.split_at(4); - let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + + let (document_id_bytes, bytes) = try_split_array_at(bytes)?; + let document_id = u32::from_be_bytes(document_id_bytes); + let value = str::from_utf8(bytes).ok()?; - Some((*field_id, document_id, value)) + Some((field_id, document_id, value)) } } diff --git a/milli/src/heed_codec/field_id_word_count_codec.rs b/milli/src/heed_codec/field_id_word_count_codec.rs index 64f0e1db6..aca7a80c4 100644 --- a/milli/src/heed_codec/field_id_word_count_codec.rs +++ b/milli/src/heed_codec/field_id_word_count_codec.rs @@ -1,7 +1,6 @@ use 
std::borrow::Cow; -use std::convert::TryInto; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; pub struct FieldIdWordCountCodec; @@ -9,7 +8,9 @@ impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec { type DItem = (FieldId, u8); fn bytes_decode(bytes: &'a [u8]) -> Option { - let [field_id, word_count]: [u8; 2] = bytes.try_into().ok()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + let ([word_count], _nothing) = try_split_array_at(bytes)?; Some((field_id, word_count)) } } @@ -18,6 +19,9 @@ impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec { type EItem = (FieldId, u8); fn bytes_encode((field_id, word_count): &Self::EItem) -> Option> { - Some(Cow::Owned(vec![*field_id, *word_count])) + let mut bytes = Vec::with_capacity(2 + 1); + bytes.extend_from_slice(&field_id.to_be_bytes()); + bytes.push(*word_count); + Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/obkv_codec.rs b/milli/src/heed_codec/obkv_codec.rs index b7414b693..6dad771a8 100644 --- a/milli/src/heed_codec/obkv_codec.rs +++ b/milli/src/heed_codec/obkv_codec.rs @@ -1,19 +1,19 @@ use std::borrow::Cow; -use obkv::{KvReader, KvWriter}; +use obkv::{KvReaderU16, KvWriterU16}; pub struct ObkvCodec; impl<'a> heed::BytesDecode<'a> for ObkvCodec { - type DItem = KvReader<'a>; + type DItem = KvReaderU16<'a>; fn bytes_decode(bytes: &'a [u8]) -> Option { - Some(KvReader::new(bytes)) + Some(KvReaderU16::new(bytes)) } } impl heed::BytesEncode<'_> for ObkvCodec { - type EItem = KvWriter>; + type EItem = KvWriterU16>; fn bytes_encode(item: &Self::EItem) -> Option> { item.clone().into_inner().map(Cow::Owned).ok() diff --git a/milli/src/index.rs b/milli/src/index.rs index 247e67d52..099a5891d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -523,10 +523,11 @@ impl Index { field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - *buffer.last_mut().unwrap() = field_id; + buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + .copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -536,10 +537,11 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - *buffer.last_mut().unwrap() = field_id; + buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + .copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? 
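Not visible in this diff but implied by it: `obkv` 0.2 keys a document's fields by `u16`, which is the other half of the field id widening. A hedged round-trip sketch; the `insert` and `get` names are assumptions based on how milli uses the crate, only `KvReaderU16::new` and `into_inner` appear in the diff above:

```rust
// Sketch only: assumes obkv = "0.2".
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let mut writer = obkv::KvWriterU16::new(Vec::new());
    writer.insert(300u16, b"a value")?; // a field id that does not fit in a u8
    let bytes = writer.into_inner()?;

    let reader = obkv::KvReaderU16::new(&bytes);
    assert_eq!(reader.get(300), Some(&b"a value"[..]));
    Ok(())
}
```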
{ Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), @@ -553,10 +555,11 @@ impl Index { field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - *buffer.last_mut().unwrap() = field_id; + buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + .copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -569,7 +572,8 @@ impl Index { let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - *buffer.last_mut().unwrap() = field_id; + buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + .copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), @@ -723,7 +727,7 @@ impl Index { &self, rtxn: &'t RoTxn, ids: impl IntoIterator, - ) -> Result)>> { + ) -> Result)>> { let mut documents = Vec::new(); for id in ids { @@ -741,7 +745,7 @@ impl Index { pub fn all_documents<'t>( &self, rtxn: &'t RoTxn, - ) -> Result)>>> { + ) -> Result)>>> { Ok(self .documents .iter(rtxn)? diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ec9bc32c6..f3bababf6 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -15,6 +15,7 @@ pub mod update; use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; +use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; use std::result::Result as StdResult; @@ -48,7 +49,7 @@ pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type Attribute = u32; pub type DocumentId = u32; -pub type FieldId = u8; +pub type FieldId = u16; pub type Position = u32; pub type FieldDistribution = BTreeMap; @@ -58,7 +59,7 @@ type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, - obkv: obkv::KvReader, + obkv: obkv::KvReaderU16, ) -> Result> { displayed_fields .iter() @@ -123,6 +124,26 @@ pub fn json_to_string(value: &Value) -> Option { } } +/// Divides one slice into two at an index, returns `None` if mid is out of bounds. +fn try_split_at(slice: &[T], mid: usize) -> Option<(&[T], &[T])> { + if mid <= slice.len() { + Some(slice.split_at(mid)) + } else { + None + } +} + +/// Divides one slice into an array and the tail at an index, +/// returns `None` if `N` is out of bounds. 
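+///
+/// A usage sketch (an illustration, not part of the original patch):
+///
+/// ```ignore
+/// // Split off a big-endian u16 field id, like the codecs above do.
+/// let bytes = [0x01, 0x02, 0xff];
+/// let (field_id_bytes, tail) = try_split_array_at::<u8, 2>(&bytes).unwrap();
+/// assert_eq!(u16::from_be_bytes(field_id_bytes), 0x0102);
+/// assert_eq!(tail, &[0xff]);
+/// assert!(try_split_array_at::<u8, 4>(tail).is_none()); // N out of bounds
+/// ```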
+fn try_split_array_at(slice: &[T]) -> Option<([T; N], &[T])> +where + [T; N]: for<'a> TryFrom<&'a [T]>, +{ + let (head, tail) = try_split_at(slice, N)?; + let head = head.try_into().ok()?; + Some((head, tail)) +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 3f55006f2..b0b22ac49 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -78,7 +78,7 @@ impl<'a> FacetDistribution<'a> { K: fmt::Display, KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, { - let mut key_buffer = vec![field_id]; + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { key_buffer.truncate(1); @@ -157,7 +157,7 @@ impl<'a> FacetDistribution<'a> { .index .facet_id_string_docids .remap_key_type::() - .prefix_iter(self.rtxn, &[field_id])? + .prefix_iter(self.rtxn, &field_id.to_be_bytes())? .remap_key_type::(); for result in iter { diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 4e900bff4..9774bdd52 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -187,7 +187,7 @@ impl<'t> FacetIter<'t> { ) -> heed::Result> { let level = db .remap_types::() - .prefix_iter(rtxn, &[fid][..])? + .prefix_iter(rtxn, &fid.to_be_bytes())? .remap_key_type::() .last() .transpose()? diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 313f8a909..222f3b2d3 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -430,8 +430,10 @@ where C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, F: Fn(K) -> DocumentId, { - let mut iter = - db.remap_key_type::().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::(); + let mut iter = db + .remap_key_type::() + .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? + .remap_key_type::(); while let Some(result) = iter.next() { let (key, ()) = result?; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 0e2cad69d..5fabbc504 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -15,7 +15,7 @@ use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, }; -use crate::{Index, Result}; +use crate::{FieldId, Index, Result}; pub struct Facets<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -119,7 +119,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { fn clear_field_number_levels<'t>( wtxn: &'t mut heed::RwTxn, db: heed::Database, - field_id: u8, + field_id: FieldId, ) -> heed::Result<()> { let left = (field_id, 1, f64::MIN, f64::MIN); let right = (field_id, u8::MAX, f64::MAX, f64::MAX); @@ -135,11 +135,11 @@ fn compute_facet_number_levels<'t>( shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, - field_id: u8, + field_id: FieldId, ) -> Result> { let first_level_size = db .remap_key_type::() - .prefix_iter(rtxn, &[field_id])? + .prefix_iter(rtxn, &field_id.to_be_bytes())? 
.remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; @@ -196,11 +196,11 @@ fn compute_facet_number_levels<'t>( fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, - field_id: u8, + field_id: FieldId, ) -> Result { let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &[field_id])? { + for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { let (_key, docids) = result?; documents_ids |= docids; } @@ -210,7 +210,7 @@ fn compute_faceted_documents_ids( fn write_number_entry( writer: &mut Writer, - field_id: u8, + field_id: FieldId, level: u8, left: f64, right: f64, diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 17283b232..8613a8824 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -40,7 +40,7 @@ pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { Ok(values.first().unwrap().to_vec()) } -pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mut Vec) { +pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec) { use itertools::merge_join_by; use itertools::EitherOrBoth::{Both, Left, Right}; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 9ac97c255..ebf365f44 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -7,6 +7,7 @@ use std::time::Instant; use std::{cmp, iter}; use bstr::ByteSlice as _; +use concat_arrays::concat_arrays; use fst::Set; use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; use heed::BytesEncode; @@ -776,7 +777,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { for ((fid, count), docids) in self.field_id_word_count_docids { docids_buffer.clear(); CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer); - self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?; + let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]); + self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?; } let fst = builder.into_set(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 24ab276d0..b273460d1 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -626,7 +626,7 @@ mod test { Some("tata".to_string()), false, ); - assert_eq!(result.unwrap(), (0u8, "toto".to_string())); + assert_eq!(result.unwrap(), (0, "toto".to_string())); assert_eq!(fields_map.len(), 1); } @@ -635,7 +635,7 @@ mod test { let mut fields_map = FieldsIdsMap::new(); let result = compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); - assert_eq!(result.unwrap(), (0u8, "tata".to_string())); + assert_eq!(result.unwrap(), (0, "tata".to_string())); assert_eq!(fields_map.len(), 1); } @@ -643,7 +643,7 @@ mod test { fn should_return_default_if_both_are_none() { let mut fields_map = FieldsIdsMap::new(); let result = compute_primary_key_pair(None, &mut fields_map, None, true); - assert_eq!(result.unwrap(), (0u8, "id".to_string())); + assert_eq!(result.unwrap(), (0, "id".to_string())); assert_eq!(fields_map.len(), 1); } From a9553af635e6b496797634bc2a4e21d515aa49ee Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 6 Jul 2021 11:40:45 +0200 Subject: [PATCH 0861/1889] Add a test to check that we can index more that 256 
fields --- milli/src/update/index_documents/mod.rs | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3876d5dc1..9ac05fe1a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -842,10 +842,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { mod tests { use std::io::Cursor; + use big_s::S; use heed::EnvOpenOptions; use super::*; use crate::update::DeleteDocuments; + use crate::HashMap; #[test] fn simple_document_replacement() { @@ -1352,4 +1354,30 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn index_more_than_256_fields() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut big_object = HashMap::new(); + big_object.insert(S("id"), "wow"); + for i in 0..1000 { + let key = i.to_string(); + big_object.insert(key, "I am a text!"); + } + + let content = vec![big_object]; + let content = serde_json::to_string(&content).unwrap(); + + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(Cursor::new(content), |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + } } From 0a7810752545c72c8dacdaa37b3673bf0fd3f37d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 6 Jul 2021 11:48:22 +0200 Subject: [PATCH 0862/1889] Fix the infos crate to make it read u16 field ids --- infos/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 151e8c664..d5d1ad0af 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -7,7 +7,7 @@ use byte_unit::Byte; use heed::EnvOpenOptions; use milli::facet::FacetType; use milli::index::db_name::*; -use milli::{Index, TreeLevel}; +use milli::{FieldId, Index, TreeLevel}; use structopt::StructOpt; use Command::*; @@ -322,7 +322,7 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( rtxn: &'txn heed::RoTxn, db: heed::Database, - field_id: u8, + field_id: FieldId, ) -> heed::Result> + 'txn>> where KC: heed::BytesDecode<'txn>, @@ -330,7 +330,7 @@ where { let iter = db .remap_key_type::() - .prefix_iter(&rtxn, &[field_id])? + .prefix_iter(&rtxn, &field_id.to_be_bytes())? 
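+        // Editorial aside (not part of the patch): `to_be_bytes` is what keeps
+        // this `prefix_iter` correct, because LMDB orders keys byte-wise and
+        // only the big-endian encoding of a u16 sorts like the number itself:
+        //
+        //     let (a, b): (u16, u16) = (1, 256);
+        //     assert!(a.to_be_bytes() < b.to_be_bytes()); // [0, 1] < [1, 0]
+        //     assert!(a.to_le_bytes() > b.to_le_bytes()); // [1, 0] > [0, 1]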
.remap_key_type::<KC>();

    Ok(Box::new(iter))

From 931021fe57f78a0204acd0bb594e300c829bbedc Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 7 Jul 2021 11:42:14 +0200
Subject: [PATCH 0863/1889] add benchmarks for indexing

---
 .github/workflows/benchmarks.yml |   2 +-
 benchmarks/Cargo.toml            |   4 +
 benchmarks/README.md             |  13 +-
 benchmarks/benches/indexing.rs   | 314 +++++++++++++++++++++++++++++++
 benchmarks/build.rs              |  15 +-
 5 files changed, 336 insertions(+), 12 deletions(-)
 create mode 100644 benchmarks/benches/indexing.rs

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index e110c6be5..553f7e424 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
     inputs:
       dataset_name:
-        description: 'The name of the dataset used to benchmark (songs or wiki)'
+        description: 'The name of the dataset used to benchmark (songs, wiki or indexing)'
         required: false
         default: 'songs'

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index b5ba9bf4f..dd319b4e6 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -28,3 +28,7 @@ harness = false
 [[bench]]
 name = "wiki"
 harness = false
+
+[[bench]]
+name = "indexing"
+harness = false

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 843ea9b29..16838e488 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -30,13 +30,13 @@ _[More about critcmp](https://github.com/BurntSushi/critcmp)._

 ### On your machine

-To run all the benchmarks (~4h):
+To run all the benchmarks (~5h):

 ```bash
 cargo bench
 ```

-To run only the `songs` (~1h) or `wiki` (~3h) benchmark:
+To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark:

 ```bash
 cargo bench --bench <dataset name>
 ```

@@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`:

 ```bash
 mkdir ~/datasets
-MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the two datasets are downloaded
+MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded
 touch build.rs
 MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded
 ```

@@ -84,6 +84,7 @@ Run the comparison script:
 The benchmarks are available for the following datasets:
 - `songs`
 - `wiki`
+- `movies`

 ### Songs

@@ -107,5 +108,9 @@ It was generated with the following command:

 xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv
 ```

 _[Download the generated `wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._
+
+### Movies
+
+`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/) guide.
+
+_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._

diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs
new file mode 100644
index 000000000..902b34dc8
--- /dev/null
+++ b/benchmarks/benches/indexing.rs
@@ -0,0 +1,314 @@
+mod datasets_paths;
+
+use std::fs::{create_dir_all, remove_dir_all, File};
+use std::path::Path;
+
+use criterion::{criterion_group, criterion_main, Criterion};
+use heed::EnvOpenOptions;
+use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat};
+use milli::Index;
+
+#[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
+fn setup_dir(path: impl AsRef<Path>) {
+    match remove_dir_all(path.as_ref()) {
+        Ok(_) => (),
+        Err(e) if
e.kind() == std::io::ErrorKind::NotFound => (), + Err(e) => panic!("{}", e), + } + create_dir_all(path).unwrap(); +} + +fn setup_index() -> Index { + let path = "benches.mmdb"; + setup_dir(&path); + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.max_readers(10); + Index::new(options, path).unwrap() +} + +fn indexing_songs_default(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing songs with default settings", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_SONGS + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + }, + ) + }); + + index.prepare_for_closing().wait(); +} + +fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing songs without faceted numbers", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + 
wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_SONGS + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + }, + ) + }); + index.prepare_for_closing().wait(); +} + +fn indexing_songs_without_faceted_fields(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing songs without any facets", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + + builder.update_format(UpdateFormat::Csv); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_SONGS + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + }, + ) + }); + index.prepare_for_closing().wait(); +} + +fn indexing_wiki(c: &mut Criterion) { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + // there is NO faceted fields at all + + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let index_ref = &index; + + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing wiki", |b| { + b.iter_with_setup( + move || { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + () + }, + move |_| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index_ref.write_txn().unwrap(); + let mut builder = 
update_builder.index_documents(&mut wtxn, index_ref);
+
+                builder.update_format(UpdateFormat::Csv);
+                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+                let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!(
+                    "could not find the dataset in: {}",
+                    datasets_paths::SMOL_WIKI_ARTICLES
+                ));
+                builder.execute(reader, |_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+            },
+        )
+    });
+    index.prepare_for_closing().wait();
+}
+
+fn indexing_movies_default(c: &mut Criterion) {
+    let index = setup_index();
+
+    let update_builder = UpdateBuilder::new(0);
+    let mut wtxn = index.write_txn().unwrap();
+    let mut builder = update_builder.settings(&mut wtxn, &index);
+
+    builder.set_primary_key("id".to_owned());
+    let displayed_fields = ["title", "poster", "overview", "release_date", "genres"]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+
+    let faceted_fields = ["release_date", "genres"].iter().map(|s| s.to_string()).collect();
+    builder.set_filterable_fields(faceted_fields);
+
+    builder.execute(|_, _| ()).unwrap();
+    wtxn.commit().unwrap();
+
+    let index_ref = &index;
+
+    let mut group = c.benchmark_group("indexing");
+    group.sample_size(10);
+    group.bench_function("Indexing movies with default settings", |b| {
+        b.iter_with_setup(
+            move || {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap();
+                builder.execute().unwrap();
+                wtxn.commit().unwrap();
+                ()
+            },
+            move |_| {
+                let update_builder = UpdateBuilder::new(0);
+                let mut wtxn = index_ref.write_txn().unwrap();
+                let mut builder = update_builder.index_documents(&mut wtxn, index_ref);
+
+                builder.update_format(UpdateFormat::Json);
+                builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
+                let reader = File::open(datasets_paths::MOVIES)
+                    .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES));
+                builder.execute(reader, |_, _| ()).unwrap();
+                wtxn.commit().unwrap();
+            },
+        )
+    });
+
+    index.prepare_for_closing().wait();
+}
+
+criterion_group!(
+    benches,
+    indexing_songs_default,
+    indexing_songs_without_faceted_numbers,
+    indexing_songs_without_faceted_fields,
+    indexing_wiki,
+    indexing_movies_default
+);
+criterion_main!(benches);

diff --git a/benchmarks/build.rs b/benchmarks/build.rs
index 827c2c2a3..47a14f25b 100644
--- a/benchmarks/build.rs
+++ b/benchmarks/build.rs
@@ -10,8 +10,9 @@ use reqwest::IntoUrl;

 const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets";

-const DATASET_SONGS: &str = "smol-songs";
-const DATASET_WIKI: &str = "smol-wiki-articles";
+const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv");
+const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv");
+const DATASET_MOVIES: (&str, &str) = ("movies", "json");

 /// The name of the environment variable used to select the path
 /// of the directory containing the datasets
@@ -31,9 +32,9 @@ fn main() -> anyhow::Result<()> {
     )?;
     writeln!(manifest_paths_file)?;

-    for dataset in &[DATASET_SONGS, DATASET_WIKI] {
+    for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] {
         let out_path = out_dir.join(dataset);
-        let out_file = out_path.with_extension("csv");
+        let out_file = out_path.with_extension(extension);

         writeln!(
             &mut manifest_paths_file,
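All five benchmark functions above follow the same shape; stripped of the milli specifics it is the classic criterion setup/measure split. The snippet below is a schematic reduction with placeholder work, not code from the patch:

```rust
use criterion::{criterion_group, criterion_main, Criterion};

fn indexing_skeleton(c: &mut Criterion) {
    let mut group = c.benchmark_group("indexing");
    group.sample_size(10); // indexing runs are long, so take fewer samples
    group.bench_function("skeleton", |b| {
        b.iter_with_setup(
            // setup: runs before each iteration and is NOT measured
            // (it stands in for the delete-documents step above)
            || Vec::<u32>::new(),
            // routine: the only part that is timed
            |mut index| index.extend(0..10_000),
        )
    });
    group.finish();
}

criterion_group!(benches, indexing_skeleton);
criterion_main!(benches);
```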
@@ -45,15 +46,15 @@ fn main() -> anyhow::Result<()> {
         if out_file.exists() {
             eprintln!(
                 "The dataset {} already exists on the file system and will not be downloaded again",
-                dataset
+                out_path.display(),
             );
             continue;
         }
-        let url = format!("{}/{}.csv.gz", BASE_URL, dataset);
+        let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension);
         eprintln!("downloading: {}", url);
         let bytes = download_dataset(url.clone())?;
         eprintln!("{} downloaded successfully", url);
-        eprintln!("uncompressing in {}", out_path.display());
+        eprintln!("uncompressing in {}", out_file.display());
         uncompress_in_file(bytes, &out_file)?;
     }

From 0ab541752545c72c8dacdaa37b3673bf0fd3f37d Mon Sep 17 00:00:00 2001
From: Tamo
Date: Mon, 19 Jul 2021 14:32:31 +0200
Subject: [PATCH 0864/1889] add a $HOME var to the ci

---
 .github/workflows/benchmarks.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 553f7e424..02f54fe13 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -8,6 +8,9 @@ on:
         required: false
         default: 'songs'

+env:
+  HOME: "/home/runner" # The actions-rs/toolchain@v1 can fail when we have no $HOME defined
+
 jobs:
   benchmarks:
     name: Run and upload benchmarks

From 9f8095c06985bafb58272bc26374df7e81188cb9 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 21 Jul 2021 10:35:35 +0200
Subject: [PATCH 0865/1889] Make sure that we don't keep a reference on the
 LMDB key when using put_current

---
 milli/src/update/delete_documents.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 222f3b2d3..e6f847d01 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -366,6 +366,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             // safety: we don't keep references from inside the LMDB database.
             unsafe { iter.del_current()? };
         } else if docids.len() != previous_len {
+            let key = key.to_owned();
             // safety: we don't keep references from inside the LMDB database.
             unsafe { iter.put_current(&key, &docids)?
}; } From f858f64b1f0f69d791d3ba38b52ea1d002faacad Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 10:29:00 +0200 Subject: [PATCH 0866/1889] Move the facet number iterators into their own module --- milli/src/search/criteria/asc_desc.rs | 9 +- milli/src/search/facet/facet_distribution.rs | 6 +- milli/src/search/facet/facet_number.rs | 248 +++++++++++++++++++ milli/src/search/facet/filter_condition.rs | 4 +- milli/src/search/facet/mod.rs | 248 +------------------ milli/src/search/mod.rs | 2 +- 6 files changed, 262 insertions(+), 255 deletions(-) create mode 100644 milli/src/search/facet/facet_number.rs diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index ccee2c393..99d63c90d 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::FacetIter; +use crate::search::facet::FacetNumberIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -172,8 +172,11 @@ fn facet_ordered<'t>( let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) } else { - let facet_fn = - if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing }; + let facet_fn = if ascending { + FacetNumberIter::new_reducing + } else { + FacetNumberIter::new_reverse_reducing + }; let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index b0b22ac49..080fd9af7 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -9,7 +9,7 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; use crate::heed_codec::facet::FacetValueStringCodec; -use crate::search::facet::{FacetIter, FacetRange}; +use crate::search::facet::{FacetNumberIter, FacetNumberRange}; use crate::{DocumentId, FieldId, Index, Result}; /// The default number of values by facets that will @@ -118,7 +118,7 @@ impl<'a> FacetDistribution<'a> { distribution: &mut BTreeMap, ) -> heed::Result<()> { let iter = - FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; for result in iter { let (value, mut docids) = result?; @@ -143,7 +143,7 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); let db = self.index.facet_id_f64_docids; - let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; for result in range { let ((_, _, value, _), docids) = result?; diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs new file mode 100644 index 000000000..f943b96da --- /dev/null +++ b/milli/src/search/facet/facet_number.rs @@ -0,0 +1,248 @@ +use std::ops::Bound::{self, Excluded, Included, Unbounded}; + +use either::Either::{self, Left, Right}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{Database, LazyDecode, RoRange, RoRevRange}; +use roaring::RoaringBitmap; + +use 
crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{FieldId, Index}; + +pub struct FacetNumberRange<'t> { + iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, + end: Bound, +} + +impl<'t> FacetNumberRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, level, left, f64::MIN)), + Excluded(left) => Excluded((field_id, level, left, f64::MIN)), + Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), + }; + let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); + let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; + Ok(FacetNumberRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetNumberRange<'t> { + type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => Some(Ok(((fid, level, left, right), docids))), + Err(e) => Some(Err(e)), + } + } else { + None + } + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +pub struct FacetNumberRevRange<'t> { + iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, + end: Bound, +} + +impl<'t> FacetNumberRevRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, level, left, f64::MIN)), + Excluded(left) => Excluded((field_id, level, left, f64::MIN)), + Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), + }; + let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); + let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; + Ok(FacetNumberRevRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetNumberRevRange<'t> { + type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; + + fn next(&mut self) -> Option { + loop { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), + Err(e) => return Some(Err(e)), + } + } + continue; + } + Some(Err(e)) => return Some(Err(e)), + None => return None, + } + } + } +} + +pub struct FacetNumberIter<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: Database, + field_id: FieldId, + level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, + must_reduce: bool, +} + +impl<'t> FacetNumberIter<'t> { + /// Create a `FacetNumberIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. 
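+    ///
+    /// A usage sketch (an illustration, not part of the original patch):
+    ///
+    /// ```ignore
+    /// let iter = FacetNumberIter::new_reducing(&rtxn, &index, field_id, candidates)?;
+    /// for result in iter {
+    ///     let (facet_number, docids) = result?;
+    ///     // in reducing mode, every document id is yielded under at most one number
+    /// }
+    /// ```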
+ pub fn new_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) + } + + /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. + pub fn new_reverse_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Right(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) + } + + /// Create a `FacetNumberIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will not reduce the given documents ids + /// while iterating on the different facet levels, possibly returning multiple times + /// a document id associated with multiple facet values. + pub fn new_non_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) + } + + fn highest_level( + rtxn: &'t heed::RoTxn, + db: Database, + fid: FieldId, + ) -> heed::Result> { + let level = db + .remap_types::() + .prefix_iter(rtxn, &fid.to_be_bytes())? + .remap_key_type::() + .last() + .transpose()? + .map(|((_, level, _, _), _)| level); + Ok(level) + } +} + +impl<'t> Iterator for FacetNumberIter<'t> { + type Item = heed::Result<(f64, RoaringBitmap)>; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, last) = self.level_iters.last_mut()?; + let is_ascending = last.is_left(); + for result in last { + // If the last iterator must find an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. 
+ if documents_ids.is_empty() { + break; + } + + match result { + Ok(((_fid, level, left, right), mut docids)) => { + docids &= &*documents_ids; + if !docids.is_empty() { + if self.must_reduce { + *documents_ids -= &docids; + } + + if level == 0 { + return Some(Ok((left, docids))); + } + + let rtxn = self.rtxn; + let db = self.db; + let fid = self.field_id; + let left = Included(left); + let right = Included(right); + + let result = if is_ascending { + FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) + .map(Left) + } else { + FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) + .map(Right) + }; + + match result { + Ok(iter) => { + self.level_iters.push((docids, iter)); + continue 'outer; + } + Err(e) => return Some(Err(e)), + } + } + } + Err(e) => return Some(Err(e)), + } + } + self.level_iters.pop(); + } + } +} diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 1b1eafcab..875fe3b27 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -15,7 +15,7 @@ use roaring::RoaringBitmap; use self::FilterCondition::*; use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; -use super::FacetRange; +use super::FacetNumberRange; use crate::error::UserError; use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec}; use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; @@ -282,7 +282,7 @@ impl FilterCondition { // We must create a custom iterator to be able to iterate over the // requested range as the range iterator cannot express some conditions. - let iter = FacetRange::new(rtxn, db, field_id, level, left, right)?; + let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; debug!("Iterating between {:?} and {:?} (level {})", left, right, level); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 9774bdd52..e6ea92543 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,253 +1,9 @@ -use std::ops::Bound::{self, Excluded, Included, Unbounded}; - -use either::Either::{self, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; - pub use self::facet_distribution::FacetDistribution; +pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::filter_condition::{FilterCondition, Operator}; pub(crate) use self::parser::Rule as ParserRule; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; mod facet_distribution; +mod facet_number; mod filter_condition; mod parser; - -pub struct FacetRange<'t> { - iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} - -impl<'t> FacetRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetRange<'t> { - type Item = 
heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => Some(Ok(((fid, level, left, right), docids))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -pub struct FacetRevRange<'t> { - iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} - -impl<'t> FacetRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetRevRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetRevRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; - - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} - -pub struct FacetIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t>>)>, - must_reduce: bool, -} - -impl<'t> FacetIter<'t> { - /// Create a `FacetIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } - - /// Create a `FacetIter` that will iterate on the different facet entries in reverse - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. 
- pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Right(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } - - /// Create a `FacetIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will not reduce the given documents ids - /// while iterating on the different facet levels, possibly returning multiple times - /// a document id associated with multiple facet values. - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) - } - - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - let level = db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? - .remap_key_type::() - .last() - .transpose()? - .map(|((_, level, _, _), _)| level); - Ok(level) - } -} - -impl<'t> Iterator for FacetIter<'t> { - type Item = heed::Result<(f64, RoaringBitmap)>; - - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); - for result in last { - // If the last iterator must find an empty set of documents it means - // that we found all the documents in the sub level iterations already, - // we can pop this level iterator. 
-                if documents_ids.is_empty() {
-                    break;
-                }
-
-                match result {
-                    Ok(((_fid, level, left, right), mut docids)) => {
-                        docids &= &*documents_ids;
-                        if !docids.is_empty() {
-                            if self.must_reduce {
-                                *documents_ids -= &docids;
-                            }
-
-                            if level == 0 {
-                                return Some(Ok((left, docids)));
-                            }
-
-                            let rtxn = self.rtxn;
-                            let db = self.db;
-                            let fid = self.field_id;
-                            let left = Included(left);
-                            let right = Included(right);
-
-                            let result = if is_ascending {
-                                FacetRange::new(rtxn, db, fid, level - 1, left, right).map(Left)
-                            } else {
-                                FacetRevRange::new(rtxn, db, fid, level - 1, left, right).map(Right)
-                            };
-
-                            match result {
-                                Ok(iter) => {
-                                    self.level_iters.push((docids, iter));
-                                    continue 'outer;
-                                }
-                                Err(e) => return Some(Err(e)),
-                            }
-                        }
-                    }
-                    Err(e) => return Some(Err(e)),
-                }
-            }
-            self.level_iters.pop();
-        }
-    }
-}

diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index f40a6aed6..574459547 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -15,7 +15,7 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;

 pub(crate) use self::facet::ParserRule;
-pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator};
+pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator};
 pub use self::matching_words::MatchingWords;
 use self::query_tree::QueryTreeBuilder;
 use crate::error::FieldIdMapMissingEntry;

From 851f9790393daf9693aa2c60c66149b1c17ad0a7 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 23 Jun 2021 11:27:17 +0200
Subject: [PATCH 0867/1889] Describe the way we want to group the facet strings

---
 milli/src/search/facet/facet_string.rs | 123 +++++++++++++++++++++++++
 milli/src/search/facet/mod.rs          |   1 +
 2 files changed, 124 insertions(+)
 create mode 100644 milli/src/search/facet/facet_string.rs

diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs
new file mode 100644
index 000000000..61fc32f8e
--- /dev/null
+++ b/milli/src/search/facet/facet_string.rs
@@ -0,0 +1,123 @@
+//! This module contains helper iterators for facet strings.
+//!
+//! The purpose is to help iterate over the quite complex system of facet strings. A simple
+//! description of the system would be that every facet string value is stored into an LMDB database
+//! and that every value is associated with the document ids which are associated with this facet
+//! string value.
+//!
+//! In reality it is a little bit more complex, as we have to create aggregations of runs of facet
+//! string values, those aggregations help in choosing the right groups of facets to follow.
+//!
+//! ## A typical algorithm run
+//!
+//! If a group of aggregated facet values contains one of the documents ids, we must continue
+//! iterating over the sub-groups.
+//!
+//! If this group is the lowest level and contains at least one document id, we yield the associated
+//! facet documents ids.
+//!
+//! If the group doesn't contain one of our documents ids, we continue to the next group at this
+//! same level.
+//!
+//! ## The complexity comes from the strings
+//!
+//! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create
+//! aggregated facet numbers: groups of facets are easy to define in the LMDB key, we just put the
+//! two number bounds, the left and the right bound of the group, both inclusive.
+//!
+//! It is easy to make sure that the groups are ordered: LMDB sorts its keys lexicographically, and
+//! putting two numbers big-endian encoded one after the other gives us ordered groups. The values
+//! are simple unions of the documents ids coming from the groups below.
+//!
+//! ### Example of what a facet number LMDB database contains
+//!
+//! | level | left-bound | right-bound | docs |
+//! |-------|------------|-------------|------------------|
+//! | 0 | 0 | _skipped_ | 1, 2 |
+//! | 0 | 1 | _skipped_ | 6, 7 |
+//! | 0 | 3 | _skipped_ | 4, 7 |
+//! | 0 | 5 | _skipped_ | 2, 3, 4 |
+//! | 1 | 0 | 1 | 1, 2, 6, 7 |
+//! | 1 | 3 | 5 | 2, 3, 4, 7 |
+//! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 |
+//!
+//! As you can see, the level 0 has two equal bounds, therefore we skip serializing the second
+//! bound; that's the base level where you can directly fetch the documents ids associated with an
+//! exact number.
+//!
+//! The next levels have two different bounds and the associated documents ids are simply the result
+//! of a union of all the documents ids associated with the aggregated groups above.
+//!
+//! ## The complexity of defining groups of facet strings
+//!
+//! As explained above, defining groups of facet numbers is easy: LMDB stores the keys in
+//! lexicographical order, meaning that whatever the key represents, the bytes are read in their raw
+//! form and a simple `strcmp` will define the order in which keys will be read from the store.
+//!
+//! That's easy for types with a known size, like floats or integers: they are 8 bytes (64 bits)
+//! long, and appending one after the other in big-endian keeps the order consistent. LMDB will
+//! simply sort the keys by the first number, then by the second if the first numbers of two keys
+//! are equal.
+//!
+//! For strings it is a lot more complex, as those types are unsized: the size of a facet string is
+//! different for each facet value.
+//!
+//! ### Basic approach: padding the keys
+//!
+//! A first approach would be to simply define the maximum size of a facet string and pad the keys
+//! with zeroes. The big problem of this approach is that it:
+//!   1. reduces the maximum size of facet strings by half, as we need to put two keys one after the
+//!      other.
+//!   2. makes the keys of facet strings very big (approximately 250 bytes), hurting LMDB
+//!      performance a lot.
+//!
+//! ### Better approach: number the facet groups
+//!
+//! A better approach would be to number the groups; this way we don't have the downsides of the
+//! previously described approach, but we need to be able to describe the groups by using a number.
+//!
+//! #### Example of facet strings with numbered groups
+//!
+//! | level | left-bound | right-bound | left-string | right-string | docs |
+//! |-------|------------|-------------|-------------|--------------|------------------|
+//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 |
+//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 |
+//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 |
+//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 |
+//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 |
+//! | 1 | 3 | 5 | gamma | omega | 2, 3, 4, 7 |
+//! | 2 | 0 | 5 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 |
+//!
+//! As you can see, the level 0 doesn't actually change much: we skip nearly everything and do not
+//! need to store the facet string value twice.
+//!
+//! In the value, not in the key, you can see that we added two new values:
+//! the left-string and the right-string, which define the original facet strings associated with
+//! the given group.
+//!
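To make the table above concrete, here is one way the numbered-group entries could be represented in memory. This is an illustration of the description only, not the actual codec introduced later in the series:

```rust
// Key for a group entry (level >= 1): two group numbers instead of two strings.
struct FacetStringGroupKey {
    field_id: u16,
    level: u8,       // 1 or higher
    left_group: u32, // number of the leftmost sub-group covered
    right_group: u32,
}

// Value: the documents ids, plus the string bounds that only level 1 stores.
struct FacetStringGroupValue {
    bounds: Option<(String, String)>, // e.g. ("alpha", "beta") for the first level-1 row
    docids: Vec<u32>,                 // a RoaringBitmap in the real database
}

fn main() {
    // The first level-1 row of the table above.
    let key = FacetStringGroupKey { field_id: 0, level: 1, left_group: 0, right_group: 1 };
    let value = FacetStringGroupValue {
        bounds: Some(("alpha".into(), "beta".into())),
        docids: vec![1, 2, 6, 7],
    };
    println!("groups {}..={} -> {:?}", key.left_group, key.right_group, value.docids);
}
```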
+//! We put those two strings inside of the value, this way we do not limit the maximum size of the
+//! facet string values, and the impact on performances is not important as, IIRC, LMDB put big
+//! values on another page, this helps in iterating over keys fast enough and only fetch the page
+//! with the values when required.
+//!
+//! The other little advantage with this solution is that there is no big overhead, compared with
+//! the facet number levels: we only duplicate the facet strings once for the level 1.
+//!
+//! #### A typical algorithm run
+//!
+//! Note that the algorithm is always moving from the highest level to the lowest one, one level
+//! at a time, this is why it is ok to only store the facet strings on the level 1.
+//!
+//! If a group of aggregated facet values, i.e. a group with numbered bounds, contains one of the
+//! documents ids, we must continue iterating over the sub-groups. To do so:
+//! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds
+//!   and iterate over the facet groups defined by these numbers over the current level - 1.
+//! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the
+//!   value and just do the same as with the facet numbers but with strings: iterate over the
+//!   current level - 1 with both keys.
+//!
+//! If this group is the lowest level (level 0) and contains at least one document id we yield the
+//! associated facet documents ids.
+//!
+//! If the group doesn't contain one of our documents ids, we continue to the next group at this
+//! same level.
+//!
diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs
index e6ea92543..d92a8e4bd 100644
--- a/milli/src/search/facet/mod.rs
+++ b/milli/src/search/facet/mod.rs
@@ -5,5 +5,6 @@ pub(crate) use self::parser::Rule as ParserRule;
 mod facet_distribution;
 mod facet_number;
+mod facet_string;
 mod filter_condition;
 mod parser;
From a79661c6dc7fc605f0a4a0a87c7af73941ffb447 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 23 Jun 2021 15:53:28 +0200
Subject: [PATCH 0868/1889] Introduce a lot of facet string helper iterators

---
 .../facet/facet_level_value_u32_codec.rs      |  52 +++++++
 .../facet/facet_string_level_zero_codec.rs    |  49 ++++++
 .../facet_string_zero_bounds_value_codec.rs   |  80 ++++++++++
 milli/src/heed_codec/facet/mod.rs             |   6 +
 milli/src/search/facet/facet_string.rs        | 140 +++++++++++++++++-
 5 files changed, 319 insertions(+), 8 deletions(-)
 create mode 100644 milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
 create mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
 create mode 100644 milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs

diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
new file mode 100644
index 000000000..6b51b306e
--- /dev/null
+++ b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
@@ -0,0 +1,52 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::num::NonZeroU8;
+
+use crate::FieldId;
+
+/// A codec that stores the field id, level 1 and higher and the groups ids.
+///
+/// It can only be used to encode the facet string of the level 1 or higher.
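Concretely, the encoder below produces an 18-byte key: the field id byte, the level byte, the two big-endian bounds that drive the LMDB ordering, then the same two bounds repeated so they can be read back. A minimal standalone sketch of that layout, mirroring the `bytes_encode` implementation that follows (with the single-byte field ids still in use at this point of the series):

```rust
// A sketch of the key layout written by the codec below:
// [field id][level][left BE][right BE][left BE][right BE] = 18 bytes.
fn encode_group_key(field_id: u8, level: u8, left: u32, right: u32) -> Vec<u8> {
    let mut key = Vec::with_capacity(2 + 16);
    key.push(field_id);
    key.push(level);
    // The two big-endian bounds that LMDB sorts the key on...
    key.extend_from_slice(&left.to_be_bytes());
    key.extend_from_slice(&right.to_be_bytes());
    // ...followed by the same two bounds again, kept only to read them back.
    key.extend_from_slice(&left.to_be_bytes());
    key.extend_from_slice(&right.to_be_bytes());
    key
}

fn main() {
    let key = encode_group_key(0, 1, 2, 5);
    assert_eq!(key.len(), 18);
    assert_eq!(key[..2], [0, 1]); // field id, then level
    assert_eq!(key[2..6], 2u32.to_be_bytes()); // ordering copy of the left bound
}
```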
+pub struct FacetLevelValueU32Codec; + +impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { + type DItem = (FieldId, NonZeroU8, u32, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (level, bytes) = bytes.split_first()?; + let level = NonZeroU8::new(*level)?; + let left = bytes[16..20].try_into().ok().map(u32::from_be_bytes)?; + let right = bytes[20..].try_into().ok().map(u32::from_be_bytes)?; + Some((*field_id, level, left, right)) + } +} + +impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { + type EItem = (FieldId, NonZeroU8, u32, u32); + + fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // Write the big-endian integers. + let bytes = left.to_be_bytes(); + buffer[..4].copy_from_slice(&bytes[..]); + + let bytes = right.to_be_bytes(); + buffer[4..8].copy_from_slice(&bytes[..]); + + // Then the u32 values just to be able to read them back. + let bytes = left.to_be_bytes(); + buffer[8..12].copy_from_slice(&bytes[..]); + + let bytes = right.to_be_bytes(); + buffer[12..].copy_from_slice(&bytes[..]); + + let mut bytes = Vec::with_capacity(buffer.len() + 2); + bytes.push(*field_id); + bytes.push(level.get()); + bytes.extend_from_slice(&buffer); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs new file mode 100644 index 000000000..1c0c4be93 --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs @@ -0,0 +1,49 @@ +use std::borrow::Cow; +use std::str; + +use crate::FieldId; + +/// A codec that stores the field id, level 0, and facet string. +/// +/// It can only be used to encode the facet string of the level 0, +/// as it hardcodes the level. +/// +/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys, +/// and make sure that the levels are not mixed-up. The level 0 is special, the key +/// are strings, other levels represent groups and keys are simply two integers. +pub struct FacetStringLevelZeroCodec; + +impl FacetStringLevelZeroCodec { + pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { + out.reserve(value.len() + 2); + out.push(field_id); + out.push(0); // the level zero (for LMDB ordering only) + out.extend_from_slice(value.as_bytes()); + } +} + +impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { + type DItem = (FieldId, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (level, bytes) = bytes.split_first()?; + + if *level != 0 { + return None; + } + + let value = str::from_utf8(bytes).ok()?; + Some((*field_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec { + type EItem = (FieldId, &'a str); + + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::new(); + FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes); + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs new file mode 100644 index 000000000..3c2ce4657 --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -0,0 +1,80 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::{marker, str}; + +/// A codec that encodes two strings in front of the value. 
+/// +/// The usecase is for the facet string levels algorithm where we must +/// know the origin of a group, the group left and right bounds are stored +/// in the value to not break the lexicographical ordering of the LMDB keys. +pub struct FacetStringZeroBoundsValueCodec(marker::PhantomData); + +impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec +where + C: heed::BytesDecode<'a>, +{ + type DItem = (Option<(&'a str, &'a str)>, C::DItem); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (contains_bounds, tail_bytes) = bytes.split_first()?; + + if *contains_bounds != 0 { + let (left_len, bytes) = try_split_at(bytes, 2)?; + let (right_len, bytes) = try_split_at(bytes, 2)?; + + let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?; + let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?; + + let (left, bytes) = try_split_at(bytes, left_len as usize)?; + let (right, bytes) = try_split_at(bytes, right_len as usize)?; + + let left = str::from_utf8(left).ok()?; + let right = str::from_utf8(right).ok()?; + + C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) + } else { + C::bytes_decode(tail_bytes).map(|item| (None, item)) + } + } +} + +impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec +where + C: heed::BytesEncode<'a>, +{ + type EItem = (Option<(&'a str, &'a str)>, C::EItem); + + fn bytes_encode((bounds, value): &'a Self::EItem) -> Option> { + let mut bytes = Vec::new(); + + match bounds { + Some((left, right)) => { + let left_len: u16 = left.len().try_into().ok()?; + let right_len: u16 = right.len().try_into().ok()?; + bytes.extend_from_slice(&left_len.to_be_bytes()); + bytes.extend_from_slice(&right_len.to_be_bytes()); + + let value_bytes = C::bytes_encode(&value)?; + bytes.extend_from_slice(&value_bytes[..]); + + Some(Cow::Owned(bytes)) + } + None => { + bytes.push(0); + let value_bytes = C::bytes_encode(&value)?; + bytes.extend_from_slice(&value_bytes[..]); + Some(Cow::Owned(bytes)) + } + } + } +} + +/// Tries to split a slice in half at the given middle point, +/// `None` if the slice is too short. +fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if slice.len() >= mid { + Some(slice.split_at(mid)) + } else { + None + } +} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 532da12fa..90dc79134 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,9 +1,15 @@ mod facet_level_value_f64_codec; +mod facet_level_value_u32_codec; +mod facet_string_level_zero_codec; +mod facet_string_zero_bounds_value_codec; mod facet_value_string_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; +pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; +pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::facet_value_string_codec::FacetValueStringCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 61fc32f8e..d4d85153f 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -31,7 +31,7 @@ //! //! ### Example of what a facet number LMDB database contain //! -//! 
| level | left-bound | right-bound | docs | +//! | level | left-bound | right-bound | documents ids | //! |-------|------------|-------------|------------------| //! | 0 | 0 | _skipped_ | 1, 2 | //! | 0 | 1 | _skipped_ | 6, 7 | @@ -48,7 +48,7 @@ //! The next levels have two different bounds and the associated documents ids are simply the result //! of an union of all the documents ids associated with the aggregated groups above. //! -//! ## The complexity of defining groups of facet strings +//! ## The complexity of defining groups for facet strings //! //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in //! lexicographical order, it means that whatever the key represent the bytes are read in their raw @@ -77,22 +77,25 @@ //! //! #### Example of facet strings with numbered groups //! -//! | level | left-bound | right-bound | left-string | right-string | docs | +//! | level | left-bound | right-bound | left-string | right-string | documents ids | //! |-------|------------|-------------|-------------|--------------|------------------| //! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | //! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | //! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | //! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | //! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | -//! | 1 | 3 | 5 | gamma | omega | 2, 3, 4, 7 | -//! | 2 | 0 | 5 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | +//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | +//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | //! //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not //! need to store the facet string value two times. //! -//! In the value, not in the key, you can see that we added two new values: -//! the left-string and the right-string, which defines the original facet strings associated with -//! the given group. +//! The number in the left-bound and right-bound columns are incremental numbers representing the +//! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering +//! of the LMDB keys. +//! +//! In the value, not in the key, you can see that we added two new values: the left-string and the +//! right-string, which defines the original facet strings associated with the given group. //! //! We put those two strings inside of the value, this way we do not limit the maximum size of the //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big @@ -121,3 +124,124 @@ //! If the group doesn't contain one of our documents ids, we continue to the next group at this //! same level. //! + +use std::num::NonZeroU8; +use std::ops::Bound; +use std::ops::Bound::{Excluded, Included}; + +use heed::types::{ByteSlice, Str}; +use heed::{Database, LazyDecode, RoRange}; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::{ + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec, +}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::FieldId; + +/// An iterator that is used to explore the facets level strings +/// from the level 1 to infinity. +/// +/// It yields the level, group id that an entry covers, the optional group strings +/// that it covers of the level 0 only if it is an entry from the level 1 and +/// the roaring bitmap associated. 
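One subtlety of the iterator defined below: the underlying LMDB range is always opened up to `(u32::MAX, u32::MAX)`, and the requested end bound is instead re-checked against the right bound of each group as items are yielded. That check, extracted as a standalone sketch:

```rust
use std::ops::Bound::{self, Excluded, Included, Unbounded};

/// A group whose bounds are [left, right] keeps being returned only while
/// its right bound stays within the requested end bound; this mirrors the
/// test performed in `FacetStringGroupRange::next` below.
fn must_be_returned(right: u32, end: Bound<u32>) -> bool {
    match end {
        Included(end) => right <= end,
        Excluded(end) => right < end,
        Unbounded => true,
    }
}

fn main() {
    assert!(must_be_returned(5, Included(5)));
    assert!(!must_be_returned(5, Excluded(5)));
    assert!(must_be_returned(u32::MAX, Unbounded));
}
```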
+pub struct FacetStringGroupRange<'t> { + iter: RoRange< + 't, + FacetLevelValueU32Codec, + LazyDecode>, + >, + end: Bound, +} + +impl<'t> FacetStringGroupRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database< + FacetLevelValueU32Codec, + FacetStringZeroBoundsValueCodec, + >, + field_id: FieldId, + level: NonZeroU8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, level, left, u32::MIN)), + Excluded(left) => Excluded((field_id, level, left, u32::MIN)), + Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), + }; + let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); + let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; + Ok(FacetStringGroupRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetStringGroupRange<'t> { + type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => Some(Ok(((level, left, right), docids))), + Err(e) => Some(Err(e)), + } + } else { + None + } + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +/// An iterator that is used to explore the level 0 of the facets string database. +/// +/// It yields the facet string and the roaring bitmap associated with it. +pub struct FacetStringLevelZeroRange<'t> { + iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, +} + +impl<'t> FacetStringLevelZeroRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + left: Bound<&str>, + right: Bound<&str>, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, left)), + Excluded(left) => Excluded((field_id, left)), + Unbounded => Included((field_id, "")), + }; + + let right_bound = match right { + Included(right) => Included((field_id, right)), + Excluded(right) => Excluded((field_id, right)), + Unbounded => Excluded((field_id + 1, "")), + }; + + db.range(rtxn, &(left_bound, right_bound)).map(|iter| FacetStringLevelZeroRange { iter }) + } +} + +impl<'t> Iterator for FacetStringLevelZeroRange<'t> { + type Item = heed::Result<(&'t str, RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))), + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} From adfd4da24cbd17b3d07a55d8a54b4555090c4e54 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 16:36:51 +0200 Subject: [PATCH 0869/1889] Introduce the FacetStringIter iterator --- milli/src/search/facet/facet_number.rs | 2 +- milli/src/search/facet/facet_string.rs | 149 +++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 11 deletions(-) diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs index f943b96da..02390aac1 100644 --- a/milli/src/search/facet/facet_number.rs +++ b/milli/src/search/facet/facet_number.rs @@ -147,7 +147,7 @@ impl<'t> FacetNumberIter<'t> { field_id: FieldId, documents_ids: RoaringBitmap, ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); + let db = index.facet_id_f64_docids; let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let 
highest_iter = FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index d4d85153f..09781f883 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -127,9 +127,10 @@ use std::num::NonZeroU8; use std::ops::Bound; -use std::ops::Bound::{Excluded, Included}; +use std::ops::Bound::{Excluded, Included, Unbounded}; -use heed::types::{ByteSlice, Str}; +use either::{Either, Left, Right}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{Database, LazyDecode, RoRange}; use roaring::RoaringBitmap; @@ -137,7 +138,7 @@ use crate::heed_codec::facet::{ FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; -use crate::FieldId; +use crate::{FieldId, Index}; /// An iterator that is used to explore the facets level strings /// from the level 1 to infinity. @@ -155,17 +156,18 @@ pub struct FacetStringGroupRange<'t> { } impl<'t> FacetStringGroupRange<'t> { - pub fn new( + pub fn new( rtxn: &'t heed::RoTxn, - db: Database< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >, + db: Database, field_id: FieldId, level: NonZeroU8, left: Bound, right: Bound, ) -> heed::Result> { + let db = db.remap_types::< + FacetLevelValueU32Codec, + FacetStringZeroBoundsValueCodec, + >(); let left_bound = match left { Included(left) => Included((field_id, level, left, u32::MIN)), Excluded(left) => Excluded((field_id, level, left, u32::MIN)), @@ -211,13 +213,14 @@ pub struct FacetStringLevelZeroRange<'t> { } impl<'t> FacetStringLevelZeroRange<'t> { - pub fn new( + pub fn new( rtxn: &'t heed::RoTxn, - db: Database, + db: Database, field_id: FieldId, left: Bound<&str>, right: Bound<&str>, ) -> heed::Result> { + let db = db.remap_types::(); let left_bound = match left { Included(left) => Included((field_id, left)), Excluded(left) => Excluded((field_id, left)), @@ -245,3 +248,129 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> { } } } + +/// An iterator that is used to explore the facet strings level by level, +/// it will only return facets strings that are associated with the +/// candidates documents ids given. +pub struct FacetStringIter<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: Database, + field_id: FieldId, + level_iters: + Vec<(RoaringBitmap, Either, FacetStringLevelZeroRange<'t>>)>, +} + +impl<'t> FacetStringIter<'t> { + pub fn new_non_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + // TODO make sure that we change the database before using it, or merging the PR. + let db = index.facet_id_string_docids.remap_types::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = match NonZeroU8::new(highest_level) { + Some(highest_level) => Left(FacetStringGroupRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + highest_level, + Unbounded, + Unbounded, + )?), + None => Right(FacetStringLevelZeroRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + Unbounded, + Unbounded, + )?), + }; + + Ok(FacetStringIter { rtxn, db, field_id, level_iters: vec![(documents_ids, highest_iter)] }) + } + + fn highest_level( + rtxn: &'t heed::RoTxn, + db: Database, + fid: FieldId, + ) -> heed::Result> { + Ok(db + .remap_types::() + .prefix_iter(rtxn, &[fid][..])? // the field id is the first bit + .last() + .transpose()? 
+ .map(|(key_bytes, _)| key_bytes[1])) // the level is the second bit + } +} + +impl<'t> Iterator for FacetStringIter<'t> { + type Item = heed::Result<(&'t str, RoaringBitmap)>; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, last) = self.level_iters.last_mut()?; + match last { + Left(last) => { + for result in last { + match result { + Ok(((level, left, right), (string_bounds, mut docids))) => { + docids &= &*documents_ids; + if !docids.is_empty() { + *documents_ids -= &docids; + + let result = match string_bounds { + Some((left, right)) => FacetStringLevelZeroRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right), + None => FacetStringGroupRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + }; + + match result { + Ok(iter) => { + self.level_iters.push((docids, iter)); + continue 'outer; + } + Err(e) => return Some(Err(e)), + } + } + } + Err(e) => return Some(Err(e)), + } + } + } + Right(last) => { + // level zero only + for result in last { + match result { + Ok((value, mut docids)) => { + docids &= &*documents_ids; + if !docids.is_empty() { + *documents_ids -= &docids; + return Some(Ok((value, docids))); + } + } + Err(e) => return Some(Err(e)), + } + } + } + } + + self.level_iters.pop(); + } + } +} From 757b2b502aab738f7539fbfc036fa7c8135e7ff4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 16:43:27 +0200 Subject: [PATCH 0870/1889] Remove the FacetValueStringCodec --- .../facet/facet_level_value_u32_codec.rs | 15 ++++++++------- .../facet/facet_string_level_zero_codec.rs | 9 +++++---- milli/src/heed_codec/facet/mod.rs | 2 -- milli/src/index.rs | 4 ++-- milli/src/search/facet/facet_distribution.rs | 8 ++++---- milli/src/search/facet/facet_string.rs | 6 +++--- milli/src/search/facet/filter_condition.rs | 4 ++-- milli/src/update/index_documents/store.rs | 4 ++-- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs index 6b51b306e..597335b6e 100644 --- a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs +++ b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::convert::TryInto; use std::num::NonZeroU8; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; /// A codec that stores the field id, level 1 and higher and the groups ids. 
/// @@ -13,12 +13,13 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { type DItem = (FieldId, NonZeroU8, u32, u32); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); let (level, bytes) = bytes.split_first()?; let level = NonZeroU8::new(*level)?; - let left = bytes[16..20].try_into().ok().map(u32::from_be_bytes)?; - let right = bytes[20..].try_into().ok().map(u32::from_be_bytes)?; - Some((*field_id, level, left, right)) + let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?; + let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?; + Some((field_id, level, left, right)) } } @@ -42,8 +43,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { let bytes = right.to_be_bytes(); buffer[12..].copy_from_slice(&bytes[..]); - let mut bytes = Vec::with_capacity(buffer.len() + 2); - bytes.push(*field_id); + let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1); + bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.push(level.get()); bytes.extend_from_slice(&buffer); diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs index 1c0c4be93..009c6454a 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::str; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; /// A codec that stores the field id, level 0, and facet string. /// @@ -16,7 +16,7 @@ pub struct FacetStringLevelZeroCodec; impl FacetStringLevelZeroCodec { pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { out.reserve(value.len() + 2); - out.push(field_id); + out.extend_from_slice(&field_id.to_be_bytes()); out.push(0); // the level zero (for LMDB ordering only) out.extend_from_slice(value.as_bytes()); } @@ -26,7 +26,8 @@ impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { type DItem = (FieldId, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); let (level, bytes) = bytes.split_first()?; if *level != 0 { @@ -34,7 +35,7 @@ impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { } let value = str::from_utf8(bytes).ok()?; - Some((*field_id, value)) + Some((field_id, value)) } } diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 90dc79134..ecab7eb7c 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -2,7 +2,6 @@ mod facet_level_value_f64_codec; mod facet_level_value_u32_codec; mod facet_string_level_zero_codec; mod facet_string_zero_bounds_value_codec; -mod facet_value_string_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; @@ -10,6 +9,5 @@ pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; -pub use self::facet_value_string_codec::FacetValueStringCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use 
self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/index.rs b/milli/src/index.rs index 099a5891d..b2be10767 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -11,7 +11,7 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::{ @@ -91,7 +91,7 @@ pub struct Index { /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, /// Maps the facet field id and the string with the docids that corresponds to it. - pub facet_id_string_docids: Database, + pub facet_id_string_docids: Database, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 080fd9af7..ceefe785b 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; -use std::{cmp, fmt}; +use std::{cmp, fmt, mem}; use heed::types::{ByteSlice, Unit}; use heed::{BytesDecode, Database}; @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::FacetValueStringCodec; +use crate::heed_codec::facet::FacetStringLevelZeroCodec; use crate::search::facet::{FacetNumberIter, FacetNumberRange}; use crate::{DocumentId, FieldId, Index, Result}; @@ -81,7 +81,7 @@ impl<'a> FacetDistribution<'a> { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { - key_buffer.truncate(1); + key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&docid.to_be_bytes()); let iter = db .remap_key_type::() @@ -158,7 +158,7 @@ impl<'a> FacetDistribution<'a> { .facet_id_string_docids .remap_key_type::() .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::(); for result in iter { let ((_, value), docids) = result?; diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 09781f883..d0d9b54eb 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -130,7 +130,7 @@ use std::ops::Bound; use std::ops::Bound::{Excluded, Included, Unbounded}; use either::{Either, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore, Str}; +use heed::types::{ByteSlice, DecodeIgnore}; use heed::{Database, LazyDecode, RoRange}; use roaring::RoaringBitmap; @@ -298,10 +298,10 @@ impl<'t> FacetStringIter<'t> { ) -> heed::Result> { Ok(db .remap_types::() - .prefix_iter(rtxn, &[fid][..])? // the field id is the first bit + .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits .last() .transpose()? 
- .map(|(key_bytes, _)| key_bytes[1])) // the level is the second bit + .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit } } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 875fe3b27..c5ecb5a79 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -17,7 +17,7 @@ use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; use crate::error::UserError; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec}; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetStringLevelZeroCodec}; use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] @@ -363,7 +363,7 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, numbers_db: heed::Database, - strings_db: heed::Database, + strings_db: heed::Database, field_id: FieldId, operator: &Operator, ) -> Result { diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index ebf365f44..f0225ff43 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -26,7 +26,7 @@ use super::merge_function::{ use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; use crate::error::{Error, InternalError, SerializationError}; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; @@ -522,7 +522,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { key_buffer.clear(); data_buffer.clear(); - FacetValueStringCodec::serialize_into(field_id, &value, &mut key_buffer); + FacetStringLevelZeroCodec::serialize_into(field_id, &value, &mut key_buffer); CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); if lmdb_key_valid_size(&key_buffer) { From a7ae552ba78b19d7e2068f7d23374817b8019258 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 17:05:57 +0200 Subject: [PATCH 0871/1889] Fix the FacetStringLevelZeroRange range when unbounded --- milli/src/search/facet/facet_string.rs | 48 +++++++++++++++++++------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index d0d9b54eb..559bd41b6 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -210,6 +210,7 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { /// It yields the facet string and the roaring bitmap associated with it. 
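The fix below stops going through the tuple codec for the range endpoints and encodes the raw key bytes by hand. The key trick: every level 0 entry of a field starts with the field id followed by a 0 level byte, so the same prefix with a 1 level byte sorts just past all of them and can serve as an excluded end bound when the caller passes `Unbounded`. A standalone sketch of that byte-level argument (shown with a single-byte field id, matching the `push(field_id)` below; the following commit widens it to two big-endian bytes):

```rust
fn main() {
    let field_id = 3u8;
    let lowest = [field_id, 0]; // prefix of every level 0 key of this field
    let past_the_end = [field_id, 1]; // smallest prefix of any level 1 key

    // A level 0 key for the facet string "ab" of this field.
    let key = [field_id, 0, b'a', b'b'];

    // Lexicographically the key falls between the two prefixes, so the
    // half-open range [lowest, past_the_end) covers exactly the level 0 keys.
    assert!(lowest.as_slice() <= key.as_slice());
    assert!(key.as_slice() < past_the_end.as_slice());
}
```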
pub struct FacetStringLevelZeroRange<'t> { iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, + field_id: FieldId, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -220,20 +221,43 @@ impl<'t> FacetStringLevelZeroRange<'t> { left: Bound<&str>, right: Bound<&str>, ) -> heed::Result> { - let db = db.remap_types::(); - let left_bound = match left { - Included(left) => Included((field_id, left)), - Excluded(left) => Excluded((field_id, left)), - Unbounded => Included((field_id, "")), - }; + fn encode_bound<'a>( + buffer: &'a mut Vec, + field_id: FieldId, + bound: Bound<&str>, + ) -> Bound<&'a [u8]> { + match bound { + Included(value) => { + buffer.push(field_id); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + Included(&buffer[..]) + } + Excluded(value) => { + buffer.push(field_id); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + Excluded(&buffer[..]) + } + Unbounded => { + buffer.push(field_id); + buffer.push(1); // we must only get the level 0 + Excluded(&buffer[..]) + } + } + } - let right_bound = match right { - Included(right) => Included((field_id, right)), - Excluded(right) => Excluded((field_id, right)), - Unbounded => Excluded((field_id + 1, "")), - }; + let mut left_buffer = Vec::new(); + let mut right_buffer = Vec::new(); + let left_bound = encode_bound(&mut left_buffer, field_id, left); + let right_bound = encode_bound(&mut right_buffer, field_id, right); - db.range(rtxn, &(left_bound, right_bound)).map(|iter| FacetStringLevelZeroRange { iter }) + let iter = db + .remap_key_type::() + .range(rtxn, &(left_bound, right_bound))? + .remap_types::(); + + Ok(FacetStringLevelZeroRange { iter, field_id }) } } From 8c86348119a431cdffa784f8769b3448c77e00de Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 17:20:04 +0200 Subject: [PATCH 0872/1889] Indexing the facet strings levels --- milli/src/search/facet/facet_string.rs | 9 +- milli/src/update/facets.rs | 152 ++++++++++++++++++++++--- 2 files changed, 142 insertions(+), 19 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 559bd41b6..509bb4f0c 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -210,7 +210,6 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { /// It yields the facet string and the roaring bitmap associated with it. pub struct FacetStringLevelZeroRange<'t> { iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, - field_id: FieldId, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -228,19 +227,19 @@ impl<'t> FacetStringLevelZeroRange<'t> { ) -> Bound<&'a [u8]> { match bound { Included(value) => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(0); buffer.extend_from_slice(value.as_bytes()); Included(&buffer[..]) } Excluded(value) => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(0); buffer.extend_from_slice(value.as_bytes()); Excluded(&buffer[..]) } Unbounded => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(1); // we must only get the level 0 Excluded(&buffer[..]) } @@ -257,7 +256,7 @@ impl<'t> FacetStringLevelZeroRange<'t> { .range(rtxn, &(left_bound, right_bound))? 
.remap_types::(); - Ok(FacetStringLevelZeroRange { iter, field_id }) + Ok(FacetStringLevelZeroRange { iter }) } } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 5fabbc504..d3bba6d6e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,6 +1,6 @@ -use std::cmp; use std::fs::File; -use std::num::NonZeroUsize; +use std::num::{NonZeroU8, NonZeroUsize}; +use std::{cmp, mem}; use chrono::Utc; use grenad::{CompressionType, FileFuse, Reader, Writer}; @@ -10,7 +10,10 @@ use log::debug; use roaring::RoaringBitmap; use crate::error::InternalError; -use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, + FacetStringZeroBoundsValueCodec, +}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, @@ -64,6 +67,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); for field_id in faceted_fields { + // Clear the facet string levels. + clear_field_string_levels( + self.wtxn, + self.index.facet_id_string_docids.remap_types::(), + field_id, + )?; + // Compute and store the faceted strings documents ids. let string_documents_ids = compute_faceted_documents_ids( self.wtxn, @@ -71,6 +81,17 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; + let facet_string_levels = compute_facet_string_levels( + self.wtxn, + self.index.facet_id_string_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + field_id, + )?; + // Clear the facet number levels. 
clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; @@ -81,7 +102,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - let content = compute_facet_number_levels( + let facet_number_levels = compute_facet_number_levels( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -106,8 +127,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { write_into_lmdb_database( self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), - content, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number level" }), + facet_number_levels, + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }), + WriteMethod::GetMergePut, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + facet_string_levels, + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }), WriteMethod::GetMergePut, )?; } @@ -193,6 +222,21 @@ fn compute_facet_number_levels<'t>( writer_into_reader(writer, shrink_size) } +fn write_number_entry( + writer: &mut Writer, + field_id: FieldId, + level: u8, + left: f64, + right: f64, + ids: &RoaringBitmap, +) -> Result<()> { + let key = (field_id, level, left, right); + let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} + fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, @@ -208,17 +252,97 @@ fn compute_faceted_documents_ids( Ok(documents_ids) } -fn write_number_entry( +fn clear_field_string_levels<'t>( + wtxn: &'t mut heed::RwTxn, + db: heed::Database, + field_id: FieldId, +) -> heed::Result<()> { + let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); + let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); + let range = left..=right; + db.remap_key_type::().delete_range(wtxn, &range).map(drop) +} + +fn compute_facet_string_levels<'t>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + field_id: FieldId, +) -> Result> { + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &field_id.to_be_bytes())? + .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile() + .and_then(|file| create_writer(compression_type, compression_level, file))?; + + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + + for (level, group_size) in group_size_iter { + let level = NonZeroU8::new(level).unwrap(); + let mut left = (0, ""); + let mut right = (0, ""); + let mut group_docids = RoaringBitmap::new(); + + // Because we know the size of the level 0 we can use a range iterator that starts + // at the first value of the level and goes to the last by simply counting. 
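+    // Note that this range is only bounded on its lower end, by the first
+    // possible (field_id, "") key: the take(first_level_size) is what stops
+    // the scan, and since the string levels were cleared beforehand that
+    // count is exactly the number of level 0 entries of this field.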
+ for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { + let ((_field_id, value), docids) = result?; + + if i == 0 { + left = (i as u32, value); + } else if i % group_size == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. We also reset the docids. + let docids = mem::take(&mut group_docids); + write_string_entry(&mut writer, field_id, level, left, right, docids)?; + + // We save the left bound for the new group. + left = (i as u32, value); + } + + // The right bound is always the bound we run through. + group_docids |= docids; + right = (i as u32, value); + } + + if !group_docids.is_empty() { + let docids = mem::take(&mut group_docids); + write_string_entry(&mut writer, field_id, level, left, right, docids)?; + } + } + + writer_into_reader(writer, shrink_size) +} + +fn write_string_entry( writer: &mut Writer, field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, + level: NonZeroU8, + (left_id, left_value): (u32, &str), + (right_id, right_value): (u32, &str), + docids: RoaringBitmap, ) -> Result<()> { - let key = (field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + let key = (field_id, level, left_id, right_id); + let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = match level.get() { + 1 => (Some((left_value, right_value)), docids), + _ => (None, docids), + }; + let data = FacetStringZeroBoundsValueCodec::::bytes_encode(&data) + .ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) } From 5676b204dd388c8cca7458ac7a3dd7ab7e0be565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 4 Jul 2021 18:09:53 +0200 Subject: [PATCH 0873/1889] Fix the facet string levels codecs --- .../facet_string_zero_bounds_value_codec.rs | 46 ++++++++++++++++++- milli/src/update/index_documents/store.rs | 4 ++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs index 3c2ce4657..6161118b6 100644 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -16,7 +16,7 @@ where type DItem = (Option<(&'a str, &'a str)>, C::DItem); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (contains_bounds, tail_bytes) = bytes.split_first()?; + let (contains_bounds, bytes) = bytes.split_first()?; if *contains_bounds != 0 { let (left_len, bytes) = try_split_at(bytes, 2)?; @@ -33,7 +33,7 @@ where C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) } else { - C::bytes_decode(tail_bytes).map(|item| (None, item)) + C::bytes_decode(bytes).map(|item| (None, item)) } } } @@ -49,11 +49,21 @@ where match bounds { Some((left, right)) => { + bytes.push(u8::max_value()); + + if left.is_empty() || right.is_empty() { + return None; + } + let left_len: u16 = left.len().try_into().ok()?; let right_len: u16 = right.len().try_into().ok()?; + bytes.extend_from_slice(&left_len.to_be_bytes()); bytes.extend_from_slice(&right_len.to_be_bytes()); + bytes.extend_from_slice(left.as_bytes()); + bytes.extend_from_slice(right.as_bytes()); + let value_bytes = C::bytes_encode(&value)?; bytes.extend_from_slice(&value_bytes[..]); @@ -78,3 +88,35 @@ fn 
try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } + +#[cfg(test)] +mod tests { + use heed::types::Unit; + use heed::{BytesDecode, BytesEncode}; + use roaring::RoaringBitmap; + + use super::*; + use crate::CboRoaringBitmapCodec; + + #[test] + fn deserialize_roaring_bitmaps() { + let bounds = Some(("abc", "def")); + let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); + let key = (bounds, docids.clone()); + let bytes = + FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); + let (out_bounds, out_docids) = + FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_bounds, out_docids), (bounds, docids)); + } + + #[test] + fn deserialize_unit() { + let bounds = Some(("abc", "def")); + let key = (bounds, ()); + let bytes = FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); + let (out_bounds, out_unit) = + FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_bounds, out_unit), (bounds, ())); + } +} diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index f0225ff43..4c1071aab 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -286,6 +286,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { value: String, id: DocumentId, ) -> Result<()> { + if value.is_empty() { + return Ok(()); + } + let sorter = &mut self.field_id_docid_facet_strings_sorter; Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; From 081278dfd6b99f720f409db92bdef8193bdadf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 4 Jul 2021 18:11:26 +0200 Subject: [PATCH 0874/1889] Use the facet string levels when computing the facet distribution --- milli/src/search/facet/facet_distribution.rs | 28 +++++++++++++++++--- milli/src/search/facet/facet_string.rs | 2 +- milli/src/search/facet/mod.rs | 1 + 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index ceefe785b..6382e15e1 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -9,7 +9,7 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; use crate::heed_codec::facet::FacetStringLevelZeroCodec; -use crate::search::facet::{FacetNumberIter, FacetNumberRange}; +use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; use crate::{DocumentId, FieldId, Index, Result}; /// The default number of values by facets that will @@ -134,6 +134,29 @@ impl<'a> FacetDistribution<'a> { Ok(()) } + fn facet_strings_distribution_from_facet_levels( + &self, + field_id: FieldId, + candidates: &RoaringBitmap, + distribution: &mut BTreeMap, + ) -> heed::Result<()> { + let iter = + FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + + for result in iter { + let (value, mut docids) = result?; + docids &= candidates; + if !docids.is_empty() { + distribution.insert(value.to_string(), docids.len()); + } + if distribution.len() == self.max_values_by_facet { + break; + } + } + + Ok(()) + } + /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the /// facet values one by one and iterate on the facet level 0 for numbers. 
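The candidates-driven loop of `facet_strings_distribution_from_facet_levels` above can be exercised in isolation. A minimal sketch of the same accumulation pattern, with plain slices standing in for the LMDB iterators and `roaring` assumed as a dependency:

```rust
use std::collections::BTreeMap;

use roaring::RoaringBitmap;

// Intersect each facet value's documents with the candidates, keep the
// non-empty ones and stop once enough distinct values were collected.
fn main() {
    let max_values_by_facet = 2;
    let candidates: RoaringBitmap = (0..4).collect();
    let facets: &[(&str, &[u32])] = &[("blue", &[0, 1]), ("green", &[9]), ("red", &[2])];

    let mut distribution = BTreeMap::new();
    for (value, ids) in facets {
        let mut docids: RoaringBitmap = ids.iter().copied().collect();
        docids &= &candidates;
        if !docids.is_empty() {
            distribution.insert(value.to_string(), docids.len());
        }
        if distribution.len() == max_values_by_facet {
            break;
        }
    }

    // "green" has no candidate documents, so only "blue" and "red" remain.
    assert_eq!(distribution.len(), 2);
    assert_eq!(distribution["blue"], 2);
}
```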
fn facet_values_from_raw_facet_database( @@ -198,9 +221,8 @@ impl<'a> FacetDistribution<'a> { candidates, &mut distribution, )?; - self.facet_distribution_from_documents( + self.facet_strings_distribution_from_facet_levels( field_id, - String, candidates, &mut distribution, )?; diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 509bb4f0c..e1fe6ab74 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -192,7 +192,7 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { }; if must_be_returned { match docids.decode() { - Ok(docids) => Some(Ok(((level, left, right), docids))), + Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), Err(e) => Some(Err(e)), } } else { diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index d92a8e4bd..ddf710e32 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,5 +1,6 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; +pub use self::facet_string::FacetStringIter; pub use self::filter_condition::{FilterCondition, Operator}; pub(crate) use self::parser::Rule as ParserRule; From d23c250ad5928878a6db1f61272a6fec7093a1dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 13 Jul 2021 20:04:48 +0200 Subject: [PATCH 0875/1889] Fix a bound error in the facet string range construction --- milli/src/search/facet/facet_distribution.rs | 4 +- milli/src/search/facet/facet_string.rs | 51 +++++++++----------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 6382e15e1..fef4ecc87 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -22,7 +22,7 @@ const MAX_VALUES_BY_FACET: usize = 1000; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. 
-const CANDIDATES_THRESHOLD: u64 = 1000; +const CANDIDATES_THRESHOLD: u64 = 35_000; pub struct FacetDistribution<'a> { facets: Option>, @@ -80,7 +80,7 @@ impl<'a> FacetDistribution<'a> { { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); - for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { + for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&docid.to_be_bytes()); let iter = db diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index e1fe6ab74..f0b527104 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -220,36 +220,34 @@ impl<'t> FacetStringLevelZeroRange<'t> { left: Bound<&str>, right: Bound<&str>, ) -> heed::Result> { - fn encode_bound<'a>( - buffer: &'a mut Vec, - field_id: FieldId, - bound: Bound<&str>, - ) -> Bound<&'a [u8]> { - match bound { - Included(value) => { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - Included(&buffer[..]) - } - Excluded(value) => { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - Excluded(&buffer[..]) - } - Unbounded => { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(1); // we must only get the level 0 - Excluded(&buffer[..]) - } - } + fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { + buffer.extend_from_slice(&field_id.to_be_bytes()); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + &buffer[..] } let mut left_buffer = Vec::new(); + let left_bound = match left { + Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), + Unbounded => { + left_buffer.extend_from_slice(&field_id.to_be_bytes()); + left_buffer.push(0); + Included(&left_buffer[..]) + } + }; + let mut right_buffer = Vec::new(); - let left_bound = encode_bound(&mut left_buffer, field_id, left); - let right_bound = encode_bound(&mut right_buffer, field_id, right); + let right_bound = match right { + Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), + Unbounded => { + right_buffer.extend_from_slice(&field_id.to_be_bytes()); + right_buffer.push(1); // we must only get the level 0 + Excluded(&right_buffer[..]) + } + }; let iter = db .remap_key_type::() @@ -290,7 +288,6 @@ impl<'t> FacetStringIter<'t> { field_id: FieldId, documents_ids: RoaringBitmap, ) -> heed::Result> { - // TODO make sure that we change the database before using it, or merging the PR. 
let db = index.facet_id_string_docids.remap_types::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = match NonZeroU8::new(highest_level) { From 03a01166bafa011f90920b945d69e8b67aa9ed5b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 15 Jul 2021 10:19:35 +0200 Subject: [PATCH 0876/1889] Display the original facet string value from the linear facet database --- .../facet/field_doc_id_facet_string_codec.rs | 19 +++-- milli/src/index.rs | 2 +- milli/src/search/distinct/facet_distinct.rs | 8 +- milli/src/search/facet/facet_distribution.rs | 83 ++++++++++--------- milli/src/update/delete_documents.rs | 11 +-- milli/src/update/index_documents/store.rs | 63 ++++++++------ milli/src/update/settings.rs | 3 +- 7 files changed, 108 insertions(+), 81 deletions(-) diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index 36408f578..178bb21c1 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -9,13 +9,13 @@ impl FieldDocIdFacetStringCodec { pub fn serialize_into( field_id: FieldId, document_id: DocumentId, - value: &str, + normalized_value: &str, out: &mut Vec, ) { - out.reserve(2 + 4 + value.len()); + out.reserve(2 + 4 + normalized_value.len()); out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes()); - out.extend_from_slice(value.as_bytes()); + out.extend_from_slice(normalized_value.as_bytes()); } } @@ -29,17 +29,22 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = u32::from_be_bytes(document_id_bytes); - let value = str::from_utf8(bytes).ok()?; - Some((field_id, document_id, value)) + let normalized_value = str::from_utf8(bytes).ok()?; + Some((field_id, document_id, normalized_value)) } } impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { type EItem = (FieldId, DocumentId, &'a str); - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option> { let mut bytes = Vec::new(); - FieldDocIdFacetStringCodec::serialize_into(*field_id, *document_id, value, &mut bytes); + FieldDocIdFacetStringCodec::serialize_into( + *field_id, + *document_id, + normalized_value, + &mut bytes, + ); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/index.rs b/milli/src/index.rs index b2be10767..efc31ab46 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -96,7 +96,7 @@ pub struct Index { /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, /// Maps the document id, the facet field id and the strings. - pub field_id_docid_facet_strings: Database, + pub field_id_docid_facet_strings: Database, /// Maps the document id to the document as an obkv store. 
pub documents: Database, ObkvCodec>, diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 91620da2a..d81f20732 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,6 +1,6 @@ use std::mem::size_of; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str, Unit}; use roaring::RoaringBitmap; use super::{Distinct, DocIter}; @@ -127,7 +127,7 @@ fn facet_number_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index @@ -144,14 +144,14 @@ fn facet_string_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index .field_id_docid_facet_strings .remap_key_type::() .prefix_iter(txn, &key)? - .remap_key_type::(); + .remap_types::(); Ok(iter) } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index fef4ecc87..7c9acf276 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -2,15 +2,16 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; use std::{cmp, fmt, mem}; -use heed::types::{ByteSlice, Unit}; -use heed::{BytesDecode, Database}; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::FacetStringLevelZeroCodec; +use crate::heed_codec::facet::{ + FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, +}; use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; -use crate::{DocumentId, FieldId, Index, Result}; +use crate::{FieldId, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -67,46 +68,55 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - fn fetch_facet_values<'t, KC, K: 't>( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - candidates: &RoaringBitmap, - distribution: &mut BTreeMap, - ) -> heed::Result<()> - where - K: fmt::Display, - KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, - { - let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); - - for docid in candidates.into_iter() { - key_buffer.truncate(mem::size_of::()); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = db - .remap_key_type::() - .prefix_iter(rtxn, &key_buffer)? - .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *distribution.entry(value.to_string()).or_insert(0) += 1; - } - } - - Ok(()) - } - match facet_type { FacetType::Number => { + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let db = self.index.field_id_docid_facet_f64s; - fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? 
+ .remap_key_type::<FieldDocIdFacetF64Codec>(); + + for result in iter { + let ((_, _, value), ()) = result?; + *distribution.entry(value.to_string()).or_insert(0) += 1; + } + } } FacetType::String => { + let mut normalized_distribution = BTreeMap::new(); + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let db = self.index.field_id_docid_facet_strings; - fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::<FieldId>()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::<ByteSlice>() + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::<FieldDocIdFacetStringCodec>(); + + for result in iter { + let ((_, _, normalized_value), original_value) = result?; + let (_, count) = normalized_distribution + .entry(normalized_value) + .or_insert_with(|| (original_value, 0)); + *count += 1; + } + } + + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); } } + + Ok(()) } /// There are too many documents, we use the facet levels to move through @@ -227,7 +237,6 @@ impl<'a> FacetDistribution<'a> { &mut distribution, )?; } - Ok(distribution) } None => self.facet_values_from_raw_facet_database(field_id), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 222f3b2d3..e9c1e507a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use chrono::Utc; use fst::IntoStreamer; -use heed::types::{ByteSlice, Unit}; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use serde_json::Value; @@ -419,15 +419,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } -fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( +fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( wtxn: &'a mut heed::RwTxn, - db: &heed::Database<C, Unit>, + db: &heed::Database<C, DC>, field_id: FieldId, to_remove: &RoaringBitmap, convert: F, ) -> heed::Result<()> where - C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, + C: heed::BytesDecode<'a, DItem = K>, + DC: heed::BytesDecode<'a, DItem = V>, F: Fn(K) -> DocumentId, { let mut iter = db @@ -436,7 +437,7 @@ where .remap_key_type::<C>(); while let Some(result) = iter.next() { - let (key, ()) = result?; + let (key, _) = result?; if to_remove.contains(convert(key)) { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()?
}; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 4c1071aab..1538295f9 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -65,7 +65,7 @@ pub struct Store<'s, A> { LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>, - facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, + facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>, facet_field_value_docids_limit: usize, // MTBL parameters chunk_compression_type: CompressionType, @@ -283,25 +283,33 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn insert_facet_string_values_docid( &mut self, field_id: FieldId, - value: String, + normalized_value: String, + original_value: String, id: DocumentId, ) -> Result<()> { - if value.is_empty() { + if normalized_value.is_empty() { return Ok(()); } let sorter = &mut self.field_id_docid_facet_strings_sorter; - Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; + Self::write_field_id_docid_facet_string_value( + sorter, + field_id, + id, + &normalized_value, + &original_value, + )?; - let key = (field_id, value); + let key = (field_id, normalized_value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_string_docids.get_refresh(&key) { - Some(old) => { + Some((_original_value, old)) => { old.insert(id); } None => { // A newly inserted element is appended at the end of the linked hash map. - self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); + self.facet_field_string_docids + .insert(key, (original_value, RoaringBitmap::from_iter(Some(id)))); // If the facet string docids just reached its capacity we must make sure to remove // one element, this way next time we insert we don't grow the capacity. if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { @@ -363,7 +371,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { document_id: DocumentId, words_positions: &mut HashMap<String, SmallVec32<Position>>, facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>, - facet_strings_values: &mut HashMap<FieldId, Vec<String>>, + facet_strings_values: &mut HashMap<FieldId, Vec<(String, String)>>, record: &[u8], ) -> Result<()> { // We compute the list of words pairs proximities (self-join) and write it directly to disk. @@ -399,8 +407,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // We store document_id associated with all the facet strings fields ids and values.
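The get_refresh dance above implements a small LRU-style bounded cache on top of the linked-hash-map crate: a hit moves the entry to the back, a miss appends at the back, and once the map reaches its limit the front (least recently refreshed) entry is evicted. A rough sketch of the same pattern; the capacity, key and value types here are illustrative, not the ones used by the store:

use linked_hash_map::LinkedHashMap;

const LIMIT: usize = 3; // illustrative capacity, not the store's real limit

fn insert_or_refresh(map: &mut LinkedHashMap<String, u32>, key: String) {
    // A hit moves the entry to the back of the map.
    match map.get_refresh(&key) {
        Some(count) => *count += 1,
        None => {
            // A miss appends the entry at the back of the map.
            map.insert(key, 1);
            if map.len() > LIMIT {
                // The front entry is the least recently refreshed one.
                map.pop_front();
            }
        }
    }
}

The store itself does not throw the evicted entry away, of course; it writes it to the on-disk sorter before removing it.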
for (field, values) in facet_strings_values.drain() { - for value in values { - self.insert_facet_string_values_docid(field, value, document_id)?; + for (normalized, original) in values { + self.insert_facet_string_values_docid(field, normalized, original, document_id)?; } } @@ -516,23 +524,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn write_facet_field_string_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>, + I: IntoIterator<Item = ((FieldId, String), (String, RoaringBitmap))>, Error: From, { let mut key_buffer = Vec::new(); let mut data_buffer = Vec::new(); - for ((field_id, value), docids) in iter { + for ((field_id, normalized_value), (original_value, docids)) in iter { key_buffer.clear(); data_buffer.clear(); - FacetStringLevelZeroCodec::serialize_into(field_id, &value, &mut key_buffer); + FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); if lmdb_key_valid_size(&key_buffer) { sorter.insert(&key_buffer, &data_buffer)?; } else { - warn!("facet value {:?} is too large to be saved", value); + warn!("facet value {:?} is too large to be saved", original_value); } } @@ -587,19 +595,24 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { sorter: &mut Sorter>, field_id: FieldId, document_id: DocumentId, - value: &str, + normalized_value: &str, + original_value: &str, ) -> Result<()> where Error: From, { let mut buffer = Vec::new(); - - FieldDocIdFacetStringCodec::serialize_into(field_id, document_id, value, &mut buffer); + FieldDocIdFacetStringCodec::serialize_into( + field_id, + document_id, + normalized_value, + &mut buffer, + ); if lmdb_key_valid_size(&buffer) { - sorter.insert(&buffer, &[])?; + sorter.insert(&buffer, original_value.as_bytes())?; } else { - warn!("facet value {:?} is too large to be saved", original_value); } Ok(()) @@ -929,24 +942,24 @@ fn process_tokens<'a>( .filter(|(_, t)| t.is_word()) } -fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) { +fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) { fn inner_extract_facet_values( value: &Value, can_recurse: bool, output_numbers: &mut Vec<f64>, - output_strings: &mut Vec<String>, + output_strings: &mut Vec<(String, String)>, ) { match value { Value::Null => (), - Value::Bool(b) => output_strings.push(b.to_string()), + Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), Value::Number(number) => { if let Some(float) = number.as_f64() { output_numbers.push(float); } } - Value::String(string) => { - let string = string.trim().to_lowercase(); - output_strings.push(string); + Value::String(original) => { + let normalized = original.trim().to_lowercase(); + output_strings.push((normalized, original.clone())); } Value::Array(values) => { if can_recurse { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c6540b33a..e4adbccb9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -276,8 +276,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match self.searchable_fields { Setting::Set(ref fields) => { // every time the searchable attributes are updated, we need to update the - // ids for any settings that uses the facets. (displayed_fields, - // filterable_fields) + // ids for any settings that use the facets. (distinct_fields, filterable_fields).
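For reference, the normalization rule that extract_facet_values applies above is a plain trim + lowercase, with the untouched original kept alongside for display. A minimal sketch of that rule, assuming nothing beyond what the Value::String arm shows:

fn normalize_facet_string(original: &str) -> (String, String) {
    // trim + lowercase, as in the Value::String arm above.
    let normalized = original.trim().to_lowercase();
    (normalized, original.to_string())
}

fn main() {
    let (normalized, original) = normalize_facet_string("  Science Fiction ");
    assert_eq!(normalized, "science fiction");
    assert_eq!(original, "  Science Fiction ");
}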
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_fields_ids_map = FieldsIdsMap::new(); From 0227254a65f9627099a74039932236522cff278e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 17 Jul 2021 12:50:01 +0200 Subject: [PATCH 0877/1889] Return the original string values for the inverted facet index database --- infos/src/main.rs | 4 +- .../facet_string_level_zero_value_codec.rs | 80 +++++++++++++++++++ .../facet_string_zero_bounds_value_codec.rs | 14 +--- milli/src/heed_codec/facet/mod.rs | 12 +++ milli/src/index.rs | 9 ++- milli/src/search/distinct/facet_distinct.rs | 11 +-- milli/src/search/distinct/mod.rs | 3 +- milli/src/search/facet/facet_distribution.rs | 24 ++++-- milli/src/search/facet/facet_string.rs | 26 ++++-- milli/src/search/facet/filter_condition.rs | 12 ++- milli/src/update/delete_documents.rs | 33 +++++++- milli/src/update/facets.rs | 30 +++++-- .../update/index_documents/merge_function.rs | 23 ++++++ milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/index_documents/store.rs | 16 ++-- 15 files changed, 242 insertions(+), 58 deletions(-) create mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index d5d1ad0af..da15251b0 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -627,14 +627,14 @@ fn facet_values_docids( FacetType::String => { wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { - let ((_fid, value), docids) = result?; + let ((_fid, normalized), (_original, docids)) = result?; let count = docids.len(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", docids.iter().collect::<Vec<_>>()) }; - wtr.write_record(&[value.to_string(), count.to_string(), docids])?; + wtr.write_record(&[normalized.to_string(), count.to_string(), docids])?; } } } diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs new file mode 100644 index 000000000..b2434d453 --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -0,0 +1,80 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::{marker, str}; + +use super::try_split_at; + +/// A codec that encodes a string in front of the value. +/// +/// The use case is for the facet string levels algorithm, where we must know the +/// original string of a normalized facet value; the original values are stored +/// in the value so as not to break the lexicographical ordering of the LMDB keys.
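Concretely, the encoding implemented below is a u16 big-endian length prefix, the string bytes, then the inner value's bytes. A hand-rolled sketch of that framing (not the codec itself), assuming the string fits in u16::MAX bytes and treating the inner value as opaque bytes:

use std::convert::TryInto;

fn encode(string: &str, value_bytes: &[u8]) -> Option<Vec<u8>> {
    // u16 big-endian length prefix, then the string, then the value bytes.
    let string_len: u16 = string.len().try_into().ok()?;
    let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len());
    bytes.extend_from_slice(&string_len.to_be_bytes());
    bytes.extend_from_slice(string.as_bytes());
    bytes.extend_from_slice(value_bytes);
    Some(bytes)
}

fn decode(bytes: &[u8]) -> Option<(&str, &[u8])> {
    if bytes.len() < 2 {
        return None;
    }
    let (len_bytes, rest) = bytes.split_at(2);
    let len = u16::from_be_bytes([len_bytes[0], len_bytes[1]]) as usize;
    if rest.len() < len {
        return None;
    }
    let (string, value) = rest.split_at(len);
    Some((std::str::from_utf8(string).ok()?, value))
}

fn main() {
    let bytes = encode("abc", &[1, 2, 3]).unwrap();
    assert_eq!(decode(&bytes), Some(("abc", &[1u8, 2, 3][..])));
}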
+pub struct FacetStringLevelZeroValueCodec(marker::PhantomData); + +impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec +where + C: heed::BytesDecode<'a>, +{ + type DItem = (&'a str, C::DItem); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (string_len, bytes) = try_split_at(bytes, 2)?; + let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?; + + let (string, bytes) = try_split_at(bytes, string_len as usize)?; + let string = str::from_utf8(string).ok()?; + + C::bytes_decode(bytes).map(|item| (string, item)) + } +} + +impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec +where + C: heed::BytesEncode<'a>, +{ + type EItem = (&'a str, C::EItem); + + fn bytes_encode((string, value): &'a Self::EItem) -> Option> { + let string_len: u16 = string.len().try_into().ok()?; + let value_bytes = C::bytes_encode(&value)?; + + let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); + bytes.extend_from_slice(&string_len.to_be_bytes()); + bytes.extend_from_slice(string.as_bytes()); + bytes.extend_from_slice(&value_bytes[..]); + + Some(Cow::Owned(bytes)) + } +} + +#[cfg(test)] +mod tests { + use heed::types::Unit; + use heed::{BytesDecode, BytesEncode}; + use roaring::RoaringBitmap; + + use super::*; + use crate::CboRoaringBitmapCodec; + + #[test] + fn deserialize_roaring_bitmaps() { + let string = "abc"; + let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); + let key = (string, docids.clone()); + let bytes = + FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_docids) = + FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_string, out_docids), (string, docids)); + } + + #[test] + fn deserialize_unit() { + let string = "def"; + let key = (string, ()); + let bytes = FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_unit) = + FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_string, out_unit), (string, ())); + } +} diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs index 6161118b6..337433c2b 100644 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -2,7 +2,9 @@ use std::borrow::Cow; use std::convert::TryInto; use std::{marker, str}; -/// A codec that encodes two strings in front of the value. +use super::try_split_at; + +/// A codec that optionally encodes two strings in front of the value. /// /// The usecase is for the facet string levels algorithm where we must /// know the origin of a group, the group left and right bounds are stored @@ -79,16 +81,6 @@ where } } -/// Tries to split a slice in half at the given middle point, -/// `None` if the slice is too short. 
-fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { - if slice.len() >= mid { - Some(slice.split_at(mid)) - } else { - None - } -} - #[cfg(test)] mod tests { use heed::types::Unit; diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index ecab7eb7c..a6a805bf7 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,6 +1,7 @@ mod facet_level_value_f64_codec; mod facet_level_value_u32_codec; mod facet_string_level_zero_codec; +mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; @@ -8,6 +9,17 @@ mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; + +/// Tries to split a slice in half at the given middle point, +/// `None` if the slice is too short. +pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if slice.len() >= mid { + Some(slice.split_at(mid)) + } else { + None + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index efc31ab46..f26643de7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -11,8 +11,8 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -90,8 +90,9 @@ pub struct Index { /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, - /// Maps the facet field id and the string with the docids that corresponds to it. - pub facet_id_string_docids: Database, + /// Maps the facet field id and the string with the original string and docids that corresponds to it. + pub facet_id_string_docids: + Database>, /// Maps the document id, the facet field id and the numbers. 
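The try_split_at helper, now shared from the facet module above, is simply a split_at that refuses to panic. A quick illustration of its contract:

fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
    // Identical to the helper above: None instead of a panic on short slices.
    if slice.len() >= mid {
        Some(slice.split_at(mid))
    } else {
        None
    }
}

fn main() {
    assert_eq!(try_split_at(&[1, 2, 3, 4], 2), Some((&[1u8, 2][..], &[3u8, 4][..])));
    assert_eq!(try_split_at(&[1, 2], 5), None);
}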
pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index d81f20732..4436d4cda 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,5 +1,6 @@ use std::mem::size_of; +use concat_arrays::concat_arrays; use heed::types::{ByteSlice, Str, Unit}; use roaring::RoaringBitmap; @@ -43,7 +44,10 @@ pub struct FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { - self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key)) + self.index + .facet_id_string_docids + .get(self.txn, &(self.distinct, key)) + .map(|result| result.map(|(_original, docids)| docids)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { @@ -116,10 +120,7 @@ impl<'a> FacetDistinctIter<'a> { } fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { - let mut key = [0; FID_SIZE + DOCID_SIZE]; - key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); - key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); - key + concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } fn facet_number_values<'a>( diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index ae3fdb91e..e7dc52a82 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -47,7 +47,7 @@ mod test { let mut documents = Vec::new(); - let txts = ["toto", "titi", "tata"]; + let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -90,7 +90,6 @@ mod test { addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); addition.update_format(UpdateFormat::Json); - addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7c9acf276..94f875dfc 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -23,7 +23,7 @@ const MAX_VALUES_BY_FACET: usize = 1000; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. 
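The concat_arrays! call introduced above is a drop-in replacement for the manual buffer juggling it removes; both build the same [u8; FID_SIZE + DOCID_SIZE] prefix key. The hand-written equivalent, assuming FieldId = u16 and DocumentId = u32:

const FID_SIZE: usize = 2; // size of a big-endian FieldId (u16 assumed)
const DOCID_SIZE: usize = 4; // size of a big-endian DocumentId (u32 assumed)

fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
    // Equivalent to concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()).
    let mut key = [0; FID_SIZE + DOCID_SIZE];
    key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
    key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
    key
}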
-const CANDIDATES_THRESHOLD: u64 = 35_000; +const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option>, @@ -72,6 +72,7 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); @@ -84,6 +85,9 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, _, value), ()) = result?; *distribution.entry(value.to_string()).or_insert(0) += 1; + if distribution.len() - distribution_prelength == self.max_values_by_facet { + break; + } } } } @@ -106,6 +110,10 @@ impl<'a> FacetDistribution<'a> { .entry(normalized_value) .or_insert_with(|| (original_value, 0)); *count += 1; + + if normalized_distribution.len() == self.max_values_by_facet { + break; + } } } @@ -154,10 +162,10 @@ impl<'a> FacetDistribution<'a> { FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; for result in iter { - let (value, mut docids) = result?; + let (_normalized, original, mut docids) = result?; docids &= candidates; if !docids.is_empty() { - distribution.insert(value.to_string(), docids.len()); + distribution.insert(original.to_string(), docids.len()); } if distribution.len() == self.max_values_by_facet { break; @@ -193,14 +201,20 @@ impl<'a> FacetDistribution<'a> { .prefix_iter(self.rtxn, &field_id.to_be_bytes())? .remap_key_type::(); + let mut normalized_distribution = BTreeMap::new(); for result in iter { - let ((_, value), docids) = result?; - distribution.insert(value.to_string(), docids.len()); + let ((_, normalized_value), (original_value, docids)) = result?; + normalized_distribution.insert(normalized_value, (original_value, docids.len())); if distribution.len() == self.max_values_by_facet { break; } } + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); + Ok(distribution) } diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index f0b527104..40ea8c04a 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -135,7 +135,8 @@ use heed::{Database, LazyDecode, RoRange}; use roaring::RoaringBitmap; use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec, + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::{FieldId, Index}; @@ -209,7 +210,11 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { /// /// It yields the facet string and the roaring bitmap associated with it. pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, + iter: RoRange< + 't, + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -252,18 +257,23 @@ impl<'t> FacetStringLevelZeroRange<'t> { let iter = db .remap_key_type::() .range(rtxn, &(left_bound, right_bound))? 
- .remap_types::(); + .remap_types::< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec + >(); Ok(FacetStringLevelZeroRange { iter }) } } impl<'t> Iterator for FacetStringLevelZeroRange<'t> { - type Item = heed::Result<(&'t str, RoaringBitmap)>; + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; fn next(&mut self) -> Option { match self.iter.next() { - Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))), + Some(Ok(((_fid, normalized), (original, docids)))) => { + Some(Ok((normalized, original, docids))) + } Some(Err(e)) => Some(Err(e)), None => None, } @@ -326,7 +336,7 @@ impl<'t> FacetStringIter<'t> { } impl<'t> Iterator for FacetStringIter<'t> { - type Item = heed::Result<(&'t str, RoaringBitmap)>; + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; fn next(&mut self) -> Option { 'outer: loop { @@ -377,11 +387,11 @@ impl<'t> Iterator for FacetStringIter<'t> { // level zero only for result in last { match result { - Ok((value, mut docids)) => { + Ok((normalized, original, mut docids)) => { docids &= &*documents_ids; if !docids.is_empty() { *documents_ids -= &docids; - return Some(Ok((value, docids))); + return Some(Ok((normalized, original, docids))); } } Err(e) => return Some(Err(e)), diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index c5ecb5a79..cc108f855 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -17,7 +17,9 @@ use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; use crate::error::UserError; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetStringLevelZeroCodec}; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, +}; use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] @@ -363,7 +365,10 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, numbers_db: heed::Database, - strings_db: heed::Database, + strings_db: heed::Database< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, field_id: FieldId, operator: &Operator, ) -> Result { @@ -374,7 +379,8 @@ impl FilterCondition { GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), Equal(number, string) => { - let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); + let (_original_value, string_docids) = + strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { Some(n) => { let n = Included(*n); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e9c1e507a..bcb7d7580 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -9,6 +9,7 @@ use serde_json::Value; use super::ClearDocuments; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; +use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; @@ -374,13 +375,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the facet field id values. 
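The string variant introduced below only rewrites an entry when some documents remain after the subtraction, and deletes it when none do. As a reminder of the RoaringBitmap arithmetic these helpers rely on, a tiny self-contained example:

use roaring::RoaringBitmap;

fn main() {
    let mut docids: RoaringBitmap = (0u32..10).collect();
    let to_remove: RoaringBitmap = (5u32..15).collect();

    let previous_len = docids.len();
    docids -= &to_remove; // in-place set difference, as in `docids -= to_remove` below

    assert_eq!(docids.len(), 5); // 0..5 remain, so the entry is rewritten
    assert!(docids.len() != previous_len && !docids.is_empty());
}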
- remove_docids_from_facet_field_id_value_docids( + remove_docids_from_facet_field_id_number_docids( self.wtxn, facet_id_f64_docids, &self.documents_ids, )?; - remove_docids_from_facet_field_id_value_docids( + remove_docids_from_facet_field_id_string_docids( self.wtxn, facet_id_string_docids, &self.documents_ids, @@ -447,7 +448,33 @@ where Ok(()) } -fn remove_docids_from_facet_field_id_value_docids<'a, C>( +fn remove_docids_from_facet_field_id_string_docids<'a, C>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database>, + to_remove: &RoaringBitmap, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, +{ + let mut iter = db.remap_key_type::().iter_mut(wtxn)?; + while let Some(result) = iter.next() { + let (bytes, (original_value, mut docids)) = result?; + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &(original_value, docids))? }; + } + } + + Ok(()) +} + +fn remove_docids_from_facet_field_id_number_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index d3bba6d6e..cb9a90f7e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -12,7 +12,7 @@ use roaring::RoaringBitmap; use crate::error::InternalError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, - FacetStringZeroBoundsValueCodec, + FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ @@ -75,7 +75,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; // Compute and store the faceted strings documents ids. - let string_documents_ids = compute_faceted_documents_ids( + let string_documents_ids = compute_faceted_strings_documents_ids( self.wtxn, self.index.facet_id_string_docids.remap_key_type::(), field_id, @@ -96,7 +96,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; // Compute and store the faceted numbers documents ids. - let number_documents_ids = compute_faceted_documents_ids( + let number_documents_ids = compute_faceted_numbers_documents_ids( self.wtxn, self.index.facet_id_f64_docids.remap_key_type::(), field_id, @@ -237,13 +237,26 @@ fn write_number_entry( Ok(()) } -fn compute_faceted_documents_ids( +fn compute_faceted_strings_documents_ids( + rtxn: &heed::RoTxn, + db: heed::Database>, + field_id: FieldId, +) -> Result { + let mut documents_ids = RoaringBitmap::new(); + for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { + let (_key, (_original_value, docids)) = result?; + documents_ids |= docids; + } + + Ok(documents_ids) +} + +fn compute_faceted_numbers_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: FieldId, ) -> Result { let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? 
{ let (_key, docids) = result?; documents_ids |= docids; @@ -265,7 +278,10 @@ fn clear_field_string_levels<'t>( fn compute_facet_string_levels<'t>( rtxn: &'t heed::RoTxn, - db: heed::Database, + db: heed::Database< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, compression_type: CompressionType, compression_level: Option, shrink_size: Option, @@ -299,7 +315,7 @@ fn compute_facet_string_levels<'t>( // Because we know the size of the level 0 we can use a range iterator that starts // at the first value of the level and goes to the last by simply counting. for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { - let ((_field_id, value), docids) = result?; + let ((_field_id, value), (_original_value, docids)) = result?; if i == 0 { left = (i as u32, value); diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 8613a8824..7e5d0b581 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -2,8 +2,11 @@ use std::borrow::Cow; use std::result::Result as StdResult; use fst::IntoStreamer; +use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; +use crate::error::SerializationError; +use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -69,6 +72,26 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result Ok(vec) } +/// Uses the FacetStringLevelZeroValueCodec to merge the values. +pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { + let (head, tail) = values.split_first().unwrap(); + let (head_string, mut head_rb) = + FacetStringLevelZeroValueCodec::::bytes_decode(&head[..]) + .ok_or(SerializationError::Decoding { db_name: None })?; + + for value in tail { + let (_string, rb) = + FacetStringLevelZeroValueCodec::::bytes_decode(&value[..]) + .ok_or(SerializationError::Decoding { db_name: None })?; + head_rb |= rb; + } + + FacetStringLevelZeroValueCodec::::bytes_encode(&(head_string, head_rb)) + .map(|cow| cow.into_owned()) + .ok_or(SerializationError::Encoding { db_name: None }) + .map_err(Into::into) +} + pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { let (head, tail) = values.split_first().unwrap(); let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9ac05fe1a..efe16def7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize}; pub use self::merge_function::{ cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, }; use self::store::{Readers, Store}; pub use self::transform::{Transform, TransformOutput}; @@ -655,7 +656,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_field_strings_docids_readers, - cbo_roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, write_method, )?; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 1538295f9..444b11e31 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -22,12 +22,13 @@ use tempfile::tempfile; use super::merge_function::{ 
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, }; use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; use crate::error::{Error, InternalError, SerializationError}; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; @@ -153,7 +154,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let facet_field_strings_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -528,17 +529,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Error: From, { let mut key_buffer = Vec::new(); - let mut data_buffer = Vec::new(); for ((field_id, normalized_value), (original_value, docids)) in iter { key_buffer.clear(); - data_buffer.clear(); FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); - CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); + + let data = (original_value.as_str(), docids); + let data = FacetStringLevelZeroValueCodec::::bytes_encode(&data) + .ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?; if lmdb_key_valid_size(&key_buffer) { - sorter.insert(&key_buffer, &data_buffer)?; + sorter.insert(&key_buffer, &data)?; } else { warn!("facet value {:?} is too large to be saved", original_value); } From aa02a7fdd8ca73f2474dcfd2b48fbc4e1447b95e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 22 Jul 2021 17:04:38 +0200 Subject: [PATCH 0878/1889] Add a test to check that we indeed impact the relevancy --- milli/src/update/settings.rs | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index e4adbccb9..a78cc9da8 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1066,4 +1066,53 @@ mod tests { builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); } + + #[test] + fn setting_impact_relevancy() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the genres setting + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { S("genres") }); + builder.execute(|_, _| ()).unwrap(); + + let content = &br#"[ + { + "id": 11, + "title": "Star Wars", + "overview": + "Princess Leia is captured and held hostage by the evil Imperial forces in their effort to take over the galactic Empire. 
Venturesome Luke Skywalker and dashing captain Han Solo team together with the loveable robot duo R2-D2 and C-3PO to rescue the beautiful princess and restore peace and justice in the Empire.", + "genres": ["Adventure", "Action", "Science Fiction"], + "poster": "https://image.tmdb.org/t/p/w500/6FfCtAuVAW8XJjZ7eWeLibRLWTw.jpg", + "release_date": 233366400 + }, + { + "id": 30, + "title": "Magnetic Rose", + "overview": "", + "genres": ["Animation", "Science Fiction"], + "poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg", + "release_date": 819676800 + } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // We now check that the new settings impact the search relevancy + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).query("S").execute().unwrap(); + let first_id = documents_ids[0]; + let documents = index.documents(&rtxn, documents_ids).unwrap(); + let (_, content) = documents.iter().find(|(id, _)| *id == first_id).unwrap(); + + let fid = index.fields_ids_map(&rtxn).unwrap().id("title").unwrap(); + let line = std::str::from_utf8(content.get(fid).unwrap()).unwrap(); + assert_eq!(line, r#""Star Wars""#); + } } From 92c0a2cdc1d5c6afeef84c409422af2152787cee Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 22 Jul 2021 17:14:44 +0200 Subject: [PATCH 0879/1889] Add a test that triggers a panic when indexing zeroes --- milli/src/update/index_documents/mod.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9ac05fe1a..cd48175f7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1380,4 +1380,26 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn index_documents_with_zeroes() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = r#"#id,title,au{hor,genre,price$
2,"Prideand Prejudice","Jane Austin","romance",3.5$
456,"Le Petit Prince","Antoine de Saint-Exupéry","adventure",10.0$
1,Wonderland","Lewis Carroll","fantasy",25.99$
4,"Harry Potter ing","fantasy\0lood Prince","J. K.
Rowling","fantasy\0, +"#; + + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Csv); + builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + } } From 0353fbb5df04146bd9297a5886a351e1d502be6f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 22 Jul 2021 16:23:36 +0200 Subject: [PATCH 0880/1889] Bump the tokenizer version to v0.2.4 --- Cargo.lock | 31 ++++++++++++++++++++++++++++--- milli/Cargo.toml | 2 +- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18d42029f..cd1913ec8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -990,7 +990,7 @@ dependencies = [ "jemallocator", "log", "maplit", - "meilisearch-tokenizer", + "meilisearch-tokenizer 0.2.3", "memmap", "milli", "once_cell", @@ -1353,7 +1353,23 @@ dependencies = [ "once_cell", "slice-group-by", "unicode-segmentation", - "whatlang", + "whatlang 0.9.0", +] + +[[package]] +name = "meilisearch-tokenizer" +version = "0.2.4" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.4#135d08dce465a756abaf6a1bcad70f315bda99b9" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang 0.12.0", ] [[package]] @@ -1404,7 +1420,7 @@ dependencies = [ "log", "logging_timer", "maplit", - "meilisearch-tokenizer", + "meilisearch-tokenizer 0.2.4", "memmap", "obkv", "once_cell", @@ -3087,6 +3103,15 @@ dependencies = [ "hashbrown 0.7.2", ] +[[package]] +name = "whatlang" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a346d2eb29c03618693ed24a29d1acd0c3f2cb08ae58b9669d7461e033cf703" +dependencies = [ + "hashbrown 0.7.2", +] + [[package]] name = "winapi" version = "0.2.8" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 6af928041..dc839a209 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -19,7 +19,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.4" } memmap = "0.7.0" obkv = "0.2.0" once_cell = "1.5.2" From 7aa6cc9b04ab78ca5374faf2e1118d7fa6b269da Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 22 Jul 2021 17:11:17 +0200 Subject: [PATCH 0881/1889] Do not insert fields in the map when changing the settings --- milli/src/index.rs | 50 ++++---------------- milli/src/search/criteria/asc_desc.rs | 32 +++++++------ milli/src/search/facet/facet_distribution.rs | 13 ++--- milli/src/search/mod.rs | 15 +++--- milli/src/update/delete_documents.rs | 14 +++--- milli/src/update/settings.rs | 18 ------- 6 files changed, 46 insertions(+), 96 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f26643de7..63da6b1e8 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -8,7 +8,7 @@ use heed::types::*; use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; +use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, @@ -353,15 +353,8 @@ impl Index { let 
fields_ids_map = self.fields_ids_map(rtxn)?; let mut fields_ids = Vec::new(); for name in fields.into_iter() { - match fields_ids_map.id(name) { - Some(field_id) => fields_ids.push(field_id), - None => { - return Err(FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "Index::displayed_fields_ids", - } - .into()) - } + if let Some(field_id) = fields_ids_map.id(name) { + fields_ids.push(field_id); } } Ok(Some(fields_ids)) @@ -403,15 +396,8 @@ impl Index { let fields_ids_map = self.fields_ids_map(rtxn)?; let mut fields_ids = Vec::new(); for name in fields { - match fields_ids_map.id(name) { - Some(field_id) => fields_ids.push(field_id), - None => { - return Err(FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "Index::searchable_fields_ids", - } - .into()) - } + if let Some(field_id) = fields_ids_map.id(name) { + fields_ids.push(field_id); } } Ok(Some(fields_ids)) @@ -451,17 +437,8 @@ impl Index { let mut fields_ids = HashSet::new(); for name in fields { - match fields_ids_map.id(&name) { - Some(field_id) => { - fields_ids.insert(field_id); - } - None => { - return Err(FieldIdMapMissingEntry::FieldName { - field_name: name, - process: "Index::filterable_fields_ids", - } - .into()) - } + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.insert(field_id); } } @@ -498,17 +475,8 @@ impl Index { let mut fields_ids = HashSet::new(); for name in fields.into_iter() { - match fields_ids_map.id(&name) { - Some(field_id) => { - fields_ids.insert(field_id); - } - None => { - return Err(FieldIdMapMissingEntry::FieldName { - field_name: name, - process: "Index::faceted_fields_ids", - } - .into()) - } + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.insert(field_id); } } diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 99d63c90d..4a664d042 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -6,7 +6,6 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetNumberIter; use crate::search::query_tree::Operation; @@ -20,7 +19,7 @@ pub struct AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn<'t>, field_name: String, - field_id: FieldId, + field_id: Option, ascending: bool, query_tree: Option, candidates: Box> + 't>, @@ -57,11 +56,11 @@ impl<'t> AscDesc<'t> { ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = - fields_ids_map.id(&field_name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { - field_name: field_name.clone(), - process: "AscDesc::new", - })?; + let field_id = fields_ids_map.id(&field_name); + let faceted_candidates = match field_id { + Some(field_id) => index.number_faceted_documents_ids(rtxn, field_id)?, + None => RoaringBitmap::default(), + }; Ok(AscDesc { index, @@ -72,7 +71,7 @@ impl<'t> AscDesc<'t> { query_tree: None, candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), - faceted_candidates: index.number_faceted_documents_ids(rtxn, field_id)?, + faceted_candidates, bucket_candidates: RoaringBitmap::new(), parent, }) @@ -132,13 +131,16 @@ impl<'t> Criterion for AscDesc<'t> { } self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = facet_ordered( - self.index, - self.rtxn, - self.field_id, - 
self.ascending, - candidates & &self.faceted_candidates, - )?; + self.candidates = match self.field_id { + Some(field_id) => facet_ordered( + self.index, + self.rtxn, + field_id, + self.ascending, + candidates & &self.faceted_candidates, + )?, + None => Box::new(std::iter::empty()), + }; } None => return Ok(None), }, diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 94f875dfc..bfbea76c3 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -5,7 +5,7 @@ use std::{cmp, fmt, mem}; use heed::types::ByteSlice; use roaring::RoaringBitmap; -use crate::error::{FieldIdMapMissingEntry, UserError}; +use crate::error::UserError; use crate::facet::FacetType; use crate::heed_codec::facet::{ FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, @@ -277,13 +277,10 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); for name in fields { - let fid = - fields_ids_map.id(&name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { - field_name: name.clone(), - process: "FacetDistribution::execute", - })?; - let values = self.facet_values(fid)?; - distribution.insert(name, values); + if let Some(fid) = fields_ids_map.id(&name) { + let values = self.facet_values(fid)?; + distribution.insert(name, values); + } } Ok(distribution) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 574459547..871f464ef 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,7 +18,6 @@ pub(crate) use self::facet::ParserRule; pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; -use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{DocumentId, Index, Result}; @@ -142,13 +141,13 @@ impl<'a> Search<'a> { None => self.perform_sort(NoopDistinct, matching_words, criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; - let id = - field_ids_map.id(name).ok_or_else(|| FieldIdMapMissingEntry::FieldName { - field_name: name.to_string(), - process: "distinct attribute", - })?; - let distinct = FacetDistinct::new(id, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) + match field_ids_map.id(name) { + Some(fid) => { + let distinct = FacetDistinct::new(fid, self.index, self.rtxn); + self.perform_sort(distinct, matching_words, criteria) + } + None => Ok(SearchResult::default()), + } } } } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index bcb7d7580..bd56688f6 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use serde_json::Value; use super::ClearDocuments; -use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; +use crate::error::{InternalError, UserError}; use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; @@ -82,11 +82,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { key: Some(main_key::PRIMARY_KEY_KEY), } })?; - let id_field = - fields_ids_map.id(primary_key).ok_or_else(|| FieldIdMapMissingEntry::FieldName { - field_name: primary_key.to_string(), - process: "DeleteDocuments::execute", - })?; + + // If we can't find the id of the primary key it means that the database + // 
is empty and it should be safe to return that we deleted 0 documents. + let id_field = match fields_ids_map.id(primary_key) { + Some(field) => field, + None => return Ok(0), + }; let Index { env: _env, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index a78cc9da8..743483613 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -235,15 +235,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_displayed(&mut self) -> Result { match self.displayed_fields { Setting::Set(ref fields) => { - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account let names: Vec<_> = fields.iter().unique().map(String::as_str).collect(); - - for name in names.iter() { - fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } self.index.put_displayed_fields(self.wtxn, &names)?; - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; @@ -256,11 +250,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_distinct_field(&mut self) -> Result { match self.distinct_field { Setting::Set(ref attr) => { - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - fields_ids_map.insert(attr).ok_or(UserError::AttributeLimitReached)?; - self.index.put_distinct_field(self.wtxn, &attr)?; - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; @@ -388,14 +378,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_facets = HashSet::new(); for name in fields { - fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; new_facets.insert(name.clone()); } self.index.put_filterable_fields(self.wtxn, &new_facets)?; - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; @@ -408,17 +395,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_criteria(&mut self) -> Result<()> { match self.criteria { Setting::Set(ref fields) => { - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_criteria = Vec::new(); for name in fields { let criterion: Criterion = name.parse()?; - if let Some(name) = criterion.field_name() { - fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } new_criteria.push(criterion); } self.index.put_criteria(self.wtxn, &new_criteria)?; - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } Setting::Reset => { self.index.delete_criteria(self.wtxn)?; From b12738cfe9f74efea0d474ffba05c947c275dd28 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 22 Jul 2021 19:18:22 +0200 Subject: [PATCH 0882/1889] Use the right DB prefixes to store the faceted fields --- milli/src/index.rs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 63da6b1e8..305a127ca 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -492,10 +492,10 @@ impl Index { field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; - buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - 
.copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; + buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] .copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -506,10 +506,10 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; - buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; + buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] .copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), @@ -524,10 +524,10 @@ impl Index { field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; - buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; + buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] .copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } @@ -538,10 +538,10 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] + let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] .copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? 
{ Some(docids) => Ok(docids), From 88646a63a16ad3e5f235c0ee69ac515599824d53 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 8 Jul 2021 10:53:32 +0200 Subject: [PATCH 0883/1889] update bors --- bors.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bors.toml b/bors.toml index bc33262b7..717fd69d1 100644 --- a/bors.toml +++ b/bors.toml @@ -1,7 +1,7 @@ status = [ 'Tests on ubuntu-18.04 with stable', 'Tests on macos-latest with stable', - 'Cargo check on Windows', + 'Tests on windows-latest with stable', 'Run Rustfmt', ] # 3 hours timeout From 0038b3848af82d4229caea75281b775516a7b792 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 7 Jul 2021 13:47:25 +0200 Subject: [PATCH 0884/1889] add a simple github cache --- .github/workflows/rust.yml | 38 +- Cargo.lock | 3233 ------------------------------------ 2 files changed, 19 insertions(+), 3252 deletions(-) delete mode 100644 Cargo.lock diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e87848f94..4fdad2db8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -16,13 +16,21 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-18.04, macos-latest] + os: [ubuntu-18.04, macos-latest, windows-latest] rust: - stable - beta - nightly steps: - uses: actions/checkout@v2 + - name: Cache dependencies + uses: actions/cache@v2 + with: + path: | + ~/.cargo + ./Cargo.lock + ./target + key: ${{ matrix.os }}-${{ matrix.rust }}-${{ hashFiles('Cargo.toml') }} - uses: actions-rs/toolchain@v1 with: profile: minimal @@ -32,34 +40,26 @@ jobs: uses: actions-rs/cargo@v1 with: command: check - args: --all --locked + args: --all - name: Run cargo test uses: actions-rs/cargo@v1 with: command: test - args: --locked --release - - # We don't run test on Windows since we get the following error: There is not enough space on the disk. - check-on-windows: - name: Cargo check on Windows - runs-on: windows-latest - steps: - - uses: actions/checkout@v2 - - name: Run cargo check without any default features - uses: actions-rs/cargo@v1 - with: - command: check - args: --all --locked - - name: Run cargo check with all default features - uses: actions-rs/cargo@v1 - with: - command: check + args: --release fmt: name: Run Rustfmt runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v2 + - name: Cache dependencies + uses: actions/cache@v2 + with: + path: | + ~/.cargo + ./Cargo.lock + ./target + key: ${{ matrix.os }}-${{ matrix.rust}}-${{ hashFiles('Cargo.toml') }} - uses: actions-rs/toolchain@v1 with: profile: minimal diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index cd1913ec8..000000000 --- a/Cargo.lock +++ /dev/null @@ -1,3233 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 3 - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "ahash" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" - -[[package]] -name = "aho-corasick" -version = "0.7.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" -dependencies = [ - "memchr", -] - -[[package]] -name = "anyhow" -version = "1.0.41" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15af2628f6890fe2609a3b91bef4c83450512802e59489f9c1cb1fa5df064a61" - -[[package]] -name = "arrayvec" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" - -[[package]] -name = "askama" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d298738b6e47e1034e560e5afe63aa488fea34e25ec11b855a76f0d7b8e73134" -dependencies = [ - "askama_derive", - "askama_escape", - "askama_shared", - "mime", - "mime_guess", -] - -[[package]] -name = "askama_derive" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522" -dependencies = [ - "askama_shared", - "proc-macro2 1.0.27", - "syn 1.0.73", -] - -[[package]] -name = "askama_escape" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90c108c1a94380c89d2215d0ac54ce09796823cca0fd91b299cfff3b33e346fb" - -[[package]] -name = "askama_shared" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2582b77e0f3c506ec4838a25fa8a5f97b9bed72bb6d3d272ea1c031d8bd373bc" -dependencies = [ - "askama_escape", - "humansize", - "nom", - "num-traits", - "percent-encoding", - "proc-macro2 1.0.27", - "quote 1.0.9", - "serde", - "syn 1.0.73", - "toml", -] - -[[package]] -name = "askama_warp" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c96f410ab17fa08f70b5fda07ce1112418642c914864961630808979343ea226" -dependencies = [ - "askama", - "warp", -] - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "autocfg" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" - -[[package]] -name = "base64" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" - -[[package]] -name = "base64" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" - -[[package]] -name = "benchmarks" -version = "0.1.0" -dependencies = [ - "anyhow", - "bytes 1.0.1", - "convert_case", - "criterion", - "flate2", - "heed", - "jemallocator", - "milli", - "reqwest", -] - -[[package]] -name = "big_s" -version = "1.0.2" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bitflags" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" - -[[package]] -name = "bitvec" -version = "0.19.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] - -[[package]] -name = "block-buffer" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" -dependencies = [ - "block-padding", - "byte-tools", - "byteorder", - "generic-array 0.12.4", -] - -[[package]] -name = "block-buffer" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" -dependencies = [ - "generic-array 0.14.4", -] - -[[package]] -name = "block-padding" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" -dependencies = [ - "byte-tools", -] - -[[package]] -name = "bstr" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90682c8d613ad3373e66de8c6411e0ae2ab2571e879d2efbf73558cc66f21279" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata", - "serde", -] - -[[package]] -name = "buf_redux" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" -dependencies = [ - "memchr", - "safemem", -] - -[[package]] -name = "bumpalo" -version = "3.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" - -[[package]] -name = "byte-tools" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" - -[[package]] -name = "byte-unit" -version = "4.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063197e6eb4b775b64160dedde7a0986bb2836cce140e9492e9e96f28e18bcd8" -dependencies = [ - "utf8-width", -] - -[[package]] -name = "bytemuck" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9966d2ab714d0f785dbac0a0396251a35280aeb42413281617d0209ab4898435" - -[[package]] -name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - -[[package]] -name = "bytes" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38" - -[[package]] -name = "bytes" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" - -[[package]] -name = "cast" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57cdfa5d50aad6cb4d44dcab6101a7f79925bd59d82ca42f38a9856a28865374" -dependencies = [ - "rustc_version", -] - -[[package]] -name = "cc" -version = "1.0.68" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787" -dependencies = [ - "jobserver", -] - -[[package]] -name = "cedarwood" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d" -dependencies = [ - "smallvec", -] - -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "character_converter" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" -dependencies = [ - "bincode", -] - -[[package]] -name = "chrono" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" -dependencies = [ - "libc", - "num-integer", - "num-traits", - "serde", - "time", - "winapi 0.3.9", -] - -[[package]] -name = "clap" -version = "2.33.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" -dependencies = [ - "bitflags", - "term_size", - "textwrap", - "unicode-width", -] - -[[package]] -name = "concat-arrays" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", -] - -[[package]] -name = "convert_case" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" - -[[package]] -name = "cow-utils" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" - -[[package]] -name = "cpufeatures" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66c99696f6c9dd7f35d486b9d04d7e6e202aa3e8c40d553f2fdf5e7e0c6a71ef" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32fast" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "criterion" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab327ed7354547cc2ef43cbe20ef68b988e70b4b593cbd66a2a61733123a3d23" -dependencies = [ - "atty", - "cast", - "clap", - "criterion-plot", - "csv", - "itertools 0.10.1", - "lazy_static", - "num-traits", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_cbor", - 
"serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e022feadec601fba1649cfa83586381a4ad31c6bf3a9ab7d408118b05dd9889d" -dependencies = [ - "cast", - "itertools 0.9.0", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.5", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-epoch", - "crossbeam-utils 0.8.5", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.5", - "lazy_static", - "memoffset", - "scopeguard", -] - -[[package]] -name = "crossbeam-queue" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" -dependencies = [ - "crossbeam-utils 0.6.6", -] - -[[package]] -name = "crossbeam-utils" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6" -dependencies = [ - "cfg-if 0.1.10", - "lazy_static", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" -dependencies = [ - "cfg-if 1.0.0", - "lazy_static", -] - -[[package]] -name = "csv" -version = "1.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" -dependencies = [ - "bstr", - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" -dependencies = [ - "memchr", -] - -[[package]] -name = "deunicode" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f37775d639f64aa16389eede0cbe6a70f56df4609d50d8b6858690d5d7bf8f2" - -[[package]] -name = "digest" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" -dependencies = [ - "generic-array 0.12.4", -] - -[[package]] -name = "digest" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" -dependencies = [ - "generic-array 0.14.4", -] - -[[package]] -name = "dtoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0" - -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - -[[package]] -name = "encoding_rs" -version = "0.8.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "fake-simd" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" - -[[package]] -name = "flate2" -version = "1.0.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" -dependencies = [ - "cfg-if 1.0.0", - "crc32fast", - "libc", - "miniz_oxide", -] - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "form_urlencoded" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" -dependencies = [ - "matches", - "percent-encoding", -] - -[[package]] -name = "fs_extra" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" - -[[package]] -name = "fst" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" - -[[package]] -name = "fuchsia-zircon" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" -dependencies = [ - "bitflags", - "fuchsia-zircon-sys", -] - -[[package]] -name = "fuchsia-zircon-sys" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" - -[[package]] -name = "funty" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" - -[[package]] -name = "futures" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7e43a803dae2fa37c1f6a8fe121e1f7bf9548b4dfc0522a42f34145dadfc27" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e682a68b29a882df0545c143dc3646daefe80ba479bcdede94d5a703de2871e2" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0402f765d8a89a26043b889b26ce3c4679d268fa6bb22cd7c6aad98340e179d1" - -[[package]] -name = "futures-executor" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "badaa6a909fac9e7236d0620a2f57f7664640c56575b71a7552fbd68deafab79" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"acc499defb3b348f8d8f3f66415835a9131856ff7714bf10dadfc4ec4bdb29a1" - -[[package]] -name = "futures-macro" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4c40298486cdf52cc00cd6d6987892ba502c7656a16a4192a9992b1ccedd121" -dependencies = [ - "autocfg", - "proc-macro-hack", - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", -] - -[[package]] -name = "futures-sink" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a57bead0ceff0d6dde8f465ecd96c9338121bb7717d3e7b108059531870c4282" - -[[package]] -name = "futures-task" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a16bef9fc1a4dddb5bee51c989e3fbba26569cbb0e31f5b303c184e3dd33dae" - -[[package]] -name = "futures-util" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "feb5c238d27e2bf94ffdfd27b2c29e3df4a68c4193bb6427384259e2bf191967" -dependencies = [ - "autocfg", - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite 0.2.7", - "pin-utils", - "proc-macro-hack", - "proc-macro-nested", - "slab", -] - -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] -name = "generic-array" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd" -dependencies = [ - "typenum", -] - -[[package]] -name = "generic-array" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - -[[package]] -name = "getrandom" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", -] - -[[package]] -name = "glob" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" - -[[package]] -name = "grenad" -version = "0.1.0" -source = "git+https://github.com/Kerollmops/grenad.git?rev=3adcb26#3adcb267dcbc590c7da10eb5f887a254865b3dbe" -dependencies = [ - "byteorder", - "flate2", - "log", - "nix", - "snap", - "tempfile", - "zstd", -] - -[[package]] -name = "h2" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e4728fd124914ad25e99e3d15a9361a879f6620f63cb56bbb08f95abb97a535" -dependencies = [ - "bytes 0.5.6", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio 0.2.25", - "tokio-util 0.3.1", - "tracing", - "tracing-futures", -] - -[[package]] -name = "h2" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" -dependencies = [ - "bytes 1.0.1", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio 1.7.1", - "tokio-util 0.6.7", - "tracing", -] - -[[package]] -name = "half" -version = "1.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" - -[[package]] -name = "hashbrown" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" -dependencies = [ - "ahash", - "autocfg", -] - -[[package]] -name = "hashbrown" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7afe4a420e3fe79967a00898cc1f4db7c8a49a9333a29f8a4bd76a253d5cd04" - -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" - -[[package]] -name = "headers" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0b7591fb62902706ae8e7aaff416b1b0fa2c0fd0878b46dc13baa3712d8a855" -dependencies = [ - "base64 0.13.0", - "bitflags", - "bytes 1.0.1", - "headers-core", - "http", - "mime", - "sha-1 0.9.6", - "time", -] - -[[package]] -name = "headers-core" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" -dependencies = [ - "http", -] - -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - -[[package]] -name = "heed" -version = "0.12.0" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#8e5dc6d71c8166a8d7d0db059e6e51478942b551" -dependencies = [ - "byteorder", - "heed-traits", - "heed-types", - "libc", - "lmdb-rkv-sys", - "once_cell", - "page_size", - "serde", - "synchronoise", - "url", - "zerocopy", -] - -[[package]] -name = "heed-traits" -version = "0.7.0" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#8e5dc6d71c8166a8d7d0db059e6e51478942b551" - -[[package]] -name = "heed-types" -version = "0.7.2" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#8e5dc6d71c8166a8d7d0db059e6e51478942b551" -dependencies = [ - "bincode", - "heed-traits", - "serde", - "serde_json", - "zerocopy", -] - -[[package]] -name = "helpers" -version = "0.7.2" -dependencies = [ - "anyhow", - "byte-unit", - "heed", - "jemallocator", - "milli", - "stderrlog", - "structopt", -] - -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - -[[package]] -name = "http" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" -dependencies = [ - "bytes 1.0.1", - "fnv", - "itoa", -] - -[[package]] -name = "http-body" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13d5ff830006f7646652e057693569bfe0d51760c0085a071769d142a205111b" -dependencies = [ - "bytes 0.5.6", - "http", -] - 
-[[package]] -name = "http-body" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60daa14be0e0786db0f03a9e57cb404c9d756eed2b6c62b9ea98ec5743ec75a9" -dependencies = [ - "bytes 1.0.1", - "http", - "pin-project-lite 0.2.7", -] - -[[package]] -name = "http-ui" -version = "0.7.2" -dependencies = [ - "anyhow", - "askama", - "askama_warp", - "byte-unit", - "bytes 0.5.6", - "crossbeam-channel", - "either", - "flate2", - "fst", - "funty", - "futures", - "grenad", - "heed", - "jemallocator", - "log", - "maplit", - "meilisearch-tokenizer 0.2.3", - "memmap", - "milli", - "once_cell", - "rayon", - "serde", - "serde_json", - "serde_test", - "stderrlog", - "structopt", - "tempfile", - "tokio 0.2.25", - "warp", -] - -[[package]] -name = "httparse" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68" - -[[package]] -name = "httpdate" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" - -[[package]] -name = "httpdate" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" - -[[package]] -name = "human_format" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0" - -[[package]] -name = "humansize" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02296996cb8796d7c6e3bc2d9211b7802812d36999a51bb754123ead7d37d026" - -[[package]] -name = "hyper" -version = "0.13.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a6f157065790a3ed2f88679250419b5cdd96e714a0d65f7797fd337186e96bb" -dependencies = [ - "bytes 0.5.6", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.2.7", - "http", - "http-body 0.3.1", - "httparse", - "httpdate 0.3.2", - "itoa", - "pin-project 1.0.7", - "socket2 0.3.19", - "tokio 0.2.25", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper" -version = "0.14.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07d6baa1b441335f3ce5098ac421fb6547c46dda735ca1bc6d0153c838f9dd83" -dependencies = [ - "bytes 1.0.1", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.3", - "http", - "http-body 0.4.2", - "httparse", - "httpdate 1.0.1", - "itoa", - "pin-project-lite 0.2.7", - "socket2 0.4.0", - "tokio 1.7.1", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-rustls" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64" -dependencies = [ - "futures-util", - "hyper 0.14.9", - "log", - "rustls", - "tokio 1.7.1", - "tokio-rustls", - "webpki", -] - -[[package]] -name = "idna" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" -dependencies = [ - "matches", - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "indexmap" -version = "1.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "824845a0bf897a9042383849b02c1bc219c2383772efcd5c6f9766fa4b81aef3" -dependencies = [ - "autocfg", - 
"hashbrown 0.9.1", -] - -[[package]] -name = "infos" -version = "0.7.2" -dependencies = [ - "anyhow", - "byte-unit", - "csv", - "heed", - "jemallocator", - "milli", - "roaring", - "serde_json", - "stderrlog", - "structopt", -] - -[[package]] -name = "input_buffer" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19a8a95243d5a0398cae618ec29477c6e3cb631152be5c19481f80bc71559754" -dependencies = [ - "bytes 0.5.6", -] - -[[package]] -name = "iovec" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" -dependencies = [ - "libc", -] - -[[package]] -name = "ipnet" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" - -[[package]] -name = "itertools" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" - -[[package]] -name = "jemalloc-sys" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" -dependencies = [ - "cc", - "fs_extra", - "libc", -] - -[[package]] -name = "jemallocator" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" -dependencies = [ - "jemalloc-sys", - "libc", -] - -[[package]] -name = "jieba-rs" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fea3b3172a80f9958abc3b9a637e4e311cd696dc6813440e5cc929b8a5311055" -dependencies = [ - "cedarwood", - "fxhash", - "hashbrown 0.11.2", - "lazy_static", - "phf", - "phf_codegen", - "regex", -] - -[[package]] -name = "jobserver" -version = "0.1.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "972f5ae5d1cb9c6ae417789196c803205313edde988685da5e3aae0827b9e7fd" -dependencies = [ - "libc", -] - -[[package]] -name = "js-sys" -version = "0.3.51" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062" -dependencies = [ - "wasm-bindgen", -] - -[[package]] -name = "kernel32-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "levenshtein_automata" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" -dependencies = [ - "fst", -] - 
-[[package]] -name = "lexical-core" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" -dependencies = [ - "arrayvec", - "bitflags", - "cfg-if 1.0.0", - "ryu", - "static_assertions", -] - -[[package]] -name = "libc" -version = "0.2.97" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6" - -[[package]] -name = "linked-hash-map" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" - -[[package]] -name = "lmdb-rkv-sys" -version = "0.15.0" -source = "git+https://github.com/meilisearch/lmdb-rs#d0b50d02938ee84e4e4372697ea991fe2a4cae3b" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "log" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "logging_timer" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40d0c249955c17c2f8f86b5f501b16d2509ebbe775f7b1d1d2b1ba85ade2a793" -dependencies = [ - "log", - "logging_timer_proc_macros", -] - -[[package]] -name = "logging_timer_proc_macros" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "482c2c28e6bcfe7c4274f82f701774d755e6aa873edfd619460fcd0966e0eb07" -dependencies = [ - "log", - "proc-macro2 0.4.30", - "quote 0.6.13", - "syn 0.15.44", -] - -[[package]] -name = "maplit" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" - -[[package]] -name = "matches" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" - -[[package]] -name = "meilisearch-tokenizer" -version = "0.2.3" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.3#c2399c3f879144ad92e20ae057e14984dfd22781" -dependencies = [ - "character_converter", - "cow-utils", - "deunicode", - "fst", - "jieba-rs", - "once_cell", - "slice-group-by", - "unicode-segmentation", - "whatlang 0.9.0", -] - -[[package]] -name = "meilisearch-tokenizer" -version = "0.2.4" -source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.4#135d08dce465a756abaf6a1bcad70f315bda99b9" -dependencies = [ - "character_converter", - "cow-utils", - "deunicode", - "fst", - "jieba-rs", - "once_cell", - "slice-group-by", - "unicode-segmentation", - "whatlang 0.12.0", -] - -[[package]] -name = "memchr" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" - -[[package]] -name = "memmap" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "memoffset" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" -dependencies = [ - "autocfg", -] - -[[package]] -name = "milli" -version = "0.7.2" 
-dependencies = [ - "big_s", - "bstr", - "byteorder", - "chrono", - "concat-arrays", - "csv", - "either", - "flate2", - "fst", - "fxhash", - "grenad", - "heed", - "human_format", - "itertools 0.10.1", - "levenshtein_automata", - "linked-hash-map", - "log", - "logging_timer", - "maplit", - "meilisearch-tokenizer 0.2.4", - "memmap", - "obkv", - "once_cell", - "ordered-float", - "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", - "pest_derive", - "rand 0.8.4", - "rayon", - "regex", - "roaring", - "serde", - "serde_json", - "slice-group-by", - "smallstr", - "smallvec", - "tempfile", - "tinytemplate", - "uuid", -] - -[[package]] -name = "mime" -version = "0.3.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" - -[[package]] -name = "mime_guess" -version = "2.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" -dependencies = [ - "mime", - "unicase", -] - -[[package]] -name = "miniz_oxide" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" -dependencies = [ - "adler", - "autocfg", -] - -[[package]] -name = "mio" -version = "0.6.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" -dependencies = [ - "cfg-if 0.1.10", - "fuchsia-zircon", - "fuchsia-zircon-sys", - "iovec", - "kernel32-sys", - "libc", - "log", - "miow 0.2.2", - "net2", - "slab", - "winapi 0.2.8", -] - -[[package]] -name = "mio" -version = "0.7.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" -dependencies = [ - "libc", - "log", - "miow 0.3.7", - "ntapi", - "winapi 0.3.9", -] - -[[package]] -name = "mio-named-pipes" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" -dependencies = [ - "log", - "mio 0.6.23", - "miow 0.3.7", - "winapi 0.3.9", -] - -[[package]] -name = "mio-uds" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" -dependencies = [ - "iovec", - "libc", - "mio 0.6.23", -] - -[[package]] -name = "miow" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" -dependencies = [ - "kernel32-sys", - "net2", - "winapi 0.2.8", - "ws2_32-sys", -] - -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "multipart" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d050aeedc89243f5347c3e237e3e13dc76fbe4ae3742a57b94dc14f69acf76d4" -dependencies = [ - "buf_redux", - "httparse", - "log", - "mime", - "mime_guess", - "quick-error", - "rand 0.7.3", - "safemem", - "tempfile", - "twoway", -] - -[[package]] -name = "net2" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "nix" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ccba0cfe4fdf15982d1674c69b1fd80bad427d293849982668dfe454bd61f2" -dependencies = [ - "bitflags", - "cc", - "cfg-if 1.0.0", - "libc", -] - -[[package]] -name = "nom" -version = "6.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7413f999671bd4745a7b624bd370a569fb6bc574b23c83a3c5ed2e453f3d5e2" -dependencies = [ - "bitvec", - "funty", - "lexical-core", - "memchr", - "version_check", -] - -[[package]] -name = "ntapi" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "num-integer" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" -dependencies = [ - "autocfg", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "obkv" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385" - -[[package]] -name = "once_cell" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" - -[[package]] -name = "oorandom" -version = "11.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" - -[[package]] -name = "opaque-debug" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" - -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - -[[package]] -name = "ordered-float" -version = "2.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f100fcfb41e5385e0991f74981732049f9b896821542a219420491046baafdc2" -dependencies = [ - "num-traits", -] - -[[package]] -name = "page_size" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "percent-encoding" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" - -[[package]] -name = "pest" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" -dependencies = [ 
- "ucd-trie", -] - -[[package]] -name = "pest" -version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" -dependencies = [ - "ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" -dependencies = [ - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "pest_generator", -] - -[[package]] -name = "pest_generator" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" -dependencies = [ - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "pest_meta", - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", -] - -[[package]] -name = "pest_meta" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" -dependencies = [ - "maplit", - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "sha-1 0.8.2", -] - -[[package]] -name = "phf" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_codegen" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" -dependencies = [ - "phf_shared", - "rand 0.7.3", -] - -[[package]] -name = "phf_shared" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project" -version = "0.4.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "918192b5c59119d51e0cd221f4d49dde9112824ba717369e903c97d076083d0f" -dependencies = [ - "pin-project-internal 0.4.28", -] - -[[package]] -name = "pin-project" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7509cc106041c40a4518d2af7a61530e1eed0e6285296a3d8c5472806ccc4a4" -dependencies = [ - "pin-project-internal 1.0.7", -] - -[[package]] -name = "pin-project-internal" -version = "0.4.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be26700300be6d9d23264c73211d8190e755b6b5ca7a1b28230025511b52a5e" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", -] - -[[package]] -name = "pin-project-internal" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48c950132583b500556b1efd71d45b319029f2b71518d979fcc208e16b42426f" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", -] - -[[package]] -name = "pin-project-lite" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "257b64915a082f7811703966789728173279bdebb956b143dbcd23f6f970a777" - -[[package]] -name = 
"pin-project-lite" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkg-config" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" - -[[package]] -name = "plotters" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07fffcddc1cb3a1de753caa4e4df03b79922ba43cf882acc1bdd7e8df9f4590" - -[[package]] -name = "plotters-svg" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b38a02e23bd9604b842a812063aec4ef702b57989c37b655254bb61c471ad211" -dependencies = [ - "plotters-backend", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "version_check", -] - -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" - -[[package]] -name = "proc-macro2" -version = "0.4.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" -dependencies = [ - "unicode-xid 0.1.0", -] - -[[package]] -name = "proc-macro2" -version = "1.0.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0d8caf72986c1a598726adc988bb5984792ef84f5ee5aa50209145ee8077038" -dependencies = [ - "unicode-xid 0.2.2", -] - -[[package]] -name = "quick-error" -version = "1.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" - -[[package]] -name = "quote" -version = "0.6.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" -dependencies = [ - "proc-macro2 0.4.30", -] - -[[package]] -name = "quote" -version = 
"1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" -dependencies = [ - "proc-macro2 1.0.27", -] - -[[package]] -name = "radium" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" - -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha 0.2.2", - "rand_core 0.5.1", - "rand_hc 0.2.0", - "rand_pcg", -] - -[[package]] -name = "rand" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.3", - "rand_hc 0.3.1", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.3", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", -] - -[[package]] -name = "rand_core" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" -dependencies = [ - "getrandom 0.2.3", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core 0.6.3", -] - -[[package]] -name = "rand_pcg" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rayon" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" -dependencies = [ - "autocfg", - "crossbeam-deque", - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-utils 0.8.5", - "lazy_static", - "num_cpus", -] - -[[package]] -name = "redox_syscall" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee" -dependencies = [ - 
"bitflags", -] - -[[package]] -name = "regex" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - -[[package]] -name = "regex-syntax" -version = "0.6.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" - -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "reqwest" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22" -dependencies = [ - "base64 0.13.0", - "bytes 1.0.1", - "encoding_rs", - "futures-core", - "futures-util", - "http", - "http-body 0.4.2", - "hyper 0.14.9", - "hyper-rustls", - "ipnet", - "js-sys", - "lazy_static", - "log", - "mime", - "percent-encoding", - "pin-project-lite 0.2.7", - "rustls", - "serde", - "serde_urlencoded 0.7.0", - "tokio 1.7.1", - "tokio-rustls", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", - "webpki-roots", - "winreg", -] - -[[package]] -name = "retain_mut" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9c17925a9027d298a4603d286befe3f9dc0e8ed02523141914eb628798d6e5b" - -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin", - "untrusted", - "web-sys", - "winapi 0.3.9", -] - -[[package]] -name = "roaring" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "536cfa885fc388b8ae69edf96d7970849b7d9c1395da1b8330f17715babf8a09" -dependencies = [ - "bytemuck", - "byteorder", - "retain_mut", -] - -[[package]] -name = "rustc_version" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" -dependencies = [ - "semver", -] - -[[package]] -name = "rustls" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" -dependencies = [ - "base64 0.13.0", - "log", - "ring", - "sct", - "webpki", -] - -[[package]] -name = "ryu" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" - -[[package]] -name = "safemem" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scoped-tls" -version = 
"1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "sct" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "search" -version = "0.7.2" -dependencies = [ - "anyhow", - "byte-unit", - "heed", - "jemallocator", - "log", - "milli", - "serde_json", - "stderrlog", - "structopt", -] - -[[package]] -name = "semver" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver-parser" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" -dependencies = [ - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", -] - -[[package]] -name = "serde" -version = "1.0.126" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_cbor" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622" -dependencies = [ - "half", - "serde", -] - -[[package]] -name = "serde_derive" -version = "1.0.126" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", -] - -[[package]] -name = "serde_json" -version = "1.0.64" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" -dependencies = [ - "indexmap", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "serde_test" -version = "1.0.126" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd1055d1c20532080b9da5040ec8e27425f4d4573d8e29eb19ba4ff1e4b9da2d" -dependencies = [ - "serde", -] - -[[package]] -name = "serde_urlencoded" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ec5d77e2d4c73717816afac02670d5c4f534ea95ed430442cad02e7a6e32c97" -dependencies = [ - "dtoa", - "itoa", - "serde", - "url", -] - -[[package]] -name = "serde_urlencoded" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "sha-1" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" -dependencies = [ - "block-buffer 0.7.3", - "digest 0.8.1", - "fake-simd", - "opaque-debug 0.2.3", -] - -[[package]] -name = "sha-1" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8c4cfa741c5832d0ef7fab46cabed29c2aae926db0b11bb2069edd8db5e64e16" -dependencies = [ - "block-buffer 0.9.0", - "cfg-if 1.0.0", - "cpufeatures", - "digest 0.9.0", - "opaque-debug 0.3.0", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" -dependencies = [ - "libc", -] - -[[package]] -name = "siphasher" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27" - -[[package]] -name = "slab" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f173ac3d1a7e3b28003f40de0b5ce7fe2710f9b9dc3fc38664cebee46b3b6527" - -[[package]] -name = "slice-group-by" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb" - -[[package]] -name = "smallstr" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e922794d168678729ffc7e07182721a14219c65814e66e91b839a272fe5ae4f" -dependencies = [ - "serde", - "smallvec", -] - -[[package]] -name = "smallvec" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" - -[[package]] -name = "snap" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45456094d1983e2ee2a18fdfebce3189fa451699d0502cb8e3b49dba5ba41451" - -[[package]] -name = "socket2" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "socket2" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e3dfc207c526015c632472a77be09cf1b6e46866581aecae5cc38fb4235dea2" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - -[[package]] -name = "stderrlog" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a53e2eff3e94a019afa6265e8ee04cb05b9d33fe9f5078b14e4e391d155a38" -dependencies = [ - "atty", - "chrono", - "log", - "termcolor", - "thread_local", -] - -[[package]] -name = "structopt" -version = "0.3.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5277acd7ee46e63e5168a80734c9f6ee81b1367a7d8772a2d765df2a3705d28c" -dependencies = [ - "clap", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90" -dependencies = [ - "heck", - "proc-macro-error", - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", -] - -[[package]] -name = "syn" -version = "0.15.44" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ca4b3b69a77cbe1ffc9e198781b7acb0c7365a883670e8f1c1bc66fba79a5c5" -dependencies = [ - "proc-macro2 0.4.30", - "quote 0.6.13", - "unicode-xid 0.1.0", -] - -[[package]] -name = "syn" -version = "1.0.73" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "unicode-xid 0.2.2", -] - -[[package]] -name = "synchronoise" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d717ed0efc9d39ab3b642a096bc369a3e02a38a51c41845d7fe31bdad1d6eaeb" -dependencies = [ - "crossbeam-queue", -] - -[[package]] -name = "synstructure" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b834f2d66f734cb897113e34aaff2f1ab4719ca946f9a7358dba8f8064148701" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", - "unicode-xid 0.2.2", -] - -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - -[[package]] -name = "tempfile" -version = "3.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "rand 0.8.4", - "redox_syscall", - "remove_dir_all", - "winapi 0.3.9", -] - -[[package]] -name = "term_size" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "termcolor" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "term_size", - "unicode-width", -] - -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "time" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi 0.3.9", -] - -[[package]] -name = "tinytemplate" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d3dc76004a03cec1c5932bca4cdc2e39aaa798e3f82363dd94f9adf6098c12f" -dependencies = [ - "serde", - "serde_json", -] - -[[package]] -name = "tinyvec" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b5220f05bb7de7f3f53c7c065e1199b3172696fe2db9f9c4d8ad9b4ee74c342" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" - -[[package]] -name = "tokio" 
-version = "0.2.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6703a273949a90131b290be1fe7b039d0fc884aa1935860dfcbe056f28cd8092" -dependencies = [ - "bytes 0.5.6", - "fnv", - "futures-core", - "iovec", - "lazy_static", - "libc", - "memchr", - "mio 0.6.23", - "mio-named-pipes", - "mio-uds", - "num_cpus", - "pin-project-lite 0.1.12", - "signal-hook-registry", - "slab", - "tokio-macros", - "winapi 0.3.9", -] - -[[package]] -name = "tokio" -version = "1.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fb2ed024293bb19f7a5dc54fe83bf86532a44c12a2bb8ba40d64a4509395ca2" -dependencies = [ - "autocfg", - "bytes 1.0.1", - "libc", - "memchr", - "mio 0.7.13", - "num_cpus", - "pin-project-lite 0.2.7", - "winapi 0.3.9", -] - -[[package]] -name = "tokio-macros" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", -] - -[[package]] -name = "tokio-rustls" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" -dependencies = [ - "rustls", - "tokio 1.7.1", - "webpki", -] - -[[package]] -name = "tokio-tungstenite" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d9e878ad426ca286e4dcae09cbd4e1973a7f8987d97570e2469703dd7f5720c" -dependencies = [ - "futures-util", - "log", - "pin-project 0.4.28", - "tokio 0.2.25", - "tungstenite", -] - -[[package]] -name = "tokio-util" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be8242891f2b6cbef26a2d7e8605133c2c554cd35b3e4948ea892d6d68436499" -dependencies = [ - "bytes 0.5.6", - "futures-core", - "futures-sink", - "log", - "pin-project-lite 0.1.12", - "tokio 0.2.25", -] - -[[package]] -name = "tokio-util" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592" -dependencies = [ - "bytes 1.0.1", - "futures-core", - "futures-sink", - "log", - "pin-project-lite 0.2.7", - "tokio 1.7.1", -] - -[[package]] -name = "toml" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" -dependencies = [ - "serde", -] - -[[package]] -name = "tower-service" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" - -[[package]] -name = "tracing" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" -dependencies = [ - "cfg-if 1.0.0", - "log", - "pin-project-lite 0.2.7", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9ff14f98b1a4b289c6248a023c1c2fa1491062964e9fed67ab29c4e4da4a052" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project 1.0.7", - "tracing", -] - -[[package]] -name = 
"try-lock" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" - -[[package]] -name = "tungstenite" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0308d80d86700c5878b9ef6321f020f29b1bb9d5ff3cab25e75e23f3a492a23" -dependencies = [ - "base64 0.12.3", - "byteorder", - "bytes 0.5.6", - "http", - "httparse", - "input_buffer", - "log", - "rand 0.7.3", - "sha-1 0.9.6", - "url", - "utf-8", -] - -[[package]] -name = "twoway" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" -dependencies = [ - "memchr", -] - -[[package]] -name = "typenum" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" - -[[package]] -name = "ucd-trie" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" - -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] - -[[package]] -name = "unicode-bidi" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeb8be209bb1c96b7c177c7420d26e04eccacb0eeae6b980e35fcb74678107e0" -dependencies = [ - "matches", -] - -[[package]] -name = "unicode-normalization" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "unicode-segmentation" -version = "1.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0d2e7be6ae3a5fa87eed5fb451aff96f2573d2694942e40543ae0bbe19c796" - -[[package]] -name = "unicode-width" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" - -[[package]] -name = "unicode-xid" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" - -[[package]] -name = "unicode-xid" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" - -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - -[[package]] -name = "url" -version = "2.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" -dependencies = [ - "form_urlencoded", - "idna", - "matches", - "percent-encoding", -] - -[[package]] -name = "urlencoding" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a1f0175e03a0973cf4afd476bef05c26e228520400eb1fd473ad417b1c00ffb" - -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - -[[package]] -name = "utf8-width" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cf7d77f457ef8dfa11e4cd5933c5ddb5dc52a94664071951219a97710f0a32b" - -[[package]] -name = "uuid" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" -dependencies = [ - "getrandom 0.2.3", -] - -[[package]] -name = "version_check" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" - -[[package]] -name = "walkdir" -version = "2.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" -dependencies = [ - "same-file", - "winapi 0.3.9", - "winapi-util", -] - -[[package]] -name = "want" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" -dependencies = [ - "log", - "try-lock", -] - -[[package]] -name = "warp" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f41be6df54c97904af01aa23e613d4521eed7ab23537cede692d4058f6449407" -dependencies = [ - "bytes 0.5.6", - "futures", - "headers", - "http", - "hyper 0.13.10", - "log", - "mime", - "mime_guess", - "multipart", - "pin-project 0.4.28", - "scoped-tls", - "serde", - "serde_json", - "serde_urlencoded 0.6.1", - "tokio 0.2.25", - "tokio-tungstenite", - "tower-service", - "tracing", - "tracing-futures", - "urlencoding", -] - -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - -[[package]] -name = "wasm-bindgen" -version = "0.2.74" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd" -dependencies = [ - "cfg-if 1.0.0", - "serde", - "serde_json", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.74" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900" -dependencies = [ - "bumpalo", - "lazy_static", - "log", - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fba7978c679d53ce2d0ac80c8c175840feb849a161664365d1287b41f2e67f1" -dependencies = [ - "cfg-if 1.0.0", - "js-sys", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.74" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4" -dependencies = [ - "quote 1.0.9", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.74" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97" -dependencies = [ - "proc-macro2 1.0.27", - "quote 1.0.9", - "syn 1.0.73", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.74" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f" - -[[package]] -name = "web-sys" -version = "0.3.51" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "webpki" -version = "0.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "webpki-roots" -version = "0.21.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940" -dependencies = [ - "webpki", -] - -[[package]] -name = "whatlang" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075" -dependencies = [ - "hashbrown 0.7.2", -] - -[[package]] -name = "whatlang" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a346d2eb29c03618693ed24a29d1acd0c3f2cb08ae58b9669d7461e033cf703" -dependencies = [ - "hashbrown 0.7.2", -] - -[[package]] -name = "winapi" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-build" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "winreg" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "ws2_32-sys" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "wyz" -version = "0.2.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" - -[[package]] -name = "zerocopy" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" -dependencies = [ - "byteorder", - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" -dependencies = [ - "proc-macro2 1.0.27", - "syn 1.0.73", - "synstructure", -] - -[[package]] -name = "zstd" -version = "0.5.4+zstd.1.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69996ebdb1ba8b1517f61387a883857818a66c8a295f487b1ffd8fd9d2c82910" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "2.0.6+zstd.1.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98aa931fb69ecee256d44589d19754e61851ae4769bf963b385119b1cc37a49e" -dependencies = [ - "libc", - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "1.4.18+zstd.1.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1e6e8778706838f43f771d80d37787cb2fe06dafe89dd3aebaf6721b9eaec81" -dependencies = [ - "cc", - "glob", - "itertools 0.9.0", - "libc", -] From dc2b63abdf00b2d52c797028fe0b08f24d235f44 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 27 Jul 2021 16:24:21 +0200 Subject: [PATCH 0885/1889] Introduce an empty FilterCondition variant to support unknown fields --- milli/src/search/facet/filter_condition.rs | 111 ++++++++++++--------- milli/src/update/index_documents/mod.rs | 7 +- milli/src/update/settings.rs | 5 +- 3 files changed, 71 insertions(+), 52 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index cc108f855..5ca9f7e5a 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -6,6 +6,7 @@ use std::str::FromStr; use either::Either; use heed::types::DecodeIgnore; +use itertools::Itertools; use log::debug; use pest::error::{Error as PestError, ErrorVariant}; use pest::iterators::{Pair, Pairs}; @@ -54,6 +55,7 @@ pub enum FilterCondition { Operator(FieldId, Operator), Or(Box, Box), And(Box, Box), + Empty, } impl FilterCondition { @@ -108,7 +110,7 @@ impl FilterCondition { expression: &str, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let filterable_fields = index.filterable_fields_ids(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; let lexed = FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) @@ -116,7 +118,7 @@ impl FilterCondition { fn from_pairs( fim: &FieldsIdsMap, - ff: &HashSet, + ff: &HashSet, expression: Pairs, ) -> Result { PREC_CLIMBER.climb( @@ -150,17 +152,22 @@ impl FilterCondition { }, Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), + Empty => Empty, } } fn between( fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, + filterable_fields: &HashSet, item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)?; + let fid = match 
field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); @@ -173,12 +180,16 @@ impl FilterCondition { fn equal( fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet<FieldId>, + filterable_fields: &HashSet<String>, item: Pair<Rule>, ) -> Result<FilterCondition> { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)?; + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); @@ -189,12 +200,16 @@ impl FilterCondition { fn greater_than( fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet<FieldId>, + filterable_fields: &HashSet<String>, item: Pair<Rule>, ) -> Result<FilterCondition> { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)?; + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -205,12 +220,16 @@ impl FilterCondition { fn greater_than_or_equal( fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet<FieldId>, + filterable_fields: &HashSet<String>, item: Pair<Rule>, ) -> Result<FilterCondition> { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)?; + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -221,12 +240,16 @@ impl FilterCondition { fn lower_than( fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet<FieldId>, + filterable_fields: &HashSet<String>, item: Pair<Rule>, ) -> Result<FilterCondition> { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)?; + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -237,12 +260,16 @@ impl FilterCondition { fn lower_than_or_equal( fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet<FieldId>, + filterable_fields: &HashSet<String>, item: Pair<Rule>, ) -> Result<FilterCondition> { let mut items = item.into_inner(); - let fid = field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)?; + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); @@ -461,57 +488,41 @@ impl FilterCondition { let rhs = rhs.evaluate(rtxn, index)?; Ok(lhs & rhs) } + Empty => Ok(RoaringBitmap::new()), } } } -/// Retrieve the field id base on the pest value, returns an error is -/// the field does not exist or is not filterable. +/// Retrieve the field id based on the pest value.
+/// +/// Returns an error if the given value is not filterable. +/// +/// Returns Ok(None) if the given value is filterable, but is not yet associated with a field_id. /// /// The pest pair is simply a string associated with a span, a location to highlight in /// the error message. fn field_id( fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet<FieldId>, + filterable_fields: &HashSet<String>, items: &mut Pairs<Rule>, -) -> StdResult<FieldId, PestError<Rule>> { +) -> StdResult<Option<FieldId>, PestError<Rule>> { // lexing ensures that we at least have a key let key = items.next().unwrap(); - let field_id = match fields_ids_map.id(key.as_str()) { - Some(field_id) => field_id, - None => { - return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` not found, available attributes are: {}", - key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", "), - ), - }, - key.as_span(), - )) - } - }; - - if !filterable_fields.contains(&field_id) { + if !filterable_fields.contains(key.as_str()) { return Err(PestError::new_from_span( ErrorVariant::CustomError { message: format!( "attribute `{}` is not filterable, available filterable attributes are: {}", key.as_str(), - filterable_fields - .iter() - .flat_map(|id| { fields_ids_map.name(*id) }) - .collect::<Vec<_>>() - .join(", "), + filterable_fields.iter().join(", "), ), }, key.as_span(), )); } - Ok(field_id) + Ok(fields_ids_map.id(key.as_str())) } /// Tries to parse the pest pair into the type `T` specified, always returns @@ -552,6 +563,9 @@ mod tests { // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); + let mut map = index.fields_ids_map(&wtxn).unwrap(); + map.insert("channel"); + index.put_fields_ids_map(&mut wtxn, &map).unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_filterable_fields(hashset! { S("channel") }); builder.execute(|_, _| ()).unwrap(); @@ -581,6 +595,9 @@ mod tests { // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); + let mut map = index.fields_ids_map(&wtxn).unwrap(); + map.insert("timestamp"); + index.put_fields_ids_map(&mut wtxn, &map).unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_filterable_fields(hashset! { "timestamp".into() }); builder.execute(|_, _| ()).unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index efe16def7..aae1e4eb4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -391,6 +391,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { documents_file, } = output; + // The fields_ids_map is put back to the store now so the rest of the transaction sees an + // up-to-date field map. + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + // We delete the documents that this document addition replaces. This way we are // able to simply insert all the documents even if they already exist in the database.
if !replaced_documents_ids.is_empty() { @@ -596,9 +600,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { debug!("Writing using the write method: {:?}", write_method); - // We write the fields ids map into the main database - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; - // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 743483613..07bdfd6fa 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -674,7 +674,8 @@ mod tests { let count = index .facet_id_f64_docids .remap_key_type::() - .prefix_iter(&rtxn, &[0, 0]) + // The faceted field id is 2u16 + .prefix_iter(&rtxn, &[0, 2, 0]) .unwrap() .count(); assert_eq!(count, 3); @@ -700,7 +701,7 @@ mod tests { let count = index .facet_id_f64_docids .remap_key_type::() - .prefix_iter(&rtxn, &[0, 0]) + .prefix_iter(&rtxn, &[0, 2, 0]) .unwrap() .count(); assert_eq!(count, 4); From 6a141694da58fb16f83da06f4194c154d3fd72f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 27 Jul 2021 16:38:42 +0200 Subject: [PATCH 0886/1889] Update version for the next release (v0.8.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 6fad00a22..c57f738ff 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.7.2" +version = "0.8.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 9f425af3f..3c1ccc1d7 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.7.2" +version = "0.8.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index b257e6010..c8873fcd8 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.7.2" +version = "0.8.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index dc839a209..ca297050a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.7.2" +version = "0.8.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 83722c516..f793869ab 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.7.2" +version = "0.8.0" authors = ["Clément Renault "] edition = "2018" From 90514e03d151720bac22484cfcd232aa9c5869ba Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 29 Jul 2021 15:49:23 +0200 Subject: [PATCH 0887/1889] Fix invalid faceted documents ids buffer size --- milli/src/index.rs | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 305a127ca..120bcbadf 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; +use std::mem::size_of; use std::path::Path; use chrono::{DateTime, Utc}; @@ -492,7 +493,8 @@ impl Index { field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; 
main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; + let mut buffer = + [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] @@ -506,7 +508,8 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; + let mut buffer = + [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] @@ -524,7 +527,8 @@ impl Index { field_id: FieldId, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 2]; + let mut buffer = + [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] @@ -538,7 +542,8 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - let mut buffer = [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + let mut buffer = + [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] From 341c244965eef46abc849c28bc491dd042ca07f4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 29 Jul 2021 15:56:36 +0200 Subject: [PATCH 0888/1889] Bump milli to v0.8.1 --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index c57f738ff..420008f5e 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.8.0" +version = "0.8.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 3c1ccc1d7..61284831c 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.8.0" +version = "0.8.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index c8873fcd8..2edf13850 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.8.0" +version = "0.8.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ca297050a..ac51c94be 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.8.0" +version = "0.8.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index f793869ab..34e2b16f6 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.8.0" +version = "0.8.1" authors = ["Clément Renault "] edition = "2018" From 976dc1f4bc7f5008350e9a6b8155b27a3f39b12f Mon Sep 17 00:00:00 2001 From: 
Tamo Date: Thu, 29 Jul 2021 14:30:33 +0200 Subject: [PATCH 0889/1889] prefix the search benchmarks with 'search' --- .github/workflows/benchmarks.yml | 4 ++-- benchmarks/Cargo.toml | 4 ++-- benchmarks/benches/{songs.rs => search_songs.rs} | 0 benchmarks/benches/{wiki.rs => search_wiki.rs} | 0 4 files changed, 4 insertions(+), 4 deletions(-) rename benchmarks/benches/{songs.rs => search_songs.rs} (100%) rename benchmarks/benches/{wiki.rs => search_wiki.rs} (100%) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 02f54fe13..c1475e281 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -4,9 +4,9 @@ on: workflow_dispatch: inputs: dataset_name: - description: 'The name of the dataset used to benchmark (songs, wiki or indexing)' + description: 'The name of the dataset used to benchmark (search_songs, search_wiki or indexing)' required: false - default: 'songs' + default: 'search_songs' env: HOME: "/home/runner" # The actions-rs/toolchain@v1 can fail we have no $HOME defined diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index dd319b4e6..9e380b9a8 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -22,11 +22,11 @@ convert_case = "0.4" reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false } [[bench]] -name = "songs" +name = "search_songs" harness = false [[bench]] -name = "wiki" +name = "search_wiki" harness = false [[bench]] diff --git a/benchmarks/benches/songs.rs b/benchmarks/benches/search_songs.rs similarity index 100% rename from benchmarks/benches/songs.rs rename to benchmarks/benches/search_songs.rs diff --git a/benchmarks/benches/wiki.rs b/benchmarks/benches/search_wiki.rs similarity index 100% rename from benchmarks/benches/wiki.rs rename to benchmarks/benches/search_wiki.rs From 7eb2d71009c79065843e46c4fc0b701cc52e04bc Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 29 Jul 2021 14:31:00 +0200 Subject: [PATCH 0890/1889] fix the benchmarks --- benchmarks/benches/utils.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index fd1df0a90..5318527f4 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -115,5 +115,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { }); } group.finish(); + + index.prepare_for_closing().wait(); } } From cdeb07f0fde90a1ed03ad2d97ad9de16294a32df Mon Sep 17 00:00:00 2001 From: many Date: Tue, 3 Aug 2021 17:26:39 +0200 Subject: [PATCH 0891/1889] Fix prefix level position docids database The prefix search was inverted when we generated the DB. Instead of checking whether a word had a prefix in the prefix fst, we were checking whether the word was itself a prefix of one of the prefixes contained in the fst. The indexer now iterates over the prefixes contained in the fst and searches for them by prefix in the word-level-position-docids database, aggregating the matches in a sorter.
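To make the corrected strategy concrete, here is a minimal, self-contained Rust sketch. A BTreeMap plays the role of the LMDB word-level-position-docids database and a plain slice plays the role of the words prefixes fst; the real indexer streams both through heed and fst and merges roaring bitmaps in a grenad sorter (via cbo_roaring_bitmap_merge), so the names and types below are stand-ins rather than the actual milli API.

use std::collections::BTreeMap;

fn main() {
    // Stand-in for the word-level-position-docids database: word -> document ids,
    // kept sorted by key exactly like the LMDB database is.
    let mut db = BTreeMap::new();
    db.insert("hello".to_string(), vec![1u32, 2]);
    db.insert("help".to_string(), vec![2, 3]);
    db.insert("world".to_string(), vec![4]);

    // Stand-in for the words prefixes fst.
    let prefixes = ["he", "wo"];

    // For every prefix, jump to the first word >= prefix and scan forward while
    // the words still start with it, aggregating their document ids under the
    // prefix itself (the real code pushes them into a grenad sorter instead).
    let mut aggregated: BTreeMap<&str, Vec<u32>> = BTreeMap::new();
    for prefix in prefixes.iter().copied() {
        let matching =
            db.range(prefix.to_string()..).take_while(|(word, _)| word.starts_with(prefix));
        for (_word, docids) in matching {
            aggregated.entry(prefix).or_default().extend(docids.iter().copied());
        }
    }

    assert_eq!(aggregated["he"], vec![1, 2, 2, 3]);
    assert_eq!(aggregated["wo"], vec![4]);
}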
Fix #299 --- milli/src/update/words_level_positions.rs | 29 +++++++++++++++-------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index c656d7105..2f0995c18 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -3,16 +3,16 @@ use std::fs::File; use std::num::NonZeroU32; use std::{cmp, str}; -use fst::automaton::{self, Automaton}; -use fst::{IntoStreamer, Streamer}; +use fst::Streamer; use grenad::{CompressionType, FileFuse, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use crate::error::InternalError; +use crate::error::{InternalError, SerializationError}; use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; +use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, WriteMethod, @@ -102,13 +102,22 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { // in the prefix FST previously constructed. let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; let db = self.index.word_level_position_docids.remap_data_type::(); - for result in db.iter(self.wtxn)? { - let ((word, level, left, right), data) = result?; - if level == TreeLevel::min_value() { - let automaton = automaton::Str::new(word).starts_with(); - let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); - while let Some(prefix) = matching_prefixes.next() { - let prefix = str::from_utf8(prefix)?; + // iter over all prefixes in the prefix fst. + let mut word_stream = prefix_fst.stream(); + while let Some(prefix_bytes) = word_stream.next() { + let prefix = str::from_utf8(prefix_bytes).map_err(|_| { + SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } + })?; + + // iter over all lines of the DB where the key is prefixed by the current prefix. + let mut iter = db + .remap_key_type::() + .prefix_iter(self.wtxn, &prefix_bytes)? + .remap_key_type::(); + while let Some(((_word, level, left, right), data)) = iter.next().transpose()? { + // if level is 0, we push the line in the sorter + // replacing the complete word by the prefix. 
+ if level == TreeLevel::min_value() { let key = (prefix, level, left, right); let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); word_prefix_level_positions_docids_sorter.insert(bytes, data)?; From 7f26c75610209512c5b4d9197bfd246f9554dda6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 4 Aug 2021 16:04:55 +0200 Subject: [PATCH 0892/1889] Update milli to v0.9.0 --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 420008f5e..fc7a8792d 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.8.1" +version = "0.9.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 61284831c..692e5f61a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.8.1" +version = "0.9.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 2edf13850..aed68b65f 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.8.1" +version = "0.9.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ac51c94be..56b72fa9b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.8.1" +version = "0.9.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 34e2b16f6..2feacc828 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.8.1" +version = "0.9.0" authors = ["Clément Renault "] edition = "2018" From 8fdf860c17efd131504beba7c8ecfdd3b20335fb Mon Sep 17 00:00:00 2001 From: many Date: Thu, 12 Aug 2021 11:29:20 +0200 Subject: [PATCH 0893/1889] Remove max values by facet limit for facet distribution --- milli/src/search/facet/facet_distribution.rs | 47 ++------------------ 1 file changed, 3 insertions(+), 44 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index bfbea76c3..91bf21cf7 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; -use std::{cmp, fmt, mem}; +use std::{fmt, mem}; use heed::types::ByteSlice; use roaring::RoaringBitmap; @@ -13,14 +13,6 @@ use crate::heed_codec::facet::{ use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; use crate::{FieldId, Index, Result}; -/// The default number of values by facets that will -/// be fetched from the key-value store. -const DEFAULT_VALUES_BY_FACET: usize = 100; - -/// The hard limit in the number of values by facets that will be fetched from -/// the key-value store. Searching for more values could slow down the engine. -const MAX_VALUES_BY_FACET: usize = 1000; - /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. 
const CANDIDATES_THRESHOLD: u64 = 3000; @@ -28,20 +20,13 @@ const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option>, candidates: Option, - max_values_by_facet: usize, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } impl<'a> FacetDistribution<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { - FacetDistribution { - facets: None, - candidates: None, - max_values_by_facet: DEFAULT_VALUES_BY_FACET, - rtxn, - index, - } + FacetDistribution { facets: None, candidates: None, rtxn, index } } pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { @@ -54,11 +39,6 @@ impl<'a> FacetDistribution<'a> { self } - pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self { - self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET); - self - } - /// There is a small amount of candidates OR we ask for facet string values so we /// decide to iterate over the facet values of each one of them, one by one. fn facet_distribution_from_documents( @@ -72,7 +52,6 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); - let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); @@ -85,9 +64,6 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, _, value), ()) = result?; *distribution.entry(value.to_string()).or_insert(0) += 1; - if distribution.len() - distribution_prelength == self.max_values_by_facet { - break; - } } } } @@ -110,10 +86,6 @@ impl<'a> FacetDistribution<'a> { .entry(normalized_value) .or_insert_with(|| (original_value, 0)); *count += 1; - - if normalized_distribution.len() == self.max_values_by_facet { - break; - } } } @@ -144,9 +116,6 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(value.to_string(), docids.len()); } - if distribution.len() == self.max_values_by_facet { - break; - } } Ok(()) @@ -167,9 +136,6 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(original.to_string(), docids.len()); } - if distribution.len() == self.max_values_by_facet { - break; - } } Ok(()) @@ -189,9 +155,6 @@ impl<'a> FacetDistribution<'a> { for result in range { let ((_, _, value, _), docids) = result?; distribution.insert(value.to_string(), docids.len()); - if distribution.len() == self.max_values_by_facet { - break; - } } let iter = self @@ -205,9 +168,6 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, normalized_value), (original_value, docids)) = result?; normalized_distribution.insert(normalized_value, (original_value, docids.len())); - if distribution.len() == self.max_values_by_facet { - break; - } } let iter = normalized_distribution @@ -289,12 +249,11 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; + let FacetDistribution { facets, candidates, rtxn: _, index: _ } = self; f.debug_struct("FacetDistribution") .field("facets", facets) .field("candidates", candidates) - .field("max_values_by_facet", max_values_by_facet) .finish() } } From 7dbefae1e39d20fb5183ecc1651da063ed2e5d97 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 12 Aug 2021 17:23:39 +0200 Subject: [PATCH 0894/1889] Make facet string iterator non reducing --- milli/src/search/facet/facet_string.rs | 17 
++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 40ea8c04a..ed5322607 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -289,6 +289,7 @@ pub struct FacetStringIter<'t> { field_id: FieldId, level_iters: Vec<(RoaringBitmap, Either, FacetStringLevelZeroRange<'t>>)>, + must_reduce: bool, } impl<'t> FacetStringIter<'t> { @@ -318,7 +319,13 @@ impl<'t> FacetStringIter<'t> { )?), }; - Ok(FacetStringIter { rtxn, db, field_id, level_iters: vec![(documents_ids, highest_iter)] }) + Ok(FacetStringIter { + rtxn, + db, + field_id, + level_iters: vec![(documents_ids, highest_iter)], + must_reduce: false, + }) } fn highest_level( @@ -348,7 +355,9 @@ impl<'t> Iterator for FacetStringIter<'t> { Ok(((level, left, right), (string_bounds, mut docids))) => { docids &= &*documents_ids; if !docids.is_empty() { - *documents_ids -= &docids; + if self.must_reduce { + *documents_ids -= &docids; + } let result = match string_bounds { Some((left, right)) => FacetStringLevelZeroRange::new( @@ -390,7 +399,9 @@ impl<'t> Iterator for FacetStringIter<'t> { Ok((normalized, original, mut docids)) => { docids &= &*documents_ids; if !docids.is_empty() { - *documents_ids -= &docids; + if self.must_reduce { + *documents_ids -= &docids; + } return Some(Ok((normalized, original, docids))); } } From fcc520e49ad416df90169cfca810f20400ee8f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 12 Aug 2021 18:35:17 +0200 Subject: [PATCH 0895/1889] Update version for the next release (v0.10.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index fc7a8792d..fc59028c9 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.9.0" +version = "0.10.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 692e5f61a..f1157b1e0 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.9.0" +version = "0.10.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index aed68b65f..3f18486dc 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.9.0" +version = "0.10.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 56b72fa9b..d252633f2 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.9.0" +version = "0.10.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 2feacc828..cdb166c26 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.9.0" +version = "0.10.0" authors = ["Clément Renault "] edition = "2018" From 01a405282827355908327aa29c0dc8e15cfb7d3f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 10:29:43 +0200 Subject: [PATCH 0896/1889] Move the FacetStringIter creation logic into a private new method --- milli/src/search/facet/facet_string.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git 
a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index ed5322607..1a9bf6d91 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -298,6 +298,16 @@ impl<'t> FacetStringIter<'t> { index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, + ) -> heed::Result> { + FacetStringIter::new(rtxn, index, field_id, documents_ids, false) + } + + fn new( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + must_reduce: bool, ) -> heed::Result> { let db = index.facet_id_string_docids.remap_types::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); @@ -324,7 +334,7 @@ impl<'t> FacetStringIter<'t> { db, field_id, level_iters: vec![(documents_ids, highest_iter)], - must_reduce: false, + must_reduce, }) } From 64df15905771541700d5651440b078afc063b2bf Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 10:31:20 +0200 Subject: [PATCH 0897/1889] Introduce the new_reducing constructor on the FacetStringIter struct --- milli/src/search/facet/facet_string.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 1a9bf6d91..5ad45fdff 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -293,6 +293,15 @@ pub struct FacetStringIter<'t> { } impl<'t> FacetStringIter<'t> { + pub fn new_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + FacetStringIter::new(rtxn, index, field_id, documents_ids, true) + } + pub fn new_non_reducing( rtxn: &'t heed::RoTxn, index: &'t Index, From 1c604de1582077150a263cfe4c335eadb3aff78d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 10:41:11 +0200 Subject: [PATCH 0898/1889] Introduce the highest_iter private method on the FacetStringIter struct --- milli/src/search/facet/facet_string.rs | 70 ++++++++++++++------------ 1 file changed, 39 insertions(+), 31 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 5ad45fdff..83ff77cee 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -299,7 +299,15 @@ impl<'t> FacetStringIter<'t> { field_id: FieldId, documents_ids: RoaringBitmap, ) -> heed::Result> { - FacetStringIter::new(rtxn, index, field_id, documents_ids, true) + let db = index.facet_id_string_docids.remap_types::(); + let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; + Ok(FacetStringIter { + rtxn, + db, + field_id, + level_iters: vec![(documents_ids, highest_iter)], + must_reduce: true, + }) } pub fn new_non_reducing( @@ -307,43 +315,15 @@ impl<'t> FacetStringIter<'t> { index: &'t Index, field_id: FieldId, documents_ids: RoaringBitmap, - ) -> heed::Result> { - FacetStringIter::new(rtxn, index, field_id, documents_ids, false) - } - - fn new( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - must_reduce: bool, ) -> heed::Result> { let db = index.facet_id_string_docids.remap_types::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = match NonZeroU8::new(highest_level) { - Some(highest_level) => Left(FacetStringGroupRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - )?), - None => 
Right(FacetStringLevelZeroRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - )?), - }; - + let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; Ok(FacetStringIter { rtxn, db, field_id, level_iters: vec![(documents_ids, highest_iter)], - must_reduce, + must_reduce: false, }) } @@ -359,6 +339,34 @@ impl<'t> FacetStringIter<'t> { .transpose()? .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit } + + fn highest_iter( + rtxn: &'t heed::RoTxn, + index: &'t Index, + db: Database<ByteSlice, ByteSlice>, + field_id: FieldId, + ) -> heed::Result<Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>> { + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + match NonZeroU8::new(highest_level) { + Some(highest_level) => FacetStringGroupRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + highest_level, + Unbounded, + Unbounded, + ) + .map(Left), + None => FacetStringLevelZeroRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + Unbounded, + Unbounded, + ) + .map(Right), + } + } } impl<'t> Iterator for FacetStringIter<'t> { From 6214c38da94692637a390b9d7c1ab7bb3755f6f1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 10:44:27 +0200 Subject: [PATCH 0899/1889] Introduce the FacetStringGroupRevRange struct --- milli/src/search/facet/facet_string.rs | 61 +++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 83ff77cee..dd4d64726 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -131,7 +131,7 @@ use std::ops::Bound::{Excluded, Included, Unbounded}; use either::{Either, Left, Right}; use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange}; +use heed::{Database, LazyDecode, RoRange, RoRevRange}; use roaring::RoaringBitmap; use crate::heed_codec::facet::{ @@ -206,6 +206,65 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { } } +pub struct FacetStringGroupRevRange<'t> { + iter: RoRevRange< + 't, + FacetLevelValueU32Codec, + LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>, + >, + end: Bound<u32>, +} + +impl<'t> FacetStringGroupRevRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database<ByteSlice, ByteSlice>, + field_id: FieldId, + level: NonZeroU8, + left: Bound<u32>, + right: Bound<u32>, + ) -> heed::Result<FacetStringGroupRevRange<'t>> { + let db = db.remap_types::< + FacetLevelValueU32Codec, + FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>, + >(); + let left_bound = match left { + Included(left) => Included((field_id, level, left, u32::MIN)), + Excluded(left) => Excluded((field_id, level, left, u32::MIN)), + Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), + }; + let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); + let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; + Ok(FacetStringGroupRevRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetStringGroupRevRange<'t> { + type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; + + fn next(&mut self) -> Option<Self::Item> { + match self.iter.next() { + Some(Ok(((_fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), + Err(e) => Some(Err(e)), + } + } else { + None + } + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + /// An
iterator that is used to explore the level 0 of the facets string database. /// /// It yields the facet string and the roaring bitmap associated with it. From ad0d311f8a09282694af3df964f61345e888334e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 10:44:43 +0200 Subject: [PATCH 0900/1889] Introduce the FacetStringLevelZeroRevRange struct --- milli/src/search/facet/facet_string.rs | 71 ++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index dd4d64726..c19a8d7de 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -339,6 +339,77 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> { } } +pub struct FacetStringLevelZeroRevRange<'t> { + iter: RoRevRange< + 't, + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, +} + +impl<'t> FacetStringLevelZeroRevRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + left: Bound<&str>, + right: Bound<&str>, + ) -> heed::Result> { + fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { + buffer.extend_from_slice(&field_id.to_be_bytes()); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + &buffer[..] + } + + let mut left_buffer = Vec::new(); + let left_bound = match left { + Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), + Unbounded => { + left_buffer.extend_from_slice(&field_id.to_be_bytes()); + left_buffer.push(0); + Included(&left_buffer[..]) + } + }; + + let mut right_buffer = Vec::new(); + let right_bound = match right { + Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), + Unbounded => { + right_buffer.extend_from_slice(&field_id.to_be_bytes()); + right_buffer.push(1); // we must only get the level 0 + Excluded(&right_buffer[..]) + } + }; + + let iter = db + .remap_key_type::() + .rev_range(rtxn, &(left_bound, right_bound))? + .remap_types::< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec + >(); + + Ok(FacetStringLevelZeroRevRange { iter }) + } +} + +impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, normalized), (original, docids)))) => { + Some(Ok((normalized, original, docids))) + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + /// An iterator that is used to explore the facet strings level by level, /// it will only return facets strings that are associated with the /// candidates documents ids given. 
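The `Unbounded` arms of `FacetStringLevelZeroRevRange::new` above rely on keys being compared as raw bytes: appending a `0` byte to the big-endian field id gives an inclusive lower bound that sorts before every level-0 key of that field, while appending a `1` byte gives an exclusive upper bound that sorts after all of them (the same database stores higher levels under a non-zero level byte, which the `1` excludes). A minimal standalone sketch of that idea, not the milli codecs themselves; the two-byte field id and the `0`/`1` level byte mirror the layout above:

```rust
// Sketch of the level-0 bound-encoding trick: level-0 keys are laid out as
// `field_id (big endian) ++ 0u8 ++ normalized string`, so the half-open byte
// range `[field_id, 0] .. [field_id, 1]` brackets exactly the level-0 entries
// of one field when keys are ordered lexicographically.
fn level_zero_bounds(field_id: u16) -> (Vec<u8>, Vec<u8>) {
    let mut lower = field_id.to_be_bytes().to_vec();
    lower.push(0); // inclusive: sorts before any `field_id ++ 0 ++ string` key
    let mut upper = field_id.to_be_bytes().to_vec();
    upper.push(1); // exclusive: sorts after every level-0 key of this field
    (lower, upper)
}

fn main() {
    let (lower, upper) = level_zero_bounds(3);
    let mut key = 3u16.to_be_bytes().to_vec();
    key.push(0);
    key.extend_from_slice(b"hello");
    // Byte slices compare lexicographically, like LMDB keys.
    assert!(lower.as_slice() <= key.as_slice());
    assert!(key.as_slice() < upper.as_slice());
}
```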
From 7a5889bc5a817cacad26ed7bef9912ec6c58a2c8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 10:45:26 +0200 Subject: [PATCH 0901/1889] Introduce the highest_reverse_iter private method --- milli/src/search/facet/facet_string.rs | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index c19a8d7de..777837bf1 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -497,6 +497,34 @@ impl<'t> FacetStringIter<'t> { .map(Right), } } + + fn highest_reverse_iter( + rtxn: &'t heed::RoTxn, + index: &'t Index, + db: Database, + field_id: FieldId, + ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + match NonZeroU8::new(highest_level) { + Some(highest_level) => FacetStringGroupRevRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + highest_level, + Unbounded, + Unbounded, + ) + .map(Left), + None => FacetStringLevelZeroRevRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + Unbounded, + Unbounded, + ) + .map(Right), + } + } } impl<'t> Iterator for FacetStringIter<'t> { From 22ebd2658fb9e5d1f00cdd225ff850303bbf9294 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 10:47:15 +0200 Subject: [PATCH 0902/1889] Introduce the EitherString/RevRange private aliases --- milli/src/search/facet/facet_string.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 777837bf1..4bb7c9efd 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -410,6 +410,10 @@ impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { } } +type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; +type EitherStringRevRange<'t> = + Either, FacetStringLevelZeroRevRange<'t>>; + /// An iterator that is used to explore the facet strings level by level, /// it will only return facets strings that are associated with the /// candidates documents ids given. 
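The `EitherStringRange`/`EitherStringRevRange` aliases introduced in the hunk above keep the signatures readable, and the `Left`/`Right` plumbing throughout these patches works because `either::Either` itself implements `Iterator` whenever both variants yield the same item type. A minimal sketch of that property, assuming the `either` crate as a dependency, with standard-library ranges standing in for the facet iterators:

```rust
use either::Either;

// Two different iterator types with the same `Item` can be returned behind a
// single `Either`, which forwards the `Iterator` implementation to whichever
// variant is present.
fn numbers(descending: bool) -> Either<std::ops::Range<u32>, std::iter::Rev<std::ops::Range<u32>>> {
    if descending {
        Either::Right((0..5).rev())
    } else {
        Either::Left(0..5)
    }
}

fn main() {
    let ascending: Vec<u32> = numbers(false).collect();
    let descending: Vec<u32> = numbers(true).collect();
    assert_eq!(ascending, [0, 1, 2, 3, 4]);
    assert_eq!(descending, [4, 3, 2, 1, 0]);
}
```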
@@ -417,8 +421,7 @@ pub struct FacetStringIter<'t> { rtxn: &'t heed::RoTxn<'t>, db: Database, field_id: FieldId, - level_iters: - Vec<(RoaringBitmap, Either, FacetStringLevelZeroRange<'t>>)>, + level_iters: Vec<(RoaringBitmap, EitherStringRange<'t>)>, must_reduce: bool, } From 110bf6b7783b1e94a3811b5784d42716358bc94d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 11:07:34 +0200 Subject: [PATCH 0903/1889] Make the FacetStringIter work in both, ascending and descending orders --- milli/src/search/facet/facet_string.rs | 100 +++++++++++++++++++------ 1 file changed, 79 insertions(+), 21 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 4bb7c9efd..927602c98 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -421,7 +421,7 @@ pub struct FacetStringIter<'t> { rtxn: &'t heed::RoTxn<'t>, db: Database, field_id: FieldId, - level_iters: Vec<(RoaringBitmap, EitherStringRange<'t>)>, + level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, must_reduce: bool, } @@ -438,7 +438,24 @@ impl<'t> FacetStringIter<'t> { rtxn, db, field_id, - level_iters: vec![(documents_ids, highest_iter)], + level_iters: vec![(documents_ids, Left(highest_iter))], + must_reduce: true, + }) + } + + pub fn new_reverse_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_string_docids.remap_types::(); + let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; + Ok(FacetStringIter { + rtxn, + db, + field_id, + level_iters: vec![(documents_ids, Right(highest_reverse_iter))], must_reduce: true, }) } @@ -455,7 +472,7 @@ impl<'t> FacetStringIter<'t> { rtxn, db, field_id, - level_iters: vec![(documents_ids, highest_iter)], + level_iters: vec![(documents_ids, Left(highest_iter))], must_reduce: false, }) } @@ -536,6 +553,21 @@ impl<'t> Iterator for FacetStringIter<'t> { fn next(&mut self) -> Option { 'outer: loop { let (documents_ids, last) = self.level_iters.last_mut()?; + let is_ascending = last.is_left(); + + // We remap the different iterator types to make + // the algorithm less complex to understand. 
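The comment above refers to the nested match that follows: `last` starts as direction-first (`Either<ascending, descending>` wrapping `Either<group range, level-zero range>`) and is reshaped to level-first, so the rest of the loop can branch on group versus level-zero regardless of direction. A toy standalone version of that reshaping, with placeholder type parameters instead of the milli iterators:

```rust
use either::Either::{self, Left, Right};

// Reshape a direction-first nesting `Either<Either<A, B>, Either<C, D>>`
// (outer Left = ascending, Right = descending; inner Left = group level,
// inner Right = level zero) into a level-first nesting
// `Either<Either<A, C>, Either<B, D>>`, mirroring the match that follows.
fn level_first<A, B, C, D>(
    value: Either<Either<A, B>, Either<C, D>>,
) -> Either<Either<A, C>, Either<B, D>> {
    match value {
        Left(ascending) => match ascending {
            Left(group) => Left(Left(group)),
            Right(zero) => Right(Left(zero)),
        },
        Right(descending) => match descending {
            Left(group) => Left(Right(group)),
            Right(zero) => Right(Right(zero)),
        },
    }
}

fn main() {
    // An ascending level-zero value (stand-in 7u16)...
    let v: Either<Either<u8, u16>, Either<u32, u64>> = Left(Right(7));
    // ...lands on the level-zero side after the reshape.
    assert!(matches!(level_first(v), Right(Left(7))));
}
```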
+ let last = match last { + Left(ascending) => match ascending { + Left(last) => Left(Left(last)), + Right(last) => Right(Left(last)), + }, + Right(descending) => match descending { + Left(last) => Left(Right(last)), + Right(last) => Right(Right(last)), + }, + }; + match last { Left(last) => { for result in last { @@ -547,24 +579,50 @@ impl<'t> Iterator for FacetStringIter<'t> { *documents_ids -= &docids; } - let result = match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), + let result = if is_ascending { + match string_bounds { + Some((left, right)) => { + FacetStringLevelZeroRevRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right) + } + None => FacetStringGroupRevRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + } + .map(Right) + } else { + match string_bounds { + Some((left, right)) => FacetStringLevelZeroRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right), + None => FacetStringGroupRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + } + .map(Left) }; match result { From e9ada44509152dcfd006451ea80d5924db2fb431 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 11:40:07 +0200 Subject: [PATCH 0904/1889] AscDesc criterion returns documents ordered by numbers then by strings --- milli/src/search/criteria/asc_desc.rs | 102 +++++++++++++++++++++----- 1 file changed, 83 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 4a664d042..6d50c1bb5 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::FacetNumberIter; +use crate::search::facet::{FacetNumberIter, FacetStringIter}; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -20,7 +20,7 @@ pub struct AscDesc<'t> { rtxn: &'t heed::RoTxn<'t>, field_name: String, field_id: Option, - ascending: bool, + is_ascending: bool, query_tree: Option, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, @@ -53,12 +53,16 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ascending: bool, + is_ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map.id(&field_name); let faceted_candidates = match field_id { - Some(field_id) => index.number_faceted_documents_ids(rtxn, field_id)?, + Some(field_id) => { + let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; + let string_faceted = index.string_faceted_documents_ids(rtxn, field_id)?; + number_faceted | string_faceted + } None => RoaringBitmap::default(), }; @@ -67,7 +71,7 @@ impl<'t> AscDesc<'t> { rtxn, field_name, field_id, - ascending, + is_ascending, query_tree: None, candidates: Box::new(std::iter::empty()), allowed_candidates: 
RoaringBitmap::new(), @@ -87,7 +91,7 @@ impl<'t> Criterion for AscDesc<'t> { loop { debug!( "Facet {}({}) iteration", - if self.ascending { "Asc" } else { "Desc" }, + if self.is_ascending { "Asc" } else { "Desc" }, self.field_name ); @@ -136,7 +140,7 @@ impl<'t> Criterion for AscDesc<'t> { self.index, self.rtxn, field_id, - self.ascending, + self.is_ascending, candidates & &self.faceted_candidates, )?, None => Box::new(std::iter::empty()), @@ -167,31 +171,49 @@ fn facet_ordered<'t>( index: &'t Index, rtxn: &'t heed::RoTxn, field_id: FieldId, - ascending: bool, + is_ascending: bool, candidates: RoaringBitmap, ) -> Result> + 't>> { if candidates.len() <= CANDIDATES_THRESHOLD { - let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; - Ok(Box::new(iter.map(Ok)) as Box>) + let number_iter = iterative_facet_number_ordered_iter( + index, + rtxn, + field_id, + is_ascending, + candidates.clone(), + )?; + let string_iter = + iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; + Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) } else { - let facet_fn = if ascending { + let facet_number_fn = if is_ascending { FacetNumberIter::new_reducing } else { FacetNumberIter::new_reverse_reducing }; - let iter = facet_fn(rtxn, index, field_id, candidates)?; - Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) + let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? + .map(|res| res.map(|(_, docids)| docids)); + + let facet_string_fn = if is_ascending { + FacetStringIter::new_reducing + } else { + FacetStringIter::new_reverse_reducing + }; + let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? + .map(|res| res.map(|(_, _, docids)| docids)); + + Ok(Box::new(number_iter.chain(string_iter))) } } -/// Fetch the whole list of candidates facet values one by one and order them by it. +/// Fetch the whole list of candidates facet number values one by one and order them by it. /// /// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_ordered_iter<'t>( +fn iterative_facet_number_ordered_iter<'t>( index: &'t Index, rtxn: &'t heed::RoTxn, field_id: FieldId, - ascending: bool, + is_ascending: bool, candidates: RoaringBitmap, ) -> Result + 't> { let mut docids_values = Vec::with_capacity(candidates.len() as usize); @@ -199,14 +221,14 @@ fn iterative_facet_ordered_iter<'t>( let left = (field_id, docid, f64::MIN); let right = (field_id, docid, f64::MAX); let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; + let entry = if is_ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? { docids_values.push((docid, OrderedFloat(value))); } } docids_values.sort_unstable_by_key(|(_, v)| *v); let iter = docids_values.into_iter(); - let iter = if ascending { + let iter = if is_ascending { Box::new(iter) as Box> } else { Box::new(iter.rev()) @@ -216,7 +238,49 @@ fn iterative_facet_ordered_iter<'t>( // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 let vec: Vec<_> = iter - .group_by(|(_, v)| v.clone()) + .group_by(|(_, v)| *v) + .into_iter() + .map(|(_, ids)| ids.map(|(id, _)| id).collect()) + .collect(); + + Ok(vec.into_iter()) +} + +/// Fetch the whole list of candidates facet string values one by one and order them by it. 
+/// +/// This function is fast when the amount of candidates to rank is small. +fn iterative_facet_string_ordered_iter<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result + 't> { + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, ""); + let right = (field_id, docid.saturating_add(1), ""); + // FIXME Doing this means that it will never be possible to retrieve + // the document with id 2^32, not sure this is a real problem. + let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?; + let entry = if is_ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), _)) = entry.transpose()? { + docids_values.push((docid, value)); + } + } + docids_values.sort_unstable_by_key(|(_, v)| *v); + let iter = docids_values.into_iter(); + let iter = if is_ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + + // The itertools GroupBy iterator doesn't provide an owned version, we are therefore + // required to collect the result into an owned collection (a Vec). + // https://github.com/rust-itertools/itertools/issues/499 + let vec: Vec<_> = iter + .group_by(|(_, v)| *v) .into_iter() .map(|(_, ids)| ids.map(|(id, _)| id).collect()) .collect(); From fcedff95e8cd770f2597b93659a3f362d3089bc1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 14:03:21 +0200 Subject: [PATCH 0905/1889] Change the Asc/Desc criterion syntax to use a colon (:) --- milli/Cargo.toml | 1 - milli/src/criterion.rs | 26 +++++--------------------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d252633f2..c08c1f76d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -25,7 +25,6 @@ obkv = "0.2.0" once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" -regex = "1.4.3" roaring = "0.6.6" serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index cc1fca01f..31ffed9b3 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,15 +1,10 @@ use std::fmt; use std::str::FromStr; -use once_cell::sync::Lazy; -use regex::Regex; use serde::{Deserialize, Serialize}; use crate::error::{Error, UserError}; -static ASC_DESC_REGEX: Lazy = - Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()); - #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { /// Sorted by decreasing number of matched query terms. 
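The `FromStr` hunk that follows replaces the `asc(field)`/`desc(field)` regex with plain string handling: `field:asc` and `field:desc` are parsed with `str::rsplit_once`, and splitting on the last colon keeps field names that themselves contain colons intact. A standalone sketch of that parsing shape, with a simplified error type where the real code builds `Criterion`/`UserError` values:

```rust
// Minimal sketch of colon-based Asc/Desc parsing via `rsplit_once`.
// Returns the field name and `true` for ascending order.
fn parse_asc_desc(text: &str) -> Result<(String, bool), String> {
    match text.rsplit_once(':') {
        Some((field, "asc")) => Ok((field.to_string(), true)),
        Some((field, "desc")) => Ok((field.to_string(), false)),
        _ => Err(format!("invalid criterion name: {}", text)),
    }
}

fn main() {
    assert_eq!(parse_asc_desc("released-timestamp:asc"), Ok(("released-timestamp".to_string(), true)));
    // `rsplit_once` splits at the *last* colon, so colons in the field survive.
    assert_eq!(parse_asc_desc("nested:field:desc"), Ok(("nested:field".to_string(), false)));
    assert!(parse_asc_desc("words").is_err());
}
```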
@@ -50,22 +45,11 @@ impl FromStr for Criterion { "proximity" => Ok(Criterion::Proximity), "attribute" => Ok(Criterion::Attribute), "exactness" => Ok(Criterion::Exactness), - text => { - let caps = ASC_DESC_REGEX - .captures(text) - .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; - let order = caps.get(1).unwrap().as_str(); - let field_name = caps.get(2).unwrap().as_str(); - match order { - "asc" => Ok(Criterion::Asc(field_name.to_string())), - "desc" => Ok(Criterion::Desc(field_name.to_string())), - text => { - return Err( - UserError::InvalidCriterionName { name: text.to_string() }.into() - ) - } - } - } + text => match text.rsplit_once(':') { + Some((field_name, "asc")) => Ok(Criterion::Asc(field_name.to_string())), + Some((field_name, "desc")) => Ok(Criterion::Desc(field_name.to_string())), + _ => Err(UserError::InvalidCriterionName { name: text.to_string() }.into()), + }, } } } From 5b88df508ea6e7876af7102bfcd71483962017d8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 14:13:00 +0200 Subject: [PATCH 0906/1889] Use the new Asc/Desc syntax everywhere --- benchmarks/benches/search_songs.rs | 8 ++++---- http-ui/src/main.rs | 4 ++-- milli/src/criterion.rs | 6 +++--- milli/src/update/settings.rs | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/benches/search_songs.rs b/benchmarks/benches/search_songs.rs index 726040692..6b11799ec 100644 --- a/benchmarks/benches/search_songs.rs +++ b/benchmarks/benches/search_songs.rs @@ -52,9 +52,9 @@ fn bench_songs(c: &mut criterion::Criterion) { milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); let default_criterion = default_criterion.iter().map(|s| s.as_str()); let asc_default: Vec<&str> = - std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); + std::iter::once("released-timestamp:asc").chain(default_criterion.clone()).collect(); let desc_default: Vec<&str> = - std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); + std::iter::once("released-timestamp:desc").chain(default_criterion.clone()).collect(); let basic_with_quote: Vec = BASE_CONF .queries @@ -118,12 +118,12 @@ fn bench_songs(c: &mut criterion::Criterion) { }, utils::Conf { group_name: "asc", - criterion: Some(&["asc(released-timestamp)"]), + criterion: Some(&["released-timestamp:desc"]), ..BASE_CONF }, utils::Conf { group_name: "desc", - criterion: Some(&["desc(released-timestamp)"]), + criterion: Some(&["released-timestamp:desc"]), ..BASE_CONF }, diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ee32882c0..b34418465 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1030,7 +1030,7 @@ mod tests { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), filterable_attributes: Setting::Set(hashset! { "age".to_string() }), - criteria: Setting::Set(vec!["asc(age)".to_string()]), + criteria: Setting::Set(vec!["age:asc".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), synonyms: Setting::Set(hashmap! 
{ "alex".to_string() => vec!["alexey".to_string()] }), }; @@ -1058,7 +1058,7 @@ mod tests { Token::Str("criteria"), Token::Some, Token::Seq { len: Some(1) }, - Token::Str("asc(age)"), + Token::Str("age:asc"), Token::SeqEnd, Token::Str("stopWords"), Token::Some, diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 31ffed9b3..d38450a13 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -15,7 +15,7 @@ pub enum Criterion { /// Sorted by increasing distance between matched query terms. Proximity, /// Documents with quey words contained in more important - /// attributes are considred better. + /// attributes are considered better. Attribute, /// Sorted by the similarity of the matched words with the query words. Exactness, @@ -74,8 +74,8 @@ impl fmt::Display for Criterion { Proximity => f.write_str("proximity"), Attribute => f.write_str("attribute"), Exactness => f.write_str("exactness"), - Asc(attr) => write!(f, "asc({})", attr), - Desc(attr) => write!(f, "desc({})", attr), + Asc(attr) => write!(f, "{}:asc", attr), + Desc(attr) => write!(f, "{}:desc", attr), } } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 07bdfd6fa..156e566a5 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -719,7 +719,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); // Don't display the generated `id` field. builder.set_displayed_fields(vec![S("name")]); - builder.set_criteria(vec![S("asc(age)")]); + builder.set_criteria(vec![S("age:asc")]); builder.execute(|_, _| ()).unwrap(); // Then index some documents. @@ -953,7 +953,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); builder.set_filterable_fields(hashset! { S("age"), S("toto") }); - builder.set_criteria(vec!["asc(toto)".to_string()]); + builder.set_criteria(vec!["toto:asc".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -990,7 +990,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); // It is only Asc(toto), there is a facet database but it is denied to filter with toto. - builder.set_criteria(vec!["asc(toto)".to_string()]); + builder.set_criteria(vec!["toto:asc".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); From ecf8abc5185c1667e450afc7563213e25e7a240f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 16:49:17 +0200 Subject: [PATCH 0907/1889] Modify the README file --- README.md | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index b1498d0f5..58e781d83 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,21 @@ ## Introduction -This engine is a prototype, do not use it in production. -This is one of the most advanced search engine I have worked on. -It currently only supports the proximity criterion. +This repository contains the core engine used in [MeiliSearch]. -### Compile and Run the server +It contains a library that can manage one and only one index. MeiliSearch +manages the multi-index itself. Milli is unable to store updates in a store: +it is the job of something else above and this is why it is only able +to process one update at a time. + +This repository contains crates to quickly debug the engine: + - There are benchmarks located in the `benchmarks` crate. + - The `http-ui` crate is a simple HTTP dashboard to tests the features like for real! 
+ - The `infos` crate is used to dump the internal data-structure and ensure correctness. + - The `search` crate is a simple command-line that helps run [flamegraph] on top of it. + - The `helpers` crate is only used to modify the database inplace, sometimes. + +### Compile and run the HTTP debug server You can specify the number of threads to use to index documents and many other settings too. @@ -42,7 +52,6 @@ the `content-type:application/json` and `content-type:application/x-ndjson` head You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700). - ## Contributing You can setup a `git-hook` to stop you from making a commit too fast. It'll stop you if: @@ -56,3 +65,6 @@ To enable the hook, run the following command from the root of the project: ``` cp script/pre-commit .git/hooks/pre-commit ``` + +[MeiliSearch]: https://github.com/MeiliSearch/MeiliSearch +[flamegraph]: https://github.com/flamegraph-rs/flamegraph From 42cf847a6348884eca6bf0e5477db5191853e2f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 18 Aug 2021 12:03:02 +0200 Subject: [PATCH 0908/1889] Update tokenizer version to v0.2.5 --- http-ui/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index f1157b1e0..bde141af9 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -11,7 +11,7 @@ byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } memmap = "0.7.0" milli = { path = "../milli" } once_cell = "1.5.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d252633f2..bd2c50a1d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -19,7 +19,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.4" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } memmap = "0.7.0" obkv = "0.2.0" once_cell = "1.5.2" From 6cb9c3b81ff9ffc5181a21cae9e5eea5705d9600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 18 Aug 2021 13:46:27 +0200 Subject: [PATCH 0909/1889] Update milli version to v0.10.1 --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index fc59028c9..fac6ed6b5 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.10.0" +version = "0.10.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index f1157b1e0..4f32174cd 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.10.0" +version = "0.10.1" authors = ["Clément Renault "] edition = "2018" diff --git 
a/infos/Cargo.toml b/infos/Cargo.toml index 3f18486dc..76456418e 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.10.0" +version = "0.10.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d252633f2..741856834 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.10.0" +version = "0.10.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index cdb166c26..b2a9c9d42 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.10.0" +version = "0.10.1" authors = ["Clément Renault "] edition = "2018" From 687cd2e20573abb850496c3590158030d02c2f13 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 15:05:00 +0200 Subject: [PATCH 0910/1889] Introduce the new Sort criterion and AscDesc enum --- milli/src/criterion.rs | 36 ++++++++++++++++++++++++++------ milli/src/search/criteria/mod.rs | 28 +++++++++++++++++++++++-- milli/src/search/mod.rs | 8 ++++++- milli/tests/search/mod.rs | 1 + 4 files changed, 64 insertions(+), 9 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index d38450a13..38162a74b 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -12,6 +12,9 @@ pub enum Criterion { Words, /// Sorted by increasing number of typos. Typo, + /// Dynamically sort at query time the documents. None, one or multiple Asc/Desc sortable + /// attributes can be used in place of this criterion at query time. + Sort, /// Sorted by increasing distance between matched query terms. Proximity, /// Documents with quey words contained in more important @@ -38,26 +41,46 @@ impl Criterion { impl FromStr for Criterion { type Err = Error; - fn from_str(txt: &str) -> Result { - match txt { + fn from_str(text: &str) -> Result { + match text { "words" => Ok(Criterion::Words), "typo" => Ok(Criterion::Typo), + "sort" => Ok(Criterion::Sort), "proximity" => Ok(Criterion::Proximity), "attribute" => Ok(Criterion::Attribute), "exactness" => Ok(Criterion::Exactness), - text => match text.rsplit_once(':') { - Some((field_name, "asc")) => Ok(Criterion::Asc(field_name.to_string())), - Some((field_name, "desc")) => Ok(Criterion::Desc(field_name.to_string())), - _ => Err(UserError::InvalidCriterionName { name: text.to_string() }.into()), + text => match AscDesc::from_str(text) { + Ok(AscDesc::Asc(field)) => Ok(Criterion::Asc(field)), + Ok(AscDesc::Desc(field)) => Ok(Criterion::Desc(field)), + Err(error) => Err(error.into()), }, } } } +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub enum AscDesc { + Asc(String), + Desc(String), +} + +impl FromStr for AscDesc { + type Err = UserError; + + fn from_str(text: &str) -> Result { + match text.rsplit_once(':') { + Some((field_name, "asc")) => Ok(AscDesc::Asc(field_name.to_string())), + Some((field_name, "desc")) => Ok(AscDesc::Desc(field_name.to_string())), + _ => Err(UserError::InvalidCriterionName { name: text.to_string() }), + } + } +} + pub fn default_criteria() -> Vec { vec![ Criterion::Words, Criterion::Typo, + Criterion::Sort, Criterion::Proximity, Criterion::Attribute, Criterion::Exactness, @@ -71,6 +94,7 @@ impl fmt::Display for Criterion { match self { Words => f.write_str("words"), Typo => f.write_str("typo"), + Sort => f.write_str("sort"), Proximity => f.write_str("proximity"), Attribute => f.write_str("attribute"), Exactness => f.write_str("exactness"), 
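In the `criteria/mod.rs` hunk below, `Name::Sort` is expanded at query time: each requested sort clause wraps the criterion built so far, producing a chain of boxed ranking stages applied in order. A toy sketch of that wrapping pattern, using an invented `Stage` trait in place of milli's `Criterion`:

```rust
// Each stage wraps its parent, so building in request order yields a
// pipeline that consults the earlier stages first.
trait Stage {
    fn describe(&self) -> String;
}

struct Initial;
impl Stage for Initial {
    fn describe(&self) -> String {
        "initial".to_string()
    }
}

struct SortBy {
    field: String,
    parent: Box<dyn Stage>,
}
impl Stage for SortBy {
    fn describe(&self) -> String {
        format!("{} -> sort({})", self.parent.describe(), self.field)
    }
}

fn main() {
    let mut stage: Box<dyn Stage> = Box::new(Initial);
    // At query time, a request like `["price:asc", "rating:desc"]` would add
    // one wrapper per clause, exactly as the builder loop below does.
    for field in ["price", "rating"] {
        stage = Box::new(SortBy { field: field.to_string(), parent: stage });
    }
    assert_eq!(stage.describe(), "initial -> sort(price) -> sort(rating)");
}
```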
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 2ba3b388f..814030c98 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::collections::HashMap; +use std::str::FromStr; use roaring::RoaringBitmap; @@ -273,8 +274,9 @@ impl<'t> CriteriaBuilder<'t> { query_tree: Option, primitive_query: Option>, filtered_candidates: Option, + sort_criteria: Option>, ) -> Result> { - use crate::criterion::Criterion as Name; + use crate::criterion::{AscDesc as AscDescName, Criterion as Name}; let primitive_query = primitive_query.unwrap_or_default(); @@ -282,8 +284,30 @@ impl<'t> CriteriaBuilder<'t> { Box::new(Initial::new(query_tree, filtered_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { - Name::Typo => Box::new(Typo::new(self, criterion)), Name::Words => Box::new(Words::new(self, criterion)), + Name::Typo => Box::new(Typo::new(self, criterion)), + Name::Sort => match sort_criteria { + Some(ref sort_criteria) => { + for text in sort_criteria { + criterion = match AscDescName::from_str(text)? { + AscDescName::Asc(field) => Box::new(AscDesc::asc( + &self.index, + &self.rtxn, + criterion, + field, + )?), + AscDescName::Desc(field) => Box::new(AscDesc::desc( + &self.index, + &self.rtxn, + criterion, + field, + )?), + }; + } + criterion + } + None => criterion, + }, Name::Proximity => Box::new(Proximity::new(self, criterion)), Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 871f464ef..43931b6af 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -135,7 +135,13 @@ impl<'a> Search<'a> { }; let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build(query_tree, primitive_query, filtered_candidates)?; + let sort_criteria = None; + let criteria = criteria_builder.build( + query_tree, + primitive_query, + filtered_candidates, + sort_criteria, + )?; match self.index.distinct_field(self.rtxn)? 
{ None => self.perform_sort(NoopDistinct, matching_words, criteria), diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c5724a921..b84d3fada 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -90,6 +90,7 @@ pub fn expected_order( new_groups .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); } + Criterion::Sort => todo!("sort not supported right now"), Criterion::Typo => { group.sort_by_key(|d| d.typo_rank); new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); From 407f53872a32f6f6b468f8f329580fe901108df2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 15:36:03 +0200 Subject: [PATCH 0911/1889] Add a sort_criteria method to the Search builder struct --- milli/src/search/criteria/mod.rs | 14 +++++++------- milli/src/search/mod.rs | 12 +++++++++++- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 814030c98..61b0fe049 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,6 +1,5 @@ use std::borrow::Cow; use std::collections::HashMap; -use std::str::FromStr; use roaring::RoaringBitmap; @@ -13,6 +12,7 @@ use self::r#final::Final; use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; +use crate::criterion::AscDesc as AscDescName; use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; @@ -274,9 +274,9 @@ impl<'t> CriteriaBuilder<'t> { query_tree: Option, primitive_query: Option>, filtered_candidates: Option, - sort_criteria: Option>, + sort_criteria: Option>, ) -> Result> { - use crate::criterion::{AscDesc as AscDescName, Criterion as Name}; + use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); @@ -288,19 +288,19 @@ impl<'t> CriteriaBuilder<'t> { Name::Typo => Box::new(Typo::new(self, criterion)), Name::Sort => match sort_criteria { Some(ref sort_criteria) => { - for text in sort_criteria { - criterion = match AscDescName::from_str(text)? 
{ + for asc_desc in sort_criteria { + criterion = match asc_desc { AscDescName::Asc(field) => Box::new(AscDesc::asc( &self.index, &self.rtxn, criterion, - field, + field.to_string(), )?), AscDescName::Desc(field) => Box::new(AscDesc::desc( &self.index, &self.rtxn, criterion, - field, + field.to_string(), )?), }; } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 43931b6af..ce2efcc98 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,6 +18,7 @@ pub(crate) use self::facet::ParserRule; pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; +use crate::criterion::AscDesc; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{DocumentId, Index, Result}; @@ -37,6 +38,7 @@ pub struct Search<'a> { filter: Option, offset: usize, limit: usize, + sort_criteria: Option>, optional_words: bool, authorize_typos: bool, words_limit: usize, @@ -51,6 +53,7 @@ impl<'a> Search<'a> { filter: None, offset: 0, limit: 20, + sort_criteria: None, optional_words: true, authorize_typos: true, words_limit: 10, @@ -74,6 +77,11 @@ impl<'a> Search<'a> { self } + pub fn sort_criteria(&mut self, criteria: Vec) -> &mut Search<'a> { + self.sort_criteria = Some(criteria); + self + } + pub fn optional_words(&mut self, value: bool) -> &mut Search<'a> { self.optional_words = value; self @@ -135,7 +143,7 @@ impl<'a> Search<'a> { }; let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let sort_criteria = None; + let sort_criteria = self.sort_criteria.clone(); let criteria = criteria_builder.build( query_tree, primitive_query, @@ -205,6 +213,7 @@ impl fmt::Debug for Search<'_> { filter, offset, limit, + sort_criteria, optional_words, authorize_typos, words_limit, @@ -216,6 +225,7 @@ impl fmt::Debug for Search<'_> { .field("filter", filter) .field("offset", offset) .field("limit", limit) + .field("sort_criteria", sort_criteria) .field("optional_words", optional_words) .field("authorize_typos", authorize_typos) .field("words_limit", words_limit) From 71602e0f1b4a125207ab823f10b15b26c284ad5e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 15:57:40 +0200 Subject: [PATCH 0912/1889] Add the sortable fields into the settings and in the index --- milli/src/index.rs | 36 +++++++++++++++++++++++++++++++++++- milli/src/update/settings.rs | 24 ++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 120bcbadf..e2ab51a1c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -28,6 +28,7 @@ pub mod main_key { pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; + pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; @@ -446,13 +447,45 @@ impl Index { Ok(fields_ids) } + /* sortable fields */ + + /// Writes the sortable fields names in the database. + pub(crate) fn put_sortable_fields( + &self, + wtxn: &mut RwTxn, + fields: &HashSet, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::SORTABLE_FIELDS_KEY, fields) + } + + /// Deletes the sortable fields ids in the database. 
+ pub(crate) fn delete_sortable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::SORTABLE_FIELDS_KEY) + } + + /// Returns the sortable fields names. + pub fn sortable_fields(&self, rtxn: &RoTxn) -> heed::Result> { + Ok(self + .main + .get::<_, Str, SerdeJson<_>>(rtxn, main_key::SORTABLE_FIELDS_KEY)? + .unwrap_or_default()) + } + + /// Identical to `sortable_fields`, but returns ids instead. + pub fn sortable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.sortable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) + } + /* faceted documents ids */ /// Returns the faceted fields names. /// - /// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields. + /// Faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result> { let filterable_fields = self.filterable_fields(rtxn)?; + let sortable_fields = self.sortable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; let asc_desc_fields = self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { @@ -461,6 +494,7 @@ impl Index { }); let mut faceted_fields = filterable_fields; + faceted_fields.extend(sortable_fields); faceted_fields.extend(asc_desc_fields); if let Some(field) = distinct_field { faceted_fields.insert(field.to_owned()); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 156e566a5..c0b5e4549 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -75,6 +75,7 @@ pub struct Settings<'a, 't, 'u, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, filterable_fields: Setting>, + sortable_fields: Setting>, criteria: Setting>, stop_words: Setting>, distinct_field: Setting, @@ -102,6 +103,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { searchable_fields: Setting::NotSet, displayed_fields: Setting::NotSet, filterable_fields: Setting::NotSet, + sortable_fields: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, distinct_field: Setting::NotSet, @@ -135,6 +137,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.filterable_fields = Setting::Set(names); } + pub fn set_sortable_fields(&mut self, names: HashSet) { + self.sortable_fields = Setting::Set(names); + } + pub fn reset_criteria(&mut self) { self.criteria = Setting::Reset; } @@ -392,6 +398,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_sortable(&mut self) -> Result<()> { + match self.sortable_fields { + Setting::Set(ref fields) => { + let mut new_fields = HashSet::new(); + for name in fields { + new_fields.insert(name.clone()); + } + self.index.put_sortable_fields(self.wtxn, &new_fields)?; + } + Setting::Reset => { + self.index.delete_sortable_fields(self.wtxn)?; + } + Setting::NotSet => (), + } + Ok(()) + } + fn update_criteria(&mut self) -> Result<()> { match self.criteria { Setting::Set(ref fields) => { @@ -446,6 +469,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_displayed()?; self.update_filterable()?; + self.update_sortable()?; self.update_distinct_field()?; self.update_criteria()?; self.update_primary_key()?; From 1b7f6ea1e767a4856296417cc7ecb06ff38147df Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 17 Aug 2021 16:14:56 +0200 Subject: [PATCH 0913/1889] Return a new error when the sort criteria is not sortable --- milli/src/criterion.rs | 9 
+++++++++ milli/src/error.rs | 10 ++++++++++ milli/src/search/mod.rs | 20 ++++++++++++++++++-- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 38162a74b..47eb7c7dc 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -64,6 +64,15 @@ pub enum AscDesc { Desc(String), } +impl AscDesc { + pub fn field(&self) -> &str { + match self { + AscDesc::Asc(field) => field, + AscDesc::Desc(field) => field, + } + } +} + impl FromStr for AscDesc { type Err = UserError; diff --git a/milli/src/error.rs b/milli/src/error.rs index 713935869..9bda74631 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -58,6 +58,7 @@ pub enum UserError { InvalidFacetsDistribution { invalid_facets_name: HashSet }, InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), + InvalidSortableAttribute { field: String, valid_fields: HashSet }, InvalidStoreFile, MaxDatabaseSizeReached, MissingDocumentId { document: Object }, @@ -226,6 +227,15 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco ) } Self::InvalidFilterAttribute(error) => error.fmt(f), + Self::InvalidSortableAttribute { field, valid_fields } => { + let valid_names = + valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); + write!( + f, + "Attribute {} is not sortable, available sortable attributes are: {}", + field, valid_names + ) + } Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); write!(f, "document doesn't have an identifier {}", json) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index ce2efcc98..23e5c1834 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -19,6 +19,7 @@ pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Opera pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::criterion::AscDesc; +use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{DocumentId, Index, Result}; @@ -142,13 +143,28 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; + // We check that we are allowed to use the sort criteria, we check + // that they are declared in the sortable fields. + let sortable_fields = self.index.sortable_fields(self.rtxn)?; + if let Some(sort_criteria) = &self.sort_criteria { + for asc_desc in sort_criteria { + let field = asc_desc.field(); + if !sortable_fields.contains(field) { + return Err(UserError::InvalidSortableAttribute { + field: field.to_string(), + valid_fields: sortable_fields, + } + .into()); + } + } + } + let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let sort_criteria = self.sort_criteria.clone(); let criteria = criteria_builder.build( query_tree, primitive_query, filtered_candidates, - sort_criteria, + self.sort_criteria.clone(), )?; match self.index.distinct_field(self.rtxn)? 
{ From d1df0d20f9ef29f169ec070fc5ca302f7aed1fd5 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 18 Aug 2021 16:08:40 +0200 Subject: [PATCH 0914/1889] Add integration test of SortBy criterion --- milli/src/lib.rs | 2 +- milli/tests/assets/test_set.ndjson | 34 +++---- milli/tests/search/distinct.rs | 2 +- milli/tests/search/filters.rs | 2 +- milli/tests/search/mod.rs | 24 ++++- milli/tests/search/query_criteria.rs | 138 ++++++++++++++++++++++----- 6 files changed, 152 insertions(+), 50 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index f3bababf6..2b0bd2ed4 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use std::result::Result as StdResult; use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; -pub use self::criterion::{default_criteria, Criterion}; +pub use self::criterion::{default_criteria, AscDesc, Criterion}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 599d479ed..89d9f1109 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} -{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} -{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"title":"hello world 
song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} -{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","":""} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello kitty","description":"also known by her full name kitty white is 
a fictional character produced by the japanese company sanrio","tag":"green","":""} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} +{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} +{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} +{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays 
the message hello world","tag":"green","":""} diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index ef5af3272..f044756eb 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -32,7 +32,7 @@ macro_rules! test_distinct { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let mut distinct_values = HashSet::new(); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true) + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) .into_iter() .filter_map(|d| { if distinct_values.contains(&d.$distinct) { diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 318197ea3..c810b47af 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -29,7 +29,7 @@ macro_rules! test_filter { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let filtered_ids = search::expected_filtered_ids($filter); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true) + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) .into_iter() .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) .collect(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index b84d3fada..7d4043ff1 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,3 +1,4 @@ +use std::cmp::Reverse; use std::collections::HashSet; use big_s::S; @@ -5,7 +6,7 @@ use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::update::{IndexDocuments, Settings, UpdateFormat}; -use milli::{Criterion, DocumentId, Index}; +use milli::{AscDesc, Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -36,6 +37,10 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), }); + builder.set_sortable_fields(hashset! { + S("tag"), + S("asc_desc_rank"), + }); builder.set_synonyms(hashmap! 
{ S("hello") => vec![S("good morning")], S("world") => vec![S("earth")], @@ -67,6 +72,7 @@ pub fn expected_order( criteria: &[Criterion], authorize_typo: bool, optional_words: bool, + sort_by: &[AscDesc], ) -> Vec { let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); @@ -90,7 +96,14 @@ pub fn expected_order( new_groups .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); } - Criterion::Sort => todo!("sort not supported right now"), + Criterion::Sort if sort_by == [AscDesc::Asc(S("tag"))] => { + group.sort_by_key(|d| d.sort_by_rank); + new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); + } + Criterion::Sort if sort_by == [AscDesc::Desc(S("tag"))] => { + group.sort_by_key(|d| Reverse(d.sort_by_rank)); + new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); + } Criterion::Typo => { group.sort_by_key(|d| d.typo_rank); new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); @@ -105,11 +118,13 @@ pub fn expected_order( .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); } Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { - group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); + group.sort_by_key(|d| Reverse(d.asc_desc_rank)); new_groups .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); } - Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), + Criterion::Asc(_) | Criterion::Desc(_) | Criterion::Sort => { + new_groups.push(group.clone()) + } } } groups = std::mem::take(&mut new_groups); @@ -186,6 +201,7 @@ pub struct TestDocument { pub attribute_rank: u32, pub exact_rank: u32, pub asc_desc_rank: u32, + pub sort_by_rank: u32, pub title: String, pub description: String, pub tag: String, diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f814508f5..1723c1d6f 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,6 +1,6 @@ use big_s::S; use milli::update::Settings; -use milli::{Criterion, Search, SearchResult}; +use milli::{AscDesc, Criterion, Search, SearchResult}; use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -11,7 +11,7 @@ const ALLOW_OPTIONAL_WORDS: bool = true; const DISALLOW_OPTIONAL_WORDS: bool = false; macro_rules! test_criterion { - ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr) => { + ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => { #[test] fn $func() { let criteria = $criteria; @@ -23,82 +23,168 @@ macro_rules! test_criterion { search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos($authorize_typos); search.optional_words($optional_word); + search.sort_criteria($sort_criteria); let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); - let expected_external_ids: Vec<_> = - search::expected_order(&criteria, $authorize_typos, $optional_word) - .into_iter() - .map(|d| d.id) - .collect(); + let expected_external_ids: Vec<_> = search::expected_order( + &criteria, + $authorize_typos, + $optional_word, + &$sort_criteria[..], + ) + .into_iter() + .map(|d| d.id) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); } }; } -test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![]); -test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![]); -test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words]); -test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Attribute]); -test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Attribute]); -test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Exactness]); -test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Exactness]); -test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Proximity]); -test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Proximity]); +test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); +test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![], vec![]); +test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words], vec![]); +test_criterion!( + attribute_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Attribute], + vec![] +); +test_criterion!( + attribute_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Attribute], + vec![] +); +test_criterion!( + exactness_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Exactness], + vec![] +); +test_criterion!( + exactness_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Exactness], + vec![] +); +test_criterion!( + proximity_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Proximity], + vec![] +); +test_criterion!( + proximity_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Proximity], + vec![] +); test_criterion!( asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Asc(S("asc_desc_rank"))] + vec![Asc(S("asc_desc_rank"))], + vec![] ); test_criterion!( asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Asc(S("asc_desc_rank"))] + vec![Asc(S("asc_desc_rank"))], + vec![] ); test_criterion!( desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Desc(S("asc_desc_rank"))] + vec![Desc(S("asc_desc_rank"))], + vec![] ); test_criterion!( desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Desc(S("asc_desc_rank"))] + vec![Desc(S("asc_desc_rank"))], + vec![] ); test_criterion!( asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Asc(S("unexisting_field"))] + vec![Asc(S("unexisting_field"))], + vec![] ); test_criterion!( asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Asc(S("unexisting_field"))] + vec![Asc(S("unexisting_field"))], + vec![] ); test_criterion!( desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Desc(S("unexisting_field"))] + vec![Desc(S("unexisting_field"))], + vec![] ); test_criterion!( desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, 
DISALLOW_TYPOS, - vec![Desc(S("unexisting_field"))] + vec![Desc(S("unexisting_field"))], + vec![] +); +test_criterion!(empty_sort_by_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], vec![]); +test_criterion!( + empty_sort_by_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![] +); +test_criterion!( + sort_by_asc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Asc(S("tag"))] +); +test_criterion!( + sort_by_asc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Asc(S("tag"))] +); +test_criterion!( + sort_by_desc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Desc(S("tag"))] +); +test_criterion!( + sort_by_desc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Desc(S("tag"))] ); test_criterion!( default_criteria_order, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Words, Typo, Proximity, Attribute, Exactness] + vec![Words, Typo, Proximity, Attribute, Exactness], + vec![] ); #[test] @@ -262,7 +348,7 @@ fn criteria_mixup() { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let expected_external_ids: Vec<_> = - search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) + search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, &[]) .into_iter() .map(|d| d.id) .collect(); From 4b99d8cb91573e6b8ffc11d4584af36e8abc05c0 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 19 Aug 2021 15:02:43 +0200 Subject: [PATCH 0915/1889] rewrite the indexing benchmarks --- benchmarks/benches/indexing.rs | 284 +++++++++++++++------------------ 1 file changed, 129 insertions(+), 155 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 902b34dc8..bd056ea23 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -31,48 +31,43 @@ fn setup_index() -> Index { } fn indexing_songs_default(c: &mut Criterion) { - let index = setup_index(); - - let update_builder = UpdateBuilder::new(0); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let index_ref = &index; - let mut group = c.benchmark_group("indexing"); group.sample_size(10); group.bench_function("Indexing songs with default settings", |b| { b.iter_with_setup( move || { + let index = setup_index(); + let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); - builder.execute().unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = + ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| 
s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - () + index }, - move |_| { + move |index| { let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); @@ -82,53 +77,48 @@ fn indexing_songs_default(c: &mut Criterion) { )); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); }, ) }); - - index.prepare_for_closing().wait(); } fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { - let index = setup_index(); - - let update_builder = UpdateBuilder::new(0); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let index_ref = &index; - let mut group = c.benchmark_group("indexing"); group.sample_size(10); group.bench_function("Indexing songs without faceted numbers", |b| { b.iter_with_setup( move || { + let index = setup_index(); + let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); - builder.execute().unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = + ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = + ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - () + index }, - move |_| { + move |index| { let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); 
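// [Editor's note: illustrative aside, not part of the patch above.] The rewrite
// moves every piece of index setup into the first closure of Criterion's
// `iter_with_setup`, so only the second closure (the documents update) is timed
// and each iteration starts from a fresh index. A minimal, self-contained sketch
// of that pattern, assuming only the `criterion` crate; the vector stands in for
// the index built by `setup_index()`:
use criterion::{criterion_group, criterion_main, Criterion};

fn bench_setup_pattern(c: &mut Criterion) {
    let mut group = c.benchmark_group("indexing");
    group.sample_size(10);
    group.bench_function("setup excluded from timing", |b| {
        b.iter_with_setup(
            || (0u64..10_000).collect::<Vec<u64>>(), // untimed, like creating the index
            |data| data.iter().sum::<u64>(),         // timed, like indexing the documents
        )
    });
    group.finish();
}

criterion_group!(benches, bench_setup_pattern);
criterion_main!(benches);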
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); @@ -138,49 +128,44 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { )); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); }, ) }); - index.prepare_for_closing().wait(); } fn indexing_songs_without_faceted_fields(c: &mut Criterion) { - let index = setup_index(); - - let update_builder = UpdateBuilder::new(0); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = ["title", "album", "artist", "genre", "country", "released", "duration"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let index_ref = &index; - let mut group = c.benchmark_group("indexing"); group.sample_size(10); group.bench_function("Indexing songs without any facets", |b| { b.iter_with_setup( move || { + let index = setup_index(); + let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); - builder.execute().unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = + ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - () + index }, - move |_| { + move |index| { let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); @@ -190,49 +175,43 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { )); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); }, ) }); - index.prepare_for_closing().wait(); } fn indexing_wiki(c: &mut Criterion) { - let index = setup_index(); - - let update_builder = UpdateBuilder::new(0); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - // there is NO faceted fields at all - - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let index_ref = &index; - let mut group = c.benchmark_group("indexing"); group.sample_size(10); group.bench_function("Indexing wiki", |b| { b.iter_with_setup( move || { + let index 
= setup_index(); + let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); - builder.execute().unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = + ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + // there is NO faceted fields at all + + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - () + index }, - move |_| { + move |index| { let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, index_ref); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); @@ -242,53 +221,48 @@ fn indexing_wiki(c: &mut Criterion) { )); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); }, ) }); - index.prepare_for_closing().wait(); } fn indexing_movies_default(c: &mut Criterion) { - let index = setup_index(); - - let update_builder = UpdateBuilder::new(0); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = ["title", "poster", "overview", "release_date", "genres"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "overview"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = ["released_date", "genres"].iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(faceted_fields); - - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let index_ref = &index; - let mut group = c.benchmark_group("indexing"); group.sample_size(10); group.bench_function("Indexing movies with default settings", |b| { b.iter_with_setup( move || { + let index = setup_index(); + let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let builder = update_builder.delete_documents(&mut wtxn, index_ref).unwrap(); - builder.execute().unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "poster", "overview", "release_date", "genres"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["title", "overview"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = + ["released_date", "genres"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(faceted_fields); + + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - () + index }, - move |_| { + move |index| { let update_builder = UpdateBuilder::new(0); - let mut wtxn = index_ref.write_txn().unwrap(); - let mut builder = 
update_builder.index_documents(&mut wtxn, index_ref); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::Json); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); @@ -296,11 +270,11 @@ fn indexing_movies_default(c: &mut Criterion) { .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES)); builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); }, ) }); - - index.prepare_for_closing().wait(); } criterion_group!( From 922f9fd4d5cd768bf585dd4c297a110849b52fb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Fri, 20 Aug 2021 18:09:17 +0200 Subject: [PATCH 0916/1889] Revert "Sort at query time" --- benchmarks/benches/search_songs.rs | 8 +- http-ui/src/main.rs | 4 +- milli/Cargo.toml | 1 + milli/src/criterion.rs | 69 ++---- milli/src/error.rs | 10 - milli/src/index.rs | 36 +-- milli/src/lib.rs | 2 +- milli/src/search/criteria/asc_desc.rs | 102 ++------ milli/src/search/criteria/mod.rs | 26 +- milli/src/search/facet/facet_string.rs | 328 ++++--------------------- milli/src/search/mod.rs | 34 +-- milli/src/update/settings.rs | 30 +-- milli/tests/assets/test_set.ndjson | 34 +-- milli/tests/search/distinct.rs | 2 +- milli/tests/search/filters.rs | 2 +- milli/tests/search/mod.rs | 23 +- milli/tests/search/query_criteria.rs | 138 ++--------- 17 files changed, 148 insertions(+), 701 deletions(-) diff --git a/benchmarks/benches/search_songs.rs b/benchmarks/benches/search_songs.rs index 6b11799ec..726040692 100644 --- a/benchmarks/benches/search_songs.rs +++ b/benchmarks/benches/search_songs.rs @@ -52,9 +52,9 @@ fn bench_songs(c: &mut criterion::Criterion) { milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); let default_criterion = default_criterion.iter().map(|s| s.as_str()); let asc_default: Vec<&str> = - std::iter::once("released-timestamp:asc").chain(default_criterion.clone()).collect(); + std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); let desc_default: Vec<&str> = - std::iter::once("released-timestamp:desc").chain(default_criterion.clone()).collect(); + std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); let basic_with_quote: Vec = BASE_CONF .queries @@ -118,12 +118,12 @@ fn bench_songs(c: &mut criterion::Criterion) { }, utils::Conf { group_name: "asc", - criterion: Some(&["released-timestamp:desc"]), + criterion: Some(&["asc(released-timestamp)"]), ..BASE_CONF }, utils::Conf { group_name: "desc", - criterion: Some(&["released-timestamp:desc"]), + criterion: Some(&["desc(released-timestamp)"]), ..BASE_CONF }, diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b34418465..ee32882c0 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1030,7 +1030,7 @@ mod tests { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), filterable_attributes: Setting::Set(hashset! { "age".to_string() }), - criteria: Setting::Set(vec!["age:asc".to_string()]), + criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), synonyms: Setting::Set(hashmap! 
{ "alex".to_string() => vec!["alexey".to_string()] }), }; @@ -1058,7 +1058,7 @@ mod tests { Token::Str("criteria"), Token::Some, Token::Seq { len: Some(1) }, - Token::Str("age:asc"), + Token::Str("asc(age)"), Token::SeqEnd, Token::Str("stopWords"), Token::Some, diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ae9298944..e860454d6 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -25,6 +25,7 @@ obkv = "0.2.0" once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" +regex = "1.4.3" roaring = "0.6.6" serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 47eb7c7dc..cc1fca01f 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,10 +1,15 @@ use std::fmt; use std::str::FromStr; +use once_cell::sync::Lazy; +use regex::Regex; use serde::{Deserialize, Serialize}; use crate::error::{Error, UserError}; +static ASC_DESC_REGEX: Lazy = + Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()); + #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { /// Sorted by decreasing number of matched query terms. @@ -12,13 +17,10 @@ pub enum Criterion { Words, /// Sorted by increasing number of typos. Typo, - /// Dynamically sort at query time the documents. None, one or multiple Asc/Desc sortable - /// attributes can be used in place of this criterion at query time. - Sort, /// Sorted by increasing distance between matched query terms. Proximity, /// Documents with quey words contained in more important - /// attributes are considered better. + /// attributes are considred better. Attribute, /// Sorted by the similarity of the matched words with the query words. Exactness, @@ -41,46 +43,29 @@ impl Criterion { impl FromStr for Criterion { type Err = Error; - fn from_str(text: &str) -> Result { - match text { + fn from_str(txt: &str) -> Result { + match txt { "words" => Ok(Criterion::Words), "typo" => Ok(Criterion::Typo), - "sort" => Ok(Criterion::Sort), "proximity" => Ok(Criterion::Proximity), "attribute" => Ok(Criterion::Attribute), "exactness" => Ok(Criterion::Exactness), - text => match AscDesc::from_str(text) { - Ok(AscDesc::Asc(field)) => Ok(Criterion::Asc(field)), - Ok(AscDesc::Desc(field)) => Ok(Criterion::Desc(field)), - Err(error) => Err(error.into()), - }, - } - } -} - -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] -pub enum AscDesc { - Asc(String), - Desc(String), -} - -impl AscDesc { - pub fn field(&self) -> &str { - match self { - AscDesc::Asc(field) => field, - AscDesc::Desc(field) => field, - } - } -} - -impl FromStr for AscDesc { - type Err = UserError; - - fn from_str(text: &str) -> Result { - match text.rsplit_once(':') { - Some((field_name, "asc")) => Ok(AscDesc::Asc(field_name.to_string())), - Some((field_name, "desc")) => Ok(AscDesc::Desc(field_name.to_string())), - _ => Err(UserError::InvalidCriterionName { name: text.to_string() }), + text => { + let caps = ASC_DESC_REGEX + .captures(text) + .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; + let order = caps.get(1).unwrap().as_str(); + let field_name = caps.get(2).unwrap().as_str(); + match order { + "asc" => Ok(Criterion::Asc(field_name.to_string())), + "desc" => Ok(Criterion::Desc(field_name.to_string())), + text => { + return Err( + UserError::InvalidCriterionName { name: text.to_string() }.into() + ) + } + } + } } } } @@ -89,7 +74,6 @@ pub fn default_criteria() -> Vec { vec![ 
Criterion::Words, Criterion::Typo, - Criterion::Sort, Criterion::Proximity, Criterion::Attribute, Criterion::Exactness, ] } @@ -103,12 +87,11 @@ impl fmt::Display for Criterion { match self { Words => f.write_str("words"), Typo => f.write_str("typo"), - Sort => f.write_str("sort"), Proximity => f.write_str("proximity"), Attribute => f.write_str("attribute"), Exactness => f.write_str("exactness"), - Asc(attr) => write!(f, "{}:asc", attr), - Desc(attr) => write!(f, "{}:desc", attr), + Asc(attr) => write!(f, "asc({})", attr), + Desc(attr) => write!(f, "desc({})", attr), } } }
diff --git a/milli/src/error.rs b/milli/src/error.rs index 9bda74631..713935869 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -58,7 +58,6 @@ pub enum UserError { InvalidFacetsDistribution { invalid_facets_name: HashSet<String> }, InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), - InvalidSortableAttribute { field: String, valid_fields: HashSet<String> }, InvalidStoreFile, MaxDatabaseSizeReached, MissingDocumentId { document: Object }, @@ -227,15 +226,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco ) } Self::InvalidFilterAttribute(error) => error.fmt(f), - Self::InvalidSortableAttribute { field, valid_fields } => { - let valid_names = - valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", "); - write!( - f, - "Attribute {} is not sortable, available sortable attributes are: {}", - field, valid_names - ) - } Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); write!(f, "document doesn't have an identifier {}", json)
diff --git a/milli/src/index.rs b/milli/src/index.rs index e2ab51a1c..120bcbadf 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -28,7 +28,6 @@ pub mod main_key { pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; - pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; @@ -447,45 +446,13 @@ impl Index { Ok(fields_ids) } - /* sortable fields */ - - /// Writes the sortable fields names in the database. - pub(crate) fn put_sortable_fields( - &self, - wtxn: &mut RwTxn, - fields: &HashSet<String>, - ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::SORTABLE_FIELDS_KEY, fields) - } - - /// Deletes the sortable fields ids in the database. - pub(crate) fn delete_sortable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { - self.main.delete::<_, Str>(wtxn, main_key::SORTABLE_FIELDS_KEY) - } - - /// Returns the sortable fields names. - pub fn sortable_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> { - Ok(self - .main - .get::<_, Str, SerdeJson<_>>(rtxn, main_key::SORTABLE_FIELDS_KEY)? - .unwrap_or_default()) - } - - /// Identical to `sortable_fields`, but returns ids instead. - pub fn sortable_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { - let fields = self.sortable_fields(rtxn)?; - let fields_ids_map = self.fields_ids_map(rtxn)?; - Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) - } - /* faceted documents ids */ /// Returns the faceted fields names. /// - /// Faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. 
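// [Editor's note: illustrative aside, not part of the patch above.] After this
// revert, the faceted set is rebuilt from the filterable fields, the distinct
// field, and the fields named by Asc/Desc criteria; the sortable set is gone.
// A std-only sketch of that union (the function name is hypothetical, not
// milli's API):
use std::collections::HashSet;

fn faceted_union(
    filterable: HashSet<String>,
    distinct_field: Option<String>,
    asc_desc_fields: Vec<String>,
) -> HashSet<String> {
    let mut faceted = filterable;    // start from the filterable fields
    faceted.extend(asc_desc_fields); // add every Asc(field)/Desc(field) criterion
    faceted.extend(distinct_field);  // add the distinct field, if any
    faceted
}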
+ /// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields. pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result> { let filterable_fields = self.filterable_fields(rtxn)?; - let sortable_fields = self.sortable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; let asc_desc_fields = self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { @@ -494,7 +461,6 @@ impl Index { }); let mut faceted_fields = filterable_fields; - faceted_fields.extend(sortable_fields); faceted_fields.extend(asc_desc_fields); if let Some(field) = distinct_field { faceted_fields.insert(field.to_owned()); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 2b0bd2ed4..f3bababf6 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use std::result::Result as StdResult; use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; -pub use self::criterion::{default_criteria, AscDesc, Criterion}; +pub use self::criterion::{default_criteria, Criterion}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6d50c1bb5..4a664d042 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::{FacetNumberIter, FacetStringIter}; +use crate::search::facet::FacetNumberIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -20,7 +20,7 @@ pub struct AscDesc<'t> { rtxn: &'t heed::RoTxn<'t>, field_name: String, field_id: Option, - is_ascending: bool, + ascending: bool, query_tree: Option, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, @@ -53,16 +53,12 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - is_ascending: bool, + ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map.id(&field_name); let faceted_candidates = match field_id { - Some(field_id) => { - let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; - let string_faceted = index.string_faceted_documents_ids(rtxn, field_id)?; - number_faceted | string_faceted - } + Some(field_id) => index.number_faceted_documents_ids(rtxn, field_id)?, None => RoaringBitmap::default(), }; @@ -71,7 +67,7 @@ impl<'t> AscDesc<'t> { rtxn, field_name, field_id, - is_ascending, + ascending, query_tree: None, candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), @@ -91,7 +87,7 @@ impl<'t> Criterion for AscDesc<'t> { loop { debug!( "Facet {}({}) iteration", - if self.is_ascending { "Asc" } else { "Desc" }, + if self.ascending { "Asc" } else { "Desc" }, self.field_name ); @@ -140,7 +136,7 @@ impl<'t> Criterion for AscDesc<'t> { self.index, self.rtxn, field_id, - self.is_ascending, + self.ascending, candidates & &self.faceted_candidates, )?, None => Box::new(std::iter::empty()), @@ -171,49 +167,31 @@ fn facet_ordered<'t>( index: &'t Index, rtxn: &'t heed::RoTxn, field_id: FieldId, - is_ascending: bool, + ascending: bool, candidates: RoaringBitmap, ) -> Result> + 't>> { if candidates.len() <= CANDIDATES_THRESHOLD { - let number_iter = iterative_facet_number_ordered_iter( - index, - rtxn, - field_id, - is_ascending, - candidates.clone(), - )?; - let string_iter = - 
iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; - Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) + let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; + Ok(Box::new(iter.map(Ok)) as Box>) } else { - let facet_number_fn = if is_ascending { + let facet_fn = if ascending { FacetNumberIter::new_reducing } else { FacetNumberIter::new_reverse_reducing }; - let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? - .map(|res| res.map(|(_, docids)| docids)); - - let facet_string_fn = if is_ascending { - FacetStringIter::new_reducing - } else { - FacetStringIter::new_reverse_reducing - }; - let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? - .map(|res| res.map(|(_, _, docids)| docids)); - - Ok(Box::new(number_iter.chain(string_iter))) + let iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } } -/// Fetch the whole list of candidates facet number values one by one and order them by it. +/// Fetch the whole list of candidates facet values one by one and order them by it. /// /// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_number_ordered_iter<'t>( +fn iterative_facet_ordered_iter<'t>( index: &'t Index, rtxn: &'t heed::RoTxn, field_id: FieldId, - is_ascending: bool, + ascending: bool, candidates: RoaringBitmap, ) -> Result + 't> { let mut docids_values = Vec::with_capacity(candidates.len() as usize); @@ -221,14 +199,14 @@ fn iterative_facet_number_ordered_iter<'t>( let left = (field_id, docid, f64::MIN); let right = (field_id, docid, f64::MAX); let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; - let entry = if is_ascending { iter.next() } else { iter.last() }; + let entry = if ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? { docids_values.push((docid, OrderedFloat(value))); } } docids_values.sort_unstable_by_key(|(_, v)| *v); let iter = docids_values.into_iter(); - let iter = if is_ascending { + let iter = if ascending { Box::new(iter) as Box> } else { Box::new(iter.rev()) @@ -238,49 +216,7 @@ fn iterative_facet_number_ordered_iter<'t>( // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 let vec: Vec<_> = iter - .group_by(|(_, v)| *v) - .into_iter() - .map(|(_, ids)| ids.map(|(id, _)| id).collect()) - .collect(); - - Ok(vec.into_iter()) -} - -/// Fetch the whole list of candidates facet string values one by one and order them by it. -/// -/// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_string_ordered_iter<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result + 't> { - let mut docids_values = Vec::with_capacity(candidates.len() as usize); - for docid in candidates.iter() { - let left = (field_id, docid, ""); - let right = (field_id, docid.saturating_add(1), ""); - // FIXME Doing this means that it will never be possible to retrieve - // the document with id 2^32, not sure this is a real problem. - let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?; - let entry = if is_ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), _)) = entry.transpose()? 
{ - docids_values.push((docid, value)); - } - } - docids_values.sort_unstable_by_key(|(_, v)| *v); - let iter = docids_values.into_iter(); - let iter = if is_ascending { - Box::new(iter) as Box> - } else { - Box::new(iter.rev()) - }; - - // The itertools GroupBy iterator doesn't provide an owned version, we are therefore - // required to collect the result into an owned collection (a Vec). - // https://github.com/rust-itertools/itertools/issues/499 - let vec: Vec<_> = iter - .group_by(|(_, v)| *v) + .group_by(|(_, v)| v.clone()) .into_iter() .map(|(_, ids)| ids.map(|(id, _)| id).collect()) .collect(); diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 61b0fe049..2ba3b388f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -12,7 +12,6 @@ use self::r#final::Final; use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; -use crate::criterion::AscDesc as AscDescName; use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; @@ -274,7 +273,6 @@ impl<'t> CriteriaBuilder<'t> { query_tree: Option, primitive_query: Option>, filtered_candidates: Option, - sort_criteria: Option>, ) -> Result> { use crate::criterion::Criterion as Name; @@ -284,30 +282,8 @@ impl<'t> CriteriaBuilder<'t> { Box::new(Initial::new(query_tree, filtered_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { - Name::Words => Box::new(Words::new(self, criterion)), Name::Typo => Box::new(Typo::new(self, criterion)), - Name::Sort => match sort_criteria { - Some(ref sort_criteria) => { - for asc_desc in sort_criteria { - criterion = match asc_desc { - AscDescName::Asc(field) => Box::new(AscDesc::asc( - &self.index, - &self.rtxn, - criterion, - field.to_string(), - )?), - AscDescName::Desc(field) => Box::new(AscDesc::desc( - &self.index, - &self.rtxn, - criterion, - field.to_string(), - )?), - }; - } - criterion - } - None => criterion, - }, + Name::Words => Box::new(Words::new(self, criterion)), Name::Proximity => Box::new(Proximity::new(self, criterion)), Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 927602c98..ed5322607 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -131,7 +131,7 @@ use std::ops::Bound::{Excluded, Included, Unbounded}; use either::{Either, Left, Right}; use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; +use heed::{Database, LazyDecode, RoRange}; use roaring::RoaringBitmap; use crate::heed_codec::facet::{ @@ -206,65 +206,6 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { } } -pub struct FacetStringGroupRevRange<'t> { - iter: RoRevRange< - 't, - FacetLevelValueU32Codec, - LazyDecode>, - >, - end: Bound, -} - -impl<'t> FacetStringGroupRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => 
Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRevRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetStringGroupRevRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - /// An iterator that is used to explore the level 0 of the facets string database. /// /// It yields the facet string and the roaring bitmap associated with it. @@ -339,81 +280,6 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> { } } -pub struct FacetStringLevelZeroRevRange<'t> { - iter: RoRevRange< - 't, - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, -} - -impl<'t> FacetStringLevelZeroRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } - - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; - - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; - - let iter = db - .remap_key_type::() - .rev_range(rtxn, &(left_bound, right_bound))? - .remap_types::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec - >(); - - Ok(FacetStringLevelZeroRevRange { iter }) - } -} - -impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; -type EitherStringRevRange<'t> = - Either, FacetStringLevelZeroRevRange<'t>>; - /// An iterator that is used to explore the facet strings level by level, /// it will only return facets strings that are associated with the /// candidates documents ids given. 
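// [Editor's note: illustrative aside, not part of the patch above.] In reducing
// mode, FacetStringIter subtracts every bitmap it yields from the remaining
// candidates of the current level, so deeper levels never return the same
// document twice. A std-only sketch of that bookkeeping, with HashSet standing
// in for RoaringBitmap (hypothetical helper, not milli's API):
use std::collections::HashSet;

fn reduce_candidates(candidates: &mut HashSet<u32>, yielded: &HashSet<u32>) -> HashSet<u32> {
    // Keep only the yielded ids that are still candidates...
    let kept: HashSet<u32> = candidates.intersection(yielded).copied().collect();
    // ...then remove them so a later, deeper level cannot yield them again.
    for id in &kept {
        candidates.remove(id);
    }
    kept
}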
@@ -421,45 +287,12 @@ pub struct FacetStringIter<'t> { rtxn: &'t heed::RoTxn<'t>, db: Database, field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, + level_iters: + Vec<(RoaringBitmap, Either, FacetStringLevelZeroRange<'t>>)>, must_reduce: bool, } impl<'t> FacetStringIter<'t> { - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: true, - }) - } - - pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Right(highest_reverse_iter))], - must_reduce: true, - }) - } - pub fn new_non_reducing( rtxn: &'t heed::RoTxn, index: &'t Index, @@ -467,12 +300,30 @@ impl<'t> FacetStringIter<'t> { documents_ids: RoaringBitmap, ) -> heed::Result> { let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = match NonZeroU8::new(highest_level) { + Some(highest_level) => Left(FacetStringGroupRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + highest_level, + Unbounded, + Unbounded, + )?), + None => Right(FacetStringLevelZeroRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + Unbounded, + Unbounded, + )?), + }; + Ok(FacetStringIter { rtxn, db, field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], + level_iters: vec![(documents_ids, highest_iter)], must_reduce: false, }) } @@ -489,62 +340,6 @@ impl<'t> FacetStringIter<'t> { .transpose()? 
.map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit } - - fn highest_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } - - fn highest_reverse_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } } impl<'t> Iterator for FacetStringIter<'t> { @@ -553,21 +348,6 @@ impl<'t> Iterator for FacetStringIter<'t> { fn next(&mut self) -> Option { 'outer: loop { let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); - - // We remap the different iterator types to make - // the algorithm less complex to understand. - let last = match last { - Left(ascending) => match ascending { - Left(last) => Left(Left(last)), - Right(last) => Right(Left(last)), - }, - Right(descending) => match descending { - Left(last) => Left(Right(last)), - Right(last) => Right(Right(last)), - }, - }; - match last { Left(last) => { for result in last { @@ -579,50 +359,24 @@ impl<'t> Iterator for FacetStringIter<'t> { *documents_ids -= &docids; } - let result = if is_ascending { - match string_bounds { - Some((left, right)) => { - FacetStringLevelZeroRevRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right) - } - None => FacetStringGroupRevRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Right) - } else { - match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Left) + let result = match string_bounds { + Some((left, right)) => FacetStringLevelZeroRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right), + None => FacetStringGroupRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), }; match result { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 23e5c1834..871f464ef 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,8 +18,6 @@ pub(crate) use self::facet::ParserRule; pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use 
self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; -use crate::criterion::AscDesc; -use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{DocumentId, Index, Result}; @@ -39,7 +37,6 @@ pub struct Search<'a> { filter: Option, offset: usize, limit: usize, - sort_criteria: Option>, optional_words: bool, authorize_typos: bool, words_limit: usize, @@ -54,7 +51,6 @@ impl<'a> Search<'a> { filter: None, offset: 0, limit: 20, - sort_criteria: None, optional_words: true, authorize_typos: true, words_limit: 10, @@ -78,11 +74,6 @@ impl<'a> Search<'a> { self } - pub fn sort_criteria(&mut self, criteria: Vec) -> &mut Search<'a> { - self.sort_criteria = Some(criteria); - self - } - pub fn optional_words(&mut self, value: bool) -> &mut Search<'a> { self.optional_words = value; self @@ -143,29 +134,8 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; - // We check that we are allowed to use the sort criteria, we check - // that they are declared in the sortable fields. - let sortable_fields = self.index.sortable_fields(self.rtxn)?; - if let Some(sort_criteria) = &self.sort_criteria { - for asc_desc in sort_criteria { - let field = asc_desc.field(); - if !sortable_fields.contains(field) { - return Err(UserError::InvalidSortableAttribute { - field: field.to_string(), - valid_fields: sortable_fields, - } - .into()); - } - } - } - let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build( - query_tree, - primitive_query, - filtered_candidates, - self.sort_criteria.clone(), - )?; + let criteria = criteria_builder.build(query_tree, primitive_query, filtered_candidates)?; match self.index.distinct_field(self.rtxn)? { None => self.perform_sort(NoopDistinct, matching_words, criteria), @@ -229,7 +199,6 @@ impl fmt::Debug for Search<'_> { filter, offset, limit, - sort_criteria, optional_words, authorize_typos, words_limit, @@ -241,7 +210,6 @@ impl fmt::Debug for Search<'_> { .field("filter", filter) .field("offset", offset) .field("limit", limit) - .field("sort_criteria", sort_criteria) .field("optional_words", optional_words) .field("authorize_typos", authorize_typos) .field("words_limit", words_limit) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c0b5e4549..07bdfd6fa 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -75,7 +75,6 @@ pub struct Settings<'a, 't, 'u, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, filterable_fields: Setting>, - sortable_fields: Setting>, criteria: Setting>, stop_words: Setting>, distinct_field: Setting, @@ -103,7 +102,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { searchable_fields: Setting::NotSet, displayed_fields: Setting::NotSet, filterable_fields: Setting::NotSet, - sortable_fields: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, distinct_field: Setting::NotSet, @@ -137,10 +135,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.filterable_fields = Setting::Set(names); } - pub fn set_sortable_fields(&mut self, names: HashSet) { - self.sortable_fields = Setting::Set(names); - } - pub fn reset_criteria(&mut self) { self.criteria = Setting::Reset; } @@ -398,23 +392,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - fn update_sortable(&mut self) -> Result<()> { - match self.sortable_fields { - Setting::Set(ref fields) => { - let mut new_fields = HashSet::new(); - for name in fields { - new_fields.insert(name.clone()); 
- } - self.index.put_sortable_fields(self.wtxn, &new_fields)?; - } - Setting::Reset => { - self.index.delete_sortable_fields(self.wtxn)?; - } - Setting::NotSet => (), - } - Ok(()) - } - fn update_criteria(&mut self) -> Result<()> { match self.criteria { Setting::Set(ref fields) => { @@ -469,7 +446,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_displayed()?; self.update_filterable()?; - self.update_sortable()?; self.update_distinct_field()?; self.update_criteria()?; self.update_primary_key()?; @@ -743,7 +719,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); // Don't display the generated `id` field. builder.set_displayed_fields(vec![S("name")]); - builder.set_criteria(vec![S("age:asc")]); + builder.set_criteria(vec![S("asc(age)")]); builder.execute(|_, _| ()).unwrap(); // Then index some documents. @@ -977,7 +953,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); builder.set_filterable_fields(hashset! { S("age"), S("toto") }); - builder.set_criteria(vec!["toto:asc".to_string()]); + builder.set_criteria(vec!["asc(toto)".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1014,7 +990,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); // It is only Asc(toto), there is a facet database but it is denied to filter with toto. - builder.set_criteria(vec!["toto:asc".to_string()]); + builder.set_criteria(vec!["asc(toto)".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 89d9f1109..599d479ed 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} -{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop 
orchestra","tag":"blue","":""} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} -{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} -{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","":""} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} 
+{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} +{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} 
+{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} +{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} +{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","":""} diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index f044756eb..ef5af3272 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -32,7 +32,7 @@ macro_rules! test_distinct { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let mut distinct_values = HashSet::new(); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true) .into_iter() .filter_map(|d| { if distinct_values.contains(&d.$distinct) { diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index c810b47af..318197ea3 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -29,7 +29,7 @@ macro_rules! test_filter { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let filtered_ids = search::expected_filtered_ids($filter); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true) .into_iter() .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) .collect(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 7d4043ff1..c5724a921 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,4 +1,3 @@ -use std::cmp::Reverse; use std::collections::HashSet; use big_s::S; @@ -6,7 +5,7 @@ use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::update::{IndexDocuments, Settings, UpdateFormat}; -use milli::{AscDesc, Criterion, DocumentId, Index}; +use milli::{Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -37,10 +36,6 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), }); - builder.set_sortable_fields(hashset! { - S("tag"), - S("asc_desc_rank"), - }); builder.set_synonyms(hashmap! 
{ S("hello") => vec![S("good morning")], S("world") => vec![S("earth")], @@ -72,7 +67,6 @@ pub fn expected_order( criteria: &[Criterion], authorize_typo: bool, optional_words: bool, - sort_by: &[AscDesc], ) -> Vec { let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); @@ -96,14 +90,6 @@ pub fn expected_order( new_groups .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); } - Criterion::Sort if sort_by == [AscDesc::Asc(S("tag"))] => { - group.sort_by_key(|d| d.sort_by_rank); - new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); - } - Criterion::Sort if sort_by == [AscDesc::Desc(S("tag"))] => { - group.sort_by_key(|d| Reverse(d.sort_by_rank)); - new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); - } Criterion::Typo => { group.sort_by_key(|d| d.typo_rank); new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); @@ -118,13 +104,11 @@ pub fn expected_order( .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); } Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { - group.sort_by_key(|d| Reverse(d.asc_desc_rank)); + group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); new_groups .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); } - Criterion::Asc(_) | Criterion::Desc(_) | Criterion::Sort => { - new_groups.push(group.clone()) - } + Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), } } groups = std::mem::take(&mut new_groups); @@ -201,7 +185,6 @@ pub struct TestDocument { pub attribute_rank: u32, pub exact_rank: u32, pub asc_desc_rank: u32, - pub sort_by_rank: u32, pub title: String, pub description: String, pub tag: String, diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 1723c1d6f..f814508f5 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,6 +1,6 @@ use big_s::S; use milli::update::Settings; -use milli::{AscDesc, Criterion, Search, SearchResult}; +use milli::{Criterion, Search, SearchResult}; use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -11,7 +11,7 @@ const ALLOW_OPTIONAL_WORDS: bool = true; const DISALLOW_OPTIONAL_WORDS: bool = false; macro_rules! test_criterion { - ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => { + ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr) => { #[test] fn $func() { let criteria = $criteria; @@ -23,168 +23,82 @@ macro_rules! test_criterion { search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos($authorize_typos); search.optional_words($optional_word); - search.sort_criteria($sort_criteria); let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); - let expected_external_ids: Vec<_> = search::expected_order( - &criteria, - $authorize_typos, - $optional_word, - &$sort_criteria[..], - ) - .into_iter() - .map(|d| d.id) - .collect(); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, $authorize_typos, $optional_word) + .into_iter() + .map(|d| d.id) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); } }; } -test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); -test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![], vec![]); -test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words], vec![]); -test_criterion!( - attribute_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Attribute], - vec![] -); -test_criterion!( - attribute_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Attribute], - vec![] -); -test_criterion!( - exactness_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Exactness], - vec![] -); -test_criterion!( - exactness_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Exactness], - vec![] -); -test_criterion!( - proximity_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Proximity], - vec![] -); -test_criterion!( - proximity_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Proximity], - vec![] -); +test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![]); +test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![]); +test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words]); +test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Attribute]); +test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Attribute]); +test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Exactness]); +test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Exactness]); +test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Proximity]); +test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Proximity]); test_criterion!( asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Asc(S("asc_desc_rank"))], - vec![] + vec![Asc(S("asc_desc_rank"))] ); test_criterion!( asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Asc(S("asc_desc_rank"))], - vec![] + vec![Asc(S("asc_desc_rank"))] ); test_criterion!( desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Desc(S("asc_desc_rank"))], - vec![] + vec![Desc(S("asc_desc_rank"))] ); test_criterion!( desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Desc(S("asc_desc_rank"))], - vec![] + vec![Desc(S("asc_desc_rank"))] ); test_criterion!( asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Asc(S("unexisting_field"))], - vec![] + vec![Asc(S("unexisting_field"))] ); test_criterion!( asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Asc(S("unexisting_field"))], - vec![] + vec![Asc(S("unexisting_field"))] ); test_criterion!( desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Desc(S("unexisting_field"))], - vec![] + vec![Desc(S("unexisting_field"))] ); test_criterion!( desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, 
DISALLOW_TYPOS, - vec![Desc(S("unexisting_field"))], - vec![] -); -test_criterion!(empty_sort_by_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], vec![]); -test_criterion!( - empty_sort_by_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Sort], - vec![] -); -test_criterion!( - sort_by_asc_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Sort], - vec![AscDesc::Asc(S("tag"))] -); -test_criterion!( - sort_by_asc_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Sort], - vec![AscDesc::Asc(S("tag"))] -); -test_criterion!( - sort_by_desc_allow_typo, - DISALLOW_OPTIONAL_WORDS, - ALLOW_TYPOS, - vec![Sort], - vec![AscDesc::Desc(S("tag"))] -); -test_criterion!( - sort_by_desc_disallow_typo, - DISALLOW_OPTIONAL_WORDS, - DISALLOW_TYPOS, - vec![Sort], - vec![AscDesc::Desc(S("tag"))] + vec![Desc(S("unexisting_field"))] ); test_criterion!( default_criteria_order, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Words, Typo, Proximity, Attribute, Exactness], - vec![] + vec![Words, Typo, Proximity, Attribute, Exactness] ); #[test] @@ -348,7 +262,7 @@ fn criteria_mixup() { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let expected_external_ids: Vec<_> = - search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, &[]) + search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) .into_iter() .map(|d| d.id) .collect(); From c084f7f731cd1eabb128a29793b7228fe54691ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 20 Aug 2021 17:32:02 +0200 Subject: [PATCH 0917/1889] Fix the facet string docids filterable deletion bug --- milli/src/update/delete_documents.rs | 132 +++++++++++++++++++++++---- 1 file changed, 112 insertions(+), 20 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 5e1f7c6cb..e18c6bbd1 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -4,12 +4,15 @@ use std::collections::HashMap; use chrono::Utc; use fst::IntoStreamer; use heed::types::ByteSlice; +use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; use serde_json::Value; use super::ClearDocuments; -use crate::error::{InternalError, UserError}; -use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; +use crate::error::{InternalError, SerializationError, UserError}; +use crate::heed_codec::facet::{ + FacetLevelValueU32Codec, FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, +}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; @@ -451,26 +454,61 @@ where Ok(()) } -fn remove_docids_from_facet_field_id_string_docids<'a, C>( +fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( wtxn: &'a mut heed::RwTxn, - db: &heed::Database>, + db: &heed::Database, to_remove: &RoaringBitmap, -) -> heed::Result<()> -where - C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, -{ - let mut iter = db.remap_key_type::().iter_mut(wtxn)?; +) -> crate::Result<()> { + let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); + let mut iter = db.remap_types::().iter_mut(wtxn)?; while let Some(result) = iter.next() { - let (bytes, (original_value, mut docids)) = result?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? 
}; - } else if docids.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &(original_value, docids))? }; + let (key, val) = result?; + match FacetLevelValueU32Codec::bytes_decode(key) { + Some(_) => { + // If we are able to parse this key it means it is a facet string group + // level key. We must then parse the value using the appropriate codec. + let (group, mut docids) = + FacetStringZeroBoundsValueCodec::::bytes_decode(val) + .ok_or_else(|| SerializationError::Decoding { db_name })?; + + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + let val = &(group, docids); + let value_bytes = + FacetStringZeroBoundsValueCodec::::bytes_encode(val) + .ok_or_else(|| SerializationError::Encoding { db_name })?; + + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &value_bytes)? }; + } + } + None => { + // The key corresponds to a level zero facet string. + let (original_value, mut docids) = + FacetStringLevelZeroValueCodec::::bytes_decode(val) + .ok_or_else(|| SerializationError::Decoding { db_name })?; + + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + let val = &(original_value, docids); + let value_bytes = + FacetStringLevelZeroValueCodec::::bytes_encode(val) + .ok_or_else(|| SerializationError::Encoding { db_name })?; + + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &value_bytes)? }; + } + } } } @@ -505,10 +543,13 @@ where #[cfg(test)] mod tests { + use big_s::S; use heed::EnvOpenOptions; + use maplit::hashset; use super::*; - use crate::update::{IndexDocuments, UpdateFormat}; + use crate::update::{IndexDocuments, Settings, UpdateFormat}; + use crate::FilterCondition; #[test] fn delete_documents_with_numbers_as_primary_key() { @@ -566,4 +607,55 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn delete_documents_with_filterable_attributes() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("docid")); + builder.set_filterable_fields(hashset! 
{ S("label") }); + builder.execute(|_, _| ()).unwrap(); + + let content = &br#"[ + {"docid":"1_4","label":"sign"}, + {"docid":"1_5","label":"letter"}, + {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, + {"docid":"1_36","label":"drawing,painting,pattern"}, + {"docid":"1_37","label":"art,drawing,outdoor"}, + {"docid":"1_38","label":"aquarium,art,drawing"}, + {"docid":"1_39","label":"abstract"}, + {"docid":"1_40","label":"cartoon"}, + {"docid":"1_41","label":"art,drawing"}, + {"docid":"1_42","label":"art,pattern"}, + {"docid":"1_43","label":"abstract,art,drawing,pattern"}, + {"docid":"1_44","label":"drawing"}, + {"docid":"1_45","label":"art"}, + {"docid":"1_46","label":"abstract,colorfulness,pattern"}, + {"docid":"1_47","label":"abstract,pattern"}, + {"docid":"1_52","label":"abstract,cartoon"}, + {"docid":"1_57","label":"abstract,drawing,pattern"}, + {"docid":"1_58","label":"abstract,art,cartoon"}, + {"docid":"1_68","label":"design"}, + {"docid":"1_69","label":"geometry"} + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + + // Delete not all of the documents but some of them. + let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + builder.delete_external_id("1_4"); + builder.execute().unwrap(); + + let filter = FilterCondition::from_str(&wtxn, &index, "label = sign").unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(results.documents_ids.is_empty()); + + wtxn.commit().unwrap(); + } } From 88f6c18665693ac3891142acf972ab42f26acb96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 23 Aug 2021 11:33:30 +0200 Subject: [PATCH 0918/1889] Update version for the next release (v0.10.2) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index fac6ed6b5..f9bc3b1c5 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.10.1" +version = "0.10.2" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index b00168f35..6c7fe7e28 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.10.1" +version = "0.10.2" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 76456418e..304368fcb 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.10.1" +version = "0.10.2" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index e860454d6..4bec3d69d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.10.1" +version = "0.10.2" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index b2a9c9d42..f4de00122 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.10.1" +version = "0.10.2" authors = ["Clément Renault "] edition = "2018" From 89d075871362222de1205af4a1ebfa6a80a29d08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 23 Aug 2021 11:37:18 +0200 Subject: [PATCH 0919/1889] 
Revert "Revert "Sort at query time"" --- benchmarks/benches/search_songs.rs | 8 +- http-ui/src/main.rs | 4 +- milli/Cargo.toml | 1 - milli/src/criterion.rs | 69 ++++-- milli/src/error.rs | 10 + milli/src/index.rs | 36 ++- milli/src/lib.rs | 2 +- milli/src/search/criteria/asc_desc.rs | 102 ++++++-- milli/src/search/criteria/mod.rs | 26 +- milli/src/search/facet/facet_string.rs | 328 +++++++++++++++++++++---- milli/src/search/mod.rs | 34 ++- milli/src/update/settings.rs | 30 ++- milli/tests/assets/test_set.ndjson | 34 +-- milli/tests/search/distinct.rs | 2 +- milli/tests/search/filters.rs | 2 +- milli/tests/search/mod.rs | 23 +- milli/tests/search/query_criteria.rs | 138 +++++++++-- 17 files changed, 701 insertions(+), 148 deletions(-) diff --git a/benchmarks/benches/search_songs.rs b/benchmarks/benches/search_songs.rs index 726040692..6b11799ec 100644 --- a/benchmarks/benches/search_songs.rs +++ b/benchmarks/benches/search_songs.rs @@ -52,9 +52,9 @@ fn bench_songs(c: &mut criterion::Criterion) { milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect(); let default_criterion = default_criterion.iter().map(|s| s.as_str()); let asc_default: Vec<&str> = - std::iter::once("asc(released-timestamp)").chain(default_criterion.clone()).collect(); + std::iter::once("released-timestamp:asc").chain(default_criterion.clone()).collect(); let desc_default: Vec<&str> = - std::iter::once("desc(released-timestamp)").chain(default_criterion.clone()).collect(); + std::iter::once("released-timestamp:desc").chain(default_criterion.clone()).collect(); let basic_with_quote: Vec = BASE_CONF .queries @@ -118,12 +118,12 @@ fn bench_songs(c: &mut criterion::Criterion) { }, utils::Conf { group_name: "asc", - criterion: Some(&["asc(released-timestamp)"]), + criterion: Some(&["released-timestamp:desc"]), ..BASE_CONF }, utils::Conf { group_name: "desc", - criterion: Some(&["desc(released-timestamp)"]), + criterion: Some(&["released-timestamp:desc"]), ..BASE_CONF }, diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ee32882c0..b34418465 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1030,7 +1030,7 @@ mod tests { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), filterable_attributes: Setting::Set(hashset! { "age".to_string() }), - criteria: Setting::Set(vec!["asc(age)".to_string()]), + criteria: Setting::Set(vec!["age:asc".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), synonyms: Setting::Set(hashmap! 
{ "alex".to_string() => vec!["alexey".to_string()] }), }; @@ -1058,7 +1058,7 @@ mod tests { Token::Str("criteria"), Token::Some, Token::Seq { len: Some(1) }, - Token::Str("asc(age)"), + Token::Str("age:asc"), Token::SeqEnd, Token::Str("stopWords"), Token::Some, diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 4bec3d69d..0c6fc6763 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -25,7 +25,6 @@ obkv = "0.2.0" once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" -regex = "1.4.3" roaring = "0.6.6" serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index cc1fca01f..47eb7c7dc 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,15 +1,10 @@ use std::fmt; use std::str::FromStr; -use once_cell::sync::Lazy; -use regex::Regex; use serde::{Deserialize, Serialize}; use crate::error::{Error, UserError}; -static ASC_DESC_REGEX: Lazy = - Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap()); - #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { /// Sorted by decreasing number of matched query terms. @@ -17,10 +12,13 @@ pub enum Criterion { Words, /// Sorted by increasing number of typos. Typo, + /// Dynamically sort at query time the documents. None, one or multiple Asc/Desc sortable + /// attributes can be used in place of this criterion at query time. + Sort, /// Sorted by increasing distance between matched query terms. Proximity, /// Documents with quey words contained in more important - /// attributes are considred better. + /// attributes are considered better. Attribute, /// Sorted by the similarity of the matched words with the query words. Exactness, @@ -43,29 +41,46 @@ impl Criterion { impl FromStr for Criterion { type Err = Error; - fn from_str(txt: &str) -> Result { - match txt { + fn from_str(text: &str) -> Result { + match text { "words" => Ok(Criterion::Words), "typo" => Ok(Criterion::Typo), + "sort" => Ok(Criterion::Sort), "proximity" => Ok(Criterion::Proximity), "attribute" => Ok(Criterion::Attribute), "exactness" => Ok(Criterion::Exactness), - text => { - let caps = ASC_DESC_REGEX - .captures(text) - .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; - let order = caps.get(1).unwrap().as_str(); - let field_name = caps.get(2).unwrap().as_str(); - match order { - "asc" => Ok(Criterion::Asc(field_name.to_string())), - "desc" => Ok(Criterion::Desc(field_name.to_string())), - text => { - return Err( - UserError::InvalidCriterionName { name: text.to_string() }.into() - ) - } - } - } + text => match AscDesc::from_str(text) { + Ok(AscDesc::Asc(field)) => Ok(Criterion::Asc(field)), + Ok(AscDesc::Desc(field)) => Ok(Criterion::Desc(field)), + Err(error) => Err(error.into()), + }, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub enum AscDesc { + Asc(String), + Desc(String), +} + +impl AscDesc { + pub fn field(&self) -> &str { + match self { + AscDesc::Asc(field) => field, + AscDesc::Desc(field) => field, + } + } +} + +impl FromStr for AscDesc { + type Err = UserError; + + fn from_str(text: &str) -> Result { + match text.rsplit_once(':') { + Some((field_name, "asc")) => Ok(AscDesc::Asc(field_name.to_string())), + Some((field_name, "desc")) => Ok(AscDesc::Desc(field_name.to_string())), + _ => Err(UserError::InvalidCriterionName { name: text.to_string() }), } } } @@ -74,6 +89,7 @@ pub fn default_criteria() -> Vec { vec![ 
Criterion::Words, Criterion::Typo, + Criterion::Sort, Criterion::Proximity, Criterion::Attribute, Criterion::Exactness, @@ -87,11 +103,12 @@ impl fmt::Display for Criterion { match self { Words => f.write_str("words"), Typo => f.write_str("typo"), + Sort => f.write_str("sort"), Proximity => f.write_str("proximity"), Attribute => f.write_str("attribute"), Exactness => f.write_str("exactness"), - Asc(attr) => write!(f, "asc({})", attr), - Desc(attr) => write!(f, "desc({})", attr), + Asc(attr) => write!(f, "{}:asc", attr), + Desc(attr) => write!(f, "{}:desc", attr), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 713935869..9bda74631 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -58,6 +58,7 @@ pub enum UserError { InvalidFacetsDistribution { invalid_facets_name: HashSet }, InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), + InvalidSortableAttribute { field: String, valid_fields: HashSet }, InvalidStoreFile, MaxDatabaseSizeReached, MissingDocumentId { document: Object }, @@ -226,6 +227,15 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco ) } Self::InvalidFilterAttribute(error) => error.fmt(f), + Self::InvalidSortableAttribute { field, valid_fields } => { + let valid_names = + valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); + write!( + f, + "Attribute {} is not sortable, available sortable attributes are: {}", + field, valid_names + ) + } Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); write!(f, "document doesn't have an identifier {}", json) diff --git a/milli/src/index.rs b/milli/src/index.rs index 120bcbadf..e2ab51a1c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -28,6 +28,7 @@ pub mod main_key { pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; + pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; @@ -446,13 +447,45 @@ impl Index { Ok(fields_ids) } + /* sortable fields */ + + /// Writes the sortable fields names in the database. + pub(crate) fn put_sortable_fields( + &self, + wtxn: &mut RwTxn, + fields: &HashSet, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::SORTABLE_FIELDS_KEY, fields) + } + + /// Deletes the sortable fields ids in the database. + pub(crate) fn delete_sortable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::SORTABLE_FIELDS_KEY) + } + + /// Returns the sortable fields names. + pub fn sortable_fields(&self, rtxn: &RoTxn) -> heed::Result> { + Ok(self + .main + .get::<_, Str, SerdeJson<_>>(rtxn, main_key::SORTABLE_FIELDS_KEY)? + .unwrap_or_default()) + } + + /// Identical to `sortable_fields`, but returns ids instead. + pub fn sortable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.sortable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) + } + /* faceted documents ids */ /// Returns the faceted fields names. /// - /// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields. 
+ /// Faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result> { let filterable_fields = self.filterable_fields(rtxn)?; + let sortable_fields = self.sortable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; let asc_desc_fields = self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { @@ -461,6 +494,7 @@ impl Index { }); let mut faceted_fields = filterable_fields; + faceted_fields.extend(sortable_fields); faceted_fields.extend(asc_desc_fields); if let Some(field) = distinct_field { faceted_fields.insert(field.to_owned()); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index f3bababf6..2b0bd2ed4 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use std::result::Result as StdResult; use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; -pub use self::criterion::{default_criteria, Criterion}; +pub use self::criterion::{default_criteria, AscDesc, Criterion}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 4a664d042..6d50c1bb5 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::FacetNumberIter; +use crate::search::facet::{FacetNumberIter, FacetStringIter}; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -20,7 +20,7 @@ pub struct AscDesc<'t> { rtxn: &'t heed::RoTxn<'t>, field_name: String, field_id: Option, - ascending: bool, + is_ascending: bool, query_tree: Option, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, @@ -53,12 +53,16 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ascending: bool, + is_ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map.id(&field_name); let faceted_candidates = match field_id { - Some(field_id) => index.number_faceted_documents_ids(rtxn, field_id)?, + Some(field_id) => { + let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; + let string_faceted = index.string_faceted_documents_ids(rtxn, field_id)?; + number_faceted | string_faceted + } None => RoaringBitmap::default(), }; @@ -67,7 +71,7 @@ impl<'t> AscDesc<'t> { rtxn, field_name, field_id, - ascending, + is_ascending, query_tree: None, candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), @@ -87,7 +91,7 @@ impl<'t> Criterion for AscDesc<'t> { loop { debug!( "Facet {}({}) iteration", - if self.ascending { "Asc" } else { "Desc" }, + if self.is_ascending { "Asc" } else { "Desc" }, self.field_name ); @@ -136,7 +140,7 @@ impl<'t> Criterion for AscDesc<'t> { self.index, self.rtxn, field_id, - self.ascending, + self.is_ascending, candidates & &self.faceted_candidates, )?, None => Box::new(std::iter::empty()), @@ -167,31 +171,49 @@ fn facet_ordered<'t>( index: &'t Index, rtxn: &'t heed::RoTxn, field_id: FieldId, - ascending: bool, + is_ascending: bool, candidates: RoaringBitmap, ) -> Result> + 't>> { if candidates.len() <= CANDIDATES_THRESHOLD { - let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; - Ok(Box::new(iter.map(Ok)) as Box>) + let number_iter = 
iterative_facet_number_ordered_iter( + index, + rtxn, + field_id, + is_ascending, + candidates.clone(), + )?; + let string_iter = + iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; + Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) } else { - let facet_fn = if ascending { + let facet_number_fn = if is_ascending { FacetNumberIter::new_reducing } else { FacetNumberIter::new_reverse_reducing }; - let iter = facet_fn(rtxn, index, field_id, candidates)?; - Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) + let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? + .map(|res| res.map(|(_, docids)| docids)); + + let facet_string_fn = if is_ascending { + FacetStringIter::new_reducing + } else { + FacetStringIter::new_reverse_reducing + }; + let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? + .map(|res| res.map(|(_, _, docids)| docids)); + + Ok(Box::new(number_iter.chain(string_iter))) } } -/// Fetch the whole list of candidates facet values one by one and order them by it. +/// Fetch the whole list of candidates facet number values one by one and order them by it. /// /// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_ordered_iter<'t>( +fn iterative_facet_number_ordered_iter<'t>( index: &'t Index, rtxn: &'t heed::RoTxn, field_id: FieldId, - ascending: bool, + is_ascending: bool, candidates: RoaringBitmap, ) -> Result + 't> { let mut docids_values = Vec::with_capacity(candidates.len() as usize); @@ -199,14 +221,14 @@ fn iterative_facet_ordered_iter<'t>( let left = (field_id, docid, f64::MIN); let right = (field_id, docid, f64::MAX); let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; - let entry = if ascending { iter.next() } else { iter.last() }; + let entry = if is_ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? { docids_values.push((docid, OrderedFloat(value))); } } docids_values.sort_unstable_by_key(|(_, v)| *v); let iter = docids_values.into_iter(); - let iter = if ascending { + let iter = if is_ascending { Box::new(iter) as Box> } else { Box::new(iter.rev()) @@ -216,7 +238,49 @@ fn iterative_facet_ordered_iter<'t>( // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 let vec: Vec<_> = iter - .group_by(|(_, v)| v.clone()) + .group_by(|(_, v)| *v) + .into_iter() + .map(|(_, ids)| ids.map(|(id, _)| id).collect()) + .collect(); + + Ok(vec.into_iter()) +} + +/// Fetch the whole list of candidates facet string values one by one and order them by it. +/// +/// This function is fast when the amount of candidates to rank is small. +fn iterative_facet_string_ordered_iter<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result + 't> { + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, ""); + let right = (field_id, docid.saturating_add(1), ""); + // FIXME Doing this means that it will never be possible to retrieve + // the document with id 2^32, not sure this is a real problem. + let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?; + let entry = if is_ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), _)) = entry.transpose()? 
{ + docids_values.push((docid, value)); + } + } + docids_values.sort_unstable_by_key(|(_, v)| *v); + let iter = docids_values.into_iter(); + let iter = if is_ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + + // The itertools GroupBy iterator doesn't provide an owned version, we are therefore + // required to collect the result into an owned collection (a Vec). + // https://github.com/rust-itertools/itertools/issues/499 + let vec: Vec<_> = iter + .group_by(|(_, v)| *v) .into_iter() .map(|(_, ids)| ids.map(|(id, _)| id).collect()) .collect(); diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 2ba3b388f..61b0fe049 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -12,6 +12,7 @@ use self::r#final::Final; use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; +use crate::criterion::AscDesc as AscDescName; use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; @@ -273,6 +274,7 @@ impl<'t> CriteriaBuilder<'t> { query_tree: Option, primitive_query: Option>, filtered_candidates: Option, + sort_criteria: Option>, ) -> Result> { use crate::criterion::Criterion as Name; @@ -282,8 +284,30 @@ impl<'t> CriteriaBuilder<'t> { Box::new(Initial::new(query_tree, filtered_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { - Name::Typo => Box::new(Typo::new(self, criterion)), Name::Words => Box::new(Words::new(self, criterion)), + Name::Typo => Box::new(Typo::new(self, criterion)), + Name::Sort => match sort_criteria { + Some(ref sort_criteria) => { + for asc_desc in sort_criteria { + criterion = match asc_desc { + AscDescName::Asc(field) => Box::new(AscDesc::asc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?), + AscDescName::Desc(field) => Box::new(AscDesc::desc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?), + }; + } + criterion + } + None => criterion, + }, Name::Proximity => Box::new(Proximity::new(self, criterion)), Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index ed5322607..927602c98 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -131,7 +131,7 @@ use std::ops::Bound::{Excluded, Included, Unbounded}; use either::{Either, Left, Right}; use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange}; +use heed::{Database, LazyDecode, RoRange, RoRevRange}; use roaring::RoaringBitmap; use crate::heed_codec::facet::{ @@ -206,6 +206,65 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { } } +pub struct FacetStringGroupRevRange<'t> { + iter: RoRevRange< + 't, + FacetLevelValueU32Codec, + LazyDecode>, + >, + end: Bound, +} + +impl<'t> FacetStringGroupRevRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: NonZeroU8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let db = db.remap_types::< + FacetLevelValueU32Codec, + FacetStringZeroBoundsValueCodec, + >(); + let left_bound = match left { + Included(left) => Included((field_id, level, left, u32::MIN)), + Excluded(left) => Excluded((field_id, level, left, u32::MIN)), + Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), + 
}; + let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); + let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; + Ok(FacetStringGroupRevRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetStringGroupRevRange<'t> { + type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), + Err(e) => Some(Err(e)), + } + } else { + None + } + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + /// An iterator that is used to explore the level 0 of the facets string database. /// /// It yields the facet string and the roaring bitmap associated with it. @@ -280,6 +339,81 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> { } } +pub struct FacetStringLevelZeroRevRange<'t> { + iter: RoRevRange< + 't, + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, +} + +impl<'t> FacetStringLevelZeroRevRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + left: Bound<&str>, + right: Bound<&str>, + ) -> heed::Result> { + fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { + buffer.extend_from_slice(&field_id.to_be_bytes()); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + &buffer[..] + } + + let mut left_buffer = Vec::new(); + let left_bound = match left { + Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), + Unbounded => { + left_buffer.extend_from_slice(&field_id.to_be_bytes()); + left_buffer.push(0); + Included(&left_buffer[..]) + } + }; + + let mut right_buffer = Vec::new(); + let right_bound = match right { + Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), + Unbounded => { + right_buffer.extend_from_slice(&field_id.to_be_bytes()); + right_buffer.push(1); // we must only get the level 0 + Excluded(&right_buffer[..]) + } + }; + + let iter = db + .remap_key_type::() + .rev_range(rtxn, &(left_bound, right_bound))? + .remap_types::< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec + >(); + + Ok(FacetStringLevelZeroRevRange { iter }) + } +} + +impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, normalized), (original, docids)))) => { + Some(Ok((normalized, original, docids))) + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; +type EitherStringRevRange<'t> = + Either, FacetStringLevelZeroRevRange<'t>>; + /// An iterator that is used to explore the facet strings level by level, /// it will only return facets strings that are associated with the /// candidates documents ids given. 
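The new `FacetStringGroupRevRange` and `FacetStringLevelZeroRevRange` iterators above mirror their ascending counterparts so that a `field:desc` sort can walk the facet string levels from the other end. A minimal sketch of the underlying idea, using a plain `BTreeMap` in place of the LMDB-backed heed ranges (the `facet_ordered` helper and its map are invented for illustration; the real code additionally descends group levels):

```rust
use std::collections::BTreeMap;

/// Walk an ordered facet-value -> docids map in either direction.
/// Simplified stand-in for FacetStringIter: the real iterators range
/// over LMDB level by level, but the core idea is the same — pick a
/// forward or reverse range depending on the requested sort direction.
fn facet_ordered<'a>(
    facet_docids: &'a BTreeMap<String, Vec<u32>>,
    is_ascending: bool,
) -> Box<dyn Iterator<Item = (&'a String, &'a Vec<u32>)> + 'a> {
    if is_ascending {
        Box::new(facet_docids.iter())
    } else {
        Box::new(facet_docids.iter().rev())
    }
}

fn main() {
    let mut facet_docids = BTreeMap::new();
    facet_docids.insert("blue".to_string(), vec![1, 3]);
    facet_docids.insert("green".to_string(), vec![2]);
    facet_docids.insert("red".to_string(), vec![4]);

    // Descending: yields "red", then "green", then "blue".
    for (value, docids) in facet_ordered(&facet_docids, false) {
        println!("{value}: {docids:?}");
    }
}
```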
@@ -287,12 +421,45 @@ pub struct FacetStringIter<'t> { rtxn: &'t heed::RoTxn<'t>, db: Database, field_id: FieldId, - level_iters: - Vec<(RoaringBitmap, Either, FacetStringLevelZeroRange<'t>>)>, + level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, must_reduce: bool, } impl<'t> FacetStringIter<'t> { + pub fn new_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_string_docids.remap_types::(); + let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; + Ok(FacetStringIter { + rtxn, + db, + field_id, + level_iters: vec![(documents_ids, Left(highest_iter))], + must_reduce: true, + }) + } + + pub fn new_reverse_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_string_docids.remap_types::(); + let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; + Ok(FacetStringIter { + rtxn, + db, + field_id, + level_iters: vec![(documents_ids, Right(highest_reverse_iter))], + must_reduce: true, + }) + } + pub fn new_non_reducing( rtxn: &'t heed::RoTxn, index: &'t Index, @@ -300,30 +467,12 @@ impl<'t> FacetStringIter<'t> { documents_ids: RoaringBitmap, ) -> heed::Result> { let db = index.facet_id_string_docids.remap_types::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = match NonZeroU8::new(highest_level) { - Some(highest_level) => Left(FacetStringGroupRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - )?), - None => Right(FacetStringLevelZeroRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - )?), - }; - + let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; Ok(FacetStringIter { rtxn, db, field_id, - level_iters: vec![(documents_ids, highest_iter)], + level_iters: vec![(documents_ids, Left(highest_iter))], must_reduce: false, }) } @@ -340,6 +489,62 @@ impl<'t> FacetStringIter<'t> { .transpose()? 
.map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit } + + fn highest_iter( + rtxn: &'t heed::RoTxn, + index: &'t Index, + db: Database, + field_id: FieldId, + ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + match NonZeroU8::new(highest_level) { + Some(highest_level) => FacetStringGroupRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + highest_level, + Unbounded, + Unbounded, + ) + .map(Left), + None => FacetStringLevelZeroRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + Unbounded, + Unbounded, + ) + .map(Right), + } + } + + fn highest_reverse_iter( + rtxn: &'t heed::RoTxn, + index: &'t Index, + db: Database, + field_id: FieldId, + ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + match NonZeroU8::new(highest_level) { + Some(highest_level) => FacetStringGroupRevRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + highest_level, + Unbounded, + Unbounded, + ) + .map(Left), + None => FacetStringLevelZeroRevRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + Unbounded, + Unbounded, + ) + .map(Right), + } + } } impl<'t> Iterator for FacetStringIter<'t> { @@ -348,6 +553,21 @@ impl<'t> Iterator for FacetStringIter<'t> { fn next(&mut self) -> Option { 'outer: loop { let (documents_ids, last) = self.level_iters.last_mut()?; + let is_ascending = last.is_left(); + + // We remap the different iterator types to make + // the algorithm less complex to understand. + let last = match last { + Left(ascending) => match ascending { + Left(last) => Left(Left(last)), + Right(last) => Right(Left(last)), + }, + Right(descending) => match descending { + Left(last) => Left(Right(last)), + Right(last) => Right(Right(last)), + }, + }; + match last { Left(last) => { for result in last { @@ -359,24 +579,50 @@ impl<'t> Iterator for FacetStringIter<'t> { *documents_ids -= &docids; } - let result = match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), + let result = if is_ascending { + match string_bounds { + Some((left, right)) => { + FacetStringLevelZeroRevRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right) + } + None => FacetStringGroupRevRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + } + .map(Right) + } else { + match string_bounds { + Some((left, right)) => FacetStringLevelZeroRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right), + None => FacetStringGroupRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + } + .map(Left) }; match result { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 871f464ef..23e5c1834 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,6 +18,8 @@ pub(crate) use self::facet::ParserRule; pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use 
self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; +use crate::criterion::AscDesc; +use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{DocumentId, Index, Result}; @@ -37,6 +39,7 @@ pub struct Search<'a> { filter: Option, offset: usize, limit: usize, + sort_criteria: Option>, optional_words: bool, authorize_typos: bool, words_limit: usize, @@ -51,6 +54,7 @@ impl<'a> Search<'a> { filter: None, offset: 0, limit: 20, + sort_criteria: None, optional_words: true, authorize_typos: true, words_limit: 10, @@ -74,6 +78,11 @@ impl<'a> Search<'a> { self } + pub fn sort_criteria(&mut self, criteria: Vec) -> &mut Search<'a> { + self.sort_criteria = Some(criteria); + self + } + pub fn optional_words(&mut self, value: bool) -> &mut Search<'a> { self.optional_words = value; self @@ -134,8 +143,29 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; + // We check that we are allowed to use the sort criteria, we check + // that they are declared in the sortable fields. + let sortable_fields = self.index.sortable_fields(self.rtxn)?; + if let Some(sort_criteria) = &self.sort_criteria { + for asc_desc in sort_criteria { + let field = asc_desc.field(); + if !sortable_fields.contains(field) { + return Err(UserError::InvalidSortableAttribute { + field: field.to_string(), + valid_fields: sortable_fields, + } + .into()); + } + } + } + let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build(query_tree, primitive_query, filtered_candidates)?; + let criteria = criteria_builder.build( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + )?; match self.index.distinct_field(self.rtxn)? { None => self.perform_sort(NoopDistinct, matching_words, criteria), @@ -199,6 +229,7 @@ impl fmt::Debug for Search<'_> { filter, offset, limit, + sort_criteria, optional_words, authorize_typos, words_limit, @@ -210,6 +241,7 @@ impl fmt::Debug for Search<'_> { .field("filter", filter) .field("offset", offset) .field("limit", limit) + .field("sort_criteria", sort_criteria) .field("optional_words", optional_words) .field("authorize_typos", authorize_typos) .field("words_limit", words_limit) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 07bdfd6fa..c0b5e4549 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -75,6 +75,7 @@ pub struct Settings<'a, 't, 'u, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, filterable_fields: Setting>, + sortable_fields: Setting>, criteria: Setting>, stop_words: Setting>, distinct_field: Setting, @@ -102,6 +103,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { searchable_fields: Setting::NotSet, displayed_fields: Setting::NotSet, filterable_fields: Setting::NotSet, + sortable_fields: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, distinct_field: Setting::NotSet, @@ -135,6 +137,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.filterable_fields = Setting::Set(names); } + pub fn set_sortable_fields(&mut self, names: HashSet) { + self.sortable_fields = Setting::Set(names); + } + pub fn reset_criteria(&mut self) { self.criteria = Setting::Reset; } @@ -392,6 +398,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_sortable(&mut self) -> Result<()> { + match self.sortable_fields { + Setting::Set(ref fields) => { + let mut new_fields = HashSet::new(); + for name in fields { + new_fields.insert(name.clone()); 
+ } + self.index.put_sortable_fields(self.wtxn, &new_fields)?; + } + Setting::Reset => { + self.index.delete_sortable_fields(self.wtxn)?; + } + Setting::NotSet => (), + } + Ok(()) + } + fn update_criteria(&mut self) -> Result<()> { match self.criteria { Setting::Set(ref fields) => { @@ -446,6 +469,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_displayed()?; self.update_filterable()?; + self.update_sortable()?; self.update_distinct_field()?; self.update_criteria()?; self.update_primary_key()?; @@ -719,7 +743,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); // Don't display the generated `id` field. builder.set_displayed_fields(vec![S("name")]); - builder.set_criteria(vec![S("asc(age)")]); + builder.set_criteria(vec![S("age:asc")]); builder.execute(|_, _| ()).unwrap(); // Then index some documents. @@ -953,7 +977,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); builder.set_filterable_fields(hashset! { S("age"), S("toto") }); - builder.set_criteria(vec!["asc(toto)".to_string()]); + builder.set_criteria(vec!["toto:asc".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -990,7 +1014,7 @@ mod tests { let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); // It is only Asc(toto), there is a facet database but it is denied to filter with toto. - builder.set_criteria(vec!["asc(toto)".to_string()]); + builder.set_criteria(vec!["toto:asc".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 599d479ed..89d9f1109 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} -{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} 
-{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} -{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} -{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","":""} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} 
+{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","":""} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} +{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} 
+{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} +{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} +{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","":""} diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index ef5af3272..f044756eb 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -32,7 +32,7 @@ macro_rules! test_distinct { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let mut distinct_values = HashSet::new(); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true) + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) .into_iter() .filter_map(|d| { if distinct_values.contains(&d.$distinct) { diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 318197ea3..c810b47af 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -29,7 +29,7 @@ macro_rules! test_filter { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let filtered_ids = search::expected_filtered_ids($filter); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true) + let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) .into_iter() .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) .collect(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c5724a921..7d4043ff1 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,3 +1,4 @@ +use std::cmp::Reverse; use std::collections::HashSet; use big_s::S; @@ -5,7 +6,7 @@ use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::update::{IndexDocuments, Settings, UpdateFormat}; -use milli::{Criterion, DocumentId, Index}; +use milli::{AscDesc, Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -36,6 +37,10 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), }); + builder.set_sortable_fields(hashset! { + S("tag"), + S("asc_desc_rank"), + }); builder.set_synonyms(hashmap! 
{ S("hello") => vec![S("good morning")], S("world") => vec![S("earth")], @@ -67,6 +72,7 @@ pub fn expected_order( criteria: &[Criterion], authorize_typo: bool, optional_words: bool, + sort_by: &[AscDesc], ) -> Vec { let dataset = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); @@ -90,6 +96,14 @@ pub fn expected_order( new_groups .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); } + Criterion::Sort if sort_by == [AscDesc::Asc(S("tag"))] => { + group.sort_by_key(|d| d.sort_by_rank); + new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); + } + Criterion::Sort if sort_by == [AscDesc::Desc(S("tag"))] => { + group.sort_by_key(|d| Reverse(d.sort_by_rank)); + new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); + } Criterion::Typo => { group.sort_by_key(|d| d.typo_rank); new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); @@ -104,11 +118,13 @@ pub fn expected_order( .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); } Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { - group.sort_by_key(|d| std::cmp::Reverse(d.asc_desc_rank)); + group.sort_by_key(|d| Reverse(d.asc_desc_rank)); new_groups .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); } - Criterion::Asc(_) | Criterion::Desc(_) => new_groups.push(group.clone()), + Criterion::Asc(_) | Criterion::Desc(_) | Criterion::Sort => { + new_groups.push(group.clone()) + } } } groups = std::mem::take(&mut new_groups); @@ -185,6 +201,7 @@ pub struct TestDocument { pub attribute_rank: u32, pub exact_rank: u32, pub asc_desc_rank: u32, + pub sort_by_rank: u32, pub title: String, pub description: String, pub tag: String, diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f814508f5..1723c1d6f 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,6 +1,6 @@ use big_s::S; use milli::update::Settings; -use milli::{Criterion, Search, SearchResult}; +use milli::{AscDesc, Criterion, Search, SearchResult}; use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -11,7 +11,7 @@ const ALLOW_OPTIONAL_WORDS: bool = true; const DISALLOW_OPTIONAL_WORDS: bool = false; macro_rules! test_criterion { - ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr) => { + ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => { #[test] fn $func() { let criteria = $criteria; @@ -23,82 +23,168 @@ macro_rules! test_criterion { search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos($authorize_typos); search.optional_words($optional_word); + search.sort_criteria($sort_criteria); let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); - let expected_external_ids: Vec<_> = - search::expected_order(&criteria, $authorize_typos, $optional_word) - .into_iter() - .map(|d| d.id) - .collect(); + let expected_external_ids: Vec<_> = search::expected_order( + &criteria, + $authorize_typos, + $optional_word, + &$sort_criteria[..], + ) + .into_iter() + .map(|d| d.id) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); } }; } -test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![]); -test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![]); -test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words]); -test_criterion!(attribute_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Attribute]); -test_criterion!(attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Attribute]); -test_criterion!(exactness_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Exactness]); -test_criterion!(exactness_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Exactness]); -test_criterion!(proximity_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Proximity]); -test_criterion!(proximity_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Proximity]); +test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); +test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![], vec![]); +test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words], vec![]); +test_criterion!( + attribute_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Attribute], + vec![] +); +test_criterion!( + attribute_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Attribute], + vec![] +); +test_criterion!( + exactness_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Exactness], + vec![] +); +test_criterion!( + exactness_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Exactness], + vec![] +); +test_criterion!( + proximity_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Proximity], + vec![] +); +test_criterion!( + proximity_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Proximity], + vec![] +); test_criterion!( asc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Asc(S("asc_desc_rank"))] + vec![Asc(S("asc_desc_rank"))], + vec![] ); test_criterion!( asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Asc(S("asc_desc_rank"))] + vec![Asc(S("asc_desc_rank"))], + vec![] ); test_criterion!( desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Desc(S("asc_desc_rank"))] + vec![Desc(S("asc_desc_rank"))], + vec![] ); test_criterion!( desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Desc(S("asc_desc_rank"))] + vec![Desc(S("asc_desc_rank"))], + vec![] ); test_criterion!( asc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Asc(S("unexisting_field"))] + vec![Asc(S("unexisting_field"))], + vec![] ); test_criterion!( asc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, - vec![Asc(S("unexisting_field"))] + vec![Asc(S("unexisting_field"))], + vec![] ); test_criterion!( desc_unexisting_field_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Desc(S("unexisting_field"))] + vec![Desc(S("unexisting_field"))], + vec![] ); test_criterion!( desc_unexisting_field_disallow_typo, DISALLOW_OPTIONAL_WORDS, 
DISALLOW_TYPOS, - vec![Desc(S("unexisting_field"))] + vec![Desc(S("unexisting_field"))], + vec![] +); +test_criterion!(empty_sort_by_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], vec![]); +test_criterion!( + empty_sort_by_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![] +); +test_criterion!( + sort_by_asc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Asc(S("tag"))] +); +test_criterion!( + sort_by_asc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Asc(S("tag"))] +); +test_criterion!( + sort_by_desc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Desc(S("tag"))] +); +test_criterion!( + sort_by_desc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Desc(S("tag"))] ); test_criterion!( default_criteria_order, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, - vec![Words, Typo, Proximity, Attribute, Exactness] + vec![Words, Typo, Proximity, Attribute, Exactness], + vec![] ); #[test] @@ -262,7 +348,7 @@ fn criteria_mixup() { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let expected_external_ids: Vec<_> = - search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS) + search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, &[]) .into_iter() .map(|d| d.id) .collect(); From 2f20257070825ee2fc1dc81b4770bcc7091a7e4b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 24 Aug 2021 18:10:11 +0200 Subject: [PATCH 0920/1889] Update milli to the v0.11.0 --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index f9bc3b1c5..d4f11b458 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.10.2" +version = "0.11.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 6c7fe7e28..2e297ce57 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.10.2" +version = "0.11.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 304368fcb..3d82b8605 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.10.2" +version = "0.11.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 0c6fc6763..d28e67f81 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.10.2" +version = "0.11.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index f4de00122..fd161b480 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.10.2" +version = "0.11.0" authors = ["Clément Renault "] edition = "2018" From f2e15918267b6af311650c684bfc1a052de3d555 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 24 Aug 2021 18:10:53 +0200 Subject: [PATCH 0921/1889] Remove the unused tinytemplate dependency --- milli/Cargo.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d28e67f81..3baa2213d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -45,10 +45,6 
@@ itertools = "0.10.0" log = "0.4.14" logging_timer = "1.0.0" -# We temporarily depend on this crate just to fix this issue -# https://github.com/bheisler/TinyTemplate/pull/17 -tinytemplate = "=1.1.0" - [dev-dependencies] big_s = "1.0.2" maplit = "1.0.2" From af65485ba74484cac2cbf7d7ccc1429269444b5a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 24 Aug 2021 18:15:31 +0200 Subject: [PATCH 0922/1889] Reexport the grenad CompressionType from milli --- http-ui/Cargo.toml | 1 - http-ui/src/main.rs | 3 +-- milli/src/lib.rs | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 2e297ce57..a01388a7e 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -9,7 +9,6 @@ edition = "2018" anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" -grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } memmap = "0.7.0" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b34418465..91d4c8513 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -16,12 +16,11 @@ use byte_unit::Byte; use either::Either; use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; -use grenad::CompressionType; use heed::EnvOpenOptions; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use milli::update::UpdateIndexingStep::*; use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; -use milli::{obkv_to_json, FilterCondition, Index, MatchingWords, SearchResult}; +use milli::{obkv_to_json, CompressionType, FilterCondition, Index, MatchingWords, SearchResult}; use once_cell::sync::OnceCell; use rayon::ThreadPool; use serde::{Deserialize, Serialize}; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 2b0bd2ed4..5a5f2ac5c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -20,6 +20,7 @@ use std::hash::BuildHasherDefault; use std::result::Result as StdResult; use fxhash::{FxHasher32, FxHasher64}; +pub use grenad::CompressionType; use serde_json::{Map, Value}; pub use self::criterion::{default_criteria, AscDesc, Criterion}; From 01461af3338ecaf2b14e2a92ec8ef3d6ee0c8fc2 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 24 Aug 2021 22:18:13 +0300 Subject: [PATCH 0923/1889] chore(ci): remove Rust beta from tests job --- .github/workflows/rust.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 4fdad2db8..9aeb7e041 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -19,7 +19,6 @@ jobs: os: [ubuntu-18.04, macos-latest, windows-latest] rust: - stable - - beta - nightly steps: - uses: actions/checkout@v2 From f230ae6fd52001e9d61ed8f51a1e8065854efabf Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 25 Aug 2021 17:44:16 +0200 Subject: [PATCH 0924/1889] Introduce the reset_sortable_fields Settings method --- milli/src/update/settings.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c0b5e4549..1d0e15cff 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -141,6 +141,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.sortable_fields = Setting::Set(names); } + pub fn reset_sortable_fields(&mut self) { + self.sortable_fields = Setting::Reset; + } + 
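The `reset_sortable_fields` method above completes the setter/reset pair that `set_sortable_fields` opened earlier in this series. A minimal usage sketch, assuming an open index and write transaction as in the crate's tests; `price` and `release_date` are purely illustrative field names:

    use big_s::S;
    use maplit::hashset;

    // Declare which fields the `Sort` criterion may use...
    let mut builder = Settings::new(&mut wtxn, &index, 0);
    builder.set_sortable_fields(hashset! { S("price"), S("release_date") });
    builder.execute(|_, _| ()).unwrap();

    // ...and later wipe the whole set again with the new method.
    let mut builder = Settings::new(&mut wtxn, &index, 0);
    builder.reset_sortable_fields();
    builder.execute(|_, _| ()).unwrap();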
pub fn reset_criteria(&mut self) { self.criteria = Setting::Reset; } From 49a6d2d5f13776dce5d7ed3398f5d4511dde1078 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 12:23:02 +0200 Subject: [PATCH 0925/1889] run all benchmarks once every friday --- .github/workflows/benchmarks.yml | 8 +-- .../workflows/cron_benchmarks_indexing.yml | 70 +++++++++++++++++++ .../cron_benchmarks_search_songs.yml | 70 +++++++++++++++++++ .../workflows/cron_benchmarks_search_wiki.yml | 70 +++++++++++++++++++ 4 files changed, 214 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/cron_benchmarks_indexing.yml create mode 100644 .github/workflows/cron_benchmarks_search_songs.yml create mode 100644 .github/workflows/cron_benchmarks_search_wiki.yml diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index c1475e281..c64c6a64b 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -9,7 +9,7 @@ on: default: 'search_songs' env: - HOME: "/home/runner" # The actions-rs/toolchain@v1 can fail we have no $HOME defined + BENCH_NAME: ${{ github.event.inputs.dataset_name }} jobs: benchmarks: @@ -38,14 +38,14 @@ jobs: id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${{ github.event.inputs.dataset_name }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks - - name: Run benchmarks - Dataset ${{ github.event.inputs.dataset_name }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | cd benchmarks - cargo bench --bench ${{ github.event.inputs.dataset_name }} -- --save-baseline ${{ steps.file.outputs.basename }} + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files - name: Install critcmp diff --git a/.github/workflows/cron_benchmarks_indexing.yml b/.github/workflows/cron_benchmarks_indexing.yml new file mode 100644 index 000000000..8b7446e8d --- /dev/null +++ b/.github/workflows/cron_benchmarks_indexing.yml @@ -0,0 +1,70 @@ +name: Benchmarks indexing + +on: + schedule: + - cron: "30 0 * * FRI" # every friday at 00:30 + +env: + BENCH_NAME: "indexing" + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ 
steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + run: cargo install critcmp + - name: Export critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' + echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json" diff --git a/.github/workflows/cron_benchmarks_search_songs.yml b/.github/workflows/cron_benchmarks_search_songs.yml new file mode 100644 index 000000000..67c4c0d6d --- /dev/null +++ b/.github/workflows/cron_benchmarks_search_songs.yml @@ -0,0 +1,70 @@ +name: Benchmarks search songs + +on: + schedule: + - cron: "30 08 * * FRI" # every friday at 08:30 + +env: + BENCH_NAME: "search_songs" + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + run: cargo install critcmp + - name: Export critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} +
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' + echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json" diff --git a/.github/workflows/cron_benchmarks_search_wiki.yml b/.github/workflows/cron_benchmarks_search_wiki.yml new file mode 100644 index 000000000..3cad5182d --- /dev/null +++ b/.github/workflows/cron_benchmarks_search_wiki.yml @@ -0,0 +1,70 @@ +name: Benchmarks search wikipedia articles + +on: + schedule: + - cron: "30 16 * * FRI" # every friday at 16:30 (it’s snacky snack-time!) + +env: + BENCH_NAME: "search_wiki" + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + run: cargo install critcmp + - name: Export critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?'
+ echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json" From 5e639bc0c120045b40266626eb3665f3c22a6269 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Aug 2021 13:49:38 +0200 Subject: [PATCH 0926/1889] postfix all action names with (cron) --- .github/workflows/cron_benchmarks_indexing.yml | 2 +- .github/workflows/cron_benchmarks_search_songs.yml | 2 +- .github/workflows/cron_benchmarks_search_wiki.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cron_benchmarks_indexing.yml b/.github/workflows/cron_benchmarks_indexing.yml index 8b7446e8d..e806edc84 100644 --- a/.github/workflows/cron_benchmarks_indexing.yml +++ b/.github/workflows/cron_benchmarks_indexing.yml @@ -1,4 +1,4 @@ -name: Benchmarks indexing +name: Benchmarks indexing (cron) on: schedule: diff --git a/.github/workflows/cron_benchmarks_search_songs.yml b/.github/workflows/cron_benchmarks_search_songs.yml index 67c4c0d6d..018c20817 100644 --- a/.github/workflows/cron_benchmarks_search_songs.yml +++ b/.github/workflows/cron_benchmarks_search_songs.yml @@ -1,4 +1,4 @@ -name: Benchmarks search songs +name: Benchmarks search songs (cron) on: schedule: diff --git a/.github/workflows/cron_benchmarks_search_wiki.yml b/.github/workflows/cron_benchmarks_search_wiki.yml index 3cad5182d..78f940e38 100644 --- a/.github/workflows/cron_benchmarks_search_wiki.yml +++ b/.github/workflows/cron_benchmarks_search_wiki.yml @@ -1,4 +1,4 @@ -name: Benchmarks search wikipedia articles +name: Benchmarks search wikipedia articles (cron) on: schedule: From d106eb5b9043cd9075ead6e1d85ad1014ad72afa Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Aug 2021 16:12:05 +0200 Subject: [PATCH 0927/1889] add the sortable attributes to http-ui and fix the tests --- http-ui/src/main.rs | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 91d4c8513..83995c3e5 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -257,6 +257,9 @@ struct Settings { #[serde(default, skip_serializing_if = "Setting::is_not_set")] filterable_attributes: Setting>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + sortable_attributes: Setting>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] criteria: Setting>, @@ -448,6 +451,15 @@ async fn main() -> anyhow::Result<()> { Setting::NotSet => (), } + // We transpose the settings JSON struct into a real setting update. + match settings.sortable_attributes { + Setting::Set(sortable_attributes) => { + builder.set_sortable_fields(sortable_attributes) + } + Setting::Reset => builder.reset_sortable_fields(), + Setting::NotSet => (), + } + // We transpose the settings JSON struct into a real setting update. match settings.criteria { Setting::Set(criteria) => builder.set_criteria(criteria), @@ -1029,6 +1041,7 @@ mod tests { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), filterable_attributes: Setting::Set(hashset! { "age".to_string() }), + sortable_attributes: Setting::Set(hashset! { "age".to_string() }), criteria: Setting::Set(vec!["age:asc".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), synonyms: Setting::Set(hashmap!
{ "alex".to_string() => vec!["alexey".to_string()] }), @@ -1037,7 +1050,7 @@ mod tests { assert_tokens( &settings, &[ - Token::Struct { name: "Settings", len: 6 }, + Token::Struct { name: "Settings", len: 7 }, Token::Str("displayedAttributes"), Token::Some, Token::Seq { len: Some(1) }, @@ -1048,12 +1061,16 @@ mod tests { Token::Seq { len: Some(1) }, Token::Str("age"), Token::SeqEnd, - Token::Str("facetedAttributes"), + Token::Str("filterableAttributes"), Token::Some, - Token::Map { len: Some(1) }, + Token::Seq { len: Some(1) }, Token::Str("age"), - Token::Str("integer"), - Token::MapEnd, + Token::SeqEnd, + Token::Str("sortableAttributes"), + Token::Some, + Token::Seq { len: Some(1) }, + Token::Str("age"), + Token::SeqEnd, Token::Str("criteria"), Token::Some, Token::Seq { len: Some(1) }, @@ -1083,6 +1100,7 @@ mod tests { displayed_attributes: Setting::Reset, searchable_attributes: Setting::Reset, filterable_attributes: Setting::Reset, + sortable_attributes: Setting::Reset, criteria: Setting::Reset, stop_words: Setting::Reset, synonyms: Setting::Reset, @@ -1091,12 +1109,14 @@ mod tests { assert_tokens( &settings, &[ - Token::Struct { name: "Settings", len: 6 }, + Token::Struct { name: "Settings", len: 7 }, Token::Str("displayedAttributes"), Token::None, Token::Str("searchableAttributes"), Token::None, - Token::Str("facetedAttributes"), + Token::Str("filterableAttributes"), + Token::None, + Token::Str("sortableAttributes"), Token::None, Token::Str("criteria"), Token::None, @@ -1115,6 +1135,7 @@ mod tests { displayed_attributes: Setting::NotSet, searchable_attributes: Setting::NotSet, filterable_attributes: Setting::NotSet, + sortable_attributes: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, synonyms: Setting::NotSet, From 0b02eb456ceb7d3563e582c2cafc1fe50b0a988d Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 31 Aug 2021 20:28:16 +0300 Subject: [PATCH 0928/1889] chore(update): wrap long values into BStr for warn logs --- milli/src/update/index_documents/store.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 444b11e31..a2aa26e19 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -477,7 +477,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { if lmdb_key_valid_size(&key) { writer.insert(&key, &buffer)?; } else { - warn!("word {:?} is too large to be saved", word); + warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); } } @@ -515,7 +515,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { if lmdb_key_valid_size(&key_buffer) { writer.insert(&key_buffer, &data_buffer)?; } else { - warn!("word {:?} is too large to be saved", word); + warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); } } } @@ -542,7 +542,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { if lmdb_key_valid_size(&key_buffer) { sorter.insert(&key_buffer, &data)?; } else { - warn!("facet value {:?} is too large to be saved", original_value); + warn!( + "facet value {:?} is too large to be saved", + original_value.as_bytes().as_bstr() + ); } } @@ -614,7 +617,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { if lmdb_key_valid_size(&buffer) { sorter.insert(&buffer, original_value.as_bytes())?; } else { - warn!("facet value {:?} is too large to be saved", original_value); + warn!("facet value {:?} is too large to be saved", original_value.as_bytes().as_bstr()); } Ok(()) @@ -640,7 +643,7 @@ impl<'s, A: 
AsRef<[u8]>> Store<'s, A> { if lmdb_key_valid_size(&key) { sorter.insert(&key, &buffer)?; } else { - warn!("word {:?} is too large to be saved", word); + warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); } } From 0e379558a13462376b7f71c07d7d0deac43c9180 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 31 Aug 2021 21:29:58 +0300 Subject: [PATCH 0929/1889] fix(search): get sortable_fields only if criteria present --- milli/src/search/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 23e5c1834..56002b2e3 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -145,8 +145,8 @@ impl<'a> Search<'a> { // We check that we are allowed to use the sort criteria, we check // that they are declared in the sortable fields. - let sortable_fields = self.index.sortable_fields(self.rtxn)?; if let Some(sort_criteria) = &self.sort_criteria { + let sortable_fields = self.index.sortable_fields(self.rtxn)?; for asc_desc in sort_criteria { let field = asc_desc.field(); if !sortable_fields.contains(field) { From 3aaf1d62f307977dbf2264f9919bb86ef14b0b92 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 16 Aug 2021 13:35:03 +0200 Subject: [PATCH 0930/1889] Publish grenad CompressionType type in milli --- milli/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 5a5f2ac5c..a07303fd2 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -54,7 +54,7 @@ pub type FieldId = u16; pub type Position = u32; pub type FieldDistribution = BTreeMap; -type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; +type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; /// Transform a raw obkv store into a JSON Object. 
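The `fix(search)` patch above makes the `sortable_fields` lookup lazy: the index is only queried when a sort was actually requested. A sketch of the failure path the guard still covers, assuming an index where `release_date` was never declared sortable (the query text and field name are illustrative):

    let mut search = Search::new(&rtxn, &index);
    search.query("hello");
    search.sort_criteria(vec![AscDesc::Asc(S("release_date"))]);

    // The undeclared field must surface `UserError::InvalidSortableAttribute`
    // instead of producing results.
    assert!(search.execute().is_err());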
pub fn obkv_to_json( From 1d314328f0b60344c3cdc2f50efaa0c787d9c48d Mon Sep 17 00:00:00 2001 From: many Date: Mon, 16 Aug 2021 13:36:30 +0200 Subject: [PATCH 0931/1889] Plug new indexer --- http-ui/src/main.rs | 3 - milli/Cargo.toml | 4 +- .../facet_string_level_zero_value_codec.rs | 51 +- milli/src/heed_codec/facet/mod.rs | 4 +- milli/src/index.rs | 3 +- milli/src/lib.rs | 4 - milli/src/proximity.rs | 4 +- milli/src/search/criteria/exactness.rs | 4 + milli/src/search/facet/facet_string.rs | 22 +- milli/src/search/facet/filter_condition.rs | 5 +- milli/src/update/delete_documents.rs | 7 +- milli/src/update/facets.rs | 27 +- .../extract/extract_docid_word_positions.rs | 130 +++ .../extract/extract_facet_number_docids.rs | 41 + .../extract/extract_facet_string_docids.rs | 57 + .../extract/extract_fid_docid_facet_values.rs | 118 +++ .../extract/extract_fid_word_count_docids.rs | 91 ++ .../extract/extract_word_docids.rs | 42 + .../extract_word_level_position_docids.rs | 46 + .../extract_word_pair_proximity_docids.rs | 196 ++++ .../src/update/index_documents/extract/mod.rs | 199 ++++ .../index_documents/helpers/clonable_mmap.rs | 22 + .../index_documents/helpers/grenad_helpers.rs | 276 +++++ .../helpers/merge_functions.rs | 171 +++ .../src/update/index_documents/helpers/mod.rs | 49 + .../update/index_documents/merge_function.rs | 106 -- milli/src/update/index_documents/mod.rs | 696 ++----------- milli/src/update/index_documents/store.rs | 985 ------------------ milli/src/update/index_documents/transform.rs | 41 +- .../src/update/index_documents/typed_chunk.rs | 272 +++++ milli/src/update/settings.rs | 7 - milli/src/update/update_builder.rs | 17 - milli/src/update/word_prefix_docids.rs | 9 +- .../word_prefix_pair_proximity_docids.rs | 9 +- milli/src/update/words_level_positions.rs | 22 +- milli/tests/search/mod.rs | 6 +- 36 files changed, 1920 insertions(+), 1826 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_docid_word_positions.rs create mode 100644 milli/src/update/index_documents/extract/extract_facet_number_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_facet_string_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs create mode 100644 milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_word_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_word_level_position_docids.rs create mode 100644 milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs create mode 100644 milli/src/update/index_documents/extract/mod.rs create mode 100644 milli/src/update/index_documents/helpers/clonable_mmap.rs create mode 100644 milli/src/update/index_documents/helpers/grenad_helpers.rs create mode 100644 milli/src/update/index_documents/helpers/merge_functions.rs create mode 100644 milli/src/update/index_documents/helpers/mod.rs delete mode 100644 milli/src/update/index_documents/merge_function.rs create mode 100644 milli/src/update/index_documents/typed_chunk.rs diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 83995c3e5..fd7dd37de 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -343,10 +343,7 @@ async fn main() -> anyhow::Result<()> { update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap()); update_builder.log_every_n(indexer_opt_cloned.log_every_n); update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); - 
update_builder.linked_hash_map_size(indexer_opt_cloned.linked_hash_map_size); update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type); - update_builder - .chunk_fusing_shrink_size(indexer_opt_cloned.chunk_fusing_shrink_size.get_bytes()); let before_update = Instant::now(); // we extract the update type and execute the update itself. diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3baa2213d..edcec4d5b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -6,15 +6,17 @@ edition = "2018" [dependencies] bstr = "0.2.15" +byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } concat-arrays = "0.1.2" +crossbeam-channel = "0.5.1" csv = "1.1.5" either = "1.6.1" flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" -grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } +grenad = "0.3.0" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs index b2434d453..914d7c3cd 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -2,51 +2,65 @@ use std::borrow::Cow; use std::convert::TryInto; use std::{marker, str}; -use super::try_split_at; +use crate::error::SerializationError; +use crate::heed_codec::RoaringBitmapCodec; +use crate::{try_split_array_at, try_split_at, Result}; +pub type FacetStringLevelZeroValueCodec = StringValueCodec; -/// A codec that encodes a string in front of the value. +/// A codec that encodes a string in front of a value. /// /// The usecase is for the facet string levels algorithm where we must know the /// original string of a normalized facet value, the original values are stored /// in the value to not break the lexicographical ordering of the LMDB keys. 
-pub struct FacetStringLevelZeroValueCodec(marker::PhantomData); +pub struct StringValueCodec(marker::PhantomData); -impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec +impl<'a, C> heed::BytesDecode<'a> for StringValueCodec where C: heed::BytesDecode<'a>, { type DItem = (&'a str, C::DItem); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (string_len, bytes) = try_split_at(bytes, 2)?; - let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?; - - let (string, bytes) = try_split_at(bytes, string_len as usize)?; - let string = str::from_utf8(string).ok()?; + let (string, bytes) = decode_prefix_string(bytes)?; C::bytes_decode(bytes).map(|item| (string, item)) } } -impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec +impl<'a, C> heed::BytesEncode<'a> for StringValueCodec where C: heed::BytesEncode<'a>, { type EItem = (&'a str, C::EItem); fn bytes_encode((string, value): &'a Self::EItem) -> Option> { - let string_len: u16 = string.len().try_into().ok()?; let value_bytes = C::bytes_encode(&value)?; let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); - bytes.extend_from_slice(&string_len.to_be_bytes()); - bytes.extend_from_slice(string.as_bytes()); + encode_prefix_string(string, &mut bytes).ok()?; bytes.extend_from_slice(&value_bytes[..]); Some(Cow::Owned(bytes)) } } +pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { + let (original_length_bytes, bytes) = try_split_array_at(value)?; + let original_length = u16::from_be_bytes(original_length_bytes) as usize; + let (string, bytes) = try_split_at(bytes, original_length)?; + let string = str::from_utf8(string).ok()?; + + Some((string, bytes)) +} + +pub fn encode_prefix_string(string: &str, buffer: &mut Vec) -> Result<()> { + let string_len: u16 = + string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?; + buffer.extend_from_slice(&string_len.to_be_bytes()); + buffer.extend_from_slice(string.as_bytes()); + Ok(()) +} + #[cfg(test)] mod tests { use heed::types::Unit; @@ -54,17 +68,15 @@ mod tests { use roaring::RoaringBitmap; use super::*; - use crate::CboRoaringBitmapCodec; #[test] fn deserialize_roaring_bitmaps() { let string = "abc"; let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); let key = (string, docids.clone()); - let bytes = - FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); let (out_string, out_docids) = - FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + StringValueCodec::::bytes_decode(&bytes).unwrap(); assert_eq!((out_string, out_docids), (string, docids)); } @@ -72,9 +84,8 @@ mod tests { fn deserialize_unit() { let string = "def"; let key = (string, ()); - let bytes = FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_unit) = - FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_unit) = StringValueCodec::::bytes_decode(&bytes).unwrap(); assert_eq!((out_string, out_unit), (string, ())); } } diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index a6a805bf7..e93fb57b9 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -9,7 +9,9 @@ mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; pub use 
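Because `encode_prefix_string` and `decode_prefix_string` are now free functions (re-exported from the `facet` module just below), the new indexer can write the same framing without instantiating the codec: a big-endian `u16` length, then the raw UTF-8 bytes, then whatever value payload follows. A round-trip sketch, assumed to run inside a function returning `Result`, with an arbitrary payload:

    let mut buffer = Vec::new();
    encode_prefix_string("new york", &mut buffer)?; // [0, 8] followed by b"new york"
    buffer.extend_from_slice(&[1, 2, 3]); // illustrative value payload

    let (string, payload) = decode_prefix_string(&buffer).unwrap();
    assert_eq!(string, "new york");
    assert_eq!(payload, [1, 2, 3]);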
self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; -pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec; +pub use self::facet_string_level_zero_value_codec::{ + decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, +}; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/index.rs b/milli/src/index.rs index e2ab51a1c..f3a2a3e05 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -93,8 +93,7 @@ pub struct Index { /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, /// Maps the facet field id and the string with the original string and docids that corresponds to it. - pub facet_id_string_docids: - Database>, + pub facet_id_string_docids: Database, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index a07303fd2..af811fe08 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -13,11 +13,9 @@ mod search; pub mod tree_level; pub mod update; -use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; -use std::result::Result as StdResult; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; @@ -54,8 +52,6 @@ pub type FieldId = u16; pub type Position = u32; pub type FieldDistribution = BTreeMap; -type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult, E>; - /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs index db98426a5..083e5a977 100644 --- a/milli/src/proximity.rs +++ b/milli/src/proximity.rs @@ -2,8 +2,8 @@ use std::cmp; use crate::{Attribute, Position}; -const ONE_ATTRIBUTE: u32 = 1000; -const MAX_DISTANCE: u32 = 8; +pub const ONE_ATTRIBUTE: u32 = 1000; +pub const MAX_DISTANCE: u32 = 8; pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { if lhs <= rhs { diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 1e4d4e7a2..22dcb9782 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -180,6 +180,10 @@ fn resolve_state( if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { + println!( + "found candidates that have the good count: {:?}", + attribute_allowed_docids + ); let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; attribute_candidates_array.push(attribute_allowed_docids); diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 927602c98..747b7fd3c 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -269,11 +269,7 @@ impl<'t> Iterator for FacetStringGroupRevRange<'t> { /// /// It yields the facet string and the roaring bitmap associated with it. 
pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange< - 't, - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, + iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -316,10 +312,7 @@ impl<'t> FacetStringLevelZeroRange<'t> { let iter = db .remap_key_type::() .range(rtxn, &(left_bound, right_bound))? - .remap_types::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec - >(); + .remap_types::(); Ok(FacetStringLevelZeroRange { iter }) } @@ -340,11 +333,7 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> { } pub struct FacetStringLevelZeroRevRange<'t> { - iter: RoRevRange< - 't, - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, + iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, } impl<'t> FacetStringLevelZeroRevRange<'t> { @@ -387,10 +376,7 @@ impl<'t> FacetStringLevelZeroRevRange<'t> { let iter = db .remap_key_type::() .rev_range(rtxn, &(left_bound, right_bound))? - .remap_types::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec - >(); + .remap_types::(); Ok(FacetStringLevelZeroRevRange { iter }) } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 5ca9f7e5a..a92797e90 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -392,10 +392,7 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, numbers_db: heed::Database, - strings_db: heed::Database< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, + strings_db: heed::Database, field_id: FieldId, operator: &Operator, ) -> Result { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e18c6bbd1..874eed6ee 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -490,7 +490,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( None => { // The key corresponds to a level zero facet string. let (original_value, mut docids) = - FacetStringLevelZeroValueCodec::::bytes_decode(val) + FacetStringLevelZeroValueCodec::bytes_decode(val) .ok_or_else(|| SerializationError::Decoding { db_name })?; let previous_len = docids.len(); @@ -501,9 +501,8 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( } else if docids.len() != previous_len { let key = key.to_owned(); let val = &(original_value, docids); - let value_bytes = - FacetStringLevelZeroValueCodec::::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) + .ok_or_else(|| SerializationError::Encoding { db_name })?; // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(&key, &value_bytes)? 
}; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index cb9a90f7e..3ae63f282 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -3,7 +3,7 @@ use std::num::{NonZeroU8, NonZeroUsize}; use std::{cmp, mem}; use chrono::Utc; -use grenad::{CompressionType, FileFuse, Reader, Writer}; +use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; use log::debug; @@ -25,7 +25,6 @@ pub struct Facets<'t, 'u, 'i> { index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, _update_id: u64, @@ -42,7 +41,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, level_group_size: NonZeroUsize::new(4).unwrap(), min_level_size: NonZeroUsize::new(5).unwrap(), _update_id: update_id, @@ -86,7 +84,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.index.facet_id_string_docids, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.level_group_size, self.min_level_size, field_id, @@ -107,7 +104,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.index.facet_id_f64_docids, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.level_group_size, self.min_level_size, field_id, @@ -128,7 +124,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), facet_number_levels, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }), + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" })?, WriteMethod::GetMergePut, )?; @@ -136,7 +132,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_string_levels, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }), + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?, WriteMethod::GetMergePut, )?; } @@ -161,11 +157,10 @@ fn compute_facet_number_levels<'t>( db: heed::Database, compression_type: CompressionType, compression_level: Option, - shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, field_id: FieldId, -) -> Result> { +) -> Result> { let first_level_size = db .remap_key_type::() .prefix_iter(rtxn, &field_id.to_be_bytes())? @@ -219,7 +214,7 @@ fn compute_facet_number_levels<'t>( } } - writer_into_reader(writer, shrink_size) + writer_into_reader(writer) } fn write_number_entry( @@ -239,7 +234,7 @@ fn write_number_entry( fn compute_faceted_strings_documents_ids( rtxn: &heed::RoTxn, - db: heed::Database>, + db: heed::Database, field_id: FieldId, ) -> Result { let mut documents_ids = RoaringBitmap::new(); @@ -278,17 +273,13 @@ fn clear_field_string_levels<'t>( fn compute_facet_string_levels<'t>( rtxn: &'t heed::RoTxn, - db: heed::Database< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - >, + db: heed::Database, compression_type: CompressionType, compression_level: Option, - shrink_size: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, field_id: FieldId, -) -> Result> { +) -> Result> { let first_level_size = db .remap_key_type::() .prefix_iter(rtxn, &field_id.to_be_bytes())? 
@@ -340,7 +331,7 @@ fn compute_facet_string_levels<'t>(
         }
     }
 
-    writer_into_reader(writer, shrink_size)
+    writer_into_reader(writer)
 }
 
 fn write_string_entry(
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
new file mode 100644
index 000000000..9a9d7cb85
--- /dev/null
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -0,0 +1,130 @@
+use std::collections::HashSet;
+use std::convert::TryInto;
+use std::fs::File;
+use std::{io, mem, str};
+
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token};
+use roaring::RoaringBitmap;
+use serde_json::Value;
+
+use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
+use crate::error::{InternalError, SerializationError};
+use crate::proximity::ONE_ATTRIBUTE;
+use crate::{FieldId, Result};
+
+/// Extracts each word with the positions where it appears,
+/// prefixed by the document id.
+///
+/// Returns the generated internal documents ids and a grenad reader
+/// with the list of extracted words from the given chunk of documents.
+pub fn extract_docid_word_positions(
+    mut obkv_documents: grenad::Reader,
+    indexer: GrenadParameters,
+    searchable_fields: &Option>,
+) -> Result<(RoaringBitmap, grenad::Reader)> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut documents_ids = RoaringBitmap::new();
+    let mut docid_word_positions_sorter = create_sorter(
+        concat_u32s_array,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory,
+    );
+
+    let mut key_buffer = Vec::new();
+    let mut field_buffer = String::new();
+    let analyzer = Analyzer::>::new(AnalyzerConfig::default());
+
+    while let Some((key, value)) = obkv_documents.next()? {
+        let document_id = key
+            .try_into()
+            .map(u32::from_be_bytes)
+            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
+        let obkv = obkv::KvReader::::new(value);
+
+        documents_ids.push(document_id);
+        key_buffer.clear();
+        key_buffer.extend_from_slice(&document_id.to_be_bytes());
+
+        for (field_id, field_bytes) in obkv.iter() {
+            if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
+                let value =
+                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+                field_buffer.clear();
+                if let Some(field) = json_to_string(&value, &mut field_buffer) {
+                    let analyzed = analyzer.analyze(field);
+                    let tokens = analyzed
+                        .tokens()
+                        .filter(Token::is_word)
+                        .enumerate()
+                        .take_while(|(i, _)| (*i as u32) < ONE_ATTRIBUTE);
+
+                    for (index, token) in tokens {
+                        let token = token.text().trim();
+                        key_buffer.truncate(mem::size_of::());
+                        key_buffer.extend_from_slice(token.as_bytes());
+
+                        let position: u32 = index
+                            .try_into()
+                            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
+                        let position = field_id as u32 * ONE_ATTRIBUTE + position;
+                        docid_word_positions_sorter.insert(&key_buffer, &position.to_ne_bytes())?;
+                    }
+                }
+            }
+        }
+    }
+
+    sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader))
+}
+
+/// Transform a JSON value into a string that can be indexed.
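Note: the `json_to_string` helper defined just below flattens nested JSON into a single indexable string: array items are joined with ". ", object entries become "key: value. " pairs, and `null` or empty containers write nothing. A hedged worked example (input invented for illustration; key order assumes serde_json's default sorted map):

    // input:  {"label": "Blue", "sizes": [38, 40], "stock": null}
    // output: "label: Blue. sizes: 38. 40. . "
    //
    // "stock" disappears because null writes nothing, and the trailing
    // ". . " comes from the flattened array being itself a value of "sizes".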
+fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> { + fn inner(value: &Value, output: &mut String) -> bool { + use std::fmt::Write; + match value { + Value::Null => false, + Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), + Value::Number(number) => write!(output, "{}", number).is_ok(), + Value::String(string) => write!(output, "{}", string).is_ok(), + Value::Array(array) => { + let mut count = 0; + for value in array { + if inner(value, output) { + output.push_str(". "); + count += 1; + } + } + // check that at least one value was written + count != 0 + } + Value::Object(object) => { + let mut buffer = String::new(); + let mut count = 0; + for (key, value) in object { + buffer.clear(); + let _ = write!(&mut buffer, "{}: ", key); + if inner(value, &mut buffer) { + buffer.push_str(". "); + // We write the "key: value. " pair only when + // we are sure that the value can be written. + output.push_str(&buffer); + count += 1; + } + } + // check that at least one value was written + count != 0 + } + } + } + + if let Value::String(string) = value { + Some(&string) + } else if inner(value, buffer) { + Some(buffer) + } else { + None + } +} diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs new file mode 100644 index 000000000..1734ef028 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -0,0 +1,41 @@ +use std::fs::File; +use std::io; + +use heed::{BytesDecode, BytesEncode}; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, +}; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec}; +use crate::Result; + +/// Extracts the facet number and the documents ids where this facet number appear. +/// +/// Returns a grenad reader with the list of extracted facet numbers and +/// documents ids from the given chunk of docid facet number positions. +pub fn extract_facet_number_docids( + mut docid_fid_facet_number: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_number_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + while let Some((key_bytes, _)) = docid_fid_facet_number.next()? 
{ + let (field_id, document_id, number) = + FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + + let key = (field_id, 0, number, number); + let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + + facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + } + + sorter_into_reader(facet_number_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs new file mode 100644 index 000000000..66ede5f42 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -0,0 +1,57 @@ +use std::fs::File; +use std::iter::FromIterator; +use std::{io, str}; + +use roaring::RoaringBitmap; + +use super::helpers::{ + create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; +use crate::{FieldId, Result}; + +/// Extracts the facet string and the documents ids where this facet string appear. +/// +/// Returns a grenad reader with the list of extracted facet strings and +/// documents ids from the given chunk of docid facet string positions. +pub fn extract_facet_string_docids( + mut docid_fid_facet_string: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_string_docids_sorter = create_sorter( + keep_first_prefix_value_merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut value_buffer = Vec::new(); + while let Some((key, original_value_bytes)) = docid_fid_facet_string.next()? { + let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); + let field_id = FieldId::from_be_bytes(field_id_bytes); + let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); + let original_value = str::from_utf8(original_value_bytes)?; + + key_buffer.clear(); + FacetStringLevelZeroCodec::serialize_into( + field_id, + str::from_utf8(normalized_value_bytes)?, + &mut key_buffer, + ); + + value_buffer.clear(); + encode_prefix_string(original_value, &mut value_buffer)?; + let bitmap = RoaringBitmap::from_iter(Some(document_id)); + bitmap.serialize_into(&mut value_buffer)?; + + facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?; + } + + sorter_into_reader(facet_string_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs new file mode 100644 index 000000000..e7e56a3c8 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -0,0 +1,118 @@ +use std::collections::HashSet; +use std::fs::File; +use std::io; +use std::mem::size_of; + +use heed::zerocopy::AsBytes; +use serde_json::Value; + +use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; +use crate::error::InternalError; +use crate::facet::value_encoding::f64_into_bytes; +use crate::{DocumentId, FieldId, Result}; + +/// Extracts the facet values of each faceted field of each document. 
+///
+/// Returns the generated grenad reader containing the docid, the fid and the original value as key
+/// and the normalized value as value extracted from the given chunk of documents.
+pub fn extract_fid_docid_facet_values(
+    mut obkv_documents: grenad::Reader,
+    indexer: GrenadParameters,
+    faceted_fields: &HashSet,
+) -> Result<(grenad::Reader, grenad::Reader)> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut fid_docid_facet_numbers_sorter = create_sorter(
+        keep_first,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|m| m / 2),
+    );
+
+    let mut fid_docid_facet_strings_sorter = create_sorter(
+        keep_first,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|m| m / 2),
+    );
+
+    let mut key_buffer = Vec::new();
+    while let Some((docid_bytes, value)) = obkv_documents.next()? {
+        let obkv = obkv::KvReader::new(value);
+
+        for (field_id, field_bytes) in obkv.iter() {
+            if faceted_fields.contains(&field_id) {
+                let value =
+                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+                let (numbers, strings) = extract_facet_values(&value);
+
+                key_buffer.clear();
+
+                // prefix key with the field_id and the document_id
+                key_buffer.extend_from_slice(&field_id.to_be_bytes());
+                key_buffer.extend_from_slice(&docid_bytes);
+
+                // insert facet numbers in sorter
+                for number in numbers {
+                    key_buffer.truncate(size_of::() + size_of::());
+                    let value_bytes = f64_into_bytes(number).unwrap(); // invalid float
+                    key_buffer.extend_from_slice(&value_bytes);
+                    key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                    fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?;
+                }
+
+                // insert normalized and original facet string in sorter
+                for (normalized, original) in strings {
+                    key_buffer.truncate(size_of::() + size_of::());
+                    key_buffer.extend_from_slice(normalized.as_bytes());
+                    fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;
+                }
+            }
+        }
+    }
+
+    Ok((
+        sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?,
+        sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
+    ))
+}
+
+fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) {
+    fn inner_extract_facet_values(
+        value: &Value,
+        can_recurse: bool,
+        output_numbers: &mut Vec,
+        output_strings: &mut Vec<(String, String)>,
+    ) {
+        match value {
+            Value::Null => (),
+            Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
+            Value::Number(number) => {
+                if let Some(float) = number.as_f64() {
+                    output_numbers.push(float);
+                }
+            }
+            Value::String(original) => {
+                let normalized = original.trim().to_lowercase();
+                output_strings.push((normalized, original.clone()));
+            }
+            Value::Array(values) => {
+                if can_recurse {
+                    for value in values {
+                        inner_extract_facet_values(value, false, output_numbers, output_strings);
+                    }
+                }
+            }
+            Value::Object(_) => (),
+        }
+    }
+
+    let mut facet_number_values = Vec::new();
+    let mut facet_string_values = Vec::new();
+    inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values);
+
+    (facet_number_values, facet_string_values)
+}
diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
new file mode 100644
index 000000000..66b179663
--- /dev/null
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -0,0 +1,91 @@
+use std::collections::HashMap;
+use std::fs::File;
+use std::{cmp, io};
+
+use grenad::Sorter;
+
+use super::helpers::{
+    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
+    try_split_array_at, GrenadParameters, MergeFn,
+};
+use crate::proximity::extract_position;
+use crate::{DocumentId, FieldId, Result};
+
+/// Extracts the field id word count and the documents ids where
+/// this field id with this amount of words appears.
+///
+/// Returns a grenad reader with the list of extracted field id word counts
+/// and documents ids from the given chunk of docid word positions.
+pub fn extract_fid_word_count_docids(
+    mut docid_word_positions: grenad::Reader,
+    indexer: GrenadParameters,
+) -> Result> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut fid_word_count_docids_sorter = create_sorter(
+        merge_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory,
+    );
+
+    // This map is assumed to not consume a lot of memory.
+    let mut document_fid_wordcount = HashMap::new();
+    let mut current_document_id = None;
+
+    while let Some((key, value)) = docid_word_positions.next()? {
+        let (document_id_bytes, _word_bytes) = try_split_array_at(key).unwrap();
+        let document_id = u32::from_be_bytes(document_id_bytes);
+
+        let curr_document_id = *current_document_id.get_or_insert(document_id);
+        if curr_document_id != document_id {
+            drain_document_fid_wordcount_into_sorter(
+                &mut fid_word_count_docids_sorter,
+                &mut document_fid_wordcount,
+                curr_document_id,
+            )?;
+            current_document_id = Some(document_id);
+        }
+
+        for position in read_u32_ne_bytes(value) {
+            let (field_id, position) = extract_position(position);
+            let word_count = position + 1;
+
+            let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
+            *value = cmp::max(*value, word_count);
+        }
+    }
+
+    if let Some(document_id) = current_document_id {
+        // We must make sure that we don't lose the current document's field id
+        // word count map if we break because we reached the end of the chunk.
+        drain_document_fid_wordcount_into_sorter(
+            &mut fid_word_count_docids_sorter,
+            &mut document_fid_wordcount,
+            document_id,
+        )?;
+    }
+
+    sorter_into_reader(fid_word_count_docids_sorter, indexer)
+}
+
+fn drain_document_fid_wordcount_into_sorter(
+    fid_word_count_docids_sorter: &mut Sorter,
+    document_fid_wordcount: &mut HashMap,
+    document_id: DocumentId,
+) -> Result<()> {
+    let mut key_buffer = Vec::new();
+
+    for (fid, count) in document_fid_wordcount.drain() {
+        if count <= 10 {
+            key_buffer.clear();
+            key_buffer.extend_from_slice(&fid.to_be_bytes());
+            key_buffer.push(count as u8);
+
+            fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+        }
+    }
+
+    Ok(())
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
new file mode 100644
index 000000000..85453e173
--- /dev/null
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -0,0 +1,42 @@
+use std::fs::File;
+use std::io;
+use std::iter::FromIterator;
+
+use roaring::RoaringBitmap;
+
+use super::helpers::{
+    create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
+    try_split_array_at, GrenadParameters,
+};
+use crate::Result;
+
+/// Extracts the word and the documents ids where this word appears.
+///
+/// Returns a grenad reader with the list of extracted words and
+/// documents ids from the given chunk of docid word positions.
+pub fn extract_word_docids(
+    mut docid_word_positions: grenad::Reader,
+    indexer: GrenadParameters,
+) -> Result> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut word_docids_sorter = create_sorter(
+        merge_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory,
+    );
+
+    let mut value_buffer = Vec::new();
+    while let Some((key, _value)) = docid_word_positions.next()? {
+        let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap();
+        let document_id = u32::from_be_bytes(document_id_bytes);
+
+        let bitmap = RoaringBitmap::from_iter(Some(document_id));
+        serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
+        word_docids_sorter.insert(word_bytes, &value_buffer)?;
+    }
+
+    sorter_into_reader(word_docids_sorter, indexer)
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs
new file mode 100644
index 000000000..c7138b32a
--- /dev/null
+++ b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs
@@ -0,0 +1,46 @@
+use std::fs::File;
+use std::io;
+
+use super::helpers::{
+    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
+    try_split_array_at, GrenadParameters,
+};
+use crate::{DocumentId, Result};
+/// Extracts the word positions and the documents ids where this word appears.
+///
+/// Returns a grenad reader with the list of extracted words at positions and
+/// documents ids from the given chunk of docid word positions.
+pub fn extract_word_level_position_docids(
+    mut docid_word_positions: grenad::Reader,
+    indexer: GrenadParameters,
+) -> Result> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut word_level_position_docids_sorter = create_sorter(
+        merge_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory,
+    );
+
+    let mut key_buffer = Vec::new();
+    while let Some((key, value)) = docid_word_positions.next()? {
+        let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap();
+        let document_id = DocumentId::from_be_bytes(document_id_bytes);
+
+        for position in read_u32_ne_bytes(value) {
+            key_buffer.clear();
+            key_buffer.extend_from_slice(word_bytes);
+            key_buffer.push(0); // tree level
+
+            // Levels are composed of left and right bounds.
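+            // NOTE (editorial assumption): the key layout written below is
+            // [word bytes][level u8][left bound BE u32][right bound BE u32];
+            // at level zero both bounds hold the same single position.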
+            key_buffer.extend_from_slice(&position.to_be_bytes());
+            key_buffer.extend_from_slice(&position.to_be_bytes());
+
+            word_level_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
+        }
+    }
+
+    sorter_into_reader(word_level_position_docids_sorter, indexer)
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
new file mode 100644
index 000000000..2bc79aac5
--- /dev/null
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -0,0 +1,196 @@
+use std::cmp::Ordering;
+use std::collections::{BinaryHeap, HashMap};
+use std::fs::File;
+use std::time::{Duration, Instant};
+use std::{cmp, io, mem, str, vec};
+
+use log::debug;
+
+use super::helpers::{
+    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
+    try_split_array_at, GrenadParameters, MergeFn,
+};
+use crate::proximity::{positions_proximity, MAX_DISTANCE};
+use crate::{DocumentId, Result};
+
+/// Extracts the best proximity between pairs of words and the documents ids where this pair appears.
+///
+/// Returns a grenad reader with the list of extracted word pairs proximities and
+/// documents ids from the given chunk of docid word positions.
+pub fn extract_word_pair_proximity_docids(
+    mut docid_word_positions: grenad::Reader,
+    indexer: GrenadParameters,
+) -> Result> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut word_pair_proximity_docids_sorter = create_sorter(
+        merge_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory,
+    );
+
+    let mut number_of_documents = 0;
+    let mut total_time_aggregation = Duration::default();
+    let mut total_time_grenad_insert = Duration::default();
+
+    // This heap is assumed to not consume a lot of memory.
+    let mut document_word_positions_heap = BinaryHeap::new();
+    let mut current_document_id = None;
+
+    while let Some((key, value)) = docid_word_positions.next()? {
+        let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap();
+        let document_id = u32::from_be_bytes(document_id_bytes);
+        let word = str::from_utf8(word_bytes)?;
+
+        let curr_document_id = *current_document_id.get_or_insert(document_id);
+        if curr_document_id != document_id {
+            let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
+            document_word_positions_into_sorter(
+                curr_document_id,
+                document_word_positions_heap,
+                &mut word_pair_proximity_docids_sorter,
+                &mut total_time_aggregation,
+                &mut total_time_grenad_insert,
+            )?;
+            number_of_documents += 1;
+            current_document_id = Some(document_id);
+        }
+
+        let word = word.to_string();
+        let mut iter = read_u32_ne_bytes(value).collect::>().into_iter();
+        if let Some(position) = iter.next() {
+            document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
+        }
+    }
+
+    if let Some(document_id) = current_document_id {
+        // We must make sure that we don't lose the last document's word
+        // positions if we break because we reached the end of the chunk.
+ let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + document_word_positions_into_sorter( + document_id, + document_word_positions_heap, + &mut word_pair_proximity_docids_sorter, + &mut total_time_aggregation, + &mut total_time_grenad_insert, + )?; + } + + debug!( + "Number of documents {} + - we took {:02?} to aggregate proximities + - we took {:02?} to grenad insert those proximities", + number_of_documents, total_time_aggregation, total_time_grenad_insert, + ); + + sorter_into_reader(word_pair_proximity_docids_sorter, indexer) +} + +/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. +/// +/// This list is used by the engine to calculate the documents containing words that are +/// close to each other. +fn document_word_positions_into_sorter<'b>( + document_id: DocumentId, + mut word_positions_heap: BinaryHeap>>, + word_pair_proximity_docids_sorter: &mut grenad::Sorter, + total_time_aggregation: &mut Duration, + total_time_grenad_insert: &mut Duration, +) -> Result<()> { + let before_aggregating = Instant::now(); + let mut word_pair_proximity = HashMap::new(); + let mut ordered_peeked_word_positions = Vec::new(); + while !word_positions_heap.is_empty() { + while let Some(peeked_word_position) = word_positions_heap.pop() { + ordered_peeked_word_positions.push(peeked_word_position); + if ordered_peeked_word_positions.len() == 7 { + break; + } + } + + if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { + for PeekedWordPosition { word, position, .. } in tail { + let prox = positions_proximity(head.position, *position); + if prox > 0 && prox < MAX_DISTANCE { + word_pair_proximity + .entry((head.word.clone(), word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + + // We also compute the inverse proximity. + let prox = prox + 1; + if prox < MAX_DISTANCE { + word_pair_proximity + .entry((word.clone(), head.word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } + } + } + + // Push the tail in the heap. + let tail_iter = ordered_peeked_word_positions.drain(1..); + word_positions_heap.extend(tail_iter); + + // Advance the head and push it in the heap. 
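+            // NOTE (editorial assumption): the window of 7 positions popped
+            // above presumably mirrors the kept proximity range, since values
+            // of MAX_DISTANCE (8) or more are discarded by the checks above.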
+            if let Some(mut head) = ordered_peeked_word_positions.pop() {
+                if let Some(next_position) = head.iter.next() {
+                    word_positions_heap.push(PeekedWordPosition {
+                        word: head.word,
+                        position: next_position,
+                        iter: head.iter,
+                    });
+                }
+            }
+        }
+    }
+
+    *total_time_aggregation += before_aggregating.elapsed();
+
+    let mut key_buffer = Vec::new();
+    for ((w1, w2), prox) in word_pair_proximity {
+        key_buffer.clear();
+        key_buffer.extend_from_slice(w1.as_bytes());
+        key_buffer.push(0);
+        key_buffer.extend_from_slice(w2.as_bytes());
+        key_buffer.push(prox as u8);
+
+        let before_grenad_insert = Instant::now();
+        word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
+        *total_time_grenad_insert += before_grenad_insert.elapsed();
+    }
+
+    Ok(())
+}
+
+struct PeekedWordPosition {
+    word: String,
+    position: u32,
+    iter: I,
+}
+
+impl Ord for PeekedWordPosition {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.position.cmp(&other.position).reverse()
+    }
+}
+
+impl PartialOrd for PeekedWordPosition {
+    fn partial_cmp(&self, other: &Self) -> Option {
+        Some(self.cmp(other))
+    }
+}
+
+impl Eq for PeekedWordPosition {}
+
+impl PartialEq for PeekedWordPosition {
+    fn eq(&self, other: &Self) -> bool {
+        self.position == other.position
+    }
+}
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
new file mode 100644
index 000000000..b24c80da4
--- /dev/null
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -0,0 +1,199 @@
+mod extract_docid_word_positions;
+mod extract_facet_number_docids;
+mod extract_facet_string_docids;
+mod extract_fid_docid_facet_values;
+mod extract_fid_word_count_docids;
+mod extract_word_docids;
+mod extract_word_level_position_docids;
+mod extract_word_pair_proximity_docids;
+
+use std::collections::HashSet;
+use std::fs::File;
+
+use crossbeam_channel::Sender;
+use rayon::prelude::*;
+
+use self::extract_docid_word_positions::extract_docid_word_positions;
+use self::extract_facet_number_docids::extract_facet_number_docids;
+use self::extract_facet_string_docids::extract_facet_string_docids;
+use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
+use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
+use self::extract_word_docids::extract_word_docids;
+use self::extract_word_level_position_docids::extract_word_level_position_docids;
+use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
+use super::helpers::{
+    into_clonable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
+    merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
+};
+use super::{helpers, TypedChunk};
+use crate::{FieldId, Result};
+
+/// Extract data for each database from obkv documents in parallel.
+/// Send data in grenad files over the provided Sender.
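Note: the function below is the heart of the new indexing pipeline: obkv document chunks are fanned out over a rayon pool, each extractor produces grenad chunks, and everything is funneled through one crossbeam channel to the single LMDB writer. A minimal, self-contained sketch of that fan-in pattern (illustrative names only, not the real types):

    use crossbeam_channel::unbounded;
    use rayon::prelude::*;

    fn main() {
        let (tx, rx) = unbounded();
        rayon::spawn(move || {
            // Extractors run in parallel, each sending its result down the channel.
            (0..4u32).into_par_iter().for_each_with(tx, |tx, chunk_id| {
                tx.send(format!("chunk {}", chunk_id)).unwrap();
            });
        });
        // A single consumer drains the channel, mirroring the one LMDB write txn.
        for chunk in rx {
            println!("writing {}", chunk);
        }
    }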
+pub(crate) fn data_from_obkv_documents( + obkv_chunks: impl Iterator>> + Send, + indexer: GrenadParameters, + lmdb_writer_sx: Sender, + searchable_fields: Option>, + faceted_fields: HashSet, +) -> Result<()> { + let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks + .par_bridge() + .map(|result| { + let documents_chunk = result.and_then(|c| unsafe { into_clonable_grenad(c) }).unwrap(); + + lmdb_writer_sx.send(TypedChunk::Documents(documents_chunk.clone())).unwrap(); + + let (docid_word_positions_chunk, docid_fid_facet_values_chunks): ( + Result<_>, + Result<_>, + ) = rayon::join( + || { + let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( + documents_chunk.clone(), + indexer.clone(), + &searchable_fields, + )?; + + // send documents_ids to DB writer + lmdb_writer_sx.send(TypedChunk::NewDocumentsIds(documents_ids)).unwrap(); + + // send docid_word_positions_chunk to DB writer + let docid_word_positions_chunk = + unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; + lmdb_writer_sx + .send(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())) + .unwrap(); + Ok(docid_word_positions_chunk) + }, + || { + let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = + extract_fid_docid_facet_values( + documents_chunk.clone(), + indexer.clone(), + &faceted_fields, + )?; + + // send docid_fid_facet_numbers_chunk to DB writer + let docid_fid_facet_numbers_chunk = + unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; + lmdb_writer_sx + .send(TypedChunk::FieldIdDocidFacetNumbers( + docid_fid_facet_numbers_chunk.clone(), + )) + .unwrap(); + + // send docid_fid_facet_strings_chunk to DB writer + let docid_fid_facet_strings_chunk = + unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? 
}; + lmdb_writer_sx + .send(TypedChunk::FieldIdDocidFacetStrings( + docid_fid_facet_strings_chunk.clone(), + )) + .unwrap(); + + Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) + }, + ); + Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) + }) + .collect(); + + let ( + docid_word_positions_chunks, + (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), + ) = result?; + + spawn_extraction_task( + docid_word_positions_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_word_pair_proximity_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordPairProximityDocids, + "word-pair-proximity-docids", + ); + + spawn_extraction_task( + docid_word_positions_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_fid_word_count_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdWordcountDocids, + "field-id-wordcount-docids", + ); + + spawn_extraction_task( + docid_word_positions_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_word_docids, + merge_roaring_bitmaps, + TypedChunk::WordDocids, + "word-docids", + ); + + spawn_extraction_task( + docid_word_positions_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_word_level_position_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordLevelPositionDocids, + "word-level-position-docids", + ); + + spawn_extraction_task( + docid_fid_facet_strings_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_facet_string_docids, + keep_first_prefix_value_merge_roaring_bitmaps, + TypedChunk::FieldIdFacetStringDocids, + "field-id-facet-string-docids", + ); + + spawn_extraction_task( + docid_fid_facet_numbers_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_facet_number_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdFacetNumberDocids, + "field-id-facet-number-docids", + ); + + Ok(()) +} + +/// Spawn a new task to extract data for a specific DB using extract_fn. +/// Generated grenad chunks are merged using the merge_fn. +/// The result of merged chunks is serialized as TypedChunk using the serialize_fn +/// and sent into lmdb_writer_sx. 
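Note: the nested `rayon::spawn` in the function below appears designed so that merging one database's chunks does not hold the worker that ran the extraction. A minimal demonstration of that spawn-inside-spawn pattern (std mpsc stands in for the crossbeam channel):

    fn main() {
        let (tx, rx) = std::sync::mpsc::channel();
        rayon::spawn(move || {
            // Stage 1: the parallel extraction would happen here.
            rayon::spawn(move || {
                // Stage 2: merging runs as its own task, so the worker that
                // finished stage 1 is free to pick up other extraction work.
                tx.send("merged").unwrap();
            });
        });
        assert_eq!(rx.recv().unwrap(), "merged");
    }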
+fn spawn_extraction_task( + chunks: Vec>, + indexer: GrenadParameters, + lmdb_writer_sx: Sender, + extract_fn: FE, + merge_fn: MergeFn, + serialize_fn: FS, + name: &'static str, +) where + FE: Fn(grenad::Reader, GrenadParameters) -> Result> + + Sync + + Send + + 'static, + FS: Fn(grenad::Reader) -> TypedChunk + Sync + Send + 'static, +{ + rayon::spawn(move || { + let chunks: Vec<_> = chunks + .into_par_iter() + .map(|chunk| extract_fn(chunk, indexer.clone()).unwrap()) + .collect(); + rayon::spawn(move || { + let reader = merge_readers(chunks, merge_fn, indexer).unwrap(); + lmdb_writer_sx.send(serialize_fn(reader)).unwrap(); + }); + }); +} diff --git a/milli/src/update/index_documents/helpers/clonable_mmap.rs b/milli/src/update/index_documents/helpers/clonable_mmap.rs new file mode 100644 index 000000000..b16c080ff --- /dev/null +++ b/milli/src/update/index_documents/helpers/clonable_mmap.rs @@ -0,0 +1,22 @@ +use std::sync::Arc; + +use memmap::Mmap; + +#[derive(Debug, Clone)] +pub struct ClonableMmap { + inner: Arc, +} + +impl AsRef<[u8]> for ClonableMmap { + fn as_ref(&self) -> &[u8] { + self.inner.as_ref() + } +} + +impl From for ClonableMmap { + fn from(inner: Mmap) -> ClonableMmap { + ClonableMmap { inner: Arc::new(inner) } + } +} + +pub type CursorClonableMmap = std::io::Cursor; diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs new file mode 100644 index 000000000..9dd261f73 --- /dev/null +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -0,0 +1,276 @@ +use std::borrow::Cow; +use std::fs::File; +use std::io::{self, Seek, SeekFrom}; +use std::time::Instant; + +use byte_unit::Byte; +use grenad::{CompressionType, MergerIter, Reader, Sorter}; +use heed::types::ByteSlice; +use log::debug; + +use super::{ClonableMmap, MergeFn}; +use crate::error::InternalError; +use crate::update::index_documents::WriteMethod; +use crate::Result; + +pub type CursorClonableMmap = io::Cursor; + +pub fn create_writer( + typ: grenad::CompressionType, + level: Option, + file: R, +) -> io::Result> { + let mut builder = grenad::Writer::builder(); + builder.compression_type(typ); + if let Some(level) = level { + builder.compression_level(level); + } + builder.build(file) +} + +pub fn create_sorter( + merge: MergeFn, + chunk_compression_type: grenad::CompressionType, + chunk_compression_level: Option, + max_nb_chunks: Option, + max_memory: Option, +) -> grenad::Sorter { + let mut builder = grenad::Sorter::builder(merge); + builder.chunk_compression_type(chunk_compression_type); + if let Some(level) = chunk_compression_level { + builder.chunk_compression_level(level); + } + if let Some(nb_chunks) = max_nb_chunks { + builder.max_nb_chunks(nb_chunks); + } + if let Some(memory) = max_memory { + builder.dump_threshold(memory); + builder.allow_realloc(false); + } + builder.build() +} + +pub fn sorter_into_reader( + sorter: grenad::Sorter, + indexer: GrenadParameters, +) -> Result> { + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) + })?; + sorter.write_into(&mut writer)?; + Ok(writer_into_reader(writer)?) 
+} + +pub fn writer_into_reader(writer: grenad::Writer) -> Result> { + let mut file = writer.into_inner()?; + file.seek(SeekFrom::Start(0))?; + grenad::Reader::new(file).map_err(Into::into) +} + +pub unsafe fn into_clonable_grenad( + reader: grenad::Reader, +) -> Result> { + let file = reader.into_inner(); + let mmap = memmap::Mmap::map(&file)?; + let cursor = io::Cursor::new(ClonableMmap::from(mmap)); + let reader = grenad::Reader::new(cursor)?; + Ok(reader) +} + +pub fn merge_readers( + readers: Vec>, + merge_fn: MergeFn, + indexer: GrenadParameters, +) -> Result> { + let mut merger_builder = grenad::MergerBuilder::new(merge_fn); + merger_builder.extend(readers); + let merger = merger_builder.build(); + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) + })?; + merger.write_into(&mut writer)?; + let reader = writer_into_reader(writer)?; + Ok(reader) +} + +#[derive(Debug, Clone, Copy)] +pub struct GrenadParameters { + pub chunk_compression_type: CompressionType, + pub chunk_compression_level: Option, + pub max_memory: Option, + pub max_nb_chunks: Option, +} + +impl Default for GrenadParameters { + fn default() -> Self { + Self { + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_memory: None, + max_nb_chunks: None, + } + } +} + +impl GrenadParameters { + pub fn max_memory_by_thread(&self) -> Option { + self.max_memory.map(|max_memory| max_memory / rayon::current_num_threads()) + } +} + +/// Returns an iterator that outputs grenad readers of obkv documents +/// with a maximum size of approximately `documents_chunks_size`. +/// +/// The grenad obkv entries are composed of an incremental document id big-endian +/// encoded as the key and an obkv object with an `u8` for the field as the key +/// and a simple UTF-8 encoded string as the value. +pub fn grenad_obkv_into_chunks( + mut reader: grenad::Reader, + indexer: GrenadParameters, + log_frequency: Option, + documents_chunk_size: Byte, +) -> Result>>> { + let mut document_count = 0; + let mut continue_reading = true; + + let indexer_clone = indexer.clone(); + let mut transposer = move || { + if !continue_reading { + return Ok(None); + } + + let mut current_chunk_size = 0u64; + let mut obkv_documents = tempfile::tempfile().and_then(|file| { + create_writer( + indexer_clone.chunk_compression_type, + indexer_clone.chunk_compression_level, + file, + ) + })?; + + while let Some((document_id, obkv)) = reader.next()? 
{ + obkv_documents.insert(document_id, obkv)?; + current_chunk_size += document_id.len() as u64 + obkv.len() as u64; + + document_count += 1; + if log_frequency.map_or(false, |log_frequency| document_count % log_frequency == 0) { + debug!("reached {} chunked documents", document_count); + } + + if current_chunk_size >= documents_chunk_size.get_bytes() { + return writer_into_reader(obkv_documents).map(Some); + } + } + + continue_reading = false; + writer_into_reader(obkv_documents).map(Some) + }; + + Ok(std::iter::from_fn(move || { + let result = transposer().transpose(); + if result.as_ref().map_or(false, |r| r.is_ok()) { + debug!( + "A new chunk of approximately {} has been generated", + documents_chunk_size.get_appropriate_unit(true), + ); + } + result + })) +} + +pub fn write_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + mut reader: Reader, + merge: MergeFn, + method: WriteMethod, +) -> Result<()> { + debug!("Writing MTBL stores..."); + let before = Instant::now(); + + match method { + WriteMethod::Append => { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + while let Some((k, v)) = reader.next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; + } + } + WriteMethod::GetMergePut => { + while let Some((k, v)) = reader.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? { + Some((key, old_val)) if key == k => { + let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; + let val = merge(k, &vals)?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(k, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + } + } + } + + debug!("MTBL stores merged in {:.02?}!", before.elapsed()); + Ok(()) +} + +pub fn sorter_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + sorter: Sorter, + merge: MergeFn, + method: WriteMethod, +) -> Result<()> { + debug!("Writing MTBL sorter..."); + let before = Instant::now(); + + merger_iter_into_lmdb_database(wtxn, database, sorter.into_merger_iter()?, merge, method)?; + + debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); + Ok(()) +} + +fn merger_iter_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + mut sorter: MergerIter, + merge: MergeFn, + method: WriteMethod, +) -> Result<()> { + match method { + WriteMethod::Append => { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + while let Some((k, v)) = sorter.next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; + } + } + WriteMethod::GetMergePut => { + while let Some((k, v)) = sorter.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? { + Some((key, old_val)) if key == k => { + let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; + let val = merge(k, &vals).map_err(|_| { + // TODO just wrap this error? + InternalError::IndexingMergingKeys { process: "get-put-merge" } + })?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(k, &val)? 
};
+                }
+                _ => {
+                    drop(iter);
+                    database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
new file mode 100644
index 000000000..6a592e54d
--- /dev/null
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -0,0 +1,171 @@
+use std::borrow::Cow;
+use std::io;
+use std::result::Result as StdResult;
+
+use roaring::RoaringBitmap;
+
+use super::read_u32_ne_bytes;
+use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
+use crate::heed_codec::CboRoaringBitmapCodec;
+use crate::Result;
+
+pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result>;
+
+pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> {
+    if values.len() == 1 {
+        Ok(values[0].clone())
+    } else {
+        let capacity = values.iter().map(|v| v.len()).sum::();
+        let mut output = Vec::with_capacity(capacity);
+        values.iter().for_each(|integers| output.extend_from_slice(integers));
+        Ok(Cow::Owned(output))
+    }
+}
+
+pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap {
+    read_u32_ne_bytes(slice).collect()
+}
+
+pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> io::Result<()> {
+    buffer.clear();
+    buffer.reserve(bitmap.serialized_size());
+    bitmap.serialize_into(buffer)
+}
+
+pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> {
+    if values.len() == 1 {
+        Ok(values[0].clone())
+    } else {
+        let merged = values
+            .iter()
+            .map(AsRef::as_ref)
+            .map(RoaringBitmap::deserialize_from)
+            .map(StdResult::unwrap)
+            .reduce(|a, b| a | b)
+            .unwrap();
+        let mut buffer = Vec::new();
+        serialize_roaring_bitmap(&merged, &mut buffer)?;
+        Ok(Cow::Owned(buffer))
+    }
+}
+
+pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>(
+    _key: &[u8],
+    values: &[Cow<'a, [u8]>],
+) -> Result> {
+    if values.len() == 1 {
+        Ok(values[0].clone())
+    } else {
+        let original = decode_prefix_string(&values[0]).unwrap().0;
+        let merged_bitmaps = values
+            .iter()
+            .map(AsRef::as_ref)
+            .map(decode_prefix_string)
+            .map(Option::unwrap)
+            .map(|(_, bitmap_bytes)| bitmap_bytes)
+            .map(RoaringBitmap::deserialize_from)
+            .map(StdResult::unwrap)
+            .reduce(|a, b| a | b)
+            .unwrap();
+
+        let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size();
+        let mut buffer = Vec::with_capacity(cap);
+        encode_prefix_string(original, &mut buffer)?;
+        merged_bitmaps.serialize_into(&mut buffer)?;
+        Ok(Cow::Owned(buffer))
+    }
+}
+
+pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> {
+    Ok(values[0].clone())
+}
+
+/// Only the last value associated with an id is kept.
+pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> {
+    Ok(obkvs.last().unwrap().clone())
+}
+
+/// Merge all the obkvs in the order we see them.
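Note: every helper above follows the same `MergeFn` contract: given one key and all the values collected for it, produce the single value to keep. A toy merge function under that contract (standalone; a unit error type stands in for the crate's `Result` alias), before the `merge_obkvs` definition this doc comment introduces:

    use std::borrow::Cow;

    // Keep the longest of the conflicting values (illustrative policy only).
    fn keep_longest<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>, ()> {
        values.iter().max_by_key(|v| v.len()).cloned().ok_or(())
    }

    fn main() {
        let values = [Cow::Borrowed(&b"ab"[..]), Cow::Borrowed(&b"abcd"[..])];
        assert_eq!(keep_longest(b"key", &values).unwrap().as_ref(), &b"abcd"[..]);
    }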
+pub fn merge_obkvs<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { + Ok(obkvs + .into_iter() + .cloned() + .reduce(|acc, current| { + let first = obkv::KvReader::new(&acc); + let second = obkv::KvReader::new(¤t); + let mut buffer = Vec::new(); + merge_two_obkvs(first, second, &mut buffer); + Cow::from(buffer) + }) + .unwrap()) +} + +pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec) { + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + buffer.clear(); + + let mut writer = obkv::KvWriter::new(buffer); + for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { + match eob { + Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), + } + } + + writer.finish().unwrap(); +} + +pub fn merge_cbo_roaring_bitmaps<'a>( + _key: &[u8], + values: &[Cow<'a, [u8]>], +) -> Result> { + match values.split_first().unwrap() { + (head, []) => Ok(head.clone()), + (head, tail) => { + let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; + + for value in tail { + head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; + } + + let mut vec = Vec::new(); + CboRoaringBitmapCodec::serialize_into(&head, &mut vec); + Ok(Cow::from(vec)) + } + } +} + +// /// Uses the FacetStringLevelZeroValueCodec to merge the values. +// pub fn tuple_string_cbo_roaring_bitmap_merge<'a>( +// _key: &[u8], +// values: &[Cow<[u8]>], +// ) -> Result> { +// let (head, tail) = values.split_first().unwrap(); +// let (head_string, mut head_rb) = FacetStringLevelZeroValueCodec::bytes_decode(&head[..]) +// .ok_or(SerializationError::Decoding { db_name: None })?; + +// for value in tail { +// let (_string, rb) = FacetStringLevelZeroValueCodec::bytes_decode(&value[..]) +// .ok_or(SerializationError::Decoding { db_name: None })?; +// head_rb |= rb; +// } + +// FacetStringLevelZeroValueCodec::bytes_encode(&(head_string, head_rb)) +// .map(|cow| cow.into_owned()) +// .ok_or(SerializationError::Encoding { db_name: None }) +// .map_err(Into::into) +// } + +// pub fn cbo_roaring_bitmap_merge<'a>(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { +// let (head, tail) = values.split_first().unwrap(); +// let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; + +// for value in tail { +// head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; +// } + +// let mut vec = Vec::new(); +// CboRoaringBitmapCodec::serialize_into(&head, &mut vec); +// Ok(vec) +// } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs new file mode 100644 index 000000000..baacb0a1b --- /dev/null +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -0,0 +1,49 @@ +mod clonable_mmap; +mod grenad_helpers; +mod merge_functions; + +use std::convert::{TryFrom, TryInto}; + +pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; +pub use grenad_helpers::{ + create_sorter, create_writer, grenad_obkv_into_chunks, into_clonable_grenad, merge_readers, + sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, + GrenadParameters, +}; +pub use merge_functions::{ + concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, + merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs, + roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, +}; + +pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { + key.as_ref().len() <= 511 +} + +/// Divides one slice into two 
at an index, returns `None` if mid is out of bounds. +pub fn try_split_at(slice: &[T], mid: usize) -> Option<(&[T], &[T])> { + if mid <= slice.len() { + Some(slice.split_at(mid)) + } else { + None + } +} + +/// Divides one slice into an array and the tail at an index, +/// returns `None` if `N` is out of bounds. +pub fn try_split_array_at(slice: &[T]) -> Option<([T; N], &[T])> +where + [T; N]: for<'a> TryFrom<&'a [T]>, +{ + let (head, tail) = try_split_at(slice, N)?; + let head = head.try_into().ok()?; + Some((head, tail)) +} + +// pub fn pretty_thousands, T: fmt::Display>(number: A) -> String { +// thousands::Separable::separate_with_spaces(number.borrow()) +// } + +pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator + '_ { + bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) +} diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs deleted file mode 100644 index 7e5d0b581..000000000 --- a/milli/src/update/index_documents/merge_function.rs +++ /dev/null @@ -1,106 +0,0 @@ -use std::borrow::Cow; -use std::result::Result as StdResult; - -use fst::IntoStreamer; -use heed::{BytesDecode, BytesEncode}; -use roaring::RoaringBitmap; - -use crate::error::SerializationError; -use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::Result; - -/// Only the last value associated with an id is kept. -pub fn keep_latest_obkv(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result> { - Ok(obkvs.last().unwrap().clone().into_owned()) -} - -/// Merge all the obks in the order we see them. -pub fn merge_obkvs(_key: &[u8], obkvs: &[Cow<[u8]>]) -> Result> { - let mut iter = obkvs.iter(); - let first = iter.next().map(|b| b.clone().into_owned()).unwrap(); - Ok(iter.fold(first, |acc, current| { - let first = obkv::KvReader::new(&acc); - let second = obkv::KvReader::new(current); - let mut buffer = Vec::new(); - merge_two_obkvs(first, second, &mut buffer); - buffer - })) -} - -// Union of multiple FSTs -pub fn fst_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - let fsts = values.iter().map(fst::Set::new).collect::, _>>()?; - let op_builder: fst::set::OpBuilder = fsts.iter().map(|fst| fst.into_stream()).collect(); - let op = op_builder.r#union(); - - let mut build = fst::SetBuilder::memory(); - build.extend_stream(op.into_stream()).unwrap(); - Ok(build.into_inner().unwrap()) -} - -pub fn keep_first(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - Ok(values.first().unwrap().to_vec()) -} - -pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec) { - use itertools::merge_join_by; - use itertools::EitherOrBoth::{Both, Left, Right}; - - buffer.clear(); - - let mut writer = obkv::KvWriter::new(buffer); - for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { - match eob { - Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), - } - } - - writer.finish().unwrap(); -} - -pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - let (head, tail) = values.split_first().unwrap(); - let mut head = RoaringBitmap::deserialize_from(&head[..])?; - - for value in tail { - head |= RoaringBitmap::deserialize_from(&value[..])?; - } - - let mut vec = Vec::with_capacity(head.serialized_size()); - head.serialize_into(&mut vec)?; - Ok(vec) -} - -/// Uses the FacetStringLevelZeroValueCodec to merge the values. 
-pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - let (head, tail) = values.split_first().unwrap(); - let (head_string, mut head_rb) = - FacetStringLevelZeroValueCodec::::bytes_decode(&head[..]) - .ok_or(SerializationError::Decoding { db_name: None })?; - - for value in tail { - let (_string, rb) = - FacetStringLevelZeroValueCodec::::bytes_decode(&value[..]) - .ok_or(SerializationError::Decoding { db_name: None })?; - head_rb |= rb; - } - - FacetStringLevelZeroValueCodec::::bytes_encode(&(head_string, head_rb)) - .map(|cow| cow.into_owned()) - .ok_or(SerializationError::Encoding { db_name: None }) - .map_err(Into::into) -} - -pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { - let (head, tail) = values.split_first().unwrap(); - let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; - - for value in tail { - head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; - } - - let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec); - Ok(vec) -} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index afae8cae9..4f488337c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,240 +1,41 @@ -use std::borrow::Cow; +mod extract; +mod helpers; +mod transform; +mod typed_chunk; + use std::collections::HashSet; -use std::fs::File; -use std::io::{self, BufRead, BufReader, Seek, SeekFrom}; +use std::io::{self, BufRead, BufReader}; +use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; -use std::result::Result as StdResult; -use std::str; -use std::sync::mpsc::sync_channel; use std::time::Instant; -use bstr::ByteSlice as _; +use byte_unit::Byte; use chrono::Utc; -use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer}; -use heed::types::ByteSlice; -use log::{debug, error, info}; -use memmap::Mmap; -use rayon::prelude::*; +use crossbeam_channel::{Receiver, Sender}; +use grenad::{self, CompressionType}; +use log::{debug, info}; use rayon::ThreadPool; +use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; +use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; -pub use self::merge_function::{ - cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, - tuple_string_cbo_roaring_bitmap_merge, +pub use self::helpers::{ + create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, + sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, }; -use self::store::{Readers, Store}; +use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use super::UpdateBuilder; -use crate::error::{Error, InternalError}; use crate::update::{ - Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, + Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordsLevelPositions, WordsPrefixesFst, }; -use crate::{Index, MergeFn, Result}; - -mod merge_function; -mod store; -mod transform; +use crate::{Index, Result}; #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DocumentAdditionResult { pub nb_documents: usize, } -#[derive(Debug, Copy, Clone)] -pub enum WriteMethod { - Append, - GetMergePut, -} - -pub fn create_writer( - typ: CompressionType, - level: Option, - file: File, -) -> io::Result> { - let mut builder = Writer::builder(); - builder.compression_type(typ); - if let Some(level) = 
level { - builder.compression_level(level); - } - builder.build(file) -} - -pub fn create_sorter( - merge: MergeFn, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, - chunk_fusing_shrink_size: Option, - max_nb_chunks: Option, - max_memory: Option, -) -> Sorter> { - let mut builder = Sorter::builder(merge); - if let Some(shrink_size) = chunk_fusing_shrink_size { - builder.file_fusing_shrink_size(shrink_size); - } - builder.chunk_compression_type(chunk_compression_type); - if let Some(level) = chunk_compression_level { - builder.chunk_compression_level(level); - } - if let Some(nb_chunks) = max_nb_chunks { - builder.max_nb_chunks(nb_chunks); - } - if let Some(memory) = max_memory { - builder.max_memory(memory); - } - builder.build() -} - -pub fn writer_into_reader( - writer: Writer, - shrink_size: Option, -) -> Result> { - let mut file = writer.into_inner()?; - file.seek(SeekFrom::Start(0))?; - let file = if let Some(shrink_size) = shrink_size { - FileFuse::builder().shrink_size(shrink_size).build(file) - } else { - FileFuse::new(file) - }; - Reader::new(file).map_err(Into::into) -} - -pub fn merge_readers( - sources: Vec>, - merge: MergeFn, -) -> Merger> { - let mut builder = Merger::builder(merge); - builder.extend(sources); - builder.build() -} - -pub fn merge_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - sources: Vec>, - merge: MergeFn, - method: WriteMethod, -) -> Result<()> -where - Error: From, -{ - debug!("Merging {} MTBL stores...", sources.len()); - let before = Instant::now(); - - let merger = merge_readers(sources, merge); - merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?; - - debug!("MTBL stores merged in {:.02?}!", before.elapsed()); - Ok(()) -} - -pub fn write_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - mut reader: Reader, - merge: MergeFn, - method: WriteMethod, -) -> Result<()> -where - Error: From, -{ - debug!("Writing MTBL stores..."); - let before = Instant::now(); - - match method { - WriteMethod::Append => { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = reader.next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } - WriteMethod::GetMergePut => { - while let Some((k, v)) = reader.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; - let val = merge(k, &vals)?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? 
}; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - } - } - - debug!("MTBL stores merged in {:.02?}!", before.elapsed()); - Ok(()) -} - -pub fn sorter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - sorter: Sorter>, - merge: MergeFn, - method: WriteMethod, -) -> Result<()> -where - Error: From, - Error: From>, -{ - debug!("Writing MTBL sorter..."); - let before = Instant::now(); - - merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?; - - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); - Ok(()) -} - -fn merger_iter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - mut sorter: MergerIter>, - merge: MergeFn, - method: WriteMethod, -) -> Result<()> -where - Error: From, -{ - match method { - WriteMethod::Append => { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = sorter.next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } - WriteMethod::GetMergePut => { - while let Some((k, v)) = sorter.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).map_err(|_| { - // TODO just wrap this error? - InternalError::IndexingMergingKeys { process: "get-put-merge" } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - } - } - - Ok(()) -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[non_exhaustive] pub enum IndexDocumentsMethod { @@ -247,6 +48,12 @@ pub enum IndexDocumentsMethod { UpdateDocuments, } +#[derive(Debug, Copy, Clone)] +pub enum WriteMethod { + Append, + GetMergePut, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[non_exhaustive] pub enum UpdateFormat { @@ -262,12 +69,11 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, pub(crate) log_every_n: Option, + pub(crate) documents_chunk_size: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, - pub(crate) linked_hash_map_size: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, facet_level_group_size: Option, facet_min_level_size: Option, @@ -291,12 +97,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { wtxn, index, log_every_n: None, + documents_chunk_size: None, max_nb_chunks: None, max_memory: None, - linked_hash_map_size: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, thread_pool: None, facet_level_group_size: None, facet_min_level_size: None, @@ -344,14 +149,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let before_transform = Instant::now(); let update_id = self.update_id; let progress_callback = |step| progress_callback(step, update_id); - let transform = Transform { rtxn: &self.wtxn, index: self.index, log_every_n: self.log_every_n, chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, - 
chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
             max_nb_chunks: self.max_nb_chunks,
             max_memory: self.max_memory,
             index_documents_method: self.update_method,
@@ -378,8 +181,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
     where
         F: Fn(UpdateIndexingStep) + Sync,
     {
-        let before_indexing = Instant::now();
-
         let TransformOutput {
             primary_key,
             fields_ids_map,
@@ -395,6 +196,65 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         // up to date field map.
         self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
 
+        let backup_pool;
+        let pool = match self.thread_pool {
+            Some(pool) => pool,
+            #[cfg(not(test))]
+            None => {
+                // We initialize a backup pool with the default
+                // settings if none have already been set.
+                backup_pool = rayon::ThreadPoolBuilder::new().build()?;
+                &backup_pool
+            }
+            #[cfg(test)]
+            None => {
+                // We initialize a backup pool with the default
+                // settings if none have already been set.
+                backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
+                &backup_pool
+            }
+        };
+
+        let documents_file = grenad::Reader::new(documents_file)?;
+
+        // create LMDB writer channel
+        let (lmdb_writer_sx, lmdb_writer_rx): (Sender<TypedChunk>, Receiver<TypedChunk>) =
+            crossbeam_channel::unbounded();
+
+        // get searchable fields for word databases
+        let searchable_fields =
+            self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
+        // get filterable fields for facet databases
+        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
+
+        // Run extraction pipeline in parallel.
+        pool.install(|| {
+            let params = GrenadParameters {
+                chunk_compression_type: self.chunk_compression_type,
+                chunk_compression_level: self.chunk_compression_level,
+                max_memory: self.max_memory,
+                max_nb_chunks: self.max_nb_chunks, // default value, may be chosen.
+            };
+
+            // split obkv file into several chunks
+            let mut chunk_iter = grenad_obkv_into_chunks(
+                documents_file,
+                params.clone(),
+                self.log_every_n,
+                Byte::from_bytes(self.documents_chunk_size.unwrap_or(1024 * 1024 * 128) as u64), // 128MiB
+            )
+            .unwrap();
+            // extract all databases from the chunked obkv documents
+            extract::data_from_obkv_documents(
+                &mut chunk_iter,
+                params,
+                lmdb_writer_sx,
+                searchable_fields,
+                faceted_fields,
+            )
+            .unwrap();
+        });
+
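The new pipeline fans extraction out over the rayon pool and funnels every finished chunk through a single unbounded crossbeam channel, so that all LMDB writes happen on the one thread that owns the write transaction. A minimal, self-contained sketch of that fan-in shape, using plain threads in place of the rayon pool and a hypothetical ExtractedChunk enum standing in for milli's TypedChunk (assumes only the crossbeam-channel crate):

use crossbeam_channel::unbounded;
use std::thread;

// Hypothetical stand-in for milli's `TypedChunk`.
enum ExtractedChunk {
    WordDocids(Vec<(String, u32)>),
    Documents(Vec<(u32, String)>),
}

fn main() {
    let (tx, rx) = unbounded::<ExtractedChunk>();

    // Fan out: every extraction worker owns a clone of the sender.
    let workers: Vec<_> = (0..4u32)
        .map(|i| {
            let tx = tx.clone();
            thread::spawn(move || {
                // Real extraction work would happen here.
                tx.send(ExtractedChunk::WordDocids(vec![(format!("word{}", i), i)])).unwrap();
                if i == 0 {
                    tx.send(ExtractedChunk::Documents(vec![(i, "obkv bytes".into())])).unwrap();
                }
            })
        })
        .collect();

    // Drop the original sender so the iteration below terminates
    // once the last worker hangs up.
    drop(tx);

    // Fan in: a single consumer drains the channel and performs
    // all the database writes sequentially.
    for chunk in rx {
        match chunk {
            ExtractedChunk::WordDocids(entries) => println!("write {} word docids", entries.len()),
            ExtractedChunk::Documents(docs) => println!("write {} documents", docs.len()),
        }
    }

    workers.into_iter().for_each(|w| w.join().unwrap());
}

An unbounded channel keeps the extractors from ever blocking on the writer; the trade-off is that chunks can pile up in memory if the LMDB writes fall behind.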
 // We delete the documents that this document addition replaces. This way we are
 // able to simply insert all the documents even if they already exist in the database.
 if !replaced_documents_ids.is_empty() {
@@ -402,10 +262,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                 log_every_n: self.log_every_n,
                 max_nb_chunks: self.max_nb_chunks,
                 max_memory: self.max_memory,
-                linked_hash_map_size: self.linked_hash_map_size,
                 chunk_compression_type: self.chunk_compression_type,
                 chunk_compression_level: self.chunk_compression_level,
-                chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,
                 thread_pool: self.thread_pool,
                 update_id: self.update_id,
             };
@@ -416,190 +274,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             debug!("{} documents actually deleted", deleted_documents_count);
         }
 
-        if documents_count == 0 {
-            return Ok(());
+        let index_documents_ids = self.index.documents_ids(self.wtxn)?;
+        let index_is_empty = index_documents_ids.len() == 0;
+        let mut final_documents_ids = RoaringBitmap::new();
+
+        for typed_chunk in lmdb_writer_rx {
+            let docids =
+                write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?;
+            final_documents_ids |= docids;
+            debug!(
+                "We have seen {} documents out of {} total documents so far",
+                final_documents_ids.len(),
+                documents_count
+            );
        }
 
-        let bytes = unsafe { Mmap::map(&documents_file)? };
-        let documents = grenad::Reader::new(bytes.as_bytes()).unwrap();
-
-        // The enum which indicates the type of the readers
-        // merges that are potentially done on different threads.
-        enum DatabaseType {
-            Main,
-            WordDocids,
-            WordLevel0PositionDocids,
-            FieldIdWordCountDocids,
-            FacetLevel0NumbersDocids,
-        }
-
-        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
-        let searchable_fields: HashSet<_> = match self.index.searchable_fields_ids(self.wtxn)? {
-            Some(fields) => fields.iter().copied().collect(),
-            None => fields_ids_map.iter().map(|(id, _name)| id).collect(),
-        };
-
-        let stop_words = self.index.stop_words(self.wtxn)?;
-        let stop_words = stop_words.as_ref();
-        let linked_hash_map_size = self.linked_hash_map_size;
-        let max_nb_chunks = self.max_nb_chunks;
-        let max_memory = self.max_memory;
-        let chunk_compression_type = self.chunk_compression_type;
-        let chunk_compression_level = self.chunk_compression_level;
-        let log_every_n = self.log_every_n;
-        let chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
-
-        let backup_pool;
-        let pool = match self.thread_pool {
-            Some(pool) => pool,
-            None => {
-                // We initialize a bakcup pool with the default
-                // settings if none have already been set.
- backup_pool = rayon::ThreadPoolBuilder::new().build()?; - &backup_pool - } - }; - - let readers = pool.install(|| { - let num_threads = rayon::current_num_threads(); - let max_memory_by_job = max_memory.map(|mm| mm / num_threads); - - let readers = rayon::iter::repeatn(documents, num_threads) - .enumerate() - .map(|(i, documents)| { - let store = Store::new( - searchable_fields.clone(), - faceted_fields.clone(), - linked_hash_map_size, - max_nb_chunks, - max_memory_by_job, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - stop_words, - )?; - store.index( - documents, - documents_count, - i, - num_threads, - log_every_n, - &progress_callback, - ) - }) - .collect::, _>>()?; - - let mut main_readers = Vec::with_capacity(readers.len()); - let mut word_docids_readers = Vec::with_capacity(readers.len()); - let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); - let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); - let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); - let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len()); - let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len()); - let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len()); - let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len()); - let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len()); - let mut documents_readers = Vec::with_capacity(readers.len()); - readers.into_iter().for_each(|readers| { - let Readers { - main, - word_docids, - docid_word_positions, - words_pairs_proximities_docids, - word_level_position_docids, - field_id_word_count_docids, - facet_field_numbers_docids, - facet_field_strings_docids, - field_id_docid_facet_numbers, - field_id_docid_facet_strings, - documents, - } = readers; - main_readers.push(main); - word_docids_readers.push(word_docids); - docid_word_positions_readers.push(docid_word_positions); - words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); - word_level_position_docids_readers.push(word_level_position_docids); - field_id_word_count_docids_readers.push(field_id_word_count_docids); - facet_field_numbers_docids_readers.push(facet_field_numbers_docids); - facet_field_strings_docids_readers.push(facet_field_strings_docids); - field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers); - field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings); - documents_readers.push(documents); - }); - - // This is the function that merge the readers - // by using the given merge function. - let merge_readers = move |readers, merge| { - let mut writer = tempfile::tempfile().and_then(|f| { - create_writer(chunk_compression_type, chunk_compression_level, f) - })?; - let merger = merge_readers(readers, merge); - merger.write_into(&mut writer)?; - writer_into_reader(writer, chunk_fusing_shrink_size) - }; - - // The enum and the channel which is used to transfert - // the readers merges potentially done on another thread. 
- let (sender, receiver) = sync_channel(2); - - debug!("Merging the main, word docids and words pairs proximity docids in parallel..."); - rayon::spawn(move || { - vec![ - (DatabaseType::Main, main_readers, fst_merge as MergeFn<_>), - (DatabaseType::WordDocids, word_docids_readers, roaring_bitmap_merge), - ( - DatabaseType::FacetLevel0NumbersDocids, - facet_field_numbers_docids_readers, - cbo_roaring_bitmap_merge, - ), - ( - DatabaseType::WordLevel0PositionDocids, - word_level_position_docids_readers, - cbo_roaring_bitmap_merge, - ), - ( - DatabaseType::FieldIdWordCountDocids, - field_id_word_count_docids_readers, - cbo_roaring_bitmap_merge, - ), - ] - .into_par_iter() - .for_each(|(dbtype, readers, merge)| { - let result = merge_readers(readers, merge); - if let Err(e) = sender.send((dbtype, result)) { - error!("sender error: {}", e); - } - }); - }); - - Ok(( - receiver, - docid_word_positions_readers, - documents_readers, - words_pairs_proximities_docids_readers, - facet_field_strings_docids_readers, - field_id_docid_facet_numbers_readers, - field_id_docid_facet_strings_readers, - )) as Result<_> - })?; - - let ( - receiver, - docid_word_positions_readers, - documents_readers, - words_pairs_proximities_docids_readers, - facet_field_strings_docids_readers, - field_id_docid_facet_numbers_readers, - field_id_docid_facet_strings_readers, - ) = readers; - - let mut documents_ids = self.index.documents_ids(self.wtxn)?; - let contains_documents = !documents_ids.is_empty(); - let write_method = - if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append }; - - debug!("Writing using the write method: {:?}", write_method); - // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; @@ -609,180 +298,24 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the external documents ids into the main database. self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - // We merge the new documents ids with the existing ones. 
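Both the removed lines just below (documents_ids |= new_documents_ids) and the new consumer loop above (final_documents_ids |= docids) rely on the same primitive: RoaringBitmap implements |= as an in-place set union over document ids. A tiny sketch, assuming the roaring crate:

use roaring::RoaringBitmap;

fn main() {
    // Ids already present in the index.
    let mut documents_ids: RoaringBitmap = (0u32..5).collect();
    // Ids produced by the current document addition.
    let new_documents_ids: RoaringBitmap = (3u32..8).collect();

    // `|=` is an in-place union; duplicates collapse for free.
    documents_ids |= new_documents_ids;

    assert_eq!(documents_ids.len(), 8); // ids 0 through 7
    assert!(documents_ids.contains(6));
}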
- documents_ids |= new_documents_ids; - documents_ids |= replaced_documents_ids; - self.index.put_documents_ids(self.wtxn, &documents_ids)?; + let all_documents_ids = index_documents_ids | new_documents_ids; + self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; - let mut database_count = 0; - let total_databases = 11; - - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: 0, - total_databases, - }); - - debug!("Inserting the docid word positions into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.docid_word_positions.as_polymorph(), - docid_word_positions_readers, - keep_first, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Inserting the documents into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.documents.as_polymorph(), - documents_readers, - keep_first, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Writing the facet id string docids into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.facet_id_string_docids.as_polymorph(), - facet_field_strings_docids_readers, - tuple_string_cbo_roaring_bitmap_merge, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Writing the field id docid facet numbers into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.field_id_docid_facet_f64s.as_polymorph(), - field_id_docid_facet_numbers_readers, - keep_first, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Writing the field id docid facet strings into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.field_id_docid_facet_strings.as_polymorph(), - field_id_docid_facet_strings_readers, - keep_first, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - debug!("Writing the words pairs proximities docids into LMDB on disk..."); - merge_into_lmdb_database( - self.wtxn, - *self.index.word_pair_proximity_docids.as_polymorph(), - words_pairs_proximities_docids_readers, - cbo_roaring_bitmap_merge, - write_method, - )?; - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - - for (db_type, result) in receiver { - let content = result?; - match db_type { - DatabaseType::Main => { - debug!("Writing the main elements into LMDB on disk..."); - write_into_lmdb_database( - self.wtxn, - self.index.main, - content, - fst_merge, - WriteMethod::GetMergePut, - )?; - } - DatabaseType::WordDocids => { - debug!("Writing the words docids into LMDB on disk..."); - let db = *self.index.word_docids.as_polymorph(); - write_into_lmdb_database( - self.wtxn, - db, - content, - roaring_bitmap_merge, - write_method, - )?; - } - DatabaseType::FacetLevel0NumbersDocids => { - debug!("Writing the facet numbers docids into LMDB on disk..."); - let db = *self.index.facet_id_f64_docids.as_polymorph(); - 
write_into_lmdb_database( - self.wtxn, - db, - content, - cbo_roaring_bitmap_merge, - write_method, - )?; - } - DatabaseType::FieldIdWordCountDocids => { - debug!("Writing the field id word count docids into LMDB on disk..."); - let db = *self.index.field_id_word_count_docids.as_polymorph(); - write_into_lmdb_database( - self.wtxn, - db, - content, - cbo_roaring_bitmap_merge, - write_method, - )?; - } - DatabaseType::WordLevel0PositionDocids => { - debug!("Writing the word level 0 positions docids into LMDB on disk..."); - let db = *self.index.word_level_position_docids.as_polymorph(); - write_into_lmdb_database( - self.wtxn, - db, - content, - cbo_roaring_bitmap_merge, - write_method, - )?; - } - } - - database_count += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: database_count, - total_databases, - }); - } + self.execute_prefix_databases(progress_callback) + } + pub fn execute_prefix_databases( + self, + // output: TransformOutput, + progress_callback: F, + ) -> Result<()> + where + F: Fn(UpdateIndexingStep) + Sync, + { // Run the facets update operation. let mut builder = Facets::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; if let Some(value) = self.facet_level_group_size { builder.level_group_size(value); } @@ -805,7 +338,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut builder = WordPrefixDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; builder.execute()?; @@ -814,7 +346,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; builder.execute()?; @@ -823,7 +354,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut builder = WordsLevelPositions::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; if let Some(value) = self.words_positions_level_group_size { builder.level_group_size(value); } @@ -832,10 +362,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; - debug_assert_eq!(database_count, total_databases); - - info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); - Ok(()) } } diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index a2aa26e19..e69de29bb 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -1,985 +0,0 @@ -use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap, HashSet}; -use std::convert::{TryFrom, TryInto}; -use std::fs::File; -use std::iter::FromIterator; -use std::time::Instant; -use std::{cmp, iter}; - -use bstr::ByteSlice as _; -use concat_arrays::concat_arrays; -use 
fst::Set; -use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer}; -use heed::BytesEncode; -use linked_hash_map::LinkedHashMap; -use log::{debug, info, warn}; -use meilisearch_tokenizer::token::SeparatorKind; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; -use ordered_float::OrderedFloat; -use roaring::RoaringBitmap; -use serde_json::Value; -use tempfile::tempfile; - -use super::merge_function::{ - cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, - tuple_string_cbo_roaring_bitmap_merge, -}; -use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; -use crate::error::{Error, InternalError, SerializationError}; -use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, -}; -use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; -use crate::update::UpdateIndexingStep; -use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32}; - -const LMDB_MAX_KEY_LENGTH: usize = 511; -const ONE_KILOBYTE: usize = 1024 * 1024; - -const MAX_POSITION: usize = 1000; -const WORDS_FST_KEY: &[u8] = crate::index::main_key::WORDS_FST_KEY.as_bytes(); - -pub struct Readers { - pub main: Reader, - pub word_docids: Reader, - pub docid_word_positions: Reader, - pub words_pairs_proximities_docids: Reader, - pub word_level_position_docids: Reader, - pub field_id_word_count_docids: Reader, - pub facet_field_numbers_docids: Reader, - pub facet_field_strings_docids: Reader, - pub field_id_docid_facet_numbers: Reader, - pub field_id_docid_facet_strings: Reader, - pub documents: Reader, -} - -pub struct Store<'s, A> { - // Indexing parameters - searchable_fields: HashSet, - filterable_fields: HashSet, - // Caches - word_docids: LinkedHashMap, RoaringBitmap>, - word_docids_limit: usize, - field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, - words_pairs_proximities_docids: - LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, - words_pairs_proximities_docids_limit: usize, - facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, - facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>, - facet_field_value_docids_limit: usize, - // MTBL parameters - chunk_compression_type: CompressionType, - chunk_compression_level: Option, - chunk_fusing_shrink_size: Option, - // MTBL sorters - main_sorter: Sorter>, - word_docids_sorter: Sorter>, - words_pairs_proximities_docids_sorter: Sorter>, - word_level_position_docids_sorter: Sorter>, - field_id_word_count_docids_sorter: Sorter>, - facet_field_numbers_docids_sorter: Sorter>, - facet_field_strings_docids_sorter: Sorter>, - field_id_docid_facet_numbers_sorter: Sorter>, - field_id_docid_facet_strings_sorter: Sorter>, - // MTBL writers - docid_word_positions_writer: Writer, - documents_writer: Writer, - // tokenizer - analyzer: Analyzer<'s, A>, -} - -impl<'s, A: AsRef<[u8]>> Store<'s, A> { - pub fn new( - searchable_fields: HashSet, - filterable_fields: HashSet, - linked_hash_map_size: Option, - max_nb_chunks: Option, - max_memory: Option, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, - chunk_fusing_shrink_size: Option, - stop_words: Option<&'s Set>, - ) -> Result { - // We divide the max memory by the number of sorter the Store have. 
- let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); - let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); - - let main_sorter = create_sorter( - fst_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let word_docids_sorter = create_sorter( - roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let words_pairs_proximities_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let word_level_position_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let field_id_word_count_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let facet_field_numbers_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let facet_field_strings_docids_sorter = create_sorter( - tuple_string_cbo_roaring_bitmap_merge, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - max_memory, - ); - let field_id_docid_facet_numbers_sorter = create_sorter( - keep_first, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - Some(1024 * 1024 * 1024), // 1MB - ); - let field_id_docid_facet_strings_sorter = create_sorter( - keep_first, - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - max_nb_chunks, - Some(1024 * 1024 * 1024), // 1MB - ); - - let documents_writer = tempfile() - .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; - let docid_word_positions_writer = tempfile() - .and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?; - - let mut config = AnalyzerConfig::default(); - if let Some(stop_words) = stop_words { - config.stop_words(stop_words); - } - let analyzer = Analyzer::new(config); - - Ok(Store { - // Indexing parameters. 
- searchable_fields, - filterable_fields, - // Caches - word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), - field_id_word_count_docids: HashMap::new(), - word_docids_limit: linked_hash_map_size, - words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), - words_pairs_proximities_docids_limit: linked_hash_map_size, - facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size), - facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size), - facet_field_value_docids_limit: linked_hash_map_size, - // MTBL parameters - chunk_compression_type, - chunk_compression_level, - chunk_fusing_shrink_size, - // MTBL sorters - main_sorter, - word_docids_sorter, - words_pairs_proximities_docids_sorter, - word_level_position_docids_sorter, - field_id_word_count_docids_sorter, - facet_field_numbers_docids_sorter, - facet_field_strings_docids_sorter, - field_id_docid_facet_numbers_sorter, - field_id_docid_facet_strings_sorter, - // MTBL writers - docid_word_positions_writer, - documents_writer, - // tokenizer - analyzer, - }) - } - - // Save the documents ids under the position and word we have seen it. - fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> { - // if get_refresh finds the element it is assured to be at the end of the linked hash map. - match self.word_docids.get_refresh(word.as_bytes()) { - Some(old) => { - old.insert(id); - } - None => { - let word_vec = SmallVec32::from(word.as_bytes()); - // A newly inserted element is append at the end of the linked hash map. - self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id))); - // If the word docids just reached it's capacity we must make sure to remove - // one element, this way next time we insert we doesn't grow the capacity. - if self.word_docids.len() == self.word_docids_limit { - // Removing the front element is equivalent to removing the LRU element. - let lru = self.word_docids.pop_front(); - Self::write_word_docids(&mut self.word_docids_sorter, lru)?; - } - } - } - Ok(()) - } - - fn insert_facet_number_values_docid( - &mut self, - field_id: FieldId, - value: OrderedFloat, - id: DocumentId, - ) -> Result<()> { - let sorter = &mut self.field_id_docid_facet_numbers_sorter; - Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; - - let key = (field_id, value); - // if get_refresh finds the element it is assured to be at the end of the linked hash map. - match self.facet_field_number_docids.get_refresh(&key) { - Some(old) => { - old.insert(id); - } - None => { - // A newly inserted element is append at the end of the linked hash map. - self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); - // If the word docids just reached it's capacity we must make sure to remove - // one element, this way next time we insert we doesn't grow the capacity. - if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit { - // Removing the front element is equivalent to removing the LRU element. - Self::write_facet_field_number_docids( - &mut self.facet_field_numbers_docids_sorter, - self.facet_field_number_docids.pop_front(), - )?; - } - } - } - - Ok(()) - } - - // Save the documents ids under the facet field id and value we have seen it. 
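The in-memory caches of this Store (word_docids, facet_field_number_docids, and so on) are bounded LRUs: linked-hash-map's get_refresh moves a hit to the back of the map, so pop_front always evicts the least recently used entry, which the code above then spills into the matching grenad sorter. A reduced sketch of that eviction discipline, flushing into a Vec instead of a sorter (assumes the linked-hash-map crate):

use linked_hash_map::LinkedHashMap;

const LIMIT: usize = 3;

fn insert_word_docid(
    cache: &mut LinkedHashMap<String, Vec<u32>>,
    flushed: &mut Vec<(String, Vec<u32>)>,
    word: &str,
    id: u32,
) {
    // A hit is moved to the back of the map, keeping the front LRU.
    match cache.get_refresh(word) {
        Some(ids) => ids.push(id),
        None => {
            cache.insert(word.to_string(), vec![id]);
            // Once the capacity is reached, evict the least recently
            // used entry; the real code writes it into a sorter.
            if cache.len() == LIMIT {
                if let Some(lru) = cache.pop_front() {
                    flushed.push(lru);
                }
            }
        }
    }
}

fn main() {
    let mut cache = LinkedHashMap::new();
    let mut flushed = Vec::new();
    for (word, id) in [("the", 0u32), ("quick", 0), ("the", 1), ("fox", 1)] {
        insert_word_docid(&mut cache, &mut flushed, word, id);
    }
    // "the" was refreshed, so "quick" is the first eviction candidate.
    assert_eq!(flushed[0].0, "quick");
}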
- fn insert_facet_string_values_docid( - &mut self, - field_id: FieldId, - normalized_value: String, - original_value: String, - id: DocumentId, - ) -> Result<()> { - if normalized_value.is_empty() { - return Ok(()); - } - - let sorter = &mut self.field_id_docid_facet_strings_sorter; - Self::write_field_id_docid_facet_string_value( - sorter, - field_id, - id, - &normalized_value, - &original_value, - )?; - - let key = (field_id, normalized_value); - // if get_refresh finds the element it is assured to be at the end of the linked hash map. - match self.facet_field_string_docids.get_refresh(&key) { - Some((_original_value, old)) => { - old.insert(id); - } - None => { - // A newly inserted element is append at the end of the linked hash map. - self.facet_field_string_docids - .insert(key, (original_value, RoaringBitmap::from_iter(Some(id)))); - // If the word docids just reached it's capacity we must make sure to remove - // one element, this way next time we insert we doesn't grow the capacity. - if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { - // Removing the front element is equivalent to removing the LRU element. - Self::write_facet_field_string_docids( - &mut self.facet_field_strings_docids_sorter, - self.facet_field_string_docids.pop_front(), - )?; - } - } - } - - Ok(()) - } - - // Save the documents ids under the words pairs proximities that it contains. - fn insert_words_pairs_proximities_docids<'a>( - &mut self, - words_pairs_proximities: impl IntoIterator, - id: DocumentId, - ) -> Result<()> { - for ((w1, w2), prox) in words_pairs_proximities { - let w1 = SmallVec32::from(w1.as_bytes()); - let w2 = SmallVec32::from(w2.as_bytes()); - let key = (w1, w2, prox); - // if get_refresh finds the element it is assured - // to be at the end of the linked hash map. - match self.words_pairs_proximities_docids.get_refresh(&key) { - Some(old) => { - old.insert(id); - } - None => { - // A newly inserted element is append at the end of the linked hash map. - let ids = RoaringBitmap::from_iter(Some(id)); - self.words_pairs_proximities_docids.insert(key, ids); - } - } - } - - // If the linked hashmap is over capacity we must remove the overflowing elements. - let len = self.words_pairs_proximities_docids.len(); - let overflow = len.checked_sub(self.words_pairs_proximities_docids_limit); - if let Some(overflow) = overflow { - let mut lrus = Vec::with_capacity(overflow); - // Removing front elements is equivalent to removing the LRUs. - let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front()); - iter.take(overflow).for_each(|x| lrus.push(x)); - Self::write_words_pairs_proximities( - &mut self.words_pairs_proximities_docids_sorter, - lrus, - )?; - } - - Ok(()) - } - - fn write_document( - &mut self, - document_id: DocumentId, - words_positions: &mut HashMap>, - facet_numbers_values: &mut HashMap>, - facet_strings_values: &mut HashMap>, - record: &[u8], - ) -> Result<()> { - // We compute the list of words pairs proximities (self-join) and write it directly to disk. - let words_pair_proximities = compute_words_pair_proximities(&words_positions); - self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; - - // We store document_id associated with all the words the record contains. 
- for (word, _) in words_positions.iter() { - self.insert_word_docid(word, document_id)?; - } - - self.documents_writer.insert(document_id.to_be_bytes(), record)?; - Self::write_docid_word_positions( - &mut self.docid_word_positions_writer, - document_id, - words_positions, - )?; - Self::write_word_position_docids( - &mut self.word_level_position_docids_sorter, - document_id, - words_positions, - )?; - - words_positions.clear(); - - // We store document_id associated with all the facet numbers fields ids and values. - for (field, values) in facet_numbers_values.drain() { - for value in values { - let value = OrderedFloat::from(value); - self.insert_facet_number_values_docid(field, value, document_id)?; - } - } - - // We store document_id associated with all the facet strings fields ids and values. - for (field, values) in facet_strings_values.drain() { - for (normalized, original) in values { - self.insert_facet_string_values_docid(field, normalized, original, document_id)?; - } - } - - Ok(()) - } - - fn write_words_pairs_proximities( - sorter: &mut Sorter>, - iter: impl IntoIterator, SmallVec32, u8), RoaringBitmap)>, - ) -> Result<()> - where - Error: From, - { - let mut key = Vec::new(); - let mut buffer = Vec::new(); - - for ((w1, w2, min_prox), docids) in iter { - key.clear(); - key.extend_from_slice(w1.as_bytes()); - key.push(0); - key.extend_from_slice(w2.as_bytes()); - // Storing the minimun proximity found between those words - key.push(min_prox); - // We serialize the document ids into a buffer - buffer.clear(); - buffer.reserve(CboRoaringBitmapCodec::serialized_size(&docids)); - CboRoaringBitmapCodec::serialize_into(&docids, &mut buffer); - // that we write under the generated key into MTBL - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &buffer)?; - } else { - warn!( - "words pairs proximity ({:?} - {:?}, {:?}) is too large to be saved", - w1, w2, min_prox - ); - } - } - - Ok(()) - } - - fn write_docid_word_positions( - writer: &mut Writer, - id: DocumentId, - words_positions: &HashMap>, - ) -> Result<()> { - // We prefix the words by the document id. - let mut key = id.to_be_bytes().to_vec(); - let mut buffer = Vec::new(); - let base_size = key.len(); - - // We order the words lexicographically, this way we avoid passing by a sorter. - let words_positions = BTreeMap::from_iter(words_positions); - - for (word, positions) in words_positions { - key.truncate(base_size); - key.extend_from_slice(word.as_bytes()); - buffer.clear(); - - // We serialize the positions into a buffer. 
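The writers above assemble their LMDB keys by hand; for word pairs the layout is the two words separated by a NUL byte and terminated by the minimal proximity, and any key longer than LMDB's 511-byte limit is skipped with a warning rather than inserted. A small sketch of that layout and guard:

// Composite key layout used for word pair proximities:
// `word1 \0 word2 prox`, validated against LMDB's key size limit.
const LMDB_MAX_KEY_LENGTH: usize = 511;

fn word_pair_proximity_key(w1: &str, w2: &str, prox: u8) -> Option<Vec<u8>> {
    let mut key = Vec::with_capacity(w1.len() + w2.len() + 2);
    key.extend_from_slice(w1.as_bytes());
    key.push(0);
    key.extend_from_slice(w2.as_bytes());
    key.push(prox);
    // Keys that LMDB would reject are skipped instead of inserted.
    (!key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH).then(|| key)
}

fn main() {
    let key = word_pair_proximity_key("hello", "world", 1).unwrap();
    assert_eq!(key.len(), "hello".len() + "world".len() + 2);
    assert!(word_pair_proximity_key(&"a".repeat(600), "b", 1).is_none());
}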
- let positions = RoaringBitmap::from_iter(positions.iter().cloned()); - BoRoaringBitmapCodec::serialize_into(&positions, &mut buffer); - - // that we write under the generated key into MTBL - if lmdb_key_valid_size(&key) { - writer.insert(&key, &buffer)?; - } else { - warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); - } - } - - Ok(()) - } - - fn write_word_position_docids( - writer: &mut Sorter>, - document_id: DocumentId, - words_positions: &HashMap>, - ) -> Result<()> - where - Error: From, - { - let mut key_buffer = Vec::new(); - let mut data_buffer = Vec::new(); - - for (word, positions) in words_positions { - key_buffer.clear(); - key_buffer.extend_from_slice(word.as_bytes()); - key_buffer.push(0); // level 0 - - for position in positions { - key_buffer.truncate(word.len() + 1); - let position_bytes = position.to_be_bytes(); - key_buffer.extend_from_slice(position_bytes.as_bytes()); - key_buffer.extend_from_slice(position_bytes.as_bytes()); - - data_buffer.clear(); - let positions = RoaringBitmap::from_iter(Some(document_id)); - // We serialize the positions into a buffer. - CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer); - - // that we write under the generated key into MTBL - if lmdb_key_valid_size(&key_buffer) { - writer.insert(&key_buffer, &data_buffer)?; - } else { - warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); - } - } - } - - Ok(()) - } - - fn write_facet_field_string_docids(sorter: &mut Sorter>, iter: I) -> Result<()> - where - I: IntoIterator, - Error: From, - { - let mut key_buffer = Vec::new(); - - for ((field_id, normalized_value), (original_value, docids)) in iter { - key_buffer.clear(); - - FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); - - let data = (original_value.as_str(), docids); - let data = FacetStringLevelZeroValueCodec::::bytes_encode(&data) - .ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?; - - if lmdb_key_valid_size(&key_buffer) { - sorter.insert(&key_buffer, &data)?; - } else { - warn!( - "facet value {:?} is too large to be saved", - original_value.as_bytes().as_bstr() - ); - } - } - - Ok(()) - } - - fn write_facet_field_number_docids(sorter: &mut Sorter>, iter: I) -> Result<()> - where - I: IntoIterator), RoaringBitmap)>, - Error: From, - { - let mut data_buffer = Vec::new(); - - for ((field_id, value), docids) in iter { - data_buffer.clear(); - - let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) - .map(Cow::into_owned) - .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; - - CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); - - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &data_buffer)?; - } - } - - Ok(()) - } - - fn write_field_id_docid_facet_number_value( - sorter: &mut Sorter>, - field_id: FieldId, - document_id: DocumentId, - value: OrderedFloat, - ) -> Result<()> - where - Error: From, - { - let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) - .map(Cow::into_owned) - .ok_or(SerializationError::Encoding { db_name: Some("facet level value") })?; - - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &[])?; - } - - Ok(()) - } - - fn write_field_id_docid_facet_string_value( - sorter: &mut Sorter>, - field_id: FieldId, - document_id: DocumentId, - normalized_value: &str, - original_value: &str, - ) -> Result<()> - where - Error: From, - { - let mut buffer = Vec::new(); - 
FieldDocIdFacetStringCodec::serialize_into( - field_id, - document_id, - normalized_value, - &mut buffer, - ); - - if lmdb_key_valid_size(&buffer) { - sorter.insert(&buffer, original_value.as_bytes())?; - } else { - warn!("facet value {:?} is too large to be saved", original_value.as_bytes().as_bstr()); - } - - Ok(()) - } - - fn write_word_docids(sorter: &mut Sorter>, iter: I) -> Result<()> - where - I: IntoIterator, RoaringBitmap)>, - Error: From, - { - let mut key = Vec::new(); - let mut buffer = Vec::new(); - - for (word, ids) in iter { - key.clear(); - key.extend_from_slice(&word); - // We serialize the document ids into a buffer - buffer.clear(); - let ids = RoaringBitmap::from_iter(ids); - buffer.reserve(ids.serialized_size()); - ids.serialize_into(&mut buffer)?; - // that we write under the generated key into MTBL - if lmdb_key_valid_size(&key) { - sorter.insert(&key, &buffer)?; - } else { - warn!("word {:?} is too large to be saved", word.as_bytes().as_bstr()); - } - } - - Ok(()) - } - - pub fn index( - mut self, - mut documents: grenad::Reader<&[u8]>, - documents_count: usize, - thread_index: usize, - num_threads: usize, - log_every_n: Option, - mut progress_callback: F, - ) -> Result - where - F: FnMut(UpdateIndexingStep), - { - debug!("{:?}: Indexing in a Store...", thread_index); - - let mut before = Instant::now(); - let mut words_positions = HashMap::new(); - let mut facet_numbers_values = HashMap::new(); - let mut facet_strings_values = HashMap::new(); - - let mut count: usize = 0; - while let Some((key, value)) = documents.next()? { - let document_id = key.try_into().map(u32::from_be_bytes).unwrap(); - let document = obkv::KvReader::new(value); - - // We skip documents that must not be indexed by this thread. - if count % num_threads == thread_index { - // This is a log routine that we do every `log_every_n` documents. 
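The indexing loop above shards documents across threads without any coordination: every thread scans the full document stream but only keeps the documents whose ordinal satisfies count % num_threads == thread_index. A minimal sketch of that round-robin partitioning:

// Round-robin sharding: thread `thread_index` out of `num_threads`
// processes only every `num_threads`-th document of the stream.
fn shard<'a>(docs: &'a [&'a str], thread_index: usize, num_threads: usize) -> Vec<&'a str> {
    docs.iter()
        .enumerate()
        .filter(|(count, _)| count % num_threads == thread_index)
        .map(|(_, doc)| *doc)
        .collect()
}

fn main() {
    let docs = ["a", "b", "c", "d", "e"];
    assert_eq!(shard(&docs, 0, 2), ["a", "c", "e"]);
    assert_eq!(shard(&docs, 1, 2), ["b", "d"]);
}

The shards are disjoint and their union covers the whole stream, so the per-thread Stores never contend on a document.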
- if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { - info!( - "We have seen {} documents so far ({:.02?}).", - format_count(count), - before.elapsed() - ); - progress_callback(UpdateIndexingStep::IndexDocuments { - documents_seen: count, - total_documents: documents_count, - }); - before = Instant::now(); - } - - for (attr, content) in document.iter() { - if self.filterable_fields.contains(&attr) - || self.searchable_fields.contains(&attr) - { - let value = - serde_json::from_slice(content).map_err(InternalError::SerdeJson)?; - - if self.filterable_fields.contains(&attr) { - let (facet_numbers, facet_strings) = extract_facet_values(&value); - facet_numbers_values - .entry(attr) - .or_insert_with(Vec::new) - .extend(facet_numbers); - facet_strings_values - .entry(attr) - .or_insert_with(Vec::new) - .extend(facet_strings); - } - - if self.searchable_fields.contains(&attr) { - let content = match json_to_string(&value) { - Some(content) => content, - None => continue, - }; - - let analyzed = self.analyzer.analyze(&content); - let tokens = process_tokens(analyzed.tokens()); - - let mut last_pos = None; - for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { - last_pos = Some(pos); - let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions - .entry(token.text().to_string()) - .or_insert_with(SmallVec32::new) - .push(position); - } - - if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { - let key = (attr, last_pos as u8 + 1); - self.field_id_word_count_docids - .entry(key) - .or_insert_with(RoaringBitmap::new) - .insert(document_id); - } - } - } - } - - // We write the document in the documents store. - self.write_document( - document_id, - &mut words_positions, - &mut facet_numbers_values, - &mut facet_strings_values, - value, - )?; - } - - // Compute the document id of the next document. - count += 1; - } - - progress_callback(UpdateIndexingStep::IndexDocuments { - documents_seen: count, - total_documents: documents_count, - }); - - let readers = self.finish()?; - debug!("{:?}: Store created!", thread_index); - Ok(readers) - } - - fn finish(mut self) -> Result { - let comp_type = self.chunk_compression_type; - let comp_level = self.chunk_compression_level; - let shrink_size = self.chunk_fusing_shrink_size; - - Self::write_word_docids(&mut self.word_docids_sorter, self.word_docids)?; - Self::write_words_pairs_proximities( - &mut self.words_pairs_proximities_docids_sorter, - self.words_pairs_proximities_docids, - )?; - Self::write_facet_field_number_docids( - &mut self.facet_field_numbers_docids_sorter, - self.facet_field_number_docids, - )?; - - Self::write_facet_field_string_docids( - &mut self.facet_field_strings_docids_sorter, - self.facet_field_string_docids, - )?; - - let mut word_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - let mut builder = fst::SetBuilder::memory(); - - let mut iter = self.word_docids_sorter.into_iter()?; - while let Some((word, val)) = iter.next()? { - // This is a lexicographically ordered word position - // we use the key to construct the words fst. 
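Because the word docids sorter yields its keys in lexicographic order, the removed code here (and the new typed_chunk.rs later in this patch) can stream those keys directly into an fst::SetBuilder, then merge the result with the set already stored in the index through a streamed union. A sketch of both steps, assuming the fst crate:

use fst::{Set, SetBuilder, Streamer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Keys must be inserted in lexicographic order.
    let mut builder = SetBuilder::memory();
    for word in ["apple", "banana", "cherry"] {
        builder.insert(word)?;
    }
    let new_words = builder.into_set();

    // The set already present in the database.
    let db_words = Set::from_iter(["banana", "durian"])?;

    // Merge the two sets through a streamed union.
    let union = new_words.op().add(db_words.stream()).union();
    let mut builder = SetBuilder::memory();
    builder.extend_stream(union)?;
    let merged = builder.into_set();

    let mut stream = merged.stream();
    while let Some(word) = stream.next() {
        println!("{}", String::from_utf8_lossy(word));
    }
    Ok(())
}

Insertion order matters: SetBuilder rejects out-of-order keys, which is why the sorter's lexicographic iteration is reused as-is.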
- builder.insert(word)?; - word_docids_wtr.insert(word, val)?; - } - - let mut docids_buffer = Vec::new(); - for ((fid, count), docids) in self.field_id_word_count_docids { - docids_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer); - let key: [u8; 3] = concat_arrays!(fid.to_be_bytes(), [count]); - self.field_id_word_count_docids_sorter.insert(key, &docids_buffer)?; - } - - let fst = builder.into_set(); - self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?; - - let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.main_sorter.write_into(&mut main_wtr)?; - - let mut words_pairs_proximities_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.words_pairs_proximities_docids_sorter - .write_into(&mut words_pairs_proximities_docids_wtr)?; - - let mut word_level_position_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; - - let mut field_id_word_count_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; - - let mut facet_field_numbers_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; - - let mut facet_field_strings_docids_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; - - let mut field_id_docid_facet_numbers_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_numbers_sorter - .write_into(&mut field_id_docid_facet_numbers_wtr)?; - - let mut field_id_docid_facet_strings_wtr = - tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_strings_sorter - .write_into(&mut field_id_docid_facet_strings_wtr)?; - - let main = writer_into_reader(main_wtr, shrink_size)?; - let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; - let words_pairs_proximities_docids = - writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; - let word_level_position_docids = - writer_into_reader(word_level_position_docids_wtr, shrink_size)?; - let field_id_word_count_docids = - writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; - let facet_field_numbers_docids = - writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; - let facet_field_strings_docids = - writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; - let field_id_docid_facet_numbers = - writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; - let field_id_docid_facet_strings = - writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; - let docid_word_positions = - writer_into_reader(self.docid_word_positions_writer, shrink_size)?; - let documents = writer_into_reader(self.documents_writer, shrink_size)?; - - Ok(Readers { - main, - word_docids, - docid_word_positions, - words_pairs_proximities_docids, - word_level_position_docids, - field_id_word_count_docids, - facet_field_numbers_docids, - facet_field_strings_docids, - field_id_docid_facet_numbers, - field_id_docid_facet_strings, - documents, - }) - } -} - -/// Outputs a list of all pairs of words with the shortest proximity between 1 and 7 
inclusive. -/// -/// This list is used by the engine to calculate the documents containing words that are -/// close to each other. -fn compute_words_pair_proximities( - word_positions: &HashMap>, -) -> HashMap<(&str, &str), u8> { - use itertools::Itertools; - - let mut words_pair_proximities = HashMap::new(); - for ((w1, ps1), (w2, ps2)) in word_positions.iter().cartesian_product(word_positions) { - let mut min_prox = None; - for (ps1, ps2) in ps1.iter().cartesian_product(ps2) { - let prox = crate::proximity::positions_proximity(*ps1, *ps2); - let prox = u8::try_from(prox).unwrap(); - // We don't care about a word that appear at the - // same position or too far from the other. - if prox >= 1 && prox <= 7 && min_prox.map_or(true, |mp| prox < mp) { - min_prox = Some(prox) - } - } - - if let Some(min_prox) = min_prox { - words_pair_proximities.insert((w1.as_str(), w2.as_str()), min_prox); - } - } - - words_pair_proximities -} - -fn format_count(n: usize) -> String { - human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64) -} - -fn lmdb_key_valid_size(key: &[u8]) -> bool { - !key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH -} - -/// take an iterator on tokens and compute their relative position depending on separator kinds -/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, -/// else we keep the standart proximity of 1 between words. -fn process_tokens<'a>( - tokens: impl Iterator>, -) -> impl Iterator)> { - tokens - .skip_while(|token| token.is_separator().is_some()) - .scan((0, None), |(offset, prev_kind), token| { - match token.kind { - TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { - *offset += match *prev_kind { - Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, - Some(_) => 1, - None => 0, - }; - *prev_kind = Some(token.kind) - } - TokenKind::Separator(SeparatorKind::Hard) => { - *prev_kind = Some(token.kind); - } - TokenKind::Separator(SeparatorKind::Soft) - if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => - { - *prev_kind = Some(token.kind); - } - _ => (), - } - Some((*offset, token)) - }) - .filter(|(_, t)| t.is_word()) -} - -fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { - fn inner_extract_facet_values( - value: &Value, - can_recurse: bool, - output_numbers: &mut Vec, - output_strings: &mut Vec<(String, String)>, - ) { - match value { - Value::Null => (), - Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), - Value::Number(number) => { - if let Some(float) = number.as_f64() { - output_numbers.push(float); - } - } - Value::String(original) => { - let normalized = original.trim().to_lowercase(); - output_strings.push((normalized, original.clone())); - } - Value::Array(values) => { - if can_recurse { - for value in values { - inner_extract_facet_values(value, false, output_numbers, output_strings); - } - } - } - Value::Object(_) => (), - } - } - - let mut facet_number_values = Vec::new(); - let mut facet_string_values = Vec::new(); - inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); - - (facet_number_values, facet_string_values) -} diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b273460d1..7bfaa6ecd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -11,15 +11,14 @@ use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use 
super::merge_function::merge_two_obkvs; -use super::{create_sorter, create_writer, IndexDocumentsMethod}; -use crate::error::{Error, InternalError, UserError}; -use crate::index::db_name; -use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs}; -use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{ - ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, MergeFn, Result, BEU32, +use super::helpers::{ + create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, }; +use super::IndexDocumentsMethod; +use crate::error::{InternalError, UserError}; +use crate::index::db_name; +use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32}; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -46,7 +45,6 @@ pub struct Transform<'t, 'i> { pub log_every_n: Option, pub chunk_compression_type: CompressionType, pub chunk_compression_level: Option, - pub chunk_fusing_shrink_size: Option, pub max_nb_chunks: Option, pub max_memory: Option, pub index_documents_method: IndexDocumentsMethod, @@ -149,7 +147,6 @@ impl Transform<'_, '_> { merge_function, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -169,7 +166,7 @@ impl Transform<'_, '_> { } obkv_buffer.clear(); - let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); // We prepare the fields ids map with the documents keys. for (key, _value) in &document { @@ -209,7 +206,6 @@ impl Transform<'_, '_> { .map_err(InternalError::SerdeJson)?; writer.insert(field_id, &json_buffer)?; } - // We validate the document id [a-zA-Z0-9\-_]. if field_id == primary_key_id && validate_document_id(&external_id).is_none() { return Err(UserError::InvalidDocumentId { @@ -291,7 +287,6 @@ impl Transform<'_, '_> { keep_latest_obkv, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -306,7 +301,7 @@ impl Transform<'_, '_> { let mut record = csv::StringRecord::new(); while csv.read_record(&mut record).map_err(UserError::Csv)? { obkv_buffer.clear(); - let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { @@ -372,9 +367,9 @@ impl Transform<'_, '_> { /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. - fn output_from_sorter( + fn output_from_sorter( self, - sorter: grenad::Sorter>, + sorter: grenad::Sorter, primary_key: String, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, @@ -383,7 +378,6 @@ impl Transform<'_, '_> { ) -> Result where F: Fn(UpdateIndexingStep) + Sync, - Error: From, { let documents_ids = self.index.documents_ids(self.rtxn)?; let mut field_distribution = self.index.field_distribution(self.rtxn)?; @@ -391,10 +385,15 @@ impl Transform<'_, '_> { // Once we have sort and deduplicated the documents we write them into a final file. 
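The final sorter created just below is given a merge function that should, in effect, never merge: after deduplication each key must carry exactly one obkv, so a lone value passes through unchanged and a true duplicate surfaces as an error. A standalone sketch of a merge function with that grenad-style shape (a &[u8] key and Cow<[u8]> values), using String as a placeholder error type:

use std::borrow::Cow;

// Called by the sorter only when several values share one key.
fn keep_unique(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>, String> {
    if values.len() == 1 {
        Ok(values[0].to_vec())
    } else {
        Err("several documents share the same key".to_string())
    }
}

fn main() {
    let single = [Cow::Borrowed(&b"doc"[..])];
    assert_eq!(keep_unique(b"id-1", &single).unwrap(), b"doc".to_vec());

    let duplicated = [Cow::Borrowed(&b"a"[..]), Cow::Borrowed(&b"b"[..])];
    assert!(keep_unique(b"id-1", &duplicated).is_err());
}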
let mut final_sorter = create_sorter( - |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "documents" }), + |_id, obkvs| { + if obkvs.len() == 1 { + Ok(obkvs[0].clone()) + } else { + Err(InternalError::IndexingMergingKeys { process: "documents" }.into()) + } + }, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -405,7 +404,7 @@ impl Transform<'_, '_> { // While we write into final file we get or generate the internal documents ids. let mut documents_count = 0; - let mut iter = sorter.into_iter()?; + let mut iter = sorter.into_merger_iter()?; while let Some((external_id, update_obkv)) = iter.next()? { if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { @@ -534,7 +533,7 @@ impl Transform<'_, '_> { let docid = docid.get(); obkv_buffer.clear(); - let mut obkv_writer = obkv::KvWriter::new(&mut obkv_buffer); + let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. for (id, name) in new_fields_ids_map.iter() { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs new file mode 100644 index 000000000..e7617bdab --- /dev/null +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -0,0 +1,272 @@ +use std::fs::File; + +use heed::types::ByteSlice; +use heed::{BytesDecode, RwTxn}; +use roaring::RoaringBitmap; + +use super::helpers::{ + roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, +}; +use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; +use crate::update::index_documents::helpers::into_clonable_grenad; +use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Index, Result}; + +pub(crate) enum TypedChunk { + DocidWordPositions(grenad::Reader), + FieldIdDocidFacetStrings(grenad::Reader), + FieldIdDocidFacetNumbers(grenad::Reader), + Documents(grenad::Reader), + FieldIdWordcountDocids(grenad::Reader), + NewDocumentsIds(RoaringBitmap), + WordDocids(grenad::Reader), + WordLevelPositionDocids(grenad::Reader), + WordPairProximityDocids(grenad::Reader), + FieldIdFacetStringDocids(grenad::Reader), + FieldIdFacetNumberDocids(grenad::Reader), +} + +/// Write typed chunk in the corresponding LMDB database of the provided index. +/// Return new documents seen. +pub(crate) fn write_typed_chunk_into_index( + typed_chunk: TypedChunk, + index: &Index, + wtxn: &mut RwTxn, + index_is_empty: bool, +) -> Result { + match typed_chunk { + TypedChunk::DocidWordPositions(docid_word_positions_iter) => { + write_entries_into_database( + docid_word_positions_iter, + &index.docid_word_positions, + wtxn, + index_is_empty, + |value, buffer| { + // ensure that values are unique and ordered + let positions = roaring_bitmap_from_u32s_array(value); + BoRoaringBitmapCodec::serialize_into(&positions, buffer); + Ok(buffer) + }, + |new_values, db_values, buffer| { + let new_values = roaring_bitmap_from_u32s_array(new_values); + let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) { + Some(db_values) => new_values | db_values, + None => new_values, // should not happen + }; + BoRoaringBitmapCodec::serialize_into(&positions, buffer); + Ok(()) + }, + )?; + } + TypedChunk::Documents(mut obkv_documents_iter) => { + while let Some((key, value)) = obkv_documents_iter.next()? 
{ + index.documents.remap_types::().put(wtxn, key, value)?; + } + } + TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { + append_entries_into_database( + fid_word_count_docids_iter, + &index.field_id_word_count_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + } + TypedChunk::NewDocumentsIds(documents_ids) => return Ok(documents_ids), + TypedChunk::WordDocids(word_docids_iter) => { + let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; + append_entries_into_database( + word_docids_iter.clone(), + &index.word_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_roaring_bitmaps, + )?; + + // create fst from word docids + let mut builder = fst::SetBuilder::memory(); + while let Some((word, _value)) = word_docids_iter.next()? { + // This is a lexicographically ordered word position + // we use the key to construct the words fst. + builder.insert(word)?; + } + let fst = builder.into_set().map_data(std::borrow::Cow::Owned).unwrap(); + let db_fst = index.words_fst(wtxn)?; + + // merge new fst with database fst + let union_stream = fst.op().add(db_fst.stream()).union(); + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(union_stream)?; + let fst = builder.into_set(); + index.put_words_fst(wtxn, &fst)?; + } + TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => { + append_entries_into_database( + word_level_position_docids_iter, + &index.word_level_position_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + } + TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { + append_entries_into_database( + facet_id_f64_docids_iter, + &index.facet_id_f64_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + } + TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { + append_entries_into_database( + word_pair_proximity_docids_iter, + &index.word_pair_proximity_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + } + TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => { + let index_fid_docid_facet_numbers = + index.field_id_docid_facet_f64s.remap_types::(); + while let Some((key, value)) = fid_docid_facet_number.next()? { + if valid_lmdb_key(key) { + index_fid_docid_facet_numbers.put(wtxn, key, &value)?; + } + } + } + TypedChunk::FieldIdDocidFacetStrings(mut fid_docid_facet_string) => { + let index_fid_docid_facet_strings = + index.field_id_docid_facet_strings.remap_types::(); + while let Some((key, value)) = fid_docid_facet_string.next()? { + if valid_lmdb_key(key) { + index_fid_docid_facet_strings.put(wtxn, key, &value)?; + } + } + } + TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { + append_entries_into_database( + facet_id_string_docids, + &index.facet_id_string_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + |new_values, db_values, buffer| { + let (_, new_values) = decode_prefix_string(new_values).unwrap(); + let new_values = RoaringBitmap::deserialize_from(new_values)?; + let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); + let db_values = RoaringBitmap::deserialize_from(db_values)?; + let values = new_values | db_values; + encode_prefix_string(db_original, buffer)?; + Ok(values.serialize_into(buffer)?) 
+ }, + )?; + } + } + + Ok(RoaringBitmap::new()) +} + +fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { + let new_value = RoaringBitmap::deserialize_from(new_value)?; + let db_value = RoaringBitmap::deserialize_from(db_value)?; + let value = new_value | db_value; + Ok(serialize_roaring_bitmap(&value, buffer)?) +} + +fn merge_cbo_roaring_bitmaps( + new_value: &[u8], + db_value: &[u8], + buffer: &mut Vec, +) -> Result<()> { + let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; + let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; + let value = new_value | db_value; + Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) +} + +/// Write the provided entries into the database using the serialize_value function. +/// The merge_values function is used if an entry already exists in the database. +fn write_entries_into_database( + mut data: grenad::Reader, + database: &heed::Database, + wtxn: &mut RwTxn, + index_is_empty: bool, + serialize_value: FS, + merge_values: FM, +) -> Result<()> +where + R: std::io::Read, + FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, + FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, +{ + let mut buffer = Vec::new(); + let database = database.remap_types::(); + + while let Some((key, value)) = data.next()? { + if valid_lmdb_key(key) { + buffer.clear(); + let value = if index_is_empty { + serialize_value(value, &mut buffer)? + } else { + match database.get(wtxn, key)? { + Some(prev_value) => { + merge_values(value, prev_value, &mut buffer)?; + &buffer[..] + } + None => serialize_value(value, &mut buffer)?, + } + }; + database.put(wtxn, key, value)?; + } + } + + Ok(()) +} + +/// Write the provided entries into the database using the serialize_value function. +/// The merge_values function is used if an entry already exists in the database. +/// All provided entries must be ordered. +/// If the index is not empty, write_entries_into_database is called instead. +fn append_entries_into_database( + mut data: grenad::Reader, + database: &heed::Database, + wtxn: &mut RwTxn, + index_is_empty: bool, + serialize_value: FS, + merge_values: FM, +) -> Result<()> +where + R: std::io::Read, + FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, + FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, +{ + if !index_is_empty { + return write_entries_into_database( + data, + database, + wtxn, + false, + serialize_value, + merge_values, + ); + } + + let mut buffer = Vec::new(); + let mut database = database.iter_mut(wtxn)?.remap_types::(); + + while let Some((key, value)) = data.next()? { + if valid_lmdb_key(key) { + buffer.clear(); + let value = serialize_value(value, &mut buffer)?; + unsafe { database.append(key, value)? 
}; + } + } + + Ok(()) +} diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1d0e15cff..ef23286ae 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -65,10 +65,8 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) log_every_n: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, - pub(crate) linked_hash_map_size: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, update_id: u64, @@ -95,10 +93,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { log_every_n: None, max_nb_chunks: None, max_memory: None, - linked_hash_map_size: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, thread_pool: None, searchable_fields: Setting::NotSet, displayed_fields: Setting::NotSet, @@ -205,7 +201,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { log_every_n: self.log_every_n, chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, - chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, max_nb_chunks: self.max_nb_chunks, max_memory: self.max_memory, index_documents_method: IndexDocumentsMethod::ReplaceDocuments, @@ -232,10 +227,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.log_every_n = self.log_every_n; indexing_builder.max_nb_chunks = self.max_nb_chunks; indexing_builder.max_memory = self.max_memory; - indexing_builder.linked_hash_map_size = self.linked_hash_map_size; indexing_builder.chunk_compression_type = self.chunk_compression_type; indexing_builder.chunk_compression_level = self.chunk_compression_level; - indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; indexing_builder.thread_pool = self.thread_pool; indexing_builder.execute_raw(output, &cb)?; diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 2816ebca0..6035499b3 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -8,10 +8,8 @@ pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, - pub(crate) linked_hash_map_size: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, pub(crate) update_id: u64, } @@ -22,10 +20,8 @@ impl<'a> UpdateBuilder<'a> { log_every_n: None, max_nb_chunks: None, max_memory: None, - linked_hash_map_size: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, thread_pool: None, update_id, } @@ -43,10 +39,6 @@ impl<'a> UpdateBuilder<'a> { self.max_memory = Some(max_memory); } - pub fn linked_hash_map_size(&mut self, linked_hash_map_size: usize) { - self.linked_hash_map_size = Some(linked_hash_map_size); - } - pub fn chunk_compression_type(&mut self, chunk_compression_type: CompressionType) { self.chunk_compression_type = chunk_compression_type; } @@ -55,10 +47,6 @@ impl<'a> UpdateBuilder<'a> { self.chunk_compression_level = Some(chunk_compression_level); } - pub fn chunk_fusing_shrink_size(&mut self, chunk_fusing_shrink_size: u64) { - self.chunk_fusing_shrink_size = Some(chunk_fusing_shrink_size); - } - pub fn thread_pool(&mut self, thread_pool: &'a ThreadPool) { self.thread_pool = 
Some(thread_pool); } @@ -89,10 +77,8 @@ impl<'a> UpdateBuilder<'a> { builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; - builder.linked_hash_map_size = self.linked_hash_map_size; builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder.thread_pool = self.thread_pool; builder @@ -108,10 +94,8 @@ impl<'a> UpdateBuilder<'a> { builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; - builder.linked_hash_map_size = self.linked_hash_map_size; builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder.thread_pool = self.thread_pool; builder @@ -126,7 +110,6 @@ impl<'a> UpdateBuilder<'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; - builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; builder } diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index ffc359719..b8a80938c 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -5,7 +5,7 @@ use grenad::CompressionType; use heed::types::ByteSlice; use crate::update::index_documents::{ - create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod, + create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, }; use crate::{Index, Result}; @@ -14,7 +14,6 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, } @@ -29,7 +28,6 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, max_nb_chunks: None, max_memory: None, } @@ -44,10 +42,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. 
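// For illustration, what WordPrefixDocids computes, reduced to plain data
// structures: the docids of a prefix are the union of the docids of every
// word starting with it (a hedged sketch, not the LMDB-backed implementation):
fn union_prefix_docids(words: &[(&str, Vec<u32>)], prefix: &str) -> Vec<u32> {
    let mut docids: Vec<u32> = words
        .iter()
        .filter(|(word, _)| word.starts_with(prefix))
        .flat_map(|(_, ids)| ids.iter().copied())
        .collect();
    docids.sort_unstable();
    docids.dedup(); // a set union, like the roaring bitmap `|` used by the merge functions
    docids
}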
let mut prefix_docids_sorter = create_sorter( - roaring_bitmap_merge, + merge_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -70,7 +67,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_docids.as_polymorph(), prefix_docids_sorter, - roaring_bitmap_merge, + merge_roaring_bitmaps, WriteMethod::Append, )?; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 9b876321e..8f04c23cf 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -9,7 +9,7 @@ use log::debug; use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ - cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, }; use crate::{Index, Result}; @@ -18,7 +18,6 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, } @@ -33,7 +32,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, max_nb_chunks: None, max_memory: None, } @@ -48,10 +46,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // Here we create a sorter akin to the previous one. let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, + merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -78,7 +75,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), word_prefix_pair_proximity_docids_sorter, - cbo_roaring_bitmap_merge, + merge_cbo_roaring_bitmaps, WriteMethod::Append, )?; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 2f0995c18..afd7d7736 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -4,7 +4,7 @@ use std::num::NonZeroU32; use std::{cmp, str}; use fst::Streamer; -use grenad::{CompressionType, FileFuse, Reader, Writer}; +use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{BytesEncode, Error}; use log::debug; @@ -14,7 +14,7 @@ use crate::error::{InternalError, SerializationError}; use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ - cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, + create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, WriteMethod, }; use crate::{Index, Result, TreeLevel}; @@ -24,7 +24,6 @@ pub struct WordsLevelPositions<'t, 'u, 'i> { index: &'i Index, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - pub(crate) chunk_fusing_shrink_size: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, 
level_group_size: NonZeroU32, @@ -41,7 +40,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { index, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - chunk_fusing_shrink_size: None, max_nb_chunks: None, max_memory: None, level_group_size: NonZeroU32::new(4).unwrap(), @@ -68,7 +66,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.index.word_level_position_docids, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.level_group_size, self.min_level_size, )?; @@ -81,7 +78,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_level_position_docids.as_polymorph(), entries, - |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" }), + |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" })?, WriteMethod::Append, )?; @@ -89,10 +86,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.index.word_prefix_level_position_docids.clear(self.wtxn)?; let mut word_prefix_level_positions_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, + merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.max_nb_chunks, self.max_memory, ); @@ -131,7 +127,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.wtxn, *self.index.word_prefix_level_position_docids.as_polymorph(), word_prefix_level_positions_docids_sorter, - cbo_roaring_bitmap_merge, + merge_cbo_roaring_bitmaps, WriteMethod::Append, )?; @@ -141,7 +137,6 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { self.index.word_prefix_level_position_docids, self.chunk_compression_type, self.chunk_compression_level, - self.chunk_fusing_shrink_size, self.level_group_size, self.min_level_size, )?; @@ -155,7 +150,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { *self.index.word_prefix_level_position_docids.as_polymorph(), entries, |_, _| { - Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }) + Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })? }, WriteMethod::Append, )?; @@ -185,10 +180,9 @@ fn compute_positions_levels( words_positions_db: heed::Database, compression_type: CompressionType, compression_level: Option, - shrink_size: Option, level_group_size: NonZeroU32, min_level_size: NonZeroU32, -) -> Result> { +) -> Result> { // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transfering them. 
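// For illustration, the grouping arithmetic behind these position levels,
// under the reading that a level-N entry covers level_group_size^N positions
// and that higher levels are only kept while they still hold at least
// min_level_size entries (an approximation for intuition, not code from this
// patch):
fn group_bounds(position: u32, level: u32, level_group_size: u32) -> (u32, u32) {
    let width = level_group_size.pow(level); // positions covered by one entry at `level`
    let left = position - position % width;  // first position of the group
    (left, left + width - 1)                 // last position of the group
}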
let mut writer = tempfile::tempfile() @@ -254,7 +248,7 @@ fn compute_positions_levels( } } - writer_into_reader(writer, shrink_size) + writer_into_reader(writer) } fn write_level_entry( diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 7d4043ff1..a533a4cbe 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -5,7 +5,7 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::update::{IndexDocuments, Settings, UpdateFormat}; +use milli::update::{IndexDocuments, Settings, UpdateBuilder, UpdateFormat}; use milli::{AscDesc, Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -50,7 +50,9 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.execute(|_, _| ()).unwrap(); // index documents - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = UpdateBuilder::new(0); + builder.max_memory(10 * 1024 * 1024); // 10MiB + let mut builder = builder.index_documents(&mut wtxn, &index); builder.update_format(UpdateFormat::JsonStream); builder.enable_autogenerate_docids(); builder.execute(CONTENT.as_bytes(), |_, _| ()).unwrap(); From 823da19745d4c4ada50873d7eddc4e0332c506bc Mon Sep 17 00:00:00 2001 From: many Date: Tue, 17 Aug 2021 10:56:06 +0200 Subject: [PATCH 0932/1889] Fix test and use progress callback --- .../src/update/index_documents/extract/mod.rs | 2 + milli/src/update/index_documents/mod.rs | 78 +++++++++++++++---- .../src/update/index_documents/typed_chunk.rs | 15 +++- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index b24c80da4..a389f36cf 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -11,6 +11,7 @@ use std::collections::HashSet; use std::fs::File; use crossbeam_channel::Sender; +use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; @@ -192,6 +193,7 @@ fn spawn_extraction_task( .map(|chunk| extract_fn(chunk, indexer.clone()).unwrap()) .collect(); rayon::spawn(move || { + debug!("merge {} database", name); let reader = merge_readers(chunks, merge_fn, indexer).unwrap(); lmdb_writer_sx.send(serialize_fn(reader)).unwrap(); }); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4f488337c..51b0a6613 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -31,6 +31,10 @@ use crate::update::{ }; use crate::{Index, Result}; +static MERGED_DATABASE_COUNT: usize = 7; +static PREFIX_DATABASE_COUNT: usize = 5; +static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct DocumentAdditionResult { pub nb_documents: usize, @@ -278,15 +282,34 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let index_is_empty = index_documents_ids.len() == 0; let mut final_documents_ids = RoaringBitmap::new(); + let mut databases_seen = 0; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + for typed_chunk in lmdb_writer_rx { - let docids = + let (docids, is_merged_database) = write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?; - final_documents_ids |= docids; - debug!( - "We 
have seen {} documents on {} total document so far", - final_documents_ids.len(), - documents_count - ); + if !docids.is_empty() { + final_documents_ids |= docids; + let documents_seen_count = final_documents_ids.len(); + progress_callback(UpdateIndexingStep::IndexDocuments { + documents_seen: documents_seen_count as usize, + total_documents: documents_count, + }); + debug!( + "We have seen {} documents on {} total documents so far", + documents_seen_count, documents_count + ); + } + if is_merged_database { + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + } } // We write the field distribution into the main database @@ -298,20 +321,19 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the external documents ids into the main database. self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - let all_documents_ids = index_documents_ids | new_documents_ids; + let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; self.execute_prefix_databases(progress_callback) } - pub fn execute_prefix_databases( - self, - // output: TransformOutput, - progress_callback: F, - ) -> Result<()> + pub fn execute_prefix_databases(self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, { + // Merged databases have already been indexed; we start from this count. + let mut databases_seen = MERGED_DATABASE_COUNT; + // Run the facets update operation. let mut builder = Facets::new(self.wtxn, self.index, self.update_id); builder.chunk_compression_type = self.chunk_compression_type; @@ -324,6 +346,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + // Run the words prefixes update operation. let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); if let Some(value) = self.words_prefix_threshold { @@ -334,6 +362,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + // Run the word prefix docids update operation. let mut builder = WordPrefixDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; @@ -342,6 +376,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.max_memory = self.max_memory; builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + // Run the word prefix pair proximity docids update operation. 
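// For illustration, a caller-side callback consuming the new progress steps
// (the variant and field names come from this patch; the closure shape is an
// assumption):
let progress_callback = |step: UpdateIndexingStep| {
    if let UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen, total_databases } = step {
        debug!("post-processing database {} of {}", databases_seen, total_databases);
    }
};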
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; @@ -350,6 +390,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { builder.max_memory = self.max_memory; builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + // Run the words level positions update operation. let mut builder = WordsLevelPositions::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; @@ -362,6 +408,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } builder.execute()?; + databases_seen += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + Ok(()) } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e7617bdab..e8790af16 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -32,7 +32,8 @@ pub(crate) fn write_typed_chunk_into_index( index: &Index, wtxn: &mut RwTxn, index_is_empty: bool, -) -> Result { +) -> Result<(RoaringBitmap, bool)> { + let mut is_merged_database = false; match typed_chunk { TypedChunk::DocidWordPositions(docid_word_positions_iter) => { write_entries_into_database( @@ -71,8 +72,11 @@ pub(crate) fn write_typed_chunk_into_index( |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; + is_merged_database = true; + } + TypedChunk::NewDocumentsIds(documents_ids) => { + return Ok((documents_ids, is_merged_database)) } - TypedChunk::NewDocumentsIds(documents_ids) => return Ok(documents_ids), TypedChunk::WordDocids(word_docids_iter) => { let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; append_entries_into_database( @@ -100,6 +104,7 @@ pub(crate) fn write_typed_chunk_into_index( builder.extend_stream(union_stream)?; let fst = builder.into_set(); index.put_words_fst(wtxn, &fst)?; + is_merged_database = true; } TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => { append_entries_into_database( @@ -110,6 +115,7 @@ pub(crate) fn write_typed_chunk_into_index( |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; + is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { append_entries_into_database( @@ -120,6 +126,7 @@ pub(crate) fn write_typed_chunk_into_index( |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; + is_merged_database = true; } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( @@ -130,6 +137,7 @@ pub(crate) fn write_typed_chunk_into_index( |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; + is_merged_database = true; } TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => { let index_fid_docid_facet_numbers = @@ -166,10 +174,11 @@ pub(crate) fn write_typed_chunk_into_index( Ok(values.serialize_into(buffer)?) 
}, )?; + is_merged_database = true; } } - Ok(RoaringBitmap::new()) + Ok((RoaringBitmap::new(), is_merged_database)) } fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { From 2d1727697dbc802822a5b427f11cdfd1aada768c Mon Sep 17 00:00:00 2001 From: many Date: Tue, 17 Aug 2021 12:25:07 +0200 Subject: [PATCH 0933/1889] Take stop word in account --- .../index_documents/extract/extract_docid_word_positions.rs | 5 +++++ milli/src/update/index_documents/extract/mod.rs | 2 ++ milli/src/update/index_documents/mod.rs | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 9a9d7cb85..3ee7ee3b3 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -21,6 +21,7 @@ pub fn extract_docid_word_positions( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, searchable_fields: &Option>, + stop_words: Option<&fst::Set<&[u8]>>, ) -> Result<(RoaringBitmap, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); @@ -35,6 +36,10 @@ pub fn extract_docid_word_positions( let mut key_buffer = Vec::new(); let mut field_buffer = String::new(); + let mut config = AnalyzerConfig::default(); + if let Some(stop_words) = stop_words { + config.stop_words(stop_words); + } let analyzer = Analyzer::>::new(AnalyzerConfig::default()); while let Some((key, value)) = obkv_documents.next()? { diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index a389f36cf..00c0a4a5f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -37,6 +37,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender, searchable_fields: Option>, faceted_fields: HashSet, + stop_words: Option>, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -54,6 +55,7 @@ pub(crate) fn data_from_obkv_documents( documents_chunk.clone(), indexer.clone(), &searchable_fields, + stop_words.as_ref(), )?; // send documents_ids to DB writer diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 51b0a6613..c9f5da0c1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -231,6 +231,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + let stop_words = self.index.stop_words(self.wtxn)?; + // let stop_words = stop_words.as_ref(); + // Run extraction pipeline in parallel. 
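// For illustration, the fan-in shape this pipeline is built on: extraction
// workers send their results through a crossbeam channel while the indexing
// thread drains it into LMDB (a minimal sketch with strings standing in for
// the typed chunks; names hypothetical):
fn drain_into_index(receiver: crossbeam_channel::Receiver<Result<String>>) -> Result<()> {
    for chunk in receiver {
        let chunk = chunk?; // an extraction error surfaces here, on the writing side
        debug!("writing {} into the index", chunk);
    }
    Ok(())
}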
pool.install(|| { let params = GrenadParameters { @@ -255,6 +258,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { lmdb_writer_sx, searchable_fields, faceted_fields, + stop_words, ) .unwrap(); }); From 5c962c03dd5ba2027f408292e64ea31403eb5e38 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 18 Aug 2021 18:04:24 +0200 Subject: [PATCH 0934/1889] Fix and optimize word_prefix_pair_proximity_docids database --- infos/src/main.rs | 64 +++++++++++ milli/src/search/criteria/exactness.rs | 4 - milli/src/search/criteria/mod.rs | 43 ++++--- milli/src/update/index_documents/mod.rs | 4 +- .../word_prefix_pair_proximity_docids.rs | 108 ++++++++++++++---- milli/src/update/words_prefixes_fst.rs | 18 ++- 6 files changed, 187 insertions(+), 54 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index da15251b0..bb09d7234 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -207,6 +207,24 @@ enum Command { word2: String, }, + /// Outputs a CSV with the proximities for the two specified words and + /// the documents ids where these relations appear. + /// + /// `word1`, `prefix` define the word pair specified *in this specific order*. + /// `proximity` defines the proximity between the two specified words. + /// `documents_ids` defines the documents ids where the relation appears. + WordPrefixPairProximitiesDocids { + /// Display the whole documents ids in detail. + #[structopt(long)] + full_display: bool, + + /// First word of the word pair. + word1: String, + + /// Second word of the word pair. + prefix: String, + }, + /// Outputs the words FST to standard output. /// /// One can use the FST binary helper to dissect and analyze it, @@ -282,6 +300,9 @@ fn main() -> anyhow::Result<()> { WordPairProximitiesDocids { full_display, word1, word2 } => { word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) } + WordPrefixPairProximitiesDocids { full_display, word1, prefix } => { + word_prefix_pair_proximities_docids(&index, &rtxn, !full_display, word1, prefix) } + ExportWordsFst => export_words_fst(&index, &rtxn), ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), ExportDocuments { internal_documents_ids } => { @@ -1131,3 +1152,46 @@ fn word_pair_proximities_docids( Ok(wtr.flush()?) } + +fn word_prefix_pair_proximities_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + word1: String, + word_prefix: String, +) -> anyhow::Result<()> { + use heed::types::ByteSlice; + use milli::RoaringBitmapCodec; + + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word1", "word_prefix", "proximity", "documents_ids"])?; + + // Create the prefix key with only the pair of words. + let mut prefix = Vec::with_capacity(word1.len() + word_prefix.len() + 1); + prefix.extend_from_slice(word1.as_bytes()); + prefix.push(0); + prefix.extend_from_slice(word_prefix.as_bytes()); + + let db = index.word_prefix_pair_proximity_docids.as_polymorph(); + let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?; + for result in iter { + let (key, docids) = result?; + + // Skip keys that are longer than the requested one, + // a longer key means that the second word is a prefix of the requested word. + if key.len() != prefix.len() + 1 { + continue; + } + + let proximity = key.last().unwrap(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[&word1, &word_prefix, &proximity.to_string(), &docids])?; + } + + Ok(wtr.flush()?) 
+} diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 22dcb9782..1e4d4e7a2 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -180,10 +180,6 @@ fn resolve_state( if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { - println!( - "found candidates that have the good count: {:?}", - attribute_allowed_docids - ); let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; attribute_candidates_array.push(attribute_allowed_docids); diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 61b0fe049..2a883de67 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -461,13 +461,18 @@ fn query_pair_proximity_docids( let prefix = right.prefix; match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { - if prefix && ctx.in_prefix_cache(&right) { - Ok(ctx - .word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? - .unwrap_or_default()) - } else if prefix { - let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + if prefix { + match ctx.word_prefix_pair_proximity_docids( + left.as_str(), + right.as_str(), + proximity, + )? { + Some(docids) => Ok(docids), + None => { + let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + } + } } else { Ok(ctx .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? @@ -477,22 +482,24 @@ fn query_pair_proximity_docids( (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { let l_words = word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); - if prefix && ctx.in_prefix_cache(&right) { + if prefix { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let current_docids = ctx - .word_prefix_pair_proximity_docids( - left.as_ref(), - right.as_ref(), - proximity, - )? - .unwrap_or_default(); + let current_docids = match ctx.word_prefix_pair_proximity_docids( + left.as_str(), + right.as_str(), + proximity, + )? 
{ + Some(docids) => Ok(docids), + None => { + let r_words = + word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + } + }?; docids |= current_docids; } Ok(docids) - } else if prefix { - let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) } else { all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c9f5da0c1..b7fa1492c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -21,7 +21,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; pub use self::helpers::{ create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, + sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -81,7 +81,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { pub(crate) thread_pool: Option<&'a ThreadPool>, facet_level_group_size: Option, facet_min_level_size: Option, - words_prefix_threshold: Option, + words_prefix_threshold: Option, max_prefix_length: Option, words_positions_level_group_size: Option, words_positions_min_level_size: Option, diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 8f04c23cf..cabe1053b 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,15 +1,13 @@ -use std::str; +use std::collections::HashMap; -use fst::automaton::{Automaton, Str}; -use fst::{IntoStreamer, Streamer}; +use fst::IntoStreamer; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::BytesEncode; use log::debug; +use slice_group_by::GroupBy; -use crate::heed_codec::StrStrU8Codec; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, MergeFn, WriteMethod, }; use crate::{Index, Result}; @@ -20,6 +18,7 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, + threshold: u32, } impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { @@ -34,16 +33,26 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { chunk_compression_level: None, max_nb_chunks: None, max_memory: None, + threshold: 100, } } + /// Set the number of words required to make a prefix be part of the words prefixes + /// database. If a word prefix is supposed to match more than this number of words in the + /// dictionary, then this prefix is added to the words prefixes data structures. + /// + /// Default value is 100. This value must be higher than 50 and will be clamped + /// to this bound otherwise. 
+ pub fn threshold(&mut self, value: u32) -> &mut Self { + self.threshold = value.max(50); + self + } + pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; - let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - // Here we create a sorter akin to the previous one. let mut word_prefix_pair_proximity_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, @@ -53,22 +62,59 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.max_memory, ); - // We insert all the word pairs corresponding to the word-prefix pairs - // where the prefixes appears in the prefix FST previously constructed. - let db = self.index.word_pair_proximity_docids.remap_data_type::(); - for result in db.iter(self.wtxn)? { - let ((word1, word2, prox), data) = result?; - let automaton = Str::new(word2).starts_with(); - let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); - while let Some(prefix) = matching_prefixes.next() { - let prefix = str::from_utf8(prefix)?; - let pair = (word1, prefix, prox); - let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); - word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + let prefix_fst_keys = prefix_fst.into_stream().into_bytes(); + let prefix_fst_keys: Vec<_> = prefix_fst_keys + .as_slice() + .linear_group_by_key(|x| std::str::from_utf8(&x).unwrap().chars().nth(0).unwrap()) + .collect(); + + let mut db = + self.index.word_pair_proximity_docids.remap_data_type::().iter(self.wtxn)?; + + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[Vec]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some(((w1, w2, prox), data)) = db.next().transpose()? { + current_prefixes = match current_prefixes.take() { + Some(prefixes) if w2.as_bytes().starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + self.threshold, + )?; + prefix_fst_keys.iter().find(|prefixes| w2.as_bytes().starts_with(&prefixes[0])) + } + }; + + if let Some(prefixes) = current_prefixes { + buffer.clear(); + buffer.extend_from_slice(w1.as_bytes()); + buffer.push(0); + for prefix in prefixes.iter().filter(|prefix| w2.as_bytes().starts_with(prefix)) { + buffer.truncate(w1.len() + 1); + buffer.extend_from_slice(prefix); + buffer.push(prox); + + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data), + None => { + prefixes_cache.insert(buffer.clone(), vec![data]); + } + } + } } } + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + self.threshold, + )?; + drop(prefix_fst); + drop(db); // We finally write the word prefix pair proximity docids into the LMDB database. sorter_into_lmdb_database( @@ -82,3 +128,25 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { Ok(()) } } + +fn write_prefixes_in_sorter( + prefixes: &mut HashMap, Vec<&[u8]>>, + sorter: &mut grenad::Sorter, + min_word_per_prefix: u32, +) -> Result<()> { + for (i, (key, data_slices)) in prefixes.drain().enumerate() { + // if the number of words prefixed by the prefix is higher than the threshold, + // we insert it in the sorter. 
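// For illustration, the key layout assembled in the loop above:
// word1 bytes ++ 0x00 ++ prefix-of-word2 bytes ++ proximity byte.
// A worked example on plain vectors (not code from this patch):
let mut key = Vec::new();
key.extend_from_slice(b"hello"); // word1
key.push(0);                     // null byte separating the two words
key.extend_from_slice(b"wor");   // prefix of word2
key.push(2);                     // proximity between the two words
assert_eq!(key, b"hello\0wor\x02");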
+ if data_slices.len() > min_word_per_prefix as usize { + for data in data_slices { + sorter.insert(&key, data)?; + } + // if the first prefix isn't eligible for insertion, + // then the other prefixes can't be eligible. + } else if i == 0 { + break; + } + } + + Ok(()) +} diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index f35dea10d..be33c156b 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -8,7 +8,7 @@ use crate::{Index, Result, SmallString32}; pub struct WordsPrefixesFst<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - threshold: f64, + threshold: u32, max_prefix_length: usize, _update_id: u64, } @@ -22,20 +22,20 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { WordsPrefixesFst { wtxn, index, - threshold: 0.1 / 100.0, // .01% + threshold: 100, max_prefix_length: 4, _update_id: update_id, } } - /// Set the ratio of concerned words required to make a prefix be part of the words prefixes + /// Set the number of words required to make a prefix be part of the words prefixes /// database. If a word prefix is supposed to match more than this number of words in the /// dictionnary, therefore this prefix is added to the words prefixes datastructures. /// - /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped - /// to these bounds otherwise. - pub fn threshold(&mut self, value: f64) -> &mut Self { - self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] + /// Default value is 100. This value must be higher than 50 and will be clamped + /// to this bound otherwise. + pub fn threshold(&mut self, value: u32) -> &mut Self { + self.threshold = value.max(50); self } @@ -50,8 +50,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { pub fn execute(self) -> Result<()> { let words_fst = self.index.words_fst(&self.wtxn)?; - let number_of_words = words_fst.len(); - let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); for n in 1..=self.max_prefix_length { @@ -80,7 +78,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { current_prefix_count += 1; // There is enough words corresponding to this prefix to add it to the cache. 
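// For illustration of the clamp used by both new `threshold` setters
// (plain u32 arithmetic, not code from this patch):
assert_eq!(30u32.max(50), 50);   // a too-small request is raised to the 50 floor
assert_eq!(200u32.max(50), 200); // anything above the floor passes through unchanged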
- if current_prefix_count == min_number_of_words { + if current_prefix_count >= self.threshold { builder.insert(prefix)?; } } From a2f59a28f7685882df7b96ffbb1527596f8a0823 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 24 Aug 2021 13:01:31 +0200 Subject: [PATCH 0935/1889] Remove unwrap sending errors in channel --- .../src/update/index_documents/extract/mod.rs | 162 ++++++++++-------- milli/src/update/index_documents/mod.rs | 42 +++-- 2 files changed, 120 insertions(+), 84 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 00c0a4a5f..591c8d4cd 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -34,7 +34,7 @@ use crate::{FieldId, Result}; pub(crate) fn data_from_obkv_documents( obkv_chunks: impl Iterator>> + Send, indexer: GrenadParameters, - lmdb_writer_sx: Sender, + lmdb_writer_sx: Sender>, searchable_fields: Option>, faceted_fields: HashSet, stop_words: Option>, @@ -42,63 +42,14 @@ pub(crate) fn data_from_obkv_documents( let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() .map(|result| { - let documents_chunk = result.and_then(|c| unsafe { into_clonable_grenad(c) }).unwrap(); - - lmdb_writer_sx.send(TypedChunk::Documents(documents_chunk.clone())).unwrap(); - - let (docid_word_positions_chunk, docid_fid_facet_values_chunks): ( - Result<_>, - Result<_>, - ) = rayon::join( - || { - let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( - documents_chunk.clone(), - indexer.clone(), - &searchable_fields, - stop_words.as_ref(), - )?; - - // send documents_ids to DB writer - lmdb_writer_sx.send(TypedChunk::NewDocumentsIds(documents_ids)).unwrap(); - - // send docid_word_positions_chunk to DB writer - let docid_word_positions_chunk = - unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; - lmdb_writer_sx - .send(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())) - .unwrap(); - Ok(docid_word_positions_chunk) - }, - || { - let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = - extract_fid_docid_facet_values( - documents_chunk.clone(), - indexer.clone(), - &faceted_fields, - )?; - - // send docid_fid_facet_numbers_chunk to DB writer - let docid_fid_facet_numbers_chunk = - unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; - lmdb_writer_sx - .send(TypedChunk::FieldIdDocidFacetNumbers( - docid_fid_facet_numbers_chunk.clone(), - )) - .unwrap(); - - // send docid_fid_facet_strings_chunk to DB writer - let docid_fid_facet_strings_chunk = - unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? 
}; - lmdb_writer_sx - .send(TypedChunk::FieldIdDocidFacetStrings( - docid_fid_facet_strings_chunk.clone(), - )) - .unwrap(); - - Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) - }, - ); - Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) + extract_documents_data( + result, + indexer, + lmdb_writer_sx.clone(), + &searchable_fields, + &faceted_fields, + &stop_words, + ) }) .collect(); @@ -177,7 +128,7 @@ pub(crate) fn data_from_obkv_documents( fn spawn_extraction_task( chunks: Vec>, indexer: GrenadParameters, - lmdb_writer_sx: Sender, + lmdb_writer_sx: Sender>, extract_fn: FE, merge_fn: MergeFn, serialize_fn: FS, @@ -190,14 +141,89 @@ fn spawn_extraction_task( FS: Fn(grenad::Reader) -> TypedChunk + Sync + Send + 'static, { rayon::spawn(move || { - let chunks: Vec<_> = chunks - .into_par_iter() - .map(|chunk| extract_fn(chunk, indexer.clone()).unwrap()) - .collect(); - rayon::spawn(move || { - debug!("merge {} database", name); - let reader = merge_readers(chunks, merge_fn, indexer).unwrap(); - lmdb_writer_sx.send(serialize_fn(reader)).unwrap(); - }); + let chunks: Result> = + chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); + rayon::spawn(move || match chunks { + Ok(chunks) => { + debug!("merge {} database", name); + let reader = merge_readers(chunks, merge_fn, indexer); + lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))).unwrap(); + } + Err(e) => lmdb_writer_sx.send(Err(e)).unwrap(), + }) }); } + +/// Extract chuncked data and send it into lmdb_writer_sx sender: +/// - documents +/// - documents_ids +/// - docid_word_positions +/// - docid_fid_facet_numbers +/// - docid_fid_facet_strings +fn extract_documents_data( + documents_chunk: Result>, + indexer: GrenadParameters, + lmdb_writer_sx: Sender>, + searchable_fields: &Option>, + faceted_fields: &HashSet, + stop_words: &Option>, +) -> Result<( + grenad::Reader, + (grenad::Reader, grenad::Reader), +)> { + let documents_chunk = documents_chunk.and_then(|c| unsafe { into_clonable_grenad(c) })?; + + lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))).unwrap(); + + let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = + rayon::join( + || { + let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( + documents_chunk.clone(), + indexer.clone(), + searchable_fields, + stop_words.as_ref(), + )?; + + // send documents_ids to DB writer + lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))).unwrap(); + + // send docid_word_positions_chunk to DB writer + let docid_word_positions_chunk = + unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; + lmdb_writer_sx + .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))) + .unwrap(); + Ok(docid_word_positions_chunk) + }, + || { + let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = + extract_fid_docid_facet_values( + documents_chunk.clone(), + indexer.clone(), + faceted_fields, + )?; + + // send docid_fid_facet_numbers_chunk to DB writer + let docid_fid_facet_numbers_chunk = + unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; + lmdb_writer_sx + .send(Ok(TypedChunk::FieldIdDocidFacetNumbers( + docid_fid_facet_numbers_chunk.clone(), + ))) + .unwrap(); + + // send docid_fid_facet_strings_chunk to DB writer + let docid_fid_facet_strings_chunk = + unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? 
}; + lmdb_writer_sx + .send(Ok(TypedChunk::FieldIdDocidFacetStrings( + docid_fid_facet_strings_chunk.clone(), + ))) + .unwrap(); + + Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) + }, + ); + Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) +} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b7fa1492c..4cf7c83f1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -222,8 +222,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let documents_file = grenad::Reader::new(documents_file)?; // create LMDB writer channel - let (lmdb_writer_sx, lmdb_writer_rx): (Sender, Receiver) = - crossbeam_channel::unbounded(); + let (lmdb_writer_sx, lmdb_writer_rx): ( + Sender>, + Receiver>, + ) = crossbeam_channel::unbounded(); // get searchable fields for word databases let searchable_fields = @@ -244,23 +246,31 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }; // split obkv file into several chuncks - let mut chunk_iter = grenad_obkv_into_chunks( + let chunk_iter = grenad_obkv_into_chunks( documents_file, params.clone(), self.log_every_n, Byte::from_bytes(self.documents_chunk_size.unwrap_or(1024 * 1024 * 128) as u64), // 128MiB - ) - .unwrap(); - // extract all databases from the chunked obkv douments - extract::data_from_obkv_documents( - &mut chunk_iter, - params, - lmdb_writer_sx, - searchable_fields, - faceted_fields, - stop_words, - ) - .unwrap(); + ); + + let result = chunk_iter.map(|chunk_iter| { + // extract all databases from the chunked obkv douments + extract::data_from_obkv_documents( + chunk_iter, + params, + lmdb_writer_sx.clone(), + searchable_fields, + faceted_fields, + stop_words, + ) + }); + + if let Err(e) = result { + lmdb_writer_sx.send(Err(e)).unwrap(); + } + + // needs to be droped to avoid channel waiting lock. + drop(lmdb_writer_sx) }); // We delete the documents that this document addition replaces. 
This way we are @@ -294,7 +304,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { for typed_chunk in lmdb_writer_rx { let (docids, is_merged_database) = - write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?; + write_typed_chunk_into_index(typed_chunk?, &self.index, self.wtxn, index_is_empty)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); From fc7cc770d408c223e3c1efc058c21ea122851522 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 24 Aug 2021 13:55:53 +0200 Subject: [PATCH 0936/1889] Add logging timers --- milli/src/update/facets.rs | 1 + .../index_documents/extract/extract_docid_word_positions.rs | 1 + .../index_documents/extract/extract_facet_number_docids.rs | 1 + .../index_documents/extract/extract_facet_string_docids.rs | 1 + .../index_documents/extract/extract_fid_docid_facet_values.rs | 1 + .../index_documents/extract/extract_fid_word_count_docids.rs | 1 + .../src/update/index_documents/extract/extract_word_docids.rs | 1 + .../extract/extract_word_level_position_docids.rs | 1 + .../extract/extract_word_pair_proximity_docids.rs | 1 + milli/src/update/index_documents/mod.rs | 3 +++ milli/src/update/word_prefix_docids.rs | 1 + milli/src/update/word_prefix_pair_proximity_docids.rs | 1 + milli/src/update/words_level_positions.rs | 1 + milli/src/update/words_prefixes_fst.rs | 1 + 14 files changed, 16 insertions(+) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 3ae63f282..9b7d6d42c 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -57,6 +57,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self } + #[logging_timer::time("Facets::{}")] pub fn execute(self) -> Result<()> { self.index.set_updated_at(self.wtxn, &Utc::now())?; // We get the faceted fields to be able to create the facet levels. diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 3ee7ee3b3..fb3372660 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -17,6 +17,7 @@ use crate::{FieldId, Result}; /// /// Returns the generated internal documents ids and a grenad reader /// with the list of extracted words from the given chunk of documents. +#[logging_timer::time] pub fn extract_docid_word_positions( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 1734ef028..5480bd605 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -13,6 +13,7 @@ use crate::Result; /// /// Returns a grenad reader with the list of extracted facet numbers and /// documents ids from the given chunk of docid facet number positions. 
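+/// (The `#[logging_timer::time]` attributes added throughout this patch come
+/// from the `logging_timer` crate: they log how long each call to the
+/// decorated function takes, and do not otherwise change its behavior.)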
+#[logging_timer::time]
 pub fn extract_facet_number_docids<R: io::Read>(
     mut docid_fid_facet_number: grenad::Reader<R>,
     indexer: GrenadParameters,
diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index 66ede5f42..e08d062cf 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -15,6 +15,7 @@ use crate::{FieldId, Result};
 ///
 /// Returns a grenad reader with the list of extracted facet strings and
 /// documents ids from the given chunk of docid facet string positions.
+#[logging_timer::time]
 pub fn extract_facet_string_docids<R: io::Read>(
     mut docid_fid_facet_string: grenad::Reader<R>,
     indexer: GrenadParameters,
diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index e7e56a3c8..08f2cadf0 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -15,6 +15,7 @@ use crate::{DocumentId, FieldId, Result};
 ///
 /// Returns the generated grenad reader containing the docid, the fid and the original value as key
 /// and the normalized value as value extracted from the given chunk of documents.
+#[logging_timer::time]
 pub fn extract_fid_docid_facet_values<R: io::Read>(
     mut obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
index 66b179663..cf698507d 100644
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -16,6 +16,7 @@ use crate::{DocumentId, FieldId, Result};
 ///
 /// Returns a grenad reader with the list of extracted field id word counts
 /// and documents ids from the given chunk of docid word positions.
+#[logging_timer::time]
 pub fn extract_fid_word_count_docids<R: io::Read>(
     mut docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 85453e173..8ca8e39eb 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -14,6 +14,7 @@ use crate::Result;
 ///
 /// Returns a grenad reader with the list of extracted words and
 /// documents ids from the given chunk of docid word positions.
+#[logging_timer::time]
 pub fn extract_word_docids<R: io::Read>(
     mut docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
diff --git a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs
index c7138b32a..e099b0b49 100644
--- a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs
@@ -10,6 +10,7 @@ use crate::{DocumentId, Result};
 ///
 /// Returns a grenad reader with the list of extracted words at positions and
 /// documents ids from the given chunk of docid word positions.
+#[logging_timer::time] pub fn extract_word_level_position_docids( mut docid_word_positions: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 2bc79aac5..96bd965d8 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -17,6 +17,7 @@ use crate::{DocumentId, Result}; /// /// Returns a grenad reader with the list of extracted word pairs proximities and /// documents ids from the given chunk of docid word positions. +#[logging_timer::time] pub fn extract_word_pair_proximity_docids( mut docid_word_positions: grenad::Reader, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4cf7c83f1..d6fbd3e93 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -136,6 +136,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.autogenerate_docids = false; } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute(self, reader: R, progress_callback: F) -> Result where R: io::Read, @@ -181,6 +182,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { Ok(DocumentAdditionResult { nb_documents }) } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -341,6 +343,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.execute_prefix_databases(progress_callback) } + #[logging_timer::time("IndexDocuments::{}")] pub fn execute_prefix_databases(self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index b8a80938c..30dabf1ae 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -33,6 +33,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } } + #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute(self) -> Result<()> { // Clear the word prefix docids database. 
self.index.word_prefix_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index cabe1053b..eb098a91f 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -48,6 +48,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
         self
     }

+    #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
     pub fn execute(self) -> Result<()> {
         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");

diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs
index afd7d7736..0af51fbb2 100644
--- a/milli/src/update/words_level_positions.rs
+++ b/milli/src/update/words_level_positions.rs
@@ -57,6 +57,7 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
         self
     }

+    #[logging_timer::time("WordsLevelPositions::{}")]
     pub fn execute(self) -> Result<()> {
         debug!("Computing and writing the word levels positions docids into LMDB on disk...");

diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs
index be33c156b..eaaacc26f 100644
--- a/milli/src/update/words_prefixes_fst.rs
+++ b/milli/src/update/words_prefixes_fst.rs
@@ -48,6 +48,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
         self
     }

+    #[logging_timer::time("WordsPrefixesFst::{}")]
     pub fn execute(self) -> Result<()> {
         let words_fst = self.index.words_fst(&self.wtxn)?;

From e09eec37bc4fc6529129f24fd45c7c6d28ec2297 Mon Sep 17 00:00:00 2001
From: many
Date: Wed, 25 Aug 2021 15:09:46 +0200
Subject: [PATCH 0937/1889] Handle distance addition with hard separators

---
 .../extract/extract_docid_word_positions.rs | 43 ++++++++++++++++---
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index fb3372660..894a193bf 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -3,7 +3,8 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};

-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token};
+use meilisearch_tokenizer::token::SeparatorKind;
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
 use roaring::RoaringBitmap;
 use serde_json::Value;

@@ -61,11 +62,8 @@ pub fn extract_docid_word_positions(
             field_buffer.clear();
             if let Some(field) = json_to_string(&value, &mut field_buffer) {
                 let analyzed = analyzer.analyze(field);
-                let tokens = analyzed
-                    .tokens()
-                    .filter(Token::is_word)
-                    .enumerate()
-                    .take_while(|(i, _)| (*i as u32) < ONE_ATTRIBUTE);
+                let tokens = process_tokens(analyzed.tokens())
+                    .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE);

                 for (index, token) in tokens {
                     let token = token.text().trim();
@@ -134,3 +132,36 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
         None
     }
 }
+
+/// Take an iterator on tokens and compute their relative position depending on separator kinds:
+/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
+/// otherwise we keep the standard proximity of 1 between words.
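+///
+/// As a quick illustrative sketch of the intent (not taken from a test): in
+/// `"new york city"` consecutive words end up 1 position apart, while in
+/// `"new york. city"` the hard separator `.` puts `city` 8 positions after
+/// `york`.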
+fn process_tokens<'a>(
+    tokens: impl Iterator<Item = Token<'a>>,
+) -> impl Iterator<Item = (usize, Token<'a>)> {
+    tokens
+        .skip_while(|token| token.is_separator().is_some())
+        .scan((0, None), |(offset, prev_kind), token| {
+            match token.kind {
+                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                    *offset += match *prev_kind {
+                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
+                        Some(_) => 1,
+                        None => 0,
+                    };
+                    *prev_kind = Some(token.kind)
+                }
+                TokenKind::Separator(SeparatorKind::Hard) => {
+                    *prev_kind = Some(token.kind);
+                }
+                TokenKind::Separator(SeparatorKind::Soft)
+                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
+                {
+                    *prev_kind = Some(token.kind);
+                }
+                _ => (),
+            }
+            Some((*offset, token))
+        })
+        .filter(|(_, t)| t.is_word())
+}

From 8f702828ca23bed257def6d452705e707e5c3e1c Mon Sep 17 00:00:00 2001
From: many
Date: Thu, 26 Aug 2021 11:01:30 +0200
Subject: [PATCH 0938/1889] Ignore errors coming from crossbeam channel senders

---
 .../src/update/index_documents/extract/mod.rs | 34 +++++++++----------
 milli/src/update/index_documents/mod.rs       |  2 +-
 .../src/update/index_documents/typed_chunk.rs |  2 +-
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 591c8d4cd..04c57b0fa 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -147,9 +147,11 @@ fn spawn_extraction_task(
             Ok(chunks) => {
                 debug!("merge {} database", name);
                 let reader = merge_readers(chunks, merge_fn, indexer);
-                lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))).unwrap();
+                let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r)));
+            }
+            Err(e) => {
+                let _ = lmdb_writer_sx.send(Err(e));
             }
-            Err(e) => lmdb_writer_sx.send(Err(e)).unwrap(),
         })
     });
 }
@@ -173,7 +175,7 @@ fn extract_documents_data(
 )> {
     let documents_chunk = documents_chunk.and_then(|c| unsafe { into_clonable_grenad(c) })?;

-    lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))).unwrap();
+    let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone())));

     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
@@ -186,14 +188,14 @@ fn extract_documents_data(
                 )?;

                 // send documents_ids to DB writer
-                lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))).unwrap();
+                let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));

                 // send docid_word_positions_chunk to DB writer
                 let docid_word_positions_chunk =
                     unsafe { into_clonable_grenad(docid_word_positions_chunk)? };
-                lmdb_writer_sx
-                    .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())))
-                    .unwrap();
+                let _ = lmdb_writer_sx
+                    .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
+
                 Ok(docid_word_positions_chunk)
             },
             || {
@@ -207,20 +209,18 @@ fn extract_documents_data(
                 // send docid_fid_facet_numbers_chunk to DB writer
                 let docid_fid_facet_numbers_chunk =
                     unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? };
-                lmdb_writer_sx
-                    .send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
-                        docid_fid_facet_numbers_chunk.clone(),
-                    )))
-                    .unwrap();
+
+                let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
+                    docid_fid_facet_numbers_chunk.clone(),
+                )));

                 // send docid_fid_facet_strings_chunk to DB writer
                 let docid_fid_facet_strings_chunk =
                     unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)?
}; - lmdb_writer_sx - .send(Ok(TypedChunk::FieldIdDocidFacetStrings( - docid_fid_facet_strings_chunk.clone(), - ))) - .unwrap(); + + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( + docid_fid_facet_strings_chunk.clone(), + ))); Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) }, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d6fbd3e93..98b0aa80e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -268,7 +268,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); if let Err(e) = result { - lmdb_writer_sx.send(Err(e)).unwrap(); + let _ = lmdb_writer_sx.send(Err(e)); } // needs to be droped to avoid channel waiting lock. diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e8790af16..84333addb 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -95,7 +95,7 @@ pub(crate) fn write_typed_chunk_into_index( // we use the key to construct the words fst. builder.insert(word)?; } - let fst = builder.into_set().map_data(std::borrow::Cow::Owned).unwrap(); + let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?; let db_fst = index.words_fst(wtxn)?; // merge new fst with database fst From 9452fabfb2ed590db1a7bde089c87e9b41f5a561 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 26 Aug 2021 15:56:24 +0200 Subject: [PATCH 0939/1889] Optimize cbo roaring bitmaps merge --- .../cbo_roaring_bitmap_codec.rs | 76 +++++++++++++++++++ .../helpers/merge_functions.rs | 53 ++----------- .../src/update/index_documents/typed_chunk.rs | 15 +++- 3 files changed, 93 insertions(+), 51 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 53f64d648..c0e984d44 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -52,6 +52,46 @@ impl CboRoaringBitmapCodec { RoaringBitmap::deserialize_from(bytes) } } + + /// Merge serialized CboRoaringBitmaps in a buffer. + /// + /// if the merged values len is under the threshold, + /// values are directly serialized in the buffer; + /// else a RoaringBitmap is created from the values and is serialized in the buffer. 
+ pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { + let mut roaring = RoaringBitmap::new(); + let mut vec = Vec::new(); + + for bytes in slices { + if bytes.len() <= THRESHOLD * size_of::() { + let mut reader = bytes.as_ref(); + while let Ok(integer) = reader.read_u32::() { + vec.push(integer); + } + } else { + roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?; + } + } + + if roaring.is_empty() { + vec.sort_unstable(); + vec.dedup(); + + if vec.len() <= THRESHOLD { + for integer in vec { + buffer.extend_from_slice(&integer.to_ne_bytes()); + } + } else { + let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()); + roaring.serialize_into(buffer)?; + } + } else { + roaring.extend(vec); + roaring.serialize_into(buffer)?; + } + + Ok(()) + } } impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { @@ -106,4 +146,40 @@ mod tests { assert!(roaring_size > bo_size); } + + #[test] + fn merge_cbo_roaring_bitmaps() { + let mut buffer = Vec::new(); + + let small_data = vec![ + RoaringBitmap::from_sorted_iter(1..4), + RoaringBitmap::from_sorted_iter(2..5), + RoaringBitmap::from_sorted_iter(4..6), + RoaringBitmap::from_sorted_iter(1..3), + ]; + + let small_data: Vec<_> = + small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); + CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap(); + let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); + let expected = RoaringBitmap::from_sorted_iter(1..6); + assert_eq!(bitmap, expected); + + let medium_data = vec![ + RoaringBitmap::from_sorted_iter(1..4), + RoaringBitmap::from_sorted_iter(2..5), + RoaringBitmap::from_sorted_iter(4..8), + RoaringBitmap::from_sorted_iter(0..3), + RoaringBitmap::from_sorted_iter(7..23), + ]; + + let medium_data: Vec<_> = + medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); + buffer.clear(); + CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap(); + + let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); + let expected = RoaringBitmap::from_sorted_iter(0..23); + assert_eq!(bitmap, expected); + } } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 6a592e54d..c5385e347 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -120,52 +120,11 @@ pub fn merge_cbo_roaring_bitmaps<'a>( _key: &[u8], values: &[Cow<'a, [u8]>], ) -> Result> { - match values.split_first().unwrap() { - (head, []) => Ok(head.clone()), - (head, tail) => { - let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; - - for value in tail { - head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; - } - - let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec); - Ok(Cow::from(vec)) - } + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let mut vec = Vec::new(); + CboRoaringBitmapCodec::merge_into(values, &mut vec)?; + Ok(Cow::from(vec)) } } - -// /// Uses the FacetStringLevelZeroValueCodec to merge the values. 
-// pub fn tuple_string_cbo_roaring_bitmap_merge<'a>( -// _key: &[u8], -// values: &[Cow<[u8]>], -// ) -> Result> { -// let (head, tail) = values.split_first().unwrap(); -// let (head_string, mut head_rb) = FacetStringLevelZeroValueCodec::bytes_decode(&head[..]) -// .ok_or(SerializationError::Decoding { db_name: None })?; - -// for value in tail { -// let (_string, rb) = FacetStringLevelZeroValueCodec::bytes_decode(&value[..]) -// .ok_or(SerializationError::Decoding { db_name: None })?; -// head_rb |= rb; -// } - -// FacetStringLevelZeroValueCodec::bytes_encode(&(head_string, head_rb)) -// .map(|cow| cow.into_owned()) -// .ok_or(SerializationError::Encoding { db_name: None }) -// .map_err(Into::into) -// } - -// pub fn cbo_roaring_bitmap_merge<'a>(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { -// let (head, tail) = values.split_first().unwrap(); -// let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; - -// for value in tail { -// head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; -// } - -// let mut vec = Vec::new(); -// CboRoaringBitmapCodec::serialize_into(&head, &mut vec); -// Ok(vec) -// } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 84333addb..c3c71bbf4 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -188,15 +188,22 @@ fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec Ok(serialize_roaring_bitmap(&value, buffer)?) } +use std::borrow::Cow; + fn merge_cbo_roaring_bitmaps( new_value: &[u8], db_value: &[u8], buffer: &mut Vec, ) -> Result<()> { - let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; - let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; - let value = new_value | db_value; - Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) + Ok(CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], + buffer, + )?) + + // let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; + // let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; + // let value = new_value | db_value; + // Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) } /// Write provided entries in database using serialize_value function. 
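The `merge_into` helper this commit leans on relies on the same size trick the
codec uses when encoding a single posting list. As a rough, standalone sketch
of that encoding rule (the `THRESHOLD` value and the function names here are
illustrative assumptions, not milli's actual API):

```rust
use std::convert::TryInto;
use std::io;
use std::mem::size_of;

use roaring::RoaringBitmap;

// Illustrative threshold: below it, raw u32s are smaller than a bitmap.
const THRESHOLD: usize = 7;

fn encode(ids: &[u32], buffer: &mut Vec<u8>) -> io::Result<()> {
    if ids.len() <= THRESHOLD {
        // Small set: store the integers as-is, no bitmap header overhead.
        for id in ids {
            buffer.extend_from_slice(&id.to_ne_bytes());
        }
        Ok(())
    } else {
        // Big set: serialize a real RoaringBitmap.
        let bitmap: RoaringBitmap = ids.iter().copied().collect();
        bitmap.serialize_into(buffer)
    }
}

fn decode(bytes: &[u8]) -> io::Result<RoaringBitmap> {
    if bytes.len() <= THRESHOLD * size_of::<u32>() {
        // A payload this short can only be raw native-endian u32s.
        let mut bitmap = RoaringBitmap::new();
        for chunk in bytes.chunks_exact(size_of::<u32>()) {
            bitmap.insert(u32::from_ne_bytes(chunk.try_into().unwrap()));
        }
        Ok(bitmap)
    } else {
        RoaringBitmap::deserialize_from(bytes)
    }
}
```

Merging then simply accumulates the raw integers from the small inputs and
unions the big ones, only materializing a `RoaringBitmap` when the merged
result crosses the threshold; that is what keeps the fast path cheap for the
common case of words that appear in only a handful of documents.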
From b3a22f31f6d8b87110a7ef330223b31c7eadcb20 Mon Sep 17 00:00:00 2001
From: many
Date: Mon, 30 Aug 2021 13:43:41 +0200
Subject: [PATCH 0940/1889] Fix memory consumption in word pair proximity extractor

---
 .../extract/extract_word_pair_proximity_docids.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 96bd965d8..ce75c319e 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -29,7 +29,7 @@ pub fn extract_word_pair_proximity_docids(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory,
+        max_memory.map(|m| m / 2),
     );

     let mut number_of_documents = 0;

From 4860fd452965d234a11cc8430309bd9782a21bfd Mon Sep 17 00:00:00 2001
From: many
Date: Wed, 1 Sep 2021 16:24:58 +0200
Subject: [PATCH 0941/1889] Ignore empty facet values

---
 .../index_documents/extract/extract_fid_docid_facet_values.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 08f2cadf0..c46329f61 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -66,7 +66,7 @@ pub fn extract_fid_docid_facet_values(
         }

         // insert normalized and original facet string in sorter
-        for (normalized, original) in strings {
+        for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) {
             key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
             key_buffer.extend_from_slice(normalized.as_bytes());
             fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?;

From 285849e3a6df0336a8dfda723a4f78d2480f2660 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?=
Date: Thu, 2 Sep 2021 10:08:41 +0200
Subject: [PATCH 0942/1889] Update version for the next release (v0.12.0)

---
 helpers/Cargo.toml | 2 +-
 http-ui/Cargo.toml | 2 +-
 infos/Cargo.toml   | 2 +-
 milli/Cargo.toml   | 2 +-
 search/Cargo.toml  | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index d4f11b458..6cccdd7ad 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.11.0"
+version = "0.12.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index a01388a7e..aba7ef5c3 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.11.0"
+version = "0.12.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/infos/Cargo.toml b/infos/Cargo.toml
index 3d82b8605..5f95ac787 100644
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.11.0"
+version = "0.12.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 3baa2213d..102114abd 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.11.0"
+version = "0.12.0"
 authors = ["Kerollmops "]
 edition = "2018"

diff --git a/search/Cargo.toml b/search/Cargo.toml
index fd161b480..809295c4a 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.11.0" +version = "0.12.0" authors = ["Clément Renault "] edition = "2018" From db0c681baef3f0f0e90c5c5feb56cc3ec1509248 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 2 Sep 2021 15:17:52 +0200 Subject: [PATCH 0943/1889] Fix Pr comments --- milli/Cargo.toml | 1 - .../facet_string_level_zero_value_codec.rs | 3 +- .../cbo_roaring_bitmap_codec.rs | 6 ++-- .../extract/extract_fid_docid_facet_values.rs | 9 +++--- .../extract/extract_fid_word_count_docids.rs | 5 ++- .../extract/extract_word_docids.rs | 5 ++- .../extract_word_level_position_docids.rs | 6 +++- .../extract_word_pair_proximity_docids.rs | 31 +++---------------- .../src/update/index_documents/extract/mod.rs | 1 + .../index_documents/helpers/clonable_mmap.rs | 2 ++ .../index_documents/helpers/grenad_helpers.rs | 12 ++++--- .../src/update/index_documents/helpers/mod.rs | 4 --- milli/src/update/index_documents/mod.rs | 3 +- .../src/update/index_documents/typed_chunk.rs | 8 +---- 14 files changed, 38 insertions(+), 58 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index edcec4d5b..8616dcf4a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -6,7 +6,6 @@ edition = "2018" [dependencies] bstr = "0.2.15" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } concat-arrays = "0.1.2" diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs index 914d7c3cd..22031c474 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -5,6 +5,7 @@ use std::{marker, str}; use crate::error::SerializationError; use crate::heed_codec::RoaringBitmapCodec; use crate::{try_split_array_at, try_split_at, Result}; + pub type FacetStringLevelZeroValueCodec = StringValueCodec; /// A codec that encodes a string in front of a value. @@ -22,7 +23,6 @@ where fn bytes_decode(bytes: &'a [u8]) -> Option { let (string, bytes) = decode_prefix_string(bytes)?; - C::bytes_decode(bytes).map(|item| (string, item)) } } @@ -49,7 +49,6 @@ pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { let original_length = u16::from_be_bytes(original_length_bytes) as usize; let (string, bytes) = try_split_at(bytes, original_length)?; let string = str::from_utf8(string).ok()?; - Some((string, bytes)) } diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index c0e984d44..519997274 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -55,9 +55,9 @@ impl CboRoaringBitmapCodec { /// Merge serialized CboRoaringBitmaps in a buffer. /// - /// if the merged values len is under the threshold, - /// values are directly serialized in the buffer; - /// else a RoaringBitmap is created from the values and is serialized in the buffer. + /// if the merged values length is under the threshold, values are directly + /// serialized in the buffer else a RoaringBitmap is created from the + /// values and is serialized in the buffer. 
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { let mut roaring = RoaringBitmap::new(); let mut vec = Vec::new(); diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index c46329f61..a1bf0b1e3 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -58,11 +58,12 @@ pub fn extract_fid_docid_facet_values( // insert facet numbers in sorter for number in numbers { key_buffer.truncate(size_of::() + size_of::()); - let value_bytes = f64_into_bytes(number).unwrap(); // invalid float - key_buffer.extend_from_slice(&value_bytes); - key_buffer.extend_from_slice(&number.to_be_bytes()); + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); - fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + } } // insert normalized and original facet string in sorter diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index cf698507d..1fbc55714 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -8,6 +8,8 @@ use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, try_split_array_at, GrenadParameters, MergeFn, }; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::proximity::extract_position; use crate::{DocumentId, FieldId, Result}; @@ -36,7 +38,8 @@ pub fn extract_fid_word_count_docids( let mut current_document_id = None; while let Some((key, value)) = docid_word_positions.next()? { - let (document_id_bytes, _word_bytes) = try_split_array_at(key).unwrap(); + let (document_id_bytes, _word_bytes) = try_split_array_at(key) + .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let curr_document_id = *current_document_id.get_or_insert(document_id); diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 8ca8e39eb..6d99fda44 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -8,6 +8,8 @@ use super::helpers::{ create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, try_split_array_at, GrenadParameters, }; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::Result; /// Extracts the word and the documents ids where this word appear. @@ -31,7 +33,8 @@ pub fn extract_word_docids( let mut value_buffer = Vec::new(); while let Some((key, _value)) = docid_word_positions.next()? 
{ - let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap(); + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let bitmap = RoaringBitmap::from_iter(Some(document_id)); diff --git a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs index e099b0b49..04cedf5c7 100644 --- a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs @@ -5,7 +5,10 @@ use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, try_split_array_at, GrenadParameters, }; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::{DocumentId, Result}; + /// Extracts the word positions and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words at positions and @@ -27,7 +30,8 @@ pub fn extract_word_level_position_docids( let mut key_buffer = Vec::new(); while let Some((key, value)) = docid_word_positions.next()? { - let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap(); + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); for position in read_u32_ne_bytes(value) { diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index ce75c319e..982799a65 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -1,15 +1,14 @@ use std::cmp::Ordering; use std::collections::{BinaryHeap, HashMap}; use std::fs::File; -use std::time::{Duration, Instant}; use std::{cmp, io, mem, str, vec}; -use log::debug; - use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, try_split_array_at, GrenadParameters, MergeFn, }; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::proximity::{positions_proximity, MAX_DISTANCE}; use crate::{DocumentId, Result}; @@ -32,16 +31,13 @@ pub fn extract_word_pair_proximity_docids( max_memory.map(|m| m / 2), ); - let mut number_of_documents = 0; - let mut total_time_aggregation = Duration::default(); - let mut total_time_grenad_insert = Duration::default(); - // This map is assumed to not consume a lot of memory. let mut document_word_positions_heap = BinaryHeap::new(); let mut current_document_id = None; while let Some((key, value)) = docid_word_positions.next()? 
{
-        let (document_id_bytes, word_bytes) = try_split_array_at(key).unwrap();
+        let (document_id_bytes, word_bytes) = try_split_array_at(key)
+            .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
         let word = str::from_utf8(word_bytes)?;
@@ -52,10 +48,7 @@ pub fn extract_word_pair_proximity_docids(
                 curr_document_id,
                 document_word_positions_heap,
                 &mut word_pair_proximity_docids_sorter,
-                &mut total_time_aggregation,
-                &mut total_time_grenad_insert,
             )?;
-            number_of_documents += 1;
             current_document_id = Some(document_id);
         }
@@ -74,18 +67,9 @@ pub fn extract_word_pair_proximity_docids(
             document_id,
             document_word_positions_heap,
             &mut word_pair_proximity_docids_sorter,
-            &mut total_time_aggregation,
-            &mut total_time_grenad_insert,
         )?;
     }
-    debug!(
-        "Number of documents {}
-        - we took {:02?} to aggregate proximities
-        - we took {:02?} to grenad insert those proximities",
-        number_of_documents, total_time_aggregation, total_time_grenad_insert,
-    );
-
     sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
 }
@@ -97,10 +81,7 @@ fn document_word_positions_into_sorter<'b>(
     document_id: DocumentId,
     mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
     word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
-    total_time_aggregation: &mut Duration,
-    total_time_grenad_insert: &mut Duration,
 ) -> Result<()> {
-    let before_aggregating = Instant::now();
     let mut word_pair_proximity = HashMap::new();
     let mut ordered_peeked_word_positions = Vec::new();
     while !word_positions_heap.is_empty() {
@@ -152,8 +133,6 @@ fn document_word_positions_into_sorter<'b>(
         }
     }
-    *total_time_aggregation += before_aggregating.elapsed();
-
     let mut key_buffer = Vec::new();
     for ((w1, w2), prox) in word_pair_proximity {
         key_buffer.clear();
@@ -162,9 +141,7 @@ fn document_word_positions_into_sorter<'b>(
         key_buffer.extend_from_slice(w2.as_bytes());
         key_buffer.push(prox as u8);
-        let before_grenad_insert = Instant::now();
         word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
-        *total_time_grenad_insert += before_grenad_insert.elapsed();
     }
     Ok(())
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 04c57b0fa..bb49e3e51 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -225,5 +225,6 @@ fn extract_documents_data(
                 Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk))
             },
         );
+
     Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
 }
diff --git a/milli/src/update/index_documents/helpers/clonable_mmap.rs b/milli/src/update/index_documents/helpers/clonable_mmap.rs
index b16c080ff..691d10593 100644
--- a/milli/src/update/index_documents/helpers/clonable_mmap.rs
+++ b/milli/src/update/index_documents/helpers/clonable_mmap.rs
@@ -2,6 +2,8 @@ use std::sync::Arc;

 use memmap::Mmap;

+/// Wrapper around Mmap allowing to virtually clone grenad chunks
+/// in a parallel process like the indexing.
 #[derive(Debug, Clone)]
 pub struct ClonableMmap {
     inner: Arc<Mmap>,
diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index 9dd261f73..1dfaaf945 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -3,7 +3,6 @@ use std::fs::File;
 use std::io::{self, Seek, SeekFrom};
 use std::time::Instant;

-use byte_unit::Byte;
 use grenad::{CompressionType, MergerIter, Reader, Sorter};
 use heed::types::ByteSlice;
 use log::debug;
@@ -113,6 +112,9 @@ impl Default for GrenadParameters {
 }

 impl GrenadParameters {
+    /// This function uses the number of threads in the current thread pool to compute the value.
+    /// It should be called from inside a rayon thread pool,
+    /// otherwise it will use the global number of threads.
     pub fn max_memory_by_thread(&self) -> Option<usize> {
         self.max_memory.map(|max_memory| max_memory / rayon::current_num_threads())
     }
@@ -128,7 +130,7 @@ pub fn grenad_obkv_into_chunks(
     mut reader: grenad::Reader<R>,
     indexer: GrenadParameters,
     log_frequency: Option<usize>,
-    documents_chunk_size: Byte,
+    documents_chunk_size: usize,
 ) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
     let mut document_count = 0;
     let mut continue_reading = true;
@@ -157,7 +159,7 @@ pub fn grenad_obkv_into_chunks(
             debug!("reached {} chunked documents", document_count);
         }

-        if current_chunk_size >= documents_chunk_size.get_bytes() {
+        if current_chunk_size >= documents_chunk_size as u64 {
             return writer_into_reader(obkv_documents).map(Some);
         }
     }
@@ -170,8 +172,8 @@ pub fn grenad_obkv_into_chunks(
         let result = transposer().transpose();
         if result.as_ref().map_or(false, |r| r.is_ok()) {
             debug!(
-                "A new chunk of approximately {} has been generated",
-                documents_chunk_size.get_appropriate_unit(true),
+                "A new chunk of approximately {:.2} MiB has been generated",
+                documents_chunk_size as f64 / 1024.0 / 1024.0,
             );
         }
         result
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index baacb0a1b..3f38d4f25 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -40,10 +40,6 @@ where
     Some((head, tail))
 }

-// pub fn pretty_thousands<A: Borrow<T>, T: fmt::Display>(number: A) -> String {
-//     thousands::Separable::separate_with_spaces(number.borrow())
-// }
-
 pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
     bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
 }
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 98b0aa80e..b27f2042f 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -9,7 +9,6 @@ use std::iter::FromIterator;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::time::Instant;

-use byte_unit::Byte;
 use chrono::Utc;
 use crossbeam_channel::{Receiver, Sender};
 use grenad::{self, CompressionType};
@@ -252,7 +251,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                 documents_file,
                 params.clone(),
                 self.log_every_n,
-                Byte::from_bytes(self.documents_chunk_size.unwrap_or(1024 * 1024 * 128) as u64), // 128MiB
+                self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB
             );
             let result = chunk_iter.map(|chunk_iter| {
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index c3c71bbf4..5f28034fe 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@
-1,3 +1,4 @@ +use std::borrow::Cow; use std::fs::File; use heed::types::ByteSlice; @@ -188,8 +189,6 @@ fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec Ok(serialize_roaring_bitmap(&value, buffer)?) } -use std::borrow::Cow; - fn merge_cbo_roaring_bitmaps( new_value: &[u8], db_value: &[u8], @@ -199,11 +198,6 @@ fn merge_cbo_roaring_bitmaps( &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], buffer, )?) - - // let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; - // let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; - // let value = new_value | db_value; - // Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) } /// Write provided entries in database using serialize_value function. From 7f7fafb8579d52aa3ae954d44395e115c093ffb0 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 2 Sep 2021 15:25:39 +0200 Subject: [PATCH 0944/1889] Make document_chunk_size settable from update builder --- milli/src/update/index_documents/mod.rs | 1 + milli/src/update/settings.rs | 3 +++ milli/src/update/update_builder.rs | 8 ++++++++ 3 files changed, 12 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b27f2042f..e4c798163 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -281,6 +281,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { log_every_n: self.log_every_n, max_nb_chunks: self.max_nb_chunks, max_memory: self.max_memory, + documents_chunk_size: self.documents_chunk_size, chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, thread_pool: self.thread_pool, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ef23286ae..f1b3e2628 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -65,6 +65,7 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) log_every_n: Option, pub(crate) max_nb_chunks: Option, pub(crate) max_memory: Option, + pub(crate) documents_chunk_size: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, @@ -93,6 +94,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { log_every_n: None, max_nb_chunks: None, max_memory: None, + documents_chunk_size: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, thread_pool: None, @@ -227,6 +229,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.log_every_n = self.log_every_n; indexing_builder.max_nb_chunks = self.max_nb_chunks; indexing_builder.max_memory = self.max_memory; + indexing_builder.documents_chunk_size = self.documents_chunk_size; indexing_builder.chunk_compression_type = self.chunk_compression_type; indexing_builder.chunk_compression_level = self.chunk_compression_level; indexing_builder.thread_pool = self.thread_pool; diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 6035499b3..561c4bc50 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -7,6 +7,7 @@ use crate::{Index, Result}; pub struct UpdateBuilder<'a> { pub(crate) log_every_n: Option, pub(crate) max_nb_chunks: Option, + pub(crate) documents_chunk_size: Option, pub(crate) max_memory: Option, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, @@ -19,6 +20,7 @@ impl<'a> UpdateBuilder<'a> { UpdateBuilder { log_every_n: None, 
max_nb_chunks: None, + documents_chunk_size: None, max_memory: None, chunk_compression_type: CompressionType::None, chunk_compression_level: None, @@ -39,6 +41,10 @@ impl<'a> UpdateBuilder<'a> { self.max_memory = Some(max_memory); } + pub fn documents_chunk_size(&mut self, documents_chunk_size: usize) { + self.documents_chunk_size = Some(documents_chunk_size); + } + pub fn chunk_compression_type(&mut self, chunk_compression_type: CompressionType) { self.chunk_compression_type = chunk_compression_type; } @@ -77,6 +83,7 @@ impl<'a> UpdateBuilder<'a> { builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; + builder.documents_chunk_size = self.documents_chunk_size; builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; @@ -94,6 +101,7 @@ impl<'a> UpdateBuilder<'a> { builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; builder.max_memory = self.max_memory; + builder.documents_chunk_size = self.documents_chunk_size; builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; From 741a4444a9b8eb115db6585fcc4b2ecefc9ba52c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 2 Sep 2021 16:57:46 +0200 Subject: [PATCH 0945/1889] Remove log in chunk generator --- .../index_documents/helpers/grenad_helpers.rs | 18 +----------------- milli/src/update/index_documents/mod.rs | 1 - milli/tests/search/mod.rs | 2 +- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 1dfaaf945..fbdf2b42e 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -129,10 +129,8 @@ impl GrenadParameters { pub fn grenad_obkv_into_chunks( mut reader: grenad::Reader, indexer: GrenadParameters, - log_frequency: Option, documents_chunk_size: usize, ) -> Result>>> { - let mut document_count = 0; let mut continue_reading = true; let indexer_clone = indexer.clone(); @@ -154,11 +152,6 @@ pub fn grenad_obkv_into_chunks( obkv_documents.insert(document_id, obkv)?; current_chunk_size += document_id.len() as u64 + obkv.len() as u64; - document_count += 1; - if log_frequency.map_or(false, |log_frequency| document_count % log_frequency == 0) { - debug!("reached {} chunked documents", document_count); - } - if current_chunk_size >= documents_chunk_size as u64 { return writer_into_reader(obkv_documents).map(Some); } @@ -168,16 +161,7 @@ pub fn grenad_obkv_into_chunks( writer_into_reader(obkv_documents).map(Some) }; - Ok(std::iter::from_fn(move || { - let result = transposer().transpose(); - if result.as_ref().map_or(false, |r| r.is_ok()) { - debug!( - "A new chunk of approximately {:.2} MiB has been generated", - documents_chunk_size as f64 / 1024.0 / 1024.0, - ); - } - result - })) + Ok(std::iter::from_fn(move || transposer().transpose())) } pub fn write_into_lmdb_database( diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e4c798163..7800ae55a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -250,7 +250,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let chunk_iter = grenad_obkv_into_chunks( documents_file, 
params.clone(), - self.log_every_n, self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB ); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index a533a4cbe..0fbc0e1b6 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -5,7 +5,7 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::update::{IndexDocuments, Settings, UpdateBuilder, UpdateFormat}; +use milli::update::{Settings, UpdateBuilder, UpdateFormat}; use milli::{AscDesc, Criterion, DocumentId, Index}; use serde::Deserialize; use slice_group_by::GroupBy; From c2517e7d5f277e24b9dd83cedf571e6a04d22f68 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Thu, 2 Sep 2021 19:41:19 +0300 Subject: [PATCH 0946/1889] fix(facet): string fields sorting --- milli/src/search/facet/facet_string.rs | 91 +++++++++++++------------- 1 file changed, 47 insertions(+), 44 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 747b7fd3c..c55430cf1 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -243,24 +243,27 @@ impl<'t> Iterator for FacetStringGroupRevRange<'t> { type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), - Err(e) => Some(Err(e)), + loop { + match self.iter.next() { + Some(Ok(((_fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok((bounds, docids)) => { + return Some(Ok(((level, left, right), (bounds, docids)))) + } + Err(e) => return Some(Err(e)), + } } - } else { - None + continue; } + Some(Err(e)) => return Some(Err(e)), + None => return None, } - Some(Err(e)) => Some(Err(e)), - None => None, } } } @@ -545,18 +548,18 @@ impl<'t> Iterator for FacetStringIter<'t> { // the algorithm less complex to understand. 
let last = match last { Left(ascending) => match ascending { - Left(last) => Left(Left(last)), - Right(last) => Right(Left(last)), + Left(group) => Left(Left(group)), + Right(zero_level) => Right(Left(zero_level)), }, Right(descending) => match descending { - Left(last) => Left(Right(last)), - Right(last) => Right(Right(last)), + Left(group) => Left(Right(group)), + Right(zero_level) => Right(Right(zero_level)), }, }; match last { - Left(last) => { - for result in last { + Left(group) => { + for result in group { match result { Ok(((level, left, right), (string_bounds, mut docids))) => { docids &= &*documents_ids; @@ -566,6 +569,27 @@ impl<'t> Iterator for FacetStringIter<'t> { } let result = if is_ascending { + match string_bounds { + Some((left, right)) => FacetStringLevelZeroRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right), + None => FacetStringGroupRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + } + .map(Left) + } else { match string_bounds { Some((left, right)) => { FacetStringLevelZeroRevRange::new( @@ -588,27 +612,6 @@ impl<'t> Iterator for FacetStringIter<'t> { .map(Left), } .map(Right) - } else { - match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Left) }; match result { @@ -624,9 +627,9 @@ impl<'t> Iterator for FacetStringIter<'t> { } } } - Right(last) => { + Right(zero_level) => { // level zero only - for result in last { + for result in zero_level { match result { Ok((normalized, original, mut docids)) => { docids &= &*documents_ids; From 0be09555f1ec7724ac2789618d0f5c1132a30001 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Fri, 3 Sep 2021 13:00:48 +0300 Subject: [PATCH 0947/1889] test(search): asc/desc criteria for large datasets --- milli/tests/search/query_criteria.rs | 101 ++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 1723c1d6f..c9720d652 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,6 +1,12 @@ +use std::cmp::Reverse; + use big_s::S; -use milli::update::Settings; -use milli::{AscDesc, Criterion, Search, SearchResult}; +use heed::EnvOpenOptions; +use itertools::Itertools; +use maplit::hashset; +use milli::update::{Settings, UpdateBuilder, UpdateFormat}; +use milli::{AscDesc, Criterion, Index, Search, SearchResult}; +use rand::Rng; use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -9,6 +15,7 @@ const ALLOW_TYPOS: bool = true; const DISALLOW_TYPOS: bool = false; const ALLOW_OPTIONAL_WORDS: bool = true; const DISALLOW_OPTIONAL_WORDS: bool = false; +const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000; macro_rules! 
test_criterion { ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => { @@ -357,3 +364,93 @@ fn criteria_mixup() { assert_eq!(documents_ids, expected_external_ids); } } + +#[test] +fn criteria_ascdesc() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut builder = Settings::new(&mut wtxn, &index, 0); + + builder.set_sortable_fields(hashset! { + S("name"), + S("age"), + }); + builder.execute(|_, _| ()).unwrap(); + + // index documents + let mut builder = UpdateBuilder::new(0); + builder.max_memory(10 * 1024 * 1024); // 10MiB + let mut builder = builder.index_documents(&mut wtxn, &index); + builder.update_format(UpdateFormat::Csv); + builder.enable_autogenerate_docids(); + + let content = [ + vec![S("name,age")], + (0..ASC_DESC_CANDIDATES_THRESHOLD + 1) + .map(|_| { + let mut rng = rand::thread_rng(); + + let age = rng.gen::().to_string(); + let name = rng + .sample_iter(&rand::distributions::Alphanumeric) + .map(char::from) + .filter(|c| *c >= 'a' && *c <= 'z') + .take(10) + .collect::(); + + format!("{},{}", name, age) + }) + .collect::>(), + ] + .iter() + .flatten() + .join("\n"); + builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let documents = index.all_documents(&rtxn).unwrap().map(|doc| doc.unwrap()).collect::>(); + + for criterion in [Asc(S("name")), Desc(S("name")), Asc(S("age")), Desc(S("age"))] { + eprintln!("Testing with criterion: {:?}", &criterion); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_criteria(vec![criterion.to_string()]); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.limit(ASC_DESC_CANDIDATES_THRESHOLD + 1); + + let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + + let expected_document_ids = match criterion { + Asc(field_name) if field_name == "name" => { + documents.iter().sorted_by_key(|(_, obkv)| obkv.get(0).unwrap()) + } + Desc(field_name) if field_name == "name" => { + documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(0).unwrap())) + } + Asc(field_name) if field_name == "age" => { + documents.iter().sorted_by_key(|(_, obkv)| obkv.get(1).unwrap()) + } + Desc(field_name) if field_name == "age" => { + documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(1).unwrap())) + } + _ => continue, + } + .map(|(id, _)| *id) + .collect::<Vec<_>>(); + + assert_eq!(documents_ids, expected_document_ids); + } +} From 8dca36433c67c862b1b41ce68695d05a00f75dcb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 7 Sep 2021 10:37:57 +0200 Subject: [PATCH 0948/1889] Introduce the new SortRankingRuleMissing user error variant --- milli/src/error.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/milli/src/error.rs b/milli/src/error.rs index 9bda74631..56028f742 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -59,6 +59,7 @@ pub enum UserError { InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), InvalidSortableAttribute { field: String, valid_fields: HashSet<String> }, + SortRankingRuleMissing, InvalidStoreFile, MaxDatabaseSizeReached, MissingDocumentId { document: Object }, @@ -236,6 +237,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco field, valid_names ) } + Self::SortRankingRuleMissing => f.write_str( + "The sort ranking rule must be specified in the \ + ranking rules settings to use the sort parameter at search time", + ), Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); write!(f, "document doesn't have an identifier {}", json) From fd3daa442362d23d9db477665faa6437fdb77593 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 7 Sep 2021 10:52:19 +0200 Subject: [PATCH 0949/1889] Throw a query time error when a sort param is used but sort ranking rule is missing --- milli/src/search/mod.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 56002b2e3..207f46f8a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -18,7 +18,7 @@ pub(crate) use self::facet::ParserRule; pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; -use crate::criterion::AscDesc; +use crate::criterion::{AscDesc, Criterion}; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{DocumentId, Index, Result}; @@ -159,6 +159,14 @@ impl<'a> Search<'a> { } } + + // We check that the sort ranking rule exists and throw an + // error if we try to use it while it is missing.
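+ // For example: ranking rules like ["words", "typo", "proximity"] combined with a + // search that sets sort criteria make execute() return + // Err(UserError::SortRankingRuleMissing) rather than silently ignoring the sort.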
+ let sort_ranking_rule_missing = !self.index.criteria(self.rtxn)?.contains(&Criterion::Sort); + let empty_sort_criteria = self.sort_criteria.as_ref().map_or(true, |s| s.is_empty()); + if sort_ranking_rule_missing && !empty_sort_criteria { + return Err(UserError::SortRankingRuleMissing.into()); + } + let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; let criteria = criteria_builder.build( query_tree, From 5989528833d1f70baf49a288f566a09555ff052c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 7 Sep 2021 11:01:37 +0200 Subject: [PATCH 0950/1889] Add a test to make sure we throw the right error message --- milli/tests/search/mod.rs | 1 + milli/tests/search/sort.rs | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 milli/tests/search/sort.rs diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 0fbc0e1b6..c34434c4e 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -13,6 +13,7 @@ use slice_group_by::GroupBy; mod distinct; mod filters; mod query_criteria; +mod sort; pub const TEST_QUERY: &'static str = "hello world america"; diff --git a/milli/tests/search/sort.rs b/milli/tests/search/sort.rs new file mode 100644 index 000000000..fe87f0623 --- /dev/null +++ b/milli/tests/search/sort.rs @@ -0,0 +1,23 @@ +use big_s::S; +use milli::Criterion::{Attribute, Exactness, Proximity, Typo, Words}; +use milli::{AscDesc, Error, Search, UserError}; + +use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; + +#[test] +fn sort_ranking_rule_missing() { + let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; + // sortables: `tag` and `asc_desc_rank` + let index = search::setup_search_index_with_criteria(&criteria); + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.authorize_typos(true); + search.optional_words(true); + search.sort_criteria(vec![AscDesc::Asc(S("tag"))]); + + let result = search.execute(); + assert!(matches!(result, Err(Error::UserError(UserError::SortRankingRuleMissing)))); +} From cd043d4461d5a0d7f1ac1e5b8b5e16376ceaa2f5 Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 7 Sep 2021 16:21:46 +0200 Subject: [PATCH 0951/1889] remove unused grenad default features --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 0ebea179f..795dd5438 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -15,7 +15,7 @@ either = "1.6.1" flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" -grenad = "0.3.0" +grenad = { version = "0.3.0", default-features = false } heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } From e2cefc9b4f760eba09327d2bcdcab377f9e5e51d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 7 Sep 2021 16:11:44 +0200 Subject: [PATCH 0952/1889] Move the sort ranking rule before the exactness ranking rule --- milli/src/criterion.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 47eb7c7dc..209a71b0d 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -12,14 +12,14 @@ pub enum Criterion { Words, /// Sorted by increasing number of typos. Typo, - /// Dynamically sort at query time the documents. 
None, one or multiple Asc/Desc sortable - /// attributes can be used in place of this criterion at query time. - Sort, /// Sorted by increasing distance between matched query terms. Proximity, /// Documents with quey words contained in more important /// attributes are considered better. Attribute, + /// Dynamically sort at query time the documents. None, one or multiple Asc/Desc sortable + /// attributes can be used in place of this criterion at query time. + Sort, /// Sorted by the similarity of the matched words with the query words. Exactness, /// Sorted by the increasing value of the field specified. @@ -45,9 +45,9 @@ impl FromStr for Criterion { match text { "words" => Ok(Criterion::Words), "typo" => Ok(Criterion::Typo), - "sort" => Ok(Criterion::Sort), "proximity" => Ok(Criterion::Proximity), "attribute" => Ok(Criterion::Attribute), + "sort" => Ok(Criterion::Sort), "exactness" => Ok(Criterion::Exactness), text => match AscDesc::from_str(text) { Ok(AscDesc::Asc(field)) => Ok(Criterion::Asc(field)), @@ -89,9 +89,9 @@ pub fn default_criteria() -> Vec { vec![ Criterion::Words, Criterion::Typo, - Criterion::Sort, Criterion::Proximity, Criterion::Attribute, + Criterion::Sort, Criterion::Exactness, ] } @@ -103,9 +103,9 @@ impl fmt::Display for Criterion { match self { Words => f.write_str("words"), Typo => f.write_str("typo"), - Sort => f.write_str("sort"), Proximity => f.write_str("proximity"), Attribute => f.write_str("attribute"), + Sort => f.write_str("sort"), Exactness => f.write_str("exactness"), Asc(attr) => write!(f, "{}:asc", attr), Desc(attr) => write!(f, "{}:desc", attr), From dbd91e715194556ade2b3e596daeaba28551d343 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Tue, 7 Sep 2021 16:35:16 +0300 Subject: [PATCH 0953/1889] chore(ci): use smarter dependencies cache --- .github/workflows/rust.yml | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9aeb7e041..b5335d799 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -22,19 +22,13 @@ jobs: - nightly steps: - uses: actions/checkout@v2 - - name: Cache dependencies - uses: actions/cache@v2 - with: - path: | - ~/.cargo - ./Cargo.lock - ./target - key: ${{ matrix.os }}-${{ matrix.rust }}-${{ hashFiles('Cargo.toml') }} - uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: ${{ matrix.rust }} override: true + - name: Cache dependencies + uses: Swatinem/rust-cache@v1.3.0 - name: Run cargo check uses: actions-rs/cargo@v1 with: @@ -51,20 +45,14 @@ jobs: runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v2 - - name: Cache dependencies - uses: actions/cache@v2 - with: - path: | - ~/.cargo - ./Cargo.lock - ./target - key: ${{ matrix.os }}-${{ matrix.rust}}-${{ hashFiles('Cargo.toml') }} - uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: nightly override: true components: rustfmt + - name: Cache dependencies + uses: Swatinem/rust-cache@v1.3.0 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. 
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate From eb7b9d9dbf0bc5e3e7b9fdeeded8647ed65010ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 8 Sep 2021 10:59:30 +0200 Subject: [PATCH 0954/1889] Update version for the next release (v0.13.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 6cccdd7ad..c319bfdab 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.12.0" +version = "0.13.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index aba7ef5c3..a05050f41 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.12.0" +version = "0.13.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 5f95ac787..bf1b24a05 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.12.0" +version = "0.13.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 795dd5438..92d59c5f6 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.12.0" +version = "0.13.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 809295c4a..c7de7f03f 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.12.0" +version = "0.13.0" authors = ["Clément Renault "] edition = "2018" From 9961b78b0630202aad2e62285def036325081802 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 8 Sep 2021 13:30:26 +0200 Subject: [PATCH 0955/1889] Drop sorter before creating a new one --- milli/src/update/index_documents/transform.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 7bfaa6ecd..e8b61ef14 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -383,6 +383,9 @@ impl Transform<'_, '_> { let mut field_distribution = self.index.field_distribution(self.rtxn)?; let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); + // consume sorter, in order to free the internal allocation, before creating a new one. + let mut iter = sorter.into_merger_iter()?; + // Once we have sort and deduplicated the documents we write them into a final file. let mut final_sorter = create_sorter( |_id, obkvs| { @@ -404,7 +407,6 @@ impl Transform<'_, '_> { // While we write into final file we get or generate the internal documents ids. let mut documents_count = 0; - let mut iter = sorter.into_merger_iter()?; while let Some((external_id, update_obkv)) = iter.next()? 
{ if self.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { From 20ad43b9081ddc9e0ca52ed4c85900d2fce09f59 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Sep 2021 14:06:28 +0200 Subject: [PATCH 0956/1889] Enable the grenad tempfile feature back --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 92d59c5f6..d5a2e2978 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -15,7 +15,7 @@ either = "1.6.1" flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" -grenad = { version = "0.3.0", default-features = false } +grenad = { version = "0.3.0", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } From 8a088fb99e47750460d854368a3f2825110b0df8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Sep 2021 14:08:55 +0200 Subject: [PATCH 0957/1889] Bump grenad to v0.3.1 --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d5a2e2978..c6fe3ea95 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -15,7 +15,7 @@ either = "1.6.1" flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" -grenad = { version = "0.3.0", default-features = false, features = ["tempfile"] } +grenad = { version = "0.3.1", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } From 68856e5e2f9dd145b72a99486265b3bc0095ee75 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Sep 2021 14:10:39 +0200 Subject: [PATCH 0958/1889] Disable the default snappy compression for the http-ui crate --- http-ui/src/main.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index fd7dd37de..108ec0549 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -103,8 +103,8 @@ pub struct IndexerOpt { /// chunks during indexing documents. /// /// Choosing a fast algorithm will make the indexing faster but may consume more memory. - #[structopt(long, default_value = "snappy", possible_values = &["snappy", "zlib", "lz4", "lz4hc", "zstd"])] - pub chunk_compression_type: CompressionType, + #[structopt(long, possible_values = &["snappy", "zlib", "lz4", "lz4hc", "zstd"])] + pub chunk_compression_type: Option<CompressionType>, /// The level of compression of the chosen algorithm. #[structopt(long, requires = "chunk-compression-type")] @@ -343,7 +343,9 @@ async fn main() -> anyhow::Result<()> { update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap()); update_builder.log_every_n(indexer_opt_cloned.log_every_n); update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); - update_builder.chunk_compression_type(indexer_opt_cloned.chunk_compression_type); + update_builder.chunk_compression_type( + indexer_opt_cloned.chunk_compression_type.unwrap_or(CompressionType::None), + ); let before_update = Instant::now(); // we extract the update type and execute the update itself.
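A note on the patch above: with `default_value = "snappy"` gone and the field now an `Option`, an absent `--chunk-compression-type` flag resolves to no compression at all instead of snappy. A minimal sketch of that resolution, using the `CompressionType` enum visible above (assumed to be grenad's; the helper name is ours):

    use grenad::CompressionType;

    /// Resolve the optional CLI flag: a missing flag now means
    /// "write the chunks uncompressed" rather than defaulting to snappy.
    fn effective_compression(flag: Option<CompressionType>) -> CompressionType {
        flag.unwrap_or(CompressionType::None)
    }

The `requires = "chunk-compression-type"` attribute still guarantees that a compression level can only be given together with an explicit algorithm.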
From d18ee58ab99b8dea7d632576cab8acf1e9a78926 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 8 Sep 2021 15:24:32 +0200 Subject: [PATCH 0959/1889] Check if keys are not empty in validator --- milli/src/update/index_documents/helpers/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 3f38d4f25..128288982 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -17,7 +17,7 @@ pub use merge_functions::{ }; pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { - key.as_ref().len() <= 511 + key.as_ref().len() <= 511 && !key.as_ref().is_empty() } /// Divides one slice into two at an index, returns `None` if mid is out of bounds. From e54280fbfc86b2c5b84361026e8a811715a5f347 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 8 Sep 2021 15:24:52 +0200 Subject: [PATCH 0960/1889] Skip empty normalized words --- .../extract/extract_docid_word_positions.rs | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 894a193bf..ca65f0874 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -67,14 +67,17 @@ pub fn extract_docid_word_positions( for (index, token) in tokens { let token = token.text().trim(); - key_buffer.truncate(mem::size_of::<u32>()); - key_buffer.extend_from_slice(token.as_bytes()); + if !token.is_empty() { + key_buffer.truncate(mem::size_of::<u32>()); + key_buffer.extend_from_slice(token.as_bytes()); - let position: u32 = index - .try_into() - .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let position = field_id as u32 * ONE_ATTRIBUTE + position; - docid_word_positions_sorter.insert(&key_buffer, &position.to_ne_bytes())?; + let position: u32 = index + .try_into() + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + let position = field_id as u32 * ONE_ATTRIBUTE + position; + docid_word_positions_sorter + .insert(&key_buffer, &position.to_ne_bytes())?; + } } } } From 932998f5ccdd909a44115f8355731a4557d59535 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 7 Sep 2021 11:41:25 +0200 Subject: [PATCH 0961/1889] let the caller decide if they want to return an invalidSortName or an invalidCriterionName error --- milli/src/criterion.rs | 11 +++++++++-- milli/src/error.rs | 8 ++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 209a71b0d..d91d4a7e1 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -52,7 +52,12 @@ impl FromStr for Criterion { text => match AscDesc::from_str(text) { Ok(AscDesc::Asc(field)) => Ok(Criterion::Asc(field)), Ok(AscDesc::Desc(field)) => Ok(Criterion::Desc(field)), - Err(error) => Err(error.into()), + Err(UserError::InvalidAscDescSyntax { name }) => { + Err(UserError::InvalidCriterionName { name }.into()) + } + Err(error) => { + Err(UserError::InvalidCriterionName { name: error.to_string() }.into()) + } }, } } @@ -76,11 +81,13 @@ impl AscDesc { impl FromStr for AscDesc { type Err = UserError; + /// Since we don't know if this was deserialized for a criterion or a sort, we just return a + /// string and let the caller create its own error fn from_str(text: &str) -> Result<Self, Self::Err> { match 
text.rsplit_once(':') { Some((field_name, "asc")) => Ok(AscDesc::Asc(field_name.to_string())), Some((field_name, "desc")) => Ok(AscDesc::Desc(field_name.to_string())), - _ => Err(UserError::InvalidCriterionName { name: text.to_string() }), + _ => Err(UserError::InvalidAscDescSyntax { name: text.to_string() }), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 56028f742..3f473a673 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -53,11 +53,13 @@ pub enum UserError { AttributeLimitReached, Csv(csv::Error), DocumentLimitReached, + InvalidAscDescSyntax { name: String }, InvalidCriterionName { name: String }, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), + InvalidSortName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, @@ -216,6 +218,9 @@ impl fmt::Display for UserError { ) } Self::InvalidFilter(error) => error.fmt(f), + Self::InvalidAscDescSyntax { name } => { + write!(f, "invalid asc/desc syntax for {}", name) + } Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); @@ -228,6 +233,9 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco ) } Self::InvalidFilterAttribute(error) => error.fmt(f), + Self::InvalidSortName { name } => { + write!(f, "Invalid syntax for the sort parameter: {}", name) + } Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); From 8d9c2c4425de329fb593ec2b40fed545b15e03fa Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 23 Aug 2021 16:32:11 +0200 Subject: [PATCH 0962/1889] create a new db with getters and setters --- milli/Cargo.toml | 1 + milli/src/index.rs | 30 ++++++++++++++++++++++++++++-- milli/src/lib.rs | 1 + 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index c6fe3ea95..d2767afd4 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -27,6 +27,7 @@ once_cell = "1.5.2" ordered-float = "2.1.1" rayon = "1.5.0" roaring = "0.6.6" +rstar = { version = "0.9.1", features = ["serde"] } serde = { version = "1.0.123", features = ["derive"] } serde_json = { version = "1.0.62", features = ["preserve_order"] } slice-group-by = "0.2.6" diff --git a/milli/src/index.rs b/milli/src/index.rs index f3a2a3e05..d2b4598d3 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -8,6 +8,7 @@ use heed::flags::Flags; use heed::types::*; use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use roaring::RoaringBitmap; +use rstar::RTree; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; @@ -18,8 +19,8 @@ use crate::heed_codec::facet::{ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, - FieldIdWordCountCodec, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, - StrLevelPositionCodec, StrStrU8Codec, BEU32, + FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, + Search, StrLevelPositionCodec, StrStrU8Codec, BEU32, }; pub mod main_key { @@ -31,6 +32,7 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub 
const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const GEO_RTREE_KEY: &str = "geo"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -294,6 +296,30 @@ impl Index { .unwrap_or_default()) } + /* geo rtree */ + + pub(crate) fn put_geo_rtree>( + &self, + wtxn: &mut RwTxn, + rtree: &RTree<GeoPoint>, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<RTree<GeoPoint>>>(wtxn, main_key::GEO_RTREE_KEY, rtree) + } + + pub(crate) fn delete_geo_rtree(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { + self.main.delete::<_, Str>(wtxn, main_key::GEO_RTREE_KEY) + } + + pub fn geo_rtree<'t>(&self, rtxn: &'t RoTxn) -> Result<Option<RTree<GeoPoint>>> { + match self + .main + .get::<_, Str, SerdeBincode<RTree<GeoPoint>>>(rtxn, main_key::GEO_RTREE_KEY)? + { + Some(rtree) => Ok(Some(rtree)), + None => Ok(None), + } + } + /* field distribution */ /// Writes the field distribution which associates every field name with diff --git a/milli/src/lib.rs b/milli/src/lib.rs index af811fe08..2a55b6f3a 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -51,6 +51,7 @@ pub type DocumentId = u32; pub type FieldId = u16; pub type Position = u32; pub type FieldDistribution = BTreeMap<String, u64>; +pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>; /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( From 44d6b6ae9e119b51c402bba3300c076e9b952bf4 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 23 Aug 2021 18:41:48 +0200 Subject: [PATCH 0963/1889] Index the geo points --- milli/src/index.rs | 2 +- .../extract/extract_geo_points.rs | 46 +++++++++++++++++++ .../src/update/index_documents/extract/mod.rs | 15 +++++- milli/src/update/index_documents/mod.rs | 3 ++ .../src/update/index_documents/typed_chunk.rs | 22 ++++++++- 5 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_geo_points.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index d2b4598d3..70aefa9be 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -298,7 +298,7 @@ impl Index { /* geo rtree */ - pub(crate) fn put_geo_rtree>( + pub(crate) fn put_geo_rtree( &self, wtxn: &mut RwTxn, rtree: &RTree<GeoPoint>, diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs new file mode 100644 index 000000000..9f6e43199 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -0,0 +1,46 @@ +use std::fs::File; +use std::io; + +use concat_arrays::concat_arrays; +use log::warn; +use serde_json::Value; + +use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; +use crate::{FieldId, InternalError, Result}; + +/// Extracts the geographical coordinates contained in each document under the `_geo` field. +/// +/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude) +pub fn extract_geo_points<R: io::Read>( + mut obkv_documents: grenad::Reader<R>, + indexer: GrenadParameters, + geo_field_id: Option<FieldId>, // make an empty grenad +) -> Result<grenad::Reader<File>> { + let mut writer = tempfile::tempfile().and_then(|file| { + create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) + })?; + + // we never encountered any documents with a `_geo` field. 
We can skip entirely this step + if geo_field_id.is_none() { + return Ok(writer_into_reader(writer)?); + } + let geo_field_id = geo_field_id.unwrap(); + + while let Some((docid_bytes, value)) = obkv_documents.next()? { + let obkv = obkv::KvReader::new(value); + let point = obkv.get(geo_field_id).unwrap(); // TODO: TAMO where should we handle this error? + let point: Value = serde_json::from_slice(point).map_err(InternalError::SerdeJson)?; + + if let Some((lat, long)) = point["lat"].as_f64().zip(point["long"].as_f64()) { + // this will create an array of 16 bytes (two 8 bytes floats) + let bytes: [u8; 16] = concat_arrays![lat.to_le_bytes(), long.to_le_bytes()]; + writer.insert(docid_bytes, bytes)?; + } else { + // TAMO: improve the warn + warn!("Malformed `_geo` field"); + continue; + } + } + + Ok(writer_into_reader(writer)?) +} diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index bb49e3e51..90a279815 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -3,6 +3,7 @@ mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; +mod extract_geo_points; mod extract_word_docids; mod extract_word_level_position_docids; mod extract_word_pair_proximity_docids; @@ -19,6 +20,7 @@ use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; +use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; use self::extract_word_level_position_docids::extract_word_level_position_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; @@ -37,6 +39,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender<Result<TypedChunk>>, searchable_fields: Option<HashSet<FieldId>>, faceted_fields: HashSet<FieldId>, + geo_field_id: Option<FieldId>, stop_words: Option<fst::Set<&[u8]>>, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks @@ -54,7 +57,7 @@ pub(crate) fn data_from_obkv_documents( .collect(); let ( - docid_word_positions_chunks, + (docid_word_positions_chunks), (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), ) = result?; @@ -118,6 +121,16 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-number-docids", ); + spawn_extraction_task( + documents_chunk, + indexer.clone(), + lmdb_writer_sx.clone(), + move |documents, indexer| extract_geo_points(documents, indexer, geo_field_id), + merge_cbo_roaring_bitmaps, + TypedChunk::GeoPoints, + "geo-points", + ); + Ok(()) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7800ae55a..44b108076 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -233,6 +233,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + // get the fid of the `_geo` field. 
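+ // Note: the id is looked up by field name, so it stays None until at least + // one indexed document actually contains a `_geo` field.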
+ let geo_field_id = self.index.fields_ids_map(self.wtxn)?.id("_geo"); let stop_words = self.index.stop_words(self.wtxn)?; // let stop_words = stop_words.as_ref(); @@ -261,6 +263,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { lmdb_writer_sx.clone(), searchable_fields, faceted_fields, + geo_field_id, stop_words, ) }); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 5f28034fe..dcefee153 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -6,11 +6,12 @@ use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, + self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, + CursorClonableMmap, }; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::into_clonable_grenad; -use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Index, Result}; +use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, GeoPoint, Index, Result}; pub(crate) enum TypedChunk { DocidWordPositions(grenad::Reader<CursorClonableMmap>), @@ -24,6 +25,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader<File>), FieldIdFacetStringDocids(grenad::Reader<File>), FieldIdFacetNumberDocids(grenad::Reader<File>), + GeoPoints(grenad::Reader<File>), } /// Write typed chunk in the corresponding LMDB database of the provided index. @@ -177,6 +179,22 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::GeoPoints(mut geo_points) => { + // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function + let mut rtree = index.geo_rtree(&index.read_txn()?)?.unwrap_or_default(); + while let Some((key, value)) = geo_points.next()? { + // convert the key back to a u32 (4 bytes) + let (key, _) = helpers::try_split_array_at::<u8, 4>(key).unwrap(); + let key = u32::from_le_bytes(key); + + // convert the latitude and longitude back to a f64 (8 bytes) + let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); + let (long, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); + let point = [f64::from_le_bytes(lat), f64::from_le_bytes(long)]; + rtree.insert(GeoPoint::new(point, key)); + } + index.put_geo_rtree(wtxn, &rtree)?; + } } Ok((RoaringBitmap::new(), is_merged_database)) From d344489c124287a0fb8a601f5983afe0e350cc67 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 14:58:36 +0200 Subject: [PATCH 0964/1889] implement the deletion of geo points --- milli/src/update/delete_documents.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 874eed6ee..0c29b744d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -380,6 +380,19 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { + let points_to_remove: Vec<_> = rtree + .iter() + .filter(|&point| self.documents_ids.contains(point.data)) + .cloned() + .collect(); + points_to_remove.iter().for_each(|point| { + rtree.remove(&point); + }); + + self.index.put_geo_rtree(self.wtxn, &rtree)?; + } + // We delete the documents ids that are under the facet field id values. 
remove_docids_from_facet_field_id_number_docids( self.wtxn, From 3b9f1db061e12e53dc76ff8a1a3d86ec796be523 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 15:32:41 +0200 Subject: [PATCH 0965/1889] implement the clear of the rtree --- milli/src/update/clear_documents.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 789970a8e..ef91991e8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -48,6 +48,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; + self.index.delete_geo_rtree(self.wtxn)?; // We clean all the faceted documents ids. let empty = RoaringBitmap::default(); @@ -93,7 +94,7 @@ mod tests { let content = &br#"[ { "id": 0, "name": "kevin", "age": 20 }, { "id": 1, "name": "kevina" }, - { "id": 2, "name": "benoit", "country": "France" } + { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); @@ -107,13 +108,14 @@ mod tests { let rtxn = index.read_txn().unwrap(); - assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 4); + assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 5); assert!(index.words_fst(&rtxn).unwrap().is_empty()); assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); + assert!(index.geo_rtree(&rtxn).unwrap().is_none()); assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); From b4b6ba6d8285d95267ee496f0e10db12eca1bb64 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 16:00:25 +0200 Subject: [PATCH 0966/1889] =?UTF-8?q?rename=20all=20the=20=E2=80=99long?= =?UTF-8?q?=E2=80=99=20into=20=E2=80=99lng=E2=80=99=20like=20written=20in?= =?UTF-8?q?=20the=20specification?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../src/update/index_documents/extract/extract_geo_points.rs | 4 ++-- milli/src/update/index_documents/typed_chunk.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 9f6e43199..0a73e5ed4 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -31,9 +31,9 @@ pub fn extract_geo_points( let point = obkv.get(geo_field_id).unwrap(); // TODO: TAMO where should we handle this error? 
let point: Value = serde_json::from_slice(point).map_err(InternalError::SerdeJson)?; - if let Some((lat, long)) = point["lat"].as_f64().zip(point["long"].as_f64()) { + if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) { // this will create an array of 16 bytes (two 8 bytes floats) - let bytes: [u8; 16] = concat_arrays![lat.to_le_bytes(), long.to_le_bytes()]; + let bytes: [u8; 16] = concat_arrays![lat.to_le_bytes(), lng.to_le_bytes()]; writer.insert(docid_bytes, bytes)?; } else { // TAMO: improve the warn diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index dcefee153..0dfeabece 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -189,8 +189,8 @@ pub(crate) fn write_typed_chunk_into_index( // convert the latitude and longitude back to a f64 (8 bytes) let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); - let (long, _) = helpers::try_split_array_at::(tail).unwrap(); - let point = [f64::from_le_bytes(lat), f64::from_le_bytes(long)]; + let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); + let point = [f64::from_le_bytes(lat), f64::from_le_bytes(lng)]; rtree.insert(GeoPoint::new(point, key)); } index.put_geo_rtree(wtxn, &rtree)?; From 70ab2c37c5115dee987fd3be46b8dc808a06022f Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 25 Aug 2021 16:59:38 +0200 Subject: [PATCH 0967/1889] remove multiple bugs --- .../extract/extract_geo_points.rs | 15 ++++++++------ .../src/update/index_documents/extract/mod.rs | 20 +++++++++---------- .../src/update/index_documents/typed_chunk.rs | 6 +++--- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 0a73e5ed4..1849d5f5d 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -21,19 +21,22 @@ pub fn extract_geo_points( })?; // we never encountered any documents with a `_geo` field. We can skip entirely this step - if geo_field_id.is_none() { - return Ok(writer_into_reader(writer)?); - } - let geo_field_id = geo_field_id.unwrap(); + let geo_field_id = match geo_field_id { + Some(geo) => geo, + None => return Ok(writer_into_reader(writer)?), + }; while let Some((docid_bytes, value)) = obkv_documents.next()? { let obkv = obkv::KvReader::new(value); - let point = obkv.get(geo_field_id).unwrap(); // TODO: TAMO where should we handle this error? 
+ let point = match obkv.get(geo_field_id) { + Some(point) => point, + None => continue, + }; let point: Value = serde_json::from_slice(point).map_err(InternalError::SerdeJson)?; if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) { // this will create an array of 16 bytes (two 8 bytes floats) - let bytes: [u8; 16] = concat_arrays![lat.to_le_bytes(), lng.to_le_bytes()]; + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else { // TAMO: improve the warn diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 90a279815..736060b15 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -51,13 +51,14 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), &searchable_fields, &faceted_fields, + geo_field_id, &stop_words, ) }) .collect(); let ( - (docid_word_positions_chunks), + docid_word_positions_chunks, (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), ) = result?; @@ -121,16 +122,6 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-number-docids", ); - spawn_extraction_task( - documents_chunk, - indexer.clone(), - lmdb_writer_sx.clone(), - move |documents, indexer| extract_geo_points(documents, indexer, geo_field_id), - merge_cbo_roaring_bitmaps, - TypedChunk::GeoPoints, - "geo-points", - ); - Ok(()) } @@ -181,6 +172,7 @@ fn extract_documents_data( lmdb_writer_sx: Sender>, searchable_fields: &Option>, faceted_fields: &HashSet, + geo_field_id: Option, stop_words: &Option>, ) -> Result<( grenad::Reader, @@ -190,6 +182,12 @@ fn extract_documents_data( let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); + let (documents_chunk_cloned, lmdb_writer_sx_cloned) = (documents_chunk.clone(), lmdb_writer_sx.clone()); + rayon::spawn(move || { + let geo_points = extract_geo_points(documents_chunk_cloned, indexer, geo_field_id).unwrap(); + lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))).unwrap(); + }); + let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 0dfeabece..9605fea7d 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -181,16 +181,16 @@ pub(crate) fn write_typed_chunk_into_index( } TypedChunk::GeoPoints(mut geo_points) => { // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function - let mut rtree = index.geo_rtree(&index.read_txn()?)?.unwrap_or_default(); + let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); while let Some((key, value)) = geo_points.next()? 
{ // convert the key back to a u32 (4 bytes) let (key, _) = helpers::try_split_array_at::(key).unwrap(); - let key = u32::from_le_bytes(key); + let key = u32::from_be_bytes(key); // convert the latitude and longitude back to a f64 (8 bytes) let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); - let point = [f64::from_le_bytes(lat), f64::from_le_bytes(lng)]; + let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; rtree.insert(GeoPoint::new(point, key)); } index.put_geo_rtree(wtxn, &rtree)?; From a21c8547904af27bcf9dd0d057f8981842a56a50 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 11:43:17 +0200 Subject: [PATCH 0968/1889] handle errors --- milli/src/update/index_documents/extract/mod.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 736060b15..aefc0ff92 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -182,10 +182,13 @@ fn extract_documents_data( let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); - let (documents_chunk_cloned, lmdb_writer_sx_cloned) = (documents_chunk.clone(), lmdb_writer_sx.clone()); + let documents_chunk_cloned = documents_chunk.clone(); + let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { - let geo_points = extract_geo_points(documents_chunk_cloned, indexer, geo_field_id).unwrap(); - lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))).unwrap(); + let _ = match extract_geo_points(documents_chunk_cloned, indexer, geo_field_id) { + Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), + Err(error) => lmdb_writer_sx_cloned.send(Err(error)), + }; }); let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = From 216a8aa3b26d23d84034ce92af9101442c8ddf23 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 13:27:32 +0200 Subject: [PATCH 0969/1889] add a tests for the indexation of the geosearch --- milli/src/update/delete_documents.rs | 70 +++++++++++++++++++++++++ milli/src/update/index_documents/mod.rs | 10 ++-- 2 files changed, 75 insertions(+), 5 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 0c29b744d..84fc3215f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -670,4 +670,74 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn delete_documents_with_geo_points() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("id")); + builder.execute(|_, _| ()).unwrap(); + + let content = &r#"[ + {"id":"1","city":"Lille", "_geo": { "lat": 50.629973371633746, "lng": 3.0569447399419570 } }, + {"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.641586120121050, "lng": 3.1106593480348670 } }, + {"id":"3","city":"Hellemmes", "_geo": { "lat": 50.631220965518080, "lng": 3.1106399673339933 } }, + {"id":"4","city":"Villeneuve-d'Ascq", "_geo": { "lat": 50.622468098014565, "lng": 3.1476425513437140 } }, + {"id":"5","city":"Hem", "_geo": { "lat": 50.655250871381355, "lng": 
3.1897297266244130 } }, + {"id":"6","city":"Roubaix", "_geo": { "lat": 50.692473451896710, "lng": 3.1763326737747650 } }, + {"id":"7","city":"Tourcoing", "_geo": { "lat": 50.726397466736480, "lng": 3.1541653659578670 } }, + {"id":"8","city":"Mouscron", "_geo": { "lat": 50.745325554908610, "lng": 3.2206407854429853 } }, + {"id":"9","city":"Tournai", "_geo": { "lat": 50.605342528602630, "lng": 3.3758586941351414 } }, + {"id":"10","city":"Ghent", "_geo": { "lat": 51.053777403679035, "lng": 3.6957733119926930 } }, + {"id":"11","city":"Brussels", "_geo": { "lat": 50.846640974544690, "lng": 4.3370663564281840 } }, + {"id":"12","city":"Charleroi", "_geo": { "lat": 50.409570138889480, "lng": 4.4347354315085520 } }, + {"id":"13","city":"Mons", "_geo": { "lat": 50.450294178855420, "lng": 3.9623722870904690 } }, + {"id":"14","city":"Valenciennes", "_geo": { "lat": 50.351817774473545, "lng": 3.5326283646928800 } }, + {"id":"15","city":"Arras", "_geo": { "lat": 50.284487528579950, "lng": 2.7637515844478160 } }, + {"id":"16","city":"Cambrai", "_geo": { "lat": 50.179340577906700, "lng": 3.2189409952502930 } }, + {"id":"17","city":"Bapaume", "_geo": { "lat": 50.111276127236400, "lng": 2.8547894666083120 } }, + {"id":"18","city":"Amiens", "_geo": { "lat": 49.931472529669996, "lng": 2.2710499758317080 } }, + {"id":"19","city":"Compiègne", "_geo": { "lat": 49.444980887725656, "lng": 2.7913841281529015 } }, + {"id":"20","city":"Paris", "_geo": { "lat": 48.902100060895480, "lng": 2.3708400867406930 } } + ]"#[..]; + let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; + + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + + let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); + let ids_to_delete: Vec = external_ids_to_delete + .iter() + .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) + .collect(); + + // Delete some documents. + let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + external_ids_to_delete.iter().for_each(|id| drop(builder.delete_external_id(id))); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let rtree = index.geo_rtree(&rtxn).unwrap().unwrap(); + + let all_geo_ids = rtree.iter().map(|point| point.data).collect::>(); + let all_geo_documents = index.documents(&rtxn, all_geo_ids.iter().copied()).unwrap(); + + for (id, _) in all_geo_documents.iter() { + assert!(!ids_to_delete.contains(&id), "The document {} was supposed to be deleted", id); + } + + assert_eq!( + all_geo_ids.len(), + all_geo_documents.len(), + "We deleted documents that were not supposed to be deleted" + ); + } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 44b108076..ba550afb9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -877,12 +877,12 @@ mod tests { // First we send 3 documents with an id for only one of them. 
let mut wtxn = index.write_txn().unwrap(); let documents = &r#"[ - { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5 }, + { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 }, { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 }, { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" }, { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, - { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams" } + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); @@ -918,7 +918,7 @@ mod tests { { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, - { "objectId": 30, "title": "Hamlet" } + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); @@ -935,7 +935,7 @@ mod tests { assert!(external_documents_ids.get("30").is_none()); let content = &br#"[ - { "objectId": 30, "title": "Hamlet" } + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); @@ -945,7 +945,7 @@ mod tests { assert!(external_documents_ids.get("30").is_some()); let content = &br#"[ - { "objectId": 30, "title": "Hamlet" } + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.update_format(UpdateFormat::Json); From 6d70978edc29f25bfa213c459db55cdab0ea2d29 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 15:51:54 +0200 Subject: [PATCH 0970/1889] update the facet filter grammar --- milli/src/search/facet/grammar.pest | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest index 2096517d3..10783b632 100644 --- a/milli/src/search/facet/grammar.pest +++ b/milli/src/search/facet/grammar.pest @@ -16,10 +16,11 @@ neq = {key ~ "!=" ~ value} eq = {key ~ "=" ~ value} greater = {key ~ ">" ~ value} less = {key ~ "<" ~ value} +geo_radius = {"_geoRadius(" ~ value ~ "," ~ value ~ "," ~ value ~ ")"} prgm = {SOI ~ expr ~ EOI} expr = _{ ( term ~ (operation ~ term)* ) } -term = { ("(" ~ expr ~ ")") | condition | not } +term = { ("(" ~ expr ~ ")") | condition | not | geo_radius } operation = _{ and | or } and = {"AND"} or = {"OR"} From 4b459768a0360d7c0759f2c068d47feddae65879 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 16:38:29 +0200 Subject: [PATCH 0971/1889] create the _geoRadius filter --- milli/src/search/facet/filter_condition.rs | 48 ++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 
a92797e90..1480fc95a 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -32,6 +32,8 @@ pub enum Operator { LowerThan(f64), LowerThanOrEqual(f64), Between(f64, f64), + GeoLowerThan([f64; 2], f64), + GeoGreaterThan([f64; 2], f64), } impl Operator { @@ -46,6 +48,8 @@ impl Operator { LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), + GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), } } } @@ -131,6 +135,7 @@ impl FilterCondition { Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), Rule::less => Ok(Self::lower_than(fim, ff, pair)?), Rule::between => Ok(Self::between(fim, ff, pair)?), + Rule::geo_radius => Ok(Self::geo_radius(fim, pair)?), Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), @@ -156,6 +161,23 @@ impl FilterCondition { } } + fn geo_radius(fields_ids_map: &FieldsIdsMap, item: Pair) -> Result { + let mut items = item.into_inner(); + let fid = match fields_ids_map.id("_geo") { + Some(fid) => fid, + None => return Ok(Empty), + }; + let (lat_result, _) = pest_parse(items.next().unwrap()); + let (lng_result, _) = pest_parse(items.next().unwrap()); + let lat = lat_result.map_err(UserError::InvalidFilter)?; + let lng = lng_result.map_err(UserError::InvalidFilter)?; + let point = [lat, lng]; + let (distance_result, _) = pest_parse(items.next().unwrap()); + let distance = distance_result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, GeoLowerThan(point, distance))) + } + fn between( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, @@ -440,6 +462,32 @@ impl FilterCondition { LowerThan(val) => (Included(f64::MIN), Excluded(*val)), LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), + GeoLowerThan(point, distance) => { + let mut result = RoaringBitmap::new(); + let rtree = match index.geo_rtree(rtxn)? 
{ + Some(rtree) => rtree, + None => return Ok(result), + }; + + let iter = rtree + .nearest_neighbor_iter_with_distance_2(point) + .take_while(|(_, dist)| dist <= distance); + iter.for_each(|(point, _)| drop(result.insert(point.data))); + + return Ok(result); + } + GeoGreaterThan(point, distance) => { + let result = Self::evaluate_operator( + rtxn, + index, + numbers_db, + strings_db, + field_id, + &GeoLowerThan(point.clone(), *distance), + )?; + let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; + return Ok(geo_faceted_doc_ids - result); + } }; // Ask for the biggest value that can exist for this specific field, if it exists From ea2f2ecf96dbfd03e419a7207e8c17202eeac03d Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 26 Aug 2021 17:49:50 +0200 Subject: [PATCH 0972/1889] create a new database containing all the documents that were geo-faceted --- milli/src/index.rs | 38 ++++++++++++++++++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 29 +++++++++++++- .../src/update/index_documents/typed_chunk.rs | 4 ++ 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 70aefa9be..f2ddba699 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -32,7 +32,8 @@ pub mod main_key { pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; - pub const GEO_RTREE_KEY: &str = "geo"; + pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; + pub const GEO_RTREE_KEY: &str = "geo-rtree"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; @@ -320,6 +321,41 @@ impl Index { } } + /* geo faceted */ + + /// Writes the documents ids that are faceted with a _geo field + pub(crate) fn put_geo_faceted_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, + docids, + ) + } + + /// Delete the documents ids that are faceted with a _geo field + pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, + &RoaringBitmap::new(), + ) + } + + /// Retrieve all the documents ids that faceted with a _geo field + pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { + match self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)? + { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* field distribution */ /// Writes the field distribution which associates every field name with diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ef91991e8..e937cb65f 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -49,6 +49,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; + self.index.delete_geo_faceted_documents_ids(self.wtxn)?; // We clean all the faceted documents ids. 
let empty = RoaringBitmap::default(); @@ -116,6 +117,7 @@ mod tests { assert!(index.documents_ids(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.geo_rtree(&rtxn).unwrap().is_none()); + assert!(index.geo_faceted_documents_ids(&rtxn).unwrap().is_empty()); assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 84fc3215f..cfd777d11 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -381,6 +381,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { + let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; + let points_to_remove: Vec<_> = rtree .iter() .filter(|&point| self.documents_ids.contains(point.data)) @@ -388,9 +390,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { .collect(); points_to_remove.iter().for_each(|point| { rtree.remove(&point); + geo_faceted_doc_ids.remove(point.data); }); self.index.put_geo_rtree(self.wtxn, &rtree)?; + self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; } // We delete the documents ids that are under the facet field id values. @@ -555,6 +559,8 @@ where #[cfg(test)] mod tests { + use std::collections::HashSet; + use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; @@ -726,11 +732,30 @@ mod tests { let rtxn = index.read_txn().unwrap(); let rtree = index.geo_rtree(&rtxn).unwrap().unwrap(); + let geo_faceted_doc_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); let all_geo_ids = rtree.iter().map(|point| point.data).collect::>(); - let all_geo_documents = index.documents(&rtxn, all_geo_ids.iter().copied()).unwrap(); + let all_geo_documents = index + .documents(&rtxn, all_geo_ids.iter().copied()) + .unwrap() + .iter() + .map(|(id, _)| *id) + .collect::>(); - for (id, _) in all_geo_documents.iter() { + let all_geo_faceted_ids = geo_faceted_doc_ids.iter().collect::>(); + let all_geo_faceted_documents = index + .documents(&rtxn, all_geo_faceted_ids.iter().copied()) + .unwrap() + .iter() + .map(|(id, _)| *id) + .collect::>(); + + assert_eq!( + all_geo_documents, all_geo_faceted_documents, + "There is an inconsistency between the geo_faceted database and the rtree" + ); + + for id in all_geo_documents.iter() { assert!(!ids_to_delete.contains(&id), "The document {} was supposed to be deleted", id); } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 9605fea7d..b09bee213 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -182,6 +182,8 @@ pub(crate) fn write_typed_chunk_into_index( TypedChunk::GeoPoints(mut geo_points) => { // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); + let mut doc_ids = index.geo_faceted_documents_ids(wtxn)?; + while let Some((key, value)) = geo_points.next()? 
{ // convert the key back to a u32 (4 bytes) let (key, _) = helpers::try_split_array_at::(key).unwrap(); @@ -192,8 +194,10 @@ pub(crate) fn write_typed_chunk_into_index( let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; rtree.insert(GeoPoint::new(point, key)); + doc_ids.insert(key); } index.put_geo_rtree(wtxn, &rtree)?; + index.put_geo_faceted_documents_ids(wtxn, &doc_ids)?; } } From 4fd0116a0d4a27c95091c46c29db24df6490e5e4 Mon Sep 17 00:00:00 2001 From: cvermand <33010418+bidoubiwa@users.noreply.github.com> Date: Thu, 26 Aug 2021 19:12:30 +0200 Subject: [PATCH 0973/1889] Stringify objects on dashboard to avoid [Object object] --- http-ui/public/script.js | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/http-ui/public/script.js b/http-ui/public/script.js index b621cd453..e4de86672 100644 --- a/http-ui/public/script.js +++ b/http-ui/public/script.js @@ -60,7 +60,13 @@ $('#query, #filters').on('input', function () { const content = document.createElement('div'); content.classList.add("content"); - content.innerHTML = element[prop]; + + // Stringify Objects and Arrays to avoid [Object object] + if (typeof element[prop] === 'object' && element[prop] !== null) { + content.innerHTML = JSON.stringify(element[prop]); + } else { + content.innerHTML = element[prop]; + } field.appendChild(attribute); field.appendChild(content); From f73273d71c3342801e52395d7ab52b337bb2d89c Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Aug 2021 15:47:11 +0200 Subject: [PATCH 0974/1889] only call the extractor if needed --- .../extract/extract_geo_points.rs | 8 +------- .../src/update/index_documents/extract/mod.rs | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 1849d5f5d..88ae7c177 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -14,18 +14,12 @@ use crate::{FieldId, InternalError, Result}; pub fn extract_geo_points( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, - geo_field_id: Option, // faire un grenad vide + geo_field_id: FieldId, ) -> Result> { let mut writer = tempfile::tempfile().and_then(|file| { create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) })?; - // we never encountered any documents with a `_geo` field. We can skip entirely this step - let geo_field_id = match geo_field_id { - Some(geo) => geo, - None => return Ok(writer_into_reader(writer)?), - }; - while let Some((docid_bytes, value)) = obkv_documents.next()? 
{ let obkv = obkv::KvReader::new(value); let point = match obkv.get(geo_field_id) { diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index aefc0ff92..4cb21c8e4 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -182,14 +182,16 @@ fn extract_documents_data( let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); - let documents_chunk_cloned = documents_chunk.clone(); - let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); - rayon::spawn(move || { - let _ = match extract_geo_points(documents_chunk_cloned, indexer, geo_field_id) { - Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), - Err(error) => lmdb_writer_sx_cloned.send(Err(error)), - }; - }); + if let Some(geo_field_id) = geo_field_id { + let documents_chunk_cloned = documents_chunk.clone(); + let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let _ = match extract_geo_points(documents_chunk_cloned, indexer, geo_field_id) { + Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), + Err(error) => lmdb_writer_sx_cloned.send(Err(error)), + }; + }); + } let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( From 5bb175fc90eb2d9cb3c8bc5c39b7deb8dc1bf5ed Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Aug 2021 15:47:33 +0200 Subject: [PATCH 0975/1889] only index _geo if it's set as sortable OR filterable and only allow the filters if geo was set to filterable --- milli/src/search/facet/filter_condition.rs | 6 ++++++ milli/src/update/delete_documents.rs | 2 ++ milli/src/update/index_documents/mod.rs | 8 +++++++- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 1480fc95a..66a2ffac7 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -463,6 +463,9 @@ impl FilterCondition { LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), GeoLowerThan(point, distance) => { + if index.filterable_fields(rtxn)?.contains("_geo") { + Err(UserError::AttributeLimitReached)?; // TODO: TAMO use a real error + } let mut result = RoaringBitmap::new(); let rtree = match index.geo_rtree(rtxn)? 
{ Some(rtree) => rtree, @@ -477,6 +480,9 @@ impl FilterCondition { return Ok(result); } GeoGreaterThan(point, distance) => { + if index.filterable_fields(rtxn)?.contains("_geo") { + Err(UserError::AttributeLimitReached)?; // TODO: TAMO use a real error + } let result = Self::evaluate_operator( rtxn, index, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index cfd777d11..639770bd6 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -687,6 +687,8 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_primary_key(S("id")); + builder.set_filterable_fields(hashset!(S("_geo"))); + builder.set_sortable_fields(hashset!(S("_geo"))); builder.execute(|_, _| ()).unwrap(); let content = &r#"[ diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba550afb9..d4fd3570e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -234,7 +234,13 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; // get the fid of the `_geo` field. - let geo_field_id = self.index.fields_ids_map(self.wtxn)?.id("_geo"); + let geo_field_id = if let Some(gfid) = self.index.fields_ids_map(self.wtxn)?.id("_geo") { + (self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid) + || self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid)) + .then(|| gfid) + } else { + None + }; let stop_words = self.index.stop_words(self.wtxn)?; // let stop_words = stop_words.as_ref(); From 13c78e5aa2dc9413f02c2df3d28d6c700f88fb93 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Aug 2021 18:22:52 +0200 Subject: [PATCH 0976/1889] Implement the _geoPoint in the sortable --- milli/src/criterion.rs | 78 +++++++++++++++++++++++---- milli/src/search/criteria/asc_desc.rs | 75 +++++++++++++++++++------- milli/src/search/criteria/mod.rs | 10 ++-- milli/src/search/mod.rs | 14 ++--- 4 files changed, 138 insertions(+), 39 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index d91d4a7e1..2bca6948b 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -58,24 +58,84 @@ impl FromStr for Criterion { Err(error) => { Err(UserError::InvalidCriterionName { name: error.to_string() }.into()) } + Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { + Err(UserError::AttributeLimitReached)? 
// TODO: TAMO: use a real error + } + Err(error) => Err(error.into()), }, } } } -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +pub enum Member { + Field(String), + Geo([f64; 2]), +} + +impl FromStr for Member { + type Err = UserError; + + fn from_str(text: &str) -> Result { + if text.starts_with("_geoPoint(") { + let point = + text.strip_prefix("_geoPoint(") + .and_then(|point| point.strip_suffix(")")) + .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; + let point = point + .split(',') + .map(|el| el.parse()) + .collect::, _>>() + .map_err(|_| UserError::InvalidCriterionName { name: text.to_string() })?; + Ok(Member::Geo([point[0], point[1]])) + } else { + Ok(Member::Field(text.to_string())) + } + } +} + +impl fmt::Display for Member { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Member::Field(name) => write!(f, "{}", name), + Member::Geo([lat, lng]) => write!(f, "_geoPoint({}, {})", lat, lng), + } + } +} + +impl Member { + pub fn field(&self) -> Option<&str> { + match self { + Member::Field(field) => Some(field), + Member::Geo(_) => None, + } + } + + pub fn geo_point(&self) -> Option<&[f64; 2]> { + match self { + Member::Geo(point) => Some(point), + Member::Field(_) => None, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] pub enum AscDesc { - Asc(String), - Desc(String), + Asc(Member), + Desc(Member), } impl AscDesc { - pub fn field(&self) -> &str { + pub fn member(&self) -> &Member { match self { - AscDesc::Asc(field) => field, - AscDesc::Desc(field) => field, + AscDesc::Asc(member) => member, + AscDesc::Desc(member) => member, } } + + pub fn field(&self) -> Option<&str> { + self.member().field() + } } impl FromStr for AscDesc { @@ -85,9 +145,9 @@ impl FromStr for AscDesc { /// string and let the caller create his own error fn from_str(text: &str) -> Result { match text.rsplit_once(':') { - Some((field_name, "asc")) => Ok(AscDesc::Asc(field_name.to_string())), - Some((field_name, "desc")) => Ok(AscDesc::Desc(field_name.to_string())), - _ => Err(UserError::InvalidAscDescSyntax { name: text.to_string() }), + Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)), + Some((left, "desc")) => Ok(AscDesc::Desc(left.parse()?)), + _ => Err(UserError::InvalidCriterionName { name: text.to_string() }), } } } diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6d50c1bb5..b0951f655 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -4,12 +4,14 @@ use itertools::Itertools; use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; +use rstar::RTree; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::criterion::Member; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::{FacetNumberIter, FacetStringIter}; use crate::search::query_tree::Operation; -use crate::{FieldId, Index, Result}; +use crate::{FieldId, GeoPoint, Index, Result}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. 
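The parsing rules introduced above are worth a concrete illustration. A minimal sketch (not part of the patch, written as if inside the milli crate where `AscDesc` and `Member` are visible):

    use std::str::FromStr;

    // A plain attribute keeps the historical `field:direction` behaviour.
    assert_eq!(
        AscDesc::from_str("price:asc").unwrap(),
        AscDesc::Asc(Member::Field("price".to_string()))
    );

    // `_geoPoint(lat,lng)` becomes a geo member. Note that at this point in the
    // series the coordinates must not be separated by spaces; patch 0977 below
    // lifts that limitation by trimming each coordinate before parsing it.
    assert_eq!(
        AscDesc::from_str("_geoPoint(48.8566,2.3522):desc").unwrap(),
        AscDesc::Desc(Member::Geo([48.8566, 2.3522]))
    );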
@@ -18,10 +20,11 @@ const CANDIDATES_THRESHOLD: u64 = 1000; pub struct AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn<'t>, - field_name: String, + member: Member, field_id: Option, is_ascending: bool, query_tree: Option, + rtree: Option>, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, bucket_candidates: RoaringBitmap, @@ -34,29 +37,29 @@ impl<'t> AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_name: String, + member: Member, ) -> Result { - Self::new(index, rtxn, parent, field_name, true) + Self::new(index, rtxn, parent, member, true) } pub fn desc( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_name: String, + member: Member, ) -> Result { - Self::new(index, rtxn, parent, field_name, false) + Self::new(index, rtxn, parent, member, false) } fn new( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - field_name: String, + member: Member, is_ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = fields_ids_map.id(&field_name); + let field_id = member.field().and_then(|field| fields_ids_map.id(&field)); let faceted_candidates = match field_id { Some(field_id) => { let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; @@ -65,14 +68,16 @@ impl<'t> AscDesc<'t> { } None => RoaringBitmap::default(), }; + let rtree = index.geo_rtree(rtxn)?; Ok(AscDesc { index, rtxn, - field_name, + member, field_id, is_ascending, query_tree: None, + rtree, candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), faceted_candidates, @@ -92,7 +97,7 @@ impl<'t> Criterion for AscDesc<'t> { debug!( "Facet {}({}) iteration", if self.is_ascending { "Asc" } else { "Desc" }, - self.field_name + self.member ); match self.candidates.next().transpose()? { @@ -135,15 +140,31 @@ impl<'t> Criterion for AscDesc<'t> { } self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = match self.field_id { - Some(field_id) => facet_ordered( - self.index, - self.rtxn, - field_id, - self.is_ascending, - candidates & &self.faceted_candidates, - )?, - None => Box::new(std::iter::empty()), + + match &self.member { + Member::Field(field_name) => { + self.candidates = match self.field_id { + Some(field_id) => facet_ordered( + self.index, + self.rtxn, + field_id, + self.is_ascending, + candidates & &self.faceted_candidates, + )?, + None => Box::new(std::iter::empty()), + } + } + Member::Geo(point) => { + self.candidates = match &self.rtree { + Some(rtree) => { + // TODO: TAMO how to remove that? + let rtree = Box::new(rtree.clone()); + let rtree = Box::leak(rtree); + geo_point(rtree, candidates, point.clone())? + } + None => Box::new(std::iter::empty()), + } + } }; } None => return Ok(None), @@ -163,6 +184,22 @@ impl<'t> Criterion for AscDesc<'t> { } } +fn geo_point<'t>( + rtree: &'t RTree, + candidates: RoaringBitmap, + point: [f64; 2], +) -> Result> + 't>> { + Ok(Box::new( + rtree + .nearest_neighbor_iter_with_distance_2(&point) + .filter_map(move |(point, _distance)| { + candidates.contains(point.data).then(|| point.data) + }) + .map(|id| std::iter::once(id).collect::()) + .map(Ok), + )) +} + /// Returns an iterator over groups of the given candidates in ascending or descending order. 
/// /// It will either use an iterative or a recursive method on the whole facet database depending diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 2a883de67..92c0d284a 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -12,7 +12,7 @@ use self::r#final::Final; use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; -use crate::criterion::AscDesc as AscDescName; +use crate::criterion::{AscDesc as AscDescName, Member}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; @@ -294,13 +294,13 @@ impl<'t> CriteriaBuilder<'t> { &self.index, &self.rtxn, criterion, - field.to_string(), + field.clone(), )?), AscDescName::Desc(field) => Box::new(AscDesc::desc( &self.index, &self.rtxn, criterion, - field.to_string(), + field.clone(), )?), }; } @@ -312,10 +312,10 @@ impl<'t> CriteriaBuilder<'t> { Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => { - Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) + Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, Member::Field(field))?) } Name::Desc(field) => { - Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) + Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, Member::Field(field))?) } }; } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 207f46f8a..f752f5822 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -148,13 +148,15 @@ impl<'a> Search<'a> { if let Some(sort_criteria) = &self.sort_criteria { let sortable_fields = self.index.sortable_fields(self.rtxn)?; for asc_desc in sort_criteria { - let field = asc_desc.field(); - if !sortable_fields.contains(field) { - return Err(UserError::InvalidSortableAttribute { - field: field.to_string(), - valid_fields: sortable_fields, + // we are not supposed to find any geoPoint in the criterion + if let Some(field) = asc_desc.field() { + if !sortable_fields.contains(field) { + return Err(UserError::InvalidSortableAttribute { + field: field.to_string(), + valid_fields: sortable_fields, + } + .into()); } - .into()); } } } From 4820ac71a66de224979a03260c20bbb03796cf56 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 31 Aug 2021 18:48:40 +0200 Subject: [PATCH 0977/1889] allow spaces in a geoRadius --- milli/src/criterion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 2bca6948b..35268461e 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -84,7 +84,7 @@ impl FromStr for Member { .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; let point = point .split(',') - .map(|el| el.parse()) + .map(|el| el.trim()).parse() .collect::, _>>() .map_err(|_| UserError::InvalidCriterionName { name: text.to_string() })?; Ok(Member::Geo([point[0], point[1]])) From 7483614b75b8ea57e3960f5ac94c62a6acb7a8b8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 31 Aug 2021 18:49:06 +0200 Subject: [PATCH 0978/1889] [HTTP-UI] add the sorters --- http-ui/src/main.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 108ec0549..89f3dcab2 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -695,6 +695,7 @@ async fn main() -> anyhow::Result<()> { struct QueryBody { query: Option, 
filters: Option, + sorters: Option, facet_filters: Option, String>>>, facet_distribution: Option, limit: Option, @@ -754,6 +755,10 @@ async fn main() -> anyhow::Result<()> { search.limit(limit); } + if let Some(sort) = query.sorters { + search.sort_criteria(vec![sort.parse().unwrap()]); + } + let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); From dc84ecc40b16eaf3ced19b9b8282d4723525e431 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 10:56:19 +0200 Subject: [PATCH 0979/1889] fix a bug --- milli/src/criterion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 35268461e..ea3214c8e 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -84,7 +84,7 @@ impl FromStr for Member { .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; let point = point .split(',') - .map(|el| el.trim()).parse() + .map(|el| el.trim().parse()) .collect::, _>>() .map_err(|_| UserError::InvalidCriterionName { name: text.to_string() })?; Ok(Member::Geo([point[0], point[1]])) From a8a1f5bd556aa28c7e1e0ea304a2919a1deb6161 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 15:14:23 +0200 Subject: [PATCH 0980/1889] move the geosearch criteria out of asc_desc.rs --- milli/src/search/criteria/asc_desc.rs | 75 +++++------------ milli/src/search/criteria/geo.rs | 115 ++++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 40 +++++---- 3 files changed, 160 insertions(+), 70 deletions(-) create mode 100644 milli/src/search/criteria/geo.rs diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index b0951f655..6d50c1bb5 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -4,14 +4,12 @@ use itertools::Itertools; use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; -use rstar::RTree; use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::criterion::Member; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::{FacetNumberIter, FacetStringIter}; use crate::search::query_tree::Operation; -use crate::{FieldId, GeoPoint, Index, Result}; +use crate::{FieldId, Index, Result}; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. 
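Taken together with the parsing fix above, here is a hedged end-to-end sketch (not taken from the patches) of how a sorter string travels from the request body into the engine, assuming an index whose settings declare `_geo` as sortable and whose ranking rules include `sort`:

    let rtxn = index.read_txn().unwrap();
    let mut search = index.search(&rtxn);
    search.query("restaurant");
    // The string received in the `sorters` field parses into an `AscDesc`,
    // exactly as the handler does with `sort.parse().unwrap()`.
    let sort: AscDesc = "_geoPoint(48.8566, 2.3522):asc".parse().unwrap();
    search.sort_criteria(vec![sort]);
    let results = search.execute().unwrap();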
@@ -20,11 +18,10 @@ const CANDIDATES_THRESHOLD: u64 = 1000; pub struct AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn<'t>, - member: Member, + field_name: String, field_id: Option, is_ascending: bool, query_tree: Option, - rtree: Option>, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, bucket_candidates: RoaringBitmap, @@ -37,29 +34,29 @@ impl<'t> AscDesc<'t> { index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - member: Member, + field_name: String, ) -> Result { - Self::new(index, rtxn, parent, member, true) + Self::new(index, rtxn, parent, field_name, true) } pub fn desc( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - member: Member, + field_name: String, ) -> Result { - Self::new(index, rtxn, parent, member, false) + Self::new(index, rtxn, parent, field_name, false) } fn new( index: &'t Index, rtxn: &'t heed::RoTxn, parent: Box, - member: Member, + field_name: String, is_ascending: bool, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = member.field().and_then(|field| fields_ids_map.id(&field)); + let field_id = fields_ids_map.id(&field_name); let faceted_candidates = match field_id { Some(field_id) => { let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; @@ -68,16 +65,14 @@ impl<'t> AscDesc<'t> { } None => RoaringBitmap::default(), }; - let rtree = index.geo_rtree(rtxn)?; Ok(AscDesc { index, rtxn, - member, + field_name, field_id, is_ascending, query_tree: None, - rtree, candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), faceted_candidates, @@ -97,7 +92,7 @@ impl<'t> Criterion for AscDesc<'t> { debug!( "Facet {}({}) iteration", if self.is_ascending { "Asc" } else { "Desc" }, - self.member + self.field_name ); match self.candidates.next().transpose()? { @@ -140,31 +135,15 @@ impl<'t> Criterion for AscDesc<'t> { } self.allowed_candidates = &candidates - params.excluded_candidates; - - match &self.member { - Member::Field(field_name) => { - self.candidates = match self.field_id { - Some(field_id) => facet_ordered( - self.index, - self.rtxn, - field_id, - self.is_ascending, - candidates & &self.faceted_candidates, - )?, - None => Box::new(std::iter::empty()), - } - } - Member::Geo(point) => { - self.candidates = match &self.rtree { - Some(rtree) => { - // TODO: TAMO how to remove that? - let rtree = Box::new(rtree.clone()); - let rtree = Box::leak(rtree); - geo_point(rtree, candidates, point.clone())? - } - None => Box::new(std::iter::empty()), - } - } + self.candidates = match self.field_id { + Some(field_id) => facet_ordered( + self.index, + self.rtxn, + field_id, + self.is_ascending, + candidates & &self.faceted_candidates, + )?, + None => Box::new(std::iter::empty()), }; } None => return Ok(None), @@ -184,22 +163,6 @@ impl<'t> Criterion for AscDesc<'t> { } } -fn geo_point<'t>( - rtree: &'t RTree, - candidates: RoaringBitmap, - point: [f64; 2], -) -> Result> + 't>> { - Ok(Box::new( - rtree - .nearest_neighbor_iter_with_distance_2(&point) - .filter_map(move |(point, _distance)| { - candidates.contains(point.data).then(|| point.data) - }) - .map(|id| std::iter::once(id).collect::()) - .map(Ok), - )) -} - /// Returns an iterator over groups of the given candidates in ascending or descending order. 
/// /// It will either use an iterative or a recursive method on the whole facet database depending diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs new file mode 100644 index 000000000..740bcb3a8 --- /dev/null +++ b/milli/src/search/criteria/geo.rs @@ -0,0 +1,115 @@ +use roaring::RoaringBitmap; +use rstar::RTree; + +use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::{GeoPoint, Index, Result}; + +pub struct Geo<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box<dyn Criterion + 't>, + candidates: Box<dyn Iterator<Item = RoaringBitmap>>, + allowed_candidates: RoaringBitmap, + bucket_candidates: RoaringBitmap, + rtree: Option<RTree<GeoPoint>>, + point: [f64; 2], +} + +impl<'t> Geo<'t> { + pub fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box<dyn Criterion + 't>, + point: [f64; 2], + ) -> Result<Self> { + let candidates = Box::new(std::iter::empty()); + let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?; + let bucket_candidates = RoaringBitmap::new(); + let rtree = index.geo_rtree(rtxn)?; + + Ok(Self { index, rtxn, parent, candidates, allowed_candidates, bucket_candidates, rtree, point }) + } +} + +impl<'t> Criterion for Geo<'t> { + fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> { + // if there is no rtree we have nothing to return + let rtree = match self.rtree.as_ref() { + Some(rtree) => rtree, + None => return Ok(None), + }; + + loop { + match self.candidates.next() { + Some(mut candidates) => { + candidates -= params.excluded_candidates; + self.allowed_candidates -= &candidates; + return Ok(Some(CriterionResult { + query_tree: None, + candidates: Some(candidates), + filtered_candidates: None, + bucket_candidates: Some(self.bucket_candidates.clone()), + })); + } + None => { + match self.parent.next(params)? { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + resolve_query_tree(&context, qt, params.wdcache)? + } + // TODO: TAMO: why are we doing this? + (None, None) => self.index.documents_ids(self.rtxn)?, + }; + + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + + match bucket_candidates { + // why not are we keeping elements from the previous bucket?
+ Some(bucket_candidates) => { + self.bucket_candidates |= bucket_candidates + } + None => self.bucket_candidates |= &candidates, + } + + if candidates.is_empty() { + continue; + } + let rtree = Box::new(rtree.clone()); + let rtree = Box::leak(rtree); + + self.allowed_candidates = &candidates - params.excluded_candidates; + self.candidates = geo_point(rtree, self.allowed_candidates.clone(), self.point)?; + } + None => return Ok(None), + } + } + } + } + } +} + +fn geo_point<'t>( + rtree: &'t RTree, + candidates: RoaringBitmap, + point: [f64; 2], +) -> Result + 't>> { + Ok(Box::new( + rtree + .nearest_neighbor_iter_with_distance_2(&point) + .filter_map(move |(point, _distance)| { + candidates.contains(point.data).then(|| point.data) + }) + .map(|id| std::iter::once(id).collect::()) + )) +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 92c0d284a..185761632 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -13,10 +13,12 @@ use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::criterion::{AscDesc as AscDescName, Member}; +use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; mod asc_desc; +mod geo; mod attribute; mod exactness; pub mod r#final; @@ -290,18 +292,28 @@ impl<'t> CriteriaBuilder<'t> { Some(ref sort_criteria) => { for asc_desc in sort_criteria { criterion = match asc_desc { - AscDescName::Asc(field) => Box::new(AscDesc::asc( - &self.index, - &self.rtxn, - criterion, - field.clone(), - )?), - AscDescName::Desc(field) => Box::new(AscDesc::desc( - &self.index, - &self.rtxn, - criterion, - field.clone(), - )?), + AscDescName::Asc(Member::Field(field)) => { + Box::new(AscDesc::asc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?) + } + AscDescName::Desc(Member::Field(field)) => { + Box::new(AscDesc::desc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?) + } + AscDescName::Asc(Member::Geo(point)) => { + Box::new(Geo::new(&self.index, &self.rtxn, criterion, point.clone())?) + } + AscDescName::Desc(Member::Geo(_point)) => { + panic!("You can't desc geosort"); // TODO: TAMO: remove this + } }; } criterion @@ -312,10 +324,10 @@ impl<'t> CriteriaBuilder<'t> { Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => { - Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, Member::Field(field))?) + Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) } Name::Desc(field) => { - Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, Member::Field(field))?) + Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) 
} }; } From aca707413c29d3e0c9d00e4549ae7c27a6735a43 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 17:02:51 +0200 Subject: [PATCH 0981/1889] remove the memory leak --- milli/src/search/criteria/geo.rs | 100 +++++++++++++++---------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index 740bcb3a8..bf3ae7aba 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -28,7 +28,16 @@ impl<'t> Geo<'t> { let bucket_candidates = RoaringBitmap::new(); let rtree = index.geo_rtree(rtxn)?; - Ok(Self { index, rtxn, parent, candidates, allowed_candidates, bucket_candidates, rtree, point }) + Ok(Self { + index, + rtxn, + parent, + candidates, + allowed_candidates, + bucket_candidates, + rtree, + point, + }) } } @@ -52,64 +61,55 @@ impl<'t> Criterion for Geo<'t> { bucket_candidates: Some(self.bucket_candidates.clone()), })); } - None => { - match self.parent.next(params)? { - Some(CriterionResult { - query_tree, - candidates, - filtered_candidates, - bucket_candidates, - }) => { - let mut candidates = match (&query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; - resolve_query_tree(&context, qt, params.wdcache)? - } - // TODO: TAMO: why are we doing this? - (None, None) => self.index.documents_ids(self.rtxn)?, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + bucket_candidates, + }) => { + let mut candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + resolve_query_tree(&context, qt, params.wdcache)? } + (None, None) => self.index.documents_ids(self.rtxn)?, + }; - match bucket_candidates { - // why not are we keeping elements from the previous bucket? 
- Some(bucket_candidates) => { - self.bucket_candidates |= bucket_candidates - } - None => self.bucket_candidates |= &candidates, - } - - if candidates.is_empty() { - continue; - } - let rtree = Box::new(rtree.clone()); - let rtree = Box::leak(rtree); - - self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = geo_point(rtree, self.allowed_candidates.clone(), self.point)?; + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; } - None => return Ok(None), + + match bucket_candidates { + Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, + None => self.bucket_candidates |= &candidates, + } + + if candidates.is_empty() { + continue; + } + self.allowed_candidates = &candidates - params.excluded_candidates; + self.candidates = + geo_point(rtree, self.allowed_candidates.clone(), self.point); } - } + None => return Ok(None), + }, } } } } -fn geo_point<'t>( - rtree: &'t RTree, +fn geo_point( + rtree: &RTree, candidates: RoaringBitmap, point: [f64; 2], -) -> Result + 't>> { - Ok(Box::new( - rtree - .nearest_neighbor_iter_with_distance_2(&point) - .filter_map(move |(point, _distance)| { - candidates.contains(point.data).then(|| point.data) - }) - .map(|id| std::iter::once(id).collect::()) - )) +) -> Box> { + let results = rtree + .nearest_neighbor_iter_with_distance_2(&point) + .filter_map(move |(point, _distance)| candidates.contains(point.data).then(|| point.data)) + .map(|id| std::iter::once(id).collect::()) + .collect::>(); + + Box::new(results.into_iter()) } From b1bf7d4f405b418ed5c071e12a600b212b808c1d Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 17:03:42 +0200 Subject: [PATCH 0982/1889] reformat --- milli/src/search/criteria/mod.rs | 39 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 185761632..782fedcc8 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -18,10 +18,10 @@ use crate::search::{word_derivations, WordDerivationsCache}; use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; mod asc_desc; -mod geo; mod attribute; mod exactness; pub mod r#final; +mod geo; mod initial; mod proximity; mod typo; @@ -292,25 +292,24 @@ impl<'t> CriteriaBuilder<'t> { Some(ref sort_criteria) => { for asc_desc in sort_criteria { criterion = match asc_desc { - AscDescName::Asc(Member::Field(field)) => { - Box::new(AscDesc::asc( - &self.index, - &self.rtxn, - criterion, - field.to_string(), - )?) - } - AscDescName::Desc(Member::Field(field)) => { - Box::new(AscDesc::desc( - &self.index, - &self.rtxn, - criterion, - field.to_string(), - )?) - } - AscDescName::Asc(Member::Geo(point)) => { - Box::new(Geo::new(&self.index, &self.rtxn, criterion, point.clone())?) 
- } + AscDescName::Asc(Member::Field(field)) => Box::new(AscDesc::asc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?), + AscDescName::Desc(Member::Field(field)) => Box::new(AscDesc::desc( + &self.index, + &self.rtxn, + criterion, + field.to_string(), + )?), + AscDescName::Asc(Member::Geo(point)) => Box::new(Geo::new( + &self.index, + &self.rtxn, + criterion, + point.clone(), + )?), AscDescName::Desc(Member::Geo(_point)) => { panic!("You can't desc geosort"); // TODO: TAMO: remove this } From f0b74637dc2e41f8521946dc379def1866520b67 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 1 Sep 2021 17:43:18 +0200 Subject: [PATCH 0983/1889] fix all the tests --- milli/src/lib.rs | 2 +- milli/tests/search/mod.rs | 6 +++--- milli/tests/search/query_criteria.rs | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 2a55b6f3a..a3cede1fd 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -21,7 +21,7 @@ use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; use serde_json::{Map, Value}; -pub use self::criterion::{default_criteria, AscDesc, Criterion}; +pub use self::criterion::{default_criteria, AscDesc, Criterion, Member}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c34434c4e..b4dfb3080 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -6,7 +6,7 @@ use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::update::{Settings, UpdateBuilder, UpdateFormat}; -use milli::{AscDesc, Criterion, DocumentId, Index}; +use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -99,11 +99,11 @@ pub fn expected_order( new_groups .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); } - Criterion::Sort if sort_by == [AscDesc::Asc(S("tag"))] => { + Criterion::Sort if sort_by == [AscDesc::Asc(Member::Field(S("tag")))] => { group.sort_by_key(|d| d.sort_by_rank); new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); } - Criterion::Sort if sort_by == [AscDesc::Desc(S("tag"))] => { + Criterion::Sort if sort_by == [AscDesc::Desc(Member::Field(S("tag")))] => { group.sort_by_key(|d| Reverse(d.sort_by_rank)); new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index c9720d652..cc08ec863 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -163,28 +163,28 @@ test_criterion!( DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], - vec![AscDesc::Asc(S("tag"))] + vec![AscDesc::Asc(Member::Field(S("tag")))] ); test_criterion!( sort_by_asc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Sort], - vec![AscDesc::Asc(S("tag"))] + vec![AscDesc::Asc(Member::Field(S("tag")))] ); test_criterion!( sort_by_desc_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], - vec![AscDesc::Desc(S("tag"))] + vec![AscDesc::Desc(Member::Field(S("tag")))] ); test_criterion!( sort_by_desc_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![Sort], - vec![AscDesc::Desc(S("tag"))] + vec![AscDesc::Desc(Member::Field(S("tag")))] ); test_criterion!( default_criteria_order, From e8c093c1d028ce28277d3e92f3111493d3054225 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 2 Sep 2021 
15:55:19 +0200 Subject: [PATCH 0984/1889] fix the error handling in the filters --- milli/src/search/facet/filter_condition.rs | 25 +++++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 66a2ffac7..f8ea2ca74 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -135,7 +135,7 @@ impl FilterCondition { Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), Rule::less => Ok(Self::lower_than(fim, ff, pair)?), Rule::between => Ok(Self::between(fim, ff, pair)?), - Rule::geo_radius => Ok(Self::geo_radius(fim, pair)?), + Rule::geo_radius => Ok(Self::geo_radius(fim, ff, pair)?), Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), @@ -161,7 +161,22 @@ impl FilterCondition { } } - fn geo_radius(fields_ids_map: &FieldsIdsMap, item: Pair) -> Result { + fn geo_radius( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + if !filterable_fields.contains("_geo") { + return Err(UserError::InvalidFilterAttribute(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `_geo` is not filterable, available filterable attributes are: {}", + filterable_fields.iter().join(", "), + ), + }, + item.as_span(), + )))?; + } let mut items = item.into_inner(); let fid = match fields_ids_map.id("_geo") { Some(fid) => fid, @@ -463,9 +478,6 @@ impl FilterCondition { LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), GeoLowerThan(point, distance) => { - if index.filterable_fields(rtxn)?.contains("_geo") { - Err(UserError::AttributeLimitReached)?; // TODO: TAMO use a real error - } let mut result = RoaringBitmap::new(); let rtree = match index.geo_rtree(rtxn)? 
{ Some(rtree) => rtree, @@ -480,9 +492,6 @@ impl FilterCondition { return Ok(result); } GeoGreaterThan(point, distance) => { - if index.filterable_fields(rtxn)?.contains("_geo") { - Err(UserError::AttributeLimitReached)?; // TODO: TAMO use a real error - } let result = Self::evaluate_operator( rtxn, index, From bd4c2482921c2501f3c78ef52a160f9fe0950534 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 2 Sep 2021 15:57:40 +0200 Subject: [PATCH 0985/1889] improve the error handling in general and introduce the concept of reserved keywords --- milli/src/criterion.rs | 22 ++++++++++--------- milli/src/error.rs | 18 +++++++++++++++ .../extract/extract_geo_points.rs | 11 +++++----- .../src/update/index_documents/extract/mod.rs | 10 ++++++++- milli/src/update/index_documents/mod.rs | 4 ++++ .../src/update/index_documents/typed_chunk.rs | 2 +- 6 files changed, 50 insertions(+), 17 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index ea3214c8e..29c477473 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -3,7 +3,7 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; -use crate::error::{Error, UserError}; +use crate::error::{is_reserved_keyword, Error, UserError}; #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { @@ -50,18 +50,20 @@ impl FromStr for Criterion { "sort" => Ok(Criterion::Sort), "exactness" => Ok(Criterion::Exactness), text => match AscDesc::from_str(text) { - Ok(AscDesc::Asc(field)) => Ok(Criterion::Asc(field)), - Ok(AscDesc::Desc(field)) => Ok(Criterion::Desc(field)), + Ok(AscDesc::Asc(Member::Field(field))) if is_reserved_keyword(&field) => { + Err(UserError::InvalidReservedRankingRuleName { name: text.to_string() })? + } + Ok(AscDesc::Asc(Member::Field(field))) => Ok(Criterion::Asc(field)), + Ok(AscDesc::Desc(Member::Field(field))) => Ok(Criterion::Desc(field)), + Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { + Err(UserError::InvalidRankingRuleName { name: text.to_string() })? + } Err(UserError::InvalidAscDescSyntax { name }) => { Err(UserError::InvalidCriterionName { name }.into()) } Err(error) => { Err(UserError::InvalidCriterionName { name: error.to_string() }.into()) } - Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { - Err(UserError::AttributeLimitReached)? 
// TODO: TAMO: use a real error - } - Err(error) => Err(error.into()), }, } } @@ -81,12 +83,12 @@ impl FromStr for Member { let point = text.strip_prefix("_geoPoint(") .and_then(|point| point.strip_suffix(")")) - .ok_or_else(|| UserError::InvalidCriterionName { name: text.to_string() })?; + .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() })?; let point = point .split(',') .map(|el| el.trim().parse()) .collect::, _>>() - .map_err(|_| UserError::InvalidCriterionName { name: text.to_string() })?; + .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() })?; Ok(Member::Geo([point[0], point[1]])) } else { Ok(Member::Field(text.to_string())) @@ -147,7 +149,7 @@ impl FromStr for AscDesc { match text.rsplit_once(':') { Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)), Some((left, "desc")) => Ok(AscDesc::Desc(left.parse()?)), - _ => Err(UserError::InvalidCriterionName { name: text.to_string() }), + _ => Err(UserError::InvalidRankingRuleName { name: text.to_string() }), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 3f473a673..f4601ea9a 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -12,6 +12,12 @@ use crate::{DocumentId, FieldId}; pub type Object = Map; +const RESERVED_KEYWORD: &[&'static str] = &["_geo", "_geoDistance"]; + +pub fn is_reserved_keyword(keyword: &str) -> bool { + RESERVED_KEYWORD.contains(&keyword) +} + #[derive(Debug)] pub enum Error { InternalError(InternalError), @@ -60,6 +66,9 @@ pub enum UserError { InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), InvalidSortName { name: String }, + InvalidGeoField { document_id: Value, object: Value }, + InvalidRankingRuleName { name: String }, + InvalidReservedRankingRuleName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, @@ -222,6 +231,15 @@ impl fmt::Display for UserError { write!(f, "invalid asc/desc syntax for {}", name) } Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), + Self::InvalidGeoField { document_id, object } => write!( + f, + "the document with the id: {} contains an invalid _geo field: {}", + document_id, object + ), + Self::InvalidRankingRuleName { name } => write!(f, "invalid criterion {}", name), + Self::InvalidReservedRankingRuleName { name } => { + write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name) + } Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); write!( diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 88ae7c177..c4bdce211 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -2,11 +2,10 @@ use std::fs::File; use std::io; use concat_arrays::concat_arrays; -use log::warn; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; -use crate::{FieldId, InternalError, Result}; +use crate::{FieldId, InternalError, Result, UserError}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. 
/// @@ -14,6 +13,7 @@ use crate::{FieldId, InternalError, Result}; pub fn extract_geo_points( mut obkv_documents: grenad::Reader, indexer: GrenadParameters, + primary_key_id: FieldId, geo_field_id: FieldId, ) -> Result> { let mut writer = tempfile::tempfile().and_then(|file| { @@ -33,9 +33,10 @@ pub fn extract_geo_points( let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else { - // TAMO: improve the warn - warn!("Malformed `_geo` field"); - continue; + let primary_key = obkv.get(primary_key_id).unwrap(); // TODO: TAMO: is this valid? + let primary_key = + serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?; + Err(UserError::InvalidGeoField { document_id: primary_key, object: point })? } } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4cb21c8e4..36e3c870f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -39,6 +39,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx: Sender>, searchable_fields: Option>, faceted_fields: HashSet, + primary_key_id: FieldId, geo_field_id: Option, stop_words: Option>, ) -> Result<()> { @@ -51,6 +52,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), &searchable_fields, &faceted_fields, + primary_key_id, geo_field_id, &stop_words, ) @@ -172,6 +174,7 @@ fn extract_documents_data( lmdb_writer_sx: Sender>, searchable_fields: &Option>, faceted_fields: &HashSet, + primary_key_id: FieldId, geo_field_id: Option, stop_words: &Option>, ) -> Result<( @@ -186,7 +189,12 @@ fn extract_documents_data( let documents_chunk_cloned = documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { - let _ = match extract_geo_points(documents_chunk_cloned, indexer, geo_field_id) { + let _ = match extract_geo_points( + documents_chunk_cloned, + indexer, + primary_key_id, + geo_field_id, + ) { Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), Err(error) => lmdb_writer_sx_cloned.send(Err(error)), }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d4fd3570e..38eea954b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -228,6 +228,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { Receiver>, ) = crossbeam_channel::unbounded(); + // get the primary key field id + let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); // TODO: TAMO: is this unwrap 100% valid? 
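+ // An assumption worth recording (comment added for illustration, not present in
+ // the original commit): the unwrap above should hold because the primary key is
+ // registered in the fields ids map when the documents are transformed, before
+ // indexing reaches this point. The id is threaded down to `extract_geo_points`
+ // so a malformed `_geo` value can be reported as `UserError::InvalidGeoField`
+ // naming the offending document, instead of the `warn!` it previously emitted.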
+ // get searchable fields for word databases let searchable_fields = self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); @@ -269,6 +272,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { lmdb_writer_sx.clone(), searchable_fields, faceted_fields, + primary_key_id, geo_field_id, stop_words, ) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b09bee213..5c27c195f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -180,7 +180,7 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::GeoPoints(mut geo_points) => { - // TODO: TAMO: we should create the rtree with the `RTree::bulk_load` function + // TODO: we should create the rtree with the `RTree::bulk_load` function let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut doc_ids = index.geo_faceted_documents_ids(wtxn)?; From ebf82ac28cec7c5cdb3997fea20c083baff8bb36 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 6 Sep 2021 17:07:34 +0200 Subject: [PATCH 0986/1889] improve the error messages and add tests for the filters --- milli/src/search/facet/filter_condition.rs | 125 +++++++++++++++++++-- milli/src/search/facet/grammar.pest | 4 +- 2 files changed, 119 insertions(+), 10 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f8ea2ca74..bfcf7d9c7 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -182,15 +182,42 @@ impl FilterCondition { Some(fid) => fid, None => return Ok(Empty), }; - let (lat_result, _) = pest_parse(items.next().unwrap()); - let (lng_result, _) = pest_parse(items.next().unwrap()); - let lat = lat_result.map_err(UserError::InvalidFilter)?; - let lng = lng_result.map_err(UserError::InvalidFilter)?; - let point = [lat, lng]; - let (distance_result, _) = pest_parse(items.next().unwrap()); - let distance = distance_result.map_err(UserError::InvalidFilter)?; - - Ok(Operator(fid, GeoLowerThan(point, distance))) + let parameters_item = items.next().unwrap(); + // We don't need more than 3 parameters, but to handle errors correctly we are still going + // to extract the first 4 parameters + let param_span = parameters_item.as_span(); + let parameters = parameters_item + .into_inner() + .take(4) + .map(|param| (param.clone(), param.as_span())) + .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) + .collect::, _>>() + .map_err(UserError::InvalidFilter)?; + if parameters.len() != 3 { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), + }, + // we want to point to the last parameters and if there was no parameters we + // point to the parenthesis + parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), + )))?; + } + let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); + if let Some(span) = (!(-181.0..181.).contains(&lat.0)) + .then(|| &lat.1) + .or((!(-181.0..181.).contains(&lng.0)).then(|| &lng.1)) + { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "Latitude and longitude must be contained between -180 to 180 degrees." 
+ ), + }, + span.clone(), + )))?; + } + Ok(Operator(fid, GeoLowerThan([lat.0, lng.0], distance))) } fn between( @@ -726,6 +753,86 @@ mod tests { assert_eq!(condition, expected); } + #[test] + fn geo_radius() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // basic test + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); + assert_eq!(condition, expected); + + // test the negation of the GeoLowerThan + let condition = + FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); + let expected = Operator(0, GeoGreaterThan([50., 18.], 2000.500)); + assert_eq!(condition, expected); + + // composition of multiple operations + let condition = FilterCondition::from_str( + &rtxn, + &index, + "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", + ) + .unwrap(); + let expected = Or( + Box::new(And( + Box::new(Operator(0, GeoGreaterThan([1., 2.], 300.))), + Box::new(Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), + )), + Box::new(Operator(1, LowerThanOrEqual(10.))), + ); + assert_eq!(condition, expected); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have enough parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have too many parameters + let result = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-200, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 181, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + } + #[test] fn from_array() { let path = tempfile::tempdir().unwrap(); diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest index 10783b632..973fb5156 100644 --- 
 a/milli/src/search/facet/grammar.pest
+++ b/milli/src/search/facet/grammar.pest
@@ -8,6 +8,8 @@ char = _{ !(PEEK | "\\") ~ ANY
     | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
     | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
+// we deliberately choose to allow empty parameters to generate more specific error message later
+parameters ={"(" ~ (value ~ ",")* ~ value? ~ ")"}
 condition = _{between | eq | greater | less | geq | leq | neq}
 between = {key ~ value ~ "TO" ~ value}
 geq = {key ~ ">=" ~ value}
@@ -16,7 +18,7 @@ neq = {key ~ "!=" ~ value}
 eq = {key ~ "=" ~ value}
 greater = {key ~ ">" ~ value}
 less = {key ~ "<" ~ value}
-geo_radius = {"_geoRadius(" ~ value ~ "," ~ value ~ "," ~ value ~ ")"}
+geo_radius = {"_geoRadius" ~ parameters }
 prgm = {SOI ~ expr ~ EOI}
 expr = _{ ( term ~ (operation ~ term)* ) }

From 6d5762a6c85bc0d48acb579a13ef27c79aa01288 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Mon, 6 Sep 2021 17:28:49 +0200
Subject: [PATCH 0987/1889] handle the case where you forgot the parentheses entirely

---
 milli/src/search/facet/filter_condition.rs | 6 ++++++
 milli/src/search/facet/grammar.pest        | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index bfcf7d9c7..c6dbbf056 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -797,6 +797,12 @@ mod tests {
         );
         assert_eq!(condition, expected);
 
+        // georadius doesn't have any parameters
+        let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius");
+        assert!(result.is_err());
+        let error = result.unwrap_err();
+        assert!(error.to_string().contains("The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`"));
+
         // georadius doesn't have any parameters
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()");
diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest
index 973fb5156..8285f81a6 100644
--- a/milli/src/search/facet/grammar.pest
+++ b/milli/src/search/facet/grammar.pest
@@ -9,7 +9,7 @@ char = _{ !(PEEK | "\\") ~ ANY
     | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
 // we deliberately choose to allow empty parameters to generate more specific error message later
-parameters ={"(" ~ (value ~ ",")* ~ value? ~ ")"}
+parameters ={("(" ~ (value ~ ",")* ~ value? ~ ")") | ""}
 condition = _{between | eq | greater | less | geq | leq | neq}
 between = {key ~ value ~ "TO" ~ value}
 geq = {key ~ ">=" ~ value}

From 7ae2a7341c270f6d84e66e7803017b34203d36d9 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Mon, 6 Sep 2021 18:15:20 +0200
Subject: [PATCH 0988/1889] introduce the reserved keywords in the filters

---
 milli/src/search/facet/filter_condition.rs | 19 +++++++++++++++++++
 milli/src/search/facet/grammar.pest        |  7 ++++---
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index c6dbbf056..a36fddb01 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -595,6 +595,18 @@ fn field_id(
 ) -> StdResult<Option<FieldId>, PestError<Rule>> {
     // lexing ensures that we at least have a key
     let key = items.next().unwrap();
+    if key.as_rule() == Rule::reserved {
+        return Err(PestError::new_from_span(
+            ErrorVariant::CustomError {
+                message: format!(
+                    "`{}` is a reserved keyword and thus can't be used as a filter expression.
Available filterable attributes are: {}",
                    key.as_str(),
                    filterable_fields.iter().join(", "),
                ),
            },
            key.as_span(),
        ));
    }
 
     if !filterable_fields.contains(key.as_str()) {
         return Err(PestError::new_from_span(
@@ -671,6 +683,13 @@ mod tests {
         let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
         let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
         assert_eq!(condition, expected);
+
+        let result = FilterCondition::from_str(&rtxn, &index, "_geo = France");
+        assert!(result.is_err());
+        let error = result.unwrap_err();
+        assert!(error.to_string().contains(
+            "`_geo` is a reserved keyword and thus can't be used as a filter expression."
+        ));
     }
 
     #[test]
diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest
index 8285f81a6..d07d5bca5 100644
--- a/milli/src/search/facet/grammar.pest
+++ b/milli/src/search/facet/grammar.pest
@@ -1,5 +1,5 @@
-key = _{quoted | word}
-value = _{quoted | word}
+key = _{reserved | quoted | word }
+value = _{quoted | word }
 quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP }
 string = {char*}
 word = ${(LETTER | NUMBER | "_" | "-" | ".")+}
@@ -8,8 +8,9 @@ char = _{ !(PEEK | "\\") ~ ANY
     | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
     | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
+reserved = { "_geo" | "_geoDistance" | "_geoPoint" | ("_geoPoint" ~ parameters) }
 // we deliberately choose to allow empty parameters to generate more specific error message later
-parameters ={("(" ~ (value ~ ",")* ~ value? ~ ")") | ""}
+parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""}
 condition = _{between | eq | greater | less | geq | leq | neq}
 between = {key ~ value ~ "TO" ~ value}
 geq = {key ~ ">=" ~ value}

From 4f69b190bc31620c1a03ade5f154a85a4c1fa850 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 7 Sep 2021 11:49:27 +0200
Subject: [PATCH 0989/1889] remove the distance from the search, the
 computation of the distance will be done on the Meilisearch side

---
 milli/src/search/criteria/geo.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs
index bf3ae7aba..6f8f1406a 100644
--- a/milli/src/search/criteria/geo.rs
+++ b/milli/src/search/criteria/geo.rs
@@ -106,8 +106,8 @@ fn geo_point(
     point: [f64; 2],
 ) -> Box<dyn Iterator<Item = RoaringBitmap>> {
     let results = rtree
-        .nearest_neighbor_iter_with_distance_2(&point)
-        .filter_map(move |(point, _distance)| candidates.contains(point.data).then(|| point.data))
+        .nearest_neighbor_iter(&point)
+        .filter_map(move |point| candidates.contains(point.data).then(|| point.data))
         .map(|id| std::iter::once(id).collect::<RoaringBitmap>())
         .collect::<Vec<_>>();

From e5ef0cad9a78d6db3aa94c4f83486b9ee7ab5f8c Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 7 Sep 2021 12:11:03 +0200
Subject: [PATCH 0990/1889] use meters in the filters

---
 milli/Cargo.toml                           |  1 +
 milli/src/lib.rs                           |  8 ++++++++
 milli/src/search/facet/filter_condition.rs | 13 ++++++++-----
 3 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index d2767afd4..be507332e 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -16,6 +16,7 @@ flate2 = "1.0.20"
 fst = "0.4.5"
 fxhash = "0.2.1"
 grenad = { version = "0.3.1", default-features = false, features = ["tempfile"] }
+haversine = "0.2.1"
 heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
diff --git
a/milli/src/lib.rs b/milli/src/lib.rs index a3cede1fd..3c7713308 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -142,6 +142,14 @@ where Some((head, tail)) } +/// Return the distance between two points in meters +fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { + let a = haversine::Location { latitude: a[0], longitude: a[1] }; + let b = haversine::Location { latitude: b[0], longitude: b[1] }; + + haversine::distance(a, b, haversine::Units::Kilometers) * 1000. +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index a36fddb01..08a84899f 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -504,17 +504,20 @@ impl FilterCondition { LowerThan(val) => (Included(f64::MIN), Excluded(*val)), LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), Between(left, right) => (Included(*left), Included(*right)), - GeoLowerThan(point, distance) => { + GeoLowerThan(base_point, distance) => { let mut result = RoaringBitmap::new(); let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, None => return Ok(result), }; - let iter = rtree - .nearest_neighbor_iter_with_distance_2(point) - .take_while(|(_, dist)| dist <= distance); - iter.for_each(|(point, _)| drop(result.insert(point.data))); + rtree + .nearest_neighbor_iter(base_point) + .take_while(|point| { + dbg!(crate::distance_between_two_points(base_point, point.geom())) + < *distance + }) + .for_each(|point| drop(result.insert(point.data))); return Ok(result); } From 2988d3c76d8a03765bc5aa94ab4f930190686adc Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 8 Sep 2021 13:08:48 +0200 Subject: [PATCH 0991/1889] tests the geo filters --- milli/tests/assets/test_set.ndjson | 34 +++++++++++++++--------------- milli/tests/search/filters.rs | 5 +++++ milli/tests/search/mod.rs | 6 ++++++ 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 89d9f1109..9a0fe5b0a 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","":""} -{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company 
sanrio","tag":"green","":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","":""} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","":""} -{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","":""} -{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","":""} 
+{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":""} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":""} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":""} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":""} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":""} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":""} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":""} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""} +{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel 
world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""} +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""} +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":""} +{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":""} +{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index c810b47af..d992a8e95 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -47,6 +47,11 @@ test_filter!(eq_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank=1")] test_filter!(eq_string_or_filter, vec![Left(vec!["tag=red", "tag=green"])]); test_filter!(eq_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank=1"])]); test_filter!(eq_number_or_filter, vec![Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); +test_filter!(geo_radius, vec![Right("_geoRadius(50.630010347667806, 3.086251829166809, 100000)")]); +test_filter!( + not_geo_radius, + vec![Right("NOT _geoRadius(50.630010347667806, 3.086251829166809, 1000000)")] +); test_filter!(eq_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), 
Right("asc_desc_rank=3")]); test_filter!( eq_complex_filter_2, diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index b4dfb3080..e3f6c5b09 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -37,6 +37,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.set_filterable_fields(hashset! { S("tag"), S("asc_desc_rank"), + S("_geo"), }); builder.set_sortable_fields(hashset! { S("tag"), @@ -162,6 +163,10 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { if document.asc_desc_rank > filter.parse().unwrap() { id = Some(document.id.clone()) } + } else if filter.starts_with("_geoRadius") { + id = (document.geo_rank < 100000).then(|| document.id.clone()); + } else if filter.starts_with("NOT _geoRadius") { + id = (document.geo_rank > 1000000).then(|| document.id.clone()); } id } @@ -205,6 +210,7 @@ pub struct TestDocument { pub exact_rank: u32, pub asc_desc_rank: u32, pub sort_by_rank: u32, + pub geo_rank: u32, pub title: String, pub description: String, pub tag: String, From 4b618b95e409a08a615aeabaf46b8ef40c69f099 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 8 Sep 2021 17:11:29 +0200 Subject: [PATCH 0992/1889] rebase on main --- milli/tests/search/query_criteria.rs | 2 +- milli/tests/search/sort.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index cc08ec863..f6a937f67 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -5,7 +5,7 @@ use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; use milli::update::{Settings, UpdateBuilder, UpdateFormat}; -use milli::{AscDesc, Criterion, Index, Search, SearchResult}; +use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use rand::Rng; use Criterion::*; diff --git a/milli/tests/search/sort.rs b/milli/tests/search/sort.rs index fe87f0623..86404bb99 100644 --- a/milli/tests/search/sort.rs +++ b/milli/tests/search/sort.rs @@ -1,6 +1,6 @@ use big_s::S; use milli::Criterion::{Attribute, Exactness, Proximity, Typo, Words}; -use milli::{AscDesc, Error, Search, UserError}; +use milli::{AscDesc, Error, Member, Search, UserError}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -16,7 +16,7 @@ fn sort_ranking_rule_missing() { search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); search.optional_words(true); - search.sort_criteria(vec![AscDesc::Asc(S("tag"))]); + search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]); let result = search.execute(); assert!(matches!(result, Err(Error::UserError(UserError::SortRankingRuleMissing)))); From b15c77ebc4608953e3009e99c9b2be8aaac84d1a Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 8 Sep 2021 18:08:51 +0200 Subject: [PATCH 0993/1889] return an error in case a user try to sort with :desc --- milli/src/search/criteria/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 782fedcc8..fca159900 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -15,7 +15,7 @@ use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::criterion::{AscDesc as AscDescName, Member}; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; -use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; +use crate::{DocumentId, FieldId, Index, Result, 
TreeLevel, UserError};
 
 mod asc_desc;
 mod attribute;
@@ -311,7 +311,7 @@ impl<'t> CriteriaBuilder<'t> {
                         point.clone(),
                     )?),
                     AscDescName::Desc(Member::Geo(_point)) => {
-                        panic!("You can't desc geosort"); // TODO: TAMO: remove this
+                        return Err(UserError::InvalidSortName { name: "Sorting in descending order is currently not supported for the geosearch".to_string() })?
                     }
                 };
             }

From bad8ea47d5884eaf862adacf33377bf8d8672049 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 8 Sep 2021 18:12:10 +0200
Subject: [PATCH 0994/1889] edit the two last TODO comments

---
 milli/src/update/index_documents/extract/extract_geo_points.rs | 3 ++-
 milli/src/update/index_documents/mod.rs                        | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index c4bdce211..1af22d010 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -33,7 +33,8 @@ pub fn extract_geo_points(
             let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
             writer.insert(docid_bytes, bytes)?;
         } else {
-            let primary_key = obkv.get(primary_key_id).unwrap(); // TODO: TAMO: is this valid?
+            // All documents must have a primary key so we can unwrap safely here
+            let primary_key = obkv.get(primary_key_id).unwrap();
             let primary_key =
                 serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?;
             Err(UserError::InvalidGeoField { document_id: primary_key, object: point })?
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 38eea954b..d3b8e47b0 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -229,7 +229,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         ) = crossbeam_channel::unbounded();
 
         // get the primary key field id
-        let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); // TODO: TAMO: is this unwrap 100% valid?
+        let primary_key_id = fields_ids_map.id(&primary_key).unwrap();
 
         // get searchable fields for word databases
         let searchable_fields =

From c81ff22c5b980eab5a5b41bb945abac093f31b4e Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 8 Sep 2021 19:17:00 +0200
Subject: [PATCH 0995/1889] delete the invalid criterion name error in favor
 of invalid ranking rule name

---
 milli/src/criterion.rs | 4 ++--
 milli/src/error.rs     | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index 29c477473..e05829eb4 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -59,10 +59,10 @@ impl FromStr for Criterion {
                 Err(UserError::InvalidRankingRuleName { name: text.to_string() })?
            }
                Err(UserError::InvalidAscDescSyntax { name }) => {
-                    Err(UserError::InvalidCriterionName { name }.into())
+                    Err(UserError::InvalidRankingRuleName { name }.into())
                }
                Err(error) => {
-                    Err(UserError::InvalidCriterionName { name: error.to_string() }.into())
+                    Err(UserError::InvalidRankingRuleName { name: error.to_string() }.into())
                }
            },
        }
diff --git a/milli/src/error.rs b/milli/src/error.rs
index f4601ea9a..157fe4be9 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -60,7 +60,6 @@ pub enum UserError {
     Csv(csv::Error),
     DocumentLimitReached,
     InvalidAscDescSyntax { name: String },
-    InvalidCriterionName { name: String },
     InvalidDocumentId { document_id: Value },
     InvalidFacetsDistribution { invalid_facets_name: HashSet<String> },
     InvalidFilter(pest::error::Error),
@@ -230,7 +229,6 @@ impl fmt::Display for UserError {
             Self::InvalidAscDescSyntax { name } => {
                 write!(f, "invalid asc/desc syntax for {}", name)
             }
-            Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name),
             Self::InvalidGeoField { document_id, object } => write!(
                 f,
                 "the document with the id: {} contains an invalid _geo field: {}",

From a84f3a8b317eecfc09ce9361e64c0ea5171da6d0 Mon Sep 17 00:00:00 2001
From: Irevoire
Date: Thu, 9 Sep 2021 12:20:08 +0200
Subject: [PATCH 0996/1889] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clément Renault
---
 http-ui/src/main.rs                           |  4 +--
 milli/src/criterion.rs                        | 18 ++++++----
 milli/src/error.rs                            |  4 +--
 milli/src/index.rs                            | 17 +++++----
 milli/src/lib.rs                              |  2 +-
 milli/src/search/criteria/geo.rs              |  8 +++--
 milli/src/search/criteria/mod.rs              |  4 ++-
 milli/src/search/facet/filter_condition.rs    | 36 ++++++++++---------
 milli/src/update/delete_documents.rs          |  7 ++--
 .../extract/extract_geo_points.rs             |  5 ++-
 .../src/update/index_documents/extract/mod.rs |  9 ++---
 milli/src/update/index_documents/mod.rs       | 17 +++++----
 .../src/update/index_documents/typed_chunk.rs | 15 ++++----
 13 files changed, 77 insertions(+), 69 deletions(-)

diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 89f3dcab2..5dbb0c326 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -695,7 +695,7 @@ async fn main() -> anyhow::Result<()> {
     struct QueryBody {
         query: Option<String>,
         filters: Option<String>,
-        sorters: Option<String>,
+        sort: Option<String>,
         facet_filters: Option<Vec<UntaggedEither<Vec<String>, String>>>,
         facet_distribution: Option<bool>,
         limit: Option<usize>,
@@ -755,7 +755,7 @@ async fn main() -> anyhow::Result<()> {
                     search.limit(limit);
                 }
 
-                if let Some(sort) = query.sorters {
+                if let Some(sort) = query.sort {
                     search.sort_criteria(vec![sort.parse().unwrap()]);
                 }
 
diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index e05829eb4..b95080b4b 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -84,12 +84,16 @@ impl FromStr for Member {
                 text.strip_prefix("_geoPoint(")
                     .and_then(|point| point.strip_suffix(")"))
                     .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() })?;
-            let point = point
-                .split(',')
-                .map(|el| el.trim().parse())
-                .collect::<Result<Vec<f64>, _>>()
-                .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() })?;
-            Ok(Member::Geo([point[0], point[1]]))
+            let (lat, long) = point
+                .split_once(',')
+                .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() })
+                .and_then(|(lat, long)| {
+                    lat.trim()
+                        .parse()
+                        .and_then(|lat| long.trim().parse().map(|long| (lat, long)))
+                        .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() })
+                })?;
+            Ok(Member::Geo([lat, long]))
         } else {
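            // A worked example of the `_geoPoint` branch above (illustrative only, not part of the patch):
            // "_geoPoint(43.6, 3.9)" -> strip_prefix/strip_suffix leave "43.6, 3.9",
            // split_once(',') yields ("43.6", " 3.9"), and trimming + parsing both
            // halves produces Member::Geo([43.6, 3.9]).
            // Anything that doesn't look like a `_geoPoint(..)` call falls through
            // to the plain field branch below: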
Ok(Member::Field(text.to_string()))
        }
@@ -99,7 +103,7 @@ impl fmt::Display for Member {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
-            Member::Field(name) => write!(f, "{}", name),
+            Member::Field(name) => f.write_str(name),
             Member::Geo([lat, lng]) => write!(f, "_geoPoint({}, {})", lat, lng),
         }
     }
diff --git a/milli/src/error.rs b/milli/src/error.rs
index 157fe4be9..21b77b5a7 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -12,10 +12,8 @@ use crate::{DocumentId, FieldId};
 
 pub type Object = Map<String, Value>;
 
-const RESERVED_KEYWORD: &[&'static str] = &["_geo", "_geoDistance"];
-
 pub fn is_reserved_keyword(keyword: &str) -> bool {
-    RESERVED_KEYWORD.contains(&keyword)
+    ["_geo", "_geoDistance"].contains(&keyword)
 }
 
 #[derive(Debug)]
diff --git a/milli/src/index.rs b/milli/src/index.rs
index f2ddba699..4ab19f175 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -299,6 +299,7 @@ impl Index {
 
     /* geo rtree */
 
+    /// Writes the provided `rtree` which associates coordinates to documents ids.
     pub(crate) fn put_geo_rtree(
         &self,
         wtxn: &mut RwTxn,
@@ -307,10 +308,12 @@ impl Index {
         self.main.put::<_, Str, SerdeBincode<RTree<GeoPoint>>>(wtxn, main_key::GEO_RTREE_KEY, rtree)
     }
 
+    /// Delete the `rtree` which associates coordinates to documents ids.
     pub(crate) fn delete_geo_rtree(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
         self.main.delete::<_, Str>(wtxn, main_key::GEO_RTREE_KEY)
     }
 
+    /// Returns the `rtree` which associates coordinates to documents ids.
     pub fn geo_rtree<'t>(&self, rtxn: &'t RoTxn) -> Result<Option<RTree<GeoPoint>>> {
         match self
             .main
@@ -323,7 +326,7 @@ impl Index {
 
     /* geo faceted */
 
-    /// Writes the documents ids that are faceted with a _geo field
+    /// Writes the documents ids that are faceted with a _geo field.
     pub(crate) fn put_geo_faceted_documents_ids(
         &self,
         wtxn: &mut RwTxn,
@@ -336,16 +339,12 @@ impl Index {
         )
     }
 
-    /// Delete the documents ids that are faceted with a _geo field
-    pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result<()> {
-        self.main.put::<_, Str, RoaringBitmapCodec>(
-            wtxn,
-            main_key::GEO_FACETED_DOCUMENTS_IDS_KEY,
-            &RoaringBitmap::new(),
-        )
+    /// Delete the documents ids that are faceted with a _geo field.
+    pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)
     }
 
-    /// Retrieve all the documents ids that are faceted with a _geo field
+    /// Retrieve all the documents ids that are faceted with a _geo field.
     pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
         match self
             .main
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index a3cede1fd..3c7713308 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -142,7 +142,7 @@ where
     Some((head, tail))
 }
 
-/// Return the distance between two points in meters
+/// Return the distance between two points in meters.
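+/// (As a rough sanity check on the unit: two points one degree of latitude
+/// apart should come out at roughly 111 km, i.e. about 111,000 meters.)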
 fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 {
     let a = haversine::Location { latitude: a[0], longitude: a[1] };
     let b = haversine::Location { latitude: b[0], longitude: b[1] };
diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs
index 6f8f1406a..78f741b57 100644
--- a/milli/src/search/criteria/geo.rs
+++ b/milli/src/search/criteria/geo.rs
@@ -1,3 +1,5 @@
+use std::iter;
+
 use roaring::RoaringBitmap;
 use rstar::RTree;
 
@@ -23,7 +25,7 @@ impl<'t> Geo<'t> {
         parent: Box<dyn Criterion + 't>,
         point: [f64; 2],
     ) -> Result<Self> {
-        let candidates = Box::new(std::iter::empty());
+        let candidates = Box::new(iter::empty());
         let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?;
         let bucket_candidates = RoaringBitmap::new();
         let rtree = index.geo_rtree(rtxn)?;
@@ -41,7 +43,7 @@ impl<'t> Geo<'t> {
     }
 }
 
-impl<'t> Criterion for Geo<'t> {
+impl Criterion for Geo<'_> {
     fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
         // if there is no rtree we have nothing to returns
         let rtree = match self.rtree.as_ref() {
@@ -108,7 +110,7 @@ fn geo_point(
     let results = rtree
         .nearest_neighbor_iter(&point)
         .filter_map(move |point| candidates.contains(point.data).then(|| point.data))
-        .map(|id| std::iter::once(id).collect::<RoaringBitmap>())
+        .map(|id| iter::once(id).collect::<RoaringBitmap>())
         .collect::<Vec<_>>();
 
     Box::new(results.into_iter())
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs
index fca159900..105a69194 100644
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@@ -311,7 +311,9 @@ impl<'t> CriteriaBuilder<'t> {
                         point.clone(),
                     )?),
                     AscDescName::Desc(Member::Geo(_point)) => {
-                        return Err(UserError::InvalidSortName { name: "Sorting in descending order is currently not supported for the geosearch".to_string() })?
+                        return Err(UserError::InvalidSortName {
+                            name: "Sorting in descending order is currently not supported for the geosearch".to_string(),
+                        })?
                     }
                 };
             }
diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index 08a84899f..1e5bf9ad0 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -21,7 +21,9 @@ use crate::error::UserError;
 use crate::heed_codec::facet::{
     FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
 };
-use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result};
+use crate::{
+    distance_between_two_points, CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result,
+};
 
 #[derive(Debug, Clone, PartialEq)]
 pub enum Operator {
@@ -198,10 +200,10 @@ impl FilterCondition {
                 ErrorVariant::CustomError {
                     message: format!("The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`"),
                 },
-                // we want to point to the last parameters and if there were no parameters we
+                // we want to point to the last parameters and if there were no parameters we
                 // point to the parenthesis
-                parameters.last().map(|param| param.1.clone()).unwrap_or(param_span),
-            )))?;
+                parameters.last().map(|param| param.1.clone()).unwrap_or(param_span),
+                )))?;
         }
         let (lat, lng, distance) = (&parameters[0], &parameters[1], parameters[2].0);
         if let Some(span) = (!(-181.0..181.).contains(&lat.0)
@@ -505,17 +507,18 @@ impl FilterCondition {
             LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)),
             Between(left, right) => (Included(*left), Included(*right)),
             GeoLowerThan(base_point, distance) => {
-                let mut result = RoaringBitmap::new();
                 let rtree = match index.geo_rtree(rtxn)?
{ Some(rtree) => rtree, - None => return Ok(result), + None => return Ok(RoaringBitmap::new()), }; - rtree + let result = rtree .nearest_neighbor_iter(base_point) .take_while(|point| { - dbg!(crate::distance_between_two_points(base_point, point.geom())) - < *distance + distance_between_two_points(base_point, point.geom()) < *distance }) - .for_each(|point| drop(result.insert(point.data))); + .map(|point| point.data) + .collect(); return Ok(result); } @@ -600,14 +601,15 @@ fn field_id( let key = items.next().unwrap(); if key.as_rule() == Rule::reserved { return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression. Available filterable attributes are: {}", + ErrorVariant::CustomError { + message: format!( + "`{}` is a reserved keyword and therefore can't be used as a filter expression. \ + Available filterable attributes are: {}", key.as_str(), filterable_fields.iter().join(", "), - ), - }, - key.as_span(), + ), + }, + key.as_span(), )); } @@ -691,7 +693,7 @@ mod tests { assert!(result.is_err()); let error = result.unwrap_err(); assert!(error.to_string().contains( - "`_geo` is a reserved keyword and thus can't be used as a filter expression." + "`_geo` is a reserved keyword and therefore can't be used as a filter expression." )); } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 639770bd6..b49cdc3cd 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -383,15 +383,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? { let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; - let points_to_remove: Vec<_> = rtree + let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree .iter() .filter(|&point| self.documents_ids.contains(point.data)) .cloned() - .collect(); + .map(|point| (point, point.data)) + .unzip(); points_to_remove.iter().for_each(|point| { rtree.remove(&point); - geo_faceted_doc_ids.remove(point.data); }); + geo_faceted_doc_ids -= docids_to_remove; self.index.put_geo_rtree(self.wtxn, &rtree)?; self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 1af22d010..a36b608ee 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -22,11 +22,10 @@ pub fn extract_geo_points( while let Some((docid_bytes, value)) = obkv_documents.next()? 
 {
         let obkv = obkv::KvReader::new(value);
-        let point = match obkv.get(geo_field_id) {
-            Some(point) => point,
+        let point: Value = match obkv.get(geo_field_id) {
+            Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?,
             None => continue,
         };
-        let point: Value = serde_json::from_slice(point).map_err(InternalError::SerdeJson)?;
 
         if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) {
             // this will create an array of 16 bytes (two 8 bytes floats)
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 36e3c870f..47a62be67 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -189,12 +189,9 @@ fn extract_documents_data(
         let documents_chunk_cloned = documents_chunk.clone();
         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
         rayon::spawn(move || {
-            let _ = match extract_geo_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                geo_field_id,
-            ) {
+            let result =
+                extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_field_id);
+            let _ = match result {
                 Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
                 Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
             };
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index d3b8e47b0..336165894 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -237,12 +237,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         // get filterable fields for facet databases
         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
         // get the fid of the `_geo` field.
-        let geo_field_id = if let Some(gfid) = self.index.fields_ids_map(self.wtxn)?.id("_geo") {
-            (self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid)
-                || self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid))
-            .then(|| gfid)
-        } else {
-            None
+        let geo_field_id = match self.index.fields_ids_map(self.wtxn)?.id("_geo") {
+            Some(gfid) => {
+                let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
+                let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
+                if is_sortable || is_filterable {
+                    Some(gfid)
+                } else {
+                    None
+                }
+            }
+            None => None,
         };
 
         let stop_words = self.index.stop_words(self.wtxn)?;
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 5c27c195f..b17f28b66 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::convert::TryInto;
 use std::fs::File;
 
 use heed::types::ByteSlice;
@@ -11,7 +12,7 @@ use super::helpers::{
 };
 use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
 use crate::update::index_documents::helpers::into_clonable_grenad;
-use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, GeoPoint, Index, Result};
+use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result};
 
 pub(crate) enum TypedChunk {
     DocidWordPositions(grenad::Reader<File>),
@@ -180,24 +181,22 @@ pub(crate) fn write_typed_chunk_into_index(
             is_merged_database = true;
         }
         TypedChunk::GeoPoints(mut geo_points) => {
             let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
-            let mut doc_ids = index.geo_faceted_documents_ids(wtxn)?;
+            let mut
geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
 
             while let Some((key, value)) = geo_points.next()? {
                 // convert the key back to a u32 (4 bytes)
-                let (key, _) = helpers::try_split_array_at::<u8, 4>(key).unwrap();
-                let key = u32::from_be_bytes(key);
+                let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
 
                 // convert the latitude and longitude back to a f64 (8 bytes)
                 let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
                 let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
                 let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
-                rtree.insert(GeoPoint::new(point, key));
-                doc_ids.insert(key);
+                rtree.insert(GeoPoint::new(point, docid));
+                geo_faceted_docids.insert(docid);
             }
             index.put_geo_rtree(wtxn, &rtree)?;
-            index.put_geo_faceted_documents_ids(wtxn, &doc_ids)?;
+            index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
         }

From 3fc145c25447f44c4638c127937594bf0f406922 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Thu, 9 Sep 2021 15:19:47 +0200
Subject: [PATCH 0997/1889] if we have no rtree we return all other provided
 documents

---
 milli/src/lib.rs                 |  5 +++--
 milli/src/search/criteria/geo.rs | 14 +++++++-------
 milli/src/search/criteria/mod.rs |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 4f066365c..fc27b9d72 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -142,8 +142,9 @@ where
     Some((head, tail))
 }
 
-/// Return the distance between two points in meters.
-fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 {
+/// Return the distance between two points in meters. Each point is composed of two f64,
+/// one latitude and one longitude.
+pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 {
diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs
index 78f741b57..c9dff307b 100644
--- a/milli/src/search/criteria/geo.rs
+++ b/milli/src/search/criteria/geo.rs
@@ -45,11 +45,7 @@ impl<'t> Geo<'t> {
 
 impl Criterion for Geo<'_> {
     fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
-        // if there is no rtree we have nothing to returns
-        let rtree = match self.rtree.as_ref() {
-            Some(rtree) => rtree,
-            None => return Ok(None),
-        };
+        let rtree = self.rtree.as_ref();
 
         loop {
             match self.candidates.next() {
@@ -92,8 +88,12 @@ impl Criterion for Geo<'_> {
                             continue;
                         }
                         self.allowed_candidates = &candidates - params.excluded_candidates;
-                        self.candidates =
-                            geo_point(rtree, self.allowed_candidates.clone(), self.point);
+                        self.candidates = match rtree {
+                            Some(rtree) => {
+                                geo_point(rtree, self.allowed_candidates.clone(), self.point)
+                            }
+                            None => Box::new(std::iter::empty()),
+                        };
                     }
                     None => return Ok(None),
                 },
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs
index 105a69194..3c9485012 100644
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@@ -312,7 +312,7 @@ impl<'t> CriteriaBuilder<'t> {
                     )?),
                     AscDescName::Desc(Member::Geo(_point)) => {
                         return Err(UserError::InvalidSortName {
-                            name: "Sorting in descending order is currently not supported for the geosearch".to_string(),
+                            name: "sorting in descending order is not supported for the geosearch".to_string(),
                        })?
                    }
                };
            }

From 26deeb45a352d62c7dd44c866e8c9213b7f52e48 Mon Sep 17 00:00:00 2001
From: many
Date: Thu, 9 Sep 2021 17:49:04 +0200
Subject: [PATCH 0998/1889] Add missing parameters to the word level position builder

---
 milli/src/update/index_documents/mod.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 7800ae55a..861a5489e 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -416,6 +416,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             let mut builder = WordsLevelPositions::new(self.wtxn, self.index);
             builder.chunk_compression_type = self.chunk_compression_type;
             builder.chunk_compression_level = self.chunk_compression_level;
+            builder.max_nb_chunks = self.max_nb_chunks;
+            builder.max_memory = self.max_memory;
             if let Some(value) = self.words_positions_level_group_size {
                 builder.level_group_size(value);
             }

From cfc62a1c15c9dc2c0d2d10697430e3ae666a4f19 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Thu, 9 Sep 2021 18:11:38 +0200
Subject: [PATCH 0999/1889] use geoutils instead of haversine

---
 milli/Cargo.toml | 2 +-
 milli/src/lib.rs | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index be507332e..171a7ec4c 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -16,7 +16,7 @@ flate2 = "1.0.20"
 fst = "0.4.5"
 fxhash = "0.2.1"
 grenad = { version = "0.3.1", default-features = false, features = ["tempfile"] }
-haversine = "0.2.1"
+geoutils = "0.4.1"
 heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index fc27b9d72..7c9f56665 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -145,10 +145,10 @@ where
 /// Return the distance between two points in meters. Each point is composed of two f64,
 /// one latitude and one longitude.
 pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 {
-    let a = haversine::Location { latitude: a[0], longitude: a[1] };
-    let b = haversine::Location { latitude: b[0], longitude: b[1] };
+    let a = geoutils::Location::new(a[0], a[1]);
+    let b = geoutils::Location::new(b[0], b[1]);
 
-    haversine::distance(a, b, haversine::Units::Kilometers) * 1000.
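+    // geoutils' `haversine_distance_to` returns a `Distance` expressed directly in
+    // meters, so the kilometers-to-meters conversion above is no longer needed
+    // (illustrative note: a 1.5 km haversine distance simply reads as 1500.0 from `.meters()`).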
+    a.haversine_distance_to(&b).meters()
 }

From f167f7b412a9ec96e93725c8a9677785cb94b681 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?=
Date: Fri, 10 Sep 2021 09:48:17 +0200
Subject: [PATCH 1000/1889] Update version for the next release (v0.13.1)

---
 helpers/Cargo.toml | 2 +-
 http-ui/Cargo.toml | 2 +-
 infos/Cargo.toml   | 2 +-
 milli/Cargo.toml   | 2 +-
 search/Cargo.toml  | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index c319bfdab..ebee3c271 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.13.0"
+version = "0.13.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index a05050f41..9f5cc2287 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.13.0"
+version = "0.13.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/infos/Cargo.toml b/infos/Cargo.toml
index bf1b24a05..421729de2 100644
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.13.0"
+version = "0.13.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index c6fe3ea95..98eac15a9 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.13.0"
+version = "0.13.1"
 authors = ["Kerollmops "]
 edition = "2018"

diff --git a/search/Cargo.toml b/search/Cargo.toml
index c7de7f03f..da976a0cb 100644
--- a/search/Cargo.toml
+++ b/search/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "search"
-version = "0.13.0"
+version = "0.13.1"
 authors = ["Clément Renault "]
 edition = "2018"

From a43f99c600066452092eee35ab396bab1c69bafc Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 13 Sep 2021 14:00:56 +0200
Subject: [PATCH 1001/1889] Inform users that documents must have an id in
 their documents

---
 README.md | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 58e781d83..674c1919e 100644
--- a/README.md
+++ b/README.md
@@ -40,13 +40,12 @@ All of that on a 39$/month machine with 4cores.
 You can feed the engine with your CSV (comma-seperated, yes) data like this:
 
 ```bash
-printf "name,age\nhello,32\nkiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv
+printf "id,name,age\n1,hello,32\n2,kiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv
 ```
 
-Here ids will be automatically generated as UUID v4 if they doesn't exist in some or every documents.
-
-Note that it also support JSON and JSON streaming, you can send them to the engine by using
-the `content-type:application/json` and `content-type:application/x-ndjson` headers respectively.
+Don't forget to specify the `id` of the documents. Also Note that it also support JSON and
+JSON streaming, you can send them to the engine by using the `content-type:application/json`
+and `content-type:application/x-ndjson` headers respectively.
### Querying the engine via the website From 2741aa8589cc69bb51a6e7f530f52a276b60366a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 13 Sep 2021 16:06:45 +0200 Subject: [PATCH 1002/1889] Update the indexing timings in the README --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 674c1919e..07071183e 100644 --- a/README.md +++ b/README.md @@ -32,10 +32,10 @@ cargo run --release -- --db my-database.mdb -vvv --indexing-jobs 8 ### Index your documents It can index a massive amount of documents in not much time, I already achieved to index: - - 115m songs (song and artist name) in ~1h and take 107GB on disk. - - 12m cities (name, timezone and country ID) in 15min and take 10GB on disk. + - 115m songs (song and artist name) in \~48min and take 81GiB on disk. + - 12m cities (name, timezone and country ID) in \~4min and take 6GiB on disk. -All of that on a 39$/month machine with 4cores. +These metrics are done on a MacBook Pro with the M1 processor. You can feed the engine with your CSV (comma-seperated, yes) data like this: @@ -43,9 +43,9 @@ You can feed the engine with your CSV (comma-seperated, yes) data like this: printf "id,name,age\n1,hello,32\n2,kiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv ``` -Don't forget to specify the `id` of the documents. Also Note that it also support JSON and -JSON streaming, you can send them to the engine by using the `content-type:application/json` -and `content-type:application/x-ndjson` headers respectively. +Don't forget to specify the `id` of the documents. Also, note that it supports JSON and JSON +streaming: you can send them to the engine by using the `content-type:application/json` and +`content-type:application/x-ndjson` headers respectively. 
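For example (an illustrative sketch, assuming a local instance on the default port shown above), the same two documents could be sent as NDJSON like this:

```bash
printf '{"id":1,"name":"hello","age":32}\n{"id":2,"name":"kiki","age":24}\n' | http POST 127.0.0.1:9700/documents content-type:application/x-ndjson
```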
### Querying the engine via the website

From 91ce4d1721765cb5862f6cf531fea0d2021021f0 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Mon, 13 Sep 2021 12:42:06 +0200
Subject: [PATCH 1003/1889] Stop iterating through the whole list of points

We stop when there are no possible candidates left
---
 milli/src/search/criteria/geo.rs | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs
index c9dff307b..9629a4a15 100644
--- a/milli/src/search/criteria/geo.rs
+++ b/milli/src/search/criteria/geo.rs
@@ -104,14 +104,18 @@
 fn geo_point(
     rtree: &RTree<GeoPoint>,
-    candidates: RoaringBitmap,
+    mut candidates: RoaringBitmap,
     point: [f64; 2],
 ) -> Box<dyn Iterator<Item = RoaringBitmap>> {
-    let results = rtree
-        .nearest_neighbor_iter(&point)
-        .filter_map(move |point| candidates.contains(point.data).then(|| point.data))
-        .map(|id| iter::once(id).collect::<RoaringBitmap>())
-        .collect::<Vec<_>>();
+    let mut results = Vec::new();
+    for point in rtree.nearest_neighbor_iter(&point) {
+        if candidates.remove(point.data) {
+            results.push(std::iter::once(point.data).collect());
+            if candidates.is_empty() {
+                break;
+            }
+        }
+    }
 
     Box::new(results.into_iter())
 }

From c695a1ffd2306e663cf027d1727dca4e8c0c1503 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Mon, 13 Sep 2021 14:27:14 +0200
Subject: [PATCH 1004/1889] add the possibility to sort by descending order on
 geoPoint

---
 milli/src/search/criteria/geo.rs | 39 ++++++++++++++++++++++++++++----
 milli/src/search/criteria/mod.rs | 15 ++++++------
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs
index 9629a4a15..de6de8912 100644
--- a/milli/src/search/criteria/geo.rs
+++ b/milli/src/search/criteria/geo.rs
@@ -10,6 +10,7 @@ use crate::{GeoPoint, Index, Result};
 pub struct Geo<'t> {
     index: &'t Index,
     rtxn: &'t heed::RoTxn<'t>,
+    ascending: bool,
     parent: Box<dyn Criterion + 't>,
     candidates: Box<dyn Iterator<Item = RoaringBitmap>>,
@@ -19,11 +20,30 @@ pub struct Geo<'t> {
 }
 
 impl<'t> Geo<'t> {
-    pub fn new(
+    pub fn asc(
         index: &'t Index,
         rtxn: &'t heed::RoTxn<'t>,
         parent: Box<dyn Criterion + 't>,
         point: [f64; 2],
+    ) -> Result<Self> {
+        Self::new(index, rtxn, parent, point, true)
+    }
+
+    pub fn desc(
+        index: &'t Index,
+        rtxn: &'t heed::RoTxn<'t>,
+        parent: Box<dyn Criterion + 't>,
+        point: [f64; 2],
+    ) -> Result<Self> {
+        Self::new(index, rtxn, parent, point, false)
+    }
+
+    fn new(
+        index: &'t Index,
+        rtxn: &'t heed::RoTxn<'t>,
+        parent: Box<dyn Criterion + 't>,
+        point: [f64; 2],
+        ascending: bool,
     ) -> Result<Self> {
         let candidates = Box::new(iter::empty());
         let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?;
@@ -33,6 +53,7 @@
         Ok(Self {
             index,
             rtxn,
+            ascending,
             parent,
             candidates,
             allowed_candidates,
@@ -89,9 +110,12 @@
                         }
                         self.allowed_candidates = &candidates - params.excluded_candidates;
                         self.candidates = match rtree {
-                            Some(rtree) => {
-                                geo_point(rtree, self.allowed_candidates.clone(), self.point)
-                            }
+                            Some(rtree) => geo_point(
+                                rtree,
+                                self.allowed_candidates.clone(),
+                                self.point,
+                                self.ascending,
+                            ),
                             None => Box::new(std::iter::empty()),
                         };
                     }
@@ -106,6 +130,7 @@
 fn geo_point(
     rtree: &RTree<GeoPoint>,
     mut candidates: RoaringBitmap,
     point: [f64; 2],
+    ascending: bool,
 ) -> Box<dyn Iterator<Item = RoaringBitmap>> {
     let mut results = Vec::new();
     for point in rtree.nearest_neighbor_iter(&point) {
@@ -117,5 +142,9 @@
         }
     }
 
-    Box::new(results.into_iter())
+    if ascending {
+        Box::new(results.into_iter())
+    } else {
+        Box::new(results.into_iter().rev())
+    }
 }
diff --git
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3c9485012..c2de55de5 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -15,7 +15,7 @@ use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::criterion::{AscDesc as AscDescName, Member}; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; -use crate::{DocumentId, FieldId, Index, Result, TreeLevel, UserError}; +use crate::{DocumentId, FieldId, Index, Result, TreeLevel}; mod asc_desc; mod attribute; @@ -304,17 +304,18 @@ impl<'t> CriteriaBuilder<'t> { criterion, field.to_string(), )?), - AscDescName::Asc(Member::Geo(point)) => Box::new(Geo::new( + AscDescName::Asc(Member::Geo(point)) => Box::new(Geo::asc( + &self.index, + &self.rtxn, + criterion, + point.clone(), + )?), + AscDescName::Desc(Member::Geo(point)) => Box::new(Geo::desc( &self.index, &self.rtxn, criterion, point.clone(), )?), - AscDescName::Desc(Member::Geo(_point)) => { - return Err(UserError::InvalidSortName { - name: "sorting in descending order is not supported for the geosearch".to_string(), - })? - } }; } criterion From f6c6b026bb71e20c6a41287ece8dff9bf02bc0d8 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 16 Sep 2021 11:25:51 +0200 Subject: [PATCH 1005/1889] improve the comparison script --- benchmarks/scripts/compare.sh | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 506c94015..84d1dc0e6 100755 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -17,21 +17,9 @@ if [[ "$?" -ne 0 ]]; then exit 1 fi -if [[ $# -ne 2 ]] - then - echo 'Need 2 arguments.' - echo 'Usage: ' - echo ' $ ./compare.sh old new' - echo 'Ex:' - echo ' $ ./compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json' - exit 1 -fi - -old_file="$1" -new_file="$2" s3_url='https://milli-benchmarks.fra1.digitaloceanspaces.com/critcmp_results' -for file in $old_file $new_file +for file in $@ do file_s3_url="$s3_url/$file" file_local_path="/tmp/$file" @@ -45,6 +33,12 @@ do fi done -# Print the diff changes between the old and new benchmarks -# by only displaying the lines that have a diff of more than 5%. -critcmp --threshold 5 "/tmp/$old_file" "/tmp/$new_file" +path_list=$(echo " $@" | sed 's/ / \/tmp\//g') + +if [[ ${#@} -gt 1 ]]; then + # Print the diff changes between the old and new benchmarks + # by only displaying the lines that have a diff of more than 5%. 
+ critcmp --threshold 5 $path_list +else + critcmp $path_list +fi From 5e683ba472e00421594658a2a5c90c73f26b8514 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 Sep 2021 18:08:28 +0200 Subject: [PATCH 1006/1889] add benchmarks for the geosearch --- .github/workflows/benchmarks.yml | 2 +- benchmarks/Cargo.toml | 4 + benchmarks/README.md | 32 ++++++-- benchmarks/benches/indexing.rs | 59 ++++++++++++++- benchmarks/benches/search_geo.rs | 123 +++++++++++++++++++++++++++++++ benchmarks/benches/utils.rs | 11 ++- benchmarks/build.rs | 3 +- 7 files changed, 222 insertions(+), 12 deletions(-) create mode 100644 benchmarks/benches/search_geo.rs diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index c64c6a64b..7a9fbb5de 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -4,7 +4,7 @@ on: workflow_dispatch: inputs: dataset_name: - description: 'The name of the dataset used to benchmark (search_songs, search_wiki or indexing)' + description: 'The name of the dataset used to benchmark (search_songs, search_wiki, search_geo or indexing)' required: false default: 'search_songs' diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 9e380b9a8..b598f2f6f 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -29,6 +29,10 @@ harness = false name = "search_wiki" harness = false +[[bench]] +name = "search_geo" +harness = false + [[bench]] name = "indexing" harness = false diff --git a/benchmarks/README.md b/benchmarks/README.md index 16838e488..7a387dfdd 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -36,7 +36,7 @@ To run all the benchmarks (~5h): cargo bench ``` -To run only the `songs` (~1h), `wiki` (~3h) or `indexing` (~4h) benchmark: +To run only the `search_songs` (~1h), `search_wiki` (~3h), `search_geo` (~20m) or `indexing` (~2h) benchmark: ```bash cargo bench --bench <dataset> @@ -47,7 +47,7 @@ If you don't want to download the datasets every time you update something on th ```bash mkdir ~/datasets -MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the three datasets are downloaded +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench search_songs # the four datasets are downloaded touch build.rs MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is compiled again but the datasets are not downloaded ``` @@ -81,14 +81,15 @@ Run the comparison script: ## Datasets -The benchmarks are available for the following datasets: -- `songs` -- `wiki` +The benchmarks use the following datasets: +- `smol-songs` +- `smol-wiki` - `movies` +- `smol-all-countries` ### Songs -`songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz). +`smol-songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz). It was generated with this command: ```bash xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv ``` -_[Download the generated `songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._ +_[Download the generated `smol-songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._ ### Wiki -`wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz).
+`smol-wiki` is a subset of the [`wikipedia-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz). It was generated with the following command: @@ -108,9 +109,24 @@ It was generated with the following command: xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv ``` +_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki.csv.gz)._ + ### Movies `movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/) _[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._ + +### All Countries + +`smol-all-countries` is a subset of the [`all-countries.csv` dataset]() +It has been converted to jsonlines and then edited so it matches our format for the `_geo` field. + +It was generated with the following command: +```bash +bat all-countries.csv.gz | gunzip | xsv sample --seed 42 1000000 | csv2json-lite | sd '"latitude":"(.*?)","longitude":"(.*?)"' '"_geo": { "lat": $1, "lng": $2 }' | sd '\[|\]|,$' '' | gzip > smol-all-countries.jsonl.gz +``` + +_[Download the `smol-all-countries` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-all-countries.jsonl.gz)._ + diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index bd056ea23..30532aef8 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -277,12 +277,69 @@ fn indexing_movies_default(c: &mut Criterion) { }); } +fn indexing_geo(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing geo_point", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + builder.set_primary_key("geonameid".to_owned()); + let displayed_fields = + ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let filterable_fields = + ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(filterable_fields); + + let sortable_fields = + ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_sortable_fields(sortable_fields); + + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + index + }, + move |index| { + let update_builder = UpdateBuilder::new(0); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.index_documents(&mut wtxn, &index); + + builder.update_format(UpdateFormat::JsonStream); + builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); + let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!( + "could not find the dataset in: {}", + datasets_paths::SMOL_ALL_COUNTRIES + )); + builder.execute(reader, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + criterion_group!( benches, indexing_songs_default, indexing_songs_without_faceted_numbers, indexing_songs_without_faceted_fields, indexing_wiki, indexing_movies_default, 
+ indexing_geo ); criterion_main!(benches); diff --git a/benchmarks/benches/search_geo.rs b/benchmarks/benches/search_geo.rs new file mode 100644 index 000000000..1432f691b --- /dev/null +++ b/benchmarks/benches/search_geo.rs @@ -0,0 +1,123 @@ +mod datasets_paths; +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::{Settings, UpdateFormat}; +use utils::Conf; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = + ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let filterable_fields = + ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(filterable_fields); + + let sortable_fields = + ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); + builder.set_sortable_fields(sortable_fields); +} + +#[rustfmt::skip] +const BASE_CONF: Conf = Conf { + dataset: datasets_paths::SMOL_ALL_COUNTRIES, + dataset_format: UpdateFormat::JsonStream, + queries: &[ + "", + ], + configure: base_conf, + primary_key: Some("geonameid"), + ..Conf::BASE +}; + +fn bench_geo(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + // A basic placeholder with no geo + utils::Conf { + group_name: "placeholder with no geo", + ..BASE_CONF + }, + // Medium agglomeration: probably the most common use case + utils::Conf { + group_name: "asc sort from Lille", + sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):asc"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc sort from Lille", + sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):desc"]), + ..BASE_CONF + }, + // Big agglomeration: a lot of documents close to our point + utils::Conf { + group_name: "asc sort from Tokyo", + sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):asc"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc sort from Tokyo", + sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):desc"]), + ..BASE_CONF + }, + // The furthest point from any civilization + utils::Conf { + group_name: "asc sort from Point Nemo", + sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):asc"]), + ..BASE_CONF + }, + utils::Conf { + group_name: "desc sort from Point Nemo", + sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):desc"]), + ..BASE_CONF + }, + // Filters + utils::Conf { + group_name: "filter of 100km from Lille", + filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 100000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 1km from Lille", + filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 1000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 100km from Tokyo", + filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 100000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 1km from Tokyo", + filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 1000)"), + ..BASE_CONF + }, + utils::Conf { + group_name: "filter of 100km from Point Nemo", + filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 100000)"), + ..BASE_CONF + }, + utils::Conf 
{ + group_name: "filter of 1km from Point Nemo", + filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 1000)"), + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_geo); +criterion_main!(benches); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 5318527f4..72eac59d9 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -12,6 +12,8 @@ pub struct Conf<'a> { pub database_name: &'a str, /// the dataset to be used, it must be an uncompressed csv pub dataset: &'a str, + /// The format of the dataset + pub dataset_format: UpdateFormat, pub group_name: &'a str, pub queries: &'a [&'a str], /// here you can change which criterion are used and in which order. @@ -21,6 +23,7 @@ pub struct Conf<'a> { /// the last chance to configure your database as you want pub configure: fn(&mut Settings), pub filter: Option<&'a str>, + pub sort: Option<Vec<&'a str>>, /// enable or disable the optional words on the query pub optional_words: bool, /// primary key, if there is None we'll auto-generate docids for every document pub primary_key: Option<&'a str>, } impl Conf<'_> { pub const BASE: Self = Conf { database_name: "benches.mmdb", + dataset_format: UpdateFormat::Csv, dataset: "", group_name: "", queries: &[], criterion: None, configure: |_| (), filter: None, + sort: None, optional_words: true, primary_key: None, }; @@ -82,7 +87,7 @@ pub fn base_setup(conf: &Conf) -> Index { if let None = conf.primary_key { builder.enable_autogenerate_docids(); } - builder.update_format(UpdateFormat::Csv); + builder.update_format(conf.dataset_format); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); let reader = File::open(conf.dataset) .expect(&format!("could not find the dataset in: {}", conf.dataset)); @@ -110,6 +115,10 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let filter = FilterCondition::from_str(&rtxn, &index, filter).unwrap(); search.filter(filter); } + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + search.sort_criteria(sort); + } let _ids = search.execute().unwrap(); }); }); diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 47a14f25b..2495930bb 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -13,6 +13,7 @@ const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/dat const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv"); const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv"); const DATASET_MOVIES: (&str, &str) = ("movies", "json"); +const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); /// The name of the environment variable used to select the path /// of the directory containing the datasets @@ -32,7 +33,7 @@ fn main() -> anyhow::Result<()> { )?; writeln!(manifest_paths_file)?; - for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES] { + for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES, DATASET_GEO] { let out_path = out_dir.join(dataset); let out_file = out_path.with_extension(extension); From 9a920d1f93ccf7afb698285289b6b56bdd54d900 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 20 Sep 2021 10:37:38 +0200 Subject: [PATCH 1007/1889] Fix datasets links in the readme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Urquizar --- benchmarks/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/benchmarks/README.md b/benchmarks/README.md index 7a387dfdd..a6fdf9360 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -109,7 +109,7 @@ It was generated with the following command: xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv ``` -_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki.csv.gz)._ +_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._ ### Movies @@ -120,7 +120,7 @@ _[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._ ### All Countries -`smol-all-countries` is a subset of the [`all-countries.csv` dataset]() +`smol-all-countries` is a subset of the [`all-countries.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/all-countries.csv.gz) It has been converted to jsonlines and then edited so it matches our format for the `_geo` field. It was generated with the following command: From eaba772f214df658f6bf961b86ea01e94a4916a4 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 20 Sep 2021 10:51:04 +0200 Subject: [PATCH 1008/1889] update the README to better match the new critcmp usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Urquizar --- benchmarks/README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 16838e488..81447c4f2 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -54,7 +54,7 @@ MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench songs # the code is com ## Comparison between benchmarks -The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to generate comparison results between 2 benchmarks. +The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to show the result of a benchmark, or compare results between multiple benchmarks. We provide a script to download and display the comparison report. @@ -71,12 +71,18 @@ List the available file in the DO Space: ```bash songs_main_09a4321.json songs_geosearch_24ec456.json +search_songs_main_cb45a10b.json ``` Run the comparison script: ```bash +# we get the result of ONE benchmark, this gives you an idea of how much time an operation took +./benchmarks/scripts/compare.sh songs_geosearch_24ec456.json +# we compare two benchmarks ./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json +# we compare three benchmarks +./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json search_songs_main_cb45a10b.json ``` ## Datasets From 3b7a2cdbced36fe15fa7e19e7efd6e4687f3ea81 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 20 Sep 2021 16:10:39 +0200 Subject: [PATCH 1009/1889] fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 4ab19f175..f7603148d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -344,7 +344,7 @@ impl Index { self.main.delete::<_, Str>(wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY) } - /// Retrieve all the documents ids that faceted with a _geo field. + /// Retrieve all the documents ids that are faceted with a _geo field. 
pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> { match self .main From f4b8e5675d903161b262910126cbf6ebf461f89a Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 20 Sep 2021 17:21:02 +0200 Subject: [PATCH 1010/1889] move the reserved keyword logic for the criterion and sort + add test --- milli/src/criterion.rs | 73 +++++++++++++++++++++++++++++++++++++----- milli/src/error.rs | 2 +- 2 files changed, 66 insertions(+), 9 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index b95080b4b..eb9a6c86a 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -50,9 +50,6 @@ impl FromStr for Criterion { "sort" => Ok(Criterion::Sort), "exactness" => Ok(Criterion::Exactness), text => match AscDesc::from_str(text) { - Ok(AscDesc::Asc(Member::Field(field))) if is_reserved_keyword(&field) => { - Err(UserError::InvalidReservedRankingRuleName { name: text.to_string() })? - } Ok(AscDesc::Asc(Member::Field(field))) => Ok(Criterion::Asc(field)), Ok(AscDesc::Desc(Member::Field(field))) => Ok(Criterion::Desc(field)), Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { @@ -79,11 +76,8 @@ impl FromStr for Member { type Err = UserError; fn from_str(text: &str) -> Result<Self, Self::Err> { - if text.starts_with("_geoPoint(") { - let point = - text.strip_prefix("_geoPoint(") - .and_then(|point| point.strip_suffix(")")) - .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() })?; + if let Some(point) = text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) + { let (lat, long) = point .split_once(',') .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() }) .and_then(|(lat, long)| { lat.trim() .parse() .and_then(|lat| long.trim().parse().map(|long| (lat, long))) .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() }) })?; Ok(Member::Geo([lat, long])) } else { + if is_reserved_keyword(text) { + return Err(UserError::InvalidReservedRankingRuleName { name: text.to_string() })?; + } Ok(Member::Field(text.to_string())) } } @@ -185,3 +182,63 @@ impl fmt::Display for Criterion { } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_asc_desc() { + use big_s::S; + use AscDesc::*; + use Member::*; + + let valid_req = [ + ("truc:asc", Asc(Field(S("truc")))), + ("bidule:desc", Desc(Field(S("bidule")))), + ("a-b:desc", Desc(Field(S("a-b")))), + ("a:b:desc", Desc(Field(S("a:b")))), + ("a12:asc", Asc(Field(S("a12")))), + ("42:asc", Asc(Field(S("42")))), + ("_geoPoint(42, 59):asc", Asc(Geo([42., 59.]))), + ("_geoPoint(42.459, 59):desc", Desc(Geo([42.459, 59.]))), + ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))), + ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))), + ("_geoPoint(42.0002, 59.895):desc", Desc(Geo([42.0002, 59.895]))), + ("_geoPoint(42., 59.):desc", Desc(Geo([42., 59.]))), + ("truc(12, 13):desc", Desc(Field(S("truc(12, 13)")))), + ]; + + for (req, expected) in valid_req { + let res = req.parse(); + assert!(res.is_ok(), "Failed to parse `{}`, was expecting `{:?}`", req, expected); + assert_eq!(expected, res.unwrap()); + } + + let invalid_req = [ + "truc:machin", + "truc:deesc", + "truc:asc:deesc", + "42desc", + "_geoPoint:asc", + "_geoDistance:asc", + "_geoPoint(42.12 , 59.598)", + "_geoPoint(42.12 , 59.598):deesc", + "_geoPoint(42.12 , 59.598):machin", + "_geoPoint(42.12 , 59.598):asc:aasc", + "_geoPoint(42,12 , 59,598):desc", + "_geoPoint(35, 85, 75):asc", + "_geoPoint(18):asc", + ]; + + for req in invalid_req { + let res = req.parse::<AscDesc>(); + assert!( + res.is_err(), + "Should not be able to parse `{}`, was expecting an error but instead got: `{:?}`", + req, + res, + ); + } + } +}
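The parsing logic that patch 1010 moves into `Member::from_str` is easy to get wrong, which is why the test table above is so exhaustive. Here is a simplified, self-contained sketch of the same approach, with hypothetical helper names (`parse_member`, `parse_asc_desc`) and plain `String` errors instead of milli's `UserError`: a `_geoPoint(lat, lng)` expression is tried first, and only a bare member that fails that attempt is checked against the reserved keywords.

```rust
#[derive(Debug, PartialEq)]
enum Member {
    Field(String),
    Geo([f64; 2]),
}

#[derive(Debug, PartialEq)]
enum AscDesc {
    Asc(Member),
    Desc(Member),
}

const RESERVED: &[&str] = &["_geo", "_geoDistance", "_geoPoint", "_geoRadius"];

fn parse_member(text: &str) -> Result<Member, String> {
    match text.strip_prefix("_geoPoint(").and_then(|t| t.strip_suffix(')')) {
        Some(point) => {
            // Both coordinates must parse as floats; surrounding whitespace is allowed.
            let (lat, lng) =
                point.split_once(',').ok_or_else(|| format!("invalid rule: {}", text))?;
            let lat = lat.trim().parse().map_err(|_| format!("invalid rule: {}", text))?;
            let lng = lng.trim().parse().map_err(|_| format!("invalid rule: {}", text))?;
            Ok(Member::Geo([lat, lng]))
        }
        // The reserved-keyword check only applies to bare field names, which
        // is exactly what moving it into `Member::from_str` achieves.
        None if RESERVED.contains(&text) => Err(format!("reserved keyword: {}", text)),
        None => Ok(Member::Field(text.to_string())),
    }
}

fn parse_asc_desc(text: &str) -> Result<AscDesc, String> {
    // The order is whatever follows the *last* colon, so `a:b:desc`
    // sorts on a field named `a:b`.
    match text.rsplit_once(':') {
        Some((member, "asc")) => Ok(AscDesc::Asc(parse_member(member)?)),
        Some((member, "desc")) => Ok(AscDesc::Desc(parse_member(member)?)),
        _ => Err(format!("invalid rule: {}", text)),
    }
}

fn main() {
    assert_eq!(
        parse_asc_desc("_geoPoint(42, 59):asc"),
        Ok(AscDesc::Asc(Member::Geo([42.0, 59.0])))
    );
    assert_eq!(parse_asc_desc("a:b:desc"), Ok(AscDesc::Desc(Member::Field("a:b".into()))));
    assert!(parse_asc_desc("_geoDistance:asc").is_err());
    assert!(parse_asc_desc("_geoPoint(42,12 , 59,598):desc").is_err());
}
```

This is also the parsing the benchmark harness leans on: `run_benches` in `utils.rs` above simply calls `sort.parse().unwrap()` on strings such as `"_geoPoint(50.6, 3.0):asc"`.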
diff --git a/milli/src/error.rs b/milli/src/error.rs index 21b77b5a7..e6bd3fd62 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -13,7 +13,7 @@ use crate::{DocumentId, FieldId}; pub type Object = Map<String, Value>; pub fn is_reserved_keyword(keyword: &str) -> bool { - ["_geo", "_geoDistance"].contains(&keyword) + ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) } #[derive(Debug)] From 3f1453f4708564b94559d9f296b78734a8fac860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 20 Sep 2021 18:12:23 +0200 Subject: [PATCH 1011/1889] Update version for the next release (v0.14.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index ebee3c271..98e20da78 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.13.1" +version = "0.14.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 9f5cc2287..44183eaa2 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.13.1" +version = "0.14.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 421729de2..ad5460778 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.13.1" +version = "0.14.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 98eac15a9..15223530f 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.13.1" +version = "0.14.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index da976a0cb..ebb2607c0 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.13.1" +version = "0.14.0" authors = ["Clément Renault "] edition = "2018" From 0d104a0fceab3555c5e2330ab4992d97f74d225d Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 20 Sep 2021 18:08:22 +0200 Subject: [PATCH 1012/1889] Update milli/src/criterion.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/criterion.rs | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index eb9a6c86a..24879cdd4 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -76,23 +76,29 @@ impl FromStr for Member { type Err = UserError; fn from_str(text: &str) -> Result<Self, Self::Err> { - if let Some(point) = text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) - { - let (lat, long) = point - .split_once(',') - .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() }) - .and_then(|(lat, long)| { - lat.trim() - .parse() - .and_then(|lat| long.trim().parse().map(|long| (lat, long))) - .map_err(|_| UserError::InvalidRankingRuleName { name: text.to_string() }) - })?; - Ok(Member::Geo([lat, long])) - } else { - if is_reserved_keyword(text) { - return Err(UserError::InvalidReservedRankingRuleName { name: text.to_string() })?; - } - Ok(Member::Field(text.to_string())) + match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) { + Some(point) => { + let (lat, 
long) = point + .split_once(',') + .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() }) + .and_then(|(lat, long)| { + lat.trim() + .parse() + .and_then(|lat| long.trim().parse().map(|long| (lat, long))) + .map_err(|_| UserError::InvalidRankingRuleName { + name: text.to_string(), + }) + })?; + Ok(Member::Geo([lat, long])) + } + None => { + if is_reserved_keyword(text) { + return Err(UserError::InvalidReservedRankingRuleName { + name: text.to_string(), + })?; + } + Ok(Member::Field(text.to_string())) } - Ok(Member::Field(text.to_string())) } } } From aa6c5df0bc7a34ea3d40179385b368a0362fe785 Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 31 Aug 2021 11:44:15 +0200 Subject: [PATCH 1013/1889] Implement documents format document reader transform remove update format support document sequences fix document transform clean transform improve error handling add documents! macro fix transform bug fix tests remove csv dependency Add comments on the transform process replace search cli fmt review edits fix http ui fix clippy warnings Revert "fix clippy warnings" This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620. fix review comments remove smallvec in transform loop review edits --- Cargo.lock | 3410 +++++++++++++++++ Cargo.toml | 2 +- cli/Cargo.toml | 24 + cli/src/main.rs | 335 ++ http-ui/Cargo.toml | 2 + http-ui/src/main.rs | 94 +- milli/Cargo.toml | 3 +- milli/src/documents/builder.rs | 80 + milli/src/documents/mod.rs | 233 ++ milli/src/documents/reader.rs | 75 + milli/src/documents/serde.rs | 465 +++ milli/src/error.rs | 2 - milli/src/index.rs | 24 +- milli/src/lib.rs | 3 + milli/src/search/distinct/mod.rs | 25 +- milli/src/update/clear_documents.rs | 10 +- milli/src/update/delete_documents.rs | 31 +- milli/src/update/index_documents/mod.rs | 337 +- milli/src/update/index_documents/transform.rs | 377 +- milli/src/update/mod.rs | 4 +- milli/src/update/settings.rs | 113 +- milli/src/update/update_step.rs | 9 +- milli/tests/search/mod.rs | 19 +- milli/tests/search/query_criteria.rs | 52 +- search/src/main.rs | 98 - 25 files changed, 5114 insertions(+), 713 deletions(-) create mode 100644 Cargo.lock create mode 100644 cli/Cargo.toml create mode 100644 cli/src/main.rs create mode 100644 milli/src/documents/builder.rs create mode 100644 milli/src/documents/mod.rs create mode 100644 milli/src/documents/reader.rs create mode 100644 milli/src/documents/serde.rs delete mode 100644 search/src/main.rs diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 000000000..601f711ff --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,3410 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "addr2line" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e61f2b7f93d2c7d2b08263acaa4a363b3e276806c68af6134c44f523bf1aacd" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" + +[[package]] +name = "aho-corasick" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" +dependencies = [ + "memchr", +] + +[[package]] +name = "ansi_term" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "anyhow" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1" + +[[package]] +name = "arrayvec" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" + +[[package]] +name = "as-slice" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45403b49e3954a4b8428a0ac21a4b7afadccf92bfd96273f1a58cd4812496ae0" +dependencies = [ + "generic-array 0.12.4", + "generic-array 0.13.3", + "generic-array 0.14.4", + "stable_deref_trait", +] + +[[package]] +name = "askama" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d298738b6e47e1034e560e5afe63aa488fea34e25ec11b855a76f0d7b8e73134" +dependencies = [ + "askama_derive", + "askama_escape", + "askama_shared", + "mime", + "mime_guess", +] + +[[package]] +name = "askama_derive" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522" +dependencies = [ + "askama_shared", + "proc-macro2 1.0.29", + "syn 1.0.76", +] + +[[package]] +name = "askama_escape" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90c108c1a94380c89d2215d0ac54ce09796823cca0fd91b299cfff3b33e346fb" + +[[package]] +name = "askama_shared" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2582b77e0f3c506ec4838a25fa8a5f97b9bed72bb6d3d272ea1c031d8bd373bc" +dependencies = [ + "askama_escape", + "humansize", + "nom", + "num-traits", + "percent-encoding", + "proc-macro2 1.0.29", + "quote 1.0.9", + "serde", + "syn 1.0.76", + "toml", +] + +[[package]] +name = "askama_warp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c96f410ab17fa08f70b5fda07ce1112418642c914864961630808979343ea226" +dependencies = [ + "askama", + "warp", +] + +[[package]] +name = "atty" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "autocfg" +version = 
"1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "backtrace" +version = "0.3.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7a905d892734eea339e896738c14b9afce22b5318f64b951e70bf3844419b01" +dependencies = [ + "addr2line", + "cc", + "cfg-if 1.0.0", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" + +[[package]] +name = "base64" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" + +[[package]] +name = "benchmarks" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes 1.1.0", + "convert_case", + "criterion", + "flate2", + "heed", + "jemallocator", + "milli", + "reqwest", +] + +[[package]] +name = "big_s" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" + +[[package]] +name = "bimap" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50ae17cabbc8a38a1e3e4c1a6a664e9a09672dc14d0896fa8d865d3a5a446b07" +dependencies = [ + "serde", +] + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitvec" +version = "0.19.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "block-buffer" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" +dependencies = [ + "block-padding", + "byte-tools", + "byteorder", + "generic-array 0.12.4", +] + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "generic-array 0.14.4", +] + +[[package]] +name = "block-padding" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" +dependencies = [ + "byte-tools", +] + +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "buf_redux" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" +dependencies = [ + "memchr", + "safemem", +] + +[[package]] +name = "bumpalo" +version = "3.7.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9df67f7bf9ef8498769f994239c45613ef0c5899415fb58e9add412d2c1a538" + +[[package]] +name = "byte-tools" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" + +[[package]] +name = "byte-unit" +version = "4.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "063197e6eb4b775b64160dedde7a0986bb2836cce140e9492e9e96f28e18bcd8" +dependencies = [ + "serde", + "utf8-width", +] + +[[package]] +name = "bytemuck" +version = "1.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e215f8c2f9f79cb53c8335e687ffd07d5bfcb6fe5fc80723762d0be46e7cc54" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "bytes" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38" + +[[package]] +name = "bytes" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" + +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "cc" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0" + +[[package]] +name = "cedarwood" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d" +dependencies = [ + "smallvec", +] + +[[package]] +name = "cfg-if" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "character_converter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" +dependencies = [ + "bincode", +] + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "serde", + "time", + "winapi 0.3.9", +] + +[[package]] +name = "clap" +version = "2.33.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + 
"strsim", + "term_size", + "textwrap", + "unicode-width", + "vec_map", +] + +[[package]] +name = "cli" +version = "0.1.0" +dependencies = [ + "bimap", + "byte-unit", + "color-eyre", + "csv", + "eyre", + "heed", + "indicatif", + "jemallocator", + "milli", + "serde", + "serde_json", + "stderrlog", + "structopt", +] + +[[package]] +name = "color-eyre" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f1885697ee8a177096d42f158922251a41973117f6d8a234cee94b9509157b7" +dependencies = [ + "backtrace", + "color-spantrace", + "eyre", + "indenter", + "once_cell", + "owo-colors", + "tracing-error", +] + +[[package]] +name = "color-spantrace" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6eee477a4a8a72f4addd4de416eb56d54bc307b284d6601bafdee1f4ea462d1" +dependencies = [ + "once_cell", + "owo-colors", + "tracing-core", + "tracing-error", +] + +[[package]] +name = "concat-arrays" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "console" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3993e6445baa160675931ec041a5e03ca84b9c6e32a056150d3aa2bdda0a1f45" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "terminal_size", + "winapi 0.3.9", +] + +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + +[[package]] +name = "cow-utils" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" + +[[package]] +name = "cpufeatures" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.5", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-epoch", + "crossbeam-utils 0.8.5", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" +dependencies = [ + "cfg-if 1.0.0", + "crossbeam-utils 0.8.5", + "lazy_static", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" +dependencies = [ + "crossbeam-utils 0.6.6", +] + +[[package]] +name = "crossbeam-utils" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6" +dependencies = [ + "cfg-if 0.1.10", + "lazy_static", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" +dependencies = [ + "cfg-if 1.0.0", + "lazy_static", +] + +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + +[[package]] +name = "deunicode" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2c9736e15e7df1638a7f6eee92a6511615c738246a052af5ba86f039b65aede" + +[[package]] +name = "digest" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" +dependencies = [ + "generic-array 0.12.4", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "generic-array 0.14.4", +] + +[[package]] +name = "dtoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0" + +[[package]] +name = "either" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + +[[package]] +name = "encoding_rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "eyre" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "221239d1d5ea86bf5d6f91c9d6bc3646ffe471b08ff9b0f91c44f115ac969d2b" +dependencies = [ + "indenter", + "once_cell", +] + +[[package]] +name = "fake-simd" 
+version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" + +[[package]] +name = "flate2" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6988e897c1c9c485f43b47a529cef42fde0547f9d8d41a7062518f1d8fc53f" +dependencies = [ + "cfg-if 1.0.0", + "crc32fast", + "libc", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" +dependencies = [ + "matches", + "percent-encoding", +] + +[[package]] +name = "fs_extra" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" + +[[package]] +name = "fst" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" + +[[package]] +name = "fuchsia-zircon" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" +dependencies = [ + "bitflags", + "fuchsia-zircon-sys", +] + +[[package]] +name = "fuchsia-zircon-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" + +[[package]] +name = "funty" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" + +[[package]] +name = "futures" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a12aa0eb539080d55c3f2d45a67c3b58b6b0773c1a3ca2dfec66d58c97fd66ca" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5da6ba8c3bb3c165d3c7319fc1cc8304facf1fb8db99c5de877183c08a273888" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88d1c26957f23603395cd326b0ffe64124b818f4449552f960d815cfba83a53d" + +[[package]] +name = "futures-executor" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45025be030969d763025784f7f355043dc6bc74093e4ecc5000ca4dc50d8745c" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "522de2a0fe3e380f1bc577ba0474108faf3f6b18321dbf60b3b9c39a75073377" + +[[package]] +name = "futures-macro" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18e4a4b95cea4b4ccbcf1c5675ca7c4ee4e9e75eb79944d07defde18068f79bb" +dependencies = [ + "autocfg", + "proc-macro-hack", + "proc-macro2 1.0.29", + "quote 1.0.9", + 
"syn 1.0.76", +] + +[[package]] +name = "futures-sink" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36ea153c13024fe480590b3e3d4cad89a0cfacecc24577b68f86c6ced9c2bc11" + +[[package]] +name = "futures-task" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d3d00f4eddb73e498a54394f228cd55853bdf059259e8e7bc6e69d408892e99" + +[[package]] +name = "futures-util" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36568465210a3a6ee45e1f165136d68671471a501e632e9a98d96872222b5481" +dependencies = [ + "autocfg", + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite 0.2.7", + "pin-utils", + "proc-macro-hack", + "proc-macro-nested", + "slab", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "generic-array" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd" +dependencies = [ + "typenum", +] + +[[package]] +name = "generic-array" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f797e67af32588215eaaab8327027ee8e71b9dd0b2b26996aedf20c030fce309" +dependencies = [ + "typenum", +] + +[[package]] +name = "generic-array" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "geoutils" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e006f616a407d396ace1d2ebb3f43ed73189db8b098079bd129928d7645dd1e" + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", +] + +[[package]] +name = "gimli" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a01e0497841a3b2db4f8afa483cce65f7e96a3498bd6c541734792aeac8fe7" + +[[package]] +name = "grenad" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a7a9cc43b28a20f791b17863f34a36654fdfa50be6d0a67bb18c1e34d145f18" +dependencies = [ + "bytemuck", + "byteorder", + "tempfile", +] + +[[package]] +name = "h2" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e4728fd124914ad25e99e3d15a9361a879f6620f63cb56bbb08f95abb97a535" +dependencies = [ + "bytes 0.5.6", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio 0.2.25", + "tokio-util 0.3.1", + "tracing", + "tracing-futures", +] + +[[package]] +name = "h2" +version = "0.3.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7f3675cfef6a30c8031cf9e6493ebdc3bb3272a3fea3923c4210d1830e6a472" +dependencies = [ + "bytes 1.1.0", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio 1.11.0", + "tokio-util 0.6.8", + "tracing", +] + +[[package]] +name = "half" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" + +[[package]] +name = "hash32" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4041af86e63ac4298ce40e5cca669066e75b6f1aa3390fe2561ffa5e1d9f4cc" +dependencies = [ + "byteorder", +] + +[[package]] +name = "hashbrown" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" +dependencies = [ + "ahash", + "autocfg", +] + +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" + +[[package]] +name = "headers" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0b7591fb62902706ae8e7aaff416b1b0fa2c0fd0878b46dc13baa3712d8a855" +dependencies = [ + "base64 0.13.0", + "bitflags", + "bytes 1.1.0", + "headers-core", + "http", + "mime", + "sha-1 0.9.8", + "time", +] + +[[package]] +name = "headers-core" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" +dependencies = [ + "http", +] + +[[package]] +name = "heapless" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634bd4d29cbf24424d0a4bfcbf80c6960129dc24424752a7d1d1390607023422" +dependencies = [ + "as-slice", + "generic-array 0.14.4", + "hash32", + "stable_deref_trait", +] + +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "heed" +version = "0.12.1" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#fc017cf3394af737f92fd71e16f0499a78b79d65" +dependencies = [ + "byteorder", + "heed-traits", + "heed-types", + "libc", + "lmdb-rkv-sys", + "once_cell", + "page_size", + "serde", + "synchronoise", + "url", + "zerocopy", +] + +[[package]] +name = "heed-traits" +version = "0.7.0" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#fc017cf3394af737f92fd71e16f0499a78b79d65" + +[[package]] +name = "heed-types" +version = "0.7.2" +source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#fc017cf3394af737f92fd71e16f0499a78b79d65" +dependencies = [ + "bincode", + "heed-traits", + "serde", + "serde_json", + "zerocopy", +] + +[[package]] +name = "helpers" +version = "0.14.0" +dependencies = [ + "anyhow", + "byte-unit", + "heed", + "jemallocator", + "milli", + "stderrlog", + "structopt", +] + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "http" +version = "0.2.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" +dependencies = [ + "bytes 1.1.0", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13d5ff830006f7646652e057693569bfe0d51760c0085a071769d142a205111b" +dependencies = [ + "bytes 0.5.6", + "http", +] + +[[package]] +name = "http-body" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "399c583b2979440c60be0821a6199eca73bc3c8dcd9d070d75ac726e2c6186e5" +dependencies = [ + "bytes 1.1.0", + "http", + "pin-project-lite 0.2.7", +] + +[[package]] +name = "http-ui" +version = "0.14.0" +dependencies = [ + "anyhow", + "askama", + "askama_warp", + "bimap", + "byte-unit", + "bytes 0.5.6", + "crossbeam-channel", + "csv", + "either", + "flate2", + "fst", + "funty", + "futures", + "heed", + "jemallocator", + "log", + "maplit", + "meilisearch-tokenizer", + "memmap", + "milli", + "once_cell", + "rayon", + "serde", + "serde_json", + "serde_test", + "stderrlog", + "structopt", + "tempfile", + "tokio 0.2.25", + "warp", +] + +[[package]] +name = "httparse" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" + +[[package]] +name = "httpdate" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" + +[[package]] +name = "httpdate" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" + +[[package]] +name = "human_format" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0" + +[[package]] +name = "humansize" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02296996cb8796d7c6e3bc2d9211b7802812d36999a51bb754123ead7d37d026" + +[[package]] +name = "hyper" +version = "0.13.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a6f157065790a3ed2f88679250419b5cdd96e714a0d65f7797fd337186e96bb" +dependencies = [ + "bytes 0.5.6", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.2.7", + "http", + "http-body 0.3.1", + "httparse", + "httpdate 0.3.2", + "itoa", + "pin-project 1.0.8", + "socket2 0.3.19", + "tokio 0.2.25", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper" +version = "0.14.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15d1cfb9e4f68655fa04c01f59edb405b6074a0f7118ea881e5026e4a1cd8593" +dependencies = [ + "bytes 1.1.0", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.4", + "http", + "http-body 0.4.3", + "httparse", + "httpdate 1.0.1", + "itoa", + "pin-project-lite 0.2.7", + "socket2 0.4.2", + "tokio 1.11.0", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64" +dependencies = [ + "futures-util", + "hyper 0.14.13", + "log", + "rustls", + "tokio 1.11.0", + "tokio-rustls", + "webpki", +] + +[[package]] +name = "idna" +version = "0.2.3" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" +dependencies = [ + "matches", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indenter" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" + +[[package]] +name = "indexmap" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" +dependencies = [ + "autocfg", + "hashbrown 0.11.2", +] + +[[package]] +name = "indicatif" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" +dependencies = [ + "console", + "lazy_static", + "number_prefix", + "regex", +] + +[[package]] +name = "infos" +version = "0.14.0" +dependencies = [ + "anyhow", + "byte-unit", + "csv", + "heed", + "jemallocator", + "milli", + "roaring", + "serde_json", + "stderrlog", + "structopt", +] + +[[package]] +name = "input_buffer" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19a8a95243d5a0398cae618ec29477c6e3cb631152be5c19481f80bc71559754" +dependencies = [ + "bytes 0.5.6", +] + +[[package]] +name = "iovec" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" +dependencies = [ + "libc", +] + +[[package]] +name = "ipnet" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" + +[[package]] +name = "itertools" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "jemalloc-sys" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" +dependencies = [ + "cc", + "fs_extra", + "libc", +] + +[[package]] +name = "jemallocator" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" +dependencies = [ + "jemalloc-sys", + "libc", +] + +[[package]] +name = "jieba-rs" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94287627d13ab7b943787ab20b54b37f8af11179ce85de4734071c88f9eab354" +dependencies = [ + "cedarwood", + "fxhash", + "hashbrown 0.11.2", + "lazy_static", + "phf", + "phf_codegen", + "regex", +] + +[[package]] +name = "js-sys" +version = "0.3.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cc9ffccd38c451a86bf13657df244e9c3f37493cce8e5e21e940963777acc84" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "kernel32-sys" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" +dependencies = [ + "winapi 0.2.8", + 
"winapi-build", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "levenshtein_automata" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" +dependencies = [ + "fst", +] + +[[package]] +name = "lexical-core" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" +dependencies = [ + "arrayvec", + "bitflags", + "cfg-if 1.0.0", + "ryu", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2a5ac8f984bfcf3a823267e5fde638acc3325f6496633a5da6bb6eb2171e103" + +[[package]] +name = "linked-hash-map" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" + +[[package]] +name = "lmdb-rkv-sys" +version = "0.15.0" +source = "git+https://github.com/meilisearch/lmdb-rs#d0b50d02938ee84e4e4372697ea991fe2a4cae3b" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "log" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" +dependencies = [ + "cfg-if 1.0.0", +] + +[[package]] +name = "logging_timer" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40d0c249955c17c2f8f86b5f501b16d2509ebbe775f7b1d1d2b1ba85ade2a793" +dependencies = [ + "log", + "logging_timer_proc_macros", +] + +[[package]] +name = "logging_timer_proc_macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "482c2c28e6bcfe7c4274f82f701774d755e6aa873edfd619460fcd0966e0eb07" +dependencies = [ + "log", + "proc-macro2 0.4.30", + "quote 0.6.13", + "syn 0.15.44", +] + +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + +[[package]] +name = "matches" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" + +[[package]] +name = "meilisearch-tokenizer" +version = "0.2.5" +source = "git+https://github.com/meilisearch/tokenizer.git?tag=v0.2.5#c0b5cf741ed9485147f2cbe523f2214d4fa4c395" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + +[[package]] +name = "memchr" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" + +[[package]] +name = "memmap" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "memoffset" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" +dependencies = [ + "autocfg", +] + +[[package]] +name = "milli" +version = "0.14.0" +dependencies = [ + "big_s", + "bimap", + "bincode", + "bstr", + "byteorder", + "chrono", + "concat-arrays", + "crossbeam-channel", + "either", + "flate2", + "fst", + "fxhash", + "geoutils", + "grenad", + "heed", + "human_format", + "itertools", + "levenshtein_automata", + "linked-hash-map", + "log", + "logging_timer", + "maplit", + "meilisearch-tokenizer", + "memmap", + "obkv", + "once_cell", + "ordered-float", + "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", + "pest_derive", + "rand 0.8.4", + "rayon", + "roaring", + "rstar", + "serde", + "serde_json", + "slice-group-by", + "smallstr", + "smallvec", + "tempfile", + "uuid", +] + +[[package]] +name = "mime" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" + +[[package]] +name = "mime_guess" +version = "2.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "miniz_oxide" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" +dependencies = [ + "adler", + "autocfg", +] + +[[package]] +name = "mio" +version = "0.6.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" +dependencies = [ + "cfg-if 0.1.10", + "fuchsia-zircon", + "fuchsia-zircon-sys", + "iovec", + "kernel32-sys", + "libc", + "log", + "miow 0.2.2", + "net2", + "slab", + "winapi 0.2.8", +] + +[[package]] +name = "mio" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" +dependencies = [ + "libc", + "log", + "miow 0.3.7", + "ntapi", + "winapi 0.3.9", +] + +[[package]] +name = "mio-named-pipes" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" +dependencies = [ + "log", + "mio 0.6.23", + "miow 0.3.7", + "winapi 0.3.9", +] + +[[package]] +name = "mio-uds" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" +dependencies = [ + "iovec", + "libc", + "mio 0.6.23", +] + +[[package]] +name = "miow" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" +dependencies = [ + "kernel32-sys", + "net2", + "winapi 0.2.8", + "ws2_32-sys", +] + +[[package]] +name = "miow" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "multipart" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050aeedc89243f5347c3e237e3e13dc76fbe4ae3742a57b94dc14f69acf76d4" +dependencies = [ + "buf_redux", + "httparse", + "log", + "mime", + "mime_guess", + "quick-error", + "rand 0.7.3", 
+ "safemem", + "tempfile", + "twoway", +] + +[[package]] +name = "net2" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" +dependencies = [ + "cfg-if 0.1.10", + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "nom" +version = "6.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7413f999671bd4745a7b624bd370a569fb6bc574b23c83a3c5ed2e453f3d5e2" +dependencies = [ + "bitvec", + "funty", + "lexical-core", + "memchr", + "version_check", +] + +[[package]] +name = "ntapi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "object" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39f37e50073ccad23b6d09bcb5b263f4e76d3bb6038e4a3c08e52162ffa8abc2" +dependencies = [ + "memchr", +] + +[[package]] +name = "obkv" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385" + +[[package]] +name = "once_cell" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" + +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + +[[package]] +name = "opaque-debug" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" + +[[package]] +name = "opaque-debug" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" + +[[package]] +name = "ordered-float" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97c9d06878b3a851e8026ef94bf7fef9ba93062cd412601da4d9cf369b1cc62d" +dependencies = [ + "num-traits", +] + +[[package]] +name = "owo-colors" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2386b4ebe91c2f7f51082d4cefa145d030e33a1842a96b12e4885cc3c01f7a55" + +[[package]] +name = "page_size" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" +dependencies = [ + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "pdqselect" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec91767ecc0a0bbe558ce8c9da33c068066c57ecc8bb8477ef8c1ad3ef77c27" + +[[package]] +name = "percent-encoding" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" + +[[package]] +name = "pest" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "pest" +version = "2.1.3" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +dependencies = [ + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" +dependencies = [ + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" +dependencies = [ + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "pest_meta", + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "pest_meta" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" +dependencies = [ + "maplit", + "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "sha-1 0.8.2", +] + +[[package]] +name = "phf" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ac8b67553a7ca9457ce0e526948cad581819238f4a9d1ea74545851fa24f37" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "963adb11cf22ee65dfd401cf75577c1aa0eca58c0b97f9337d2da61d3e640503" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d43f3220d96e0080cc9ea234978ccd80d904eafb17be31bb0f76daaea6493082" +dependencies = [ + "phf_shared", + "rand 0.8.4", +] + +[[package]] +name = "phf_shared" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68318426de33640f02be62b4ae8eb1261be2efbc337b60c54d845bf4484e0d9" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "918192b5c59119d51e0cd221f4d49dde9112824ba717369e903c97d076083d0f" +dependencies = [ + "pin-project-internal 0.4.28", +] + +[[package]] +name = "pin-project" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "576bc800220cc65dac09e99e97b08b358cfab6e17078de8dc5fee223bd2d0c08" +dependencies = [ + "pin-project-internal 1.0.8", +] + +[[package]] +name = "pin-project-internal" +version = "0.4.28" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be26700300be6d9d23264c73211d8190e755b6b5ca7a1b28230025511b52a5e" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "pin-project-internal" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e8fe8163d14ce7f0cdac2e040116f22eac817edabff0be91e8aff7e9accf389" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "pin-project-lite" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "257b64915a082f7811703966789728173279bdebb956b143dbcd23f6f970a777" + +[[package]] +name = "pin-project-lite" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" + +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "version_check", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" + +[[package]] +name = "proc-macro-nested" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" + +[[package]] +name = "proc-macro2" +version = "0.4.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" +dependencies = [ + "unicode-xid 0.1.0", +] 
+ +[[package]] +name = "proc-macro2" +version = "1.0.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" +dependencies = [ + "unicode-xid 0.2.2", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" +dependencies = [ + "proc-macro2 0.4.30", +] + +[[package]] +name = "quote" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +dependencies = [ + "proc-macro2 1.0.29", +] + +[[package]] +name = "radium" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc 0.2.0", +] + +[[package]] +name = "rand" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.3", + "rand_hc 0.3.1", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.3", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom 0.2.3", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rand_hc" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" +dependencies = [ + "rand_core 0.6.3", +] + +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils 0.8.5", + "lazy_static", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "reqwest" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22" +dependencies = [ + "base64 0.13.0", + "bytes 1.1.0", + "encoding_rs", + "futures-core", + "futures-util", + "http", + "http-body 0.4.3", + "hyper 0.14.13", + "hyper-rustls", + "ipnet", + "js-sys", + "lazy_static", + "log", + "mime", + "percent-encoding", + "pin-project-lite 0.2.7", + "rustls", + "serde", + "serde_urlencoded 0.7.0", + "tokio 1.11.0", + "tokio-rustls", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", + "winreg", +] + +[[package]] +name = "retain_mut" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9c17925a9027d298a4603d286befe3f9dc0e8ed02523141914eb628798d6e5b" + +[[package]] +name = "ring" +version = "0.16.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" +dependencies = [ + "cc", + "libc", + "once_cell", + "spin", + "untrusted", + "web-sys", + "winapi 0.3.9", +] + +[[package]] +name = "roaring" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "536cfa885fc388b8ae69edf96d7970849b7d9c1395da1b8330f17715babf8a09" +dependencies = [ + "bytemuck", + "byteorder", + "retain_mut", +] + +[[package]] +name = "rstar" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d535e658ada8c1987a113e5261f8b907f721b2854d666e72820671481b7ee125" +dependencies = [ + "heapless", + "num-traits", + "pdqselect", + "serde", + "smallvec", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" 
+dependencies = [ + "semver", +] + +[[package]] +name = "rustls" +version = "0.19.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" +dependencies = [ + "base64 0.13.0", + "log", + "ring", + "sct", + "webpki", +] + +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" + +[[package]] +name = "safemem" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scoped-tls" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "sct" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "semver" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" + +[[package]] +name = "serde" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "serde_json" +version = "1.0.68" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f690853975602e1bfe1ccbf50504d67174e3bcf340f23b5ea9992e0587a52d8" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_test" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82178225dbdeae2d5d190e8649287db6a3a32c6d24da22ae3146325aa353e4c" +dependencies = [ + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ec5d77e2d4c73717816afac02670d5c4f534ea95ed430442cad02e7a6e32c97" +dependencies = [ + "dtoa", + "itoa", + "serde", + "url", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" +dependencies = [ + "form_urlencoded", + "itoa", 
+ "ryu", + "serde", +] + +[[package]] +name = "sha-1" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" +dependencies = [ + "block-buffer 0.7.3", + "digest 0.8.1", + "fake-simd", + "opaque-debug 0.2.3", +] + +[[package]] +name = "sha-1" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99cd6713db3cf16b6c84e06321e049a9b9f699826e16096d23bbcc44d15d51a6" +dependencies = [ + "block-buffer 0.9.0", + "cfg-if 1.0.0", + "cpufeatures", + "digest 0.9.0", + "opaque-debug 0.3.0", +] + +[[package]] +name = "sharded-slab" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" +dependencies = [ + "libc", +] + +[[package]] +name = "siphasher" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "533494a8f9b724d33625ab53c6c4800f7cc445895924a8ef649222dcb76e938b" + +[[package]] +name = "slab" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c307a32c1c5c437f38c7fd45d753050587732ba8628319fbdf12a7e289ccc590" + +[[package]] +name = "slice-group-by" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb" + +[[package]] +name = "smallstr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e922794d168678729ffc7e07182721a14219c65814e66e91b839a272fe5ae4f" +dependencies = [ + "serde", + "smallvec", +] + +[[package]] +name = "smallvec" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" + +[[package]] +name = "socket2" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "socket2" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dc90fe6c7be1a323296982db1836d1ea9e47b6839496dde9a541bc496df3516" +dependencies = [ + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "stderrlog" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a53e2eff3e94a019afa6265e8ee04cb05b9d33fe9f5078b14e4e391d155a38" +dependencies = [ + "atty", + "chrono", + "log", + "termcolor", + 
"thread_local", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + +[[package]] +name = "structopt" +version = "0.3.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa" +dependencies = [ + "clap", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "syn" +version = "0.15.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ca4b3b69a77cbe1ffc9e198781b7acb0c7365a883670e8f1c1bc66fba79a5c5" +dependencies = [ + "proc-macro2 0.4.30", + "quote 0.6.13", + "unicode-xid 0.1.0", +] + +[[package]] +name = "syn" +version = "1.0.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f107db402c2c2055242dbf4d2af0e69197202e9faacbef9571bbe47f5a1b84" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "unicode-xid 0.2.2", +] + +[[package]] +name = "synchronoise" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d717ed0efc9d39ab3b642a096bc369a3e02a38a51c41845d7fe31bdad1d6eaeb" +dependencies = [ + "crossbeam-queue", +] + +[[package]] +name = "synstructure" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "474aaa926faa1603c40b7885a9eaea29b444d1cb2850cb7c0e37bb1a4182f4fa" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", + "unicode-xid 0.2.2", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + +[[package]] +name = "tempfile" +version = "3.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" +dependencies = [ + "cfg-if 1.0.0", + "libc", + "rand 0.8.4", + "redox_syscall", + "remove_dir_all", + "winapi 0.3.9", +] + +[[package]] +name = "term_size" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" +dependencies = [ + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "termcolor" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "terminal_size" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df" +dependencies = [ + "libc", + "winapi 0.3.9", +] + +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "term_size", + "unicode-width", +] + +[[package]] +name = "thread_local" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "time" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" +dependencies = [ + "libc", + "wasi 0.10.0+wasi-snapshot-preview1", + "winapi 0.3.9", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "tinyvec" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5241dd6f21443a3606b432718b166d3cedc962fd4b8bea54a8bc7f514ebda986" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" + +[[package]] +name = "tokio" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6703a273949a90131b290be1fe7b039d0fc884aa1935860dfcbe056f28cd8092" +dependencies = [ + "bytes 0.5.6", + "fnv", + "futures-core", + "iovec", + "lazy_static", + "libc", + "memchr", + "mio 0.6.23", + "mio-named-pipes", + "mio-uds", + "num_cpus", + "pin-project-lite 0.1.12", + "signal-hook-registry", + "slab", + "tokio-macros", + "winapi 0.3.9", +] + +[[package]] +name = "tokio" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4efe6fc2395938c8155973d7be49fe8d03a843726e285e100a8a383cc0154ce" +dependencies = [ + "autocfg", + "bytes 1.1.0", + "libc", + "memchr", + "mio 0.7.13", + "num_cpus", + "pin-project-lite 0.2.7", + "winapi 0.3.9", +] + +[[package]] +name = "tokio-macros" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "tokio-rustls" +version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" +dependencies = [ + "rustls", + "tokio 1.11.0", + "webpki", +] + +[[package]] +name = "tokio-tungstenite" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d9e878ad426ca286e4dcae09cbd4e1973a7f8987d97570e2469703dd7f5720c" +dependencies = [ + "futures-util", + "log", + "pin-project 0.4.28", + "tokio 0.2.25", + "tungstenite", +] + +[[package]] +name = "tokio-util" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be8242891f2b6cbef26a2d7e8605133c2c554cd35b3e4948ea892d6d68436499" +dependencies = [ + "bytes 0.5.6", + "futures-core", + "futures-sink", + "log", + "pin-project-lite 0.1.12", + "tokio 0.2.25", +] + +[[package]] +name = "tokio-util" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d3725d3efa29485e87311c5b699de63cde14b00ed4d256b8318aa30ca452cd" +dependencies = [ + "bytes 1.1.0", + "futures-core", + "futures-sink", + "log", + "pin-project-lite 0.2.7", + "tokio 1.11.0", +] + +[[package]] +name = "toml" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" +dependencies = [ + "serde", +] + +[[package]] +name = "tower-service" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" + +[[package]] +name = "tracing" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84f96e095c0c82419687c20ddf5cb3eadb61f4e1405923c9dc8e53a1adacbda8" +dependencies = [ + "cfg-if 1.0.0", + "log", + "pin-project-lite 0.2.7", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98863d0dd09fa59a1b79c6750ad80dbda6b75f4e71c437a6a1a8cb91a8bcbd77" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", +] + +[[package]] +name = "tracing-core" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46125608c26121c81b0c6d693eab5a420e416da7e43c426d2e8f7df8da8a3acf" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "tracing-error" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4d7c0b83d4a500748fa5879461652b361edf5c9d51ede2a2ac03875ca185e24" +dependencies = [ + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "tracing-futures" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" +dependencies = [ + "pin-project 1.0.8", + "tracing", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd0568dbfe3baf7048b7908d2b32bca0d81cd56bec6d2a8f894b01d74f86be3" +dependencies = [ + "sharded-slab", + "thread_local", + "tracing-core", +] + +[[package]] +name = "try-lock" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" + +[[package]] +name = "tungstenite" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0308d80d86700c5878b9ef6321f020f29b1bb9d5ff3cab25e75e23f3a492a23" +dependencies = [ + "base64 0.12.3", + "byteorder", + "bytes 0.5.6", + "http", + "httparse", + "input_buffer", + "log", + "rand 0.7.3", + "sha-1 0.9.8", + "url", + "utf-8", +] + +[[package]] +name = "twoway" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" +dependencies = [ + "memchr", +] + +[[package]] +name = "typenum" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b63708a265f51345575b27fe43f9500ad611579e764c79edbc2037b1121959ec" + +[[package]] +name = "ucd-trie" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" + +[[package]] +name = "unicase" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"246f4c42e67e7a4e3c6106ff716a5d067d4132a642840b242e357e468a2a0085" + +[[package]] +name = "unicode-normalization" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" + +[[package]] +name = "unicode-width" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" + +[[package]] +name = "unicode-xid" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "untrusted" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" + +[[package]] +name = "url" +version = "2.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" +dependencies = [ + "form_urlencoded", + "idna", + "matches", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a1f0175e03a0973cf4afd476bef05c26e228520400eb1fd473ad417b1c00ffb" + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8-width" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cf7d77f457ef8dfa11e4cd5933c5ddb5dc52a94664071951219a97710f0a32b" + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +dependencies = [ + "getrandom 0.2.3", +] + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi 0.3.9", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" +dependencies = [ + "log", + "try-lock", +] + +[[package]] +name = "warp" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f41be6df54c97904af01aa23e613d4521eed7ab23537cede692d4058f6449407" +dependencies = 
[ + "bytes 0.5.6", + "futures", + "headers", + "http", + "hyper 0.13.10", + "log", + "mime", + "mime_guess", + "multipart", + "pin-project 0.4.28", + "scoped-tls", + "serde", + "serde_json", + "serde_urlencoded 0.6.1", + "tokio 0.2.25", + "tokio-tungstenite", + "tower-service", + "tracing", + "tracing-futures", + "urlencoding", +] + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasm-bindgen" +version = "0.2.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "632f73e236b219150ea279196e54e610f5dbafa5d61786303d4da54f84e47fce" +dependencies = [ + "cfg-if 1.0.0", + "serde", + "serde_json", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a317bf8f9fba2476b4b2c85ef4c4af8ff39c3c7f0cdfeed4f82c34a880aa837b" +dependencies = [ + "bumpalo", + "lazy_static", + "log", + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e8d7523cb1f2a4c96c1317ca690031b714a51cc14e05f712446691f413f5d39" +dependencies = [ + "cfg-if 1.0.0", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56146e7c495528bf6587663bea13a8eb588d39b36b679d83972e1a2dbbdacf9" +dependencies = [ + "quote 1.0.9", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7803e0eea25835f8abdc585cd3021b3deb11543c6fe226dcd30b228857c5c5ab" +dependencies = [ + "proc-macro2 1.0.29", + "quote 1.0.9", + "syn 1.0.76", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.78" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0237232789cf037d5480773fe568aac745bfe2afbc11a863e97901780a6b47cc" + +[[package]] +name = "web-sys" +version = "0.3.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38eb105f1c59d9eaa6b5cdc92b859d85b926e82cb2e0945cd0c9259faa6fe9fb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "webpki-roots" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940" +dependencies = [ + "webpki", +] + +[[package]] +name = "whatlang" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a346d2eb29c03618693ed24a29d1acd0c3f2cb08ae58b9669d7461e033cf703" +dependencies = [ + "hashbrown 0.7.2", +] + +[[package]] +name = "winapi" +version = 
"0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-build" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "winreg" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" +dependencies = [ + "winapi 0.3.9", +] + +[[package]] +name = "ws2_32-sys" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" +dependencies = [ + "winapi 0.2.8", + "winapi-build", +] + +[[package]] +name = "wyz" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" + +[[package]] +name = "zerocopy" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" +dependencies = [ + "proc-macro2 1.0.29", + "syn 1.0.76", + "synstructure", +] diff --git a/Cargo.toml b/Cargo.toml index 822907ca8..b78989f50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "search"] +members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "cli"] default-members = ["milli"] [profile.dev] diff --git a/cli/Cargo.toml b/cli/Cargo.toml new file mode 100644 index 000000000..24fb214b9 --- /dev/null +++ b/cli/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "cli" +version = "0.1.0" +edition = "2018" +description = "A CLI to interact with a milli index" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +indicatif = "0.16.2" +serde = "1.0.129" +serde_json = "1.0.66" +structopt = "0.3.22" +milli = { path = "../milli" } +eyre = "0.6.5" +color-eyre = "0.5.11" +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +byte-unit = { version 
= "4.0.12", features = ["serde"] } +bimap = "0.6.1" +csv = "1.1.6" +stderrlog = "0.5.1" + +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = "0.3.2" diff --git a/cli/src/main.rs b/cli/src/main.rs new file mode 100644 index 000000000..b84ff3243 --- /dev/null +++ b/cli/src/main.rs @@ -0,0 +1,335 @@ +use std::fs::File; +use std::io::{stdin, Cursor, Read}; +use std::path::PathBuf; +use std::str::FromStr; + +use byte_unit::Byte; +use eyre::Result; +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use milli::update::UpdateIndexingStep::{ + ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, +}; +use serde_json::{Map, Value}; +use structopt::StructOpt; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +#[derive(Debug, StructOpt)] +#[structopt(name = "Milli CLI", about = "A simple CLI to manipulate a milli index.")] +struct Cli { + #[structopt(short, long)] + index_path: PathBuf, + #[structopt(short = "s", long, default_value = "100GiB")] + index_size: Byte, + /// Verbose mode (-v, -vv, -vvv, etc.) + #[structopt(short, long, parse(from_occurrences))] + verbose: usize, + #[structopt(subcommand)] + subcommand: Command, +} + +#[derive(Debug, StructOpt)] +enum Command { + DocumentAddition(DocumentAddition), + Search(Search), + SettingsUpdate(SettingsUpdate), +} + +fn setup(opt: &Cli) -> Result<()> { + color_eyre::install()?; + stderrlog::new() + .verbosity(opt.verbose) + .show_level(false) + .timestamp(stderrlog::Timestamp::Off) + .init()?; + Ok(()) +} + +fn main() -> Result<()> { + let command = Cli::from_args(); + + setup(&command)?; + + let mut options = heed::EnvOpenOptions::new(); + options.map_size(command.index_size.get_bytes() as usize); + let index = milli::Index::new(options, command.index_path)?; + + match command.subcommand { + Command::DocumentAddition(addition) => addition.perform(index)?, + Command::Search(search) => search.perform(index)?, + Command::SettingsUpdate(update) => update.perform(index)?, + } + + Ok(()) +} + +#[derive(Debug)] +enum DocumentAdditionFormat { + Csv, + Json, + Jsonl, +} + +impl FromStr for DocumentAdditionFormat { + type Err = eyre::Error; + + fn from_str(s: &str) -> Result { + match s { + "csv" => Ok(Self::Csv), + "jsonl" => Ok(Self::Jsonl), + "json" => Ok(Self::Json), + other => eyre::bail!("invalid format: {}", other), + } + } +} + +#[derive(Debug, StructOpt)] +struct DocumentAddition { + #[structopt(short, long, default_value = "json", possible_values = &["csv", "jsonl", "json"])] + format: DocumentAdditionFormat, + /// Path to the update file, if not present, will read from stdin. + #[structopt(short, long)] + path: Option, + /// Whether to generate missing document ids. + #[structopt(short, long)] + autogen_docids: bool, + /// Whether to update or replace the documents if they already exist. 
+    #[structopt(short, long)]
+    update_documents: bool,
+}
+
+impl DocumentAddition {
+    fn perform(&self, index: milli::Index) -> Result<()> {
+        let reader: Box<dyn Read> = match self.path {
+            Some(ref path) => {
+                let file = File::open(path)?;
+                Box::new(file)
+            }
+            None => Box::new(stdin()),
+        };
+
+        println!("parsing documents...");
+
+        let documents = match self.format {
+            DocumentAdditionFormat::Csv => documents_from_csv(reader)?,
+            DocumentAdditionFormat::Json => documents_from_json(reader)?,
+            DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?,
+        };
+
+        let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?;
+
+        println!("Adding {} documents to the index.", reader.len());
+
+        let mut txn = index.env.write_txn()?;
+        let mut addition = milli::update::IndexDocuments::new(&mut txn, &index, 0);
+
+        if self.update_documents {
+            addition.index_documents_method(milli::update::IndexDocumentsMethod::UpdateDocuments);
+        }
+
+        addition.log_every_n(100);
+
+        if self.autogen_docids {
+            addition.enable_autogenerate_docids();
+        }
+
+        let mut bars = Vec::new();
+        let progresses = MultiProgress::new();
+        for _ in 0..4 {
+            let bar = ProgressBar::hidden();
+            let bar = progresses.add(bar);
+            bars.push(bar);
+        }
+
+        std::thread::spawn(move || {
+            progresses.join().unwrap();
+        });
+
+        let result = addition.execute(reader, |step, _| indexing_callback(step, &bars))?;
+
+        txn.commit()?;
+
+        println!("{:?}", result);
+        Ok(())
+    }
+}
+
+fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBar]) {
+    let step_index = step.step();
+    let bar = &bars[step_index];
+    if step_index > 0 {
+        let prev = &bars[step_index - 1];
+        if !prev.is_finished() {
+            prev.disable_steady_tick();
+            prev.finish_at_current_pos();
+        }
+    }
+
+    let style = ProgressStyle::default_bar()
+        .template("[eta: {eta_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}")
+        .progress_chars("##-");
+
+    match step {
+        RemapDocumentAddition { documents_seen } => {
+            bar.set_style(ProgressStyle::default_spinner());
+            bar.set_message(format!("remapped {} documents so far.", documents_seen));
+        }
+        ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
+            bar.set_style(style);
+            bar.set_length(total_documents as u64);
+            bar.set_message("Merging documents...");
+            bar.set_position(documents_seen as u64);
+        }
+        IndexDocuments { documents_seen, total_documents } => {
+            bar.set_style(style);
+            bar.set_length(total_documents as u64);
+            bar.set_message("Indexing documents...");
+            bar.set_position(documents_seen as u64);
+        }
+        MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
+            bar.set_style(style);
+            bar.set_length(total_databases as u64);
+            bar.set_message("Merging databases...");
+            bar.set_position(databases_seen as u64);
+        }
+    }
+    bar.enable_steady_tick(200);
+}
+
+fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
+    let mut writer = Cursor::new(Vec::new());
+    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+
+    let values = serde_json::Deserializer::from_reader(reader)
+        .into_iter::<Map<String, Value>>();
+    for document in values {
+        let document = document?;
+        documents.add_documents(document)?;
+    }
+    documents.finish()?;
+
+    Ok(writer.into_inner())
+}
+
+fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
+    let mut writer = Cursor::new(Vec::new());
+    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+
+    let json: serde_json::Value = serde_json::from_reader(reader)?;
+    documents.add_documents(json)?;
+    documents.finish()?;
+
+
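+    // hand back the raw bytes that the builder wrote into the cursor
+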
Ok(writer.into_inner()) +} + +fn documents_from_csv(reader: impl Read) -> Result> { + let mut writer = Cursor::new(Vec::new()); + let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + + let mut records = csv::Reader::from_reader(reader); + let iter = records.deserialize::>(); + + for doc in iter { + let doc = doc?; + documents.add_documents(doc)?; + } + + documents.finish()?; + + Ok(writer.into_inner()) +} + +#[derive(Debug, StructOpt)] +struct Search { + query: Option, + #[structopt(short, long)] + filter: Option, + #[structopt(short, long)] + offset: Option, + #[structopt(short, long)] + limit: Option, +} + +impl Search { + fn perform(&self, index: milli::Index) -> Result<()> { + let txn = index.env.read_txn()?; + let mut search = index.search(&txn); + + if let Some(ref query) = self.query { + search.query(query); + } + + if let Some(ref filter) = self.filter { + let condition = milli::FilterCondition::from_str(&txn, &index, filter)?; + search.filter(condition); + } + + if let Some(offset) = self.offset { + search.offset(offset); + } + + if let Some(limit) = self.limit { + search.limit(limit); + } + + let result = search.execute()?; + + let fields_ids_map = index.fields_ids_map(&txn)?; + let displayed_fields = + index.displayed_fields_ids(&txn)?.unwrap_or_else(|| fields_ids_map.ids().collect()); + let documents = index.documents(&txn, result.documents_ids)?; + let mut jsons = Vec::new(); + for (_, obkv) in documents { + let json = milli::obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?; + jsons.push(json); + } + + let hits = serde_json::to_string_pretty(&jsons)?; + + println!("{}", hits); + + Ok(()) + } +} + +#[derive(Debug, StructOpt)] +struct SettingsUpdate { + #[structopt(short, long)] + filterable_attributes: Option>, +} + +impl SettingsUpdate { + fn perform(&self, index: milli::Index) -> Result<()> { + let mut txn = index.env.write_txn()?; + + let mut update = milli::update::Settings::new(&mut txn, &index, 0); + update.log_every_n(100); + + if let Some(ref filterable_attributes) = self.filterable_attributes { + if !filterable_attributes.is_empty() { + update.set_filterable_fields(filterable_attributes.iter().cloned().collect()); + } else { + update.reset_filterable_fields(); + } + } + + let mut bars = Vec::new(); + let progesses = MultiProgress::new(); + for _ in 0..4 { + let bar = ProgressBar::hidden(); + let bar = progesses.add(bar); + bars.push(bar); + } + + std::thread::spawn(move || { + progesses.join().unwrap(); + }); + + update.execute(|step, _| indexing_callback(step, &bars))?; + + txn.commit()?; + Ok(()) + } +} diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 44183eaa2..1f897e820 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -37,6 +37,8 @@ fst = "0.4.5" # Temporary fix for bitvec, remove once fixed. 
(https://github.com/bitvecto-rs/bitvec/issues/105) funty = "=1.1" +bimap = "0.6.1" +csv = "1.1.6" [dev-dependencies] maplit = "1.0.2" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 5dbb0c326..1bacdfbed 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,6 +3,7 @@ mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; +use std::io::Cursor; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -18,8 +19,9 @@ use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; +use milli::documents::DocumentBatchReader; use milli::update::UpdateIndexingStep::*; -use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; +use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder}; use milli::{obkv_to_json, CompressionType, FilterCondition, Index, MatchingWords, SearchResult}; use once_cell::sync::OnceCell; use rayon::ThreadPool; @@ -350,19 +352,12 @@ async fn main() -> anyhow::Result<()> { let before_update = Instant::now(); // we extract the update type and execute the update itself. let result: anyhow::Result<()> = - match meta { + (|| match meta { UpdateMeta::DocumentsAddition { method, format, encoding } => { // We must use the write transaction of the update here. let mut wtxn = index_cloned.write_txn()?; let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned); - match format.as_str() { - "csv" => builder.update_format(UpdateFormat::Csv), - "json" => builder.update_format(UpdateFormat::Json), - "json-stream" => builder.update_format(UpdateFormat::JsonStream), - otherwise => panic!("invalid update format {:?}", otherwise), - }; - match method.as_str() { "replace" => builder .index_documents_method(IndexDocumentsMethod::ReplaceDocuments), @@ -377,11 +372,18 @@ async fn main() -> anyhow::Result<()> { otherwise => panic!("invalid encoding format {:?}", otherwise), }; - let result = builder.execute(reader, |indexing_step, update_id| { + let documents = match format.as_str() { + "csv" => documents_from_csv(reader)?, + "json" => documents_from_json(reader)?, + "jsonl" => documents_from_jsonl(reader)?, + otherwise => panic!("invalid update format {:?}", otherwise), + }; + + let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; + + let result = builder.execute(documents, |indexing_step, update_id| { let (current, total) = match indexing_step { - TransformFromUserIntoGenericFormat { documents_seen } => { - (documents_seen, None) - } + RemapDocumentAddition { documents_seen } => (documents_seen, None), ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { (documents_seen, Some(total_documents)) } @@ -482,9 +484,7 @@ async fn main() -> anyhow::Result<()> { let result = builder.execute(|indexing_step, update_id| { let (current, total) = match indexing_step { - TransformFromUserIntoGenericFormat { documents_seen } => { - (documents_seen, None) - } + RemapDocumentAddition { documents_seen } => (documents_seen, None), ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { (documents_seen, Some(total_documents)) } @@ -526,7 +526,7 @@ async fn main() -> anyhow::Result<()> { Err(e) => Err(e.into()), } } - }; + })(); let meta = match result { Ok(()) => { @@ -842,7 +842,7 @@ async fn main() -> anyhow::Result<()> { UpdateStatus, >, update_method: Option, - update_format: 
UpdateFormat, + format: String, encoding: Option, mut stream: impl futures::Stream> + Unpin, ) -> Result { @@ -863,13 +863,6 @@ async fn main() -> anyhow::Result<()> { _ => String::from("replace"), }; - let format = match update_format { - UpdateFormat::Csv => String::from("csv"), - UpdateFormat::Json => String::from("json"), - UpdateFormat::JsonStream => String::from("json-stream"), - _ => panic!("Unknown update format"), - }; - let meta = UpdateMeta::DocumentsAddition { method, format, encoding }; let update_id = update_store.register_update(&meta, &mmap[..]).unwrap(); let _ = update_status_sender.send(UpdateStatus::Pending { update_id, meta }); @@ -893,9 +886,9 @@ async fn main() -> anyhow::Result<()> { .and(warp::body::stream()) .and_then(move |content_type: String, content_encoding, params: QueryUpdate, stream| { let format = match content_type.as_str() { - "text/csv" => UpdateFormat::Csv, - "application/json" => UpdateFormat::Json, - "application/x-ndjson" => UpdateFormat::JsonStream, + "text/csv" => "csv", + "application/json" => "json", + "application/x-ndjson" => "jsonl", otherwise => panic!("invalid update format: {}", otherwise), }; @@ -903,7 +896,7 @@ async fn main() -> anyhow::Result<()> { update_store_cloned.clone(), update_status_sender_cloned.clone(), params.method, - format, + format.to_string(), content_encoding, stream, ) @@ -1031,6 +1024,49 @@ async fn main() -> anyhow::Result<()> { Ok(()) } +fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { + let mut writer = Cursor::new(Vec::new()); + let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + + let values = serde_json::Deserializer::from_reader(reader) + .into_iter::>(); + for document in values { + let document = document?; + documents.add_documents(document)?; + } + documents.finish()?; + + Ok(writer.into_inner()) +} + +fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { + let mut writer = Cursor::new(Vec::new()); + let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + + let json: serde_json::Value = serde_json::from_reader(reader)?; + documents.add_documents(json)?; + documents.finish()?; + + Ok(writer.into_inner()) +} + +fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { + let mut writer = Cursor::new(Vec::new()); + let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + + let mut records = csv::Reader::from_reader(reader); + let iter = records.deserialize::>(); + + for doc in iter { + let doc = doc?; + documents.add_documents(doc)?; + } + + documents.finish()?; + + Ok(writer.into_inner()) +} + #[cfg(test)] mod tests { use maplit::{btreeset, hashmap, hashset}; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 35e4644fa..37a524197 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -5,12 +5,13 @@ authors = ["Kerollmops "] edition = "2018" [dependencies] +bimap = { version = "0.6.1", features = ["serde"] } +bincode = "1.3.3" bstr = "0.2.15" byteorder = "1.4.2" chrono = { version = "0.4.19", features = ["serde"] } concat-arrays = "0.1.2" crossbeam-channel = "0.5.1" -csv = "1.1.5" either = "1.6.1" flate2 = "1.0.20" fst = "0.4.5" diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs new file mode 100644 index 000000000..ba1319eff --- /dev/null +++ b/milli/src/documents/builder.rs @@ -0,0 +1,80 @@ +use std::io; + +use byteorder::{BigEndian, WriteBytesExt}; +use serde::ser::Serialize; + +use super::serde::DocumentSerializer; +use super::{ByteCounter, 
DocumentsBatchIndex, DocumentsMetadata, Error};
+
+/// The `DocumentBatchBuilder` provides a way to build a documents batch in the intermediary
+/// format used by milli.
+///
+/// The writer used by the `DocumentBatchBuilder` can be read using a `DocumentBatchReader` to
+/// iterate over the documents.
+///
+/// ## Example
+/// ```
+/// use milli::documents::DocumentBatchBuilder;
+/// use serde_json::json;
+/// use std::io::Cursor;
+///
+/// let mut writer = Cursor::new(Vec::new());
+/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
+/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap();
+/// builder.finish().unwrap();
+/// ```
+pub struct DocumentBatchBuilder<W> {
+    serializer: DocumentSerializer<W>,
+}
+
+impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
+    pub fn new(writer: W) -> Result<Self, Error> {
+        let index = DocumentsBatchIndex::new();
+        let mut writer = ByteCounter::new(writer);
+        // reserve space to write the offset of the metadata, which is only known
+        // once all the documents have been added
+        writer.write_u64::<BigEndian>(0)?;
+
+        let serializer =
+            DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true };
+
+        Ok(Self { serializer })
+    }
+
+    /// Returns the number of documents that have been written to the builder.
+    pub fn len(&self) -> usize {
+        self.serializer.count
+    }
+
+    /// This method must be called after the document addition is terminated. It puts the
+    /// metadata at the end of the file, and writes the metadata offset at the beginning of
+    /// the file.
+    pub fn finish(self) -> Result<(), Error> {
+        let DocumentSerializer {
+            writer: ByteCounter { mut writer, count: offset },
+            index,
+            count,
+            ..
+        } = self.serializer;
+
+        let meta = DocumentsMetadata { count, index };
+
+        bincode::serialize_into(&mut writer, &meta)?;
+
+        writer.seek(io::SeekFrom::Start(0))?;
+        writer.write_u64::<BigEndian>(offset as u64)?;
+
+        writer.flush()?;
+
+        Ok(())
+    }
+
+    /// Adds documents to the builder.
+    ///
+    /// The internal index is updated with the fields found in the documents. A document must
+    /// either be a map or a sequence of maps; anything else will fail.
+    pub fn add_documents<T: Serialize>(&mut self, document: T) -> Result<(), Error> {
+        document.serialize(&mut self.serializer)?;
+        Ok(())
+    }
+}
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
new file mode 100644
index 000000000..da4227e6b
--- /dev/null
+++ b/milli/src/documents/mod.rs
@@ -0,0 +1,233 @@
+//! The documents module defines an intermediary document format that milli uses for indexation,
+//! and provides an API to easily build and read such documents.
+//!
+//! The `DocumentBatchBuilder` interface allows writing batches of documents to a writer that can
+//! later be read back by milli using the `DocumentBatchReader` interface.
+mod builder;
+mod reader;
+mod serde;
+
+use std::{fmt, io};
+
+use ::serde::{Deserialize, Serialize};
+use bimap::BiHashMap;
+pub use builder::DocumentBatchBuilder;
+pub use reader::DocumentBatchReader;
+
+use crate::FieldId;
+
+/// A bidirectional map that links field ids to their name in a document batch.
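+/// The writing side assigns a new id to every field name it has not seen yet, while the
+/// reading side uses it to map the stored ids back to field names.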
+pub type DocumentsBatchIndex = BiHashMap<FieldId, String>;
+
+#[derive(Debug, Serialize, Deserialize)]
+struct DocumentsMetadata {
+    count: usize,
+    index: DocumentsBatchIndex,
+}
+
+pub struct ByteCounter<W> {
+    count: usize,
+    writer: W,
+}
+
+impl<W> ByteCounter<W> {
+    fn new(writer: W) -> Self {
+        Self { count: 0, writer }
+    }
+}
+
+impl<W: io::Write> io::Write for ByteCounter<W> {
+    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        let count = self.writer.write(buf)?;
+        self.count += count;
+        Ok(count)
+    }
+
+    fn flush(&mut self) -> io::Result<()> {
+        self.writer.flush()
+    }
+}
+
+#[derive(Debug)]
+pub enum Error {
+    InvalidDocumentFormat,
+    Custom(String),
+    JsonError(serde_json::Error),
+    Serialize(bincode::Error),
+    Io(io::Error),
+    DocumentTooLarge,
+}
+
+impl From<io::Error> for Error {
+    fn from(other: io::Error) -> Self {
+        Self::Io(other)
+    }
+}
+
+impl From<bincode::Error> for Error {
+    fn from(other: bincode::Error) -> Self {
+        Self::Serialize(other)
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
+            Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
+            Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
+            Error::Io(e) => e.fmt(f),
+            Error::DocumentTooLarge => f.write_str("Provided document is too large (>2GiB)"),
+            Error::Serialize(e) => e.fmt(f),
+        }
+    }
+}
+
+impl std::error::Error for Error {}
+
+/// Macro used to generate documents, with the same syntax as `serde_json::json`.
+#[cfg(test)]
+macro_rules! documents {
+    ($data:tt) => {{
+        let documents = serde_json::json!($data);
+        let mut writer = std::io::Cursor::new(Vec::new());
+        let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
+        builder.add_documents(documents).unwrap();
+        builder.finish().unwrap();
+
+        writer.set_position(0);
+
+        crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
+    }};
+}
+
+#[cfg(test)]
+mod test {
+    use serde_json::{json, Value};
+
+    use super::*;
+
+    #[test]
+    fn create_documents_no_errors() {
+        let json = json!({
+            "number": 1,
+            "string": "this is a field",
+            "array": ["an", "array"],
+            "object": {
+                "key": "value",
+            },
+            "bool": true
+        });
+
+        let mut v = Vec::new();
+        let mut cursor = io::Cursor::new(&mut v);
+
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+
+        builder.add_documents(json).unwrap();
+
+        builder.finish().unwrap();
+
+        let mut documents =
+            DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
+
+        assert_eq!(documents.index().iter().count(), 5);
+
+        let reader = documents.next_document_with_index().unwrap().unwrap();
+
+        assert_eq!(reader.1.iter().count(), 5);
+        assert!(documents.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn test_add_multiple_documents() {
+        let doc1 = json!({
+            "bool": true,
+        });
+        let doc2 = json!({
+            "toto": false,
+        });
+
+        let mut v = Vec::new();
+        let mut cursor = io::Cursor::new(&mut v);
+
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+
+        builder.add_documents(doc1).unwrap();
+        builder.add_documents(doc2).unwrap();
+
+        builder.finish().unwrap();
+
+        let mut documents =
+            DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
+
+        assert_eq!(documents.index().iter().count(), 2);
+
+        let reader = documents.next_document_with_index().unwrap().unwrap();
+
+        assert_eq!(reader.1.iter().count(), 1);
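+        // the second document is read next; the call after that returns None
+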
assert!(documents.next_document_with_index().unwrap().is_some()); + assert!(documents.next_document_with_index().unwrap().is_none()); + } + + #[test] + fn add_documents_array() { + let docs = json!([ + { "toto": false }, + { "tata": "hello" }, + ]); + + let mut v = Vec::new(); + let mut cursor = io::Cursor::new(&mut v); + + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + builder.add_documents(docs).unwrap(); + + builder.finish().unwrap(); + + let mut documents = + DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); + + assert_eq!(documents.index().iter().count(), 2); + + let reader = documents.next_document_with_index().unwrap().unwrap(); + + assert_eq!(reader.1.iter().count(), 1); + assert!(documents.next_document_with_index().unwrap().is_some()); + assert!(documents.next_document_with_index().unwrap().is_none()); + } + + #[test] + fn add_invalid_document_format() { + let mut v = Vec::new(); + let mut cursor = io::Cursor::new(&mut v); + + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let docs = json!([[ + { "toto": false }, + { "tata": "hello" }, + ]]); + + assert!(builder.add_documents(docs).is_err()); + + let docs = json!("hello"); + + assert!(builder.add_documents(docs).is_err()); + } + + #[test] + fn test_nested() { + let mut docs = documents!([{ + "hello": { + "toto": ["hello"] + } + }]); + + let (_index, doc) = docs.next_document_with_index().unwrap().unwrap(); + + let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); + assert_eq!(nested, json!({ "toto": ["hello"] })); + } +} diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs new file mode 100644 index 000000000..14d7c8ceb --- /dev/null +++ b/milli/src/documents/reader.rs @@ -0,0 +1,75 @@ +use std::io; +use std::io::{BufReader, Read}; +use std::mem::size_of; + +use byteorder::{BigEndian, ReadBytesExt}; +use obkv::KvReader; + +use super::{DocumentsBatchIndex, DocumentsMetadata, Error}; +use crate::FieldId; + +/// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with +/// a `DocumentsBatchWriter`. +/// +/// The documents are returned in the form of `obkv::Reader` where each field is identified with a +/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. +pub struct DocumentBatchReader { + reader: BufReader, + metadata: DocumentsMetadata, + buffer: Vec, + seen_documents: usize, +} + +impl DocumentBatchReader { + /// Construct a `DocumentsReader` from a reader. + /// + /// It first retrieves the index, then moves to the first document. Subsequent calls to + /// `next_document` advance the document reader until all the documents have been read. + pub fn from_reader(mut reader: R) -> Result { + let mut buffer = Vec::new(); + + let meta_offset = reader.read_u64::()?; + reader.seek(io::SeekFrom::Start(meta_offset))?; + reader.read_to_end(&mut buffer)?; + let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?; + + reader.seek(io::SeekFrom::Start(size_of::() as u64))?; + buffer.clear(); + + let reader = BufReader::new(reader); + + Ok(Self { reader, metadata, buffer, seen_documents: 0 }) + } + + /// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a + /// reference to the addition index. 
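+    ///
+    /// Returns `Ok(None)` once every document in the batch has been read.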
+    pub fn next_document_with_index<'a>(
+        &'a mut self,
+    ) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
+        if self.seen_documents < self.metadata.count {
+            let doc_len = self.reader.read_u32::<BigEndian>()?;
+            self.buffer.resize(doc_len as usize, 0);
+            self.reader.read_exact(&mut self.buffer)?;
+            self.seen_documents += 1;
+
+            let reader = KvReader::new(&self.buffer);
+            Ok(Some((&self.metadata.index, reader)))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Returns the fields index for the documents batch.
+    pub fn index(&self) -> &DocumentsBatchIndex {
+        &self.metadata.index
+    }
+
+    /// Returns the number of documents in the reader.
+    pub fn len(&self) -> usize {
+        self.metadata.count
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs
new file mode 100644
index 000000000..76dc8915c
--- /dev/null
+++ b/milli/src/documents/serde.rs
@@ -0,0 +1,465 @@
+use std::convert::TryInto;
+use std::{fmt, io};
+
+use byteorder::{BigEndian, WriteBytesExt};
+use obkv::KvWriter;
+use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer};
+
+use super::{ByteCounter, DocumentsBatchIndex, Error};
+use crate::FieldId;
+
+pub struct DocumentSerializer<W> {
+    pub writer: ByteCounter<W>,
+    pub buffer: Vec<u8>,
+    pub index: DocumentsBatchIndex,
+    pub count: usize,
+    pub allow_seq: bool,
+}
+
+impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> {
+    type Ok = ();
+
+    type Error = Error;
+
+    type SerializeSeq = SeqSerializer<'a, W>;
+    type SerializeTuple = Impossible<(), Self::Error>;
+    type SerializeTupleStruct = Impossible<(), Self::Error>;
+    type SerializeTupleVariant = Impossible<(), Self::Error>;
+    type SerializeMap = MapSerializer<'a, &'a mut ByteCounter<W>>;
+    type SerializeStruct = Impossible<(), Self::Error>;
+    type SerializeStructVariant = Impossible<(), Self::Error>;
+
+    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
+        self.buffer.clear();
+        let cursor = io::Cursor::new(&mut self.buffer);
+        self.count += 1;
+        let map_serializer = MapSerializer {
+            map: KvWriter::new(cursor),
+            index: &mut self.index,
+            writer: &mut self.writer,
+            buffer: Vec::new(),
+        };
+
+        Ok(map_serializer)
+    }
+
+    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
+        if self.allow_seq {
+            // Only allow a sequence of documents of depth 1.
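+            // the flag is cleared so that a sequence nested inside this one is rejected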
+            self.allow_seq = false;
+            Ok(SeqSerializer { serializer: self })
+        } else {
+            Err(Error::InvalidDocumentFormat)
+        }
+    }
+
+    fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
+    where
+        T: Serialize,
+    {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_unit_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+    ) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_newtype_struct<T: ?Sized>(
+        self,
+        _name: &'static str,
+        _value: &T,
+    ) -> Result<Self::Ok, Self::Error>
+    where
+        T: Serialize,
+    {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_newtype_variant<T: ?Sized>(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _value: &T,
+    ) -> Result<Self::Ok, Self::Error>
+    where
+        T: Serialize,
+    {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_tuple_struct(
+        self,
+        _name: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_tuple_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_struct(
+        self,
+        _name: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeStruct, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_struct_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize,
+    ) -> Result<Self::SerializeStructVariant, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+}
+
+pub struct SeqSerializer<'a, W> {
+    serializer: &'a mut DocumentSerializer<W>,
+}
+
+impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
+    type Ok = ();
+    type Error = Error;
+
+    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    where
+        T: Serialize,
+    {
+        value.serialize(&mut *self.serializer)?;
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(())
+    }
+}
+
+pub struct MapSerializer<'a, W> {
+    map: KvWriter<io::Cursor<&'a mut Vec<u8>>, FieldId>,
+    index: &'a mut DocumentsBatchIndex,
+    writer: W,
+    buffer: Vec<u8>,
+}
+
+/// This implementation of `SerializeMap` uses `serialize_entry` instead of `serialize_key` and
+/// `serialize_value`, therefore these two methods remain unimplemented.
+impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
+    type Ok = ();
+    type Error = Error;
+
+    fn serialize_key<T: Serialize + ?Sized>(&mut self, _key: &T) -> Result<(), Self::Error> {
+        unreachable!()
+    }
+
+    fn serialize_value<T: Serialize + ?Sized>(&mut self, _value: &T) -> Result<(), Self::Error> {
+        unreachable!()
+    }
+
+    fn end(mut self) -> Result<Self::Ok, Self::Error> {
+        let data = self.map.into_inner().map_err(Error::Io)?.into_inner();
+        let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?;
+
+        self.writer.write_u32::<BigEndian>(data_len).map_err(Error::Io)?;
+        self.writer.write_all(&data).map_err(Error::Io)?;
+
+        Ok(())
+    }
+
+    fn serialize_entry<K: ?Sized, V: ?Sized>(
+        &mut self,
+        key: &K,
+        value: &V,
+    ) -> Result<(), Self::Error>
+    where
+        K: Serialize,
+        V: Serialize,
+    {
+        let field_serializer = FieldSerializer { index: &mut self.index };
+        let field_id: FieldId = key.serialize(field_serializer)?;
+
+        self.buffer.clear();
+        let mut cursor = io::Cursor::new(&mut self.buffer);
+        serde_json::to_writer(&mut cursor, value).map_err(Error::JsonError)?;
+
+        self.map.insert(field_id, cursor.into_inner()).map_err(Error::Io)?;
+
+        Ok(())
+    }
+}
+
+struct FieldSerializer<'a> {
+    index: &'a mut DocumentsBatchIndex,
+}
+
+impl<'a> serde::Serializer for FieldSerializer<'a> {
+    type Ok = FieldId;
+
+    type Error = Error;
+
+    type SerializeSeq = Impossible<FieldId, Error>;
+    type SerializeTuple = Impossible<FieldId, Error>;
+    type SerializeTupleStruct = Impossible<FieldId, Error>;
+    type SerializeTupleVariant = Impossible<FieldId, Error>;
+    type SerializeMap = Impossible<FieldId, Error>;
+    type SerializeStruct = Impossible<FieldId, Error>;
+    type SerializeStructVariant = Impossible<FieldId, Error>;
+
+    fn serialize_str(self, ws: &str) -> Result<Self::Ok, Self::Error> {
+        let field_id = match self.index.get_by_right(ws) {
+            Some(field_id) => *field_id,
+            None => {
+                let field_id = self.index.len() as FieldId;
+                self.index.insert(field_id, ws.to_string());
+                field_id
+            }
+        };
+
+        Ok(field_id)
+    }
+
+    fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
+    where
+        T: Serialize,
+    {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
+        Err(Error::InvalidDocumentFormat)
+    }
+
+    fn serialize_unit_variant(
self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + ) -> Result { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + _value: &T, + ) -> Result + where + T: Serialize, + { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T, + ) -> Result + where + T: Serialize, + { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize, + ) -> Result { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize, + ) -> Result { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_map(self, _len: Option) -> Result { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize, + ) -> Result { + Err(Error::InvalidDocumentFormat) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize, + ) -> Result { + Err(Error::InvalidDocumentFormat) + } +} + +impl serde::ser::Error for Error { + fn custom(msg: T) -> Self { + Error::Custom(msg.to_string()) + } +} diff --git a/milli/src/error.rs b/milli/src/error.rs index e6bd3fd62..fe0ac2cf7 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -55,7 +55,6 @@ pub enum FieldIdMapMissingEntry { #[derive(Debug)] pub enum UserError { AttributeLimitReached, - Csv(csv::Error), DocumentLimitReached, InvalidAscDescSyntax { name: String }, InvalidDocumentId { document_id: Value }, @@ -212,7 +211,6 @@ impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), - Self::Csv(error) => error.fmt(f), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), Self::InvalidFacetsDistribution { invalid_facets_name } => { let name_list = diff --git a/milli/src/index.rs b/milli/src/index.rs index f7603148d..dd5851ccc 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -868,7 +868,7 @@ pub(crate) mod tests { use maplit::btreemap; use tempfile::TempDir; - use crate::update::{IndexDocuments, UpdateFormat}; + use crate::update::IndexDocuments; use crate::Index; pub(crate) struct TempIndex { @@ -904,13 +904,12 @@ pub(crate) mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 1, "name": "kevin" }, { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -929,8 +928,12 @@ pub(crate) mod tests { // we add all the documents a second time. 
we are supposed to get the same // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + let content = documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 } + ]); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -947,13 +950,12 @@ pub(crate) mod tests { ); // then we update a document by removing one field and another by adding one field - let content = &br#"[ + let content = documents!([ { "id": 1, "name": "kevin", "has_dog": true }, { "id": 2, "name": "bob" } - ]"#[..]; + ]); let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 7c9f56665..550e7f13d 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,6 +1,9 @@ #[macro_use] extern crate pest_derive; +#[macro_use] +pub mod documents; + mod criterion; mod error; mod external_documents_ids; diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index e7dc52a82..deb51a053 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -27,6 +27,7 @@ pub trait Distinct { #[cfg(test)] mod test { use std::collections::HashSet; + use std::io::Cursor; use once_cell::sync::Lazy; use rand::seq::SliceRandom; @@ -34,19 +35,20 @@ mod test { use roaring::RoaringBitmap; use serde_json::{json, Value}; + use crate::documents::{DocumentBatchBuilder, DocumentBatchReader}; use crate::index::tests::TempIndex; use crate::index::Index; - use crate::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; + use crate::update::{IndexDocumentsMethod, UpdateBuilder}; use crate::{DocumentId, FieldId, BEU32}; - static JSON: Lazy = Lazy::new(generate_json); + static JSON: Lazy> = Lazy::new(generate_documents); - fn generate_json() -> Value { + fn generate_documents() -> Vec { let mut rng = rand::thread_rng(); let num_docs = rng.gen_range(10..30); - let mut documents = Vec::new(); - + let mut cursor = Cursor::new(Vec::new()); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -66,10 +68,11 @@ mod test { "txts": sample_txts[..(rng.gen_range(0..3))], "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - documents.push(doc); + builder.add_documents(doc).unwrap(); } - Value::Array(documents) + builder.finish().unwrap(); + cursor.into_inner() } /// Returns a temporary index populated with random test documents, the FieldId for the @@ -89,13 +92,15 @@ mod test { let mut addition = builder.index_documents(&mut txn, &index); addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - addition.update_format(UpdateFormat::Json); - addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); + let reader = + crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); + addition.execute(reader, |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); - let map = (0..JSON.as_array().unwrap().len() as 
u32).collect(); + let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); + let map = (0..documents.len() as u32).collect(); txn.commit().unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index e937cb65f..ea4193eaf 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -82,7 +82,7 @@ mod tests { use heed::EnvOpenOptions; use super::*; - use crate::update::{IndexDocuments, UpdateFormat}; + use crate::update::IndexDocuments; #[test] fn clear_documents() { @@ -92,14 +92,12 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 0, "name": "kevin", "age": 20 }, { "id": 1, "name": "kevina" }, { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); + ]); + IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); // Clear all documents from the database. let builder = ClearDocuments::new(&mut wtxn, &index, 1); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b49cdc3cd..1b16ba9bf 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -567,7 +567,7 @@ mod tests { use maplit::hashset; use super::*; - use crate::update::{IndexDocuments, Settings, UpdateFormat}; + use crate::update::{IndexDocuments, Settings}; use crate::FilterCondition; #[test] @@ -578,13 +578,12 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. @@ -609,13 +608,12 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "mysuperid": 0, "name": "kevin" }, { "mysuperid": 1, "name": "kevina" }, { "mysuperid": 2, "name": "benoit" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); // Delete not all of the documents but some of them. @@ -640,7 +638,7 @@ mod tests { builder.set_filterable_fields(hashset! 
{ S("label") }); builder.execute(|_, _| ()).unwrap(); - let content = &br#"[ + let content = documents!([ {"docid":"1_4","label":"sign"}, {"docid":"1_5","label":"letter"}, {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, @@ -661,9 +659,8 @@ mod tests { {"docid":"1_58","label":"abstract,art,cartoon"}, {"docid":"1_68","label":"design"}, {"docid":"1_69","label":"geometry"} - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); // Delete not all of the documents but some of them. @@ -692,7 +689,7 @@ mod tests { builder.set_sortable_fields(hashset!(S("_geo"))); builder.execute(|_, _| ()).unwrap(); - let content = &r#"[ + let content = documents!([ {"id":"1","city":"Lille", "_geo": { "lat": 50.629973371633746, "lng": 3.0569447399419570 } }, {"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.641586120121050, "lng": 3.1106593480348670 } }, {"id":"3","city":"Hellemmes", "_geo": { "lat": 50.631220965518080, "lng": 3.1106399673339933 } }, @@ -713,12 +710,10 @@ mod tests { {"id":"18","city":"Amiens", "_geo": { "lat": 49.931472529669996, "lng": 2.2710499758317080 } }, {"id":"19","city":"Compiègne", "_geo": { "lat": 49.444980887725656, "lng": 2.7913841281529015 } }, {"id":"20","city":"Paris", "_geo": { "lat": 48.902100060895480, "lng": 2.3708400867406930 } } - ]"#[..]; + ]); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); - builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); let ids_to_delete: Vec = external_ids_to_delete diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index bdd00dc56..f9577243f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -4,7 +4,7 @@ mod transform; mod typed_chunk; use std::collections::HashSet; -use std::io::{self, BufRead, BufReader}; +use std::io::{Read, Seek}; use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; use std::time::Instant; @@ -24,6 +24,7 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; +use crate::documents::DocumentBatchReader; use crate::update::{ Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordsLevelPositions, WordsPrefixesFst, @@ -57,17 +58,6 @@ pub enum WriteMethod { GetMergePut, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] -#[non_exhaustive] -pub enum UpdateFormat { - /// The given update is a real **comma seperated** CSV with headers on the first line. - Csv, - /// The given update is a JSON array with documents inside. - Json, - /// The given update is a JSON stream with a document on each line. 
- JsonStream, -} - pub struct IndexDocuments<'t, 'u, 'i, 'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, @@ -85,7 +75,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { words_positions_level_group_size: Option, words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, - update_format: UpdateFormat, autogenerate_docids: bool, update_id: u64, } @@ -113,18 +102,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { words_positions_level_group_size: None, words_positions_min_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, - update_format: UpdateFormat::Json, autogenerate_docids: false, update_id, } } - pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) { - self.update_method = method; + pub fn log_every_n(&mut self, n: usize) { + self.log_every_n = Some(n); } - pub fn update_format(&mut self, format: UpdateFormat) { - self.update_format = format; + pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) { + self.update_method = method; } pub fn enable_autogenerate_docids(&mut self) { @@ -136,16 +124,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { } #[logging_timer::time("IndexDocuments::{}")] - pub fn execute(self, reader: R, progress_callback: F) -> Result + pub fn execute( + self, + reader: DocumentBatchReader, + progress_callback: F, + ) -> Result where - R: io::Read, + R: Read + Seek, F: Fn(UpdateIndexingStep, u64) + Sync, { - let mut reader = BufReader::new(reader); - reader.fill_buf()?; - // Early return when there is no document to add - if reader.buffer().is_empty() { + if reader.is_empty() { return Ok(DocumentAdditionResult { nb_documents: 0 }); } @@ -165,14 +154,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { autogenerate_docids: self.autogenerate_docids, }; - let output = match self.update_format { - UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?, - UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?, - UpdateFormat::JsonStream => { - transform.output_from_json_stream(reader, &progress_callback)? - } - }; - + let output = transform.read_documents(reader, progress_callback)?; let nb_documents = output.documents_count; info!("Update transformed in {:.02?}", before_transform.elapsed()); @@ -462,6 +444,7 @@ mod tests { use heed::EnvOpenOptions; use super::*; + use crate::documents::DocumentBatchBuilder; use crate::update::DeleteDocuments; use crate::HashMap; @@ -474,9 +457,12 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "kevina" }, + { "id": 3, "name": "benoit" } + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -488,9 +474,8 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. 
let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,updated kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ { "id": 1, "name": "updated kevin" } ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -502,9 +487,12 @@ mod tests { // Third we send 3 documents again to replace the existing ones. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ + { "id": 1, "name": "updated second kevin" }, + { "id": 2, "name": "updated kevina" }, + { "id": 3, "name": "updated benoit" } + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 2); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -525,9 +513,12 @@ mod tests { // First we send 3 documents with duplicate ids and // change the index method to merge documents. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,kevin\n1,kevina\n1,benoit\n"[..]; + let content = documents!([ + { "id": 1, "name": "kevin" }, + { "id": 1, "name": "kevina" }, + { "id": 1, "name": "benoit" } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -552,9 +543,8 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,age\n1,25\n"[..]; + let content = documents!([ { "id": 1, "age": 25 } ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -574,13 +564,13 @@ mod tests { let mut doc_iter = doc.iter(); assert_eq!(doc_iter.next(), Some((0, &br#""1""#[..]))); assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..]))); - assert_eq!(doc_iter.next(), Some((2, &br#""25""#[..]))); + assert_eq!(doc_iter.next(), Some((2, &br#"25"#[..]))); assert_eq!(doc_iter.next(), None); drop(rtxn); } #[test] - fn not_auto_generated_csv_documents_ids() { + fn not_auto_generated_documents_ids() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -588,35 +578,12 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); - assert!(builder.execute(content, |_, _| ()).is_err()); - wtxn.commit().unwrap(); - - // Check that there is no document. 
- let rtxn = index.read_txn().unwrap(); - let count = index.number_of_documents(&rtxn).unwrap(); - assert_eq!(count, 0); - drop(rtxn); - } - - #[test] - fn not_auto_generated_json_documents_ids() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // First we send 3 documents and 2 without ids. - let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ - { "name": "kevina", "id": 21 }, + let content = documents!([ { "name": "kevin" }, + { "name": "kevina" }, { "name": "benoit" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); @@ -636,10 +603,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name\nkevin\nkevina\nbenoit\n"[..]; + let content = documents!([ + { "name": "kevin" }, + { "name": "kevina" }, + { "name": "benoit" } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -655,10 +625,9 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); - let content = format!("id,name\n{},updated kevin", kevin_uuid); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); - builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -689,9 +658,12 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "kevina" }, + { "id": 3, "name": "benoit" } + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -703,9 +675,9 @@ mod tests { // Second we send 1 document without specifying the id. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name\nnew kevin"[..]; + let content = documents!([ { "name": "new kevin" } ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); + builder.enable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -717,7 +689,7 @@ mod tests { } #[test] - fn empty_csv_update() { + fn empty_update() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -725,9 +697,8 @@ mod tests { // First we send 0 documents and only headers. 
let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + let content = documents!([]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -738,83 +709,6 @@ mod tests { drop(rtxn); } - #[test] - fn json_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // First we send 3 documents with an id for only one of them. - let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ - { "name": "kevin" }, - { "name": "kevina", "id": 21 }, - { "name": "benoit" } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Check that there is 3 documents now. - let rtxn = index.read_txn().unwrap(); - let count = index.number_of_documents(&rtxn).unwrap(); - assert_eq!(count, 3); - drop(rtxn); - } - - #[test] - fn empty_json_update() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // First we send 0 documents. - let mut wtxn = index.write_txn().unwrap(); - let content = &b"[]"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Check that there is no documents. - let rtxn = index.read_txn().unwrap(); - let count = index.number_of_documents(&rtxn).unwrap(); - assert_eq!(count, 0); - drop(rtxn); - } - - #[test] - fn json_stream_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // First we send 3 documents with an id for only one of them. - let mut wtxn = index.write_txn().unwrap(); - let content = &br#" - { "name": "kevin" } - { "name": "kevina", "id": 21 } - { "name": "benoit" } - "#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::JsonStream); - builder.execute(content, |_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Check that there is 3 documents now. - let rtxn = index.read_txn().unwrap(); - let count = index.number_of_documents(&rtxn).unwrap(); - assert_eq!(count, 3); - drop(rtxn); - } - #[test] fn invalid_documents_ids() { let path = tempfile::tempdir().unwrap(); @@ -825,18 +719,16 @@ mod tests { // First we send 1 document with an invalid id. let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. - let content = &b"id,name\nbrume bleue,kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); assert!(builder.execute(content, |_, _| ()).is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. 
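The "brume bleue" rejection above comes down to the document id grammar that transform.rs enforces; only the tail of `validate_document_id` is visible later in this patch, so this is a hedged reconstruction matching the `[a-zA-Z0-9\-_]` comment in the old code, not the verbatim helper:

```rust
// Ids may only contain ASCII alphanumerics, `-` and `_`, so "brume bleue"
// (it has a space) fails while "32" passes.
fn validate_document_id(document_id: &str) -> Option<&str> {
    Some(document_id).filter(|id| {
        !id.is_empty()
            && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
    })
}
```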
let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. - let content = &b"id,name\n32,kevin\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Csv); + let content = documents!([ { "id": 32, "name": "kevin" } ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -848,7 +740,7 @@ mod tests { } #[test] - fn complex_json_documents() { + fn complex_documents() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -856,13 +748,12 @@ mod tests { // First we send 3 documents with an id for only one of them. let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -893,33 +784,31 @@ mod tests { // First we send 3 documents with an id for only one of them. let mut wtxn = index.write_txn().unwrap(); - let documents = &r#"[ + let documents = documents!([ { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 }, { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 }, { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" }, { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - builder.execute(Cursor::new(documents), |_, _| ()).unwrap(); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); let mut wtxn = index.write_txn().unwrap(); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - let documents = &r#"[ + let documents = documents!([ { "id": 2, "author": "J. 
Austen", "date": "1813" } - ]"#[..]; + ]); - builder.execute(Cursor::new(documents), |_, _| ()).unwrap(); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); } @@ -931,15 +820,13 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ + let content = documents!([ { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); + ]); + IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); @@ -951,22 +838,18 @@ mod tests { let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_none()); - let content = &br#"[ + let content = documents!([ { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); + ]); + IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); - let content = &br#"[ + let content = documents!([ { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); - builder.execute(content, |_, _| ()).unwrap(); + ]); + IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); } @@ -987,12 +870,16 @@ mod tests { big_object.insert(key, "I am a text!"); } - let content = vec![big_object]; - let content = serde_json::to_string(&content).unwrap(); + let mut cursor = Cursor::new(Vec::new()); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Json); - builder.execute(Cursor::new(content), |_, _| ()).unwrap(); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + builder.add_documents(big_object).unwrap(); + builder.finish().unwrap(); + cursor.set_position(0); + let content = DocumentBatchReader::from_reader(cursor).unwrap(); + + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); } @@ -1005,16 +892,38 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = r#"#id,title,au{hor,genre,price$ -2,"Prideand Prejudice","Jane Austin","romance",3.5$ -456,"Le Petit Prince","Antoine de Saint-Exupéry","adventure",10.0$ -1,Wonderland","Lewis Carroll","fantasy",25.99$ -4,"Harry Potter ing","fantasy\0lood Prince","J. K. 
Rowling","fantasy\0, -"#; + let content = documents!([ + { + "id": 2, + "title": "Prideand Prejudice", + "au{hor": "Jane Austin", + "genre": "romance", + "price$": "3.5$", + }, + { + "id": 456, + "title": "Le Petit Prince", + "au{hor": "Antoine de Saint-Exupéry", + "genre": "adventure", + "price$": "10.0$", + }, + { + "id": 1, + "title": "Wonderland", + "au{hor": "Lewis Carroll", + "genre": "fantasy", + "price$": "25.99$", + }, + { + "id": 4, + "title": "Harry Potter ing fantasy\0lood Prince", + "au{hor": "J. K. Rowling", + "genre": "fantasy\0", + }, + ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); - builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e8b61ef14..fc5eb2c84 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,12 +1,12 @@ use std::borrow::Cow; use std::collections::btree_map::Entry; +use std::collections::HashMap; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; -use std::iter::Peekable; -use std::result::Result as StdResult; use std::time::Instant; use grenad::CompressionType; +use itertools::Itertools; use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; @@ -15,7 +15,8 @@ use super::helpers::{ create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, }; use super::IndexDocumentsMethod; -use crate::error::{InternalError, UserError}; +use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; +use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32}; @@ -51,90 +52,63 @@ pub struct Transform<'t, 'i> { pub autogenerate_docids: bool, } -fn is_primary_key(field: impl AsRef) -> bool { - field.as_ref().to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME) +/// Create a mapping between the field ids found in the document batch and the one that were +/// already present in the index. +/// +/// If new fields are present in the addition, they are added to the index field ids map. +fn create_fields_mapping( + index_field_map: &mut FieldsIdsMap, + batch_field_map: &DocumentsBatchIndex, +) -> Result> { + batch_field_map + .iter() + // we sort by id here to ensure a deterministic mapping of the fields, that preserves + // the original ordering. 
+        .sorted_by_key(|(&id, _)| id)
+        .map(|(field, name)| match index_field_map.id(&name) {
+            Some(id) => Ok((*field, id)),
+            None => index_field_map
+                .insert(&name)
+                .ok_or(Error::UserError(UserError::AttributeLimitReached))
+                .map(|id| (*field, id)),
+        })
+        .collect()
+}
+
+fn find_primary_key(index: &bimap::BiHashMap<FieldId, String>) -> Option<&str> {
+    index
+        .right_values()
+        .find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME))
+        .map(String::as_str)
 }
 
 impl Transform<'_, '_> {
-    pub fn output_from_json<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
-    where
-        R: Read,
-        F: Fn(UpdateIndexingStep) + Sync,
-    {
-        self.output_from_generic_json(reader, false, progress_callback)
-    }
-
-    pub fn output_from_json_stream<R, F>(
+    pub fn read_documents<R, F>(
         self,
-        reader: R,
+        mut reader: DocumentBatchReader<R>,
        progress_callback: F,
     ) -> Result<TransformOutput>
     where
-        R: Read,
-        F: Fn(UpdateIndexingStep) + Sync,
-    {
-        self.output_from_generic_json(reader, true, progress_callback)
-    }
-
-    fn output_from_generic_json<R, F>(
-        self,
-        reader: R,
-        is_stream: bool,
-        progress_callback: F,
-    ) -> Result<TransformOutput>
-    where
-        R: Read,
+        R: Read + Seek,
         F: Fn(UpdateIndexingStep) + Sync,
     {
+        let fields_index = reader.index();
         let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
-        let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
+        let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?;
 
-        // Deserialize the whole batch of documents in memory.
-        let mut documents: Peekable<
-            Box<dyn Iterator<Item = StdResult<Map<String, Value>, serde_json::Error>>>,
-        > = if is_stream {
-            let iter = serde_json::Deserializer::from_reader(reader).into_iter();
-            let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
-            iter.peekable()
-        } else {
-            let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
-            let iter = vec.into_iter().map(Ok);
-            let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
-            iter.peekable()
-        };
+        let alternative_name = self
+            .index
+            .primary_key(self.rtxn)?
+            .or_else(|| find_primary_key(fields_index))
+            .map(String::from);
 
-        // We extract the primary key from the first document in
-        // the batch if it hasn't already been defined in the index
-        let first = match documents.peek().map(StdResult::as_ref).transpose() {
-            Ok(first) => first,
-            Err(_) => {
-                let error = documents.next().unwrap().unwrap_err();
-                return Err(UserError::SerdeJson(error).into());
-            }
-        };
-
-        let alternative_name =
-            first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
-
-        let (primary_key_id, primary_key) = compute_primary_key_pair(
+        let (primary_key_id, primary_key_name) = compute_primary_key_pair(
             self.index.primary_key(self.rtxn)?,
             &mut fields_ids_map,
             alternative_name,
             self.autogenerate_docids,
         )?;
 
-        if documents.peek().is_none() {
-            return Ok(TransformOutput {
-                primary_key,
-                fields_ids_map,
-                field_distribution: self.index.field_distribution(self.rtxn)?,
-                external_documents_ids: ExternalDocumentsIds::default(),
-                new_documents_ids: RoaringBitmap::new(),
-                replaced_documents_ids: RoaringBitmap::new(),
-                documents_count: 0,
-                documents_file: tempfile::tempfile()?,
-            });
-        }
-
         // We must choose the appropriate merge function for when two or more documents
         // with the same user id must be merged or fully replaced in the same batch.
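The comment above introduces the choice that the `match` on the next line makes; its arms are cut off by the hunk, but given the `keep_latest_obkv` and `merge_obkvs` helpers imported at the top of this file, they presumably read as follows (a reconstruction, not the verbatim source):

```rust
// ReplaceDocuments: the newest obkv for an external id wins outright.
// UpdateDocuments: obkvs sharing an id are merged field by field.
let merge_function = match self.index_documents_method {
    IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
    IndexDocumentsMethod::UpdateDocuments => merge_obkvs,
};
```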
let merge_function = match self.index_documents_method { @@ -151,204 +125,103 @@ impl Transform<'_, '_> { self.max_memory, ); - let mut json_buffer = Vec::new(); let mut obkv_buffer = Vec::new(); - let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; let mut documents_count = 0; - - for result in documents { - let document = result.map_err(UserError::SerdeJson)?; - + let mut external_id_buffer = Vec::new(); + let mut field_buffer: Vec<(u16, &[u8])> = Vec::new(); + while let Some((addition_index, document)) = reader.next_document_with_index()? { + let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.log_every_n.map_or(false, |len| documents_count % len == 0) { - progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { + progress_callback(UpdateIndexingStep::RemapDocumentAddition { documents_seen: documents_count, }); } - obkv_buffer.clear(); - let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); - - // We prepare the fields ids map with the documents keys. - for (key, _value) in &document { - fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; + for (k, v) in document.iter() { + let mapped_id = *mapping.get(&k).unwrap(); + field_buffer_cache.push((mapped_id, v)); } - // We retrieve the user id from the document based on the primary key name, - // if the document id isn't present we generate a uuid. - let external_id = match document.get(&primary_key) { - Some(value) => match value { - Value::String(string) => Cow::Borrowed(string.as_str()), - Value::Number(number) => Cow::Owned(number.to_string()), - content => { - return Err( - UserError::InvalidDocumentId { document_id: content.clone() }.into() - ) + // We need to make sure that every document has a primary key. After we have remapped + // all the fields in the document, we try to find the primary key value. If we can find + // it, transform it into a string and validate it, and then update it in the + // document. If none is found, and we were told to generate missing document ids, then + // we create the missing field, and update the new document. + let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; + let external_id = + match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { + Some((_, bytes)) => { + let value = match serde_json::from_slice(bytes).unwrap() { + Value::String(string) => match validate_document_id(&string) { + Some(s) if s.len() == string.len() => string, + Some(s) => s.to_string(), + None => { + return Err(UserError::InvalidDocumentId { + document_id: Value::String(string), + } + .into()) + } + }, + Value::Number(number) => number.to_string(), + content => { + return Err(UserError::InvalidDocumentId { + document_id: content.clone(), + } + .into()) + } + }; + serde_json::to_writer(&mut external_id_buffer, &value).unwrap(); + *bytes = &external_id_buffer; + Cow::Owned(value) } - }, - None => { - if !self.autogenerate_docids { - return Err(UserError::MissingDocumentId { document }.into()); - } - let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); - Cow::Borrowed(uuid) - } - }; + None => { + if !self.autogenerate_docids { + let mut json = Map::new(); + for (key, value) in document.iter() { + let key = addition_index.get_by_left(&key).cloned(); + let value = serde_json::from_slice::(&value).ok(); - // We iterate in the fields ids ordered. 
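A worked example of the external-id normalisation in the `Some((_, bytes))` branch above: the primary key arrives as raw JSON bytes, strings are taken as-is (after validation), numbers are stringified, and anything else is an `InvalidDocumentId` error. The helper below is a hedged re-statement of that match, not the code itself:

```rust
use serde_json::Value;

// Mirrors the three accepted shapes of the primary key value.
fn external_id_from(bytes: &[u8]) -> Option<String> {
    match serde_json::from_slice(bytes).ok()? {
        Value::String(s) => Some(s),             // "kevin_1" stays "kevin_1"
        Value::Number(n) => Some(n.to_string()), // 32 becomes "32"
        _ => None,                               // arrays/objects are user errors
    }
}

fn main() {
    assert_eq!(external_id_from(br#""kevin_1""#).as_deref(), Some("kevin_1"));
    assert_eq!(external_id_from(b"32").as_deref(), Some("32"));
    assert_eq!(external_id_from(b"[1, 2]"), None);
}
```

The re-serialisation into `external_id_buffer` right after the match keeps the stored primary key field and the sorter key consistent with each other.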
- for (field_id, name) in fields_ids_map.iter() { - json_buffer.clear(); + if let Some((k, v)) = key.zip(value) { + json.insert(k, v); + } + } - // We try to extract the value from the document and if we don't find anything - // and this should be the document id we return the one we generated. - if let Some(value) = document.get(name) { - // We serialize the attribute values. - serde_json::to_writer(&mut json_buffer, value) - .map_err(InternalError::SerdeJson)?; - writer.insert(field_id, &json_buffer)?; - } - // We validate the document id [a-zA-Z0-9\-_]. - if field_id == primary_key_id && validate_document_id(&external_id).is_none() { - return Err(UserError::InvalidDocumentId { - document_id: Value::from(external_id), + return Err(UserError::MissingDocumentId { document: json }.into()); + } + + let uuid = + uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); + serde_json::to_writer(&mut external_id_buffer, &uuid).unwrap(); + field_buffer_cache.push((primary_key_id, &external_id_buffer)); + Cow::Borrowed(&*uuid) } - .into()); - } + }; + + // Insertion in a obkv need to be done with keys ordered. For now they are ordered + // according to the document addition key order, so we sort it according to the + // fieldids map keys order. + field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2)); + + // The last step is to build the new obkv document, and insert it in the sorter. + let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + for (k, v) in field_buffer_cache.iter() { + writer.insert(*k, v)?; } // We use the extracted/generated user id as the key for this document. - sorter.insert(external_id.as_bytes(), &obkv_buffer)?; + sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?; documents_count += 1; - } - progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { - documents_seen: documents_count, - }); - - // Now that we have a valid sorter that contains the user id and the obkv we - // give it to the last transforming function which returns the TransformOutput. - self.output_from_sorter( - sorter, - primary_key, - fields_ids_map, - documents_count, - external_documents_ids, - progress_callback, - ) - } - - pub fn output_from_csv(self, reader: R, progress_callback: F) -> Result - where - R: Read, - F: Fn(UpdateIndexingStep) + Sync, - { - let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); - - let mut csv = csv::Reader::from_reader(reader); - let headers = csv.headers().map_err(UserError::Csv)?; - - let mut fields_ids = Vec::new(); - // Generate the new fields ids based on the current fields ids and this CSV headers. - for (i, header) in headers.iter().enumerate() { - let id = fields_ids_map.insert(header).ok_or(UserError::AttributeLimitReached)?; - fields_ids.push((id, i)); - } - - // Extract the position of the primary key in the current headers, None if not found. - let primary_key_pos = match self.index.primary_key(self.rtxn)? { - Some(primary_key) => { - // The primary key is known so we must find the position in the CSV headers. - headers.iter().position(|h| h == primary_key) - } - None => headers.iter().position(is_primary_key), - }; - - // Returns the field id in the fields ids map, create an "id" field - // in case it is not in the current headers. 
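The `sort_unstable_by` above exists because of an obkv invariant: `KvWriter` expects keys in ascending order. A minimal sketch of the write path, using the same calls as the loop above (the values are already-serialised JSON, as in the obkv assertions in the tests earlier):

```rust
// Keys must arrive ordered; field_buffer_cache is sorted on the field id first.
let mut obkv_buffer: Vec<u8> = Vec::new();
let mut writer = obkv::KvWriter::<_, u16>::new(&mut obkv_buffer);
writer.insert(0u16, br#""1""#).unwrap();      // field 0: the primary key
writer.insert(1u16, br#""benoit""#).unwrap(); // field 1: a name attribute
drop(writer);
// obkv_buffer now holds the document as (field id, bytes) pairs.
```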
- let alternative_name = primary_key_pos.map(|pos| headers[pos].to_string()); - let (primary_key_id, primary_key_name) = compute_primary_key_pair( - self.index.primary_key(self.rtxn)?, - &mut fields_ids_map, - alternative_name, - self.autogenerate_docids, - )?; - - // The primary key field is not present in the header, so we need to create it. - if primary_key_pos.is_none() { - fields_ids.push((primary_key_id, usize::max_value())); - } - - // We sort the fields ids by the fields ids map id, this way we are sure to iterate over - // the records fields in the fields ids map order and correctly generate the obkv. - fields_ids.sort_unstable_by_key(|(field_id, _)| *field_id); - - // We initialize the sorter with the user indexing settings. - let mut sorter = create_sorter( - keep_latest_obkv, - self.chunk_compression_type, - self.chunk_compression_level, - self.max_nb_chunks, - self.max_memory, - ); - - // We write into the sorter to merge and deduplicate the documents - // based on the external ids. - let mut json_buffer = Vec::new(); - let mut obkv_buffer = Vec::new(); - let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; - let mut documents_count = 0; - - let mut record = csv::StringRecord::new(); - while csv.read_record(&mut record).map_err(UserError::Csv)? { - obkv_buffer.clear(); - let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); - - if self.log_every_n.map_or(false, |len| documents_count % len == 0) { - progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { - documents_seen: documents_count, - }); - } - - // We extract the user id if we know where it is or generate an UUID V4 otherwise. - let external_id = match primary_key_pos { - Some(pos) => { - let external_id = &record[pos]; - // We validate the document id [a-zA-Z0-9\-_]. - match validate_document_id(&external_id) { - Some(valid) => valid, - None => { - return Err(UserError::InvalidDocumentId { - document_id: Value::from(external_id), - } - .into()) - } - } - } - None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer), - }; - - // When the primary_key_field_id is found in the fields ids list - // we return the generated document id instead of the record field. - let iter = fields_ids.iter().map(|(fi, i)| { - let field = if *fi == primary_key_id { external_id } else { &record[*i] }; - (fi, field) + progress_callback(UpdateIndexingStep::RemapDocumentAddition { + documents_seen: documents_count, }); - // We retrieve the field id based on the fields ids map fields ids order. - for (field_id, field) in iter { - // We serialize the attribute values as JSON strings. - json_buffer.clear(); - serde_json::to_writer(&mut json_buffer, &field) - .map_err(InternalError::SerdeJson)?; - writer.insert(*field_id, &json_buffer)?; - } - - // We use the extracted/generated user id as the key for this document. 
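The CSV path being deleted above did this field-id bookkeeping by hand against the headers; the new `create_fields_mapping` generalises it to any batch. A self-contained toy version of what it computes, with plain `Vec`/`HashMap` as illustrative stand-ins for `FieldsIdsMap` and `DocumentsBatchIndex`:

```rust
use std::collections::HashMap;

fn main() {
    // The index already knows two fields; the incoming batch numbered its
    // fields differently and introduces a new one ("age").
    let mut index_fields: Vec<String> = vec!["id".into(), "name".into()];
    let batch_fields = vec![(0u16, "name"), (1u16, "age"), (2u16, "id")];

    let mut mapping: HashMap<u16, u16> = HashMap::new();
    for (batch_id, name) in batch_fields {
        let existing = index_fields.iter().position(|f| f.as_str() == name);
        let index_id = match existing {
            Some(pos) => pos as u16,
            None => {
                // New field: extend the index-side map, like FieldsIdsMap::insert.
                index_fields.push(name.to_string());
                (index_fields.len() - 1) as u16
            }
        };
        mapping.insert(batch_id, index_id);
    }

    assert_eq!(mapping[&0], 1); // batch "name" -> index field 1
    assert_eq!(mapping[&1], 2); // batch "age" is new -> index field 2
    assert_eq!(mapping[&2], 0); // batch "id" -> index field 0
}
```

Iterating in batch-id order, as the real function does with `sorted_by_key`, guarantees that newly inserted fields get index ids in a deterministic order.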
- sorter.insert(external_id, &obkv_buffer)?; - documents_count += 1; + obkv_buffer.clear(); + field_buffer = drop_and_reuse(field_buffer_cache); + external_id_buffer.clear(); } - progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat { + progress_callback(UpdateIndexingStep::RemapDocumentAddition { documents_seen: documents_count, }); @@ -359,7 +232,6 @@ impl Transform<'_, '_> { primary_key_name, fields_ids_map, documents_count, - external_documents_ids, progress_callback, ) } @@ -373,12 +245,12 @@ impl Transform<'_, '_> { primary_key: String, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, - mut external_documents_ids: ExternalDocumentsIds<'_>, progress_callback: F, ) -> Result where F: Fn(UpdateIndexingStep) + Sync, { + let mut external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let documents_ids = self.index.documents_ids(self.rtxn)?; let mut field_distribution = self.index.field_distribution(self.rtxn)?; let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); @@ -610,6 +482,17 @@ fn validate_document_id(document_id: &str) -> Option<&str> { }) } +/// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec`. +/// +/// The size and alignment of T and U must match. +fn drop_and_reuse(mut vec: Vec) -> Vec { + debug_assert_eq!(std::mem::align_of::(), std::mem::align_of::()); + debug_assert_eq!(std::mem::size_of::(), std::mem::size_of::()); + vec.clear(); + debug_assert!(vec.is_empty()); + vec.into_iter().map(|_| unreachable!()).collect() +} + #[cfg(test)] mod test { use super::*; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 36ed7d8fa..d80437ec7 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,9 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::DeleteDocuments; pub use self::facets::Facets; -pub use self::index_documents::{ - DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat, -}; +pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod}; pub use self::settings::{Setting, Settings}; pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index f1b3e2628..4aa79f6e3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -111,6 +111,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn log_every_n(&mut self, n: usize) { + self.log_every_n = Some(n); + } + pub fn reset_searchable_fields(&mut self) { self.searchable_fields = Setting::Reset; } @@ -501,7 +505,7 @@ mod tests { use super::*; use crate::error::Error; - use crate::update::{IndexDocuments, UpdateFormat}; + use crate::update::IndexDocuments; use crate::{Criterion, FilterCondition, SearchResult}; #[test] @@ -513,9 +517,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. 
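The `drop_and_reuse` helper added above is the subtle piece of this commit: `field_buffer` holds `&[u8]` slices borrowed from the current document, so its lifetime parameter must change on every loop iteration even though the allocation can live on. Written out with its generic parameters, the helper reads:

```rust
/// Drops all the values of type `U` in the vec, and reuses the allocation
/// to create a `Vec<T>`. The size and alignment of `T` and `U` must match.
fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
    debug_assert_eq!(std::mem::align_of::<U>(), std::mem::align_of::<T>());
    debug_assert_eq!(std::mem::size_of::<U>(), std::mem::size_of::<T>());
    vec.clear();
    debug_assert!(vec.is_empty());
    // The empty into_iter/collect round-trip relies on Vec's in-place
    // collection specialisation, so the allocation is kept instead of freed.
    vec.into_iter().map(|_| unreachable!()).collect()
}
```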
let mut wtxn = index.write_txn().unwrap(); - let content = &b"id,name,age\n0,kevin,23\n1,kevina,21\n2,benoit,34\n"[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.update_format(UpdateFormat::Csv); + + let content = documents!([ + { "id": 1, "name": "kevin", "age": 23 }, + { "id": 2, "name": "kevina", "age": 21}, + { "id": 3, "name": "benoit", "age": 34 } + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -567,10 +575,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -611,10 +622,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -633,10 +647,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); // In the same transaction we change the displayed fields to be only the age. @@ -678,13 +695,12 @@ mod tests { builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &br#"[ - { "name": "kevin", "age": 23 }, + let content = documents!([ + { "name": "kevin", "age": 23}, { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -695,11 +711,19 @@ mod tests { assert_eq!(fields_ids, hashset! { S("age") }); // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. 
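That TODO predates this very change: a `documents!` batch carries real JSON types, so the ages here are numbers rather than CSV strings, which is what lets them land in the f64 facet database counted just below. The distinction in miniature:

```rust
// A JSON number can be faceted as an f64; the same digits as a JSON string
// would be treated as a string facet value instead.
let typed = serde_json::json!({ "age": 21 });
let untyped = serde_json::json!({ "age": "21" });
assert!(typed["age"].is_number());
assert!(untyped["age"].is_string());
```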
+ let fidmap = index.fields_ids_map(&rtxn).unwrap(); + println!("fidmap: {:?}", fidmap); + for document in index.all_documents(&rtxn).unwrap() { + let document = document.unwrap(); + let json = crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, document.1) + .unwrap(); + println!("json: {:?}", json); + } let count = index .facet_id_f64_docids .remap_key_type::() - // The faceted field id is 2u16 - .prefix_iter(&rtxn, &[0, 2, 0]) + // The faceted field id is 1u16 + .prefix_iter(&rtxn, &[0, 1, 0]) .unwrap() .count(); assert_eq!(count, 3); @@ -707,25 +731,23 @@ mod tests { // Index a little more documents with new and current facets values. let mut wtxn = index.write_txn().unwrap(); - let content = &br#"[ - { "name": "kevin2", "age": 23 }, + let content = documents!([ + { "name": "kevin2", "age": 23}, { "name": "kevina2", "age": 21 }, - { "name": "benoit", "age": 35 } - ]"#[..]; + { "name": "benoit", "age": 35 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); // Only count the field_id 0 and level 0 facet values. - // TODO we must support typed CSVs for numbers to be understood. let count = index .facet_id_f64_docids .remap_key_type::() - .prefix_iter(&rtxn, &[0, 2, 0]) + .prefix_iter(&rtxn, &[0, 1, 0]) .unwrap() .count(); assert_eq!(count, 4); @@ -747,13 +769,12 @@ mod tests { builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &br#"[ - { "name": "kevin", "age": 23 }, + let content = documents!([ + { "name": "kevin", "age": 23}, { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -790,7 +811,7 @@ mod tests { builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &br#"[ + let content = documents!([ { "name": "kevin", "age": 23 }, { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 }, @@ -798,9 +819,8 @@ mod tests { { "name": "bertrand", "age": 34 }, { "name": "bernie", "age": 34 }, { "name": "ben", "age": 34 } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -822,10 +842,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -844,10 +867,13 @@ mod tests { // First we send 3 documents with ids from 1 to 3. 
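The `&[0, 1, 0]` prefixes in the two facet counts above decode as field id then level: assuming the `facet_id_f64_docids` key begins with the field id as a big-endian `u16` followed by a level byte (an assumption consistent with the "faceted field id is 1u16" comment), field 1 at level 0 serialises to exactly those three bytes:

```rust
// Assumed key layout: [field_id: u16 BE][level: u8][..], hence [0, 1, 0].
let field_id: u16 = 1;
let level: u8 = 0;
let mut prefix = field_id.to_be_bytes().to_vec();
prefix.push(level);
assert_eq!(prefix, vec![0, 1, 0]);
```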
let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23, "maxim": "I love dogs" }, + { "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, + { "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); // In the same transaction we provide some stop_words @@ -915,10 +941,13 @@ mod tests { // Send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..]; + let content = documents!([ + { "name": "kevin", "age": 23, "maxim": "I love dogs"}, + { "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, + { "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); // In the same transaction provide some synonyms @@ -1038,7 +1067,7 @@ mod tests { assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); // Then index some documents with the "mykey" primary key. - let content = &br#"[ + let content = documents!([ { "mykey": 1, "name": "kevin", "age": 23 }, { "mykey": 2, "name": "kevina", "age": 21 }, { "mykey": 3, "name": "benoit", "age": 34 }, @@ -1046,9 +1075,8 @@ mod tests { { "mykey": 5, "name": "bertrand", "age": 34 }, { "mykey": 6, "name": "bernie", "age": 34 }, { "mykey": 7, "name": "ben", "age": 34 } - ]"#[..]; + ]); let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); builder.disable_autogenerate_docids(); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -1087,7 +1115,7 @@ mod tests { builder.set_filterable_fields(hashset! { S("genres") }); builder.execute(|_, _| ()).unwrap(); - let content = &br#"[ + let content = documents!([ { "id": 11, "title": "Star Wars", @@ -1105,9 +1133,8 @@ mod tests { "poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg", "release_date": 819676800 } - ]"#[..]; - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.update_format(UpdateFormat::Json); + ]); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/update_step.rs b/milli/src/update/update_step.rs index 68a32bb67..fd5739caf 100644 --- a/milli/src/update/update_step.rs +++ b/milli/src/update/update_step.rs @@ -2,10 +2,9 @@ use UpdateIndexingStep::*; #[derive(Debug, Clone, Copy)] pub enum UpdateIndexingStep { - /// Transform from the original user given format (CSV, JSON, JSON lines) - /// into a generic format based on the obkv and grenad crates. This step also - /// deduplicate potential documents in this batch update by merging or replacing them. - TransformFromUserIntoGenericFormat { documents_seen: usize }, + /// Remap document addition fields the one present in the database, adding new fields in to the + /// schema on the go. 
+ RemapDocumentAddition { documents_seen: usize }, /// This step check the external document id, computes the internal ids and merge /// the documents that are already present in the database. @@ -23,7 +22,7 @@ pub enum UpdateIndexingStep { impl UpdateIndexingStep { pub const fn step(&self) -> usize { match self { - TransformFromUserIntoGenericFormat { .. } => 0, + RemapDocumentAddition { .. } => 0, ComputeIdsAndMergeDocuments { .. } => 1, IndexDocuments { .. } => 2, MergeDataIntoFinalDatabase { .. } => 3, diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index e3f6c5b09..cda0da617 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -1,11 +1,13 @@ use std::cmp::Reverse; use std::collections::HashSet; +use std::io::Cursor; use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::update::{Settings, UpdateBuilder, UpdateFormat}; +use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::update::{Settings, UpdateBuilder}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -55,9 +57,20 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = UpdateBuilder::new(0); builder.max_memory(10 * 1024 * 1024); // 10MiB let mut builder = builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::JsonStream); builder.enable_autogenerate_docids(); - builder.execute(CONTENT.as_bytes(), |_, _| ()).unwrap(); + let mut cursor = Cursor::new(Vec::new()); + let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let reader = Cursor::new(CONTENT.as_bytes()); + for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + documents_builder.add_documents(doc.unwrap()).unwrap(); + } + documents_builder.finish().unwrap(); + + cursor.set_position(0); + + // index documents + let content = DocumentBatchReader::from_reader(cursor).unwrap(); + builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f6a937f67..f3b04c4fa 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,10 +1,12 @@ use std::cmp::Reverse; +use std::io::Cursor; use big_s::S; use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; -use milli::update::{Settings, UpdateBuilder, UpdateFormat}; +use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::update::{Settings, UpdateBuilder}; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use rand::Rng; use Criterion::*; @@ -386,31 +388,37 @@ fn criteria_ascdesc() { let mut builder = UpdateBuilder::new(0); builder.max_memory(10 * 1024 * 1024); // 10MiB let mut builder = builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); builder.enable_autogenerate_docids(); - let content = [ - vec![S("name,age")], - (0..ASC_DESC_CANDIDATES_THRESHOLD + 1) - .map(|_| { - let mut rng = rand::thread_rng(); + let mut cursor = Cursor::new(Vec::new()); + let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let age = rng.gen::().to_string(); - let name = rng - .sample_iter(&rand::distributions::Alphanumeric) - .map(char::from) - .filter(|c| *c >= 'a' && *c <= 'z') - .take(10) - .collect::(); + (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { + let mut rng = rand::thread_rng(); - 
format!("{},{}", name, age) - }) - .collect::>(), - ] - .iter() - .flatten() - .join("\n"); - builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + let age = rng.gen::().to_string(); + let name = rng + .sample_iter(&rand::distributions::Alphanumeric) + .map(char::from) + .filter(|c| *c >= 'a' && *c <= 'z') + .take(10) + .collect::(); + + let json = serde_json::json!({ + "name": name, + "age": age, + }); + + batch_builder.add_documents(json).unwrap(); + }); + + batch_builder.finish().unwrap(); + + cursor.set_position(0); + + let reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/search/src/main.rs b/search/src/main.rs deleted file mode 100644 index fba714dab..000000000 --- a/search/src/main.rs +++ /dev/null @@ -1,98 +0,0 @@ -use std::io::{self, BufRead, Write}; -use std::iter::once; -use std::path::PathBuf; -use std::time::Instant; - -use byte_unit::Byte; -use heed::EnvOpenOptions; -use log::debug; -use milli::{obkv_to_json, Index}; -use structopt::StructOpt; - -#[cfg(target_os = "linux")] -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - -#[derive(Debug, StructOpt)] -/// A simple search helper binary for the milli project. -pub struct Opt { - /// The database path where the database is located. - /// It is created if it doesn't already exist. - #[structopt(long = "db", parse(from_os_str))] - database: PathBuf, - - /// The maximum size the database can take on disk. It is recommended to specify - /// the whole disk space (value must be a multiple of a page size). - #[structopt(long = "db-size", default_value = "100 GiB")] - database_size: Byte, - - /// Verbose mode (-v, -vv, -vvv, etc.) - #[structopt(short, long, parse(from_occurrences))] - verbose: usize, - - /// The query string to search for (doesn't support prefix search yet). - query: Option, - - /// Compute and print the facet distribution of all the faceted fields. - #[structopt(long)] - print_facet_distribution: bool, -} - -fn main() -> anyhow::Result<()> { - let opt = Opt::from_args(); - - stderrlog::new() - .verbosity(opt.verbose) - .show_level(false) - .timestamp(stderrlog::Timestamp::Off) - .init()?; - - // Return an error if the database does not exist. - if !opt.database.exists() { - anyhow::bail!("The database ({}) does not exist.", opt.database.display()); - } - - let mut options = EnvOpenOptions::new(); - options.map_size(opt.database_size.get_bytes() as usize); - - // Open the LMDB database. - let index = Index::new(options, &opt.database)?; - let rtxn = index.read_txn()?; - let fields_ids_map = index.fields_ids_map(&rtxn)?; - let displayed_fields = match index.displayed_fields_ids(&rtxn)? 
{ - Some(fields) => fields, - None => fields_ids_map.iter().map(|(id, _)| id).collect(), - }; - - let stdin = io::stdin(); - let lines = match opt.query { - Some(query) => Box::new(once(Ok(query))), - None => Box::new(stdin.lock().lines()) as Box>, - }; - - let mut stdout = io::stdout(); - for result in lines { - let before = Instant::now(); - - let query = result?; - let result = index.search(&rtxn).query(query).execute()?; - let documents = index.documents(&rtxn, result.documents_ids.iter().cloned())?; - - for (_id, record) in documents { - let val = obkv_to_json(&displayed_fields, &fields_ids_map, record)?; - serde_json::to_writer(&mut stdout, &val)?; - let _ = writeln!(&mut stdout); - } - - if opt.print_facet_distribution { - let facets = - index.facets_distribution(&rtxn).candidates(result.candidates).execute()?; - serde_json::to_writer(&mut stdout, &facets)?; - let _ = writeln!(&mut stdout); - } - - debug!("Took {:.02?} to find {} documents", before.elapsed(), result.documents_ids.len()); - } - - Ok(()) -} From f8ecbc28e28d9e38afc74ad1626071e663bc6fe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 21 Sep 2021 18:09:14 +0200 Subject: [PATCH 1014/1889] Update version for the next release (v0.15.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 98e20da78..ef4484090 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.14.0" +version = "0.15.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 1f897e820..1b56d3ff1 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.14.0" +version = "0.15.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index ad5460778..c9631c157 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.14.0" +version = "0.15.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 37a524197..68f55a6c8 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.14.0" +version = "0.15.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index ebb2607c0..a1bcdcbcf 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.14.0" +version = "0.15.0" authors = ["Clément Renault "] edition = "2018" From 2e99fa8251194a62e9280c3a2f00673b59af741c Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 22 Sep 2021 11:30:33 +0200 Subject: [PATCH 1015/1889] remove the cargo.lock again --- Cargo.lock | 3410 ---------------------------------------------------- 1 file changed, 3410 deletions(-) delete mode 100644 Cargo.lock diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index 601f711ff..000000000 --- a/Cargo.lock +++ /dev/null @@ -1,3410 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 3 - -[[package]] -name = "addr2line" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e61f2b7f93d2c7d2b08263acaa4a363b3e276806c68af6134c44f523bf1aacd" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "ahash" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" - -[[package]] -name = "aho-corasick" -version = "0.7.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" -dependencies = [ - "memchr", -] - -[[package]] -name = "ansi_term" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "anyhow" -version = "1.0.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1" - -[[package]] -name = "arrayvec" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" - -[[package]] -name = "as-slice" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45403b49e3954a4b8428a0ac21a4b7afadccf92bfd96273f1a58cd4812496ae0" -dependencies = [ - "generic-array 0.12.4", - "generic-array 0.13.3", - "generic-array 0.14.4", - "stable_deref_trait", -] - -[[package]] -name = "askama" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d298738b6e47e1034e560e5afe63aa488fea34e25ec11b855a76f0d7b8e73134" -dependencies = [ - "askama_derive", - "askama_escape", - "askama_shared", - "mime", - "mime_guess", -] - -[[package]] -name = "askama_derive" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca2925c4c290382f9d2fa3d1c1b6a63fa1427099721ecca4749b154cc9c25522" -dependencies = [ - "askama_shared", - "proc-macro2 1.0.29", - "syn 1.0.76", -] - -[[package]] -name = "askama_escape" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90c108c1a94380c89d2215d0ac54ce09796823cca0fd91b299cfff3b33e346fb" - -[[package]] -name = "askama_shared" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2582b77e0f3c506ec4838a25fa8a5f97b9bed72bb6d3d272ea1c031d8bd373bc" -dependencies = [ - "askama_escape", - "humansize", - "nom", - "num-traits", - "percent-encoding", - "proc-macro2 1.0.29", - "quote 1.0.9", - "serde", - "syn 1.0.76", - "toml", -] - -[[package]] -name = "askama_warp" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c96f410ab17fa08f70b5fda07ce1112418642c914864961630808979343ea226" -dependencies = [ - "askama", - "warp", -] - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "autocfg" -version = 
"1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" - -[[package]] -name = "backtrace" -version = "0.3.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7a905d892734eea339e896738c14b9afce22b5318f64b951e70bf3844419b01" -dependencies = [ - "addr2line", - "cc", - "cfg-if 1.0.0", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "base64" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3441f0f7b02788e948e47f457ca01f1d7e6d92c693bc132c22b087d3141c03ff" - -[[package]] -name = "base64" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" - -[[package]] -name = "benchmarks" -version = "0.1.0" -dependencies = [ - "anyhow", - "bytes 1.1.0", - "convert_case", - "criterion", - "flate2", - "heed", - "jemallocator", - "milli", - "reqwest", -] - -[[package]] -name = "big_s" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" - -[[package]] -name = "bimap" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50ae17cabbc8a38a1e3e4c1a6a664e9a09672dc14d0896fa8d865d3a5a446b07" -dependencies = [ - "serde", -] - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitvec" -version = "0.19.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] - -[[package]] -name = "block-buffer" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" -dependencies = [ - "block-padding", - "byte-tools", - "byteorder", - "generic-array 0.12.4", -] - -[[package]] -name = "block-buffer" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" -dependencies = [ - "generic-array 0.14.4", -] - -[[package]] -name = "block-padding" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" -dependencies = [ - "byte-tools", -] - -[[package]] -name = "bstr" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata", - "serde", -] - -[[package]] -name = "buf_redux" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b953a6887648bb07a535631f2bc00fbdb2a2216f135552cb3f534ed136b9c07f" -dependencies = [ - "memchr", - "safemem", -] - -[[package]] -name = "bumpalo" -version = "3.7.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9df67f7bf9ef8498769f994239c45613ef0c5899415fb58e9add412d2c1a538" - -[[package]] -name = "byte-tools" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" - -[[package]] -name = "byte-unit" -version = "4.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "063197e6eb4b775b64160dedde7a0986bb2836cce140e9492e9e96f28e18bcd8" -dependencies = [ - "serde", - "utf8-width", -] - -[[package]] -name = "bytemuck" -version = "1.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72957246c41db82b8ef88a5486143830adeb8227ef9837740bdec67724cf2c5b" -dependencies = [ - "bytemuck_derive", -] - -[[package]] -name = "bytemuck_derive" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e215f8c2f9f79cb53c8335e687ffd07d5bfcb6fe5fc80723762d0be46e7cc54" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "byteorder" -version = "1.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" - -[[package]] -name = "bytes" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e4cec68f03f32e44924783795810fa50a7035d8c8ebe78580ad7e6c703fba38" - -[[package]] -name = "bytes" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" - -[[package]] -name = "cast" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" -dependencies = [ - "rustc_version", -] - -[[package]] -name = "cc" -version = "1.0.70" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0" - -[[package]] -name = "cedarwood" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d" -dependencies = [ - "smallvec", -] - -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "character_converter" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" -dependencies = [ - "bincode", -] - -[[package]] -name = "chrono" -version = "0.4.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" -dependencies = [ - "libc", - "num-integer", - "num-traits", - "serde", - "time", - "winapi 0.3.9", -] - -[[package]] -name = "clap" -version = "2.33.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" -dependencies = [ - "ansi_term", - "atty", - "bitflags", - 
"strsim", - "term_size", - "textwrap", - "unicode-width", - "vec_map", -] - -[[package]] -name = "cli" -version = "0.1.0" -dependencies = [ - "bimap", - "byte-unit", - "color-eyre", - "csv", - "eyre", - "heed", - "indicatif", - "jemallocator", - "milli", - "serde", - "serde_json", - "stderrlog", - "structopt", -] - -[[package]] -name = "color-eyre" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f1885697ee8a177096d42f158922251a41973117f6d8a234cee94b9509157b7" -dependencies = [ - "backtrace", - "color-spantrace", - "eyre", - "indenter", - "once_cell", - "owo-colors", - "tracing-error", -] - -[[package]] -name = "color-spantrace" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6eee477a4a8a72f4addd4de416eb56d54bc307b284d6601bafdee1f4ea462d1" -dependencies = [ - "once_cell", - "owo-colors", - "tracing-core", - "tracing-error", -] - -[[package]] -name = "concat-arrays" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1df715824eb382e34b7afb7463b0247bf41538aeba731fba05241ecdb5dc3747" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "console" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3993e6445baa160675931ec041a5e03ca84b9c6e32a056150d3aa2bdda0a1f45" -dependencies = [ - "encode_unicode", - "lazy_static", - "libc", - "terminal_size", - "winapi 0.3.9", -] - -[[package]] -name = "convert_case" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" - -[[package]] -name = "cow-utils" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" - -[[package]] -name = "cpufeatures" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32fast" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81156fece84ab6a9f2afdb109ce3ae577e42b1228441eded99bd77f627953b1a" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "criterion" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" -dependencies = [ - "atty", - "cast", - "clap", - "criterion-plot", - "csv", - "itertools", - "lazy_static", - "num-traits", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_cbor", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - -[[package]] -name = "criterion-plot" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" -dependencies = [ - "cast", - "itertools", -] - -[[package]] -name = "crossbeam-channel" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.5", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-epoch", - "crossbeam-utils 0.8.5", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" -dependencies = [ - "cfg-if 1.0.0", - "crossbeam-utils 0.8.5", - "lazy_static", - "memoffset", - "scopeguard", -] - -[[package]] -name = "crossbeam-queue" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" -dependencies = [ - "crossbeam-utils 0.6.6", -] - -[[package]] -name = "crossbeam-utils" -version = "0.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6" -dependencies = [ - "cfg-if 0.1.10", - "lazy_static", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" -dependencies = [ - "cfg-if 1.0.0", - "lazy_static", -] - -[[package]] -name = "csv" -version = "1.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" -dependencies = [ - "bstr", - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" -dependencies = [ - "memchr", -] - -[[package]] -name = "deunicode" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2c9736e15e7df1638a7f6eee92a6511615c738246a052af5ba86f039b65aede" - -[[package]] -name = "digest" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" -dependencies = [ - "generic-array 0.12.4", -] - -[[package]] -name = "digest" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" -dependencies = [ - "generic-array 0.14.4", -] - -[[package]] -name = "dtoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56899898ce76aaf4a0f24d914c97ea6ed976d42fec6ad33fcbb0a1103e07b2b0" - -[[package]] -name = "either" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" - -[[package]] -name = "encode_unicode" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" - -[[package]] -name = "encoding_rs" -version = "0.8.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "eyre" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "221239d1d5ea86bf5d6f91c9d6bc3646ffe471b08ff9b0f91c44f115ac969d2b" -dependencies = [ - "indenter", - "once_cell", -] - -[[package]] -name = "fake-simd" 
-version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" - -[[package]] -name = "flate2" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e6988e897c1c9c485f43b47a529cef42fde0547f9d8d41a7062518f1d8fc53f" -dependencies = [ - "cfg-if 1.0.0", - "crc32fast", - "libc", - "miniz_oxide", -] - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "form_urlencoded" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fc25a87fa4fd2094bffb06925852034d90a17f0d1e05197d4956d3555752191" -dependencies = [ - "matches", - "percent-encoding", -] - -[[package]] -name = "fs_extra" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2022715d62ab30faffd124d40b76f4134a550a87792276512b18d63272333394" - -[[package]] -name = "fst" -version = "0.4.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" - -[[package]] -name = "fuchsia-zircon" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" -dependencies = [ - "bitflags", - "fuchsia-zircon-sys", -] - -[[package]] -name = "fuchsia-zircon-sys" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" - -[[package]] -name = "funty" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" - -[[package]] -name = "futures" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12aa0eb539080d55c3f2d45a67c3b58b6b0773c1a3ca2dfec66d58c97fd66ca" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5da6ba8c3bb3c165d3c7319fc1cc8304facf1fb8db99c5de877183c08a273888" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d1c26957f23603395cd326b0ffe64124b818f4449552f960d815cfba83a53d" - -[[package]] -name = "futures-executor" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45025be030969d763025784f7f355043dc6bc74093e4ecc5000ca4dc50d8745c" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "522de2a0fe3e380f1bc577ba0474108faf3f6b18321dbf60b3b9c39a75073377" - -[[package]] -name = "futures-macro" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18e4a4b95cea4b4ccbcf1c5675ca7c4ee4e9e75eb79944d07defde18068f79bb" -dependencies = [ - "autocfg", - "proc-macro-hack", - "proc-macro2 1.0.29", - "quote 1.0.9", - 
"syn 1.0.76", -] - -[[package]] -name = "futures-sink" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36ea153c13024fe480590b3e3d4cad89a0cfacecc24577b68f86c6ced9c2bc11" - -[[package]] -name = "futures-task" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d3d00f4eddb73e498a54394f228cd55853bdf059259e8e7bc6e69d408892e99" - -[[package]] -name = "futures-util" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36568465210a3a6ee45e1f165136d68671471a501e632e9a98d96872222b5481" -dependencies = [ - "autocfg", - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite 0.2.7", - "pin-utils", - "proc-macro-hack", - "proc-macro-nested", - "slab", -] - -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] -name = "generic-array" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffdf9f34f1447443d37393cc6c2b8313aebddcd96906caf34e54c68d8e57d7bd" -dependencies = [ - "typenum", -] - -[[package]] -name = "generic-array" -version = "0.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f797e67af32588215eaaab8327027ee8e71b9dd0b2b26996aedf20c030fce309" -dependencies = [ - "typenum", -] - -[[package]] -name = "generic-array" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "501466ecc8a30d1d3b7fc9229b122b2ce8ed6e9d9223f1138d4babb253e51817" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "geoutils" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e006f616a407d396ace1d2ebb3f43ed73189db8b098079bd129928d7645dd1e" - -[[package]] -name = "getrandom" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.9.0+wasi-snapshot-preview1", -] - -[[package]] -name = "getrandom" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", -] - -[[package]] -name = "gimli" -version = "0.25.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0a01e0497841a3b2db4f8afa483cce65f7e96a3498bd6c541734792aeac8fe7" - -[[package]] -name = "grenad" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a7a9cc43b28a20f791b17863f34a36654fdfa50be6d0a67bb18c1e34d145f18" -dependencies = [ - "bytemuck", - "byteorder", - "tempfile", -] - -[[package]] -name = "h2" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e4728fd124914ad25e99e3d15a9361a879f6620f63cb56bbb08f95abb97a535" -dependencies = [ - "bytes 0.5.6", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio 0.2.25", - "tokio-util 0.3.1", - "tracing", - "tracing-futures", -] - -[[package]] -name = "h2" -version = "0.3.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7f3675cfef6a30c8031cf9e6493ebdc3bb3272a3fea3923c4210d1830e6a472" -dependencies = [ - "bytes 1.1.0", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http", - "indexmap", - "slab", - "tokio 1.11.0", - "tokio-util 0.6.8", - "tracing", -] - -[[package]] -name = "half" -version = "1.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" - -[[package]] -name = "hash32" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4041af86e63ac4298ce40e5cca669066e75b6f1aa3390fe2561ffa5e1d9f4cc" -dependencies = [ - "byteorder", -] - -[[package]] -name = "hashbrown" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" -dependencies = [ - "ahash", - "autocfg", -] - -[[package]] -name = "hashbrown" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" - -[[package]] -name = "headers" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0b7591fb62902706ae8e7aaff416b1b0fa2c0fd0878b46dc13baa3712d8a855" -dependencies = [ - "base64 0.13.0", - "bitflags", - "bytes 1.1.0", - "headers-core", - "http", - "mime", - "sha-1 0.9.8", - "time", -] - -[[package]] -name = "headers-core" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7f66481bfee273957b1f20485a4ff3362987f85b2c236580d81b4eb7a326429" -dependencies = [ - "http", -] - -[[package]] -name = "heapless" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634bd4d29cbf24424d0a4bfcbf80c6960129dc24424752a7d1d1390607023422" -dependencies = [ - "as-slice", - "generic-array 0.14.4", - "hash32", - "stable_deref_trait", -] - -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - -[[package]] -name = "heed" -version = "0.12.1" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#fc017cf3394af737f92fd71e16f0499a78b79d65" -dependencies = [ - "byteorder", - "heed-traits", - "heed-types", - "libc", - "lmdb-rkv-sys", - "once_cell", - "page_size", - "serde", - "synchronoise", - "url", - "zerocopy", -] - -[[package]] -name = "heed-traits" -version = "0.7.0" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#fc017cf3394af737f92fd71e16f0499a78b79d65" - -[[package]] -name = "heed-types" -version = "0.7.2" -source = "git+https://github.com/Kerollmops/heed?tag=v0.12.1#fc017cf3394af737f92fd71e16f0499a78b79d65" -dependencies = [ - "bincode", - "heed-traits", - "serde", - "serde_json", - "zerocopy", -] - -[[package]] -name = "helpers" -version = "0.14.0" -dependencies = [ - "anyhow", - "byte-unit", - "heed", - "jemallocator", - "milli", - "stderrlog", - "structopt", -] - -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - -[[package]] -name = "http" -version = "0.2.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "527e8c9ac747e28542699a951517aa9a6945af506cd1f2e1b53a576c17b6cc11" -dependencies = [ - "bytes 1.1.0", - "fnv", - "itoa", -] - -[[package]] -name = "http-body" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13d5ff830006f7646652e057693569bfe0d51760c0085a071769d142a205111b" -dependencies = [ - "bytes 0.5.6", - "http", -] - -[[package]] -name = "http-body" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "399c583b2979440c60be0821a6199eca73bc3c8dcd9d070d75ac726e2c6186e5" -dependencies = [ - "bytes 1.1.0", - "http", - "pin-project-lite 0.2.7", -] - -[[package]] -name = "http-ui" -version = "0.14.0" -dependencies = [ - "anyhow", - "askama", - "askama_warp", - "bimap", - "byte-unit", - "bytes 0.5.6", - "crossbeam-channel", - "csv", - "either", - "flate2", - "fst", - "funty", - "futures", - "heed", - "jemallocator", - "log", - "maplit", - "meilisearch-tokenizer", - "memmap", - "milli", - "once_cell", - "rayon", - "serde", - "serde_json", - "serde_test", - "stderrlog", - "structopt", - "tempfile", - "tokio 0.2.25", - "warp", -] - -[[package]] -name = "httparse" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" - -[[package]] -name = "httpdate" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "494b4d60369511e7dea41cf646832512a94e542f68bb9c49e54518e0f468eb47" - -[[package]] -name = "httpdate" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" - -[[package]] -name = "human_format" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0" - -[[package]] -name = "humansize" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02296996cb8796d7c6e3bc2d9211b7802812d36999a51bb754123ead7d37d026" - -[[package]] -name = "hyper" -version = "0.13.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a6f157065790a3ed2f88679250419b5cdd96e714a0d65f7797fd337186e96bb" -dependencies = [ - "bytes 0.5.6", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.2.7", - "http", - "http-body 0.3.1", - "httparse", - "httpdate 0.3.2", - "itoa", - "pin-project 1.0.8", - "socket2 0.3.19", - "tokio 0.2.25", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper" -version = "0.14.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "15d1cfb9e4f68655fa04c01f59edb405b6074a0f7118ea881e5026e4a1cd8593" -dependencies = [ - "bytes 1.1.0", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.4", - "http", - "http-body 0.4.3", - "httparse", - "httpdate 1.0.1", - "itoa", - "pin-project-lite 0.2.7", - "socket2 0.4.2", - "tokio 1.11.0", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-rustls" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f9f7a97316d44c0af9b0301e65010573a853a9fc97046d7331d7f6bc0fd5a64" -dependencies = [ - "futures-util", - "hyper 0.14.13", - "log", - "rustls", - "tokio 1.11.0", - "tokio-rustls", - "webpki", -] - -[[package]] -name = "idna" -version = "0.2.3" -source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" -dependencies = [ - "matches", - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "indenter" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" - -[[package]] -name = "indexmap" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" -dependencies = [ - "autocfg", - "hashbrown 0.11.2", -] - -[[package]] -name = "indicatif" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" -dependencies = [ - "console", - "lazy_static", - "number_prefix", - "regex", -] - -[[package]] -name = "infos" -version = "0.14.0" -dependencies = [ - "anyhow", - "byte-unit", - "csv", - "heed", - "jemallocator", - "milli", - "roaring", - "serde_json", - "stderrlog", - "structopt", -] - -[[package]] -name = "input_buffer" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19a8a95243d5a0398cae618ec29477c6e3cb631152be5c19481f80bc71559754" -dependencies = [ - "bytes 0.5.6", -] - -[[package]] -name = "iovec" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e" -dependencies = [ - "libc", -] - -[[package]] -name = "ipnet" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f2d64f2edebec4ce84ad108148e67e1064789bee435edc5b60ad398714a3a9" - -[[package]] -name = "itertools" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" - -[[package]] -name = "jemalloc-sys" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45" -dependencies = [ - "cc", - "fs_extra", - "libc", -] - -[[package]] -name = "jemallocator" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69" -dependencies = [ - "jemalloc-sys", - "libc", -] - -[[package]] -name = "jieba-rs" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94287627d13ab7b943787ab20b54b37f8af11179ce85de4734071c88f9eab354" -dependencies = [ - "cedarwood", - "fxhash", - "hashbrown 0.11.2", - "lazy_static", - "phf", - "phf_codegen", - "regex", -] - -[[package]] -name = "js-sys" -version = "0.3.55" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cc9ffccd38c451a86bf13657df244e9c3f37493cce8e5e21e940963777acc84" -dependencies = [ - "wasm-bindgen", -] - -[[package]] -name = "kernel32-sys" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -dependencies = [ - "winapi 0.2.8", - 
"winapi-build", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - -[[package]] -name = "levenshtein_automata" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" -dependencies = [ - "fst", -] - -[[package]] -name = "lexical-core" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" -dependencies = [ - "arrayvec", - "bitflags", - "cfg-if 1.0.0", - "ryu", - "static_assertions", -] - -[[package]] -name = "libc" -version = "0.2.102" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2a5ac8f984bfcf3a823267e5fde638acc3325f6496633a5da6bb6eb2171e103" - -[[package]] -name = "linked-hash-map" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" - -[[package]] -name = "lmdb-rkv-sys" -version = "0.15.0" -source = "git+https://github.com/meilisearch/lmdb-rs#d0b50d02938ee84e4e4372697ea991fe2a4cae3b" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "log" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "logging_timer" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40d0c249955c17c2f8f86b5f501b16d2509ebbe775f7b1d1d2b1ba85ade2a793" -dependencies = [ - "log", - "logging_timer_proc_macros", -] - -[[package]] -name = "logging_timer_proc_macros" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "482c2c28e6bcfe7c4274f82f701774d755e6aa873edfd619460fcd0966e0eb07" -dependencies = [ - "log", - "proc-macro2 0.4.30", - "quote 0.6.13", - "syn 0.15.44", -] - -[[package]] -name = "maplit" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" - -[[package]] -name = "matches" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3e378b66a060d48947b590737b30a1be76706c8dd7b8ba0f2fe3989c68a853f" - -[[package]] -name = "meilisearch-tokenizer" -version = "0.2.5" -source = "git+https://github.com/meilisearch/tokenizer.git?tag=v0.2.5#c0b5cf741ed9485147f2cbe523f2214d4fa4c395" -dependencies = [ - "character_converter", - "cow-utils", - "deunicode", - "fst", - "jieba-rs", - "once_cell", - "slice-group-by", - "unicode-segmentation", - "whatlang", -] - -[[package]] -name = "memchr" -version = "2.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" - -[[package]] -name = "memmap" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "memoffset" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" -dependencies = [ - "autocfg", -] - -[[package]] -name = "milli" -version = "0.14.0" -dependencies = [ - "big_s", - "bimap", - "bincode", - "bstr", - "byteorder", - "chrono", - "concat-arrays", - "crossbeam-channel", - "either", - "flate2", - "fst", - "fxhash", - "geoutils", - "grenad", - "heed", - "human_format", - "itertools", - "levenshtein_automata", - "linked-hash-map", - "log", - "logging_timer", - "maplit", - "meilisearch-tokenizer", - "memmap", - "obkv", - "once_cell", - "ordered-float", - "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", - "pest_derive", - "rand 0.8.4", - "rayon", - "roaring", - "rstar", - "serde", - "serde_json", - "slice-group-by", - "smallstr", - "smallvec", - "tempfile", - "uuid", -] - -[[package]] -name = "mime" -version = "0.3.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" - -[[package]] -name = "mime_guess" -version = "2.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2684d4c2e97d99848d30b324b00c8fcc7e5c897b7cbb5819b09e7c90e8baf212" -dependencies = [ - "mime", - "unicase", -] - -[[package]] -name = "miniz_oxide" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92518e98c078586bc6c934028adcca4c92a53d6a958196de835170a01d84e4b" -dependencies = [ - "adler", - "autocfg", -] - -[[package]] -name = "mio" -version = "0.6.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4afd66f5b91bf2a3bc13fad0e21caedac168ca4c707504e75585648ae80e4cc4" -dependencies = [ - "cfg-if 0.1.10", - "fuchsia-zircon", - "fuchsia-zircon-sys", - "iovec", - "kernel32-sys", - "libc", - "log", - "miow 0.2.2", - "net2", - "slab", - "winapi 0.2.8", -] - -[[package]] -name = "mio" -version = "0.7.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c2bdb6314ec10835cd3293dd268473a835c02b7b352e788be788b3c6ca6bb16" -dependencies = [ - "libc", - "log", - "miow 0.3.7", - "ntapi", - "winapi 0.3.9", -] - -[[package]] -name = "mio-named-pipes" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0840c1c50fd55e521b247f949c241c9997709f23bd7f023b9762cd561e935656" -dependencies = [ - "log", - "mio 0.6.23", - "miow 0.3.7", - "winapi 0.3.9", -] - -[[package]] -name = "mio-uds" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afcb699eb26d4332647cc848492bbc15eafb26f08d0304550d5aa1f612e066f0" -dependencies = [ - "iovec", - "libc", - "mio 0.6.23", -] - -[[package]] -name = "miow" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebd808424166322d4a38da87083bfddd3ac4c131334ed55856112eb06d46944d" -dependencies = [ - "kernel32-sys", - "net2", - "winapi 0.2.8", - "ws2_32-sys", -] - -[[package]] -name = "miow" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f1c5b025cda876f66ef43a113f91ebc9f4ccef34843000e0adf6ebbab84e21" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "multipart" -version = "0.17.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d050aeedc89243f5347c3e237e3e13dc76fbe4ae3742a57b94dc14f69acf76d4" -dependencies = [ - "buf_redux", - "httparse", - "log", - "mime", - "mime_guess", - "quick-error", - "rand 0.7.3", 
- "safemem", - "tempfile", - "twoway", -] - -[[package]] -name = "net2" -version = "0.2.37" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "391630d12b68002ae1e25e8f974306474966550ad82dac6886fb8910c19568ae" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "nom" -version = "6.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7413f999671bd4745a7b624bd370a569fb6bc574b23c83a3c5ed2e453f3d5e2" -dependencies = [ - "bitvec", - "funty", - "lexical-core", - "memchr", - "version_check", -] - -[[package]] -name = "ntapi" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f6bb902e437b6d86e03cce10a7e2af662292c5dfef23b65899ea3ac9354ad44" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "num-integer" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" -dependencies = [ - "autocfg", - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" -dependencies = [ - "hermit-abi", - "libc", -] - -[[package]] -name = "number_prefix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - -[[package]] -name = "object" -version = "0.26.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39f37e50073ccad23b6d09bcb5b263f4e76d3bb6038e4a3c08e52162ffa8abc2" -dependencies = [ - "memchr", -] - -[[package]] -name = "obkv" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385" - -[[package]] -name = "once_cell" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56" - -[[package]] -name = "oorandom" -version = "11.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" - -[[package]] -name = "opaque-debug" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" - -[[package]] -name = "opaque-debug" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" - -[[package]] -name = "ordered-float" -version = "2.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97c9d06878b3a851e8026ef94bf7fef9ba93062cd412601da4d9cf369b1cc62d" -dependencies = [ - "num-traits", -] - -[[package]] -name = "owo-colors" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2386b4ebe91c2f7f51082d4cefa145d030e33a1842a96b12e4885cc3c01f7a55" - -[[package]] -name = "page_size" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "eebde548fbbf1ea81a99b128872779c437752fb99f217c45245e1a61dcd9edcd" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "pdqselect" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec91767ecc0a0bbe558ce8c9da33c068066c57ecc8bb8477ef8c1ad3ef77c27" - -[[package]] -name = "percent-encoding" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" - -[[package]] -name = "pest" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" -dependencies = [ - "ucd-trie", -] - -[[package]] -name = "pest" -version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" -dependencies = [ - "ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" -dependencies = [ - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "pest_generator", -] - -[[package]] -name = "pest_generator" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" -dependencies = [ - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "pest_meta", - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "pest_meta" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" -dependencies = [ - "maplit", - "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", - "sha-1 0.8.2", -] - -[[package]] -name = "phf" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2ac8b67553a7ca9457ce0e526948cad581819238f4a9d1ea74545851fa24f37" -dependencies = [ - "phf_shared", -] - -[[package]] -name = "phf_codegen" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963adb11cf22ee65dfd401cf75577c1aa0eca58c0b97f9337d2da61d3e640503" -dependencies = [ - "phf_generator", - "phf_shared", -] - -[[package]] -name = "phf_generator" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d43f3220d96e0080cc9ea234978ccd80d904eafb17be31bb0f76daaea6493082" -dependencies = [ - "phf_shared", - "rand 0.8.4", -] - -[[package]] -name = "phf_shared" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a68318426de33640f02be62b4ae8eb1261be2efbc337b60c54d845bf4484e0d9" -dependencies = [ - "siphasher", -] - -[[package]] -name = "pin-project" -version = "0.4.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "918192b5c59119d51e0cd221f4d49dde9112824ba717369e903c97d076083d0f" -dependencies = [ - "pin-project-internal 0.4.28", -] - -[[package]] -name = "pin-project" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "576bc800220cc65dac09e99e97b08b358cfab6e17078de8dc5fee223bd2d0c08" -dependencies = [ - "pin-project-internal 1.0.8", -] - -[[package]] -name = "pin-project-internal" -version = "0.4.28" -source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be26700300be6d9d23264c73211d8190e755b6b5ca7a1b28230025511b52a5e" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "pin-project-internal" -version = "1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e8fe8163d14ce7f0cdac2e040116f22eac817edabff0be91e8aff7e9accf389" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "pin-project-lite" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "257b64915a082f7811703966789728173279bdebb956b143dbcd23f6f970a777" - -[[package]] -name = "pin-project-lite" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkg-config" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3831453b3449ceb48b6d9c7ad7c96d5ea673e9b470a1dc578c2ce6521230884c" - -[[package]] -name = "plotters" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" -dependencies = [ - "num-traits", - "plotters-backend", - "plotters-svg", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "plotters-backend" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" - -[[package]] -name = "plotters-svg" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" -dependencies = [ - "plotters-backend", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "version_check", -] - -[[package]] -name = "proc-macro-hack" -version = "0.5.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" - -[[package]] -name = "proc-macro-nested" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc881b2c22681370c6a780e47af9840ef841837bc98118431d4e1868bd0c1086" - -[[package]] -name = "proc-macro2" -version = "0.4.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" -dependencies = [ - "unicode-xid 0.1.0", -] 
- -[[package]] -name = "proc-macro2" -version = "1.0.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" -dependencies = [ - "unicode-xid 0.2.2", -] - -[[package]] -name = "quick-error" -version = "1.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" - -[[package]] -name = "quote" -version = "0.6.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" -dependencies = [ - "proc-macro2 0.4.30", -] - -[[package]] -name = "quote" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" -dependencies = [ - "proc-macro2 1.0.29", -] - -[[package]] -name = "radium" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" - -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom 0.1.16", - "libc", - "rand_chacha 0.2.2", - "rand_core 0.5.1", - "rand_hc 0.2.0", -] - -[[package]] -name = "rand" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" -dependencies = [ - "libc", - "rand_chacha 0.3.1", - "rand_core 0.6.3", - "rand_hc 0.3.1", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core 0.5.1", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core 0.6.3", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom 0.1.16", -] - -[[package]] -name = "rand_core" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" -dependencies = [ - "getrandom 0.2.3", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" -dependencies = [ - "rand_core 0.5.1", -] - -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core 0.6.3", -] - -[[package]] -name = "rayon" -version = "1.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" -dependencies = [ - "autocfg", - "crossbeam-deque", - "either", - "rayon-core", -] - -[[package]] -name = "rayon-core" -version = "1.9.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-utils 0.8.5", - "lazy_static", - "num_cpus", -] - -[[package]] -name = "redox_syscall" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" -dependencies = [ - "bitflags", -] - -[[package]] -name = "regex" -version = "1.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - -[[package]] -name = "regex-syntax" -version = "0.6.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" - -[[package]] -name = "remove_dir_all" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "reqwest" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "246e9f61b9bb77df069a947682be06e31ac43ea37862e244a69f177694ea6d22" -dependencies = [ - "base64 0.13.0", - "bytes 1.1.0", - "encoding_rs", - "futures-core", - "futures-util", - "http", - "http-body 0.4.3", - "hyper 0.14.13", - "hyper-rustls", - "ipnet", - "js-sys", - "lazy_static", - "log", - "mime", - "percent-encoding", - "pin-project-lite 0.2.7", - "rustls", - "serde", - "serde_urlencoded 0.7.0", - "tokio 1.11.0", - "tokio-rustls", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", - "webpki-roots", - "winreg", -] - -[[package]] -name = "retain_mut" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9c17925a9027d298a4603d286befe3f9dc0e8ed02523141914eb628798d6e5b" - -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin", - "untrusted", - "web-sys", - "winapi 0.3.9", -] - -[[package]] -name = "roaring" -version = "0.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "536cfa885fc388b8ae69edf96d7970849b7d9c1395da1b8330f17715babf8a09" -dependencies = [ - "bytemuck", - "byteorder", - "retain_mut", -] - -[[package]] -name = "rstar" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d535e658ada8c1987a113e5261f8b907f721b2854d666e72820671481b7ee125" -dependencies = [ - "heapless", - "num-traits", - "pdqselect", - "serde", - "smallvec", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ef03e0a2b150c7a90d01faf6254c9c48a41e95fb2a8c2ac1c6f0d2b9aefc342" - -[[package]] -name = "rustc_version" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" 
-dependencies = [ - "semver", -] - -[[package]] -name = "rustls" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35edb675feee39aec9c99fa5ff985081995a06d594114ae14cbe797ad7b7a6d7" -dependencies = [ - "base64 0.13.0", - "log", - "ring", - "sct", - "webpki", -] - -[[package]] -name = "ryu" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" - -[[package]] -name = "safemem" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef703b7cb59335eae2eb93ceb664c0eb7ea6bf567079d843e09420219668e072" - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scoped-tls" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" - -[[package]] -name = "scopeguard" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" - -[[package]] -name = "sct" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b362b83898e0e69f38515b82ee15aa80636befe47c3b6d3d89a911e78fc228ce" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "semver" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" - -[[package]] -name = "serde" -version = "1.0.130" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_cbor" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" -dependencies = [ - "half", - "serde", -] - -[[package]] -name = "serde_derive" -version = "1.0.130" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "serde_json" -version = "1.0.68" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f690853975602e1bfe1ccbf50504d67174e3bcf340f23b5ea9992e0587a52d8" -dependencies = [ - "indexmap", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "serde_test" -version = "1.0.130" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d82178225dbdeae2d5d190e8649287db6a3a32c6d24da22ae3146325aa353e4c" -dependencies = [ - "serde", -] - -[[package]] -name = "serde_urlencoded" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ec5d77e2d4c73717816afac02670d5c4f534ea95ed430442cad02e7a6e32c97" -dependencies = [ - "dtoa", - "itoa", - "serde", - "url", -] - -[[package]] -name = "serde_urlencoded" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edfa57a7f8d9c1d260a549e7224100f6c43d43f9103e06dd8b4095a9b2b43ce9" -dependencies = [ - "form_urlencoded", - "itoa", 
- "ryu", - "serde", -] - -[[package]] -name = "sha-1" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" -dependencies = [ - "block-buffer 0.7.3", - "digest 0.8.1", - "fake-simd", - "opaque-debug 0.2.3", -] - -[[package]] -name = "sha-1" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99cd6713db3cf16b6c84e06321e049a9b9f699826e16096d23bbcc44d15d51a6" -dependencies = [ - "block-buffer 0.9.0", - "cfg-if 1.0.0", - "cpufeatures", - "digest 0.9.0", - "opaque-debug 0.3.0", -] - -[[package]] -name = "sharded-slab" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51e73328dc4ac0c7ccbda3a494dfa03df1de2f46018127f60c693f2648455b0" -dependencies = [ - "libc", -] - -[[package]] -name = "siphasher" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533494a8f9b724d33625ab53c6c4800f7cc445895924a8ef649222dcb76e938b" - -[[package]] -name = "slab" -version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c307a32c1c5c437f38c7fd45d753050587732ba8628319fbdf12a7e289ccc590" - -[[package]] -name = "slice-group-by" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb" - -[[package]] -name = "smallstr" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e922794d168678729ffc7e07182721a14219c65814e66e91b839a272fe5ae4f" -dependencies = [ - "serde", - "smallvec", -] - -[[package]] -name = "smallvec" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" - -[[package]] -name = "socket2" -version = "0.3.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "122e570113d28d773067fab24266b66753f6ea915758651696b6e35e49f88d6e" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "socket2" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dc90fe6c7be1a323296982db1836d1ea9e47b6839496dde9a541bc496df3516" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "spin" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - -[[package]] -name = "stderrlog" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45a53e2eff3e94a019afa6265e8ee04cb05b9d33fe9f5078b14e4e391d155a38" -dependencies = [ - "atty", - "chrono", - "log", - "termcolor", - 
"thread_local", -] - -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - -[[package]] -name = "structopt" -version = "0.3.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa" -dependencies = [ - "clap", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" -dependencies = [ - "heck", - "proc-macro-error", - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "syn" -version = "0.15.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ca4b3b69a77cbe1ffc9e198781b7acb0c7365a883670e8f1c1bc66fba79a5c5" -dependencies = [ - "proc-macro2 0.4.30", - "quote 0.6.13", - "unicode-xid 0.1.0", -] - -[[package]] -name = "syn" -version = "1.0.76" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f107db402c2c2055242dbf4d2af0e69197202e9faacbef9571bbe47f5a1b84" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "unicode-xid 0.2.2", -] - -[[package]] -name = "synchronoise" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d717ed0efc9d39ab3b642a096bc369a3e02a38a51c41845d7fe31bdad1d6eaeb" -dependencies = [ - "crossbeam-queue", -] - -[[package]] -name = "synstructure" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "474aaa926faa1603c40b7885a9eaea29b444d1cb2850cb7c0e37bb1a4182f4fa" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", - "unicode-xid 0.2.2", -] - -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - -[[package]] -name = "tempfile" -version = "3.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" -dependencies = [ - "cfg-if 1.0.0", - "libc", - "rand 0.8.4", - "redox_syscall", - "remove_dir_all", - "winapi 0.3.9", -] - -[[package]] -name = "term_size" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "termcolor" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "terminal_size" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df" -dependencies = [ - "libc", - "winapi 0.3.9", -] - -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "term_size", - "unicode-width", -] - -[[package]] -name = "thread_local" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "time" -version = "0.1.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db9e6914ab8b1ae1c260a4ae7a49b6c5611b40328a735b21862567685e73255" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi 0.3.9", -] - -[[package]] -name = "tinytemplate" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" -dependencies = [ - "serde", - "serde_json", -] - -[[package]] -name = "tinyvec" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5241dd6f21443a3606b432718b166d3cedc962fd4b8bea54a8bc7f514ebda986" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" - -[[package]] -name = "tokio" -version = "0.2.25" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6703a273949a90131b290be1fe7b039d0fc884aa1935860dfcbe056f28cd8092" -dependencies = [ - "bytes 0.5.6", - "fnv", - "futures-core", - "iovec", - "lazy_static", - "libc", - "memchr", - "mio 0.6.23", - "mio-named-pipes", - "mio-uds", - "num_cpus", - "pin-project-lite 0.1.12", - "signal-hook-registry", - "slab", - "tokio-macros", - "winapi 0.3.9", -] - -[[package]] -name = "tokio" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4efe6fc2395938c8155973d7be49fe8d03a843726e285e100a8a383cc0154ce" -dependencies = [ - "autocfg", - "bytes 1.1.0", - "libc", - "memchr", - "mio 0.7.13", - "num_cpus", - "pin-project-lite 0.2.7", - "winapi 0.3.9", -] - -[[package]] -name = "tokio-macros" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e44da00bfc73a25f814cd8d7e57a68a5c31b74b3152a0a1d1f590c97ed06265a" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "tokio-rustls" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6844de72e57df1980054b38be3a9f4702aba4858be64dd700181a8a6d0e1b6" -dependencies = [ - "rustls", - "tokio 1.11.0", - "webpki", -] - -[[package]] -name = "tokio-tungstenite" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d9e878ad426ca286e4dcae09cbd4e1973a7f8987d97570e2469703dd7f5720c" -dependencies = [ - "futures-util", - "log", - "pin-project 0.4.28", - "tokio 0.2.25", - "tungstenite", -] - -[[package]] -name = "tokio-util" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be8242891f2b6cbef26a2d7e8605133c2c554cd35b3e4948ea892d6d68436499" -dependencies = [ - "bytes 0.5.6", - "futures-core", - "futures-sink", - "log", - "pin-project-lite 0.1.12", - "tokio 0.2.25", -] - -[[package]] -name = "tokio-util" -version = "0.6.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d3725d3efa29485e87311c5b699de63cde14b00ed4d256b8318aa30ca452cd" -dependencies = [ - "bytes 1.1.0", - "futures-core", - "futures-sink", - "log", - "pin-project-lite 0.2.7", - "tokio 1.11.0", -] - -[[package]] -name = "toml" -version = "0.5.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" -dependencies = [ - "serde", -] - -[[package]] -name = "tower-service" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "360dfd1d6d30e05fda32ace2c8c70e9c0a9da713275777f5a4dbb8a1893930c6" - -[[package]] -name = "tracing" -version = "0.1.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84f96e095c0c82419687c20ddf5cb3eadb61f4e1405923c9dc8e53a1adacbda8" -dependencies = [ - "cfg-if 1.0.0", - "log", - "pin-project-lite 0.2.7", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98863d0dd09fa59a1b79c6750ad80dbda6b75f4e71c437a6a1a8cb91a8bcbd77" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", -] - -[[package]] -name = "tracing-core" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46125608c26121c81b0c6d693eab5a420e416da7e43c426d2e8f7df8da8a3acf" -dependencies = [ - "lazy_static", -] - -[[package]] -name = "tracing-error" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4d7c0b83d4a500748fa5879461652b361edf5c9d51ede2a2ac03875ca185e24" -dependencies = [ - "tracing", - "tracing-subscriber", -] - -[[package]] -name = "tracing-futures" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" -dependencies = [ - "pin-project 1.0.8", - "tracing", -] - -[[package]] -name = "tracing-subscriber" -version = "0.2.24" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdd0568dbfe3baf7048b7908d2b32bca0d81cd56bec6d2a8f894b01d74f86be3" -dependencies = [ - "sharded-slab", - "thread_local", - "tracing-core", -] - -[[package]] -name = "try-lock" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" - -[[package]] -name = "tungstenite" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0308d80d86700c5878b9ef6321f020f29b1bb9d5ff3cab25e75e23f3a492a23" -dependencies = [ - "base64 0.12.3", - "byteorder", - "bytes 0.5.6", - "http", - "httparse", - "input_buffer", - "log", - "rand 0.7.3", - "sha-1 0.9.8", - "url", - "utf-8", -] - -[[package]] -name = "twoway" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b11b2b5241ba34be09c3cc85a36e56e48f9888862e19cedf23336d35316ed1" -dependencies = [ - "memchr", -] - -[[package]] -name = "typenum" -version = "1.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b63708a265f51345575b27fe43f9500ad611579e764c79edbc2037b1121959ec" - -[[package]] -name = "ucd-trie" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" - -[[package]] -name = "unicase" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" -dependencies = [ - "version_check", -] - -[[package]] -name = "unicode-bidi" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"246f4c42e67e7a4e3c6106ff716a5d067d4132a642840b242e357e468a2a0085" - -[[package]] -name = "unicode-normalization" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54590932941a9e9266f0832deed84ebe1bf2e4c9e4a3554d393d18f5e854bf9" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "unicode-segmentation" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" - -[[package]] -name = "unicode-width" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" - -[[package]] -name = "unicode-xid" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" - -[[package]] -name = "unicode-xid" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" - -[[package]] -name = "untrusted" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" - -[[package]] -name = "url" -version = "2.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a507c383b2d33b5fc35d1861e77e6b383d158b2da5e14fe51b83dfedf6fd578c" -dependencies = [ - "form_urlencoded", - "idna", - "matches", - "percent-encoding", -] - -[[package]] -name = "urlencoding" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a1f0175e03a0973cf4afd476bef05c26e228520400eb1fd473ad417b1c00ffb" - -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - -[[package]] -name = "utf8-width" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cf7d77f457ef8dfa11e4cd5933c5ddb5dc52a94664071951219a97710f0a32b" - -[[package]] -name = "uuid" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" -dependencies = [ - "getrandom 0.2.3", -] - -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - -[[package]] -name = "version_check" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" - -[[package]] -name = "walkdir" -version = "2.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" -dependencies = [ - "same-file", - "winapi 0.3.9", - "winapi-util", -] - -[[package]] -name = "want" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ce8a968cb1cd110d136ff8b819a556d6fb6d919363c61534f6860c7eb172ba0" -dependencies = [ - "log", - "try-lock", -] - -[[package]] -name = "warp" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f41be6df54c97904af01aa23e613d4521eed7ab23537cede692d4058f6449407" -dependencies = 
[ - "bytes 0.5.6", - "futures", - "headers", - "http", - "hyper 0.13.10", - "log", - "mime", - "mime_guess", - "multipart", - "pin-project 0.4.28", - "scoped-tls", - "serde", - "serde_json", - "serde_urlencoded 0.6.1", - "tokio 0.2.25", - "tokio-tungstenite", - "tower-service", - "tracing", - "tracing-futures", - "urlencoding", -] - -[[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" - -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - -[[package]] -name = "wasm-bindgen" -version = "0.2.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "632f73e236b219150ea279196e54e610f5dbafa5d61786303d4da54f84e47fce" -dependencies = [ - "cfg-if 1.0.0", - "serde", - "serde_json", - "wasm-bindgen-macro", -] - -[[package]] -name = "wasm-bindgen-backend" -version = "0.2.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a317bf8f9fba2476b4b2c85ef4c4af8ff39c3c7f0cdfeed4f82c34a880aa837b" -dependencies = [ - "bumpalo", - "lazy_static", - "log", - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e8d7523cb1f2a4c96c1317ca690031b714a51cc14e05f712446691f413f5d39" -dependencies = [ - "cfg-if 1.0.0", - "js-sys", - "wasm-bindgen", - "web-sys", -] - -[[package]] -name = "wasm-bindgen-macro" -version = "0.2.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d56146e7c495528bf6587663bea13a8eb588d39b36b679d83972e1a2dbbdacf9" -dependencies = [ - "quote 1.0.9", - "wasm-bindgen-macro-support", -] - -[[package]] -name = "wasm-bindgen-macro-support" -version = "0.2.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7803e0eea25835f8abdc585cd3021b3deb11543c6fe226dcd30b228857c5c5ab" -dependencies = [ - "proc-macro2 1.0.29", - "quote 1.0.9", - "syn 1.0.76", - "wasm-bindgen-backend", - "wasm-bindgen-shared", -] - -[[package]] -name = "wasm-bindgen-shared" -version = "0.2.78" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0237232789cf037d5480773fe568aac745bfe2afbc11a863e97901780a6b47cc" - -[[package]] -name = "web-sys" -version = "0.3.55" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38eb105f1c59d9eaa6b5cdc92b859d85b926e82cb2e0945cd0c9259faa6fe9fb" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "webpki" -version = "0.21.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8e38c0608262c46d4a56202ebabdeb094cef7e560ca7a226c6bf055188aa4ea" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "webpki-roots" -version = "0.21.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aabe153544e473b775453675851ecc86863d2a81d786d741f6b76778f2a48940" -dependencies = [ - "webpki", -] - -[[package]] -name = "whatlang" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a346d2eb29c03618693ed24a29d1acd0c3f2cb08ae58b9669d7461e033cf703" -dependencies = [ - "hashbrown 0.7.2", -] - -[[package]] -name = "winapi" -version = 
"0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-build" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "winreg" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0120db82e8a1e0b9fb3345a539c478767c0048d842860994d96113d5b667bd69" -dependencies = [ - "winapi 0.3.9", -] - -[[package]] -name = "ws2_32-sys" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d59cefebd0c892fa2dd6de581e937301d8552cb44489cdff035c6187cb63fa5e" -dependencies = [ - "winapi 0.2.8", - "winapi-build", -] - -[[package]] -name = "wyz" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" - -[[package]] -name = "zerocopy" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46" -dependencies = [ - "byteorder", - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb" -dependencies = [ - "proc-macro2 1.0.29", - "syn 1.0.76", - "synstructure", -] From 78b0bce9a1bff60f3fc386b11b6c3b7df2e22be7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 22 Sep 2021 11:37:05 +0200 Subject: [PATCH 1016/1889] fix the returned error when asc desc fails to be parsed --- milli/src/criterion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 24879cdd4..c526a7e32 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -156,7 +156,7 @@ impl FromStr for AscDesc { match text.rsplit_once(':') { Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)), Some((left, "desc")) => Ok(AscDesc::Desc(left.parse()?)), - _ => Err(UserError::InvalidRankingRuleName { name: text.to_string() }), + _ => Err(UserError::InvalidAscDescSyntax { name: text.to_string() }), } } } From 176160d32f7a1e5be21421ff9bd31c66f31c1ffe Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 22 Sep 2021 12:10:21 +0200 Subject: [PATCH 1017/1889] fix all benchmarks and add the compile time checking of the benhcmarks in the ci --- 
.github/workflows/rust.yml | 2 +- benchmarks/Cargo.toml | 3 ++ benchmarks/benches/indexing.rs | 70 ++++++++++-------------------- benchmarks/benches/search_geo.rs | 4 +- benchmarks/benches/utils.rs | 73 +++++++++++++++++++++++++++++--- 5 files changed, 94 insertions(+), 58 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b5335d799..7338d134b 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -33,7 +33,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: check - args: --all + args: --workspace --all-targets - name: Run cargo test uses: actions-rs/cargo@v1 with: diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b598f2f6f..99a36b740 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -6,6 +6,9 @@ publish = false [dependencies] milli = { path = "../milli" } +anyhow = "1.0" +serde_json = { version = "1.0.62", features = ["preserve_order"] } +csv = "1.1.6" [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 30532aef8..66ecc7154 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -1,11 +1,12 @@ mod datasets_paths; +mod utils; -use std::fs::{create_dir_all, remove_dir_all, File}; +use std::fs::{create_dir_all, remove_dir_all}; use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; use heed::EnvOpenOptions; -use milli::update::{IndexDocumentsMethod, UpdateBuilder, UpdateFormat}; +use milli::update::UpdateBuilder; use milli::Index; #[cfg(target_os = "linux")] @@ -67,15 +68,10 @@ fn indexing_songs_default(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_SONGS - )); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -118,15 +114,10 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_SONGS - )); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -165,15 +156,10 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = 
update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_SONGS).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_SONGS - )); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -211,15 +197,10 @@ fn indexing_wiki(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Csv); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_WIKI_ARTICLES).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_SONGS - )); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -262,13 +243,10 @@ fn indexing_movies_default(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); - builder.update_format(UpdateFormat::Json); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::MOVIES) - .expect(&format!("could not find the dataset in: {}", datasets_paths::MOVIES)); - builder.execute(reader, |_, _| ()).unwrap(); + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + builder.execute(documents, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -316,15 +294,11 @@ fn indexing_geo(c: &mut Criterion) { move |index| { let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); + let builder = update_builder.index_documents(&mut wtxn, &index); + + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); + builder.execute(documents, |_, _| ()).unwrap(); - builder.update_format(UpdateFormat::JsonStream); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - let reader = File::open(datasets_paths::SMOL_ALL_COUNTRIES).expect(&format!( - "could not find the dataset in: {}", - datasets_paths::SMOL_ALL_COUNTRIES - )); - builder.execute(reader, |_, _| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); diff --git a/benchmarks/benches/search_geo.rs b/benchmarks/benches/search_geo.rs index 1432f691b..84448c32d 100644 --- a/benchmarks/benches/search_geo.rs +++ b/benchmarks/benches/search_geo.rs @@ -2,7 +2,7 @@ mod datasets_paths; mod utils; use criterion::{criterion_group, criterion_main}; -use milli::update::{Settings, UpdateFormat}; +use milli::update::Settings; use utils::Conf; #[cfg(target_os = "linux")] @@ -33,7 +33,7 @@ fn base_conf(builder: &mut Settings) { #[rustfmt::skip] const BASE_CONF: Conf = Conf { dataset: datasets_paths::SMOL_ALL_COUNTRIES, - 
dataset_format: UpdateFormat::JsonStream,
+    dataset_format: "jsonl",
     queries: &[
         "",
     ],
diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs
index 72eac59d9..e5bdbdfaa 100644
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -1,10 +1,15 @@
+#![allow(dead_code)]
+
 use std::fs::{create_dir_all, remove_dir_all, File};
+use std::io::{self, Cursor, Read, Seek};
 use std::path::Path;
 
 use criterion::BenchmarkId;
 use heed::EnvOpenOptions;
-use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder, UpdateFormat};
+use milli::documents::DocumentBatchReader;
+use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder};
 use milli::{FilterCondition, Index};
+use serde_json::{Map, Value};
 
 pub struct Conf<'a> {
     /// where we are going to create our database.mmdb directory
@@ -13,7 +18,7 @@ pub struct Conf<'a> {
     /// the dataset to be used, it must be an uncompressed csv
     pub dataset: &'a str,
     /// The format of the dataset
-    pub dataset_format: UpdateFormat,
+    pub dataset_format: &'a str,
     pub group_name: &'a str,
     pub queries: &'a [&'a str],
     /// here you can change which criterion are used and in which order.
@@ -33,7 +38,7 @@ impl Conf<'_> {
     pub const BASE: Self = Conf {
         database_name: "benches.mmdb",
-        dataset_format: UpdateFormat::Csv,
+        dataset_format: "csv",
         dataset: "",
         group_name: "",
         queries: &[],
@@ -87,11 +92,10 @@ pub fn base_setup(conf: &Conf) -> Index {
     if let None = conf.primary_key {
         builder.enable_autogenerate_docids();
     }
-    builder.update_format(conf.dataset_format);
+    let documents = documents_from(conf.dataset, conf.dataset_format);
+
     builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
-    let reader = File::open(conf.dataset)
-        .expect(&format!("could not find the dataset in: {}", conf.dataset));
-    builder.execute(reader, |_, _| ()).unwrap();
+    builder.execute(documents, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
 
     index
@@ -128,3 +132,58 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
         index.prepare_for_closing().wait();
     }
 }
+
+pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl Read + Seek> {
+    let reader =
+        File::open(filename).expect(&format!("could not find the dataset in: {}", filename));
+    let documents = match filetype {
+        "csv" => documents_from_csv(reader).unwrap(),
+        "json" => documents_from_json(reader).unwrap(),
+        "jsonl" => documents_from_jsonl(reader).unwrap(),
+        otherwise => panic!("invalid update format {:?}", otherwise),
+    };
+    DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
+}
+
+fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+    let mut writer = Cursor::new(Vec::new());
+    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+
+    let values = serde_json::Deserializer::from_reader(reader)
+        .into_iter::<Map<String, Value>>();
+    for document in values {
+        let document = document?;
+        documents.add_documents(document)?;
+    }
+    documents.finish()?;
+
+    Ok(writer.into_inner())
+}
+
+fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+    let mut writer = Cursor::new(Vec::new());
+    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+
+    let json: serde_json::Value = serde_json::from_reader(reader)?;
+    documents.add_documents(json)?;
+    documents.finish()?;
+
+    Ok(writer.into_inner())
+}
+
+fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+    let mut writer = Cursor::new(Vec::new());
+    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
+
+    let mut records = csv::Reader::from_reader(reader);
+    let iter = records.deserialize::<Map<String, Value>>();
+
+    for doc in iter {
+        let doc = doc?;
+        documents.add_documents(doc)?;
+    }
+
+    documents.finish()?;
+
+    Ok(writer.into_inner())
+}

From 113a061bee3f40134fab7a01f80af24ee4ed349b Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 22 Sep 2021 15:09:07 +0200
Subject: [PATCH 1018/1889] fix the error handling on the criterion side

---
 milli/src/criterion.rs | 181 +++++++++++++++++++++++++++++++++++------
 milli/src/error.rs     |  36 +++++++-
 2 files changed, 189 insertions(+), 28 deletions(-)

diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index c526a7e32..33112508c 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -53,10 +53,31 @@ impl FromStr for Criterion {
             Ok(AscDesc::Asc(Member::Field(field))) => Ok(Criterion::Asc(field)),
             Ok(AscDesc::Desc(Member::Field(field))) => Ok(Criterion::Desc(field)),
             Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => {
-                Err(UserError::InvalidRankingRuleName { name: text.to_string() })?
+                Err(UserError::InvalidReservedRankingRuleNameSort {
+                    name: "_geoPoint".to_string(),
+                })?
             }
             Err(UserError::InvalidAscDescSyntax { name }) => {
-                Err(UserError::InvalidRankingRuleName { name }.into())
+                Err(UserError::InvalidRankingRuleName { name })?
+            }
+            Err(UserError::InvalidReservedAscDescSyntax { name })
+                if name.starts_with("_geoPoint") =>
+            {
+                Err(UserError::InvalidReservedRankingRuleNameSort {
+                    name: "_geoPoint".to_string(),
+                }
+                .into())
+            }
+            Err(UserError::InvalidReservedAscDescSyntax { name })
+                if name.starts_with("_geoRadius") =>
+            {
+                Err(UserError::InvalidReservedRankingRuleNameFilter {
+                    name: "_geoRadius".to_string(),
+                }
+                .into())
+            }
+            Err(UserError::InvalidReservedAscDescSyntax { name }) => {
+                Err(UserError::InvalidReservedRankingRuleName { name }.into())
             }
             Err(error) => {
                 Err(UserError::InvalidRankingRuleName { name: error.to_string() }.into())
@@ -80,20 +101,22 @@ impl FromStr for Member {
             Some(point) => {
                 let (lat, long) = point
                     .split_once(',')
-                    .ok_or_else(|| UserError::InvalidRankingRuleName { name: text.to_string() })
+                    .ok_or_else(|| UserError::InvalidReservedAscDescSyntax {
+                        name: text.to_string(),
+                    })
                     .and_then(|(lat, long)| {
                         lat.trim()
                             .parse()
                             .and_then(|lat| long.trim().parse().map(|long| (lat, long)))
-                            .map_err(|_| UserError::InvalidRankingRuleName {
+                            .map_err(|_| UserError::InvalidReservedAscDescSyntax {
                                 name: text.to_string(),
                             })
                     })?;
                 Ok(Member::Geo([lat, long]))
             }
             None => {
-                if is_reserved_keyword(text) {
-                    return Err(UserError::InvalidReservedRankingRuleName {
+                if is_reserved_keyword(text) || text.starts_with("_geoRadius(") {
+                    return Err(UserError::InvalidReservedAscDescSyntax {
                         name: text.to_string(),
                     })?;
                 }
@@ -191,14 +214,15 @@ impl fmt::Display for Criterion {
 
 #[cfg(test)]
 mod tests {
+    use big_s::S;
+    use AscDesc::*;
+    use Member::*;
+    use UserError::*;
+
     use super::*;
 
     #[test]
     fn parse_asc_desc() {
-        use big_s::S;
-        use AscDesc::*;
-        use Member::*;
-
         let valid_req = [
             ("truc:asc", Asc(Field(S("truc")))),
             ("bidule:desc", Desc(Field(S("bidule")))),
@@ -216,28 +240,52 @@ mod tests {
         ];
 
         for (req, expected) in valid_req {
-            let res = req.parse();
-            assert!(res.is_ok(), "Failed to parse `{}`, was expecting `{:?}`", req, expected);
-            assert_eq!(expected, res.unwrap());
+            let res = req.parse::<AscDesc>();
+            assert!(
+                res.is_ok(),
+                "Failed to parse `{}`, was expecting `{:?}` but instead got `{:?}`",
+                req,
+                expected,
+                res
+            );
+            assert_eq!(res.unwrap(), expected);
         }
 
         let invalid_req = [
"truc:machin", - "truc:deesc", - "truc:asc:deesc", - "42desc", - "_geoPoint:asc", - "_geoDistance:asc", - "_geoPoint(42.12 , 59.598)", - "_geoPoint(42.12 , 59.598):deesc", - "_geoPoint(42.12 , 59.598):machin", - "_geoPoint(42.12 , 59.598):asc:aasc", - "_geoPoint(42,12 , 59,598):desc", - "_geoPoint(35, 85, 75):asc", - "_geoPoint(18):asc", + ("truc:machin", InvalidAscDescSyntax { name: S("truc:machin") }), + ("truc:deesc", InvalidAscDescSyntax { name: S("truc:deesc") }), + ("truc:asc:deesc", InvalidAscDescSyntax { name: S("truc:asc:deesc") }), + ("42desc", InvalidAscDescSyntax { name: S("42desc") }), + ("_geoPoint:asc", InvalidReservedAscDescSyntax { name: S("_geoPoint") }), + ("_geoDistance:asc", InvalidReservedAscDescSyntax { name: S("_geoDistance") }), + ( + "_geoPoint(42.12 , 59.598)", + InvalidAscDescSyntax { name: S("_geoPoint(42.12 , 59.598)") }, + ), + ( + "_geoPoint(42.12 , 59.598):deesc", + InvalidAscDescSyntax { name: S("_geoPoint(42.12 , 59.598):deesc") }, + ), + ( + "_geoPoint(42.12 , 59.598):machin", + InvalidAscDescSyntax { name: S("_geoPoint(42.12 , 59.598):machin") }, + ), + ( + "_geoPoint(42.12 , 59.598):asc:aasc", + InvalidAscDescSyntax { name: S("_geoPoint(42.12 , 59.598):asc:aasc") }, + ), + ( + "_geoPoint(42,12 , 59,598):desc", + InvalidReservedAscDescSyntax { name: S("_geoPoint(42,12 , 59,598)") }, + ), + ( + "_geoPoint(35, 85, 75):asc", + InvalidReservedAscDescSyntax { name: S("_geoPoint(35, 85, 75)") }, + ), + ("_geoPoint(18):asc", InvalidReservedAscDescSyntax { name: S("_geoPoint(18)") }), ]; - for req in invalid_req { + for (req, expected_error) in invalid_req { let res = req.parse::(); assert!( res.is_err(), @@ -245,6 +293,85 @@ mod tests { req, res, ); + let res = res.unwrap_err(); + assert_eq!( + res.to_string(), + expected_error.to_string(), + "Bad error for input {}: got `{:?}` instead of `{:?}`", + req, + res, + expected_error + ); + } + } + + #[test] + fn parse_criterion() { + let valid_criteria = [ + ("words", Criterion::Words), + ("typo", Criterion::Typo), + ("proximity", Criterion::Proximity), + ("attribute", Criterion::Attribute), + ("sort", Criterion::Sort), + ("exactness", Criterion::Exactness), + ("price:asc", Criterion::Asc(S("price"))), + ("price:desc", Criterion::Desc(S("price"))), + ("price:asc:desc", Criterion::Desc(S("price:asc"))), + ("truc:machin:desc", Criterion::Desc(S("truc:machin"))), + ("hello-world!:desc", Criterion::Desc(S("hello-world!"))), + ("it's spacy over there:asc", Criterion::Asc(S("it's spacy over there"))), + ]; + + for (input, expected) in valid_criteria { + let res = input.parse::(); + assert!( + res.is_ok(), + "Failed to parse `{}`, was expecting `{:?}` but instead got `{:?}`", + input, + expected, + res + ); + assert_eq!(res.unwrap(), expected); + } + + let invalid_criteria = [ + ("words suffix", InvalidRankingRuleName { name: S("words suffix") }), + ("prefix typo", InvalidRankingRuleName { name: S("prefix typo") }), + ("proximity attribute", InvalidRankingRuleName { name: S("proximity attribute") }), + ("price", InvalidRankingRuleName { name: S("price") }), + ("asc:price", InvalidRankingRuleName { name: S("asc:price") }), + ("price:deesc", InvalidRankingRuleName { name: S("price:deesc") }), + ("price:aasc", InvalidRankingRuleName { name: S("price:aasc") }), + ("price:asc and desc", InvalidRankingRuleName { name: S("price:asc and desc") }), + ("price:asc:truc", InvalidRankingRuleName { name: S("price:asc:truc") }), + ("_geo:asc", InvalidReservedRankingRuleName { name: S("_geo") }), + ("_geoDistance:asc", 
+            ("_geoPoint:asc", InvalidReservedRankingRuleNameSort { name: S("_geoPoint") }),
+            ("_geoPoint(42, 75):asc", InvalidReservedRankingRuleNameSort { name: S("_geoPoint") }),
+            ("_geoRadius:asc", InvalidReservedRankingRuleNameFilter { name: S("_geoRadius") }),
+            (
+                "_geoRadius(42, 75, 59):asc",
+                InvalidReservedRankingRuleNameFilter { name: S("_geoRadius") },
+            ),
+        ];
+
+        for (input, expected) in invalid_criteria {
+            let res = input.parse::<Criterion>();
+            assert!(
+                res.is_err(),
+                "Should no be able to parse `{}`, was expecting an error but instead got: `{:?}`",
+                input,
+                res
+            );
+            let res = res.unwrap_err();
+            assert_eq!(
+                res.to_string(),
+                expected.to_string(),
+                "Bad error for input {}: got `{:?}` instead of `{:?}`",
+                input,
+                res,
+                expected
+            );
+        }
+    }
+}
diff --git a/milli/src/error.rs b/milli/src/error.rs
index fe0ac2cf7..2e2d3088e 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -57,14 +57,18 @@ pub enum UserError {
     AttributeLimitReached,
     DocumentLimitReached,
     InvalidAscDescSyntax { name: String },
+    InvalidReservedAscDescSyntax { name: String },
     InvalidDocumentId { document_id: Value },
     InvalidFacetsDistribution { invalid_facets_name: HashSet<String> },
     InvalidFilter(pest::error::Error<ParserRule>),
     InvalidFilterAttribute(pest::error::Error<ParserRule>),
     InvalidSortName { name: String },
+    InvalidReservedSortName { name: String },
     InvalidGeoField { document_id: Value, object: Value },
     InvalidRankingRuleName { name: String },
     InvalidReservedRankingRuleName { name: String },
+    InvalidReservedRankingRuleNameSort { name: String },
+    InvalidReservedRankingRuleNameFilter { name: String },
     InvalidSortableAttribute { field: String, valid_fields: HashSet<String> },
     SortRankingRuleMissing,
     InvalidStoreFile,
@@ -230,10 +234,40 @@ impl fmt::Display for UserError {
                 "the document with the id: {} contains an invalid _geo field: {}",
                 document_id, object
             ),
-            Self::InvalidRankingRuleName { name } => write!(f, "invalid criterion {}", name),
+            Self::InvalidRankingRuleName { name } => write!(f, "invalid ranking rule {}", name),
+            Self::InvalidReservedAscDescSyntax { name } => {
+                write!(
+                    f,
+                    "{} is a reserved keyword and thus can't be used as a asc/desc rule",
+                    name
+                )
+            }
             Self::InvalidReservedRankingRuleName { name } => {
                 write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name)
             }
+            Self::InvalidReservedRankingRuleNameSort { name } => {
+                write!(
+                    f,
+                    "{0} is a reserved keyword and thus can't be used as a ranking rule. \
+{0} can only be used for sorting at search time",
+                    name
+                )
+            }
+            Self::InvalidReservedRankingRuleNameFilter { name } => {
+                write!(
+                    f,
+                    "{0} is a reserved keyword and thus can't be used as a ranking rule. \
+{0} can only be used for filtering at search time",
+                    name
+                )
+            }
+            Self::InvalidReservedSortName { name } => {
+                write!(
+                    f,
+                    "{} is a reserved keyword and thus can't be used as a sort expression",
+                    name
+                )
+            }
             Self::InvalidDocumentId { document_id } => {
                 let json = serde_json::to_string(document_id).unwrap();
                 write!(

From 257e621d4011857d998d9c2b51996a60edba4c2c Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 22 Sep 2021 15:18:39 +0200
Subject: [PATCH 1019/1889] create an asc_desc module

---
 milli/src/asc_desc.rs            | 228 +++++++++++++++++++++++++++++++
 milli/src/criterion.rs           | 186 +------------------------
 milli/src/lib.rs                 |   4 +-
 milli/src/search/criteria/mod.rs |   3 +-
 milli/src/search/mod.rs          |   3 +-
 5 files changed, 235 insertions(+), 189 deletions(-)
 create mode 100644 milli/src/asc_desc.rs

diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs
new file mode 100644
index 000000000..d68d5b6a2
--- /dev/null
+++ b/milli/src/asc_desc.rs
@@ -0,0 +1,228 @@
+//! This module provide the `AscDesc` type and define a all the errors related to this type
+
+use std::fmt;
+use std::str::FromStr;
+
+use serde::{Deserialize, Serialize};
+
+use crate::error::is_reserved_keyword;
+use crate::CriterionError;
+
+/// This error type is never supposed to be shown to the end user.
+/// You must always cast it to a sort error or a criterion error.
+#[derive(Debug)]
+pub enum AscDescError {
+    InvalidSyntax { name: String },
+    ReservedKeyword { name: String },
+}
+
+impl fmt::Display for AscDescError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Self::InvalidSyntax { name } => {
+                write!(f, "invalid asc/desc syntax for {}", name)
+            }
+            Self::ReservedKeyword { name } => {
+                write!(
+                    f,
+                    "{} is a reserved keyword and thus can't be used as a asc/desc rule",
+                    name
+                )
+            }
+        }
+    }
+}
+
+impl From<AscDescError> for CriterionError {
+    fn from(error: AscDescError) -> Self {
+        match error {
+            AscDescError::InvalidSyntax { name } => CriterionError::InvalidName { name },
+            AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => {
+                CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() }
+            }
+            AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => {
+                CriterionError::ReservedNameForFilter { name: "_geoRadius".to_string() }
+            }
+            AscDescError::ReservedKeyword { name } => (CriterionError::ReservedName { name }),
+        }
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
+pub enum Member {
+    Field(String),
+    Geo([f64; 2]),
+}
+
+impl FromStr for Member {
+    type Err = AscDescError;
+
+    fn from_str(text: &str) -> Result<Self, Self::Err> {
+        match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) {
+            Some(point) => {
+                let (lat, long) = point
+                    .split_once(',')
+                    .ok_or_else(|| AscDescError::ReservedKeyword { name: text.to_string() })
+                    .and_then(|(lat, long)| {
+                        lat.trim()
+                            .parse()
+                            .and_then(|lat| long.trim().parse().map(|long| (lat, long)))
+                            .map_err(|_| AscDescError::ReservedKeyword { name: text.to_string() })
+                    })?;
+                Ok(Member::Geo([lat, long]))
+            }
+            None => {
+                if is_reserved_keyword(text) || text.starts_with("_geoRadius(") {
+                    return Err(AscDescError::ReservedKeyword { name: text.to_string() })?;
+                }
+                Ok(Member::Field(text.to_string()))
+            }
+        }
+    }
+}
+
+impl fmt::Display for Member {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Member::Field(name) => f.write_str(name),
+            Member::Geo([lat, lng]) => write!(f, "_geoPoint({}, {})", lat, lng),
+        }
+    }
+}
+
+impl Member {
+    pub fn field(&self) -> Option<&str> {
+        match self {
+            Member::Field(field) => Some(field),
+            Member::Geo(_) => None,
+        }
+    }
+
+    pub fn geo_point(&self) -> Option<&[f64; 2]> {
+        match self {
+            Member::Geo(point) => Some(point),
+            Member::Field(_) => None,
+        }
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
+pub enum AscDesc {
+    Asc(Member),
+    Desc(Member),
+}
+
+impl AscDesc {
+    pub fn member(&self) -> &Member {
+        match self {
+            AscDesc::Asc(member) => member,
+            AscDesc::Desc(member) => member,
+        }
+    }
+
+    pub fn field(&self) -> Option<&str> {
+        self.member().field()
+    }
+}
+
+impl FromStr for AscDesc {
+    type Err = AscDescError;
+
+    /// Since we don't know if this was deserialized for a criterion or a sort we just return a
+    /// string and let the caller create his own error
+    fn from_str(text: &str) -> Result<Self, Self::Err> {
+        match text.rsplit_once(':') {
+            Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)),
+            Some((left, "desc")) => Ok(AscDesc::Desc(left.parse()?)),
+            _ => Err(AscDescError::InvalidSyntax { name: text.to_string() }),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use big_s::S;
+    use AscDesc::*;
+    use AscDescError::*;
+    use Member::*;
+
+    use super::*;
+
+    #[test]
+    fn parse_asc_desc() {
+        let valid_req = [
+            ("truc:asc", Asc(Field(S("truc")))),
+            ("bidule:desc", Desc(Field(S("bidule")))),
+            ("a-b:desc", Desc(Field(S("a-b")))),
+            ("a:b:desc", Desc(Field(S("a:b")))),
+            ("a12:asc", Asc(Field(S("a12")))),
+            ("42:asc", Asc(Field(S("42")))),
+            ("_geoPoint(42, 59):asc", Asc(Geo([42., 59.]))),
+            ("_geoPoint(42.459, 59):desc", Desc(Geo([42.459, 59.]))),
+            ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))),
+            ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))),
+            ("_geoPoint(42.0002, 59.895):desc", Desc(Geo([42.0002, 59.895]))),
+            ("_geoPoint(42., 59.):desc", Desc(Geo([42., 59.]))),
+            ("truc(12, 13):desc", Desc(Field(S("truc(12, 13)")))),
+        ];
+
+        for (req, expected) in valid_req {
+            let res = req.parse::<AscDesc>();
+            assert!(
+                res.is_ok(),
+                "Failed to parse `{}`, was expecting `{:?}` but instead got `{:?}`",
+                req,
+                expected,
+                res
+            );
+            assert_eq!(res.unwrap(), expected);
+        }
+
+        let invalid_req = [
+            ("truc:machin", InvalidSyntax { name: S("truc:machin") }),
+            ("truc:deesc", InvalidSyntax { name: S("truc:deesc") }),
+            ("truc:asc:deesc", InvalidSyntax { name: S("truc:asc:deesc") }),
+            ("42desc", InvalidSyntax { name: S("42desc") }),
+            ("_geoPoint:asc", ReservedKeyword { name: S("_geoPoint") }),
+            ("_geoDistance:asc", ReservedKeyword { name: S("_geoDistance") }),
+            ("_geoPoint(42.12 , 59.598)", InvalidSyntax { name: S("_geoPoint(42.12 , 59.598)") }),
+            (
+                "_geoPoint(42.12 , 59.598):deesc",
+                InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):deesc") },
+            ),
+            (
+                "_geoPoint(42.12 , 59.598):machin",
+                InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):machin") },
+            ),
+            (
+                "_geoPoint(42.12 , 59.598):asc:aasc",
+                InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):asc:aasc") },
+            ),
+            (
+                "_geoPoint(42,12 , 59,598):desc",
+                ReservedKeyword { name: S("_geoPoint(42,12 , 59,598)") },
+            ),
+            ("_geoPoint(35, 85, 75):asc", ReservedKeyword { name: S("_geoPoint(35, 85, 75)") }),
+            ("_geoPoint(18):asc", ReservedKeyword { name: S("_geoPoint(18)") }),
+        ];
+
+        for (req, expected_error) in invalid_req {
+            let res = req.parse::<AscDesc>();
+            assert!(
+                res.is_err(),
+                "Should no be able to parse `{}`, was expecting an error but instead got: `{:?}`",
+                req,
+                res,
+            );
+            let res = res.unwrap_err();
+            assert_eq!(
+                res.to_string(),
+                expected_error.to_string(),
+                "Bad error for input {}: got `{:?}` instead of `{:?}`",
+                req,
+                res,
+                expected_error
+            );
+        }
+    }
+}
diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index 33112508c..acc0148bb 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -3,7 +3,8 @@ use std::str::FromStr;
 
 use serde::{Deserialize, Serialize};
 
-use crate::error::{is_reserved_keyword, Error, UserError};
+use crate::error::{Error, UserError};
+use crate::{AscDesc, Member};
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub enum Criterion {
@@ -87,103 +88,6 @@ impl FromStr for Criterion {
     }
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
-pub enum Member {
-    Field(String),
-    Geo([f64; 2]),
-}
-
-impl FromStr for Member {
-    type Err = UserError;
-
-    fn from_str(text: &str) -> Result<Self, Self::Err> {
-        match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) {
-            Some(point) => {
-                let (lat, long) = point
-                    .split_once(',')
-                    .ok_or_else(|| UserError::InvalidReservedAscDescSyntax {
-                        name: text.to_string(),
-                    })
-                    .and_then(|(lat, long)| {
-                        lat.trim()
-                            .parse()
-                            .and_then(|lat| long.trim().parse().map(|long| (lat, long)))
-                            .map_err(|_| UserError::InvalidReservedAscDescSyntax {
-                                name: text.to_string(),
-                            })
-                    })?;
-                Ok(Member::Geo([lat, long]))
-            }
-            None => {
-                if is_reserved_keyword(text) || text.starts_with("_geoRadius(") {
-                    return Err(UserError::InvalidReservedAscDescSyntax {
-                        name: text.to_string(),
-                    })?;
-                }
-                Ok(Member::Field(text.to_string()))
-            }
-        }
-    }
-}
-
-impl fmt::Display for Member {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            Member::Field(name) => f.write_str(name),
-            Member::Geo([lat, lng]) => write!(f, "_geoPoint({}, {})", lat, lng),
-        }
-    }
-}
-
-impl Member {
-    pub fn field(&self) -> Option<&str> {
-        match self {
-            Member::Field(field) => Some(field),
-            Member::Geo(_) => None,
-        }
-    }
-
-    pub fn geo_point(&self) -> Option<&[f64; 2]> {
-        match self {
-            Member::Geo(point) => Some(point),
-            Member::Field(_) => None,
-        }
-    }
-}
-
-#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
-pub enum AscDesc {
-    Asc(Member),
-    Desc(Member),
-}
-
-impl AscDesc {
-    pub fn member(&self) -> &Member {
-        match self {
-            AscDesc::Asc(member) => member,
-            AscDesc::Desc(member) => member,
-        }
-    }
-
-    pub fn field(&self) -> Option<&str> {
-        self.member().field()
-    }
-}
-
-impl FromStr for AscDesc {
-    type Err = UserError;
-
-    /// Since we don't know if this was deserialized for a criterion or a sort we just return a
-    /// string and let the caller create his own error
-    fn from_str(text: &str) -> Result<Self, Self::Err> {
-        match text.rsplit_once(':') {
-            Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)),
-            Some((left, "desc")) => Ok(AscDesc::Desc(left.parse()?)),
-            _ => Err(UserError::InvalidAscDescSyntax { name: text.to_string() }),
-        }
-    }
-}
-
 pub fn default_criteria() -> Vec<Criterion> {
     vec![
         Criterion::Words,
@@ -215,96 +119,10 @@ mod tests {
     use big_s::S;
-    use AscDesc::*;
-    use Member::*;
     use UserError::*;
 
     use super::*;
 
-    #[test]
-    fn parse_asc_desc() {
-        let valid_req = [
-            ("truc:asc", Asc(Field(S("truc")))),
-            ("bidule:desc", Desc(Field(S("bidule")))),
-            ("a-b:desc", Desc(Field(S("a-b")))),
-            ("a:b:desc", Desc(Field(S("a:b")))),
-            ("a12:asc", Asc(Field(S("a12")))),
-            ("42:asc", Asc(Field(S("42")))),
-            ("_geoPoint(42, 59):asc", Asc(Geo([42., 59.]))),
-            ("_geoPoint(42.459, 59):desc", Desc(Geo([42.459, 59.]))),
-            ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))),
-            ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))),
-            ("_geoPoint(42.0002, 59.895):desc", Desc(Geo([42.0002, 59.895]))),
-            ("_geoPoint(42., 59.):desc", Desc(Geo([42., 59.]))),
-            ("truc(12, 13):desc", Desc(Field(S("truc(12, 13)")))),
-        ];
-
-        for (req, expected) in valid_req {
-            let res = req.parse::<AscDesc>();
-            assert!(
-                res.is_ok(),
-                "Failed to parse `{}`, was expecting `{:?}` but instead got `{:?}`",
-                req,
-                expected,
-                res
-            );
-            assert_eq!(res.unwrap(), expected);
-        }
-
-        let invalid_req = [
-            ("truc:machin", InvalidAscDescSyntax { name: S("truc:machin") }),
-            ("truc:deesc", InvalidAscDescSyntax { name: S("truc:deesc") }),
-            ("truc:asc:deesc", InvalidAscDescSyntax { name: S("truc:asc:deesc") }),
-            ("42desc", InvalidAscDescSyntax { name: S("42desc") }),
-            ("_geoPoint:asc", InvalidReservedAscDescSyntax { name: S("_geoPoint") }),
-            ("_geoDistance:asc", InvalidReservedAscDescSyntax { name: S("_geoDistance") }),
-            (
-                "_geoPoint(42.12 , 59.598)",
-                InvalidAscDescSyntax { name: S("_geoPoint(42.12 , 59.598)") },
-            ),
-            (
-                "_geoPoint(42.12 , 59.598):deesc",
-                InvalidAscDescSyntax { name: S("_geoPoint(42.12 , 59.598):deesc") },
-            ),
-            (
-                "_geoPoint(42.12 , 59.598):machin",
-                InvalidAscDescSyntax { name: S("_geoPoint(42.12 , 59.598):machin") },
-            ),
-            (
-                "_geoPoint(42.12 , 59.598):asc:aasc",
-                InvalidAscDescSyntax { name: S("_geoPoint(42.12 , 59.598):asc:aasc") },
-            ),
-            (
-                "_geoPoint(42,12 , 59,598):desc",
-                InvalidReservedAscDescSyntax { name: S("_geoPoint(42,12 , 59,598)") },
-            ),
-            (
-                "_geoPoint(35, 85, 75):asc",
-                InvalidReservedAscDescSyntax { name: S("_geoPoint(35, 85, 75)") },
-            ),
-            ("_geoPoint(18):asc", InvalidReservedAscDescSyntax { name: S("_geoPoint(18)") }),
-        ];
-
-        for (req, expected_error) in invalid_req {
-            let res = req.parse::<AscDesc>();
-            assert!(
-                res.is_err(),
-                "Should no be able to parse `{}`, was expecting an error but instead got: `{:?}`",
-                req,
-                res,
-            );
-            let res = res.unwrap_err();
-            assert_eq!(
-                res.to_string(),
-                expected_error.to_string(),
-                "Bad error for input {}: got `{:?}` instead of `{:?}`",
-                req,
-                res,
-                expected_error
-            );
-        }
-    }
-
     #[test]
     fn parse_criterion() {
         let valid_criteria = [
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 550e7f13d..f36de8437 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -4,6 +4,7 @@ extern crate pest_derive;
 #[macro_use]
 pub mod documents;
 
+mod asc_desc;
 mod criterion;
 mod error;
 mod external_documents_ids;
@@ -24,7 +25,8 @@ use fxhash::{FxHasher32, FxHasher64};
 pub use grenad::CompressionType;
 use serde_json::{Map, Value};
 
-pub use self::criterion::{default_criteria, AscDesc, Criterion, Member};
+pub use self::asc_desc::{AscDesc, Member};
+pub use self::criterion::{default_criteria, Criterion};
 pub use self::error::{
     Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
 };
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs
index c2de55de5..a23e5acf9 100644
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@@ -12,10 +12,9 @@ use self::r#final::Final;
 use self::typo::Typo;
 use self::words::Words;
 use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
-use crate::criterion::{AscDesc as AscDescName, Member};
 use crate::search::criteria::geo::Geo;
 use crate::search::{word_derivations, WordDerivationsCache};
-use crate::{DocumentId, FieldId, Index, Result, TreeLevel};
+use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result, TreeLevel};
 
 mod asc_desc;
 mod attribute;
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index f752f5822..3984ed130 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -18,10 +18,9 @@ pub(crate) use self::facet::ParserRule;
 pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator};
 pub use self::matching_words::MatchingWords;
 use self::query_tree::QueryTreeBuilder;
-use crate::criterion::{AscDesc, Criterion};
 use crate::error::UserError;
 use crate::search::criteria::r#final::{Final, FinalResult};
-use crate::{DocumentId, Index, Result};
+use crate::{AscDesc, Criterion, DocumentId, Index, Result};
 
 // Building these factories is not free.
 static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));

From 86e272856ad03071c1d67aea8d8aefa2dc532f27 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 22 Sep 2021 15:33:32 +0200
Subject: [PATCH 1020/1889] create an asc_desc error type that is never
 supposed to be returned to the end user

---
 milli/src/criterion.rs | 17 +++++------------
 milli/src/error.rs     | 12 ------------
 milli/src/lib.rs       |  2 +-
 3 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index acc0148bb..3afc1b1f8 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -4,7 +4,7 @@ use std::str::FromStr;
 use serde::{Deserialize, Serialize};
 
 use crate::error::{Error, UserError};
-use crate::{AscDesc, Member};
+use crate::{AscDesc, AscDescError, Member};
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub enum Criterion {
@@ -58,31 +58,24 @@ impl FromStr for Criterion {
                     name: "_geoPoint".to_string(),
                 })?
             }
-            Err(UserError::InvalidAscDescSyntax { name }) => {
+            Err(AscDescError::InvalidSyntax { name }) => {
                 Err(UserError::InvalidRankingRuleName { name })?
             }
-            Err(UserError::InvalidReservedAscDescSyntax { name })
-                if name.starts_with("_geoPoint") =>
-            {
+            Err(AscDescError::ReservedKeyword { name }) if name.starts_with("_geoPoint") => {
                 Err(UserError::InvalidReservedRankingRuleNameSort {
                     name: "_geoPoint".to_string(),
                 }
                 .into())
             }
-            Err(UserError::InvalidReservedAscDescSyntax { name })
-                if name.starts_with("_geoRadius") =>
-            {
+            Err(AscDescError::ReservedKeyword { name }) if name.starts_with("_geoRadius") => {
                 Err(UserError::InvalidReservedRankingRuleNameFilter {
                     name: "_geoRadius".to_string(),
                 }
                 .into())
             }
-            Err(UserError::InvalidReservedAscDescSyntax { name }) => {
+            Err(AscDescError::ReservedKeyword { name }) => {
                 Err(UserError::InvalidReservedRankingRuleName { name }.into())
             }
-            Err(error) => {
-                Err(UserError::InvalidRankingRuleName { name: error.to_string() }.into())
-            }
         },
     }
 }
diff --git a/milli/src/error.rs b/milli/src/error.rs
index 2e2d3088e..1c0125c70 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -56,8 +56,6 @@ pub enum UserError {
     AttributeLimitReached,
     DocumentLimitReached,
-    InvalidAscDescSyntax { name: String },
-    InvalidReservedAscDescSyntax { name: String },
     InvalidDocumentId { document_id: Value },
     InvalidFacetsDistribution { invalid_facets_name: HashSet<String> },
@@ -226,22 +224,12 @@ impl fmt::Display for UserError {
                 )
             }
             Self::InvalidFilter(error) => error.fmt(f),
-            Self::InvalidAscDescSyntax { name } => {
-                write!(f, "invalid asc/desc syntax for {}", name)
-            }
             Self::InvalidGeoField { document_id, object } => write!(
                 f,
                 "the document with the id: {} contains an invalid _geo field: {}",
                 document_id, object
            ),
             Self::InvalidRankingRuleName { name } => write!(f, "invalid ranking rule {}", name),
-            Self::InvalidReservedAscDescSyntax { name } => {
-                write!(
-                    f,
-                    "{} is a reserved keyword and thus can't be used as a asc/desc rule",
-                    name
-                )
-            }
             Self::InvalidReservedRankingRuleName { name } => {
                 write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name)
             }
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index f36de8437..d61e7d6e3 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -25,7 +25,7 @@ use fxhash::{FxHasher32, FxHasher64};
 pub use grenad::CompressionType;
 use serde_json::{Map, Value};
 
-pub use self::asc_desc::{AscDesc, Member};
+pub use self::asc_desc::{AscDesc, AscDescError, Member};
 pub use self::criterion::{default_criteria, Criterion};
 pub use self::error::{
     Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
 };

From 023446ecf3ceda4b80e4aa6fc49bca6bc97c28ed Mon Sep 17 00:00:00 2001
From: Tamo
Date: Wed, 22 Sep 2021 16:02:07 +0200
Subject: [PATCH 1021/1889] create a smaller and easier to maintain
 CriterionError type

---
 milli/src/criterion.rs | 100 ++++++++++++++++++++++++++---------------
 milli/src/error.rs     |  28 ++----------
 milli/src/lib.rs       |   2 +-
 3 files changed, 69 insertions(+), 61 deletions(-)

diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index 3afc1b1f8..8abfb5a30 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -3,8 +3,43 @@ use std::str::FromStr;
 
 use serde::{Deserialize, Serialize};
 
-use crate::error::{Error, UserError};
-use crate::{AscDesc, AscDescError, Member};
+use crate::error::Error;
+use crate::{AscDesc, AscDescError, Member, UserError};
+
+#[derive(Debug)]
+pub enum CriterionError {
+    InvalidName { name: String },
+    ReservedName { name: String },
+    ReservedNameForSort { name: String },
+    ReservedNameForFilter { name: String },
+}
+
+impl fmt::Display for CriterionError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name),
+            Self::ReservedName { name } => {
+                write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name)
+            }
+            Self::ReservedNameForSort { name } => {
+                write!(
+                    f,
+                    "{0} is a reserved keyword and thus can't be used as a ranking rule. \
+{0} can only be used for sorting at search time",
+                    name
+                )
+            }
+            Self::ReservedNameForFilter { name } => {
+                write!(
+                    f,
+                    "{0} is a reserved keyword and thus can't be used as a ranking rule. \
+{0} can only be used for filtering at search time",
+                    name
+                )
+            }
+        }
+    }
+}
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub enum Criterion {
@@ -40,7 +75,7 @@ impl Criterion {
 }
 
 impl FromStr for Criterion {
-    type Err = Error;
+    type Err = CriterionError;
 
     fn from_str(text: &str) -> Result<Self, Self::Err> {
         match text {
@@ -54,33 +89,31 @@ impl FromStr for Criterion {
             Ok(AscDesc::Asc(Member::Field(field))) => Ok(Criterion::Asc(field)),
             Ok(AscDesc::Desc(Member::Field(field))) => Ok(Criterion::Desc(field)),
             Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => {
-                Err(UserError::InvalidReservedRankingRuleNameSort {
-                    name: "_geoPoint".to_string(),
-                })?
+                Err(CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() })?
             }
             Err(AscDescError::InvalidSyntax { name }) => {
-                Err(UserError::InvalidRankingRuleName { name })?
+                Err(CriterionError::InvalidName { name })?
             }
             Err(AscDescError::ReservedKeyword { name }) if name.starts_with("_geoPoint") => {
-                Err(UserError::InvalidReservedRankingRuleNameSort {
-                    name: "_geoPoint".to_string(),
-                }
-                .into())
+                Err(CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() })?
} Err(AscDescError::ReservedKeyword { name }) if name.starts_with("_geoRadius") => { - Err(UserError::InvalidReservedRankingRuleNameFilter { - name: "_geoRadius".to_string(), - } - .into()) + Err(CriterionError::ReservedNameForFilter { name: "_geoRadius".to_string() })? } Err(AscDescError::ReservedKeyword { name }) => { - Err(UserError::InvalidReservedRankingRuleName { name }.into()) + Err(CriterionError::ReservedName { name })? } }, } } } +impl From for Error { + fn from(error: CriterionError) -> Self { + Self::UserError(UserError::CriterionError(error)) + } +} + pub fn default_criteria() -> Vec { vec![ Criterion::Words, @@ -112,7 +145,7 @@ impl fmt::Display for Criterion { #[cfg(test)] mod tests { use big_s::S; - use UserError::*; + use CriterionError::*; use super::*; @@ -146,24 +179,21 @@ mod tests { } let invalid_criteria = [ - ("words suffix", InvalidRankingRuleName { name: S("words suffix") }), - ("prefix typo", InvalidRankingRuleName { name: S("prefix typo") }), - ("proximity attribute", InvalidRankingRuleName { name: S("proximity attribute") }), - ("price", InvalidRankingRuleName { name: S("price") }), - ("asc:price", InvalidRankingRuleName { name: S("asc:price") }), - ("price:deesc", InvalidRankingRuleName { name: S("price:deesc") }), - ("price:aasc", InvalidRankingRuleName { name: S("price:aasc") }), - ("price:asc and desc", InvalidRankingRuleName { name: S("price:asc and desc") }), - ("price:asc:truc", InvalidRankingRuleName { name: S("price:asc:truc") }), - ("_geo:asc", InvalidReservedRankingRuleName { name: S("_geo") }), - ("_geoDistance:asc", InvalidReservedRankingRuleName { name: S("_geoDistance") }), - ("_geoPoint:asc", InvalidReservedRankingRuleNameSort { name: S("_geoPoint") }), - ("_geoPoint(42, 75):asc", InvalidReservedRankingRuleNameSort { name: S("_geoPoint") }), - ("_geoRadius:asc", InvalidReservedRankingRuleNameFilter { name: S("_geoRadius") }), - ( - "_geoRadius(42, 75, 59):asc", - InvalidReservedRankingRuleNameFilter { name: S("_geoRadius") }, - ), + ("words suffix", InvalidName { name: S("words suffix") }), + ("prefix typo", InvalidName { name: S("prefix typo") }), + ("proximity attribute", InvalidName { name: S("proximity attribute") }), + ("price", InvalidName { name: S("price") }), + ("asc:price", InvalidName { name: S("asc:price") }), + ("price:deesc", InvalidName { name: S("price:deesc") }), + ("price:aasc", InvalidName { name: S("price:aasc") }), + ("price:asc and desc", InvalidName { name: S("price:asc and desc") }), + ("price:asc:truc", InvalidName { name: S("price:asc:truc") }), + ("_geo:asc", ReservedName { name: S("_geo") }), + ("_geoDistance:asc", ReservedName { name: S("_geoDistance") }), + ("_geoPoint:asc", ReservedNameForSort { name: S("_geoPoint") }), + ("_geoPoint(42, 75):asc", ReservedNameForSort { name: S("_geoPoint") }), + ("_geoRadius:asc", ReservedNameForFilter { name: S("_geoRadius") }), + ("_geoRadius(42, 75, 59):asc", ReservedNameForFilter { name: S("_geoRadius") }), ]; for (input, expected) in invalid_criteria { diff --git a/milli/src/error.rs b/milli/src/error.rs index 1c0125c70..519de8516 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -8,7 +8,7 @@ use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; use crate::search::ParserRule; -use crate::{DocumentId, FieldId}; +use crate::{CriterionError, DocumentId, FieldId}; pub type Object = Map; @@ -55,6 +55,7 @@ pub enum FieldIdMapMissingEntry { #[derive(Debug)] pub enum UserError { AttributeLimitReached, + CriterionError(CriterionError), DocumentLimitReached, 
InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, @@ -63,10 +64,6 @@ pub enum UserError { InvalidSortName { name: String }, InvalidReservedSortName { name: String }, InvalidGeoField { document_id: Value, object: Value }, - InvalidRankingRuleName { name: String }, - InvalidReservedRankingRuleName { name: String }, - InvalidReservedRankingRuleNameSort { name: String }, - InvalidReservedRankingRuleNameFilter { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, @@ -213,6 +210,7 @@ impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), + Self::CriterionError(error) => f.write_str(&error.to_string()), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), Self::InvalidFacetsDistribution { invalid_facets_name } => { let name_list = @@ -229,26 +227,6 @@ impl fmt::Display for UserError { "the document with the id: {} contains an invalid _geo field: {}", document_id, object ), - Self::InvalidRankingRuleName { name } => write!(f, "invalid ranking rule {}", name), - Self::InvalidReservedRankingRuleName { name } => { - write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name) - } - Self::InvalidReservedRankingRuleNameSort { name } => { - write!( - f, - "{0} is a reserved keyword and thus can't be used as a ranking rule. \ -{0} can only be used for sorting at search time", - name - ) - } - Self::InvalidReservedRankingRuleNameFilter { name } => { - write!( - f, - "{0} is a reserved keyword and thus can't be used as a ranking rule. \ -{0} can only be used for filtering at search time", - name - ) - } Self::InvalidReservedSortName { name } => { write!( f, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index d61e7d6e3..8a54bbbdf 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -26,7 +26,7 @@ pub use grenad::CompressionType; use serde_json::{Map, Value}; pub use self::asc_desc::{AscDesc, AscDescError, Member}; -pub use self::criterion::{default_criteria, Criterion}; +pub use self::criterion::{default_criteria, Criterion, CriterionError}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; From 1e5e3d57e256410931df2ccfe54cd21b5eb0dc41 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 22 Sep 2021 16:09:08 +0200 Subject: [PATCH 1022/1889] auto convert AscDescError into CriterionError --- milli/src/criterion.rs | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 8abfb5a30..4299e4974 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -4,7 +4,7 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; use crate::error::Error; -use crate::{AscDesc, AscDescError, Member, UserError}; +use crate::{AscDesc, Member, UserError}; #[derive(Debug)] pub enum CriterionError { @@ -41,6 +41,12 @@ impl fmt::Display for CriterionError { } } +impl From for Error { + fn from(error: CriterionError) -> Self { + Self::UserError(UserError::CriterionError(error)) + } +} + #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { /// Sorted by decreasing number of matched query terms. 
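The next hunk can drop the hand-written `AscDescError` mapping entirely because `?` now performs the conversion; a minimal sketch of the mechanism, assuming the `AscDesc`, `AscDescError` and `CriterionError` types from this series are in scope (`parse_rule` is a hypothetical stand-in for the real `FromStr` body):

use std::str::FromStr;

// Hypothetical helper: the `?` silently applies this series'
// `From<AscDescError> for CriterionError` impl before returning early.
fn parse_rule(text: &str) -> Result<AscDesc, CriterionError> {
    let rule = AscDesc::from_str(text)?;
    Ok(rule)
}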
@@ -85,35 +91,17 @@ impl FromStr for Criterion { "attribute" => Ok(Criterion::Attribute), "sort" => Ok(Criterion::Sort), "exactness" => Ok(Criterion::Exactness), - text => match AscDesc::from_str(text) { - Ok(AscDesc::Asc(Member::Field(field))) => Ok(Criterion::Asc(field)), - Ok(AscDesc::Desc(Member::Field(field))) => Ok(Criterion::Desc(field)), - Ok(AscDesc::Asc(Member::Geo(_))) | Ok(AscDesc::Desc(Member::Geo(_))) => { + text => match AscDesc::from_str(text)? { + AscDesc::Asc(Member::Field(field)) => Ok(Criterion::Asc(field)), + AscDesc::Desc(Member::Field(field)) => Ok(Criterion::Desc(field)), + AscDesc::Asc(Member::Geo(_)) | AscDesc::Desc(Member::Geo(_)) => { Err(CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() })? } - Err(AscDescError::InvalidSyntax { name }) => { - Err(CriterionError::InvalidName { name })? - } - Err(AscDescError::ReservedKeyword { name }) if name.starts_with("_geoPoint") => { - Err(CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() })? - } - Err(AscDescError::ReservedKeyword { name }) if name.starts_with("_geoRadius") => { - Err(CriterionError::ReservedNameForFilter { name: "_geoRadius".to_string() })? - } - Err(AscDescError::ReservedKeyword { name }) => { - Err(CriterionError::ReservedName { name })? - } }, } } } -impl From for Error { - fn from(error: CriterionError) -> Self { - Self::UserError(UserError::CriterionError(error)) - } -} - pub fn default_criteria() -> Vec { vec![ Criterion::Words, From 47ee93b0bd7e81ee042aa5f891a01c00c88f9454 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 22 Sep 2021 16:29:11 +0200 Subject: [PATCH 1023/1889] return an error when _geoPoint is used but _geo is not sortable --- milli/src/search/mod.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 3984ed130..bec059d46 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -20,7 +20,7 @@ pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; -use crate::{AscDesc, Criterion, DocumentId, Index, Result}; +use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result}; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -147,15 +147,20 @@ impl<'a> Search<'a> { if let Some(sort_criteria) = &self.sort_criteria { let sortable_fields = self.index.sortable_fields(self.rtxn)?; for asc_desc in sort_criteria { - // we are not supposed to find any geoPoint in the criterion - if let Some(field) = asc_desc.field() { - if !sortable_fields.contains(field) { + match asc_desc.member() { + Member::Field(ref field) if !sortable_fields.contains(field) => { return Err(UserError::InvalidSortableAttribute { field: field.to_string(), valid_fields: sortable_fields, - } - .into()); + })? } + Member::Geo(_) if !sortable_fields.contains("_geo") => { + return Err(UserError::InvalidSortableAttribute { + field: "_geo".to_string(), + valid_fields: sortable_fields, + })? 
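+ // (Both arms reuse `InvalidSortableAttribute`: a plain field reports its own
+ // name, while `_geoPoint(..)` reports the `_geo` field it depends on, since
+ // sorting by `_geoPoint` is only allowed when `_geo` is declared sortable.)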
+ } + _ => (), } } } From 218f0a666163cbfb6980eb8fd23535d60b0dc2ad Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 22 Sep 2021 16:59:23 +0200 Subject: [PATCH 1024/1889] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/asc_desc.rs | 6 +++--- milli/src/criterion.rs | 12 ++++++------ milli/src/error.rs | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index d68d5b6a2..9a3bda934 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -1,4 +1,4 @@ -//! This module provide the `AscDesc` type and define a all the errors related to this type +//! This module provides the `AscDesc` type and defines all the errors related to this type. use std::fmt; use std::str::FromStr; @@ -43,7 +43,7 @@ impl From for CriterionError { AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => { CriterionError::ReservedNameForFilter { name: "_geoRadius".to_string() } } - AscDescError::ReservedKeyword { name } => (CriterionError::ReservedName { name }), + AscDescError::ReservedKeyword { name } => CriterionError::ReservedName { name }, } } } @@ -129,7 +129,7 @@ impl FromStr for AscDesc { type Err = AscDescError; /// Since we don't know if this was deserialized for a criterion or a sort we just return a - /// string and let the caller create his own error + /// string and let the caller create his own error. fn from_str(text: &str) -> Result { match text.rsplit_once(':') { Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)), diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 4299e4974..aff7fcf68 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -24,17 +24,17 @@ impl fmt::Display for CriterionError { Self::ReservedNameForSort { name } => { write!( f, - "{0} is a reserved keyword and thus can't be used as a ranking rule. \ -{0} can only be used for sorting at search time", - name + "{} is a reserved keyword and thus can't be used as a ranking rule. \ +{} can only be used for sorting at search time", + name, name ) } Self::ReservedNameForFilter { name } => { write!( f, - "{0} is a reserved keyword and thus can't be used as a ranking rule. \ -{0} can only be used for filtering at search time", - name + "{} is a reserved keyword and thus can't be used as a ranking rule. 
\ +{} can only be used for filtering at search time", + name, name ) } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 519de8516..bd4f02b99 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -210,7 +210,7 @@ impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), - Self::CriterionError(error) => f.write_str(&error.to_string()), + Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), Self::InvalidFacetsDistribution { invalid_facets_name } => { let name_list = From 1eacab2169e518ebae5a42858229d3ebc2d01cb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 22 Sep 2021 17:16:31 +0200 Subject: [PATCH 1025/1889] Update version for the next release (v0.15.1) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index ef4484090..d6f4ff004 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.15.0" +version = "0.16.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 1b56d3ff1..72744469a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.15.0" +version = "0.16.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index c9631c157..9f3460526 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.15.0" +version = "0.16.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 68f55a6c8..f4e5a7caa 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.15.0" +version = "0.16.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index a1bcdcbcf..696dfd018 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.15.0" +version = "0.16.0" authors = ["Clément Renault "] edition = "2018" From 551df0cb770328880d0e97986a19340d43d12101 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 23 Sep 2021 15:55:39 +0200 Subject: [PATCH 1026/1889] Add test checking the bug reported in meilisearch issue 1716 --- milli/src/update/index_documents/mod.rs | 54 +++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f9577243f..fe35e0143 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -927,4 +927,58 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn index_2_times_documents_split_by_zero_document_indexation() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let content = documents!([ + {"id": 0, "name": "Kerollmops", "score": 78}, + {"id": 1, "name": "ManyTheFish", "score": 75}, + {"id": 2, "name": "Ferdi", "score": 39}, + {"id": 3, "name": "Tommy", "score": 33} + 
]); + + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that there is 4 document now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 4); + + let content = documents!([]); + + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that there is 4 document now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 4); + + let content = documents!([ + {"id": 0, "name": "Kerollmops", "score": 78}, + {"id": 1, "name": "ManyTheFish", "score": 75}, + {"id": 2, "name": "Ferdi", "score": 39}, + {"id": 3, "name": "Tommy", "score": 33} + ]); + + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, 2); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that there is 4 document now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 4); + } } From b18806386912978a325efbf3b3758bb7e21262f4 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 27 Sep 2021 14:26:21 +0200 Subject: [PATCH 1027/1889] Change chunk size to 4MiB to fit more the end user usage --- milli/src/update/index_documents/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index fe35e0143..b00dbf375 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -248,7 +248,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let chunk_iter = grenad_obkv_into_chunks( documents_file, params.clone(), - self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB + self.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB ); let result = chunk_iter.map(|chunk_iter| { From c7cb816ae155ef3ef2ce9daf904b98a66cd9e578 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 27 Sep 2021 19:07:22 +0200 Subject: [PATCH 1028/1889] simplify the error handling of the sort syntax for meilisearch --- milli/src/asc_desc.rs | 64 ++++++++++++++++++++++++++++++++++++++++++- milli/src/error.rs | 16 ++--------- milli/src/lib.rs | 2 +- 3 files changed, 67 insertions(+), 15 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index 9a3bda934..5adef782f 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -6,7 +6,7 @@ use std::str::FromStr; use serde::{Deserialize, Serialize}; use crate::error::is_reserved_keyword; -use crate::CriterionError; +use crate::{CriterionError, Error, UserError}; /// This error type is never supposed to be shown to the end user. /// You must always cast it to a sort error or a criterion error. 
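Both casts are plain `From` conversions, so a call site simply picks the user-facing flavor; a rough sketch, assuming the `AscDescError`, `SortError` and `CriterionError` types from this series are in scope (`surface` is a hypothetical helper):

// Sketch: the same internal parse error surfaces as two different
// user-facing errors depending on where the asc/desc expression came from.
fn surface(err: AscDescError, from_sort_param: bool) -> String {
    if from_sort_param {
        SortError::from(err).to_string() // the `sort` search parameter
    } else {
        CriterionError::from(err).to_string() // the ranking-rules setting
    }
}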
@@ -139,6 +139,68 @@ impl FromStr for AscDesc { } } +#[derive(Debug)] +pub enum SortError { + InvalidName { name: String }, + ReservedName { name: String }, + ReservedNameForSettings { name: String }, + ReservedNameForFilter { name: String }, +} + +impl From for SortError { + fn from(error: AscDescError) -> Self { + match error { + AscDescError::InvalidSyntax { name } => SortError::InvalidName { name }, + AscDescError::ReservedKeyword { name } if &name == "_geo" => { + SortError::ReservedNameForSettings { name: "_geoPoint".to_string() } + } + AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => { + SortError::ReservedNameForFilter { name: "_geoRadius".to_string() } + } + AscDescError::ReservedKeyword { name } => SortError::ReservedName { name }, + } + } +} + +impl fmt::Display for SortError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::InvalidName { name } => { + write!(f, "invalid syntax for the sort parameter {}", name) + } + Self::ReservedName { name } => { + write!( + f, + "{} is a reserved keyword and thus can't be used as a sort expression", + name + ) + } + Self::ReservedNameForSettings { name } => { + write!( + f, + "{} is a reserved keyword and thus can't be used as a sort expression. \ +{} can only be used in the settings", + name, name + ) + } + Self::ReservedNameForFilter { name } => { + write!( + f, + "{} is a reserved keyword and thus can't be used as a sort expression. \ +{} can only be used for filtering at search time", + name, name + ) + } + } + } +} + +impl From for Error { + fn from(error: SortError) -> Self { + Self::UserError(UserError::SortError(error)) + } +} + #[cfg(test)] mod tests { use big_s::S; diff --git a/milli/src/error.rs b/milli/src/error.rs index bd4f02b99..723d5c4c2 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -8,7 +8,7 @@ use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; use crate::search::ParserRule; -use crate::{CriterionError, DocumentId, FieldId}; +use crate::{CriterionError, DocumentId, FieldId, SortError}; pub type Object = Map; @@ -61,8 +61,6 @@ pub enum UserError { InvalidFacetsDistribution { invalid_facets_name: HashSet }, InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), - InvalidSortName { name: String }, - InvalidReservedSortName { name: String }, InvalidGeoField { document_id: Value, object: Value }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, @@ -74,6 +72,7 @@ pub enum UserError { PrimaryKeyCannotBeChanged, PrimaryKeyCannotBeReset, SerdeJson(serde_json::Error), + SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, } @@ -227,13 +226,6 @@ impl fmt::Display for UserError { "the document with the id: {} contains an invalid _geo field: {}", document_id, object ), - Self::InvalidReservedSortName { name } => { - write!( - f, - "{} is a reserved keyword and thus can't be used as a sort expression", - name - ) - } Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); write!( @@ -245,9 +237,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco ) } Self::InvalidFilterAttribute(error) => error.fmt(f), - Self::InvalidSortName { name } => { - write!(f, "Invalid syntax for the sort parameter: {}", name) - } Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); @@ -277,6 +266,7 @@ only composed of 
alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco f.write_str("primary key cannot be reset if the database contains documents") } Self::SerdeJson(error) => error.fmt(f), + Self::SortError(error) => write!(f, "{}", error), Self::UnknownInternalDocumentId { document_id } => { write!(f, "an unknown internal document id have been used ({})", document_id) } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 8a54bbbdf..bb0a32528 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -25,7 +25,7 @@ use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; use serde_json::{Map, Value}; -pub use self::asc_desc::{AscDesc, AscDescError, Member}; +pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; pub use self::criterion::{default_criteria, Criterion, CriterionError}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, From cc732fe95ef448454cf6eb98243aef75de80590e Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Sep 2021 11:15:24 +0200 Subject: [PATCH 1029/1889] update http-ui to use the sort-error --- http-ui/src/main.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 1bacdfbed..b76547309 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -22,7 +22,9 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use milli::documents::DocumentBatchReader; use milli::update::UpdateIndexingStep::*; use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder}; -use milli::{obkv_to_json, CompressionType, FilterCondition, Index, MatchingWords, SearchResult}; +use milli::{ + obkv_to_json, CompressionType, FilterCondition, Index, MatchingWords, SearchResult, SortError, +}; use once_cell::sync::OnceCell; use rayon::ThreadPool; use serde::{Deserialize, Serialize}; @@ -756,7 +758,7 @@ async fn main() -> anyhow::Result<()> { } if let Some(sort) = query.sort { - search.sort_criteria(vec![sort.parse().unwrap()]); + search.sort_criteria(vec![sort.parse().map_err(SortError::from).unwrap()]); } let SearchResult { matching_words, candidates, documents_ids } = From 19884162957e6c71eb95d4b5a1f989be442c1a1c Mon Sep 17 00:00:00 2001 From: many Date: Tue, 28 Sep 2021 12:05:11 +0200 Subject: [PATCH 1030/1889] Add failing test related to Meilisearch#1714 --- milli/src/update/index_documents/mod.rs | 37 +++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b00dbf375..498a2a85d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -981,4 +981,41 @@ mod tests { let count = index.number_of_documents(&rtxn).unwrap(); assert_eq!(count, 4); } + + #[test] + fn test_meilisearch_1714() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let content = documents!([ + {"id": "123", "title": "小化妆包" }, + {"id": "456", "title": "Ipad 包" } + ]); + + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // Only the first document should match. + let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len(); + assert_eq!(count, 1); + + // Only the second document should match. 
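+ // ("包" appears in both titles, but it is only indexed as a standalone
+ // word for the second one. The query below is also the byte/char edge
+ // case addressed right after this patch: "化妆包" is 3 chars but 9 bytes
+ // in UTF-8, so a byte-based length presumably granted it a two-typo
+ // budget where a char count keeps it in the exact-match bucket.)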
+ let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len(); + assert_eq!(count, 1); + + let mut search = crate::Search::new(&rtxn, &index); + search.query("化妆包"); + search.authorize_typos(true); + search.optional_words(true); + + // only 1 document should be returned + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids.len(), 1); + } } From 8046ae4bd570b092a98c424dfbd4f5ac3e0678cc Mon Sep 17 00:00:00 2001 From: many Date: Tue, 28 Sep 2021 12:10:43 +0200 Subject: [PATCH 1031/1889] Count the number of char instead of counting bytes to assign the typo tolerance --- milli/src/search/query_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 8fa24b9d3..0744231ae 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -262,7 +262,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result QueryKind { if authorize_typos { - match word.len() { + match word.chars().count() { 0..=4 => QueryKind::exact(word), 5..=8 => QueryKind::tolerant(1, word), _ => QueryKind::tolerant(2, word), From a80dcfd4a3a047f98e9571231e439f7406f0beea Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Sep 2021 14:32:24 +0200 Subject: [PATCH 1032/1889] improve error message for bad sort syntax with geosearch --- milli/src/asc_desc.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index 5adef782f..b8323292c 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -141,6 +141,7 @@ impl FromStr for AscDesc { #[derive(Debug)] pub enum SortError { + BadGeoPointUsage { name: String }, InvalidName { name: String }, ReservedName { name: String }, ReservedNameForSettings { name: String }, @@ -151,6 +152,9 @@ impl From for SortError { fn from(error: AscDescError) -> Self { match error { AscDescError::InvalidSyntax { name } => SortError::InvalidName { name }, + AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => { + SortError::BadGeoPointUsage { name } + } AscDescError::ReservedKeyword { name } if &name == "_geo" => { SortError::ReservedNameForSettings { name: "_geoPoint".to_string() } } @@ -165,6 +169,14 @@ impl From for SortError { impl fmt::Display for SortError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + Self::BadGeoPointUsage { name } => { + write!( + f, + "invalid syntax for the `_geoPoint` parameter: `{}`. 
\ +Usage: `_geoPoint(latitude, longitude):asc`", + name + ) + } Self::InvalidName { name } => { write!(f, "invalid syntax for the sort parameter {}", name) } From 3580b2d803fcac029ff57f08a2108610335357b0 Mon Sep 17 00:00:00 2001 From: Vishnu Ganesan Date: Tue, 28 Sep 2021 19:30:23 +0530 Subject: [PATCH 1033/1889] Fixes #365 --- milli/src/error.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 723d5c4c2..6bd8604fc 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -247,8 +247,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco ) } Self::SortRankingRuleMissing => f.write_str( - "The sort ranking rule must be specified in the \ - ranking rules settings to use the sort parameter at search time", + "You must specify where \"sort\" is listed in the \ +rankingRules settings to use the sort parameter at search time", ), Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); From 785c1372f28f6db7685f5f5cb6bc62bc4e9f2894 Mon Sep 17 00:00:00 2001 From: Vishnu Gt Date: Tue, 28 Sep 2021 20:11:32 +0530 Subject: [PATCH 1034/1889] Change "settings" to "setting" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/error.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 6bd8604fc..1f1cc5264 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -248,7 +248,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco } Self::SortRankingRuleMissing => f.write_str( "You must specify where \"sort\" is listed in the \ -rankingRules settings to use the sort parameter at search time", +rankingRules setting to use the sort parameter at search time", ), Self::MissingDocumentId { document } => { let json = serde_json::to_string(document).unwrap(); From f65153ad6454317213680e9a9a908ec78d5645a7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 Sep 2021 18:35:54 +0200 Subject: [PATCH 1035/1889] stop casting integer docids to string --- milli/src/update/index_documents/mod.rs | 6 +++--- milli/src/update/index_documents/transform.rs | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 498a2a85d..e61fbd6b4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -536,7 +536,7 @@ mod tests { // Check that this document is equal to the last one sent. let mut doc_iter = doc.iter(); - assert_eq!(doc_iter.next(), Some((0, &br#""1""#[..]))); + assert_eq!(doc_iter.next(), Some((0, &b"1"[..]))); assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..]))); assert_eq!(doc_iter.next(), None); drop(rtxn); @@ -562,9 +562,9 @@ mod tests { // Check that this document is equal to the last one sent. 
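// (Field values are stored as raw JSON bytes in the obkv document, so after
// this patch the docid stays the JSON number `1`, i.e. `b"1"`, instead of
// being re-serialized as the JSON string `"1"`, i.e. `br#""1""#`.)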
let mut doc_iter = doc.iter(); - assert_eq!(doc_iter.next(), Some((0, &br#""1""#[..]))); + assert_eq!(doc_iter.next(), Some((0, &b"1"[..]))); assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..]))); - assert_eq!(doc_iter.next(), Some((2, &br#"25"#[..]))); + assert_eq!(doc_iter.next(), Some((2, &b"25"[..]))); assert_eq!(doc_iter.next(), None); drop(rtxn); } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index fc5eb2c84..8d656e50c 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -171,7 +171,6 @@ impl Transform<'_, '_> { } }; serde_json::to_writer(&mut external_id_buffer, &value).unwrap(); - *bytes = &external_id_buffer; Cow::Owned(value) } None => { From 0e8665bf1801fd27189b438efc17bc5c7f2ee67c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 28 Sep 2021 19:38:12 +0200 Subject: [PATCH 1036/1889] Update version for the next release (v0.17.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index d6f4ff004..bb8a35d12 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.16.0" +version = "0.17.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 72744469a..33b5fe3b8 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.16.0" +version = "0.17.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 9f3460526..2a3ca0bd3 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.16.0" +version = "0.17.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f4e5a7caa..19027d49e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.16.0" +version = "0.17.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 696dfd018..d570ce423 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.16.0" +version = "0.17.0" authors = ["Clément Renault "] edition = "2018" From d2427f18e5d802f7309c2559571749a7ec9d52b8 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 28 Sep 2021 15:58:36 +0200 Subject: [PATCH 1037/1889] Enhance CSV document parsing --- benchmarks/benches/utils.rs | 78 +++++++- http-ui/src/documents_from_csv.rs | 285 ++++++++++++++++++++++++++++++ http-ui/src/main.rs | 5 +- 3 files changed, 364 insertions(+), 4 deletions(-) create mode 100644 http-ui/src/documents_from_csv.rs diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index e5bdbdfaa..24f5d5343 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -2,6 +2,7 @@ use std::fs::{create_dir_all, remove_dir_all, File}; use std::io::{self, Cursor, Read, Seek}; +use std::num::ParseFloatError; use std::path::Path; use criterion::BenchmarkId; @@ -175,8 +176,7 @@ fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut 
writer)?; - let mut records = csv::Reader::from_reader(reader); - let iter = records.deserialize::>(); + let iter = CSVDocumentDeserializer::from_reader(reader)?; for doc in iter { let doc = doc?; @@ -187,3 +187,77 @@ fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { Ok(writer.into_inner()) } + +enum AllowedType { + String, + Number, +} + +fn parse_csv_header(header: &str) -> (String, AllowedType) { + // if there are several separators we only split on the last one. + match header.rsplit_once(':') { + Some((field_name, field_type)) => match field_type { + "string" => (field_name.to_string(), AllowedType::String), + "number" => (field_name.to_string(), AllowedType::Number), + // we may return an error in this case. + _otherwise => (header.to_string(), AllowedType::String), + }, + None => (header.to_string(), AllowedType::String), + } +} + +struct CSVDocumentDeserializer +where + R: Read, +{ + documents: csv::StringRecordsIntoIter, + headers: Vec<(String, AllowedType)>, +} + +impl CSVDocumentDeserializer { + fn from_reader(reader: R) -> io::Result { + let mut records = csv::Reader::from_reader(reader); + + let headers = records.headers()?.into_iter().map(parse_csv_header).collect(); + + Ok(Self { documents: records.into_records(), headers }) + } +} + +impl Iterator for CSVDocumentDeserializer { + type Item = anyhow::Result>; + + fn next(&mut self) -> Option { + let csv_document = self.documents.next()?; + + match csv_document { + Ok(csv_document) => { + let mut document = Map::new(); + + for ((field_name, field_type), value) in + self.headers.iter().zip(csv_document.into_iter()) + { + let parsed_value: Result = match field_type { + AllowedType::Number => { + value.parse::().map(Value::from).map_err(Into::into) + } + AllowedType::String => Ok(Value::String(value.to_string())), + }; + + match parsed_value { + Ok(value) => drop(document.insert(field_name.to_string(), value)), + Err(_e) => { + return Some(Err(anyhow::anyhow!( + "Value '{}' is not a valid number", + value + ))) + } + } + } + + Some(Ok(document)) + } + Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))), + } + } +} diff --git a/http-ui/src/documents_from_csv.rs b/http-ui/src/documents_from_csv.rs new file mode 100644 index 000000000..2b62f23c2 --- /dev/null +++ b/http-ui/src/documents_from_csv.rs @@ -0,0 +1,285 @@ +use std::io::{Read, Result as IoResult}; +use std::num::ParseFloatError; + +use serde_json::{Map, Value}; + +enum AllowedType { + String, + Number, +} + +fn parse_csv_header(header: &str) -> (String, AllowedType) { + // if there are several separators we only split on the last one. + match header.rsplit_once(':') { + Some((field_name, field_type)) => match field_type { + "string" => (field_name.to_string(), AllowedType::String), + "number" => (field_name.to_string(), AllowedType::Number), + // we may return an error in this case. 
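+ // A quick illustration of the `rsplit_once` choice above: splitting on the
+ // last `:` keeps colons inside field names intact, so "city:love:string"
+ // parses as a string field named "city:love", while a bare "city:love"
+ // lands in the arm below ("love" is not a known type) and the whole header
+ // is kept as a string field; the unit tests below cover the full matrix.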
+ _otherwise => (header.to_string(), AllowedType::String), + }, + None => (header.to_string(), AllowedType::String), + } +} + +pub struct CSVDocumentDeserializer +where + R: Read, +{ + documents: csv::StringRecordsIntoIter, + headers: Vec<(String, AllowedType)>, +} + +impl CSVDocumentDeserializer { + pub fn from_reader(reader: R) -> IoResult { + let mut records = csv::Reader::from_reader(reader); + + let headers = records.headers()?.into_iter().map(parse_csv_header).collect(); + + Ok(Self { documents: records.into_records(), headers }) + } +} + +impl Iterator for CSVDocumentDeserializer { + type Item = anyhow::Result>; + + fn next(&mut self) -> Option { + let csv_document = self.documents.next()?; + + match csv_document { + Ok(csv_document) => { + let mut document = Map::new(); + + for ((field_name, field_type), value) in + self.headers.iter().zip(csv_document.into_iter()) + { + let parsed_value: Result = match field_type { + AllowedType::Number => { + value.parse::().map(Value::from).map_err(Into::into) + } + AllowedType::String => Ok(Value::String(value.to_string())), + }; + + match parsed_value { + Ok(value) => drop(document.insert(field_name.to_string(), value)), + Err(_e) => { + return Some(Err(anyhow::anyhow!( + "Value '{}' is not a valid number", + value + ))) + } + } + } + + Some(Ok(document)) + } + Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))), + } + } +} + +#[cfg(test)] +mod test { + use serde_json::json; + + use super::*; + + #[test] + fn simple_csv_document() { + let documents = r#"city,country,pop +"Boston","United States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn coma_in_field() { + let documents = r#"city,country,pop +"Boston","United, States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "city": "Boston", + "country": "United, States", + "pop": "4628910", + }) + ); + } + + #[test] + fn quote_in_field() { + let documents = r#"city,country,pop +"Boston","United"" States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "city": "Boston", + "country": "United\" States", + "pop": "4628910", + }) + ); + } + + #[test] + fn integer_in_field() { + let documents = r#"city,country,pop:number +"Boston","United States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.0, + }) + ); + } + + #[test] + fn float_in_field() { + let documents = r#"city,country,pop:number +"Boston","United States","4628910.01""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.01, + }) + ); + } + + #[test] + fn several_double_dot_in_header() { + let documents = r#"city:love:string,country:state,pop +"Boston","United States","4628910""#; + + let mut csv_iter = 
CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "city:love": "Boston", + "country:state": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn ending_by_double_dot_in_header() { + let documents = r#"city:,country,pop +"Boston","United States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn starting_by_double_dot_in_header() { + let documents = r#":city,country,pop +"Boston","United States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + ":city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn starting_by_double_dot_in_header2() { + let documents = r#":string,country,pop +"Boston","United States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn double_double_dot_in_header() { + let documents = r#"city::string,country,pop +"Boston","United States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert_eq!( + Value::Object(csv_iter.next().unwrap().unwrap()), + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn bad_type_in_header() { + let documents = r#"city,country:number,pop +"Boston","United States","4628910""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert!(csv_iter.next().unwrap().is_err()); + } + + #[test] + fn bad_column_count1() { + let documents = r#"city,country,pop +"Boston","United States","4628910", "too much""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert!(csv_iter.next().unwrap().is_err()); + } + + #[test] + fn bad_column_count2() { + let documents = r#"city,country,pop +"Boston","United States""#; + + let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); + + assert!(csv_iter.next().unwrap().is_err()); + } +} diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b76547309..9efdd1371 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,3 +1,4 @@ +mod documents_from_csv; mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; @@ -38,6 +39,7 @@ use warp::http::Response; use warp::Filter; use self::update_store::UpdateStore; +use crate::documents_from_csv::CSVDocumentDeserializer; #[cfg(target_os = "linux")] #[global_allocator] @@ -1056,8 +1058,7 @@ fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let mut records = csv::Reader::from_reader(reader); - let iter = records.deserialize::>(); + let iter = CSVDocumentDeserializer::from_reader(reader)?; for doc in iter { let doc = doc?; From 1df5b8712bb0932e1dc524af545ec445b2e2af6e Mon Sep 17 00:00:00 2001 From: many Date: Wed, 29 Sep 2021 
14:41:56 +0200 Subject: [PATCH 1038/1889] Hotfix meilisearch#1707 --- milli/src/search/criteria/attribute.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 6e0bb40d5..468f2ce32 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -192,7 +192,10 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { in_prefix_cache: bool, ) -> heed::Result> { match ctx.word_position_last_level(&word, in_prefix_cache)? { - Some(level) => { + Some(_) => { + // HOTFIX Meilisearch#1707: it is better to only iterate over the level 0. + // A cleaner fix will be implemented soon. + let level = TreeLevel::min_value(); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; @@ -528,10 +531,10 @@ impl<'t, 'q> Branch<'t, 'q> { fn cmp(&self, other: &Self) -> Ordering { let self_rank = self.compute_rank(); let other_rank = other.compute_rank(); - let left_cmp = self_rank.cmp(&other_rank).reverse(); + let left_cmp = self_rank.cmp(&other_rank); // on level: lower is better, // we want to dig faster into levels on interesting branches. - let level_cmp = self.tree_level.cmp(&other.tree_level).reverse(); + let level_cmp = self.tree_level.cmp(&other.tree_level); left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len())) } From 7ad0214089f7c2a1a469c2e25e727f819040cd81 Mon Sep 17 00:00:00 2001 From: Many Date: Wed, 29 Sep 2021 14:49:41 +0200 Subject: [PATCH 1039/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 468f2ce32..8703bb268 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -194,7 +194,6 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { match ctx.word_position_last_level(&word, in_prefix_cache)? { Some(_) => { // HOTFIX Meilisearch#1707: it is better to only iterate over the level 0. - // A cleaner fix will be implemented soon. let level = TreeLevel::min_value(); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let inner = From 2e49230ca24eb4fa455df79fc33c864169dc0992 Mon Sep 17 00:00:00 2001 From: Many Date: Wed, 29 Sep 2021 14:49:45 +0200 Subject: [PATCH 1040/1889] Update milli/src/search/criteria/attribute.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/attribute.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 8703bb268..5eb1311a1 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -193,7 +193,7 @@ impl<'t, 'q> WordLevelIterator<'t, 'q> { ) -> heed::Result> { match ctx.word_position_last_level(&word, in_prefix_cache)? { Some(_) => { - // HOTFIX Meilisearch#1707: it is better to only iterate over the level 0. + // HOTFIX Meilisearch#1707: it is better to only iterate over level 0 for performances reasons. 
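// (With the `LEVEL_EXPONENTIATION_BASE` of 4 defined at the top of this
// file, pinning the level to its minimum value, level 0, yields an interval
// size of 4^0 = 1: the iterator walks one word position at a time.)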
let level = TreeLevel::min_value(); let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); let inner = From 26b5dad042373f87c585f02a4483af94e15bf849 Mon Sep 17 00:00:00 2001 From: Many Date: Wed, 29 Sep 2021 15:08:39 +0200 Subject: [PATCH 1041/1889] Revert "Change chunk size to 4MiB to fit more the end user usage" --- milli/src/update/index_documents/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e61fbd6b4..30ee49893 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -248,7 +248,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let chunk_iter = grenad_obkv_into_chunks( documents_file, params.clone(), - self.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB + self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB ); let result = chunk_iter.map(|chunk_iter| { From 0ee67bb7d174f1ef974bb578d565914322a535a2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 30 Sep 2021 12:28:40 +0200 Subject: [PATCH 1042/1889] improve the reserved keyword error message for the filters --- milli/src/search/facet/filter_condition.rs | 97 +++++++++++++++++----- milli/src/search/facet/grammar.pest | 2 +- 2 files changed, 79 insertions(+), 20 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 1e5bf9ad0..f0a51fe0a 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -600,24 +600,33 @@ fn field_id( // lexing ensures that we at least have a key let key = items.next().unwrap(); if key.as_rule() == Rule::reserved { - return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "`{}` is a reserved keyword and therefore can't be used as a filter expression. \ - Available filterable attributes are: {}", - key.as_str(), - filterable_fields.iter().join(", "), - ), - }, - key.as_span(), - )); + let message = match key.as_str() { + key if key.starts_with("_geoPoint") => { + format!( + "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. \ + Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", + ) + } + key @ "_geo" => { + format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression. 
\ + Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", + key + ) + } + key => format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression.", + key + ), + }; + return Err(PestError::new_from_span(ErrorVariant::CustomError { message }, key.as_span())); } if !filterable_fields.contains(key.as_str()) { return Err(PestError::new_from_span( ErrorVariant::CustomError { message: format!( - "attribute `{}` is not filterable, available filterable attributes are: {}", + "attribute `{}` is not filterable, available filterable attributes are: {}.", key.as_str(), filterable_fields.iter().join(", "), ), @@ -688,13 +697,6 @@ mod tests { let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); - - let result = FilterCondition::from_str(&rtxn, &index, "_geo = France"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains( - "`_geo` is a reserved keyword and therefore can't be used as a filter expression." - )); } #[test] @@ -777,6 +779,49 @@ mod tests { assert_eq!(condition, expected); } + #[test] + fn reserved_field_names() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let rtxn = index.read_txn().unwrap(); + + let error = FilterCondition::from_str(&rtxn, &index, "_geo = 12").unwrap_err(); + assert!(error + .to_string() + .contains("`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + + let error = + FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoDistance` is a reserved keyword and thus can't be used as a filter expression."), + "{}", + error.to_string() + ); + + let error = FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + + let error = + FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + } + #[test] fn geo_radius() { let path = tempfile::tempdir().unwrap(); @@ -788,6 +833,20 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // _geo is not filterable + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 12, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("attribute `_geo` is not filterable, available filterable attributes are:"),); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest index d07d5bca5..8bfdeb667 100644 --- a/milli/src/search/facet/grammar.pest +++ b/milli/src/search/facet/grammar.pest @@ -8,7 +8,7 @@ char = _{ !(PEEK | "\\") ~ ANY | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} -reserved = { "_geo" | "_geoDistance" | "_geoPoint" | ("_geoPoint" ~ parameters) } +reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" } // we deliberately choose to allow empty parameters to generate more specific error message later parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} condition = _{between | eq | greater | less | geq | leq | neq} From d9eba9d1451a89ad98d171d1ea8c2ad5f381df1b Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 30 Sep 2021 12:50:40 +0200 Subject: [PATCH 1043/1889] improve and test the sort error message --- milli/src/asc_desc.rs | 70 ++++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index b8323292c..09bd0082a 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -20,12 +20,12 @@ impl fmt::Display for AscDescError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::InvalidSyntax { name } => { - write!(f, "invalid asc/desc syntax for {}", name) + write!(f, "invalid asc/desc syntax for {}.", name) } Self::ReservedKeyword { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a asc/desc rule", + "{} is a reserved keyword and thus can't be used as a asc/desc rule.", name ) } @@ -128,8 +128,6 @@ impl AscDesc { impl FromStr for AscDesc { type Err = AscDescError; - /// Since we don't know if this was deserialized for a criterion or a sort we just return a - /// string and let the caller create his own error. 
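// (For reference, `from_str` below splits on the *last* `:`, so "price:asc"
// parses as `Asc(Member::Field("price"))`, and "_geoPoint(42, 75):asc"
// keeps the comma-separated coordinates intact on the left-hand side.)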
fn from_str(text: &str) -> Result { match text.rsplit_once(':') { Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)), @@ -156,10 +154,10 @@ impl From for SortError { SortError::BadGeoPointUsage { name } } AscDescError::ReservedKeyword { name } if &name == "_geo" => { - SortError::ReservedNameForSettings { name: "_geoPoint".to_string() } + SortError::ReservedNameForSettings { name } } AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => { - SortError::ReservedNameForFilter { name: "_geoRadius".to_string() } + SortError::ReservedNameForFilter { name: String::from("_geoRadius") } } AscDescError::ReservedKeyword { name } => SortError::ReservedName { name }, } @@ -173,34 +171,26 @@ impl fmt::Display for SortError { write!( f, "invalid syntax for the `_geoPoint` parameter: `{}`. \ -Usage: `_geoPoint(latitude, longitude):asc`", + Usage: `_geoPoint(latitude, longitude):asc`.", name ) } Self::InvalidName { name } => { - write!(f, "invalid syntax for the sort parameter {}", name) + write!(f, "invalid syntax for the sort parameter `{}`.", name) } Self::ReservedName { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a sort expression", + "{} is a reserved keyword and thus can't be used as a sort expression.", name ) } - Self::ReservedNameForSettings { name } => { + Self::ReservedNameForSettings { name } | Self::ReservedNameForFilter { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a sort expression. \ -{} can only be used in the settings", - name, name - ) - } - Self::ReservedNameForFilter { name } => { - write!( - f, - "{} is a reserved keyword and thus can't be used as a sort expression. \ -{} can only be used for filtering at search time", - name, name + "`{}` is a reserved keyword and thus can't be used as a sort expression. \ + Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.", + name, ) } } @@ -299,4 +289,42 @@ mod tests { ); } } + + #[test] + fn sort_error_message() { + let errors = [ + ( + AscDescError::InvalidSyntax { name: S("truc:machin") }, + S("invalid syntax for the sort parameter `truc:machin`."), + ), + ( + AscDescError::InvalidSyntax { name: S("hello:world") }, + S("invalid syntax for the sort parameter `hello:world`."), + ), + ( + AscDescError::ReservedKeyword { name: S("_geo") }, + S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), + ), + ( + AscDescError::ReservedKeyword { name: S("_geoDistance") }, + S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.") + ), + ( + AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") }, + S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. 
Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), + ), + ]; + + for (asc_desc_error, expected_message) in errors { + let sort_error = SortError::from(asc_desc_error); + assert_eq!( + sort_error.to_string(), + expected_message, + "was expecting {} for the error {:?} but instead got {}", + expected_message, + sort_error, + sort_error.to_string() + ); + } + } } From 05d8a33a284c0340984ea78d208c132ac0fdc3cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Sat, 2 Oct 2021 16:21:31 +0200 Subject: [PATCH 1044/1889] Update version for the next release (v0.17.1) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index bb8a35d12..041d35099 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.17.0" +version = "0.17.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 33b5fe3b8..afe29eaab 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.17.0" +version = "0.17.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 2a3ca0bd3..fbc4993a5 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.17.0" +version = "0.17.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 19027d49e..bf001e155 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.17.0" +version = "0.17.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index d570ce423..900b1f50a 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.17.0" +version = "0.17.1" authors = ["Clément Renault "] edition = "2018" From 75d341d92898b5ed1c63dd555b343dabb875ab26 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 4 Oct 2021 16:36:11 +0200 Subject: [PATCH 1045/1889] Re-implement set based algorithm for attribute criterion --- milli/src/search/criteria/attribute.rs | 625 +++++++++---------------- 1 file changed, 218 insertions(+), 407 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 5eb1311a1..0e589dd92 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -1,7 +1,7 @@ -use std::borrow::Cow; use std::cmp::{self, Ordering}; use std::collections::binary_heap::PeekMut; use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap}; +use std::iter::Peekable; use std::mem::take; use roaring::RoaringBitmap; @@ -17,10 +17,6 @@ use crate::{Result, TreeLevel}; /// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). const LCM_10_FIRST_NUMBERS: u32 = 2520; -/// To compute the interval size of a level, -/// we use 4 as the exponentiation base and the level as the exponent. -const LEVEL_EXPONENTIATION_BASE: u32 = 4; - /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. 
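// Editor's note: in the criterion below, candidate sets smaller than this
// threshold go through the linear buckets path, while larger sets take the
// new set-based RoaringBitmap path; 1000 is the heuristic crossover point.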
const CANDIDATES_THRESHOLD: u64 = 1000; @@ -32,7 +28,8 @@ pub struct Attribute<'t> { state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, bucket_candidates: RoaringBitmap, parent: Box, - current_buckets: Option>, + linear_buckets: Option>, + set_buckets: Option>>, } impl<'t> Attribute<'t> { @@ -42,7 +39,8 @@ impl<'t> Attribute<'t> { state: None, bucket_candidates: RoaringBitmap::new(), parent, - current_buckets: None, + linear_buckets: None, + set_buckets: None, } } } @@ -67,19 +65,19 @@ impl<'t> Criterion for Attribute<'t> { } Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { - let current_buckets = match self.current_buckets.as_mut() { - Some(current_buckets) => current_buckets, + let linear_buckets = match self.linear_buckets.as_mut() { + Some(linear_buckets) => linear_buckets, None => { - let new_buckets = linear_compute_candidates( + let new_buckets = initialize_linear_buckets( self.ctx, &flattened_query_tree, &allowed_candidates, )?; - self.current_buckets.get_or_insert(new_buckets.into_iter()) + self.linear_buckets.get_or_insert(new_buckets.into_iter()) } }; - match current_buckets.next() { + match linear_buckets.next() { Some((_score, candidates)) => candidates, None => { return Ok(Some(CriterionResult { @@ -91,13 +89,21 @@ impl<'t> Criterion for Attribute<'t> { } } } else { - match set_compute_candidates( - self.ctx, - &flattened_query_tree, - &allowed_candidates, - params.wdcache, - )? { - Some(candidates) => candidates, + let mut set_buckets = match self.set_buckets.as_mut() { + Some(set_buckets) => set_buckets, + None => { + let new_buckets = initialize_set_buckets( + self.ctx, + &flattened_query_tree, + &allowed_candidates, + params.wdcache, + )?; + self.set_buckets.get_or_insert(new_buckets) + } + }; + + match set_compute_candidates(&mut set_buckets, &allowed_candidates)? { + Some((_score, candidates)) => candidates, None => { return Ok(Some(CriterionResult { query_tree: Some(query_tree), @@ -148,7 +154,7 @@ impl<'t> Criterion for Attribute<'t> { } self.state = Some((query_tree, flattened_query_tree, candidates)); - self.current_buckets = None; + self.linear_buckets = None; } Some(CriterionResult { query_tree: None, @@ -170,142 +176,52 @@ impl<'t> Criterion for Attribute<'t> { } } -/// WordLevelIterator is an pseudo-Iterator over intervals of word-position for one word, -/// it will begin at the first non-empty interval and will return every interval without -/// jumping over empty intervals. -struct WordLevelIterator<'t, 'q> { - inner: Box< - dyn Iterator> + 't, - >, - level: TreeLevel, - interval_size: u32, - word: Cow<'q, str>, - in_prefix_cache: bool, - inner_next: Option<(u32, u32, RoaringBitmap)>, - current_interval: Option<(u32, u32)>, -} - -impl<'t, 'q> WordLevelIterator<'t, 'q> { - fn new( - ctx: &'t dyn Context<'t>, - word: Cow<'q, str>, - in_prefix_cache: bool, - ) -> heed::Result> { - match ctx.word_position_last_level(&word, in_prefix_cache)? { - Some(_) => { - // HOTFIX Meilisearch#1707: it is better to only iterate over level 0 for performances reasons. 
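// Editor's note: staying on level 0 means one position per entry, never the
// coarser grouped levels; the set-based rewrite in this patch goes further
// and deletes the whole `WordLevelIterator`, keeping only plain position
// iterators pinned to the lowest level.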
- let level = TreeLevel::min_value(); - let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); - let inner = - ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; - Ok(Some(Self { - inner, - level, - interval_size, - word, - in_prefix_cache, - inner_next: None, - current_interval: None, - })) - } - None => Ok(None), - } - } - - fn dig( - &self, - ctx: &'t dyn Context<'t>, - level: &TreeLevel, - left_interval: Option, - ) -> heed::Result { - let level = *level.min(&self.level); - let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::::into(level) as u32); - let word = self.word.clone(); - let in_prefix_cache = self.in_prefix_cache; - let inner = - ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; - - Ok(Self { - inner, - level, - interval_size, - word, - in_prefix_cache, - inner_next: None, - current_interval: None, - }) - } - - fn next(&mut self) -> heed::Result> { - fn is_next_interval(last_right: u32, next_left: u32) -> bool { - last_right + 1 == next_left - } - - let inner_next = match self.inner_next.take() { - Some(inner_next) => Some(inner_next), - None => self - .inner - .next() - .transpose()? - .map(|((_, _, left, right), docids)| (left, right, docids)), - }; - - match inner_next { - Some((left, right, docids)) => match self.current_interval { - Some((last_left, last_right)) if !is_next_interval(last_right, left) => { - let blank_left = last_left + self.interval_size; - let blank_right = last_right + self.interval_size; - self.current_interval = Some((blank_left, blank_right)); - self.inner_next = Some((left, right, docids)); - Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) - } - _ => { - self.current_interval = Some((left, right)); - Ok(Some((left, right, docids))) - } - }, - None => Ok(None), - } - } -} - /// QueryLevelIterator is an pseudo-Iterator for a Query, /// It contains WordLevelIterators and is chainned with other QueryLevelIterator. -struct QueryLevelIterator<'t, 'q> { - parent: Option>>, - inner: Vec>, - level: TreeLevel, - accumulator: Vec>, - parent_accumulator: Vec>, - interval_to_skip: usize, +struct QueryLevelIterator<'t> { + inner: Vec< + Peekable< + Box< + dyn Iterator> + + 't, + >, + >, + >, } -impl<'t, 'q> QueryLevelIterator<'t, 'q> { +impl<'t> QueryLevelIterator<'t> { fn new( ctx: &'t dyn Context<'t>, - queries: &'q [Query], + queries: &[Query], wdcache: &mut WordDerivationsCache, - ) -> Result> { + ) -> Result { let mut inner = Vec::with_capacity(queries.len()); for query in queries { + let in_prefix_cache = query.prefix && ctx.in_prefix_cache(query.kind.word()); match &query.kind { QueryKind::Exact { word, .. } => { - if !query.prefix || ctx.in_prefix_cache(&word) { - let word = Cow::Borrowed(query.kind.word()); - if let Some(word_level_iterator) = - WordLevelIterator::new(ctx, word, query.prefix)? - { - inner.push(word_level_iterator); - } + if !query.prefix || in_prefix_cache { + let iter = ctx.word_position_iterator( + query.kind.word(), + TreeLevel::min_value(), + in_prefix_cache, + None, + None, + )?; + + inner.push(iter.peekable()); } else { for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { - let word = Cow::Owned(word.to_owned()); - if let Some(word_level_iterator) = - WordLevelIterator::new(ctx, word, false)? 
- { - inner.push(word_level_iterator); - } + let iter = ctx.word_position_iterator( + &word, + TreeLevel::min_value(), + in_prefix_cache, + None, + None, + )?; + + inner.push(iter.peekable()); } } } @@ -313,360 +229,255 @@ impl<'t, 'q> QueryLevelIterator<'t, 'q> { for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { - let word = Cow::Owned(word.to_owned()); - if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? - { - inner.push(word_level_iterator); - } + let iter = ctx.word_position_iterator( + &word, + TreeLevel::min_value(), + in_prefix_cache, + None, + None, + )?; + + inner.push(iter.peekable()); } } - } + }; } - let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level); - match highest { - Some(level) => Ok(Some(Self { - parent: None, - inner, - level, - accumulator: vec![], - parent_accumulator: vec![], - interval_to_skip: 0, - })), - None => Ok(None), - } - } - - fn parent(&mut self, parent: QueryLevelIterator<'t, 'q>) -> &Self { - self.parent = Some(Box::new(parent)); - self - } - - /// create a new QueryLevelIterator with a lower level than the current one. - fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result { - let (level, parent) = match &self.parent { - Some(parent) => { - let parent = parent.dig(ctx)?; - (parent.level.min(self.level), Some(Box::new(parent))) - } - None => (self.level.saturating_sub(1), None), - }; - - let left_interval = self - .accumulator - .get(self.interval_to_skip) - .map(|opt| opt.as_ref().map(|(left, _, _)| *left)) - .flatten(); - let mut inner = Vec::with_capacity(self.inner.len()); - for word_level_iterator in self.inner.iter() { - inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); - } - - Ok(Self { - parent, - inner, - level, - accumulator: vec![], - parent_accumulator: vec![], - interval_to_skip: 0, - }) - } - - fn inner_next(&mut self, level: TreeLevel) -> heed::Result> { - let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; - let u8_level = Into::::into(level); - let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); - for wli in self.inner.iter_mut() { - let wli_u8_level = Into::::into(wli.level); - let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); - for _ in 0..accumulated_count { - if let Some((next_left, _, next_docids)) = wli.next()? { - accumulated = match accumulated.take() { - Some((acc_left, acc_right, mut acc_docids)) => { - acc_docids |= next_docids; - Some((acc_left, acc_right, acc_docids)) - } - None => Some((next_left, next_left + interval_size, next_docids)), - }; - } - } - } - - Ok(accumulated) - } - - /// return the next meta-interval created from inner WordLevelIterators, - /// and from eventual chainned QueryLevelIterator. 
- fn next( - &mut self, - allowed_candidates: &RoaringBitmap, - tree_level: TreeLevel, - ) -> heed::Result> { - let parent_result = match self.parent.as_mut() { - Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), - None => None, - }; - - match parent_result { - Some(parent_next) => { - let inner_next = self.inner_next(tree_level)?; - self.interval_to_skip += interval_to_skip( - &self.parent_accumulator, - &self.accumulator, - self.interval_to_skip, - allowed_candidates, - ); - self.accumulator.push(inner_next); - self.parent_accumulator.push(parent_next); - let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; - - for current in self - .accumulator - .iter() - .rev() - .zip(self.parent_accumulator.iter()) - .skip(self.interval_to_skip) - { - if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { - match merged_interval.as_mut() { - Some((_, _, merged_docids)) => *merged_docids |= a & b, - None => { - merged_interval = Some((left_a + left_b, right_a + right_b, a & b)) - } - } - } - } - Ok(merged_interval) - } - None => { - let level = self.level; - match self.inner_next(level)? { - Some((left, right, mut candidates)) => { - self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; - candidates &= allowed_candidates; - Ok(Some((left, right, candidates))) - } - None => { - self.accumulator = vec![None]; - Ok(None) - } - } - } - } + Ok(Self { inner }) } } -/// Count the number of interval that can be skiped when we make the cross-intersections -/// in order to compute the next meta-interval. -/// A pair of intervals is skiped when both intervals doesn't contain any allowed docids. -fn interval_to_skip( - parent_accumulator: &[Option<(u32, u32, RoaringBitmap)>], - current_accumulator: &[Option<(u32, u32, RoaringBitmap)>], - already_skiped: usize, - allowed_candidates: &RoaringBitmap, -) -> usize { - parent_accumulator - .iter() - .zip(current_accumulator.iter()) - .skip(already_skiped) - .take_while(|(parent, current)| { - let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); - let skip_current = current - .as_ref() - .map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); - skip_parent && skip_current - }) - .count() +impl<'t> Iterator for QueryLevelIterator<'t> { + type Item = heed::Result<(u32, RoaringBitmap)>; + + fn next(&mut self) -> Option { + // sort inner words from the closest next position to the more far next position. + let expected_pos = self + .inner + .iter_mut() + .filter_map(|wli| match wli.peek() { + Some(Ok(((_, _, pos, _), _))) => Some(*pos), + _ => None, + }) + .min()?; + + let mut candidates = None; + for wli in self.inner.iter_mut() { + if let Some(Ok(((_, _, pos, _), _))) = wli.peek() { + if *pos > expected_pos { + continue; + } + } + + match wli.next() { + Some(Ok((_, docids))) => { + candidates = match candidates.take() { + Some(candidates) => Some(candidates | docids), + None => Some(docids), + } + } + Some(Err(e)) => return Some(Err(e)), + None => continue, + } + } + + candidates.map(|candidates| Ok((expected_pos, candidates))) + } } /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, /// This branch allows us to iterate over meta-interval of position and to dig in it if it contains interesting candidates. 
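// Editor's sketch (not part of the patch): the new
// `QueryLevelIterator::next` above is a k-way union-merge over peekable
// position iterators. With a plain `u64` standing in for `RoaringBitmap`,
// the same pattern looks like this:
fn union_lowest_position(
    iters: &mut [std::iter::Peekable<std::vec::IntoIter<(u32, u64)>>],
) -> Option<(u32, u64)> {
    // Find the smallest pending position across all word iterators.
    let pos = iters.iter_mut().filter_map(|it| it.peek().map(|&(p, _)| p)).min()?;
    let mut acc = 0u64; // stand-in for a `RoaringBitmap` union
    for it in iters.iter_mut() {
        // Only advance the iterators currently sitting on that position.
        if it.peek().map_or(false, |&(p, _)| p == pos) {
            acc |= it.next().unwrap().1;
        }
    }
    Some((pos, acc))
}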
-struct Branch<'t, 'q> { - query_level_iterator: QueryLevelIterator<'t, 'q>, - last_result: (u32, u32, RoaringBitmap), - tree_level: TreeLevel, +struct Branch<'t> { + query_level_iterator: Vec<(u32, RoaringBitmap, Peekable>)>, + last_result: (u32, RoaringBitmap), branch_size: u32, } -impl<'t, 'q> Branch<'t, 'q> { +impl<'t> Branch<'t> { + fn new( + ctx: &'t dyn Context<'t>, + flatten_branch: &[Vec], + wdcache: &mut WordDerivationsCache, + allowed_candidates: &RoaringBitmap, + ) -> Result { + let mut query_level_iterator = Vec::new(); + for queries in flatten_branch { + let mut qli = QueryLevelIterator::new(ctx, queries, wdcache)?.peekable(); + let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new())); + query_level_iterator.push((pos, docids & allowed_candidates, qli)); + } + + let mut branch = Self { + query_level_iterator, + last_result: (0, RoaringBitmap::new()), + branch_size: flatten_branch.len() as u32, + }; + + branch.update_last_result(); + + Ok(branch) + } + /// return the next meta-interval of the branch, /// and update inner interval in order to be ranked by the BinaryHeap. fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result { - let tree_level = self.query_level_iterator.level; - match self.query_level_iterator.next(allowed_candidates, tree_level)? { - Some(last_result) => { - self.last_result = last_result; - self.tree_level = tree_level; - Ok(true) - } - None => Ok(false), + // update the first query. + let index = self.lowest_iterator_index(); + match self.query_level_iterator.get_mut(index) { + Some((cur_pos, cur_docids, qli)) => match qli.next().transpose()? { + Some((next_pos, next_docids)) => { + *cur_pos = next_pos; + *cur_docids |= next_docids & allowed_candidates; + } + None => return Ok(false), + }, + None => return Ok(false), } + + self.update_last_result(); + + Ok(true) } - /// make the current Branch iterate over smaller intervals. - fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> { - self.query_level_iterator = self.query_level_iterator.dig(ctx)?; - Ok(()) + fn lowest_iterator_index(&mut self) -> usize { + let (index, _) = self + .query_level_iterator + .iter_mut() + .map(|(pos, docids, qli)| { + if docids.is_empty() { + 0 + } else { + qli.peek() + .map(|result| { + result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) + }) + .unwrap_or(u32::MAX) + } + }) + .enumerate() + .min_by_key(|(_, diff)| *diff) + .unwrap_or((0, 0)); + + index } - /// because next() method could be time consuming, - /// update inner interval in order to be ranked by the binary_heap without computing it, - /// the next() method should be called when the real interval is needed. 
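// Editor's note: `lazy_next` goes away with the levels. Its replacement is
// `update_last_result` above, which sums the current positions, intersects
// the per-part docids, then subtracts the returned docids from every inner
// accumulator so the same documents cannot be ranked twice.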
- fn lazy_next(&mut self) { - let u8_level = Into::::into(self.tree_level); - let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); - let (left, right, _) = self.last_result; + fn update_last_result(&mut self) { + let mut result_pos = 0; + let mut result_docids = None; - self.last_result = (left + interval_size, right + interval_size, RoaringBitmap::new()); + for (pos, docids, _qli) in self.query_level_iterator.iter() { + result_pos += pos; + result_docids = result_docids + .take() + .map_or_else(|| Some(docids.clone()), |candidates| Some(candidates & docids)); + } + + // remove last result docids from inner iterators + if let Some(docids) = result_docids.as_ref() { + for (_, query_docids, _) in self.query_level_iterator.iter_mut() { + *query_docids -= docids; + } + } + + self.last_result = (result_pos, result_docids.unwrap_or_default()); } /// return the score of the current inner interval. fn compute_rank(&self) -> u32 { - // we compute a rank from the left interval. - let (left, _, _) = self.last_result; - left.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size + // we compute a rank from the position. + let (pos, _) = self.last_result; + pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size } fn cmp(&self, other: &Self) -> Ordering { let self_rank = self.compute_rank(); let other_rank = other.compute_rank(); - let left_cmp = self_rank.cmp(&other_rank); - // on level: lower is better, - // we want to dig faster into levels on interesting branches. - let level_cmp = self.tree_level.cmp(&other.tree_level); - left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len())) + // lower rank is better, and because BinaryHeap give the higher ranked branch, we reverse it. + self_rank.cmp(&other_rank).reverse() } } -impl<'t, 'q> Ord for Branch<'t, 'q> { +impl<'t> Ord for Branch<'t> { fn cmp(&self, other: &Self) -> Ordering { self.cmp(other) } } -impl<'t, 'q> PartialOrd for Branch<'t, 'q> { +impl<'t> PartialOrd for Branch<'t> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'t, 'q> PartialEq for Branch<'t, 'q> { +impl<'t> PartialEq for Branch<'t> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl<'t, 'q> Eq for Branch<'t, 'q> {} +impl<'t> Eq for Branch<'t> {} -fn initialize_query_level_iterators<'t, 'q>( - ctx: &'t dyn Context<'t>, - branches: &'q FlattenedQueryTree, - allowed_candidates: &RoaringBitmap, - wdcache: &mut WordDerivationsCache, -) -> Result>> { - let mut positions = BinaryHeap::with_capacity(branches.len()); - for branch in branches { - let mut branch_positions = Vec::with_capacity(branch.len()); - for queries in branch { - match QueryLevelIterator::new(ctx, queries, wdcache)? { - Some(qli) => branch_positions.push(qli), - None => { - // the branch seems to be invalid, so we skip it. - branch_positions.clear(); - break; - } - } - } - // QueryLevelIterator need to be sorted by level and folded in descending order. 
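// Editor's worked example for `compute_rank` above, with
// LCM_10_FIRST_NUMBERS = 2520: a 3-word branch whose positions sum to
// pos = 9 first subtracts the best possible sum 0 + 1 + 2 = 3, leaving 6,
// then scales: 6 * 2520 / 3 = 5040. Multiplying by the LCM of 1..=10
// before dividing keeps the division exact for any branch of up to ten
// words, so ranks from branches of different lengths stay comparable.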
- branch_positions.sort_unstable_by_key(|qli| qli.level); - let folded_query_level_iterators = - branch_positions.into_iter().fold(None, |fold: Option, mut qli| { - match fold { - Some(fold) => { - qli.parent(fold); - Some(qli) - } - None => Some(qli), - } - }); - - if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { - let tree_level = folded_query_level_iterators.level; - let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; - if let Some(last_result) = last_result { - let branch = Branch { - last_result, - tree_level, - query_level_iterator: folded_query_level_iterators, - branch_size: branch.len() as u32, - }; - positions.push(branch); - } - } - } - - Ok(positions) -} - -fn set_compute_candidates<'t>( +fn initialize_set_buckets<'t>( ctx: &'t dyn Context<'t>, branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, wdcache: &mut WordDerivationsCache, -) -> Result> { - let mut branches_heap = - initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; - let lowest_level = TreeLevel::min_value(); +) -> Result>> { + let mut heap = BinaryHeap::new(); + for flatten_branch in branches { + let branch = Branch::new(ctx, flatten_branch, wdcache, allowed_candidates)?; + heap.push(branch); + } + + Ok(heap) +} + +fn set_compute_candidates( + branches_heap: &mut BinaryHeap, + allowed_candidates: &RoaringBitmap, +) -> Result> { let mut final_candidates: Option<(u32, RoaringBitmap)> = None; let mut allowed_candidates = allowed_candidates.clone(); while let Some(mut branch) = branches_heap.peek_mut() { - let is_lowest_level = branch.tree_level == lowest_level; - let branch_rank = branch.compute_rank(); // if current is worst than best we break to return // candidates that correspond to the best rank + let branch_rank = branch.compute_rank(); if let Some((best_rank, _)) = final_candidates { if branch_rank > best_rank { break; } } - let _left = branch.last_result.0; - let candidates = take(&mut branch.last_result.2); + + let candidates = take(&mut branch.last_result.1); if candidates.is_empty() { // we don't have candidates, get next interval. if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } - } else if is_lowest_level { - // we have candidates, but we can't dig deeper. + } else { allowed_candidates -= &candidates; final_candidates = match final_candidates.take() { // we add current candidates to best candidates Some((best_rank, mut best_candidates)) => { best_candidates |= candidates; - branch.lazy_next(); + branch.next(&allowed_candidates)?; Some((best_rank, best_candidates)) } // we take current candidates as best candidates None => { - branch.lazy_next(); + branch.next(&allowed_candidates)?; Some((branch_rank, candidates)) } }; - } else { - // we have candidates, lets dig deeper in levels. - branch.dig(ctx)?; - if !branch.next(&allowed_candidates)? 
{ - PeekMut::pop(branch); - } } } - Ok(final_candidates.map(|(_rank, candidates)| candidates)) + Ok(final_candidates) } -fn linear_compute_candidates( +fn initialize_linear_buckets( ctx: &dyn Context, branches: &FlattenedQueryTree, allowed_candidates: &RoaringBitmap, From 3296bb243c817c16c0b47603b6760f69f1409c56 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 5 Oct 2021 11:18:42 +0200 Subject: [PATCH 1046/1889] Simplify word level position DB into a word position DB --- milli/src/heed_codec/mod.rs | 4 +- milli/src/heed_codec/str_beu32_codec.rs | 38 +++ .../heed_codec/str_level_position_codec.rs | 47 --- milli/src/index.rs | 23 +- milli/src/lib.rs | 4 +- milli/src/search/criteria/attribute.rs | 57 ++-- milli/src/search/criteria/exactness.rs | 7 +- milli/src/search/criteria/mod.rs | 104 ++----- milli/src/tree_level.rs | 51 ---- milli/src/update/clear_documents.rs | 8 +- milli/src/update/delete_documents.rs | 9 +- ...ids.rs => extract_word_position_docids.rs} | 12 +- .../src/update/index_documents/extract/mod.rs | 10 +- milli/src/update/index_documents/mod.rs | 6 +- .../src/update/index_documents/typed_chunk.rs | 8 +- milli/src/update/mod.rs | 4 +- milli/src/update/words_level_positions.rs | 268 ------------------ .../update/words_prefix_position_docids.rs | 105 +++++++ 18 files changed, 220 insertions(+), 545 deletions(-) create mode 100644 milli/src/heed_codec/str_beu32_codec.rs delete mode 100644 milli/src/heed_codec/str_level_position_codec.rs delete mode 100644 milli/src/tree_level.rs rename milli/src/update/index_documents/extract/{extract_word_level_position_docids.rs => extract_word_position_docids.rs} (76%) delete mode 100644 milli/src/update/words_level_positions.rs create mode 100644 milli/src/update/words_prefix_position_docids.rs diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 7bd7dff2d..2f2a01192 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -4,7 +4,7 @@ mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; -mod str_level_position_codec; +mod str_beu32_codec; mod str_str_u8_codec; pub use self::beu32_str_codec::BEU32StrCodec; @@ -14,5 +14,5 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; -pub use self::str_level_position_codec::StrLevelPositionCodec; +pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs new file mode 100644 index 000000000..d1f379bdc --- /dev/null +++ b/milli/src/heed_codec/str_beu32_codec.rs @@ -0,0 +1,38 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::mem::size_of; +use std::str; + +pub struct StrBEU32Codec; + +impl<'a> heed::BytesDecode<'a> for StrBEU32Codec { + type DItem = (&'a str, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::(); + + if bytes.len() < footer_len { + return None; + } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + let pos = bytes.try_into().map(u32::from_be_bytes).ok()?; + + Some((word, pos)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { + type EItem = (&'a str, u32); + + fn bytes_encode((word, pos): &Self::EItem) -> Option> { + let pos = pos.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + pos.len()); + 
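// Editor's note: the key layout is (raw word bytes, then 4-byte big-endian
// position). Big-endian keeps LMDB's lexicographic byte order aligned with
// ascending numeric position, which the (word, pos) range scans rely on.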
bytes.extend_from_slice(word.as_bytes()); + bytes.extend_from_slice(&pos[..]); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/str_level_position_codec.rs b/milli/src/heed_codec/str_level_position_codec.rs deleted file mode 100644 index 5be45bbeb..000000000 --- a/milli/src/heed_codec/str_level_position_codec.rs +++ /dev/null @@ -1,47 +0,0 @@ -use std::borrow::Cow; -use std::convert::{TryFrom, TryInto}; -use std::mem::size_of; -use std::str; - -use crate::TreeLevel; - -pub struct StrLevelPositionCodec; - -impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { - type DItem = (&'a str, TreeLevel, u32, u32); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let footer_len = size_of::() + size_of::() * 2; - - if bytes.len() < footer_len { - return None; - } - - let (word, bytes) = bytes.split_at(bytes.len() - footer_len); - let word = str::from_utf8(word).ok()?; - - let (level, bytes) = bytes.split_first()?; - let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; - let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; - let level = TreeLevel::try_from(*level).ok()?; - - Some((word, level, left, right)) - } -} - -impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { - type EItem = (&'a str, TreeLevel, u32, u32); - - fn bytes_encode((word, level, left, right): &Self::EItem) -> Option> { - let left = left.to_be_bytes(); - let right = right.to_be_bytes(); - - let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); - bytes.extend_from_slice(word.as_bytes()); - bytes.push((*level).into()); - bytes.extend_from_slice(&left[..]); - bytes.extend_from_slice(&right[..]); - - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/index.rs b/milli/src/index.rs index dd5851ccc..6ce693fbe 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -20,7 +20,7 @@ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, StrLevelPositionCodec, StrStrU8Codec, BEU32, + Search, StrBEU32Codec, StrStrU8Codec, BEU32, }; pub mod main_key { @@ -55,8 +55,8 @@ pub mod db_name { pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; - pub const WORD_LEVEL_POSITION_DOCIDS: &str = "word-level-position-docids"; - pub const WORD_PREFIX_LEVEL_POSITION_DOCIDS: &str = "word-prefix-level-position-docids"; + pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; + pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; @@ -86,12 +86,12 @@ pub struct Index { /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, - /// Maps the word, level and position range with the docids that corresponds to it. - pub word_level_position_docids: Database, + /// Maps the word and the position with the docids that corresponds to it. + pub word_position_docids: Database, /// Maps the field id and the word count with the docids that corresponds to it. 
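// Editor's note: the position databases above and below in this struct now
// key on `StrBEU32Codec` (word, then big-endian u32 position) instead of
// `StrLevelPositionCodec` (word, level, left, right), shaving five bytes
// off every key and one dimension off every lookup.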
pub field_id_word_count_docids: Database, - /// Maps the level positions of a word prefix with all the docids where this prefix appears. - pub word_prefix_level_position_docids: Database, + /// Maps the position of a word prefix with all the docids where this prefix appears. + pub word_prefix_position_docids: Database, /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, @@ -122,10 +122,9 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; - let word_level_position_docids = env.create_database(Some(WORD_LEVEL_POSITION_DOCIDS))?; + let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; - let word_prefix_level_position_docids = - env.create_database(Some(WORD_PREFIX_LEVEL_POSITION_DOCIDS))?; + let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; @@ -143,8 +142,8 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, - word_level_position_docids, - word_prefix_level_position_docids, + word_position_docids, + word_prefix_position_docids, field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index bb0a32528..838817d98 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -14,7 +14,6 @@ pub mod heed_codec; pub mod index; pub mod proximity; mod search; -pub mod tree_level; pub mod update; use std::collections::{BTreeMap, HashMap}; @@ -35,11 +34,10 @@ pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, - RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, + RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, }; pub use self::index::Index; pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult}; -pub use self::tree_level::TreeLevel; pub type Result = std::result::Result; diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 0e589dd92..07b3cf95c 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -10,7 +10,7 @@ use super::{resolve_query_tree, Context, Criterion, CriterionParameters, Criteri use crate::search::criteria::Query; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; -use crate::{Result, TreeLevel}; +use crate::Result; /// To be able to divide integers by the number of words in the query /// we want to find a multiplier that allow us to divide by any number between 1 and 10. @@ -176,20 +176,14 @@ impl<'t> Criterion for Attribute<'t> { } } -/// QueryLevelIterator is an pseudo-Iterator for a Query, -/// It contains WordLevelIterators and is chainned with other QueryLevelIterator. 
-struct QueryLevelIterator<'t> { - inner: Vec< - Peekable< - Box< - dyn Iterator> - + 't, - >, - >, - >, +/// QueryPositionIterator is an Iterator over positions of a Query, +/// It contains iterators over words positions. +struct QueryPositionIterator<'t> { + inner: + Vec> + 't>>>, } -impl<'t> QueryLevelIterator<'t> { +impl<'t> QueryPositionIterator<'t> { fn new( ctx: &'t dyn Context<'t>, queries: &[Query], @@ -201,25 +195,14 @@ impl<'t> QueryLevelIterator<'t> { match &query.kind { QueryKind::Exact { word, .. } => { if !query.prefix || in_prefix_cache { - let iter = ctx.word_position_iterator( - query.kind.word(), - TreeLevel::min_value(), - in_prefix_cache, - None, - None, - )?; + let iter = + ctx.word_position_iterator(query.kind.word(), in_prefix_cache)?; inner.push(iter.peekable()); } else { for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { - let iter = ctx.word_position_iterator( - &word, - TreeLevel::min_value(), - in_prefix_cache, - None, - None, - )?; + let iter = ctx.word_position_iterator(&word, in_prefix_cache)?; inner.push(iter.peekable()); } @@ -229,13 +212,7 @@ impl<'t> QueryLevelIterator<'t> { for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { - let iter = ctx.word_position_iterator( - &word, - TreeLevel::min_value(), - in_prefix_cache, - None, - None, - )?; + let iter = ctx.word_position_iterator(&word, in_prefix_cache)?; inner.push(iter.peekable()); } @@ -247,7 +224,7 @@ impl<'t> QueryLevelIterator<'t> { } } -impl<'t> Iterator for QueryLevelIterator<'t> { +impl<'t> Iterator for QueryPositionIterator<'t> { type Item = heed::Result<(u32, RoaringBitmap)>; fn next(&mut self) -> Option { @@ -256,14 +233,14 @@ impl<'t> Iterator for QueryLevelIterator<'t> { .inner .iter_mut() .filter_map(|wli| match wli.peek() { - Some(Ok(((_, _, pos, _), _))) => Some(*pos), + Some(Ok(((_, pos), _))) => Some(*pos), _ => None, }) .min()?; let mut candidates = None; for wli in self.inner.iter_mut() { - if let Some(Ok(((_, _, pos, _), _))) = wli.peek() { + if let Some(Ok(((_, pos), _))) = wli.peek() { if *pos > expected_pos { continue; } @@ -286,9 +263,9 @@ impl<'t> Iterator for QueryLevelIterator<'t> { } /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, -/// This branch allows us to iterate over meta-interval of position and to dig in it if it contains interesting candidates. +/// This branch allows us to iterate over meta-interval of positions. 
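// Editor's note: a "meta-interval" is simply the sum of one position per
// query part of the branch. With tree levels gone, each step advances only
// the `QueryPositionIterator` whose next position is cheapest (see
// `lowest_iterator_index`) and then re-intersects the per-part docids.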
struct Branch<'t> { - query_level_iterator: Vec<(u32, RoaringBitmap, Peekable>)>, + query_level_iterator: Vec<(u32, RoaringBitmap, Peekable>)>, last_result: (u32, RoaringBitmap), branch_size: u32, } @@ -302,7 +279,7 @@ impl<'t> Branch<'t> { ) -> Result { let mut query_level_iterator = Vec::new(); for queries in flatten_branch { - let mut qli = QueryLevelIterator::new(ctx, queries, wdcache)?.peekable(); + let mut qli = QueryPositionIterator::new(ctx, queries, wdcache)?.peekable(); let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new())); query_level_iterator.push((pos, docids & allowed_candidates, qli)); } diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 1e4d4e7a2..8e56b3649 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -10,7 +10,7 @@ use crate::search::criteria::{ resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; -use crate::{Result, TreeLevel}; +use crate::Result; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -293,7 +293,6 @@ fn attribute_start_with_docids( attribute_id: u32, query: &[ExactQueryPart], ) -> heed::Result> { - let lowest_level = TreeLevel::min_value(); let mut attribute_candidates_array = Vec::new(); // start from attribute first position let mut pos = attribute_id * 1000; @@ -303,7 +302,7 @@ fn attribute_start_with_docids( Synonyms(synonyms) => { let mut synonyms_candidates = RoaringBitmap::new(); for word in synonyms { - let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; + let wc = ctx.word_position_docids(word, pos)?; if let Some(word_candidates) = wc { synonyms_candidates |= word_candidates; } @@ -313,7 +312,7 @@ fn attribute_start_with_docids( } Phrase(phrase) => { for word in phrase { - let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; + let wc = ctx.word_position_docids(word, pos)?; if let Some(word_candidates) = wc { attribute_candidates_array.push(word_candidates); } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index a23e5acf9..0cad7c013 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,7 +14,7 @@ use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, WordDerivationsCache}; -use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result, TreeLevel}; +use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; mod asc_desc; mod attribute; @@ -90,20 +90,8 @@ pub trait Context<'c> { fn word_position_iterator( &self, word: &str, - level: TreeLevel, in_prefix_cache: bool, - left: Option, - right: Option, - ) -> heed::Result< - Box< - dyn Iterator> + 'c, - >, - >; - fn word_position_last_level( - &self, - word: &str, - in_prefix_cache: bool, - ) -> heed::Result>; + ) -> heed::Result> + 'c>>; fn synonyms(&self, word: &str) -> heed::Result>>>; fn searchable_fields_ids(&self) -> Result>; fn field_id_word_count_docids( @@ -111,13 +99,7 @@ pub trait Context<'c> { field_id: FieldId, word_count: u8, ) -> heed::Result>; - fn word_level_position_docids( - &self, - word: &str, - level: TreeLevel, - left: u32, - right: u32, - ) -> heed::Result>; + fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result>; } pub struct CriteriaBuilder<'t> { @@ -183,54 +165,24 @@ impl<'c> 
Context<'c> for CriteriaBuilder<'c> { fn word_position_iterator( &self, word: &str, - level: TreeLevel, in_prefix_cache: bool, - left: Option, - right: Option, - ) -> heed::Result< - Box< - dyn Iterator> + 'c, - >, - > { + ) -> heed::Result> + 'c>> + { let range = { - let left = left.unwrap_or(u32::min_value()); - let right = right.unwrap_or(u32::max_value()); - let left = (word, level, left, left); - let right = (word, level, right, right); + let left = u32::min_value(); + let right = u32::max_value(); + let left = (word, left); + let right = (word, right); left..=right }; let db = match in_prefix_cache { - true => self.index.word_prefix_level_position_docids, - false => self.index.word_level_position_docids, + true => self.index.word_prefix_position_docids, + false => self.index.word_position_docids, }; Ok(Box::new(db.range(self.rtxn, &range)?)) } - fn word_position_last_level( - &self, - word: &str, - in_prefix_cache: bool, - ) -> heed::Result> { - let range = { - let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); - left..=right - }; - let db = match in_prefix_cache { - true => self.index.word_prefix_level_position_docids, - false => self.index.word_level_position_docids, - }; - let last_level = db - .remap_data_type::() - .range(self.rtxn, &range)? - .last() - .transpose()? - .map(|((_, level, _, _), _)| level); - - Ok(last_level) - } - fn synonyms(&self, word: &str) -> heed::Result>>> { self.index.words_synonyms(self.rtxn, &[word]) } @@ -251,15 +203,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.field_id_word_count_docids.get(self.rtxn, &key) } - fn word_level_position_docids( - &self, - word: &str, - level: TreeLevel, - left: u32, - right: u32, - ) -> heed::Result> { - let key = (word, level, left, right); - self.index.word_level_position_docids.get(self.rtxn, &key) + fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result> { + let key = (word, pos); + self.index.word_position_docids.get(self.rtxn, &key) } } @@ -616,27 +562,13 @@ pub mod test { fn word_position_iterator( &self, _word: &str, - _level: TreeLevel, _in_prefix_cache: bool, - _left: Option, - _right: Option, ) -> heed::Result< - Box< - dyn Iterator> - + 'c, - >, + Box> + 'c>, > { todo!() } - fn word_position_last_level( - &self, - _word: &str, - _in_prefix_cache: bool, - ) -> heed::Result> { - todo!() - } - fn synonyms(&self, _word: &str) -> heed::Result>>> { todo!() } @@ -645,12 +577,10 @@ pub mod test { todo!() } - fn word_level_position_docids( + fn word_position_docids( &self, _word: &str, - _level: TreeLevel, - _left: u32, - _right: u32, + _pos: u32, ) -> heed::Result> { todo!() } diff --git a/milli/src/tree_level.rs b/milli/src/tree_level.rs deleted file mode 100644 index b69316cf6..000000000 --- a/milli/src/tree_level.rs +++ /dev/null @@ -1,51 +0,0 @@ -use std::convert::TryFrom; -use std::fmt; - -/// This is just before the lowest printable character (space, sp, 32) -const MAX_VALUE: u8 = 31; - -#[derive(Debug, Copy, Clone)] -pub enum Error { - LevelTooHigh(u8), -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[repr(transparent)] -pub struct TreeLevel(u8); - -impl TreeLevel { - pub const fn max_value() -> TreeLevel { - TreeLevel(MAX_VALUE) - } - - pub const fn min_value() -> TreeLevel { - TreeLevel(0) - } - - pub fn saturating_sub(&self, lhs: u8) -> TreeLevel { - TreeLevel(self.0.saturating_sub(lhs)) - } -} - -impl Into for TreeLevel { - fn into(self) 
-> u8 { - self.0 - } -} - -impl TryFrom for TreeLevel { - type Error = Error; - - fn try_from(value: u8) -> Result { - match value { - 0..=MAX_VALUE => Ok(TreeLevel(value)), - _ => Err(Error::LevelTooHigh(value)), - } - } -} - -impl fmt::Display for TreeLevel { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.0) - } -} diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ea4193eaf..a820c2a49 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -28,9 +28,9 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, - word_level_position_docids, + word_position_docids, field_id_word_count_docids, - word_prefix_level_position_docids, + word_prefix_position_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s, @@ -64,9 +64,9 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; - word_level_position_docids.clear(self.wtxn)?; + word_position_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; - word_prefix_level_position_docids.clear(self.wtxn)?; + word_prefix_position_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 1b16ba9bf..207aed63c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -102,8 +102,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_pair_proximity_docids, field_id_word_count_docids, word_prefix_pair_proximity_docids, - word_level_position_docids, - word_prefix_level_position_docids, + word_position_docids, + word_prefix_position_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s, @@ -326,8 +326,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the word level position docids. - let mut iter = - word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); @@ -346,7 +345,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We delete the documents ids that are under the word prefix level position docids. 
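// Editor's sketch (stand-in types, not milli's API): the loops around this
// comment all follow the same purge pattern, shown here on a plain map:
// strip deleted ids from each posting list, then drop entries that empty.
fn purge(db: &mut std::collections::BTreeMap<Vec<u8>, Vec<u32>>, deleted: &[u32]) {
    db.retain(|_key, docids| {
        docids.retain(|id| !deleted.contains(id));
        !docids.is_empty()
    });
}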
let mut iter = - word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); diff --git a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs similarity index 76% rename from milli/src/update/index_documents/extract/extract_word_level_position_docids.rs rename to milli/src/update/index_documents/extract/extract_word_position_docids.rs index 04cedf5c7..4ca8537ac 100644 --- a/milli/src/update/index_documents/extract/extract_word_level_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,13 +14,13 @@ use crate::{DocumentId, Result}; /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_level_position_docids( +pub fn extract_word_position_docids( mut docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); - let mut word_level_position_docids_sorter = create_sorter( + let mut word_position_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -37,15 +37,11 @@ pub fn extract_word_level_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - key_buffer.push(0); // tree level - - // Levels are composed of left and right bounds. - key_buffer.extend_from_slice(&position.to_be_bytes()); key_buffer.extend_from_slice(&position.to_be_bytes()); - word_level_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; + word_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; } } - sorter_into_reader(word_level_position_docids_sorter, indexer) + sorter_into_reader(word_position_docids_sorter, indexer) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 47a62be67..0406e8ef4 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,8 +5,8 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; -mod extract_word_level_position_docids; mod extract_word_pair_proximity_docids; +mod extract_word_position_docids; use std::collections::HashSet; use std::fs::File; @@ -22,8 +22,8 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; -use self::extract_word_level_position_docids::extract_word_level_position_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ into_clonable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, @@ -98,10 +98,10 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), - 
extract_word_level_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, - TypedChunk::WordLevelPositionDocids, - "word-level-position-docids", + TypedChunk::WordPositionDocids, + "word-position-docids", ); spawn_extraction_task( diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 30ee49893..b0dbd9c3e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,7 +27,7 @@ pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; use crate::update::{ Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, - WordsLevelPositions, WordsPrefixesFst, + WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result}; @@ -412,8 +412,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - // Run the words level positions update operation. - let mut builder = WordsLevelPositions::new(self.wtxn, self.index); + // Run the words prefix position docids update operation. + let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.max_nb_chunks = self.max_nb_chunks; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b17f28b66..b24a03ff6 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -22,7 +22,7 @@ pub(crate) enum TypedChunk { FieldIdWordcountDocids(grenad::Reader), NewDocumentsIds(RoaringBitmap), WordDocids(grenad::Reader), - WordLevelPositionDocids(grenad::Reader), + WordPositionDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), @@ -110,10 +110,10 @@ pub(crate) fn write_typed_chunk_into_index( index.put_words_fst(wtxn, &fst)?; is_merged_database = true; } - TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => { + TypedChunk::WordPositionDocids(word_position_docids_iter) => { append_entries_into_database( - word_level_position_docids_iter, - &index.word_level_position_docids, + word_position_docids_iter, + &index.word_position_docids, wtxn, index_is_empty, |value, _buffer| Ok(value), diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index d80437ec7..3b6edb0a3 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -8,7 +8,7 @@ pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; -pub use self::words_level_positions::WordsLevelPositions; +pub use self::words_prefix_position_docids::WordPrefixPositionDocids; pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; @@ -21,5 +21,5 @@ mod update_builder; mod update_step; mod word_prefix_docids; mod word_prefix_pair_proximity_docids; -mod words_level_positions; +mod words_prefix_position_docids; mod words_prefixes_fst; diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs deleted file mode 100644 index 0af51fbb2..000000000 --- a/milli/src/update/words_level_positions.rs +++ /dev/null @@ -1,268 +0,0 @@ -use std::convert::TryFrom; -use std::fs::File; 
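// Editor's note: everything deleted below is the old level-tree builder
// (268 lines). Its one surviving duty, seeding (prefix, position) docids
// from the words-prefixes FST, moves to the much smaller
// `words_prefix_position_docids.rs` created at the end of this patch.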
-use std::num::NonZeroU32; -use std::{cmp, str}; - -use fst::Streamer; -use grenad::{CompressionType, Reader, Writer}; -use heed::types::{ByteSlice, DecodeIgnore, Str}; -use heed::{BytesEncode, Error}; -use log::debug; -use roaring::RoaringBitmap; - -use crate::error::{InternalError, SerializationError}; -use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; -use crate::index::main_key::WORDS_PREFIXES_FST_KEY; -use crate::update::index_documents::{ - create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, - write_into_lmdb_database, writer_into_reader, WriteMethod, -}; -use crate::{Index, Result, TreeLevel}; - -pub struct WordsLevelPositions<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - level_group_size: NonZeroU32, - min_level_size: NonZeroU32, -} - -impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordsLevelPositions<'t, 'u, 'i> { - WordsLevelPositions { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - max_nb_chunks: None, - max_memory: None, - level_group_size: NonZeroU32::new(4).unwrap(), - min_level_size: NonZeroU32::new(5).unwrap(), - } - } - - pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { - self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); - self - } - - pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { - self.min_level_size = value; - self - } - - #[logging_timer::time("WordsLevelPositions::{}")] - pub fn execute(self) -> Result<()> { - debug!("Computing and writing the word levels positions docids into LMDB on disk..."); - - let entries = compute_positions_levels( - self.wtxn, - self.index.word_docids.remap_data_type::(), - self.index.word_level_position_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - )?; - - // The previously computed entries also defines the level 0 entries - // so we can clear the database and append all of these entries. - self.index.word_level_position_docids.clear(self.wtxn)?; - - write_into_lmdb_database( - self.wtxn, - *self.index.word_level_position_docids.as_polymorph(), - entries, - |_, _| Err(InternalError::IndexingMergingKeys { process: "word level position" })?, - WriteMethod::Append, - )?; - - // We compute the word prefix level positions database. - self.index.word_prefix_level_position_docids.clear(self.wtxn)?; - - let mut word_prefix_level_positions_docids_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - self.chunk_compression_type, - self.chunk_compression_level, - self.max_nb_chunks, - self.max_memory, - ); - - // We insert the word prefix level positions where the level is equal to 0 and - // corresponds to the word-prefix level positions where the prefixes appears - // in the prefix FST previously constructed. - let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - let db = self.index.word_level_position_docids.remap_data_type::(); - // iter over all prefixes in the prefix fst. 
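// Editor's sketch (simplified, hypothetical types): the deleted loop below
// re-keys level-0 entries under each matching prefix and merges their
// docids, an idea the new update keeps without the levels:
fn seed_prefixes(
    entries: &[(String, u32, u64)], // (word, position, docids stand-in)
    prefixes: &[&str],
) -> std::collections::BTreeMap<(String, u32), u64> {
    let mut out = std::collections::BTreeMap::new();
    for prefix in prefixes {
        for (word, pos, bits) in entries {
            if word.starts_with(*prefix) {
                // Union the postings of every word sharing this prefix.
                *out.entry((prefix.to_string(), *pos)).or_insert(0u64) |= *bits;
            }
        }
    }
    out
}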
- let mut word_stream = prefix_fst.stream(); - while let Some(prefix_bytes) = word_stream.next() { - let prefix = str::from_utf8(prefix_bytes).map_err(|_| { - SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } - })?; - - // iter over all lines of the DB where the key is prefixed by the current prefix. - let mut iter = db - .remap_key_type::() - .prefix_iter(self.wtxn, &prefix_bytes)? - .remap_key_type::(); - while let Some(((_word, level, left, right), data)) = iter.next().transpose()? { - // if level is 0, we push the line in the sorter - // replacing the complete word by the prefix. - if level == TreeLevel::min_value() { - let key = (prefix, level, left, right); - let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); - word_prefix_level_positions_docids_sorter.insert(bytes, data)?; - } - } - } - - // We finally write all the word prefix level positions docids with - // a level equal to 0 into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_level_position_docids.as_polymorph(), - word_prefix_level_positions_docids_sorter, - merge_cbo_roaring_bitmaps, - WriteMethod::Append, - )?; - - let entries = compute_positions_levels( - self.wtxn, - self.index.word_prefix_docids.remap_data_type::(), - self.index.word_prefix_level_position_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - )?; - - // The previously computed entries also defines the level 0 entries - // so we can clear the database and append all of these entries. - self.index.word_prefix_level_position_docids.clear(self.wtxn)?; - - write_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_level_position_docids.as_polymorph(), - entries, - |_, _| { - Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })? - }, - WriteMethod::Append, - )?; - - Ok(()) - } -} - -/// Returns the next number after or equal to `x` that is divisible by `d`. -fn next_divisible(x: u32, d: u32) -> u32 { - (x.saturating_sub(1) | (d - 1)) + 1 -} - -/// Returns the previous number after or equal to `x` that is divisible by `d`, -/// saturates on zero. -fn previous_divisible(x: u32, d: u32) -> u32 { - match x.checked_sub(d - 1) { - Some(0) | None => 0, - Some(x) => next_divisible(x, d), - } -} - -/// Generates all the words positions levels based on the levels zero (including the level zero). -fn compute_positions_levels( - rtxn: &heed::RoTxn, - words_db: heed::Database, - words_positions_db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroU32, - min_level_size: NonZeroU32, -) -> Result> { - // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // therefore we write the facet levels entries into a grenad file before transfering them. - let mut writer = tempfile::tempfile() - .and_then(|file| create_writer(compression_type, compression_level, file))?; - - for result in words_db.iter(rtxn)? { - let (word, ()) = result?; - - let level_0_range = { - let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value()); - left..=right - }; - - let first_level_size = words_positions_db - .remap_data_type::() - .range(rtxn, &level_0_range)? 
- .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - - // As specified in the documentation, we also write the level 0 entries. - for result in words_positions_db.range(rtxn, &level_0_range)? { - let ((word, level, left, right), docids) = result?; - write_level_entry(&mut writer, word, level, left, right, &docids)?; - } - - for (level, group_size) in group_size_iter { - let mut left = 0; - let mut right = 0; - let mut group_docids = RoaringBitmap::new(); - - for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { - let ((_word, _level, value, _right), docids) = result?; - - if i == 0 { - left = previous_divisible(value, group_size); - right = left + (group_size - 1); - } - - if value > right { - // we found the first bound of the next group, we must store the left - // and right bounds associated with the docids. - write_level_entry(&mut writer, word, level, left, right, &group_docids)?; - - // We save the left bound for the new group and also reset the docids. - group_docids = RoaringBitmap::new(); - left = previous_divisible(value, group_size); - right = left + (group_size - 1); - } - - // The right bound is always the bound we run through. - group_docids |= docids; - } - - if !group_docids.is_empty() { - write_level_entry(&mut writer, word, level, left, right, &group_docids)?; - } - } - } - - writer_into_reader(writer) -} - -fn write_level_entry( - writer: &mut Writer, - word: &str, - level: TreeLevel, - left: u32, - right: u32, - ids: &RoaringBitmap, -) -> Result<()> { - let key = (word, level, left, right); - let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) -} diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs new file mode 100644 index 000000000..a8346a1cb --- /dev/null +++ b/milli/src/update/words_prefix_position_docids.rs @@ -0,0 +1,105 @@ +use std::num::NonZeroU32; +use std::{cmp, str}; + +use fst::Streamer; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesEncode; +use log::debug; + +use crate::error::SerializationError; +use crate::heed_codec::StrBEU32Codec; +use crate::index::main_key::WORDS_PREFIXES_FST_KEY; +use crate::update::index_documents::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, +}; +use crate::{Index, Result}; + +pub struct WordPrefixPositionDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, +} + +impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPositionDocids<'t, 'u, 'i> { + WordPrefixPositionDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_nb_chunks: None, + max_memory: 
None,
+            level_group_size: NonZeroU32::new(4).unwrap(),
+            min_level_size: NonZeroU32::new(5).unwrap(),
+        }
+    }
+
+    pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self {
+        self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap();
+        self
+    }
+
+    pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self {
+        self.min_level_size = value;
+        self
+    }
+
+    #[logging_timer::time("WordPrefixPositionDocids::{}")]
+    pub fn execute(self) -> Result<()> {
+        debug!("Computing and writing the word prefix position docids into LMDB on disk...");
+
+        self.index.word_prefix_position_docids.clear(self.wtxn)?;
+
+        let mut word_prefix_positions_docids_sorter = create_sorter(
+            merge_cbo_roaring_bitmaps,
+            self.chunk_compression_type,
+            self.chunk_compression_level,
+            self.max_nb_chunks,
+            self.max_memory,
+        );
+
+        // We insert the word prefix positions that correspond to the prefixes
+        // appearing in the prefix FST previously constructed.
+        let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
+        let db = self.index.word_position_docids.remap_data_type::<ByteSlice>();
+        // iter over all prefixes in the prefix fst.
+        let mut word_stream = prefix_fst.stream();
+        while let Some(prefix_bytes) = word_stream.next() {
+            let prefix = str::from_utf8(prefix_bytes).map_err(|_| {
+                SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
+            })?;
+
+            // iter over all entries of the DB where the key is prefixed by the current prefix.
+            let mut iter = db
+                .remap_key_type::<ByteSlice>()
+                .prefix_iter(self.wtxn, &prefix_bytes)?
+                .remap_key_type::<StrBEU32Codec>();
+            while let Some(((_word, pos), data)) = iter.next().transpose()? {
+                let key = (prefix, pos);
+                let bytes = StrBEU32Codec::bytes_encode(&key).unwrap();
+                word_prefix_positions_docids_sorter.insert(bytes, data)?;
+            }
+        }
+
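As a side note on the new key format: `StrBEU32Codec` presumably lays a
`(word, position)` key out as the string's UTF-8 bytes followed by the
position as a big-endian `u32`, so the byte-wise ordering LMDB uses keeps
the positions of a given word contiguous and sorted. A minimal sketch of
that assumed layout (the helper below is illustrative, not milli's API):

    // Illustrative only: concatenate the word bytes and the big-endian
    // position so that keys for one word sort by position.
    fn encode_word_position(word: &str, position: u32) -> Vec<u8> {
        let mut key = Vec::with_capacity(word.len() + 4);
        key.extend_from_slice(word.as_bytes());
        key.extend_from_slice(&position.to_be_bytes());
        key
    }

    fn main() {
        assert!(encode_word_position("hell", 1) < encode_word_position("hell", 2));
        assert!(encode_word_position("hell", 2) < encode_word_position("hello", 0));
    }

+        // We finally write all the word prefix position docids into the LMDB database.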
+ sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_position_docids.as_polymorph(), + word_prefix_positions_docids_sorter, + merge_cbo_roaring_bitmaps, + WriteMethod::Append, + )?; + + Ok(()) + } +} From 5ed75de0dbc465162abd123160324c2fddd7b8f1 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 5 Oct 2021 13:56:12 +0200 Subject: [PATCH 1047/1889] Update infos crate --- infos/src/main.rs | 84 +++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index bb09d7234..dc98d410d 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -7,7 +7,7 @@ use byte_unit::Byte; use heed::EnvOpenOptions; use milli::facet::FacetType; use milli::index::db_name::*; -use milli::{FieldId, Index, TreeLevel}; +use milli::{FieldId, Index}; use structopt::StructOpt; use Command::*; @@ -22,8 +22,8 @@ const ALL_DATABASE_NAMES: &[&str] = &[ DOCID_WORD_POSITIONS, WORD_PAIR_PROXIMITY_DOCIDS, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS, - WORD_LEVEL_POSITION_DOCIDS, - WORD_PREFIX_LEVEL_POSITION_DOCIDS, + WORD_POSITION_DOCIDS, + WORD_PREFIX_POSITION_DOCIDS, FIELD_ID_WORD_COUNT_DOCIDS, FACET_ID_F64_DOCIDS, FACET_ID_STRING_DOCIDS, @@ -281,10 +281,10 @@ fn main() -> anyhow::Result<()> { facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) } WordsLevelPositionsDocids { full_display, words } => { - words_level_positions_docids(&index, &rtxn, !full_display, words) + words_positions_docids(&index, &rtxn, !full_display, words) } WordPrefixesLevelPositionsDocids { full_display, prefixes } => { - word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) + word_prefixes_positions_docids(&index, &rtxn, !full_display, prefixes) } FieldIdWordCountDocids { full_display, field_name } => { field_id_word_count_docids(&index, &rtxn, !full_display, field_name) @@ -379,8 +379,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, - word_level_position_docids, - word_prefix_level_position_docids, + word_position_docids, + word_prefix_position_docids, field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, @@ -395,8 +395,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let docid_word_positions_name = "docid_word_positions"; let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; - let word_level_position_docids_name = "word_level_position_docids"; - let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; + let word_position_docids_name = "word_position_docids"; + let word_prefix_position_docids_name = "word_prefix_position_docids"; let field_id_word_count_docids_name = "field_id_word_count_docids"; let facet_id_f64_docids_name = "facet_id_f64_docids"; let facet_id_string_docids_name = "facet_id_string_docids"; @@ -471,19 +471,19 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } } - for result in word_level_position_docids.remap_data_type::().iter(rtxn)? { - let ((word, level, left, right), value) = result?; - let key = format!("{} {} {:?}", word, level, left..=right); - heap.push(Reverse((value.len(), key, word_level_position_docids_name))); + for result in word_position_docids.remap_data_type::().iter(rtxn)? 
{ + let ((word, pos), value) = result?; + let key = format!("{} {}", word, pos); + heap.push(Reverse((value.len(), key, word_position_docids_name))); if heap.len() > limit { heap.pop(); } } - for result in word_prefix_level_position_docids.remap_data_type::().iter(rtxn)? { - let ((word, level, left, right), value) = result?; - let key = format!("{} {} {:?}", word, level, left..=right); - heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); + for result in word_prefix_position_docids.remap_data_type::().iter(rtxn)? { + let ((word, pos), value) = result?; + let key = format!("{} {}", word, pos); + heap.push(Reverse((value.len(), key, word_prefix_position_docids_name))); if heap.len() > limit { heap.pop(); } @@ -663,7 +663,7 @@ fn facet_values_docids( Ok(wtr.flush()?) } -fn words_level_positions_docids( +fn words_positions_docids( index: &Index, rtxn: &heed::RoTxn, debug: bool, @@ -671,16 +671,16 @@ fn words_level_positions_docids( ) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; + wtr.write_record(&["word", "position", "documents_count", "documents_ids"])?; for word in words.iter().map(AsRef::as_ref) { let range = { - let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + let left = (word, u32::min_value()); + let right = (word, u32::max_value()); left..=right }; - for result in index.word_level_position_docids.range(rtxn, &range)? { - let ((w, level, left, right), docids) = result?; + for result in index.word_position_docids.range(rtxn, &range)? { + let ((w, pos), docids) = result?; let count = docids.len().to_string(); let docids = if debug { @@ -688,20 +688,15 @@ fn words_level_positions_docids( } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = if level == TreeLevel::min_value() { - format!("{:?}", left) - } else { - format!("{:?}", left..=right) - }; - let level = level.to_string(); - wtr.write_record(&[w, &level, &position_range, &count, &docids])?; + let position = format!("{:?}", pos); + wtr.write_record(&[w, &position, &count, &docids])?; } } Ok(wtr.flush()?) } -fn word_prefixes_level_positions_docids( +fn word_prefixes_positions_docids( index: &Index, rtxn: &heed::RoTxn, debug: bool, @@ -709,16 +704,16 @@ fn word_prefixes_level_positions_docids( ) -> anyhow::Result<()> { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; + wtr.write_record(&["prefix", "position", "documents_count", "documents_ids"])?; for word in prefixes.iter().map(AsRef::as_ref) { let range = { - let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); - let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); + let left = (word, u32::min_value()); + let right = (word, u32::max_value()); left..=right }; - for result in index.word_prefix_level_position_docids.range(rtxn, &range)? { - let ((w, level, left, right), docids) = result?; + for result in index.word_prefix_position_docids.range(rtxn, &range)? 
{ + let ((w, pos), docids) = result?; let count = docids.len().to_string(); let docids = if debug { @@ -726,13 +721,8 @@ fn word_prefixes_level_positions_docids( } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = if level == TreeLevel::min_value() { - format!("{:?}", left) - } else { - format!("{:?}", left..=right) - }; - let level = level.to_string(); - wtr.write_record(&[w, &level, &position_range, &count, &docids])?; + let position = format!("{:?}", pos); + wtr.write_record(&[w, &position, &count, &docids])?; } } @@ -970,8 +960,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, - word_level_position_docids, - word_prefix_level_position_docids, + word_position_docids, + word_prefix_position_docids, field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, @@ -994,8 +984,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a DOCID_WORD_POSITIONS => docid_word_positions.as_polymorph(), WORD_PAIR_PROXIMITY_DOCIDS => word_pair_proximity_docids.as_polymorph(), WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => word_prefix_pair_proximity_docids.as_polymorph(), - WORD_LEVEL_POSITION_DOCIDS => word_level_position_docids.as_polymorph(), - WORD_PREFIX_LEVEL_POSITION_DOCIDS => word_prefix_level_position_docids.as_polymorph(), + WORD_POSITION_DOCIDS => word_position_docids.as_polymorph(), + WORD_PREFIX_POSITION_DOCIDS => word_prefix_position_docids.as_polymorph(), FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(), FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(), FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), From ea4bd29d1470b9d3b466fd4ae833ca5a63d68b42 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 5 Oct 2021 17:35:07 +0200 Subject: [PATCH 1048/1889] Apply PR comments --- milli/src/search/criteria/attribute.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 07b3cf95c..4c433ec28 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -228,7 +228,7 @@ impl<'t> Iterator for QueryPositionIterator<'t> { type Item = heed::Result<(u32, RoaringBitmap)>; fn next(&mut self) -> Option { - // sort inner words from the closest next position to the more far next position. + // sort inner words from the closest next position to the farthest next position. let expected_pos = self .inner .iter_mut() @@ -324,11 +324,12 @@ impl<'t> Branch<'t> { if docids.is_empty() { 0 } else { - qli.peek() - .map(|result| { + match qli.peek() { + Some(result) => { result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) - }) - .unwrap_or(u32::MAX) + } + None => u32::MAX, + } } }) .enumerate() From 1bd15d849bc52b2f1a8162e680e3ebe1cdd0c924 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 5 Oct 2021 18:52:14 +0200 Subject: [PATCH 1049/1889] Reduce candidates threshold --- http-ui/src/main.rs | 1 + milli/src/search/criteria/attribute.rs | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 9efdd1371..27fc138dd 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -361,6 +361,7 @@ async fn main() -> anyhow::Result<()> { // We must use the write transaction of the update here. 
let mut wtxn = index_cloned.write_txn()?;
                     let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned);
+                    builder.enable_autogenerate_docids();
 
                     match method.as_str() {
                         "replace" => builder
diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs
index 4c433ec28..6a7929eed 100644
--- a/milli/src/search/criteria/attribute.rs
+++ b/milli/src/search/criteria/attribute.rs
@@ -19,7 +19,7 @@ const LCM_10_FIRST_NUMBERS: u32 = 2520;
 
 /// Threshold on the number of candidates that will make
 /// the system choose between one algorithm or another.
-const CANDIDATES_THRESHOLD: u64 = 1000;
+const CANDIDATES_THRESHOLD: u64 = 500;
 
 type FlattenedQueryTree = Vec<Vec<Vec<Query>>>;
 
From 085bc6440cd81c7d2799f579d520e64e761fa035 Mon Sep 17 00:00:00 2001
From: many
Date: Wed, 6 Oct 2021 11:12:26 +0200
Subject: [PATCH 1050/1889] Apply PR comments

---
 milli/src/search/criteria/attribute.rs | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs
index 6a7929eed..3d67b60c0 100644
--- a/milli/src/search/criteria/attribute.rs
+++ b/milli/src/search/criteria/attribute.rs
@@ -195,15 +195,13 @@
             match &query.kind {
                 QueryKind::Exact { word, .. } => {
                     if !query.prefix || in_prefix_cache {
-                        let iter =
-                            ctx.word_position_iterator(query.kind.word(), in_prefix_cache)?;
-
+                        let word = query.kind.word();
+                        let iter = ctx.word_position_iterator(word, in_prefix_cache)?;
                         inner.push(iter.peekable());
                     } else {
                         for (word, _) in
                             word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?
                         {
                             let iter = ctx.word_position_iterator(&word, in_prefix_cache)?;
-
                             inner.push(iter.peekable());
                         }
                     }
@@ -213,7 +211,6 @@
                         word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?
                     {
                         let iter = ctx.word_position_iterator(&word, in_prefix_cache)?;
-
                         inner.push(iter.peekable());
                     }
                 }
@@ -305,15 +302,13 @@ impl<'t> Branch<'t> {
                 Some((next_pos, next_docids)) => {
                     *cur_pos = next_pos;
                     *cur_docids |= next_docids & allowed_candidates;
+                    self.update_last_result();
+                    Ok(true)
                 }
-                None => return Ok(false),
+                None => Ok(false),
             },
-            None => return Ok(false),
+            None => Ok(false),
         }
-
-        self.update_last_result();
-
-        Ok(true)
     }
 
     fn lowest_iterator_index(&mut self) -> usize {
From 11dfe387616e8cb70822b55ed81a2e75aebf6022 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Thu, 7 Oct 2021 15:42:08 +0200
Subject: [PATCH 1051/1889] Update the check on the latitude and longitude

Latitude is not supposed to go beyond 90 degrees or below -90.
The same goes for longitude with 180 or -180.

This was badly implemented in the filters, and was not implemented for the
AscDesc rules.
---
 milli/src/asc_desc.rs                      | 49 ++++++++++++++++++--
 milli/src/search/facet/filter_condition.rs | 54 +++++++++++++++++-----
 2 files changed, 87 insertions(+), 16 deletions(-)

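Concretely, each axis now gets its own bounds instead of the shared ±181
range used before. A minimal sketch of the corrected rule (the helper name
is illustrative, not code from this patch):

    // The old filter check tested both coordinates against -181..181,
    // so an impossible latitude such as 120.0 was accepted. Validating
    // each axis on its own bounds rejects it.
    fn valid_coordinates(lat: f64, lng: f64) -> bool {
        (-90.0..=90.0).contains(&lat) && (-180.0..=180.0).contains(&lng)
    }

    fn main() {
        assert!(valid_coordinates(90.0, 180.0)); // boundary values remain valid
        assert!(!valid_coordinates(120.0, 50.0)); // slipped through the old check
    }

diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs
index 09bd0082a..ebd550e60 100644
--- a/milli/src/asc_desc.rs
+++ b/milli/src/asc_desc.rs
@@ -12,6 +12,8 @@ use crate::{CriterionError, Error, UserError};
 /// You must always cast it to a sort error or a criterion error.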
#[derive(Debug)] pub enum AscDescError { + BadLat, + BadLng, InvalidSyntax { name: String }, ReservedKeyword { name: String }, } @@ -19,6 +21,12 @@ pub enum AscDescError { impl fmt::Display for AscDescError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + Self::BadLat => { + write!(f, "Latitude must be contained between -90 and 90 degrees.",) + } + Self::BadLng => { + write!(f, "Longitude must be contained between -180 and 180 degrees.",) + } Self::InvalidSyntax { name } => { write!(f, "invalid asc/desc syntax for {}.", name) } @@ -36,6 +44,9 @@ impl fmt::Display for AscDescError { impl From for CriterionError { fn from(error: AscDescError) -> Self { match error { + AscDescError::BadLat | AscDescError::BadLng => { + CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() } + } AscDescError::InvalidSyntax { name } => CriterionError::InvalidName { name }, AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => { CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() } @@ -60,16 +71,21 @@ impl FromStr for Member { fn from_str(text: &str) -> Result { match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) { Some(point) => { - let (lat, long) = point + let (lat, lng) = point .split_once(',') .ok_or_else(|| AscDescError::ReservedKeyword { name: text.to_string() }) - .and_then(|(lat, long)| { + .and_then(|(lat, lng)| { lat.trim() .parse() - .and_then(|lat| long.trim().parse().map(|long| (lat, long))) + .and_then(|lat| lng.trim().parse().map(|lng| (lat, lng))) .map_err(|_| AscDescError::ReservedKeyword { name: text.to_string() }) })?; - Ok(Member::Geo([lat, long])) + if !(-90.0..=90.0).contains(&lat) { + return Err(AscDescError::BadLat)?; + } else if !(-180.0..=180.0).contains(&lng) { + return Err(AscDescError::BadLng)?; + } + Ok(Member::Geo([lat, lng])) } None => { if is_reserved_keyword(text) || text.starts_with("_geoRadius(") { @@ -139,6 +155,8 @@ impl FromStr for AscDesc { #[derive(Debug)] pub enum SortError { + BadLat, + BadLng, BadGeoPointUsage { name: String }, InvalidName { name: String }, ReservedName { name: String }, @@ -149,6 +167,8 @@ pub enum SortError { impl From for SortError { fn from(error: AscDescError) -> Self { match error { + AscDescError::BadLat => SortError::BadLat, + AscDescError::BadLng => SortError::BadLng, AscDescError::InvalidSyntax { name } => SortError::InvalidName { name }, AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => { SortError::BadGeoPointUsage { name } @@ -167,6 +187,12 @@ impl From for SortError { impl fmt::Display for SortError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + Self::BadLat => { + write!(f, "Latitude must be contained between -90 and 90 degrees.",) + } + Self::BadLng => { + write!(f, "Longitude must be contained between -180 and 180 degrees.",) + } Self::BadGeoPointUsage { name } => { write!( f, @@ -225,6 +251,8 @@ mod tests { ("_geoPoint(42.459, 59):desc", Desc(Geo([42.459, 59.]))), ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))), ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))), + ("_geoPoint(90.000000000, 180):desc", Desc(Geo([90., 180.]))), + ("_geoPoint(-90, -180.0000000000):asc", Asc(Geo([-90., -180.]))), ("_geoPoint(42.0002, 59.895):desc", Desc(Geo([42.0002, 59.895]))), ("_geoPoint(42., 59.):desc", Desc(Geo([42., 59.]))), ("truc(12, 13):desc", Desc(Field(S("truc(12, 13)")))), @@ -268,6 +296,11 @@ mod tests { ), ("_geoPoint(35, 85, 75):asc", ReservedKeyword { name: S("_geoPoint(35, 
85, 75)") }), ("_geoPoint(18):asc", ReservedKeyword { name: S("_geoPoint(18)") }), + ("_geoPoint(200, 200):asc", BadLat), + ("_geoPoint(90.000001, 0):asc", BadLat), + ("_geoPoint(0, -180.000001):desc", BadLng), + ("_geoPoint(159.256, 130):asc", BadLat), + ("_geoPoint(12, -2021):desc", BadLng), ]; for (req, expected_error) in invalid_req { @@ -313,6 +346,14 @@ mod tests { AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") }, S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), ), + ( + AscDescError::BadLat, + S("Latitude must be contained between -90 and 90 degrees."), + ), + ( + AscDescError::BadLng, + S("Longitude must be contained between -180 and 180 degrees."), + ), ]; for (asc_desc_error, expected_message) in errors { diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f0a51fe0a..f1055b2f8 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -206,17 +206,19 @@ impl FilterCondition { )))?; } let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); - if let Some(span) = (!(-181.0..181.).contains(&lat.0)) - .then(|| &lat.1) - .or((!(-181.0..181.).contains(&lng.0)).then(|| &lng.1)) - { + if !(-90.0..=90.0).contains(&lat.0) { return Err(UserError::InvalidFilter(PestError::new_from_span( ErrorVariant::CustomError { - message: format!( - "Latitude and longitude must be contained between -180 to 180 degrees." - ), + message: format!("Latitude must be contained between -90 and 90 degrees."), }, - span.clone(), + lat.1.clone(), + )))?; + } else if !(-180.0..=180.0).contains(&lng.0) { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("Longitude must be contained between -180 and 180 degrees."), + }, + lng.1.clone(), )))?; } Ok(Operator(fid, GeoLowerThan([lat.0, lng.0], distance))) @@ -858,6 +860,18 @@ mod tests { let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); assert_eq!(condition, expected); + // basic test with latitude and longitude at the max angle + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(90, 180, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([90., 180.], 2000.)); + assert_eq!(condition, expected); + + // basic test with latitude and longitude at the min angle + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90, -180, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([-90., -180.], 2000.)); + assert_eq!(condition, expected); + // test the negation of the GeoLowerThan let condition = FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); @@ -906,20 +920,36 @@ mod tests { assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-200, 150, 10)"); + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); assert!(result.is_err()); let error = result.unwrap_err(); assert!(error .to_string() - .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, 
"_geoRadius(-90.0000001, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 181, 10)"); + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); assert!(result.is_err()); let error = result.unwrap_err(); assert!(error .to_string() - .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + .contains("Longitude must be contained between -180 and 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); } #[test] From b65aa7b5ac467b54bb1e306d019f71afbec88a45 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 7 Oct 2021 17:45:01 +0200 Subject: [PATCH 1052/1889] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/asc_desc.rs | 44 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index ebd550e60..00f65a459 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -12,8 +12,8 @@ use crate::{CriterionError, Error, UserError}; /// You must always cast it to a sort error or a criterion error. #[derive(Debug)] pub enum AscDescError { - BadLat, - BadLng, + InvalidLatitude, + InvalidLongitude, InvalidSyntax { name: String }, ReservedKeyword { name: String }, } @@ -21,10 +21,10 @@ pub enum AscDescError { impl fmt::Display for AscDescError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::BadLat => { + Self::InvalidLatitude => { write!(f, "Latitude must be contained between -90 and 90 degrees.",) } - Self::BadLng => { + Self::InvalidLongitude => { write!(f, "Longitude must be contained between -180 and 180 degrees.",) } Self::InvalidSyntax { name } => { @@ -44,7 +44,7 @@ impl fmt::Display for AscDescError { impl From for CriterionError { fn from(error: AscDescError) -> Self { match error { - AscDescError::BadLat | AscDescError::BadLng => { + AscDescError::InvalidLatitude | AscDescError::InvalidLongitude => { CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() } } AscDescError::InvalidSyntax { name } => CriterionError::InvalidName { name }, @@ -81,9 +81,9 @@ impl FromStr for Member { .map_err(|_| AscDescError::ReservedKeyword { name: text.to_string() }) })?; if !(-90.0..=90.0).contains(&lat) { - return Err(AscDescError::BadLat)?; + return Err(AscDescError::InvalidLatitude)?; } else if !(-180.0..=180.0).contains(&lng) { - return Err(AscDescError::BadLng)?; + return Err(AscDescError::InvalidLongitude)?; } Ok(Member::Geo([lat, lng])) } @@ -155,8 +155,8 @@ impl FromStr for AscDesc { #[derive(Debug)] pub enum SortError { - BadLat, - BadLng, + InvalidLatitude, + InvalidLongitude, BadGeoPointUsage { name: String }, InvalidName { name: String }, ReservedName { name: String }, @@ -167,8 +167,8 @@ pub enum SortError { impl From for SortError { fn from(error: AscDescError) -> Self { match error { - AscDescError::BadLat => SortError::BadLat, - AscDescError::BadLng => SortError::BadLng, + 
AscDescError::InvalidLatitude => SortError::InvalidLatitude, + AscDescError::InvalidLongitude => SortError::InvalidLongitude, AscDescError::InvalidSyntax { name } => SortError::InvalidName { name }, AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => { SortError::BadGeoPointUsage { name } @@ -187,12 +187,8 @@ impl From for SortError { impl fmt::Display for SortError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::BadLat => { - write!(f, "Latitude must be contained between -90 and 90 degrees.",) - } - Self::BadLng => { - write!(f, "Longitude must be contained between -180 and 180 degrees.",) - } + Self::InvalidLatitude => write!(f, "{}", AscDescError::InvalidLatitude), + Self::InvalidLongitude => write!(f, "{}", AscDescError::InvalidLongitude), Self::BadGeoPointUsage { name } => { write!( f, @@ -296,11 +292,11 @@ mod tests { ), ("_geoPoint(35, 85, 75):asc", ReservedKeyword { name: S("_geoPoint(35, 85, 75)") }), ("_geoPoint(18):asc", ReservedKeyword { name: S("_geoPoint(18)") }), - ("_geoPoint(200, 200):asc", BadLat), - ("_geoPoint(90.000001, 0):asc", BadLat), - ("_geoPoint(0, -180.000001):desc", BadLng), - ("_geoPoint(159.256, 130):asc", BadLat), - ("_geoPoint(12, -2021):desc", BadLng), + ("_geoPoint(200, 200):asc", InvalidLatitude), + ("_geoPoint(90.000001, 0):asc", InvalidLatitude), + ("_geoPoint(0, -180.000001):desc", InvalidLongitude), + ("_geoPoint(159.256, 130):asc", InvalidLatitude), + ("_geoPoint(12, -2021):desc", InvalidLongitude), ]; for (req, expected_error) in invalid_req { @@ -347,11 +343,11 @@ mod tests { S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), ), ( - AscDescError::BadLat, + AscDescError::InvalidLatitude, S("Latitude must be contained between -90 and 90 degrees."), ), ( - AscDescError::BadLng, + AscDescError::InvalidLongitude, S("Longitude must be contained between -180 and 180 degrees."), ), ]; From 2dfe24f067b6d8cb0e2d85f36b4cc7015cb1a82c Mon Sep 17 00:00:00 2001 From: Tom Parker-Shemilt Date: Sun, 10 Oct 2021 22:47:12 +0100 Subject: [PATCH 1053/1889] memmap -> memmap2 --- http-ui/Cargo.toml | 2 +- http-ui/src/main.rs | 2 +- milli/Cargo.toml | 2 +- milli/src/update/index_documents/helpers/clonable_mmap.rs | 2 +- milli/src/update/index_documents/helpers/grenad_helpers.rs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index afe29eaab..255dc165a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -11,7 +11,7 @@ byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } -memmap = "0.7.0" +memmap2 = "0.5.0" milli = { path = "../milli" } once_cell = "1.5.2" rayon = "1.5.0" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 27fc138dd..74ea3ccd8 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -860,7 +860,7 @@ async fn main() -> anyhow::Result<()> { } let file = file.into_std().await; - let mmap = unsafe { memmap::Mmap::map(&file).expect("can't map file") }; + let mmap = unsafe { memmap2::Mmap::map(&file).expect("can't map file") }; let method = match update_method.as_deref() { Some("replace") => String::from("replace"), diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 
bf001e155..cf3582818 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -23,7 +23,7 @@ human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
 meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" }
-memmap = "0.7.0"
+memmap2 = "0.5.0"
 obkv = "0.2.0"
 once_cell = "1.5.2"
 ordered-float = "2.1.1"
diff --git a/milli/src/update/index_documents/helpers/clonable_mmap.rs b/milli/src/update/index_documents/helpers/clonable_mmap.rs
index 691d10593..1c4d78506 100644
--- a/milli/src/update/index_documents/helpers/clonable_mmap.rs
+++ b/milli/src/update/index_documents/helpers/clonable_mmap.rs
@@ -1,6 +1,6 @@
 use std::sync::Arc;
 
-use memmap::Mmap;
+use memmap2::Mmap;
 
 /// Wrapper around Mmap allowing to virtually clone grenad-chunks
 /// in a parallel process like the indexing.
diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index fbdf2b42e..10662892b 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -70,7 +70,7 @@ pub unsafe fn into_clonable_grenad(
     reader: grenad::Reader<File>,
 ) -> Result<grenad::Reader<CursorClonableMmap>> {
     let file = reader.into_inner();
-    let mmap = memmap::Mmap::map(&file)?;
+    let mmap = memmap2::Mmap::map(&file)?;
     let cursor = io::Cursor::new(ClonableMmap::from(mmap));
     let reader = grenad::Reader::new(cursor)?;
     Ok(reader)
From ed7fd855afc7fc6854393e34ef697d8461e37cfe Mon Sep 17 00:00:00 2001
From: Tamo
Date: Mon, 11 Oct 2021 14:26:36 +0200
Subject: [PATCH 1054/1889] fix the wiki indexing benchmark

---
 benchmarks/benches/indexing.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs
index 66ecc7154..e4657d5b6 100644
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@@ -197,7 +197,8 @@ fn indexing_wiki(c: &mut Criterion) {
         move |index| {
             let update_builder = UpdateBuilder::new(0);
             let mut wtxn = index.write_txn().unwrap();
-            let builder = update_builder.index_documents(&mut wtxn, &index);
+            let mut builder = update_builder.index_documents(&mut wtxn, &index);
+            builder.enable_autogenerate_docids();
 
             let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
             builder.execute(documents, |_, _| ()).unwrap();
From 799f3d43c82eed4d53a936839ebefbffdaeeb65b Mon Sep 17 00:00:00 2001
From: mpostma
Date: Mon, 11 Oct 2021 14:58:40 +0200
Subject: [PATCH 1055/1889] fix serialization to obkv format

---
 milli/src/documents/serde.rs | 21 +++++++++++++++------
 milli/src/update/settings.rs |  1 +
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs
index 76dc8915c..036ec246a 100644
--- a/milli/src/documents/serde.rs
+++ b/milli/src/documents/serde.rs
@@ -1,9 +1,12 @@
+use std::collections::BTreeMap;
 use std::convert::TryInto;
+use std::io::Cursor;
 use std::{fmt, io};
 
 use byteorder::{BigEndian, WriteBytesExt};
 use obkv::KvWriter;
 use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer};
+use serde_json::Value;
 
 use super::{ByteCounter, DocumentsBatchIndex, Error};
 use crate::FieldId;
@@ -36,7 +39,7 @@ impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> {
             map: KvWriter::new(cursor),
             index: &mut self.index,
             writer: &mut self.writer,
-            buffer: Vec::new(),
+            mapped_documents: BTreeMap::new(),
         };
 
         Ok(map_serializer)
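The reason buffering fixes the obkv output: the obkv writer expects keys to
be inserted in ascending order, while `serialize_entry` receives fields in
whatever order the document lists them. Collecting the values in a
`BTreeMap<FieldId, Value>` and flushing it in `end()` restores that order,
since a `BTreeMap` always iterates its keys in ascending order. A minimal
sketch of the idea (hypothetical field ids, not milli's API):

    use std::collections::BTreeMap;

    fn main() {
        let mut mapped_documents = BTreeMap::new();
        mapped_documents.insert(2u16, "b");
        mapped_documents.insert(0u16, "id");
        mapped_documents.insert(1u16, "a");

        // Iteration is sorted by key no matter the insertion order above,
        // which is what an order-sensitive key-value writer needs.
        let ids: Vec<u16> = mapped_documents.keys().copied().collect();
        assert_eq!(ids, vec![0, 1, 2]);
    }

@@ -226,7 +229,7 @@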
 pub struct MapSerializer<'a, W> {
     map: KvWriter<io::Cursor<Vec<u8>>, FieldId>,
     index: &'a mut DocumentsBatchIndex,
     writer: W,
-    buffer: Vec<u8>,
+    mapped_documents: BTreeMap<FieldId, Value>,
 }
 
 /// This implementation of SerializeMap uses serialize_entry instead of serialize_key and
@@ -244,6 +247,14 @@ impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
     }
 
     fn end(mut self) -> Result<Self::Ok, Self::Error> {
+        let mut buf = Vec::new();
+        for (key, value) in self.mapped_documents {
+            buf.clear();
+            let mut cursor = Cursor::new(&mut buf);
+            serde_json::to_writer(&mut cursor, &value).map_err(Error::JsonError)?;
+            self.map.insert(key, cursor.into_inner()).map_err(Error::Io)?;
+        }
+
         let data = self.map.into_inner().map_err(Error::Io)?.into_inner();
         let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?;
 
@@ -265,11 +276,9 @@ impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
         let field_serializer = FieldSerializer { index: &mut self.index };
         let field_id: FieldId = key.serialize(field_serializer)?;
 
-        self.buffer.clear();
-        let mut cursor = io::Cursor::new(&mut self.buffer);
-        serde_json::to_writer(&mut cursor, value).map_err(Error::JsonError)?;
+        let value = serde_json::to_value(value).map_err(Error::JsonError)?;
 
-        self.map.insert(field_id, cursor.into_inner()).map_err(Error::Io)?;
+        self.mapped_documents.insert(field_id, value);
 
         Ok(())
     }
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 4aa79f6e3..87a829a1a 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1134,6 +1134,7 @@ mod tests {
             "release_date": 819676800
         }
     ]);
+
     let builder = IndexDocuments::new(&mut wtxn, &index, 1);
     builder.execute(content, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
From 99889a0ed08ccb0d9d27cbfd57fce712f2e70816 Mon Sep 17 00:00:00 2001
From: mpostma
Date: Mon, 11 Oct 2021 15:03:52 +0200
Subject: [PATCH 1056/1889] add obkv document serialization test

---
 milli/src/documents/mod.rs   | 8 ++++++++
 milli/src/update/settings.rs | 1 -
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index da4227e6b..f79c210fe 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -230,4 +230,12 @@ mod test {
         let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
         assert_eq!(nested, json!({ "toto": ["hello"] }));
     }
+
+    #[test]
+    fn out_of_order_fields() {
+        let _documents = documents!([
+            {"id": 1,"b": 0},
+            {"id": 2,"a": 0,"b": 0},
+        ]);
+    }
 }
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 87a829a1a..4aa79f6e3 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1134,7 +1134,6 @@ mod tests {
             "release_date": 819676800
         }
     ]);
-
     let builder = IndexDocuments::new(&mut wtxn, &index, 1);
     builder.execute(content, |_, _| ()).unwrap();
     wtxn.commit().unwrap();
From dd56e82dbae81f73d4820c8dfdc0f6c0f2e1ac7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?=
Date: Mon, 11 Oct 2021 15:20:35 +0200
Subject: [PATCH 1057/1889] Update version for the next release (v0.17.2)

---
 helpers/Cargo.toml | 2 +-
 http-ui/Cargo.toml | 2 +-
 infos/Cargo.toml   | 2 +-
 milli/Cargo.toml   | 2 +-
 search/Cargo.toml  | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index 041d35099..14287e4d1 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.17.1"
+version = "0.17.2"
 authors = ["Clément Renault "]
 edition = "2018"
 
diff --git
a/http-ui/Cargo.toml b/http-ui/Cargo.toml index afe29eaab..5d634ff8f 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.17.1" +version = "0.17.2" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index fbc4993a5..27bf9d607 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.17.1" +version = "0.17.2" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index bf001e155..7d93f5995 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.17.1" +version = "0.17.2" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 900b1f50a..9c2720ff9 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.17.1" +version = "0.17.2" authors = ["Clément Renault "] edition = "2018" From 8748df2ca456db11bfe34215821df8c6e04de2d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Wed, 15 Sep 2021 18:57:18 +0800 Subject: [PATCH 1058/1889] draft without error handling --- milli/Cargo.toml | 1 + milli/src/error.rs | 11 + milli/src/search/facet/filter_condition.rs | 254 ++++++++++++++++++++- 3 files changed, 265 insertions(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7d93f5995..50ada6bfb 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -24,6 +24,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } memmap = "0.7.0" +nom = "7" obkv = "0.2.0" once_cell = "1.5.2" ordered-float = "2.1.1" diff --git a/milli/src/error.rs b/milli/src/error.rs index 1f1cc5264..be3fbfdef 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -62,6 +62,9 @@ pub enum UserError { InvalidFilter(pest::error::Error), InvalidFilterAttribute(pest::error::Error), InvalidGeoField { document_id: Value, object: Value }, + InvalidFilterAttributeNom, + InvalidFilterValue, + InvalidSortName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, @@ -82,6 +85,11 @@ impl From for Error { Error::IoError(error) } } +impl From for UserError { + fn from(_: std::num::ParseFloatError) -> UserError { + UserError::InvalidFilterValue + } +} impl From for Error { fn from(error: fst::Error) -> Error { @@ -208,6 +216,9 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { + //TODO + Self::InvalidFilterAttributeNom => f.write_str("parser error "), + Self::InvalidFilterValue => f.write_str("parser error "), Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f1055b2f8..b14c3648f 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -4,6 +4,7 @@ use std::ops::Bound::{self, Excluded, Included}; use std::result::Result as StdResult; use std::str::FromStr; +use 
crate::error::UserError as IError; use either::Either; use heed::types::DecodeIgnore; use itertools::Itertools; @@ -13,6 +14,17 @@ use pest::iterators::{Pair, Pairs}; use pest::Parser; use roaring::RoaringBitmap; +use nom::{ + branch::alt, + bytes::complete::{tag, take_while1}, + character::complete::{char, multispace0}, + combinator::map, + error::ParseError, + multi::many0, + sequence::{delimited, preceded, tuple}, + IResult, +}; + use self::FilterCondition::*; use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; @@ -64,6 +76,156 @@ pub enum FilterCondition { Empty, } +struct ParseContext<'a> { + fields_ids_map: &'a FieldsIdsMap, + filterable_fields: &'a HashSet, +} +// impl From + +impl<'a> ParseContext<'a> { + fn parse_or_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { + let (input, lhs) = self.parse_and_nom(input)?; + let (input, ors) = many0(preceded(tag("OR"), |c| Self::parse_or_nom(self, c)))(input)?; + let expr = ors + .into_iter() + .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); + Ok((input, expr)) + } + fn parse_and_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { + let (input, lhs) = self.parse_not_nom(input)?; + let (input, ors) = many0(preceded(tag("AND"), |c| Self::parse_and_nom(self, c)))(input)?; + let expr = ors + .into_iter() + .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); + Ok((input, expr)) + } + + fn parse_not_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { + let r = alt(( + map( + preceded(alt((Self::ws(tag("!")), Self::ws(tag("NOT")))), |c| { + Self::parse_condition_expression(self, c) + }), + |e| e.negate(), + ), + |c| Self::parse_condition_expression(self, c), + ))(input); + return r; + } + + fn ws<'b, F: 'b, O, E: ParseError<&'b str>>( + inner: F, + ) -> impl FnMut(&'b str) -> IResult<&'b str, O, E> + where + F: Fn(&'b str) -> IResult<&'b str, O, E>, + { + delimited(multispace0, inner, multispace0) + } + + fn parse_simple_condition( + &self, + input: &'a str, + ) -> StdResult<(&'a str, FilterCondition), UserError> { + let operator = alt((tag(">"), tag(">="), tag("="), tag("<"), tag("!="), tag("<="))); + let (input, (key, op, value)) = + match tuple((Self::ws(Self::parse_key), operator, Self::ws(Self::parse_key)))(input) { + Ok((input, (key, op, value))) => (input, (key, op, value)), + Err(_) => return Err(UserError::InvalidFilterAttributeNom), + }; + + let fid = match field_id_by_key(self.fields_ids_map, self.filterable_fields, key)? 
{ + Some(fid) => fid, + None => return Err(UserError::InvalidFilterAttributeNom), + }; + let r = nom_parse::(value); + let k = match op { + ">" => Operator(fid, GreaterThan(value.parse::()?)), + "<" => Operator(fid, LowerThan(value.parse::()?)), + "<=" => Operator(fid, LowerThanOrEqual(value.parse::()?)), + ">=" => Operator(fid, GreaterThanOrEqual(value.parse::()?)), + "=" => Operator(fid, Equal(r.0.ok(), value.to_string().to_lowercase())), + "!=" => Operator(fid, NotEqual(r.0.ok(), value.to_string().to_lowercase())), + _ => unreachable!(), + }; + Ok((input, k)) + } + + fn parse_range_condition( + &'a self, + input: &'a str, + ) -> StdResult<(&str, FilterCondition), UserError> { + let (input, (key, from, _, to)) = match tuple(( + Self::ws(Self::parse_key), + Self::ws(Self::parse_key), + tag("TO"), + Self::ws(Self::parse_key), + ))(input) + { + Ok((input, (key, from, tag, to))) => (input, (key, from, tag, to)), + Err(_) => return Err(UserError::InvalidFilterAttributeNom), + }; + let fid = match field_id_by_key(self.fields_ids_map, self.filterable_fields, key)? { + Some(fid) => fid, + None => return Err(UserError::InvalidFilterAttributeNom), + }; + let res = Operator(fid, Between(from.parse::()?, to.parse::()?)); + Ok((input, res)) + } + + fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { + let l1 = |c| self.wrap(|c| self.parse_simple_condition(c), c); + let l2 = |c| self.wrap(|c| self.parse_range_condition(c), c); + let (input, condition) = match alt((l1, l2))(input) { + Ok((i, c)) => (i, c), + Err(_) => { + return Err(nom::Err::Error(nom::error::Error::from_error_kind( + "foo", + nom::error::ErrorKind::Fail, + ))) + } + }; + Ok((input, condition)) + } + fn wrap(&'a self, inner: F, input: &'a str) -> IResult<&'a str, FilterCondition> + where + F: Fn(&'a str) -> StdResult<(&'a str, FilterCondition), E>, + { + match inner(input) { + Ok(e) => Ok(e), + Err(_) => { + return Err(nom::Err::Error(nom::error::Error::from_error_kind( + "foo", + nom::error::ErrorKind::Fail, + ))) + } + } + } + + fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition> { + return alt(( + delimited( + Self::ws(char('(')), + |c| Self::parse_expression(self, c), + Self::ws(char(')')), + ), + |c| Self::parse_condition(self, c), + ))(input); + } + + fn parse_key(input: &str) -> IResult<&str, &str> { + let key = |input| take_while1(Self::is_key_component)(input); + alt((key, delimited(char('"'), key, char('"'))))(input) + } + fn is_key_component(c: char) -> bool { + c.is_alphanumeric() || ['_', '-', '.'].contains(&c) + } + + pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { + self.parse_or_nom(input) + } +} + +//for nom impl FilterCondition { pub fn from_array( rtxn: &heed::RoTxn, @@ -109,11 +271,75 @@ impl FilterCondition { Ok(ands) } - pub fn from_str( rtxn: &heed::RoTxn, index: &Index, expression: &str, + ) -> Result { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; + let ctx = + ParseContext { fields_ids_map: &fields_ids_map, filterable_fields: &filterable_fields }; + match ctx.parse_expression(expression) { + Ok((_, fc)) => Ok(fc), + Err(e) => { + println!("{:?}", e); + unreachable!() + } + } + } +} + +impl FilterCondition { + pub fn from_array_pest( + rtxn: &heed::RoTxn, + index: &Index, + array: I, + ) -> Result> + where + I: IntoIterator>, + J: IntoIterator, + A: AsRef, + B: AsRef, + { + let mut ands = None; + + for either in array { + match either { + 
Either::Left(array) => { + let mut ors = None; + for rule in array { + let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; + ors = match ors.take() { + Some(ors) => Some(Or(Box::new(ors), Box::new(condition))), + None => Some(condition), + }; + } + + if let Some(rule) = ors { + ands = match ands.take() { + Some(ands) => Some(And(Box::new(ands), Box::new(rule))), + None => Some(rule), + }; + } + } + Either::Right(rule) => { + let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; + ands = match ands.take() { + Some(ands) => Some(And(Box::new(ands), Box::new(condition))), + None => Some(condition), + }; + } + } + } + + Ok(ands) + } + + pub fn from_str_pest( + rtxn: &heed::RoTxn, + index: &Index, + expression: &str, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; @@ -586,6 +812,19 @@ impl FilterCondition { } } +fn field_id_by_key( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + key: &str, +) -> StdResult, IError> { + // lexing ensures that we at least have a key + if !filterable_fields.contains(key) { + return StdResult::Err(UserError::InvalidFilterAttributeNom); + } + + Ok(fields_ids_map.id(key)) +} + /// Retrieve the field id base on the pest value. /// /// Returns an error if the given value is not filterable. @@ -640,6 +879,19 @@ fn field_id( Ok(fields_ids_map.id(key.as_str())) } +fn nom_parse(input: &str) -> (StdResult, String) +where + T: FromStr, + T::Err: ToString, +{ + let result = match input.parse::() { + Ok(value) => Ok(value), + Err(e) => Err(UserError::InvalidFilterValue), + }; + + (result, input.to_string()) +} + /// Tries to parse the pest pair into the type `T` specified, always returns /// the original string that we tried to parse. 
/// From 50ad750ec119cc157f383d6717b4c7f010a1b245 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Thu, 16 Sep 2021 17:56:18 +0800 Subject: [PATCH 1059/1889] enhance error handling --- milli/src/error.rs | 9 +- milli/src/search/facet/filter_condition.rs | 241 +++++++++++---------- 2 files changed, 127 insertions(+), 123 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index be3fbfdef..a798539cd 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -64,6 +64,7 @@ pub enum UserError { InvalidGeoField { document_id: Value, object: Value }, InvalidFilterAttributeNom, InvalidFilterValue, + InvalidFilterNom { input: String }, InvalidSortName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, @@ -85,11 +86,6 @@ impl From for Error { Error::IoError(error) } } -impl From for UserError { - fn from(_: std::num::ParseFloatError) -> UserError { - UserError::InvalidFilterValue - } -} impl From for Error { fn from(error: fst::Error) -> Error { @@ -217,8 +213,7 @@ impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { //TODO - Self::InvalidFilterAttributeNom => f.write_str("parser error "), - Self::InvalidFilterValue => f.write_str("parser error "), + Self::InvalidFilterNom { input } => write!(f, "parser error {}", input), Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index b14c3648f..e6ed79230 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -4,7 +4,6 @@ use std::ops::Bound::{self, Excluded, Included}; use std::result::Result as StdResult; use std::str::FromStr; -use crate::error::UserError as IError; use either::Either; use heed::types::DecodeIgnore; use itertools::Itertools; @@ -19,7 +18,9 @@ use nom::{ bytes::complete::{tag, take_while1}, character::complete::{char, multispace0}, combinator::map, + error::ErrorKind, error::ParseError, + error::VerboseError, multi::many0, sequence::{delimited, preceded, tuple}, IResult, @@ -29,7 +30,7 @@ use self::FilterCondition::*; use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; -use crate::error::UserError; +use crate::error::{Error, UserError}; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; @@ -83,7 +84,10 @@ struct ParseContext<'a> { // impl From impl<'a> ParseContext<'a> { - fn parse_or_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { + fn parse_or_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { let (input, lhs) = self.parse_and_nom(input)?; let (input, ors) = many0(preceded(tag("OR"), |c| Self::parse_or_nom(self, c)))(input)?; let expr = ors @@ -91,7 +95,11 @@ impl<'a> ParseContext<'a> { .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); Ok((input, expr)) } - fn parse_and_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { + + fn parse_and_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { let (input, lhs) = self.parse_not_nom(input)?; let (input, ors) = 
many0(preceded(tag("AND"), |c| Self::parse_and_nom(self, c)))(input)?; let expr = ors @@ -100,119 +108,146 @@ impl<'a> ParseContext<'a> { Ok((input, expr)) } - fn parse_not_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { - let r = alt(( + fn parse_not_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + alt(( map( - preceded(alt((Self::ws(tag("!")), Self::ws(tag("NOT")))), |c| { + preceded(alt((self.ws(tag("!")), self.ws(tag("NOT")))), |c| { Self::parse_condition_expression(self, c) }), |e| e.negate(), ), |c| Self::parse_condition_expression(self, c), - ))(input); - return r; + ))(input) } - fn ws<'b, F: 'b, O, E: ParseError<&'b str>>( - inner: F, - ) -> impl FnMut(&'b str) -> IResult<&'b str, O, E> + fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> where - F: Fn(&'b str) -> IResult<&'b str, O, E>, + F: Fn(&'a str) -> IResult<&'a str, O, E>, + E: ParseError<&'a str>, { delimited(multispace0, inner, multispace0) } - fn parse_simple_condition( - &self, - input: &'a str, - ) -> StdResult<(&'a str, FilterCondition), UserError> { + fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { let operator = alt((tag(">"), tag(">="), tag("="), tag("<"), tag("!="), tag("<="))); let (input, (key, op, value)) = - match tuple((Self::ws(Self::parse_key), operator, Self::ws(Self::parse_key)))(input) { - Ok((input, (key, op, value))) => (input, (key, op, value)), - Err(_) => return Err(UserError::InvalidFilterAttributeNom), - }; - - let fid = match field_id_by_key(self.fields_ids_map, self.filterable_fields, key)? { - Some(fid) => fid, - None => return Err(UserError::InvalidFilterAttributeNom), - }; - let r = nom_parse::(value); + tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( + input, + )?; + let fid = self.parse_fid(input, key)?; + let r: StdResult>> = self.parse_numeric(value); let k = match op { - ">" => Operator(fid, GreaterThan(value.parse::()?)), - "<" => Operator(fid, LowerThan(value.parse::()?)), - "<=" => Operator(fid, LowerThanOrEqual(value.parse::()?)), - ">=" => Operator(fid, GreaterThanOrEqual(value.parse::()?)), - "=" => Operator(fid, Equal(r.0.ok(), value.to_string().to_lowercase())), - "!=" => Operator(fid, NotEqual(r.0.ok(), value.to_string().to_lowercase())), + "=" => Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())), + "!=" => Operator(fid, NotEqual(r.ok(), value.to_string().to_lowercase())), + ">" | "<" | "<=" | ">=" => { + return self.parse_numeric_unary_condition(input, fid, value) + } _ => unreachable!(), }; Ok((input, k)) } - fn parse_range_condition( - &'a self, - input: &'a str, - ) -> StdResult<(&str, FilterCondition), UserError> { - let (input, (key, from, _, to)) = match tuple(( - Self::ws(Self::parse_key), - Self::ws(Self::parse_key), - tag("TO"), - Self::ws(Self::parse_key), - ))(input) - { - Ok((input, (key, from, tag, to))) => (input, (key, from, tag, to)), - Err(_) => return Err(UserError::InvalidFilterAttributeNom), - }; - let fid = match field_id_by_key(self.fields_ids_map, self.filterable_fields, key)? 
{ - Some(fid) => fid, - None => return Err(UserError::InvalidFilterAttributeNom), - }; - let res = Operator(fid, Between(from.parse::()?, to.parse::()?)); - Ok((input, res)) - } - - fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { - let l1 = |c| self.wrap(|c| self.parse_simple_condition(c), c); - let l2 = |c| self.wrap(|c| self.parse_range_condition(c), c); - let (input, condition) = match alt((l1, l2))(input) { - Ok((i, c)) => (i, c), - Err(_) => { - return Err(nom::Err::Error(nom::error::Error::from_error_kind( - "foo", - nom::error::ErrorKind::Fail, - ))) - } - }; - Ok((input, condition)) - } - fn wrap(&'a self, inner: F, input: &'a str) -> IResult<&'a str, FilterCondition> + fn parse_numeric(&'a self, input: &'a str) -> StdResult> where - F: Fn(&'a str) -> StdResult<(&'a str, FilterCondition), E>, + E: ParseError<&'a str>, + T: std::str::FromStr, { - match inner(input) { - Ok(e) => Ok(e), + match input.parse::() { + Ok(n) => Ok(n), Err(_) => { - return Err(nom::Err::Error(nom::error::Error::from_error_kind( - "foo", - nom::error::ErrorKind::Fail, - ))) + return match input.chars().nth(0) { + Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), + None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), + }; } } } - fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition> { + fn parse_numeric_unary_condition( + &'a self, + input: &'a str, + fid: u16, + value: &'a str, + ) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let numeric: f64 = self.parse_numeric(value)?; + let k = match input { + ">" => Operator(fid, GreaterThan(numeric)), + "<" => Operator(fid, LowerThan(numeric)), + "<=" => Operator(fid, LowerThanOrEqual(numeric)), + ">=" => Operator(fid, GreaterThanOrEqual(numeric)), + _ => unreachable!(), + }; + Ok((input, k)) + } + + fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> + where + E: ParseError<&'a str>, + { + let error = match input.chars().nth(0) { + Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), + None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), + }; + if !self.filterable_fields.contains(key) { + return error; + } + match self.fields_ids_map.id(key) { + Some(fid) => Ok(fid), + None => error, + } + } + + fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let (input, (key, from, _, to)) = tuple(( + self.ws(|c| self.parse_key(c)), + self.ws(|c| self.parse_key(c)), + tag("TO"), + self.ws(|c| self.parse_key(c)), + ))(input)?; + + let fid = self.parse_fid(input, key)?; + let numeric_from: f64 = self.parse_numeric(from)?; + let numeric_to: f64 = self.parse_numeric(to)?; + let res = Operator(fid, Between(numeric_from, numeric_to)); + Ok((input, res)) + } + + fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let l1 = |c| self.parse_simple_condition(c); + let l2 = |c| self.parse_range_condition(c); + let (input, condition) = alt((l1, l2))(input)?; + Ok((input, condition)) + } + + fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> + where + E: ParseError<&'a str>, + { return alt(( - delimited( - Self::ws(char('(')), - |c| Self::parse_expression(self, c), - Self::ws(char(')')), - ), + delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), |c| Self::parse_condition(self, c), 
))(input); } - fn parse_key(input: &str) -> IResult<&str, &str> { + fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> + where + E: ParseError<&'a str>, + { let key = |input| take_while1(Self::is_key_component)(input); alt((key, delimited(char('"'), key, char('"'))))(input) } @@ -220,7 +255,10 @@ impl<'a> ParseContext<'a> { c.is_alphanumeric() || ['_', '-', '.'].contains(&c) } - pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition> { + pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { self.parse_or_nom(input) } } @@ -280,12 +318,9 @@ impl FilterCondition { let filterable_fields = index.filterable_fields(rtxn)?; let ctx = ParseContext { fields_ids_map: &fields_ids_map, filterable_fields: &filterable_fields }; - match ctx.parse_expression(expression) { + match ctx.parse_expression::>(expression) { Ok((_, fc)) => Ok(fc), - Err(e) => { - println!("{:?}", e); - unreachable!() - } + Err(e) => Err(Error::UserError(UserError::InvalidFilterNom { input: e.to_string() })), } } } @@ -812,19 +847,6 @@ impl FilterCondition { } } -fn field_id_by_key( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - key: &str, -) -> StdResult, IError> { - // lexing ensures that we at least have a key - if !filterable_fields.contains(key) { - return StdResult::Err(UserError::InvalidFilterAttributeNom); - } - - Ok(fields_ids_map.id(key)) -} - /// Retrieve the field id base on the pest value. /// /// Returns an error if the given value is not filterable. @@ -879,19 +901,6 @@ fn field_id( Ok(fields_ids_map.id(key.as_str())) } -fn nom_parse(input: &str) -> (StdResult, String) -where - T: FromStr, - T::Err: ToString, -{ - let result = match input.parse::() { - Ok(value) => Ok(value), - Err(e) => Err(UserError::InvalidFilterValue), - }; - - (result, input.to_string()) -} - /// Tries to parse the pest pair into the type `T` specified, always returns /// the original string that we tried to parse. 
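Patch 1059 threads a generic `E: ParseError<&'a str>` through every combinator so that `from_str` alone chooses the concrete error type (`VerboseError<&str>`), whose rendering is surfaced as `UserError::InvalidFilterNom`. One caveat in the operator list above: `alt` tries its branches in order, so `tag(">")` is reached before `tag(">=")` and consumes the `>` of `>=`, leaving a stray `=` behind; multi-character tags should be listed first. A minimal standalone sketch of both points, assuming nom 7 (not the crate's code):

use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::error::{ParseError, VerboseError};
use nom::IResult;

// Generic over the error type, like the ParseContext methods.
fn operator<'a, E: ParseError<&'a str>>(input: &'a str) -> IResult<&'a str, &'a str, E> {
    // Longest alternatives first: alt commits to the first branch that matches.
    alt((tag(">="), tag("<="), tag("!="), tag(">"), tag("<"), tag("=")))(input)
}

fn main() {
    // Instantiated with VerboseError, as parse_expression is in from_str.
    assert_eq!(operator::<VerboseError<&str>>(">= 22"), Ok((" 22", ">=")));
}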
/// From ac1df9d9d7fc15e223aed66e9f53c2184ea442af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Tue, 28 Sep 2021 17:17:52 +0800 Subject: [PATCH 1060/1889] fix typo and remove pest --- milli/src/search/facet/filter_condition.rs | 307 ++------------------- 1 file changed, 24 insertions(+), 283 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index e6ed79230..d51ae27cd 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -2,15 +2,10 @@ use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Excluded, Included}; use std::result::Result as StdResult; -use std::str::FromStr; use either::Either; use heed::types::DecodeIgnore; -use itertools::Itertools; use log::debug; -use pest::error::{Error as PestError, ErrorVariant}; -use pest::iterators::{Pair, Pairs}; -use pest::Parser; use roaring::RoaringBitmap; use nom::{ @@ -28,7 +23,8 @@ use nom::{ use self::FilterCondition::*; use self::Operator::*; -use super::parser::{FilterParser, Rule, PREC_CLIMBER}; +use super::parser::FilterParser; +use super::parser::{Rule, PREC_CLIMBER}; use super::FacetNumberRange; use crate::error::{Error, UserError}; use crate::heed_codec::facet::{ @@ -145,9 +141,7 @@ impl<'a> ParseContext<'a> { let k = match op { "=" => Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())), "!=" => Operator(fid, NotEqual(r.ok(), value.to_string().to_lowercase())), - ">" | "<" | "<=" | ">=" => { - return self.parse_numeric_unary_condition(input, fid, value) - } + ">" | "<" | "<=" | ">=" => return self.parse_numeric_unary_condition(op, fid, value), _ => unreachable!(), }; Ok((input, k)) @@ -326,92 +320,6 @@ impl FilterCondition { } impl FilterCondition { - pub fn from_array_pest( - rtxn: &heed::RoTxn, - index: &Index, - array: I, - ) -> Result> - where - I: IntoIterator>, - J: IntoIterator, - A: AsRef, - B: AsRef, - { - let mut ands = None; - - for either in array { - match either { - Either::Left(array) => { - let mut ors = None; - for rule in array { - let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; - ors = match ors.take() { - Some(ors) => Some(Or(Box::new(ors), Box::new(condition))), - None => Some(condition), - }; - } - - if let Some(rule) = ors { - ands = match ands.take() { - Some(ands) => Some(And(Box::new(ands), Box::new(rule))), - None => Some(rule), - }; - } - } - Either::Right(rule) => { - let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; - ands = match ands.take() { - Some(ands) => Some(And(Box::new(ands), Box::new(condition))), - None => Some(condition), - }; - } - } - } - - Ok(ands) - } - - pub fn from_str_pest( - rtxn: &heed::RoTxn, - index: &Index, - expression: &str, - ) -> Result { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let filterable_fields = index.filterable_fields(rtxn)?; - let lexed = - FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; - FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) - } - - fn from_pairs( - fim: &FieldsIdsMap, - ff: &HashSet, - expression: Pairs, - ) -> Result { - PREC_CLIMBER.climb( - expression, - |pair: Pair| match pair.as_rule() { - Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), - Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), - Rule::eq => Ok(Self::equal(fim, ff, pair)?), - Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), - Rule::leq => Ok(Self::lower_than_or_equal(fim, 
ff, pair)?), - Rule::less => Ok(Self::lower_than(fim, ff, pair)?), - Rule::between => Ok(Self::between(fim, ff, pair)?), - Rule::geo_radius => Ok(Self::geo_radius(fim, ff, pair)?), - Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), - Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), - Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), - _ => unreachable!(), - }, - |lhs: Result, op: Pair, rhs: Result| match op.as_rule() { - Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), - Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), - _ => unreachable!(), - }, - ) - } - fn negate(self) -> FilterCondition { match self { Operator(fid, op) => match op.negate() { @@ -484,128 +392,6 @@ impl FilterCondition { } Ok(Operator(fid, GeoLowerThan([lat.0, lng.0], distance))) } - - fn between( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - item: Pair, - ) -> Result { - let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { - Some(fid) => fid, - None => return Ok(Empty), - }; - - let (lresult, _) = pest_parse(items.next().unwrap()); - let (rresult, _) = pest_parse(items.next().unwrap()); - - let lvalue = lresult.map_err(UserError::InvalidFilter)?; - let rvalue = rresult.map_err(UserError::InvalidFilter)?; - - Ok(Operator(fid, Between(lvalue, rvalue))) - } - - fn equal( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - item: Pair, - ) -> Result { - let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { - Some(fid) => fid, - None => return Ok(Empty), - }; - - let value = items.next().unwrap(); - let (result, svalue) = pest_parse(value); - - let svalue = svalue.to_lowercase(); - Ok(Operator(fid, Equal(result.ok(), svalue))) - } - - fn greater_than( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - item: Pair, - ) -> Result { - let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { - Some(fid) => fid, - None => return Ok(Empty), - }; - - let value = items.next().unwrap(); - let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; - - Ok(Operator(fid, GreaterThan(value))) - } - - fn greater_than_or_equal( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - item: Pair, - ) -> Result { - let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { - Some(fid) => fid, - None => return Ok(Empty), - }; - - let value = items.next().unwrap(); - let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; - - Ok(Operator(fid, GreaterThanOrEqual(value))) - } - - fn lower_than( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - item: Pair, - ) -> Result { - let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? 
- { - Some(fid) => fid, - None => return Ok(Empty), - }; - - let value = items.next().unwrap(); - let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; - - Ok(Operator(fid, LowerThan(value))) - } - - fn lower_than_or_equal( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - item: Pair, - ) -> Result { - let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { - Some(fid) => fid, - None => return Ok(Empty), - }; - - let value = items.next().unwrap(); - let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; - - Ok(Operator(fid, LowerThanOrEqual(value))) - } } impl FilterCondition { @@ -855,72 +641,6 @@ impl FilterCondition { /// /// The pest pair is simply a string associated with a span, a location to highlight in /// the error message. -fn field_id( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - items: &mut Pairs, -) -> StdResult, PestError> { - // lexing ensures that we at least have a key - let key = items.next().unwrap(); - if key.as_rule() == Rule::reserved { - let message = match key.as_str() { - key if key.starts_with("_geoPoint") => { - format!( - "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. \ - Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", - ) - } - key @ "_geo" => { - format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression. \ - Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", - key - ) - } - key => format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression.", - key - ), - }; - return Err(PestError::new_from_span(ErrorVariant::CustomError { message }, key.as_span())); - } - - if !filterable_fields.contains(key.as_str()) { - return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` is not filterable, available filterable attributes are: {}.", - key.as_str(), - filterable_fields.iter().join(", "), - ), - }, - key.as_span(), - )); - } - - Ok(fields_ids_map.id(key.as_str())) -} - -/// Tries to parse the pest pair into the type `T` specified, always returns -/// the original string that we tried to parse. -/// -/// Returns the parsing error associated with the span if the conversion fails. -fn pest_parse(pair: Pair) -> (StdResult>, String) -where - T: FromStr, - T::Err: ToString, -{ - let result = match pair.as_str().parse::() { - Ok(value) => Ok(value), - Err(e) => Err(PestError::::new_from_span( - ErrorVariant::CustomError { message: e.to_string() }, - pair.as_span(), - )), - }; - - (result, pair.as_str().to_string()) -} - #[cfg(test)] mod tests { use big_s::S; @@ -991,6 +711,27 @@ mod tests { assert_eq!(condition, expected); } + #[test] + fn compare() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! 
{ S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); + let expected = Operator(0, LowerThan(20.0)); + + assert_eq!(condition, expected); + } + #[test] fn parentheses() { let path = tempfile::tempdir().unwrap(); From f7796edc7e6a814680b658701ac6331aee85cf42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Tue, 28 Sep 2021 17:23:49 +0800 Subject: [PATCH 1061/1889] remove everything about pest --- milli/Cargo.toml | 4 +--- milli/src/error.rs | 12 +++++++----- milli/src/lib.rs | 3 --- milli/src/search/facet/filter_condition.rs | 3 +-- milli/src/search/facet/mod.rs | 2 -- milli/src/search/facet/parser.rs | 12 ------------ milli/src/search/mod.rs | 1 - 7 files changed, 9 insertions(+), 28 deletions(-) delete mode 100644 milli/src/search/facet/parser.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 50ada6bfb..007d9d415 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -24,7 +24,6 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } memmap = "0.7.0" -nom = "7" obkv = "0.2.0" once_cell = "1.5.2" ordered-float = "2.1.1" @@ -40,8 +39,7 @@ tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } # facet filter parser -pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } -pest_derive = "2.1.0" +nom = "7" # documents words self-join itertools = "0.10.0" diff --git a/milli/src/error.rs b/milli/src/error.rs index a798539cd..b80238468 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,7 +7,6 @@ use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; -use crate::search::ParserRule; use crate::{CriterionError, DocumentId, FieldId, SortError}; pub type Object = Map; @@ -59,8 +58,6 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, - InvalidFilter(pest::error::Error), - InvalidFilterAttribute(pest::error::Error), InvalidGeoField { document_id: Value, object: Value }, InvalidFilterAttributeNom, InvalidFilterValue, @@ -226,12 +223,15 @@ impl fmt::Display for UserError { name_list ) } - Self::InvalidFilter(error) => error.fmt(f), Self::InvalidGeoField { document_id, object } => write!( f, "the document with the id: {} contains an invalid _geo field: {}", document_id, object ), + Self::InvalidAscDescSyntax { name } => { + write!(f, "invalid asc/desc syntax for {}", name) + } + Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); write!( @@ -242,7 +242,9 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco json ) } - Self::InvalidFilterAttribute(error) => error.fmt(f), + Self::InvalidSortName { name } => { + write!(f, "Invalid syntax for the sort parameter: {}", name) + } Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 838817d98..33b3d9c5c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,6 +1,3 @@ -#[macro_use] -extern crate 
pest_derive; - #[macro_use] pub mod documents; diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index d51ae27cd..e39687117 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -23,8 +23,7 @@ use nom::{ use self::FilterCondition::*; use self::Operator::*; -use super::parser::FilterParser; -use super::parser::{Rule, PREC_CLIMBER}; + use super::FacetNumberRange; use crate::error::{Error, UserError}; use crate::heed_codec::facet::{ diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ddf710e32..a5c041dd5 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -2,10 +2,8 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; pub use self::filter_condition::{FilterCondition, Operator}; -pub(crate) use self::parser::Rule as ParserRule; mod facet_distribution; mod facet_number; mod facet_string; mod filter_condition; -mod parser; diff --git a/milli/src/search/facet/parser.rs b/milli/src/search/facet/parser.rs deleted file mode 100644 index 1bff27cfb..000000000 --- a/milli/src/search/facet/parser.rs +++ /dev/null @@ -1,12 +0,0 @@ -use once_cell::sync::Lazy; -use pest::prec_climber::{Assoc, Operator, PrecClimber}; - -pub static PREC_CLIMBER: Lazy> = Lazy::new(|| { - use Assoc::*; - use Rule::*; - pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)]) -}); - -#[derive(Parser)] -#[grammar = "search/facet/grammar.pest"] -pub struct FilterParser; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index bec059d46..85d5dc8a7 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -14,7 +14,6 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub(crate) use self::facet::ParserRule; pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; From 7a90a101ee792af7df908cb2ef5e514a253096f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Tue, 28 Sep 2021 17:50:15 +0800 Subject: [PATCH 1062/1889] reorganize parser logic --- milli/src/search/facet/filter_condition.rs | 617 +-------------------- milli/src/search/facet/filter_parser.rs | 500 +++++++++++++++++ milli/src/search/facet/mod.rs | 3 +- milli/src/search/mod.rs | 2 +- 4 files changed, 518 insertions(+), 604 deletions(-) create mode 100644 milli/src/search/facet/filter_parser.rs diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index e39687117..c728e0acd 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,29 +1,16 @@ use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Excluded, Included}; -use std::result::Result as StdResult; use either::Either; use heed::types::DecodeIgnore; use log::debug; +use nom::error::VerboseError; use roaring::RoaringBitmap; -use nom::{ - branch::alt, - bytes::complete::{tag, take_while1}, - character::complete::{char, multispace0}, - combinator::map, - error::ErrorKind, - error::ParseError, - error::VerboseError, - multi::many0, - sequence::{delimited, preceded, tuple}, - IResult, -}; - use self::FilterCondition::*; -use 
self::Operator::*; +use super::filter_parser::{Operator, ParseContext}; use super::FacetNumberRange; use crate::error::{Error, UserError}; use crate::heed_codec::facet::{ @@ -33,37 +20,6 @@ use crate::{ distance_between_two_points, CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result, }; -#[derive(Debug, Clone, PartialEq)] -pub enum Operator { - GreaterThan(f64), - GreaterThanOrEqual(f64), - Equal(Option, String), - NotEqual(Option, String), - LowerThan(f64), - LowerThanOrEqual(f64), - Between(f64, f64), - GeoLowerThan([f64; 2], f64), - GeoGreaterThan([f64; 2], f64), -} - -impl Operator { - /// This method can return two operations in case it must express - /// an OR operation for the between case (i.e. `TO`). - fn negate(self) -> (Self, Option) { - match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), - GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(n, s) => (NotEqual(n, s), None), - NotEqual(n, s) => (Equal(n, s), None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), - GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), - GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), - } - } -} - #[derive(Debug, Clone, PartialEq)] pub enum FilterCondition { Operator(FieldId, Operator), @@ -72,190 +28,8 @@ pub enum FilterCondition { Empty, } -struct ParseContext<'a> { - fields_ids_map: &'a FieldsIdsMap, - filterable_fields: &'a HashSet, -} // impl From -impl<'a> ParseContext<'a> { - fn parse_or_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - let (input, lhs) = self.parse_and_nom(input)?; - let (input, ors) = many0(preceded(tag("OR"), |c| Self::parse_or_nom(self, c)))(input)?; - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_and_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - let (input, lhs) = self.parse_not_nom(input)?; - let (input, ors) = many0(preceded(tag("AND"), |c| Self::parse_and_nom(self, c)))(input)?; - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_not_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - alt(( - map( - preceded(alt((self.ws(tag("!")), self.ws(tag("NOT")))), |c| { - Self::parse_condition_expression(self, c) - }), - |e| e.negate(), - ), - |c| Self::parse_condition_expression(self, c), - ))(input) - } - - fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> - where - F: Fn(&'a str) -> IResult<&'a str, O, E>, - E: ParseError<&'a str>, - { - delimited(multispace0, inner, multispace0) - } - - fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - let operator = alt((tag(">"), tag(">="), tag("="), tag("<"), tag("!="), tag("<="))); - let (input, (key, op, value)) = - tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( - input, - )?; - let fid = self.parse_fid(input, key)?; - let r: StdResult>> = self.parse_numeric(value); - let k = match op { - "=" => Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())), - "!=" => Operator(fid, NotEqual(r.ok(), value.to_string().to_lowercase())), - ">" | "<" 
| "<=" | ">=" => return self.parse_numeric_unary_condition(op, fid, value), - _ => unreachable!(), - }; - Ok((input, k)) - } - - fn parse_numeric(&'a self, input: &'a str) -> StdResult> - where - E: ParseError<&'a str>, - T: std::str::FromStr, - { - match input.parse::() { - Ok(n) => Ok(n), - Err(_) => { - return match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - } - } - } - - fn parse_numeric_unary_condition( - &'a self, - input: &'a str, - fid: u16, - value: &'a str, - ) -> IResult<&'a str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - let numeric: f64 = self.parse_numeric(value)?; - let k = match input { - ">" => Operator(fid, GreaterThan(numeric)), - "<" => Operator(fid, LowerThan(numeric)), - "<=" => Operator(fid, LowerThanOrEqual(numeric)), - ">=" => Operator(fid, GreaterThanOrEqual(numeric)), - _ => unreachable!(), - }; - Ok((input, k)) - } - - fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> - where - E: ParseError<&'a str>, - { - let error = match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - if !self.filterable_fields.contains(key) { - return error; - } - match self.fields_ids_map.id(key) { - Some(fid) => Ok(fid), - None => error, - } - } - - fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - let (input, (key, from, _, to)) = tuple(( - self.ws(|c| self.parse_key(c)), - self.ws(|c| self.parse_key(c)), - tag("TO"), - self.ws(|c| self.parse_key(c)), - ))(input)?; - - let fid = self.parse_fid(input, key)?; - let numeric_from: f64 = self.parse_numeric(from)?; - let numeric_to: f64 = self.parse_numeric(to)?; - let res = Operator(fid, Between(numeric_from, numeric_to)); - Ok((input, res)) - } - - fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - let l1 = |c| self.parse_simple_condition(c); - let l2 = |c| self.parse_range_condition(c); - let (input, condition) = alt((l1, l2))(input)?; - Ok((input, condition)) - } - - fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - return alt(( - delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), - |c| Self::parse_condition(self, c), - ))(input); - } - - fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: ParseError<&'a str>, - { - let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) - } - fn is_key_component(c: char) -> bool { - c.is_alphanumeric() || ['_', '-', '.'].contains(&c) - } - - pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: ParseError<&'a str>, - { - self.parse_or_nom(input) - } -} - //for nom impl FilterCondition { pub fn from_array( @@ -269,7 +43,7 @@ impl FilterCondition { A: AsRef, B: AsRef, { - let mut ands = None; + let mut ands: Option = None; for either in array { match either { @@ -316,10 +90,7 @@ impl FilterCondition { Err(e) => Err(Error::UserError(UserError::InvalidFilterNom { input: e.to_string() })), } } -} - -impl FilterCondition { - fn negate(self) -> FilterCondition { + pub fn negate(self) -> FilterCondition { match self { 
Operator(fid, op) => match op.negate() { (op, None) => Operator(fid, op), @@ -389,7 +160,7 @@ impl FilterCondition { lng.1.clone(), )))?; } - Ok(Operator(fid, GeoLowerThan([lat.0, lng.0], distance))) + Ok(Operator(fid, Operator::GeoLowerThan([lat.0, lng.0], distance))) } } @@ -514,9 +285,9 @@ impl FilterCondition { // as the facets values are all in the same database and prefixed by the // field id and the level. let (left, right) = match operator { - GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), - GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), - Equal(number, string) => { + Operator::GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), + Operator::GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), + Operator::Equal(number, string) => { let (_original_value, string_docids) = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { @@ -538,23 +309,23 @@ impl FilterCondition { }; return Ok(string_docids | number_docids); } - NotEqual(number, string) => { + Operator::NotEqual(number, string) => { let all_numbers_ids = if number.is_some() { index.number_faceted_documents_ids(rtxn, field_id)? } else { RoaringBitmap::new() }; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; - let operator = Equal(*number, string.clone()); + let operator = Operator::Equal(*number, string.clone()); let docids = Self::evaluate_operator( rtxn, index, numbers_db, strings_db, field_id, &operator, )?; return Ok((all_numbers_ids | all_strings_ids) - docids); } - LowerThan(val) => (Included(f64::MIN), Excluded(*val)), - LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), - Between(left, right) => (Included(*left), Included(*right)), - GeoLowerThan(base_point, distance) => { + Operator::LowerThan(val) => (Included(f64::MIN), Excluded(*val)), + Operator::LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), + Operator::Between(left, right) => (Included(*left), Included(*right)), + Operator::GeoLowerThan(base_point, distance) => { let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, None => return Ok(RoaringBitmap::new()), @@ -570,7 +341,7 @@ impl FilterCondition { return Ok(result); } - GeoGreaterThan(point, distance) => { + Operator::GeoGreaterThan(point, distance) => { let result = Self::evaluate_operator( rtxn, index, @@ -631,361 +402,3 @@ impl FilterCondition { } } } - -/// Retrieve the field id base on the pest value. -/// -/// Returns an error if the given value is not filterable. -/// -/// Returns Ok(None) if the given value is filterable, but is not yet ascociated to a field_id. -/// -/// The pest pair is simply a string associated with a span, a location to highlight in -/// the error message. -#[cfg(test)] -mod tests { - use big_s::S; - use heed::EnvOpenOptions; - use maplit::hashset; - - use super::*; - use crate::update::Settings; - - #[test] - fn string() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("channel"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! 
{ S("channel") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); - let expected = Operator(0, Operator::Equal(None, S("ponce"))); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); - } - - #[test] - fn number() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("timestamp"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { "timestamp".into() }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = Operator(0, Between(22.0, 44.0)); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = - Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0)))); - assert_eq!(condition, expected); - } - - #[test] - fn compare() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); - let expected = Operator(0, LowerThan(20.0)); - - assert_eq!(condition, expected); - } - - #[test] - fn parentheses() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. 
- let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = Or( - Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(And( - Box::new(Operator(1, Between(22.0, 44.0))), - Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = Or( - Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(Or( - Box::new(Or( - Box::new(Operator(1, LowerThan(22.0))), - Box::new(Operator(1, GreaterThan(44.0))), - )), - Box::new(Operator(0, Operator::Equal(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - } - - #[test] - fn reserved_field_names() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let rtxn = index.read_txn().unwrap(); - - let error = FilterCondition::from_str(&rtxn, &index, "_geo = 12").unwrap_err(); - assert!(error - .to_string() - .contains("`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); - - let error = - FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoDistance` is a reserved keyword and thus can't be used as a filter expression."), - "{}", - error.to_string() - ); - - let error = FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); - - let error = - FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); - } - - #[test] - fn geo_radius() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - // _geo is not filterable - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 12, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("attribute `_geo` is not filterable, available filterable attributes are:"),); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - // basic test - let condition = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); - let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); - assert_eq!(condition, expected); - - // basic test with latitude and longitude at the max angle - let condition = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(90, 180, 2000)").unwrap(); - let expected = Operator(0, GeoLowerThan([90., 180.], 2000.)); - assert_eq!(condition, expected); - - // basic test with latitude and longitude at the min angle - let condition = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90, -180, 2000)").unwrap(); - let expected = Operator(0, GeoLowerThan([-90., -180.], 2000.)); - assert_eq!(condition, expected); - - // test the negation of the GeoLowerThan - let condition = - FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); - let expected = Operator(0, GeoGreaterThan([50., 18.], 2000.500)); - assert_eq!(condition, expected); - - // composition of multiple operations - let condition = FilterCondition::from_str( - &rtxn, - &index, - "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", - ) - .unwrap(); - let expected = Or( - Box::new(And( - Box::new(Operator(0, GeoGreaterThan([1., 2.], 300.))), - Box::new(Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), - )), - Box::new(Operator(1, LowerThanOrEqual(10.))), - ); - assert_eq!(condition, expected); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have enough parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius have too many parameters - let result = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, 
&index, "_geoRadius(-10, 250, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - } - - #[test] - fn from_array() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array( - &rtxn, - &index, - vec![ - Either::Right("channel = gotaga"), - Either::Left(vec!["timestamp = 44", "channel != ponce"]), - ], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga AND (timestamp = 44 OR channel != ponce)", - ) - .unwrap(); - assert_eq!(condition, expected); - } -} diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs new file mode 100644 index 000000000..53a51ca49 --- /dev/null +++ b/milli/src/search/facet/filter_parser.rs @@ -0,0 +1,500 @@ +use std::collections::HashSet; +use std::fmt::Debug; +use std::result::Result as StdResult; + +use super::FilterCondition; +use crate::{FieldId, FieldsIdsMap}; +use nom::{ + branch::alt, + bytes::complete::{tag, take_while1}, + character::complete::{char, multispace0}, + combinator::map, + error::ErrorKind, + error::ParseError, + error::VerboseError, + multi::many0, + sequence::{delimited, preceded, tuple}, + IResult, +}; + +use self::Operator::*; +#[derive(Debug, Clone, PartialEq)] +pub enum Operator { + GreaterThan(f64), + GreaterThanOrEqual(f64), + Equal(Option, String), + NotEqual(Option, String), + LowerThan(f64), + LowerThanOrEqual(f64), + Between(f64, f64), + GeoLowerThan([f64; 2], f64), + GeoGreaterThan([f64; 2], f64), +} + +impl Operator { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). 
+ pub fn negate(self) -> (Self, Option) { + match self { + GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThanOrEqual(n) => (LowerThan(n), None), + Equal(n, s) => (NotEqual(n, s), None), + NotEqual(n, s) => (Equal(n, s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + } + } +} + +pub struct ParseContext<'a> { + pub fields_ids_map: &'a FieldsIdsMap, + pub filterable_fields: &'a HashSet, +} + +impl<'a> ParseContext<'a> { + fn parse_or_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let (input, lhs) = self.parse_and_nom(input)?; + let (input, ors) = many0(preceded(tag("OR"), |c| Self::parse_or_nom(self, c)))(input)?; + let expr = ors + .into_iter() + .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); + Ok((input, expr)) + } + + fn parse_and_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let (input, lhs) = self.parse_not_nom(input)?; + let (input, ors) = many0(preceded(tag("AND"), |c| Self::parse_and_nom(self, c)))(input)?; + let expr = ors + .into_iter() + .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); + Ok((input, expr)) + } + + fn parse_not_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + alt(( + map( + preceded(alt((self.ws(tag("!")), self.ws(tag("NOT")))), |c| { + Self::parse_condition_expression(self, c) + }), + |e| e.negate(), + ), + |c| Self::parse_condition_expression(self, c), + ))(input) + } + + fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> + where + F: Fn(&'a str) -> IResult<&'a str, O, E>, + E: ParseError<&'a str>, + { + delimited(multispace0, inner, multispace0) + } + + fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let operator = alt((tag(">"), tag(">="), tag("="), tag("<"), tag("!="), tag("<="))); + let (input, (key, op, value)) = + tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( + input, + )?; + let fid = self.parse_fid(input, key)?; + let r: StdResult>> = self.parse_numeric(value); + let k = match op { + "=" => FilterCondition::Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())), + "!=" => { + FilterCondition::Operator(fid, NotEqual(r.ok(), value.to_string().to_lowercase())) + } + ">" | "<" | "<=" | ">=" => return self.parse_numeric_unary_condition(op, fid, value), + _ => unreachable!(), + }; + Ok((input, k)) + } + + fn parse_numeric(&'a self, input: &'a str) -> StdResult> + where + E: ParseError<&'a str>, + T: std::str::FromStr, + { + match input.parse::() { + Ok(n) => Ok(n), + Err(_) => { + return match input.chars().nth(0) { + Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), + None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), + }; + } + } + } + + fn parse_numeric_unary_condition( + &'a self, + input: &'a str, + fid: u16, + value: &'a str, + ) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let numeric: f64 = self.parse_numeric(value)?; + let k = match input { + ">" => FilterCondition::Operator(fid, GreaterThan(numeric)), + "<" => FilterCondition::Operator(fid, LowerThan(numeric)), + "<=" => FilterCondition::Operator(fid, LowerThanOrEqual(numeric)), + ">=" => 
FilterCondition::Operator(fid, GreaterThanOrEqual(numeric)), + _ => unreachable!(), + }; + Ok((input, k)) + } + + fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> + where + E: ParseError<&'a str>, + { + let error = match input.chars().nth(0) { + Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), + None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), + }; + if !self.filterable_fields.contains(key) { + return error; + } + match self.fields_ids_map.id(key) { + Some(fid) => Ok(fid), + None => error, + } + } + + fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let (input, (key, from, _, to)) = tuple(( + self.ws(|c| self.parse_key(c)), + self.ws(|c| self.parse_key(c)), + tag("TO"), + self.ws(|c| self.parse_key(c)), + ))(input)?; + + let fid = self.parse_fid(input, key)?; + let numeric_from: f64 = self.parse_numeric(from)?; + let numeric_to: f64 = self.parse_numeric(to)?; + let res = FilterCondition::Operator(fid, Between(numeric_from, numeric_to)); + Ok((input, res)) + } + + fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let l1 = |c| self.parse_simple_condition(c); + let l2 = |c| self.parse_range_condition(c); + let (input, condition) = alt((l1, l2))(input)?; + Ok((input, condition)) + } + + fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + return alt(( + delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), + |c| Self::parse_condition(self, c), + ))(input); + } + + fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> + where + E: ParseError<&'a str>, + { + let key = |input| take_while1(Self::is_key_component)(input); + alt((key, delimited(char('"'), key, char('"'))))(input) + } + fn is_key_component(c: char) -> bool { + c.is_alphanumeric() || ['_', '-', '.'].contains(&c) + } + + pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + self.parse_or_nom(input) + } +} + +#[cfg(test)] +mod tests { + use big_s::S; + use either::Either; + use heed::EnvOpenOptions; + use maplit::hashset; + + use super::*; + use crate::{update::Settings, Index}; + + #[test] + fn string() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut map = index.fields_ids_map(&wtxn).unwrap(); + map.insert("channel"); + index.put_fields_ids_map(&mut wtxn, &map).unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { S("channel") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. 
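+        // Values are lowercased at parse time, so `Ponce` is expected to
+        // compare equal to `ponce`.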
+ let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); + let expected = FilterCondition::Operator(0, Operator::Equal(None, S("ponce"))); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); + let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); + let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); + assert_eq!(condition, expected); + } + + #[test] + fn number() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut map = index.fields_ids_map(&wtxn).unwrap(); + map.insert("timestamp"); + index.put_fields_ids_map(&mut wtxn, &map).unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { "timestamp".into() }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); + let expected = FilterCondition::Operator(0, Between(22.0, 44.0)); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); + let expected = FilterCondition::Or( + Box::new(FilterCondition::Operator(0, LowerThan(22.0))), + Box::new(FilterCondition::Operator(0, GreaterThan(44.0))), + ); + assert_eq!(condition, expected); + } + + #[test] + fn compare() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); + let expected = FilterCondition::Operator(0, LowerThan(20.0)); + + assert_eq!(condition, expected); + } + + #[test] + fn parentheses() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. 
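+        // AND binds tighter than OR; parentheses override that precedence,
+        // and NOT distributes over the parenthesized group via negate().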
+ let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = FilterCondition::Or( + Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(FilterCondition::And( + Box::new(FilterCondition::Operator(1, Between(22.0, 44.0))), + Box::new(FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = FilterCondition::Or( + Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(FilterCondition::Or( + Box::new(FilterCondition::Or( + Box::new(FilterCondition::Operator(1, LowerThan(22.0))), + Box::new(FilterCondition::Operator(1, GreaterThan(44.0))), + )), + Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + } + + #[test] + fn from_array() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array( + &rtxn, + &index, + vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga AND (timestamp = 44 OR channel != ponce)", + ) + .unwrap(); + assert_eq!(condition, expected); + } + #[test] + fn geo_radius() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // basic test + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); + assert_eq!(condition, expected); + + // test the negation of the GeoLowerThan + let condition = + FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); + let expected = Operator(0, GeoGreaterThan([50., 18.], 2000.500)); + assert_eq!(condition, expected); + + // composition of multiple operations + let condition = FilterCondition::from_str( + &rtxn, + &index, + "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", + ) + .unwrap(); + let expected = Or( + Box::new(And( + Box::new(Operator(0, GeoGreaterThan([1., 2.], 300.))), + Box::new(Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), + )), + Box::new(Operator(1, LowerThanOrEqual(10.))), + ); + assert_eq!(condition, expected); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have enough parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have too many parameters + let result = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-200, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 181, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + } +} diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index a5c041dd5..3efa0262f 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,9 +1,10 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; -pub use self::filter_condition::{FilterCondition, Operator}; +pub use self::filter_condition::FilterCondition; mod facet_distribution; mod facet_number; mod facet_string; mod filter_condition; +mod 
filter_parser; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 85d5dc8a7..9b76ca851 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -14,7 +14,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; +pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; From 469d92c569a41e0f98a81e9597ccfec02466282c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Sun, 10 Oct 2021 14:50:59 +0800 Subject: [PATCH 1063/1889] tweak error handling --- milli/src/error.rs | 6 +- milli/src/search/facet/filter_condition.rs | 75 ++++------------------ milli/src/search/facet/filter_parser.rs | 71 +++++++++++++++++--- 3 files changed, 74 insertions(+), 78 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index b80238468..3ae18165f 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -210,6 +210,8 @@ impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { //TODO + Self::InvalidFilterAttributeNom => write!(f, "parser error "), + Self::InvalidFilterValue => write!(f, "parser error "), Self::InvalidFilterNom { input } => write!(f, "parser error {}", input), Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::CriterionError(error) => write!(f, "{}", error), @@ -228,10 +230,6 @@ impl fmt::Display for UserError { "the document with the id: {} contains an invalid _geo field: {}", document_id, object ), - Self::InvalidAscDescSyntax { name } => { - write!(f, "invalid asc/desc syntax for {}", name) - } - Self::InvalidCriterionName { name } => write!(f, "invalid criterion {}", name), Self::InvalidDocumentId { document_id } => { let json = serde_json::to_string(document_id).unwrap(); write!( diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index c728e0acd..c4fa8e561 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -5,7 +5,7 @@ use std::ops::Bound::{self, Excluded, Included}; use either::Either; use heed::types::DecodeIgnore; use log::debug; -use nom::error::VerboseError; +use nom::error::{convert_error, VerboseError}; use roaring::RoaringBitmap; use self::FilterCondition::*; @@ -87,7 +87,15 @@ impl FilterCondition { ParseContext { fields_ids_map: &fields_ids_map, filterable_fields: &filterable_fields }; match ctx.parse_expression::>(expression) { Ok((_, fc)) => Ok(fc), - Err(e) => Err(Error::UserError(UserError::InvalidFilterNom { input: e.to_string() })), + Err(e) => { + match e { + nom::Err::Error(x) => { + println!("verbose err:\n{}", convert_error(expression, x)) + } + _ => unreachable!(), + } + Err(Error::UserError(UserError::InvalidFilterNom { input: "whatever".to_string() })) + } } } pub fn negate(self) -> FilterCondition { @@ -101,67 +109,6 @@ impl FilterCondition { Empty => Empty, } } - - fn geo_radius( - fields_ids_map: &FieldsIdsMap, - filterable_fields: &HashSet, - item: Pair, - ) -> Result { - if !filterable_fields.contains("_geo") { - return Err(UserError::InvalidFilterAttribute(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `_geo` is not filterable, available filterable attributes are: {}", - 
filterable_fields.iter().join(", "), - ), - }, - item.as_span(), - )))?; - } - let mut items = item.into_inner(); - let fid = match fields_ids_map.id("_geo") { - Some(fid) => fid, - None => return Ok(Empty), - }; - let parameters_item = items.next().unwrap(); - // We don't need more than 3 parameters, but to handle errors correctly we are still going - // to extract the first 4 parameters - let param_span = parameters_item.as_span(); - let parameters = parameters_item - .into_inner() - .take(4) - .map(|param| (param.clone(), param.as_span())) - .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) - .collect::, _>>() - .map_err(UserError::InvalidFilter)?; - if parameters.len() != 3 { - return Err(UserError::InvalidFilter(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), - }, - // we want to point to the last parameters and if there was no parameters we - // point to the parenthesis - parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), - )))?; - } - let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); - if !(-90.0..=90.0).contains(&lat.0) { - return Err(UserError::InvalidFilter(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!("Latitude must be contained between -90 and 90 degrees."), - }, - lat.1.clone(), - )))?; - } else if !(-180.0..=180.0).contains(&lng.0) { - return Err(UserError::InvalidFilter(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!("Longitude must be contained between -180 and 180 degrees."), - }, - lng.1.clone(), - )))?; - } - Ok(Operator(fid, Operator::GeoLowerThan([lat.0, lng.0], distance))) - } } impl FilterCondition { @@ -348,7 +295,7 @@ impl FilterCondition { numbers_db, strings_db, field_id, - &GeoLowerThan(point.clone(), *distance), + &Operator::GeoLowerThan(point.clone(), *distance), )?; let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; return Ok(geo_faceted_doc_ids - result); diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 53a51ca49..419c148d7 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -9,10 +9,11 @@ use nom::{ bytes::complete::{tag, take_while1}, character::complete::{char, multispace0}, combinator::map, - error::ErrorKind, error::ParseError, error::VerboseError, + error::{ContextError, ErrorKind}, multi::many0, + multi::separated_list1, sequence::{delimited, preceded, tuple}, IResult, }; @@ -43,6 +44,8 @@ impl Operator { LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), + GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), } } } @@ -193,13 +196,52 @@ impl<'a> ParseContext<'a> { Ok((input, res)) } + fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + where + E: ParseError<&'a str>, + { + let (input, args) = preceded( + tag("_geoRadius"), + delimited( + tag("("), + separated_list1(tag(","), self.ws(|c| self.parse_value(c))), + tag(")"), + ), + )(input)?; + + if args.len() != 3 { + let e = E::from_char(input, '('); + return Err(nom::Err::Failure(e)); + } + let lat = self.parse_numeric(args[0])?; + let lng = self.parse_numeric(args[1])?; + let dis = self.parse_numeric(args[2])?; + + let fid = match 
self.fields_ids_map.id("_geo") { + Some(fid) => fid, + None => return Ok((input, FilterCondition::Empty)), + }; + + if let Some(span) = (!(-181.0..181.).contains(&lat)) + .then(|| &lat) + .or((!(-181.0..181.).contains(&lng)).then(|| &lng)) + { + let e = E::from_char(input, '('); + return Err(nom::Err::Failure(e)); + } + + let res = FilterCondition::Operator(fid, GeoLowerThan([lat, lng], dis)); + Ok((input, res)) + } + fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: ParseError<&'a str>, { + let l0 = |c| self.parse_geo_radius(c); let l1 = |c| self.parse_simple_condition(c); let l2 = |c| self.parse_range_condition(c); - let (input, condition) = alt((l1, l2))(input)?; + let (input, condition) = alt((l0, l1, l2))(input)?; Ok((input, condition)) } @@ -220,6 +262,15 @@ impl<'a> ParseContext<'a> { let key = |input| take_while1(Self::is_key_component)(input); alt((key, delimited(char('"'), key, char('"'))))(input) } + + fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> + where + E: ParseError<&'a str>, + { + let key = |input| take_while1(Self::is_key_component)(input); + alt((key, delimited(char('"'), key, char('"'))))(input) + } + fn is_key_component(c: char) -> bool { c.is_alphanumeric() || ['_', '-', '.'].contains(&c) } @@ -431,13 +482,13 @@ mod tests { // basic test let condition = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); - let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); + let expected = FilterCondition::Operator(0, GeoLowerThan([12., 13.0005], 2000.)); assert_eq!(condition, expected); // test the negation of the GeoLowerThan let condition = FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); - let expected = Operator(0, GeoGreaterThan([50., 18.], 2000.500)); + let expected = FilterCondition::Operator(0, GeoGreaterThan([50., 18.], 2000.500)); assert_eq!(condition, expected); // composition of multiple operations @@ -446,13 +497,13 @@ mod tests { &index, "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", ) - .unwrap(); - let expected = Or( - Box::new(And( - Box::new(Operator(0, GeoGreaterThan([1., 2.], 300.))), - Box::new(Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), + .unwrap_or_else(|e| FilterCondition::Empty); + let expected = FilterCondition::Or( + Box::new(FilterCondition::And( + Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), + Box::new(FilterCondition::Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), )), - Box::new(Operator(1, LowerThanOrEqual(10.))), + Box::new(FilterCondition::Operator(1, LowerThanOrEqual(10.))), ); assert_eq!(condition, expected); From 28f9be8d7c3cdc449d5b9a125bd600bc685640a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Tue, 12 Oct 2021 08:20:01 +0800 Subject: [PATCH 1064/1889] support syntax --- milli/src/search/facet/filter_parser.rs | 86 +++++++++++++++++-------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 419c148d7..7f7ec83c2 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -58,10 +58,12 @@ pub struct ParseContext<'a> { impl<'a> ParseContext<'a> { fn parse_or_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { let (input, lhs) = self.parse_and_nom(input)?; - let 
(input, ors) = many0(preceded(tag("OR"), |c| Self::parse_or_nom(self, c)))(input)?; + let (input, ors) = + many0(preceded(self.ws(tag("OR")), |c| Self::parse_or_nom(self, c)))(input)?; + let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); @@ -70,10 +72,16 @@ impl<'a> ParseContext<'a> { fn parse_and_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { let (input, lhs) = self.parse_not_nom(input)?; - let (input, ors) = many0(preceded(tag("AND"), |c| Self::parse_and_nom(self, c)))(input)?; + // let (input, lhs) = alt(( + // delimited(self.ws(char('(')), |c| Self::parse_not_nom(self, c), self.ws(char(')'))), + // |c| self.parse_not_nom(c), + // ))(input)?; + + let (input, ors) = + many0(preceded(self.ws(tag("AND")), |c| Self::parse_and_nom(self, c)))(input)?; let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); @@ -82,7 +90,7 @@ impl<'a> ParseContext<'a> { fn parse_not_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { alt(( map( @@ -98,20 +106,26 @@ impl<'a> ParseContext<'a> { fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> where F: Fn(&'a str) -> IResult<&'a str, O, E>, - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { delimited(multispace0, inner, multispace0) } fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + std::fmt::Debug, { - let operator = alt((tag(">"), tag(">="), tag("="), tag("<"), tag("!="), tag("<="))); - let (input, (key, op, value)) = - tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( - input, - )?; + let operator = alt((tag("<="), tag(">="), tag(">"), tag("="), tag("<"), tag("!="))); + let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( + input, + ); + let (input, (key, op, value)) = match k { + Ok(o) => o, + Err(e) => { + return Err(e); + } + }; + let fid = self.parse_fid(input, key)?; let r: StdResult>> = self.parse_numeric(value); let k = match op { @@ -127,7 +141,7 @@ impl<'a> ParseContext<'a> { fn parse_numeric(&'a self, input: &'a str) -> StdResult> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, T: std::str::FromStr, { match input.parse::() { @@ -148,7 +162,7 @@ impl<'a> ParseContext<'a> { value: &'a str, ) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { let numeric: f64 = self.parse_numeric(value)?; let k = match input { @@ -163,7 +177,7 @@ impl<'a> ParseContext<'a> { fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { let error = match input.chars().nth(0) { Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), @@ -180,7 +194,7 @@ impl<'a> ParseContext<'a> { fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { let (input, (key, from, _, to)) = tuple(( self.ws(|c| self.parse_key(c)), @@ -193,19 +207,20 @@ impl<'a> ParseContext<'a> { let numeric_from: f64 = self.parse_numeric(from)?; let numeric_to: f64 = self.parse_numeric(to)?; let res = FilterCondition::Operator(fid, Between(numeric_from, numeric_to)); + 
Ok((input, res)) } fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { let (input, args) = preceded( tag("_geoRadius"), delimited( - tag("("), + char('('), separated_list1(tag(","), self.ws(|c| self.parse_value(c))), - tag(")"), + char(')'), ), )(input)?; @@ -236,7 +251,7 @@ impl<'a> ParseContext<'a> { fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { let l0 = |c| self.parse_geo_radius(c); let l1 = |c| self.parse_simple_condition(c); @@ -247,12 +262,12 @@ impl<'a> ParseContext<'a> { fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { - return alt(( + alt(( delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), |c| Self::parse_condition(self, c), - ))(input); + ))(input) } fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> @@ -265,7 +280,7 @@ impl<'a> ParseContext<'a> { fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { let key = |input| take_while1(Self::is_key_component)(input); alt((key, delimited(char('"'), key, char('"'))))(input) @@ -277,9 +292,10 @@ impl<'a> ParseContext<'a> { pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + Debug, { - self.parse_or_nom(input) + let a = self.parse_or_nom(input); + a } } @@ -506,6 +522,24 @@ mod tests { Box::new(FilterCondition::Operator(1, LowerThanOrEqual(10.))), ); assert_eq!(condition, expected); + } + + #[test] + fn geo_radius_error() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); // georadius don't have any parameters let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); From 70f576d5d39bfffc289f2d506a8cdd6a0c060fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Tue, 12 Oct 2021 10:05:10 +0800 Subject: [PATCH 1065/1889] error handling --- milli/src/search/facet/filter_condition.rs | 17 +++--- milli/src/search/facet/filter_parser.rs | 63 +++++++++++++--------- 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index c4fa8e561..96c8867ec 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; + use std::fmt::Debug; use std::ops::Bound::{self, Excluded, Included}; @@ -17,7 +17,7 @@ use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; use crate::{ - distance_between_two_points, CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result, + distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result, }; #[derive(Debug, Clone, PartialEq)] @@ -88,13 +88,14 @@ impl FilterCondition { match ctx.parse_expression::>(expression) { Ok((_, fc)) => Ok(fc), Err(e) => { - match e { - nom::Err::Error(x) => { - println!("verbose err:\n{}", convert_error(expression, x)) - } + let ve = match e { + nom::Err::Error(x) => x, + nom::Err::Failure(x) => x, _ => unreachable!(), - } - Err(Error::UserError(UserError::InvalidFilterNom { input: "whatever".to_string() })) + }; + Err(Error::UserError(UserError::InvalidFilterNom { + input: convert_error(expression, ve).to_string(), + })) } } } diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 7f7ec83c2..72acaecfb 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -58,7 +58,7 @@ pub struct ParseContext<'a> { impl<'a> ParseContext<'a> { fn parse_or_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { let (input, lhs) = self.parse_and_nom(input)?; let (input, ors) = @@ -72,7 +72,7 @@ impl<'a> ParseContext<'a> { fn parse_and_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { let (input, lhs) = self.parse_not_nom(input)?; // let (input, lhs) = alt(( @@ -90,7 +90,7 @@ impl<'a> ParseContext<'a> { fn parse_not_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { alt(( map( @@ -106,14 +106,14 @@ impl<'a> ParseContext<'a> { fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> where F: Fn(&'a str) -> IResult<&'a str, O, E>, - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { delimited(multispace0, inner, multispace0) } fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + std::fmt::Debug, + E: ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug, { let operator = alt((tag("<="), tag(">="), tag(">"), tag("="), tag("<"), 
tag("!="))); let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( @@ -141,7 +141,7 @@ impl<'a> ParseContext<'a> { fn parse_numeric(&'a self, input: &'a str) -> StdResult> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, T: std::str::FromStr, { match input.parse::() { @@ -162,7 +162,7 @@ impl<'a> ParseContext<'a> { value: &'a str, ) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { let numeric: f64 = self.parse_numeric(value)?; let k = match input { @@ -177,7 +177,7 @@ impl<'a> ParseContext<'a> { fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { let error = match input.chars().nth(0) { Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), @@ -194,7 +194,7 @@ impl<'a> ParseContext<'a> { fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { let (input, (key, from, _, to)) = tuple(( self.ws(|c| self.parse_key(c)), @@ -213,20 +213,34 @@ impl<'a> ParseContext<'a> { fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { - let (input, args) = preceded( + let err_msg_args_incomplete:&'static str = + "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; + let err_msg_args_invalid: &'static str = + "_geoRadius. Latitude and longitude must be contained between -180 to 180 degrees."; + let (input, args): (&str, Vec<&str>) = match preceded( tag("_geoRadius"), delimited( char('('), - separated_list1(tag(","), self.ws(|c| self.parse_value(c))), + separated_list1(tag(","), self.ws(|c| self.parse_value::(c))), char(')'), ), - )(input)?; + )(input) + { + Ok(e) => e, + Err(_e) => { + return Err(nom::Err::Failure(E::add_context( + input, + err_msg_args_incomplete, + E::from_char(input, '('), + ))); + } + }; if args.len() != 3 { let e = E::from_char(input, '('); - return Err(nom::Err::Failure(e)); + return Err(nom::Err::Failure(E::add_context(input, err_msg_args_incomplete, e))); } let lat = self.parse_numeric(args[0])?; let lng = self.parse_numeric(args[1])?; @@ -237,12 +251,12 @@ impl<'a> ParseContext<'a> { None => return Ok((input, FilterCondition::Empty)), }; - if let Some(span) = (!(-181.0..181.).contains(&lat)) + if let Some(_span) = (!(-181.0..181.).contains(&lat)) .then(|| &lat) .or((!(-181.0..181.).contains(&lng)).then(|| &lng)) { let e = E::from_char(input, '('); - return Err(nom::Err::Failure(e)); + return Err(nom::Err::Failure(E::add_context(input,err_msg_args_invalid, e))); } let res = FilterCondition::Operator(fid, GeoLowerThan([lat, lng], dis)); @@ -251,18 +265,18 @@ impl<'a> ParseContext<'a> { fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { - let l0 = |c| self.parse_geo_radius(c); let l1 = |c| self.parse_simple_condition(c); let l2 = |c| self.parse_range_condition(c); - let (input, condition) = alt((l0, l1, l2))(input)?; + let l3 = |c| self.parse_geo_radius(c); + let (input, condition) = alt((l1, l2,l3))(input)?; Ok((input, condition)) } fn 
parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { alt(( delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), @@ -272,7 +286,7 @@ impl<'a> ParseContext<'a> { fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> where - E: ParseError<&'a str>, + E: ParseError<&'a str> + ContextError<&'a str>, { let key = |input| take_while1(Self::is_key_component)(input); alt((key, delimited(char('"'), key, char('"'))))(input) @@ -280,7 +294,7 @@ impl<'a> ParseContext<'a> { fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { let key = |input| take_while1(Self::is_key_component)(input); alt((key, delimited(char('"'), key, char('"'))))(input) @@ -292,7 +306,7 @@ impl<'a> ParseContext<'a> { pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + Debug, + E: ParseError<&'a str> + ContextError<&'a str> + Debug, { let a = self.parse_or_nom(input); a @@ -512,8 +526,7 @@ mod tests { &rtxn, &index, "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", - ) - .unwrap_or_else(|e| FilterCondition::Empty); + ).unwrap(); let expected = FilterCondition::Or( Box::new(FilterCondition::And( Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), From d323e35001f0d83619c1aec1c6900ee20e08a0d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Tue, 12 Oct 2021 13:22:32 +0800 Subject: [PATCH 1066/1889] add a test case --- milli/src/search/facet/filter_parser.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 72acaecfb..8d00d086c 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -395,15 +395,19 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.set_searchable_fields(vec![S("channel"), S("timestamp"),S("id")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") ,S("id")}); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); let expected = FilterCondition::Operator(0, LowerThan(20.0)); + assert_eq!(condition, expected); + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "id < 200").unwrap(); + let expected = FilterCondition::Operator(2, LowerThan(200.0)); assert_eq!(condition, expected); } From 360c5ff3dfe6341590df6fc6c3679a7abb125012 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 22 Sep 2021 17:48:24 +0200 Subject: [PATCH 1067/1889] Remove limit of 1000 position per attribute Instead of using an arbitrary limit we encode the absolute position in a u32 using one strong u16 for the field id and a weak u16 for the relative position in the attribute. 
--- milli/src/lib.rs | 37 ++++++++++++++++++ milli/src/proximity.rs | 13 ++----- milli/src/search/criteria/exactness.rs | 11 +++--- .../extract/extract_docid_word_positions.rs | 9 ++--- .../extract/extract_fid_word_count_docids.rs | 7 ++-- milli/src/update/index_documents/mod.rs | 38 +++++++++++++++++++ 6 files changed, 91 insertions(+), 24 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 838817d98..781cedb2c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -53,9 +53,24 @@ pub type Attribute = u32; pub type DocumentId = u32; pub type FieldId = u16; pub type Position = u32; +pub type RelativePosition = u16; pub type FieldDistribution = BTreeMap; pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>; +pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; + +// Convert an absolute word position into a relative position. +// Return the field id of the attribute related to the absolute position +// and the relative position in the attribute. +pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) { + ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16) +} + +// Compute the absolute word position with the field id of the attribute and relative position in the attribute. +pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { + (field_id as u32) << 16 | (relative as u32) +} + /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], @@ -187,4 +202,26 @@ mod tests { // the distance of hard separators is clamped to 8 anyway. assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); } + + #[test] + fn test_relative_position_conversion() { + assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000)); + assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF)); + assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000)); + assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00)); + assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF)); + assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678)); + assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF)); + } + + #[test] + fn test_absolute_position_conversion() { + assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000)); + assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF)); + assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000)); + assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00)); + assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF)); + assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678)); + assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF)); + } } diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs index 083e5a977..62f490119 100644 --- a/milli/src/proximity.rs +++ b/milli/src/proximity.rs @@ -1,8 +1,7 @@ use std::cmp; -use crate::{Attribute, Position}; +use crate::{relative_from_absolute_position, Position}; -pub const ONE_ATTRIBUTE: u32 = 1000; pub const MAX_DISTANCE: u32 = 8; pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { @@ -14,19 +13,15 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { } pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { - let (lhs_attr, lhs_index) = extract_position(lhs); - let (rhs_attr, rhs_index) = extract_position(rhs); + let (lhs_attr, lhs_index) = 
relative_from_absolute_position(lhs); + let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs); if lhs_attr != rhs_attr { MAX_DISTANCE } else { - index_proximity(lhs_index, rhs_index) + index_proximity(lhs_index as u32, rhs_index as u32) } } -pub fn extract_position(position: Position) -> (Attribute, Position) { - (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE) -} - pub fn path_proximity(path: &[Position]) -> u32 { path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::() } diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 8e56b3649..e7775423c 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -10,7 +10,7 @@ use crate::search::criteria::{ resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; -use crate::Result; +use crate::{absolute_from_relative_position, FieldId, Result}; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -181,7 +181,7 @@ fn resolve_state( ctx.field_id_word_count_docids(id, query_len)? { let mut attribute_candidates_array = - attribute_start_with_docids(ctx, id as u32, query)?; + attribute_start_with_docids(ctx, id, query)?; attribute_candidates_array.push(attribute_allowed_docids); candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -199,8 +199,7 @@ fn resolve_state( let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { - let attribute_candidates_array = - attribute_start_with_docids(ctx, id as u32, query)?; + let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -290,12 +289,12 @@ fn resolve_state( fn attribute_start_with_docids( ctx: &dyn Context, - attribute_id: u32, + attribute_id: FieldId, query: &[ExactQueryPart], ) -> heed::Result> { let mut attribute_candidates_array = Vec::new(); // start from attribute first position - let mut pos = attribute_id * 1000; + let mut pos = absolute_from_relative_position(attribute_id, 0); for part in query { use ExactQueryPart::*; match part { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ca65f0874..df19125c6 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -10,8 +10,7 @@ use serde_json::Value; use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::proximity::ONE_ATTRIBUTE; -use crate::{FieldId, Result}; +use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; /// Extracts the word and positions where this word appear and /// prefixes it by the document id. 
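As a standalone illustration of the cap the next hunk introduces — assuming a simplified `(position, token)` stream in place of the real tokenizer output — any token whose relative position no longer fits in a `u16` is simply not indexed:

```rust
// Relative positions must fit in the low u16 of the packed position.
const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1;

fn main() {
    // Hypothetical attribute with 100_000 tokens.
    let tokens = (0u32..100_000).map(|p| (p, format!("word{}", p)));
    let indexed: Vec<(u32, String)> =
        tokens.take_while(|(p, _)| *p < MAX_POSITION_PER_ATTRIBUTE).collect();
    // Positions 0..=65535 are kept; everything beyond the u16 range is dropped.
    assert_eq!(indexed.len(), 65_536);
}
```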
@@ -63,7 +62,7 @@ pub fn extract_docid_word_positions( if let Some(field) = json_to_string(&value, &mut field_buffer) { let analyzed = analyzer.analyze(field); let tokens = process_tokens(analyzed.tokens()) - .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE); + .take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE); for (index, token) in tokens { let token = token.text().trim(); @@ -71,10 +70,10 @@ pub fn extract_docid_word_positions( key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(token.as_bytes()); - let position: u32 = index + let position: u16 = index .try_into() .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let position = field_id as u32 * ONE_ATTRIBUTE + position; + let position = absolute_from_relative_position(field_id, position); docid_word_positions_sorter .insert(&key_buffer, &position.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 1fbc55714..4e25cb4f6 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -10,8 +10,7 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::proximity::extract_position; -use crate::{DocumentId, FieldId, Result}; +use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; /// Extracts the field id word count and the documents ids where /// this field id with this amount of words appear. @@ -53,8 +52,8 @@ pub fn extract_fid_word_count_docids( } for position in read_u32_ne_bytes(value) { - let (field_id, position) = extract_position(position); - let word_count = position + 1; + let (field_id, position) = relative_from_absolute_position(position); + let word_count = position as u32 + 1; let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); *value = cmp::max(*value, word_count); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b0dbd9c3e..8138c6191 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -884,6 +884,44 @@ mod tests { wtxn.commit().unwrap(); } + #[test] + fn index_more_than_1000_positions_in_a_field() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(50 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut big_object = HashMap::new(); + big_object.insert(S("id"), "wow"); + let content: String = + (0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap(); + big_object.insert("content".to_string(), &content); + + let mut cursor = Cursor::new(Vec::new()); + + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + builder.add_documents(big_object).unwrap(); + builder.finish().unwrap(); + cursor.set_position(0); + let content = DocumentBatchReader::from_reader(cursor).unwrap(); + + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some()); 
+ assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some()); + } + #[test] fn index_documents_with_zeroes() { let path = tempfile::tempdir().unwrap(); From c5a60754848e4bb0c2e9bf87781d6584e550b00f Mon Sep 17 00:00:00 2001 From: many Date: Wed, 6 Oct 2021 12:11:07 +0200 Subject: [PATCH 1068/1889] Make max_position_per_attributes changable --- http-ui/src/main.rs | 8 ++++++++ .../extract/extract_docid_word_positions.rs | 5 ++++- milli/src/update/index_documents/extract/mod.rs | 4 ++++ milli/src/update/index_documents/mod.rs | 4 ++++ milli/src/update/settings.rs | 3 +++ milli/src/update/update_builder.rs | 8 ++++++++ 6 files changed, 31 insertions(+), 1 deletion(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 27fc138dd..652a88451 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -131,6 +131,11 @@ pub struct IndexerOpt { /// Number of parallel jobs for indexing, defaults to # of CPUs. #[structopt(long)] pub indexing_jobs: Option, + + /// Maximum relative position in an attribute for a word to be indexed. + /// Any value higher than 65535 will be clamped. + #[structopt(long)] + pub max_positions_per_attributes: Option, } struct Highlighter<'a, A> { @@ -346,6 +351,9 @@ async fn main() -> anyhow::Result<()> { if let Some(chunk_compression_level) = indexer_opt_cloned.chunk_compression_level { update_builder.chunk_compression_level(chunk_compression_level); } + if let Some(max_pos_per_attributes) = indexer_opt_cloned.max_positions_per_attributes { + update_builder.max_positions_per_attributes(max_pos_per_attributes); + } update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap()); update_builder.log_every_n(indexer_opt_cloned.log_every_n); update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index df19125c6..fa1381412 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -23,7 +23,10 @@ pub fn extract_docid_word_positions( indexer: GrenadParameters, searchable_fields: &Option>, stop_words: Option<&fst::Set<&[u8]>>, + max_positions_per_attributes: Option, ) -> Result<(RoaringBitmap, grenad::Reader)> { + let max_positions_per_attributes = max_positions_per_attributes + .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); let mut documents_ids = RoaringBitmap::new(); @@ -62,7 +65,7 @@ pub fn extract_docid_word_positions( if let Some(field) = json_to_string(&value, &mut field_buffer) { let analyzed = analyzer.analyze(field); let tokens = process_tokens(analyzed.tokens()) - .take_while(|(p, _)| (*p as u32) < MAX_POSITION_PER_ATTRIBUTE); + .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); for (index, token) in tokens { let token = token.text().trim(); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 0406e8ef4..0f04418ed 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -42,6 +42,7 @@ pub(crate) fn data_from_obkv_documents( primary_key_id: FieldId, geo_field_id: Option, stop_words: Option>, + 
max_positions_per_attributes: Option, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -55,6 +56,7 @@ pub(crate) fn data_from_obkv_documents( primary_key_id, geo_field_id, &stop_words, + max_positions_per_attributes, ) }) .collect(); @@ -177,6 +179,7 @@ fn extract_documents_data( primary_key_id: FieldId, geo_field_id: Option, stop_words: &Option>, + max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, (grenad::Reader, grenad::Reader), @@ -206,6 +209,7 @@ fn extract_documents_data( indexer.clone(), searchable_fields, stop_words.as_ref(), + max_positions_per_attributes, )?; // send documents_ids to DB writer diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 8138c6191..92bcab0e9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -68,6 +68,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, facet_level_group_size: Option, facet_min_level_size: Option, words_prefix_threshold: Option, @@ -104,6 +105,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { update_method: IndexDocumentsMethod::ReplaceDocuments, autogenerate_docids: false, update_id, + max_positions_per_attributes: None, } } @@ -262,6 +264,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { primary_key_id, geo_field_id, stop_words, + self.max_positions_per_attributes, ) }); @@ -284,6 +287,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, thread_pool: self.thread_pool, + max_positions_per_attributes: self.max_positions_per_attributes, update_id: self.update_id, }; let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 4aa79f6e3..41c156676 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -69,6 +69,7 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, update_id: u64, searchable_fields: Setting>, @@ -108,6 +109,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { synonyms: Setting::NotSet, primary_key: Setting::NotSet, update_id, + max_positions_per_attributes: None, } } @@ -237,6 +239,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.chunk_compression_type = self.chunk_compression_type; indexing_builder.chunk_compression_level = self.chunk_compression_level; indexing_builder.thread_pool = self.thread_pool; + indexing_builder.max_positions_per_attributes = self.max_positions_per_attributes; indexing_builder.execute_raw(output, &cb)?; Ok(()) diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 561c4bc50..20ec28e06 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -12,6 +12,7 @@ pub struct UpdateBuilder<'a> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, pub(crate) update_id: u64, } @@ -25,6 +26,7 @@ 
impl<'a> UpdateBuilder<'a> { chunk_compression_type: CompressionType::None, chunk_compression_level: None, thread_pool: None, + max_positions_per_attributes: None, update_id, } } @@ -57,6 +59,10 @@ impl<'a> UpdateBuilder<'a> { self.thread_pool = Some(thread_pool); } + pub fn max_positions_per_attributes(&mut self, max_positions_per_attributes: u32) { + self.max_positions_per_attributes = Some(max_positions_per_attributes); + } + pub fn clear_documents<'t, 'u, 'i>( self, wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -87,6 +93,7 @@ impl<'a> UpdateBuilder<'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; + builder.max_positions_per_attributes = self.max_positions_per_attributes; builder } @@ -105,6 +112,7 @@ impl<'a> UpdateBuilder<'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; + builder.max_positions_per_attributes = self.max_positions_per_attributes; builder } From 9a266a531b39e7ba2623b1dff0b3e6caf2ac7e62 Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 12 Oct 2021 10:59:14 +0200 Subject: [PATCH 1069/1889] test correct primary key inference --- milli/src/update/index_documents/transform.rs | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8d656e50c..f4cbc8e22 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -497,6 +497,7 @@ mod test { use super::*; mod compute_primary_key { + use super::{compute_primary_key_pair, FieldsIdsMap}; #[test] @@ -537,5 +538,27 @@ mod test { assert!(result.is_err()); assert_eq!(fields_map.len(), 0); } + + } + + mod primary_key_inference { + use bimap::BiHashMap; + + use crate::update::index_documents::transform::find_primary_key; + + #[test] + fn primary_key_infered_on_first_field() { + // We run the test multiple times to change the order in which the fields are iterated upon. 
+ for _ in 1..50 { + let mut map = BiHashMap::new(); + map.insert(1, "fakeId".to_string()); + map.insert(2, "fakeId".to_string()); + map.insert(3, "fakeId".to_string()); + map.insert(4, "fakeId".to_string()); + map.insert(0, "realId".to_string()); + + assert_eq!(find_primary_key(&map), Some("realId")); + } + } } } From 86ead92ed5155db84a4ea401a09e025eec9715d2 Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 12 Oct 2021 11:14:12 +0200 Subject: [PATCH 1070/1889] infer primary key on sorted fields --- milli/src/update/index_documents/transform.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f4cbc8e22..c0c88abed 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -77,7 +77,9 @@ fn create_fields_mapping( fn find_primary_key(index: &bimap::BiHashMap) -> Option<&str> { index - .right_values() + .iter() + .sorted_by_key(|(k, _)| *k) + .map(|(_, v)| v) .find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME)) .map(String::as_str) } @@ -497,7 +499,6 @@ mod test { use super::*; mod compute_primary_key { - use super::{compute_primary_key_pair, FieldsIdsMap}; #[test] @@ -538,7 +539,6 @@ mod test { assert!(result.is_err()); assert_eq!(fields_map.len(), 0); } - } mod primary_key_inference { From 2c65781d91e23c4ea3ca702472f4bdcd998d884c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Tue, 12 Oct 2021 22:19:28 +0800 Subject: [PATCH 1071/1889] format --- milli/src/search/facet/filter_condition.rs | 5 +---- milli/src/search/facet/filter_parser.rs | 12 ++++++------ 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 96c8867ec..2686e4a4b 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,4 +1,3 @@ - use std::fmt::Debug; use std::ops::Bound::{self, Excluded, Included}; @@ -16,9 +15,7 @@ use crate::error::{Error, UserError}; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; -use crate::{ - distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result, -}; +use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; #[derive(Debug, Clone, PartialEq)] pub enum FilterCondition { diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 8d00d086c..01e944a98 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -215,8 +215,7 @@ impl<'a> ParseContext<'a> { where E: ParseError<&'a str> + ContextError<&'a str> + Debug, { - let err_msg_args_incomplete:&'static str = - "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; + let err_msg_args_incomplete:&'static str = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; let err_msg_args_invalid: &'static str = "_geoRadius. 
Latitude and longitude must be contained between -180 to 180 degrees."; let (input, args): (&str, Vec<&str>) = match preceded( @@ -256,7 +255,7 @@ impl<'a> ParseContext<'a> { .or((!(-181.0..181.).contains(&lng)).then(|| &lng)) { let e = E::from_char(input, '('); - return Err(nom::Err::Failure(E::add_context(input,err_msg_args_invalid, e))); + return Err(nom::Err::Failure(E::add_context(input, err_msg_args_invalid, e))); } let res = FilterCondition::Operator(fid, GeoLowerThan([lat, lng], dis)); @@ -270,7 +269,7 @@ impl<'a> ParseContext<'a> { let l1 = |c| self.parse_simple_condition(c); let l2 = |c| self.parse_range_condition(c); let l3 = |c| self.parse_geo_radius(c); - let (input, condition) = alt((l1, l2,l3))(input)?; + let (input, condition) = alt((l1, l2, l3))(input)?; Ok((input, condition)) } @@ -395,7 +394,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp"),S("id")]); // to keep the fields order + builder.set_searchable_fields(vec![S("channel"), S("timestamp"), S("id")]); // to keep the fields order builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") ,S("id")}); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -530,7 +529,8 @@ mod tests { &rtxn, &index, "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", - ).unwrap(); + ) + .unwrap(); let expected = FilterCondition::Or( Box::new(FilterCondition::And( Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), From 5de5dd80a33fe6d7bd0a05c3f9a06841f8844562 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Wed, 13 Oct 2021 11:06:15 +0800 Subject: [PATCH 1072/1889] WIP: remove '_nom' suffix/redundant error enum/... 
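The renames in this patch leave the parser's precedence chain unchanged: `parse_or` delegates to `parse_and`, which delegates to `parse_not` and finally to the individual conditions, so AND binds tighter than OR. A condensed, self-contained sketch of that `many0` + `fold` pattern, simplified to a string leaf type instead of `FilterCondition` and with NOT omitted (names and the two-level grammar are illustrative; assumes nom 6/7-style module paths as used above):

```rust
use nom::bytes::complete::tag;
use nom::character::complete::{alphanumeric1, multispace0};
use nom::error::ParseError;
use nom::multi::many0;
use nom::sequence::{delimited, preceded};
use nom::IResult;

#[derive(Debug, PartialEq)]
enum Expr {
    Leaf(String),
    And(Box<Expr>, Box<Expr>),
    Or(Box<Expr>, Box<Expr>),
}

// Same whitespace-trimming helper shape as ParseContext::ws.
fn ws<'a, F, O, E: ParseError<&'a str>>(inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E>
where
    F: FnMut(&'a str) -> IResult<&'a str, O, E>,
{
    delimited(multispace0, inner, multispace0)
}

fn leaf(input: &str) -> IResult<&str, Expr> {
    let (input, s) = ws(alphanumeric1)(input)?;
    Ok((input, Expr::Leaf(s.to_string())))
}

// AND binds tighter than OR because parse_or delegates to parse_and first.
fn parse_and(input: &str) -> IResult<&str, Expr> {
    let (input, lhs) = leaf(input)?;
    let (input, rest) = many0(preceded(ws(tag("AND")), leaf))(input)?;
    Ok((input, rest.into_iter().fold(lhs, |l, r| Expr::And(Box::new(l), Box::new(r)))))
}

fn parse_or(input: &str) -> IResult<&str, Expr> {
    let (input, lhs) = parse_and(input)?;
    let (input, rest) = many0(preceded(ws(tag("OR")), parse_and))(input)?;
    Ok((input, rest.into_iter().fold(lhs, |l, r| Expr::Or(Box::new(l), Box::new(r)))))
}

fn main() {
    let (_, expr) = parse_or("a AND b OR c").unwrap();
    // Groups as Or(And(a, b), c), matching the grouping the tests above
    // assert for mixed AND/OR filters.
    println!("{:?}", expr);
}
```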
--- milli/src/error.rs | 9 ++---- milli/src/search/facet/filter_condition.rs | 2 +- milli/src/search/facet/filter_parser.rs | 22 ++++++--------- milli/src/search/facet/grammar.pest | 33 ---------------------- 4 files changed, 11 insertions(+), 55 deletions(-) delete mode 100644 milli/src/search/facet/grammar.pest diff --git a/milli/src/error.rs b/milli/src/error.rs index 3ae18165f..c0ce101c8 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -59,9 +59,7 @@ pub enum UserError { InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, InvalidGeoField { document_id: Value, object: Value }, - InvalidFilterAttributeNom, - InvalidFilterValue, - InvalidFilterNom { input: String }, + InvalidFilter { input: String }, InvalidSortName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, @@ -209,10 +207,7 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - //TODO - Self::InvalidFilterAttributeNom => write!(f, "parser error "), - Self::InvalidFilterValue => write!(f, "parser error "), - Self::InvalidFilterNom { input } => write!(f, "parser error {}", input), + Self::InvalidFilter { input } => write!(f, "parser error {}", input), Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 2686e4a4b..0e98edd2c 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -90,7 +90,7 @@ impl FilterCondition { nom::Err::Failure(x) => x, _ => unreachable!(), }; - Err(Error::UserError(UserError::InvalidFilterNom { + Err(Error::UserError(UserError::InvalidFilter { input: convert_error(expression, ve).to_string(), })) } diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 01e944a98..493c53920 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -56,13 +56,12 @@ pub struct ParseContext<'a> { } impl<'a> ParseContext<'a> { - fn parse_or_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + fn parse_or(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: ParseError<&'a str> + ContextError<&'a str> + Debug, { - let (input, lhs) = self.parse_and_nom(input)?; - let (input, ors) = - many0(preceded(self.ws(tag("OR")), |c| Self::parse_or_nom(self, c)))(input)?; + let (input, lhs) = self.parse_and(input)?; + let (input, ors) = many0(preceded(self.ws(tag("OR")), |c| Self::parse_or(self, c)))(input)?; let expr = ors .into_iter() @@ -70,25 +69,20 @@ impl<'a> ParseContext<'a> { Ok((input, expr)) } - fn parse_and_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + fn parse_and(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: ParseError<&'a str> + ContextError<&'a str> + Debug, { - let (input, lhs) = self.parse_not_nom(input)?; - // let (input, lhs) = alt(( - // delimited(self.ws(char('(')), |c| Self::parse_not_nom(self, c), self.ws(char(')'))), - // |c| self.parse_not_nom(c), - // ))(input)?; - + let (input, lhs) = self.parse_not(input)?; let (input, ors) = - many0(preceded(self.ws(tag("AND")), |c| Self::parse_and_nom(self, c)))(input)?; + 
many0(preceded(self.ws(tag("AND")), |c| Self::parse_and(self, c)))(input)?; let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); Ok((input, expr)) } - fn parse_not_nom(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + fn parse_not(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: ParseError<&'a str> + ContextError<&'a str> + Debug, { @@ -307,7 +301,7 @@ impl<'a> ParseContext<'a> { where E: ParseError<&'a str> + ContextError<&'a str> + Debug, { - let a = self.parse_or_nom(input); + let a = self.parse_or(input); a } } diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest deleted file mode 100644 index 8bfdeb667..000000000 --- a/milli/src/search/facet/grammar.pest +++ /dev/null @@ -1,33 +0,0 @@ -key = _{reserved | quoted | word } -value = _{quoted | word } -quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP } -string = {char*} -word = ${(LETTER | NUMBER | "_" | "-" | ".")+} - -char = _{ !(PEEK | "\\") ~ ANY - | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") - | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} - -reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" } -// we deliberately choose to allow empty parameters to generate more specific error message later -parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} -condition = _{between | eq | greater | less | geq | leq | neq} -between = {key ~ value ~ "TO" ~ value} -geq = {key ~ ">=" ~ value} -leq = {key ~ "<=" ~ value} -neq = {key ~ "!=" ~ value} -eq = {key ~ "=" ~ value} -greater = {key ~ ">" ~ value} -less = {key ~ "<" ~ value} -geo_radius = {"_geoRadius" ~ parameters } - -prgm = {SOI ~ expr ~ EOI} -expr = _{ ( term ~ (operation ~ term)* ) } -term = { ("(" ~ expr ~ ")") | condition | not | geo_radius } -operation = _{ and | or } -and = {"AND"} -or = {"OR"} - -not = {"NOT" ~ term} - -WHITESPACE = _{ " " } From cd359cd96ece0b80110813194865050f2012c792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Wed, 13 Oct 2021 18:04:15 +0800 Subject: [PATCH 1073/1889] WIP: extract the error trait bound to new trait. 
--- milli/src/search/facet/filter_parser.rs | 37 ++++++++++++++----------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 493c53920..c635ac9dd 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -9,7 +9,6 @@ use nom::{ bytes::complete::{tag, take_while1}, character::complete::{char, multispace0}, combinator::map, - error::ParseError, error::VerboseError, error::{ContextError, ErrorKind}, multi::many0, @@ -50,6 +49,12 @@ impl Operator { } } +pub trait FilterParserError<'a>: + nom::error::ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug +{ +} +impl<'a> FilterParserError<'a> for VerboseError<&'a str> {} + pub struct ParseContext<'a> { pub fields_ids_map: &'a FieldsIdsMap, pub filterable_fields: &'a HashSet, @@ -58,7 +63,7 @@ pub struct ParseContext<'a> { impl<'a> ParseContext<'a> { fn parse_or(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let (input, lhs) = self.parse_and(input)?; let (input, ors) = many0(preceded(self.ws(tag("OR")), |c| Self::parse_or(self, c)))(input)?; @@ -71,7 +76,7 @@ impl<'a> ParseContext<'a> { fn parse_and(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let (input, lhs) = self.parse_not(input)?; let (input, ors) = @@ -84,7 +89,7 @@ impl<'a> ParseContext<'a> { fn parse_not(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { alt(( map( @@ -100,14 +105,14 @@ impl<'a> ParseContext<'a> { fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> where F: Fn(&'a str) -> IResult<&'a str, O, E>, - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { delimited(multispace0, inner, multispace0) } fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug, + E: FilterParserError<'a>, { let operator = alt((tag("<="), tag(">="), tag(">"), tag("="), tag("<"), tag("!="))); let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( @@ -135,7 +140,7 @@ impl<'a> ParseContext<'a> { fn parse_numeric(&'a self, input: &'a str) -> StdResult> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, T: std::str::FromStr, { match input.parse::() { @@ -156,7 +161,7 @@ impl<'a> ParseContext<'a> { value: &'a str, ) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let numeric: f64 = self.parse_numeric(value)?; let k = match input { @@ -171,7 +176,7 @@ impl<'a> ParseContext<'a> { fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let error = match input.chars().nth(0) { Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), @@ -188,7 +193,7 @@ impl<'a> ParseContext<'a> { fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let (input, (key, from, _, to)) = tuple(( 
self.ws(|c| self.parse_key(c)), @@ -207,7 +212,7 @@ impl<'a> ParseContext<'a> { fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let err_msg_args_incomplete:&'static str = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; let err_msg_args_invalid: &'static str = @@ -258,7 +263,7 @@ impl<'a> ParseContext<'a> { fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let l1 = |c| self.parse_simple_condition(c); let l2 = |c| self.parse_range_condition(c); @@ -269,7 +274,7 @@ impl<'a> ParseContext<'a> { fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { alt(( delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), @@ -279,7 +284,7 @@ impl<'a> ParseContext<'a> { fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> where - E: ParseError<&'a str> + ContextError<&'a str>, + E: FilterParserError<'a>, { let key = |input| take_while1(Self::is_key_component)(input); alt((key, delimited(char('"'), key, char('"'))))(input) @@ -287,7 +292,7 @@ impl<'a> ParseContext<'a> { fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let key = |input| take_while1(Self::is_key_component)(input); alt((key, delimited(char('"'), key, char('"'))))(input) @@ -299,7 +304,7 @@ impl<'a> ParseContext<'a> { pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where - E: ParseError<&'a str> + ContextError<&'a str> + Debug, + E: FilterParserError<'a>, { let a = self.parse_or(input); a From a3e7c468cd0928a2ac7905d390cc4ab7fb93ebc7 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 13 Oct 2021 13:05:07 +0200 Subject: [PATCH 1074/1889] add helper methods on the settings --- milli/src/update/settings.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 41c156676..dee63c726 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -28,6 +28,21 @@ impl Default for Setting { } impl Setting { + pub fn set(self) -> Option { + match self { + Self::Set(value) => Some(value), + _ => None, + } + } + + pub const fn as_ref(&self) -> Setting<&T> { + match *self { + Self::Set(ref value) => Setting::Set(value), + Self::Reset => Setting::Reset, + Self::NotSet => Setting::NotSet, + } + } + pub const fn is_not_set(&self) -> bool { matches!(self, Self::NotSet) } From e750465e15aea5f9ca6fabc03b92a8b09f044486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Thu, 14 Oct 2021 16:12:00 +0800 Subject: [PATCH 1075/1889] check logic for geolocation. 
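The patch below corrects the `_geoRadius` argument validation: the old code tested both coordinates against the same `-181.0..181.` range, so an impossible latitude such as 150 passed while the error message claimed a -180/180 bound that was never actually enforced. Latitude must lie in [-90, 90] and longitude in [-180, 180], both inclusive, and each axis now gets its own error message. A minimal standalone sketch of the corrected rule (the real code returns nom parse failures rather than `String`s):

```rust
// Sketch only: mirrors the inclusive per-axis ranges the diff introduces.
fn check_coordinates(lat: f64, lng: f64) -> Result<(), String> {
    if !(-90.0..=90.0).contains(&lat) {
        Err(format!("latitude {} invalid: must be between -90 and 90 degrees", lat))
    } else if !(-180.0..=180.0).contains(&lng) {
        Err(format!("longitude {} invalid: must be between -180 and 180 degrees", lng))
    } else {
        Ok(())
    }
}

fn main() {
    // Accepted by the old shared `-181.0..181.` check, rejected now:
    assert!(check_coordinates(-100.0, 150.0).is_err());
    // Boundary values stay valid because the new ranges are inclusive:
    assert!(check_coordinates(90.0, -180.0).is_ok());
    // Just past a boundary is rejected:
    assert!(check_coordinates(-90.0000001, 0.0).is_err());
}
```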
--- milli/src/search/facet/filter_parser.rs | 55 ++++++++++++++++++------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index c635ac9dd..123a80c35 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -214,9 +214,14 @@ impl<'a> ParseContext<'a> { where E: FilterParserError<'a>, { - let err_msg_args_incomplete:&'static str = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; - let err_msg_args_invalid: &'static str = - "_geoRadius. Latitude and longitude must be contained between -180 to 180 degrees."; + let err_msg_args_incomplete= "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; + + let err_msg_latitude_invalid = + "_geoRadius. Latitude must be contained between -90 and 90 degrees."; + + let err_msg_longitude_invalid = + "_geoRadius. Longitude must be contained between -180 and 180 degrees."; + let (input, args): (&str, Vec<&str>) = match preceded( tag("_geoRadius"), delimited( @@ -249,12 +254,18 @@ impl<'a> ParseContext<'a> { None => return Ok((input, FilterCondition::Empty)), }; - if let Some(_span) = (!(-181.0..181.).contains(&lat)) - .then(|| &lat) - .or((!(-181.0..181.).contains(&lng)).then(|| &lng)) - { - let e = E::from_char(input, '('); - return Err(nom::Err::Failure(E::add_context(input, err_msg_args_invalid, e))); + if !(-90.0..=90.0).contains(&lat) { + return Err(nom::Err::Failure(E::add_context( + input, + err_msg_latitude_invalid, + E::from_char(input, '('), + ))); + } else if !(-180.0..=180.0).contains(&lng) { + return Err(nom::Err::Failure(E::add_context( + input, + err_msg_longitude_invalid, + E::from_char(input, '('), + ))); } let res = FilterCondition::Operator(fid, GeoLowerThan([lat, lng], dis)); @@ -582,20 +593,36 @@ mod tests { let error = result.unwrap_err(); assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-200, 150, 10)"); + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); assert!(result.is_err()); let error = result.unwrap_err(); assert!(error .to_string() - .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 181, 10)"); + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); assert!(result.is_err()); let error = result.unwrap_err(); assert!(error .to_string() - .contains("Latitude and longitude must be contained between -180 to 180 degrees.")); + .contains("Longitude must be contained between -180 and 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be 
contained between -180 and 180 degrees.")); + } } From 2ea2f7570c440022398a07b846945f591ed67401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Thu, 14 Oct 2021 16:46:13 +0800 Subject: [PATCH 1076/1889] use nightly cargo to format the code --- milli/src/search/facet/filter_condition.rs | 1 - milli/src/search/facet/filter_parser.rs | 28 ++++++++++------------ 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 0e98edd2c..c76bb9388 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -8,7 +8,6 @@ use nom::error::{convert_error, VerboseError}; use roaring::RoaringBitmap; use self::FilterCondition::*; - use super::filter_parser::{Operator, ParseContext}; use super::FacetNumberRange; use crate::error::{Error, UserError}; diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 123a80c35..ee8249069 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -2,22 +2,18 @@ use std::collections::HashSet; use std::fmt::Debug; use std::result::Result as StdResult; -use super::FilterCondition; -use crate::{FieldId, FieldsIdsMap}; -use nom::{ - branch::alt, - bytes::complete::{tag, take_while1}, - character::complete::{char, multispace0}, - combinator::map, - error::VerboseError, - error::{ContextError, ErrorKind}, - multi::many0, - multi::separated_list1, - sequence::{delimited, preceded, tuple}, - IResult, -}; +use nom::branch::alt; +use nom::bytes::complete::{tag, take_while1}; +use nom::character::complete::{char, multispace0}; +use nom::combinator::map; +use nom::error::{ContextError, ErrorKind, VerboseError}; +use nom::multi::{many0, separated_list1}; +use nom::sequence::{delimited, preceded, tuple}; +use nom::IResult; use self::Operator::*; +use super::FilterCondition; +use crate::{FieldId, FieldsIdsMap}; #[derive(Debug, Clone, PartialEq)] pub enum Operator { GreaterThan(f64), @@ -330,7 +326,8 @@ mod tests { use maplit::hashset; use super::*; - use crate::{update::Settings, Index}; + use crate::update::Settings; + use crate::Index; #[test] fn string() { @@ -623,6 +620,5 @@ mod tests { assert!(error .to_string() .contains("Longitude must be contained between -180 and 180 degrees.")); - } } From 7666e4f34a6c6c73fd231c504c6ea1ef2d12e307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E7=80=9A=E9=AA=8B?= Date: Thu, 14 Oct 2021 21:37:59 +0800 Subject: [PATCH 1077/1889] follow the suggestions --- milli/Cargo.toml | 2 +- milli/src/search/facet/filter_condition.rs | 4 +--- milli/src/search/facet/filter_parser.rs | 8 +++----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 007d9d415..af8370309 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -39,7 +39,7 @@ tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } # facet filter parser -nom = "7" +nom = "7.0.0" # documents words self-join itertools = "0.10.0" diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index c76bb9388..4fedeee69 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -24,9 +24,6 @@ pub enum FilterCondition { Empty, } -// impl From - -//for nom impl FilterCondition { pub fn from_array( rtxn: &heed::RoTxn, @@ -72,6 +69,7 @@ impl FilterCondition { Ok(ands) } + pub fn from_str( rtxn: 
&heed::RoTxn, index: &Index, diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index ee8249069..4d8a54987 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -49,6 +49,7 @@ pub trait FilterParserError<'a>: nom::error::ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug { } + impl<'a> FilterParserError<'a> for VerboseError<&'a str> {} pub struct ParseContext<'a> { @@ -211,7 +212,6 @@ impl<'a> ParseContext<'a> { E: FilterParserError<'a>, { let err_msg_args_incomplete= "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; - let err_msg_latitude_invalid = "_geoRadius. Latitude must be contained between -90 and 90 degrees."; @@ -275,8 +275,7 @@ impl<'a> ParseContext<'a> { let l1 = |c| self.parse_simple_condition(c); let l2 = |c| self.parse_range_condition(c); let l3 = |c| self.parse_geo_radius(c); - let (input, condition) = alt((l1, l2, l3))(input)?; - Ok((input, condition)) + alt((l1, l2, l3))(input) } fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> @@ -313,8 +312,7 @@ impl<'a> ParseContext<'a> { where E: FilterParserError<'a>, { - let a = self.parse_or(input); - a + self.parse_or(input) } } From efaef4f7483ada7ecdb7b58eeb1e893ea0ffbd2c Mon Sep 17 00:00:00 2001 From: Damanpreet Singh Date: Sat, 16 Oct 2021 21:41:45 +0530 Subject: [PATCH 1078/1889] Added search_geo benchmark in cron job --- .../workflows/cron_benchmarks_search_geo.yml | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 .github/workflows/cron_benchmarks_search_geo.yml diff --git a/.github/workflows/cron_benchmarks_search_geo.yml b/.github/workflows/cron_benchmarks_search_geo.yml new file mode 100644 index 000000000..5a387c8a7 --- /dev/null +++ b/.github/workflows/cron_benchmarks_search_geo.yml @@ -0,0 +1,70 @@ +name: Benchmarks search geo (cron) + +on: + schedule: + - cron: "30 18 * * FRI" # every friday at 18:30 + +env: + BENCH_NAME: "search_geo" + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: self-hosted + steps: + - uses: actions/checkout@v2 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + run: cargo install critcmp + - name: Export cripcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ 
steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' + echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmaks/scipts/compare.sh ${{ steps.file.outputs.basename }}.json" From 493d9b98f5b0270638579440bbb168858de8b9be Mon Sep 17 00:00:00 2001 From: Damanpreet Singh Date: Sat, 16 Oct 2021 21:52:36 +0530 Subject: [PATCH 1079/1889] fix indexing benchmark GH actions upload filename --- .github/workflows/benchmarks.yml | 2 +- .github/workflows/cron_benchmarks_indexing.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index 7a9fbb5de..a857618d0 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -38,7 +38,7 @@ jobs: id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/cron_benchmarks_indexing.yml b/.github/workflows/cron_benchmarks_indexing.yml index e806edc84..452966194 100644 --- a/.github/workflows/cron_benchmarks_indexing.yml +++ b/.github/workflows/cron_benchmarks_indexing.yml @@ -34,7 +34,7 @@ jobs: id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks From 70121e3c6bbdc851eac83a84df1533bef3a23f9a Mon Sep 17 00:00:00 2001 From: SaintMalik <37118134+saintmalik@users.noreply.github.com> Date: Mon, 18 Oct 2021 04:00:19 +0100 Subject: [PATCH 1080/1889] fix typo in repo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 07071183e..4df258585 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ It can index a massive amount of documents in not much time, I already achieved These metrics are done on a MacBook Pro with the M1 processor. 
-You can feed the engine with your CSV (comma-seperated, yes) data like this: +You can feed the engine with your CSV (comma-separated, yes) data like this: ```bash printf "id,name,age\n1,hello,32\n2,kiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv From 2209acbfe2c423268237a662a27678d8f334cf66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 18 Oct 2021 13:45:48 +0200 Subject: [PATCH 1081/1889] Update version for the next release (v0.18.2) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 14287e4d1..2f3c26a7b 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.17.2" +version = "0.18.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 7f8cd5fed..83d5f67c4 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.17.2" +version = "0.18.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 27bf9d607..b0220993f 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.17.2" +version = "0.18.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index c695ca2ba..594cc60e0 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.17.2" +version = "0.18.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index 9c2720ff9..b55c4b8d6 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.17.2" +version = "0.18.0" authors = ["Clément Renault "] edition = "2018" From 4c34164d2edf47a84329af232cb2ff2f921cd4f8 Mon Sep 17 00:00:00 2001 From: Damanpreet Singh Date: Mon, 18 Oct 2021 18:43:36 +0530 Subject: [PATCH 1082/1889] fixed filename for search_geo cron --- .github/workflows/cron_benchmarks_search_geo.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cron_benchmarks_search_geo.yml b/.github/workflows/cron_benchmarks_search_geo.yml index 5a387c8a7..642b5018e 100644 --- a/.github/workflows/cron_benchmarks_search_geo.yml +++ b/.github/workflows/cron_benchmarks_search_geo.yml @@ -34,7 +34,7 @@ jobs: id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks From 2e4604b0b9c6e4eb9eb0718775bb3f1ed429d506 Mon Sep 17 00:00:00 2001 From: Damanpreet Singh Date: Mon, 18 Oct 2021 18:48:38 +0530 Subject: [PATCH 1083/1889] fixed filename for search_* crons --- .github/workflows/cron_benchmarks_search_songs.yml | 2 +- .github/workflows/cron_benchmarks_search_wiki.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cron_benchmarks_search_songs.yml b/.github/workflows/cron_benchmarks_search_songs.yml index 
018c20817..d15cc7ab6 100644 --- a/.github/workflows/cron_benchmarks_search_songs.yml +++ b/.github/workflows/cron_benchmarks_search_songs.yml @@ -34,7 +34,7 @@ jobs: id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/cron_benchmarks_search_wiki.yml b/.github/workflows/cron_benchmarks_search_wiki.yml index 78f940e38..c73e8c037 100644 --- a/.github/workflows/cron_benchmarks_search_wiki.yml +++ b/.github/workflows/cron_benchmarks_search_wiki.yml @@ -34,7 +34,7 @@ jobs: id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks From 661bc21af51f30e564685e3cdff9261ea0680884 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 20 Oct 2021 17:27:12 +0200 Subject: [PATCH 1084/1889] Fix the filter parser And add a bunch of tests on the filter::from_array --- milli/src/search/facet/filter_parser.rs | 174 +++++++++++++++++++----- 1 file changed, 141 insertions(+), 33 deletions(-) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 4d8a54987..cfa3cdae0 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -3,17 +3,19 @@ use std::fmt::Debug; use std::result::Result as StdResult; use nom::branch::alt; -use nom::bytes::complete::{tag, take_while1}; +use nom::bytes::complete::{tag, take_till, take_till1, take_while1}; use nom::character::complete::{char, multispace0}; use nom::combinator::map; use nom::error::{ContextError, ErrorKind, VerboseError}; use nom::multi::{many0, separated_list1}; +use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, tuple}; use nom::IResult; use self::Operator::*; use super::FilterCondition; use crate::{FieldId, FieldsIdsMap}; + #[derive(Debug, Clone, PartialEq)] pub enum Operator { GreaterThan(f64), @@ -111,28 +113,33 @@ impl<'a> ParseContext<'a> { where E: FilterParserError<'a>, { - let operator = alt((tag("<="), tag(">="), tag(">"), tag("="), tag("<"), tag("!="))); - let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( + let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); + let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_value(c))))( input, ); let (input, (key, op, value)) = match k { Ok(o) => o, - Err(e) => { - return Err(e); - } + Err(e) => return Err(e), }; let fid = self.parse_fid(input, key)?; let r: StdResult>> = self.parse_numeric(value); - let k = match op { - "=" => FilterCondition::Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())), - "!=" => { - FilterCondition::Operator(fid, NotEqual(r.ok(), value.to_string().to_lowercase())) + match op { + "=" => { + let k = + FilterCondition::Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())); + Ok((input, k)) } - ">" | 
"<" | "<=" | ">=" => return self.parse_numeric_unary_condition(op, fid, value), + "!=" => { + let k = FilterCondition::Operator( + fid, + NotEqual(r.ok(), value.to_string().to_lowercase()), + ); + Ok((input, k)) + } + ">" | "<" | "<=" | ">=" => self.parse_numeric_unary_condition(op, fid, value), _ => unreachable!(), - }; - Ok((input, k)) + } } fn parse_numeric(&'a self, input: &'a str) -> StdResult> @@ -142,12 +149,10 @@ impl<'a> ParseContext<'a> { { match input.parse::() { Ok(n) => Ok(n), - Err(_) => { - return match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - } + Err(_) => match input.chars().nth(0) { + Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), + None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), + }, } } @@ -194,9 +199,9 @@ impl<'a> ParseContext<'a> { { let (input, (key, from, _, to)) = tuple(( self.ws(|c| self.parse_key(c)), - self.ws(|c| self.parse_key(c)), + self.ws(|c| self.parse_value(c)), tag("TO"), - self.ws(|c| self.parse_key(c)), + self.ws(|c| self.parse_value(c)), ))(input)?; let fid = self.parse_fid(input, key)?; @@ -211,22 +216,23 @@ impl<'a> ParseContext<'a> { where E: FilterParserError<'a>, { - let err_msg_args_incomplete= "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; + let err_msg_args_incomplete = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; let err_msg_latitude_invalid = "_geoRadius. Latitude must be contained between -90 and 90 degrees."; let err_msg_longitude_invalid = "_geoRadius. Longitude must be contained between -180 and 180 degrees."; - let (input, args): (&str, Vec<&str>) = match preceded( + let parsed = preceded::<_, _, _, E, _, _>( tag("_geoRadius"), delimited( char('('), - separated_list1(tag(","), self.ws(|c| self.parse_value::(c))), + separated_list1(tag(","), self.ws(|c| recognize_float(c))), char(')'), ), - )(input) - { + )(input); + + let (input, args): (&str, Vec<&str>) = match parsed { Ok(e) => e, Err(_e) => { return Err(nom::Err::Failure(E::add_context( @@ -293,15 +299,30 @@ impl<'a> ParseContext<'a> { E: FilterParserError<'a>, { let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) + let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); + let quoted_key = |input| take_till(|c: char| c == '"')(input); + + alt(( + delimited(char('\''), simple_quoted_key, char('\'')), + delimited(char('"'), quoted_key, char('"')), + key, + ))(input) } fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> where E: FilterParserError<'a>, { - let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) + let key = + |input| take_till1(|c: char| c.is_ascii_whitespace() || c == '(' || c == ')')(input); + let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); + let quoted_key = |input| take_till(|c: char| c == '"')(input); + + alt(( + delimited(char('\''), simple_quoted_key, char('\'')), + delimited(char('"'), quoted_key, char('"')), + key, + ))(input) } fn is_key_component(c: char) -> bool { @@ -312,7 +333,7 @@ impl<'a> ParseContext<'a> { where E: FilterParserError<'a>, { - self.parse_or(input) + alt((|input| self.parse_or(input), |input| self.parse_and(input)))(input) } } @@ -481,6 +502,90 @@ mod tests { 
builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); + // Simple array with Left + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, _, _, &str>( + &rtxn, + &index, + vec![Either::Left(["channel = mv"])], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = mv").unwrap(); + assert_eq!(condition, expected); + + // Simple array with Right + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( + &rtxn, + &index, + vec![Either::Right("channel = mv")], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = mv").unwrap(); + assert_eq!(condition, expected); + + // Array with Left and escaped quote + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, _, _, &str>( + &rtxn, + &index, + vec![Either::Left(["channel = \"Mister Mv\""])], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = \"Mister Mv\"").unwrap(); + assert_eq!(condition, expected); + + // Array with Right and escaped quote + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( + &rtxn, + &index, + vec![Either::Right("channel = \"Mister Mv\"")], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = \"Mister Mv\"").unwrap(); + assert_eq!(condition, expected); + + // Array with Left and escaped simple quote + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, _, _, &str>( + &rtxn, + &index, + vec![Either::Left(["channel = 'Mister Mv'"])], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = 'Mister Mv'").unwrap(); + assert_eq!(condition, expected); + + // Array with Right and escaped simple quote + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( + &rtxn, + &index, + vec![Either::Right("channel = 'Mister Mv'")], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = 'Mister Mv'").unwrap(); + assert_eq!(condition, expected); + + // Simple with parenthesis + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, _, _, &str>( + &rtxn, + &index, + vec![Either::Left(["(channel = mv)"])], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "(channel = mv)").unwrap(); + assert_eq!(condition, expected); + // Test that the facet condition is correctly generated. 
let rtxn = index.read_txn().unwrap(); let condition = FilterCondition::from_array( @@ -501,6 +606,7 @@ mod tests { .unwrap(); assert_eq!(condition, expected); } + #[test] fn geo_radius() { let path = tempfile::tempdir().unwrap(); @@ -591,9 +697,11 @@ mod tests { let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); assert!(result.is_err()); let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); + assert!( + error.to_string().contains("Latitude must be contained between -90 and 90 degrees."), + "{}", + error.to_string() + ); // georadius have a bad latitude let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); From f8fe9316c08b924086a700a83616b8fa619707f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 21 Oct 2021 11:56:14 +0200 Subject: [PATCH 1085/1889] Update version for the next release (v0.18.1) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 2f3c26a7b..b3a15f100 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 83d5f67c4..a7af6fb9b 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index b0220993f..2701c36d7 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..01309797d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.18.0" +version = "0.18.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index b55c4b8d6..dd74ad7b6 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" From 36281a653f5052c677ad518fb680fc8ff044350c Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 21 Oct 2021 12:40:11 +0200 Subject: [PATCH 1086/1889] write all the simple tests --- milli/src/search/facet/filter_parser.rs | 121 +++++++++++++++++++++--- 1 file changed, 110 insertions(+), 11 deletions(-) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index cfa3cdae0..bd5aaf976 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -359,25 +359,124 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut map = index.fields_ids_map(&wtxn).unwrap(); map.insert("channel"); + map.insert("dog race"); + map.insert("subscribers"); index.put_fields_ids_map(&mut wtxn, &map).unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { S("channel") }); + builder.set_filterable_fields(hashset! 
{ S("channel"), S("dog race"), S("subscribers") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::Equal(None, S("ponce"))); - assert_eq!(condition, expected); - let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); + use FilterCondition as Fc; + let test_case = [ + // simple test + ( + Fc::from_str(&rtxn, &index, "channel = Ponce"), + Fc::Operator(0, Operator::Equal(None, S("ponce"))), + ), + // test all the quotes and simple quotes + ( + Fc::from_str(&rtxn, &index, "channel = 'Mister Mv'"), + Fc::Operator(0, Operator::Equal(None, S("mister mv"))), + ), + ( + Fc::from_str(&rtxn, &index, "channel = \"Mister Mv\""), + Fc::Operator(0, Operator::Equal(None, S("mister mv"))), + ), + ( + Fc::from_str(&rtxn, &index, "'dog race' = Borzoi"), + Fc::Operator(1, Operator::Equal(None, S("borzoi"))), + ), + ( + Fc::from_str(&rtxn, &index, "\"dog race\" = Chusky"), + Fc::Operator(1, Operator::Equal(None, S("chusky"))), + ), + ( + Fc::from_str(&rtxn, &index, "\"dog race\" = \"Bernese Mountain\""), + Fc::Operator(1, Operator::Equal(None, S("bernese mountain"))), + ), + ( + Fc::from_str(&rtxn, &index, "'dog race' = 'Bernese Mountain'"), + Fc::Operator(1, Operator::Equal(None, S("bernese mountain"))), + ), + ( + Fc::from_str(&rtxn, &index, "\"dog race\" = 'Bernese Mountain'"), + Fc::Operator(1, Operator::Equal(None, S("bernese mountain"))), + ), + // test all the operators + ( + Fc::from_str(&rtxn, &index, "channel != ponce"), + Fc::Operator(0, Operator::NotEqual(None, S("ponce"))), + ), + ( + Fc::from_str(&rtxn, &index, "NOT channel = ponce"), + Fc::Operator(0, Operator::NotEqual(None, S("ponce"))), + ), + ( + Fc::from_str(&rtxn, &index, "subscribers < 1000"), + Fc::Operator(2, Operator::LowerThan(1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "subscribers > 1000"), + Fc::Operator(2, Operator::GreaterThan(1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "subscribers <= 1000"), + Fc::Operator(2, Operator::LowerThanOrEqual(1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "subscribers >= 1000"), + Fc::Operator(2, Operator::GreaterThanOrEqual(1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "NOT subscribers < 1000"), + Fc::Operator(2, Operator::GreaterThanOrEqual(1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "NOT subscribers > 1000"), + Fc::Operator(2, Operator::LowerThanOrEqual(1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "NOT subscribers <= 1000"), + Fc::Operator(2, Operator::GreaterThan(1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "NOT subscribers >= 1000"), + Fc::Operator(2, Operator::LowerThan(1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "subscribers 100 TO 1000"), + Fc::Operator(2, Operator::Between(100., 1000.)), + ), + ( + Fc::from_str(&rtxn, &index, "NOT subscribers 100 TO 1000"), + Fc::Or( + Box::new(Fc::Operator(2, Operator::LowerThan(100.))), + Box::new(Fc::Operator(2, Operator::GreaterThan(1000.))), + ), + ), + ( + Fc::from_str(&rtxn, &index, "_geoRadius(12, 13, 14)"), + Fc::Operator(2, Operator::GeoLowerThan([12., 13.], 14.)), + ), + ( + Fc::from_str(&rtxn, &index, "NOT _geoRadius(12, 13, 14)"), + Fc::Operator(2, Operator::GeoGreaterThan([12., 13.], 14.)), + ), + ]; - let condition = 
FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); + for (result, expected) in test_case { + assert!( + result.is_ok(), + "Filter {:?} was supposed to be parsed but failed with the following error: `{}`", + expected, + result.unwrap_err() + ); + let filter = result.unwrap(); + assert_eq!(filter, expected,); + } } #[test] From 423baac08b1921594643eafcf5d4db8d679c84df Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 21 Oct 2021 12:45:40 +0200 Subject: [PATCH 1087/1889] fix the tests --- milli/src/search/facet/filter_parser.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index bd5aaf976..1ba4962f8 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -253,6 +253,7 @@ impl<'a> ParseContext<'a> { let fid = match self.fields_ids_map.id("_geo") { Some(fid) => fid, + // TODO send an error None => return Ok((input, FilterCondition::Empty)), }; @@ -361,9 +362,12 @@ mod tests { map.insert("channel"); map.insert("dog race"); map.insert("subscribers"); + map.insert("_geo"); index.put_fields_ids_map(&mut wtxn, &map).unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { S("channel"), S("dog race"), S("subscribers") }); + builder.set_filterable_fields( + hashset! { S("channel"), S("dog race"), S("subscribers"), S("_geo") }, + ); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -459,11 +463,11 @@ mod tests { ), ( Fc::from_str(&rtxn, &index, "_geoRadius(12, 13, 14)"), - Fc::Operator(2, Operator::GeoLowerThan([12., 13.], 14.)), + Fc::Operator(3, Operator::GeoLowerThan([12., 13.], 14.)), ), ( Fc::from_str(&rtxn, &index, "NOT _geoRadius(12, 13, 14)"), - Fc::Operator(2, Operator::GeoGreaterThan([12., 13.], 14.)), + Fc::Operator(3, Operator::GeoGreaterThan([12., 13.], 14.)), ), ]; From e1d81342cf39894c1c99c5e0e042752d1d909355 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 21 Oct 2021 13:01:25 +0200 Subject: [PATCH 1088/1889] add test on the or and and operator --- milli/src/search/facet/filter_parser.rs | 48 +++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 1ba4962f8..3454d91a4 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -469,6 +469,54 @@ mod tests { Fc::from_str(&rtxn, &index, "NOT _geoRadius(12, 13, 14)"), Fc::Operator(3, Operator::GeoGreaterThan([12., 13.], 14.)), ), + // test simple `or` and `and` + ( + Fc::from_str(&rtxn, &index, "channel = ponce AND 'dog race' != 'bernese mountain'"), + Fc::And( + Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), + Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), + ), + ), + ( + Fc::from_str(&rtxn, &index, "channel = ponce OR 'dog race' != 'bernese mountain'"), + Fc::Or( + Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), + Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), + ), + ), + ( + Fc::from_str( + &rtxn, + &index, + "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000", + ), + Fc::Or( + Box::new(Fc::And( + Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), + Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese 
mountain")))), + )), + Box::new(Fc::Operator(2, Operator::GreaterThan(1000.))), + ), + ), + // test parenthesis + /* + ( + Fc::from_str( + &rtxn, + &index, + "(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)", + ), + Fc::And( + Box::new(Fc::Or( + Box::new(Fc::And( + Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), + Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), + )), + Box::new(Fc::Operator(2, Operator::GreaterThan(1000.))), + )), + Box::new(Fc::Operator(3, Operator::GeoLowerThan([12., 13.], 14.)))) + ), + */ ]; for (result, expected) in test_case { From 6c15f50899ba37f624aa521e0940c1bb8c86b5ba Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 21 Oct 2021 16:45:42 +0200 Subject: [PATCH 1089/1889] rewrite the parser logic --- milli/src/search/facet/filter_parser.rs | 156 ++++++++++++------------ 1 file changed, 76 insertions(+), 80 deletions(-) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 3454d91a4..9440a44ca 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -1,9 +1,26 @@ +//! BNF grammar: +//! +//! ```text +//! expression = or +//! or = and (~ "OR" ~ and) +//! and = not (~ "AND" not)* +//! not = ("NOT" | "!") not | primary +//! primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius +//! to = value value TO value +//! condition = value ("==" | ">" ...) value +//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +//! singleQuoted = "'" .* all but quotes "'" +//! doubleQuoted = "\"" (word | spaces)* "\"" +//! word = (alphanumeric | _ | - | .)+ +//! geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +//! ``` + use std::collections::HashSet; use std::fmt::Debug; use std::result::Result as StdResult; use nom::branch::alt; -use nom::bytes::complete::{tag, take_till, take_till1, take_while1}; +use nom::bytes::complete::{tag, take_till, take_while1}; use nom::character::complete::{char, multispace0}; use nom::combinator::map; use nom::error::{ContextError, ErrorKind, VerboseError}; @@ -60,12 +77,14 @@ pub struct ParseContext<'a> { } impl<'a> ParseContext<'a> { + /// and = not (~ "AND" not)* fn parse_or(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: FilterParserError<'a>, { let (input, lhs) = self.parse_and(input)?; - let (input, ors) = many0(preceded(self.ws(tag("OR")), |c| Self::parse_or(self, c)))(input)?; + let (input, ors) = + many0(preceded(self.ws(tag("OR")), |c| Self::parse_and(self, c)))(input)?; let expr = ors .into_iter() @@ -78,49 +97,40 @@ impl<'a> ParseContext<'a> { E: FilterParserError<'a>, { let (input, lhs) = self.parse_not(input)?; - let (input, ors) = - many0(preceded(self.ws(tag("AND")), |c| Self::parse_and(self, c)))(input)?; + let (input, ors) = many0(preceded(self.ws(tag("AND")), |c| self.parse_not(c)))(input)?; let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); Ok((input, expr)) } + /// not = ("NOT" | "!") not | primary fn parse_not(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: FilterParserError<'a>, { alt(( - map( - preceded(alt((self.ws(tag("!")), self.ws(tag("NOT")))), |c| { - Self::parse_condition_expression(self, c) - }), - |e| e.negate(), - ), - |c| Self::parse_condition_expression(self, c), + map(preceded(alt((tag("!"), tag("NOT"))), |c| self.parse_not(c)), |e| e.negate()), + |c| self.parse_primary(c), ))(input) } fn 
ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> where - F: Fn(&'a str) -> IResult<&'a str, O, E>, + F: FnMut(&'a str) -> IResult<&'a str, O, E>, E: FilterParserError<'a>, { delimited(multispace0, inner, multispace0) } - fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + /// condition = value ("==" | ">" ...) value + fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: FilterParserError<'a>, { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); - let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_value(c))))( - input, - ); - let (input, (key, op, value)) = match k { - Ok(o) => o, - Err(e) => return Err(e), - }; + let (input, (key, op, value)) = + tuple((|c| self.parse_value(c), operator, |c| self.parse_value(c)))(input)?; let fid = self.parse_fid(input, key)?; let r: StdResult>> = self.parse_numeric(value); @@ -137,7 +147,17 @@ impl<'a> ParseContext<'a> { ); Ok((input, k)) } - ">" | "<" | "<=" | ">=" => self.parse_numeric_unary_condition(op, fid, value), + ">" | "<" | "<=" | ">=" => { + let numeric: f64 = self.parse_numeric(value)?; + let k = match op { + ">" => FilterCondition::Operator(fid, GreaterThan(numeric)), + "<" => FilterCondition::Operator(fid, LowerThan(numeric)), + "<=" => FilterCondition::Operator(fid, LowerThanOrEqual(numeric)), + ">=" => FilterCondition::Operator(fid, GreaterThanOrEqual(numeric)), + _ => unreachable!(), + }; + Ok((input, k)) + } _ => unreachable!(), } } @@ -156,26 +176,6 @@ impl<'a> ParseContext<'a> { } } - fn parse_numeric_unary_condition( - &'a self, - input: &'a str, - fid: u16, - value: &'a str, - ) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let numeric: f64 = self.parse_numeric(value)?; - let k = match input { - ">" => FilterCondition::Operator(fid, GreaterThan(numeric)), - "<" => FilterCondition::Operator(fid, LowerThan(numeric)), - "<=" => FilterCondition::Operator(fid, LowerThanOrEqual(numeric)), - ">=" => FilterCondition::Operator(fid, GreaterThanOrEqual(numeric)), - _ => unreachable!(), - }; - Ok((input, k)) - } - fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> where E: FilterParserError<'a>, @@ -193,12 +193,13 @@ impl<'a> ParseContext<'a> { } } - fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> + /// to = value value TO value + fn parse_to(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: FilterParserError<'a>, { let (input, (key, from, _, to)) = tuple(( - self.ws(|c| self.parse_key(c)), + self.ws(|c| self.parse_value(c)), self.ws(|c| self.parse_value(c)), tag("TO"), self.ws(|c| self.parse_value(c)), @@ -212,6 +213,7 @@ impl<'a> ParseContext<'a> { Ok((input, res)) } + /// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: FilterParserError<'a>, @@ -224,7 +226,8 @@ impl<'a> ParseContext<'a> { "_geoRadius. 
Longitude must be contained between -180 and 180 degrees."; let parsed = preceded::<_, _, _, E, _, _>( - tag("_geoRadius"), + // TODO: forbid spaces between _geoRadius and parenthesis + self.ws(tag("_geoRadius")), delimited( char('('), separated_list1(tag(","), self.ws(|c| recognize_float(c))), @@ -275,54 +278,35 @@ impl<'a> ParseContext<'a> { Ok((input, res)) } - fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let l1 = |c| self.parse_simple_condition(c); - let l2 = |c| self.parse_range_condition(c); - let l3 = |c| self.parse_geo_radius(c); - alt((l1, l2, l3))(input) - } - - fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> + /// primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius + fn parse_primary(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> where E: FilterParserError<'a>, { alt(( - delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), - |c| Self::parse_condition(self, c), - ))(input) - } - - fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: FilterParserError<'a>, - { - let key = |input| take_while1(Self::is_key_component)(input); - let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); - let quoted_key = |input| take_till(|c: char| c == '"')(input); - - alt(( - delimited(char('\''), simple_quoted_key, char('\'')), - delimited(char('"'), quoted_key, char('"')), - key, + delimited(self.ws(char('(')), |c| self.parse_expression(c), self.ws(char(')'))), + |c| self.parse_condition(c), + |c| self.parse_to(c), + |c| self.parse_geo_radius(c), ))(input) } + /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> where E: FilterParserError<'a>, { - let key = - |input| take_till1(|c: char| c.is_ascii_whitespace() || c == '(' || c == ')')(input); + // singleQuoted = "'" .* all but quotes "'" let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); + // doubleQuoted = "\"" (word | spaces)* "\"" let quoted_key = |input| take_till(|c: char| c == '"')(input); + // word = (alphanumeric | _ | - | .)+ + let word = |input| take_while1(Self::is_key_component)(input); alt(( - delimited(char('\''), simple_quoted_key, char('\'')), - delimited(char('"'), quoted_key, char('"')), - key, + self.ws(delimited(char('\''), simple_quoted_key, char('\''))), + self.ws(delimited(char('"'), quoted_key, char('"'))), + self.ws(word), ))(input) } @@ -330,11 +314,12 @@ impl<'a> ParseContext<'a> { c.is_alphanumeric() || ['_', '-', '.'].contains(&c) } + /// expression = or pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> where E: FilterParserError<'a>, { - alt((|input| self.parse_or(input), |input| self.parse_and(input)))(input) + self.parse_or(input) } } @@ -499,7 +484,19 @@ mod tests { ), ), // test parenthesis - /* + ( + Fc::from_str( + &rtxn, + &index, + "channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )", + ), + Fc::And( + Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), + Box::new(Fc::Or( + Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), + Box::new(Fc::Operator(2, Operator::GreaterThan(1000.))), + ))), + ), ( Fc::from_str( &rtxn, @@ -516,7 +513,6 @@ mod tests { )), Box::new(Fc::Operator(3, Operator::GeoLowerThan([12., 13.], 14.)))) ), - */ ]; for (result, expected) in test_case { From 
c634d43ac5f9bec5162b2158291045566b12338c Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 21 Oct 2021 17:10:27 +0200 Subject: [PATCH 1090/1889] add a simple test on the filters with an integer --- milli/src/search/facet/filter_parser.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 9440a44ca..47189841a 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -365,6 +365,10 @@ mod tests { Fc::from_str(&rtxn, &index, "channel = Ponce"), Fc::Operator(0, Operator::Equal(None, S("ponce"))), ), + ( + Fc::from_str(&rtxn, &index, "subscribers = 12"), + Fc::Operator(2, Operator::Equal(Some(12.), S("12"))), + ), // test all the quotes and simple quotes ( Fc::from_str(&rtxn, &index, "channel = 'Mister Mv'"), From 7e5c5c4d2750aea594dca8f0f597f13b60b6db1f Mon Sep 17 00:00:00 2001 From: Tamo Date: Fri, 22 Oct 2021 01:15:42 +0200 Subject: [PATCH 1091/1889] start a new rewrite of the filter parser --- Cargo.toml | 2 +- filter_parser/Cargo.toml | 15 + filter_parser/src/lib.rs | 623 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 639 insertions(+), 1 deletion(-) create mode 100644 filter_parser/Cargo.toml create mode 100644 filter_parser/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index b78989f50..5d2d47713 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["milli", "http-ui", "benchmarks", "infos", "helpers", "cli"] +members = ["milli", "filter_parser", "http-ui", "benchmarks", "infos", "helpers", "cli"] default-members = ["milli"] [profile.dev] diff --git a/filter_parser/Cargo.toml b/filter_parser/Cargo.toml new file mode 100644 index 000000000..d8a522b1b --- /dev/null +++ b/filter_parser/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "filter_parser" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +nom = "7.0.0" +nom_locate = "4.0.0" + +[dev-dependencies] +big_s = "1.0.2" +maplit = "1.0.2" +rand = "0.8.3" diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs new file mode 100644 index 000000000..6e6f5a1e6 --- /dev/null +++ b/filter_parser/src/lib.rs @@ -0,0 +1,623 @@ +#![allow(unused_imports)] +//! BNF grammar: +//! +//! ```text +//! expression = or +//! or = and (~ "OR" ~ and) +//! and = not (~ "AND" not)* +//! not = ("NOT" | "!") not | primary +//! primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius +//! to = value value TO value +//! condition = value ("==" | ">" ...) value +//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +//! singleQuoted = "'" .* all but quotes "'" +//! doubleQuoted = "\"" (word | spaces)* "\"" +//! word = (alphanumeric | _ | - | .)+ +//! geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +//! 
``` + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FilterCondition<'a> { + Operator { fid: Token<'a>, op: Operator<'a> }, + Or(Box<Self>, Box<Self>), + And(Box<Self>, Box<Self>), + GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, + GeoGreaterThan { point: [Token<'a>; 2], radius: Token<'a> }, + Empty, +} + +impl<'a> FilterCondition<'a> { + pub fn negate(self) -> FilterCondition<'a> { + use FilterCondition::*; + + match self { + Operator { fid, op } => match op.negate() { + (op, None) => Operator { fid, op }, + (a, Some(b)) => { + Or(Operator { fid: fid.clone(), op: a }.into(), Operator { fid, op: b }.into()) + } + }, + Or(a, b) => And(a.negate().into(), b.negate().into()), + And(a, b) => Or(a.negate().into(), b.negate().into()), + Empty => Empty, + GeoLowerThan { point, radius } => GeoGreaterThan { point, radius }, + GeoGreaterThan { point, radius } => GeoLowerThan { point, radius }, + } + } + + pub fn parse(input: &'a str) -> IResult<Span, Self> { + let span = Span::new(input); + parse_expression(span) + } +} + +use std::collections::HashSet; +use std::fmt::Debug; +use std::result::Result as StdResult; + +use nom::branch::alt; +use nom::bytes::complete::{tag, take_till, take_while1}; +use nom::character::complete::{char, multispace0}; +use nom::combinator::map; +use nom::error::{ContextError, ErrorKind, ParseError, VerboseError}; +use nom::multi::{many0, separated_list1}; +use nom::number::complete::recognize_float; +use nom::sequence::{delimited, preceded, tuple}; +use nom::IResult; +use nom_locate::LocatedSpan; + +use self::Operator::*; + +pub enum FilterError { + AttributeNotFilterable(String), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Token<'a> { + pub position: Span<'a>, + pub inner: &'a str, +} + +impl<'a> Token<'a> { + pub fn new(position: Span<'a>) -> Self { + Self { position, inner: &position } + } +} + +impl<'a> From<Span<'a>> for Token<'a> { + fn from(span: Span<'a>) -> Self { + Self { inner: &span, position: span } + } +} + +type Span<'a> = LocatedSpan<&'a str>; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Operator<'a> { + GreaterThan(Token<'a>), + GreaterThanOrEqual(Token<'a>), + Equal(Token<'a>), + NotEqual(Token<'a>), + LowerThan(Token<'a>), + LowerThanOrEqual(Token<'a>), + Between { from: Token<'a>, to: Token<'a> }, +} + +impl<'a> Operator<'a> { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`).
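The `Between` arm is the reason `negate`, defined just below, returns a pair rather than a single operator: `NOT (x BETWEEN a AND b)` is only expressible as `x < a OR x > b`. A self-contained sketch of that negation table, using plain `f64` values in place of the crate's span-carrying `Token<'a>` (the names here are illustrative, not the crate's API):

```rust
// Hedged sketch: `Op` stands in for the patch's `Operator<'a>`, with bare
// numbers instead of tokens that remember their position in the input.
#[derive(Debug, PartialEq)]
enum Op {
    GreaterThan(f64),
    GreaterThanOrEqual(f64),
    LowerThan(f64),
    LowerThanOrEqual(f64),
    Between { from: f64, to: f64 },
}

fn negate(op: Op) -> (Op, Option<Op>) {
    use Op::*;
    match op {
        GreaterThan(n) => (LowerThanOrEqual(n), None),
        GreaterThanOrEqual(n) => (LowerThan(n), None),
        LowerThan(n) => (GreaterThanOrEqual(n), None),
        LowerThanOrEqual(n) => (GreaterThan(n), None),
        // NOT (from <= x <= to)  <=>  x < from OR x > to: two conditions,
        // which the caller joins with an `Or` node.
        Between { from, to } => (LowerThan(from), Some(GreaterThan(to))),
    }
}

fn main() {
    let (left, right) = negate(Op::Between { from: 100.0, to: 1000.0 });
    assert_eq!(left, Op::LowerThan(100.0));
    assert_eq!(right, Some(Op::GreaterThan(1000.0)));
}
```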
+ pub fn negate(self) -> (Self, Option<Self>) { + match self { + GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThanOrEqual(n) => (LowerThan(n), None), + Equal(s) => (NotEqual(s), None), + NotEqual(s) => (Equal(s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), + } + } +} + +pub trait FilterParserError<'a>: + nom::error::ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug +{ +} + +impl<'a> FilterParserError<'a> for VerboseError<&'a str> {} + +/// or = and (~ "OR" ~ and)* +fn parse_or(input: Span) -> IResult<Span, FilterCondition> { + let (input, lhs) = parse_and(input)?; + let (input, ors) = many0(preceded(ws(tag("OR")), |c| parse_and(c)))(input)?; + + let expr = ors + .into_iter() + .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); + Ok((input, expr)) +} + +fn parse_and(input: Span) -> IResult<Span, FilterCondition> { + let (input, lhs) = parse_not(input)?; + let (input, ors) = many0(preceded(ws(tag("AND")), |c| parse_not(c)))(input)?; + let expr = ors + .into_iter() + .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); + Ok((input, expr)) +} + +/// not = ("NOT" | "!") not | primary +fn parse_not(input: Span) -> IResult<Span, FilterCondition> { + alt((map(preceded(alt((tag("!"), tag("NOT"))), |c| parse_not(c)), |e| e.negate()), |c| { + parse_primary(c) + }))(input) +} + +fn ws<'a, O>( + inner: impl FnMut(Span<'a>) -> IResult<Span<'a>, O>, +) -> impl FnMut(Span<'a>) -> IResult<Span<'a>, O> { + delimited(multispace0, inner, multispace0) +} + +/// condition = value ("==" | ">" ...) value +fn parse_condition(input: Span) -> IResult<Span, FilterCondition> { + let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); + let (input, (key, op, value)) = + tuple((|c| parse_value(c), operator, |c| parse_value(c)))(input)?; + + let fid = key.into(); + + // TODO + match *op.fragment() { + "=" => { + let k = FilterCondition::Operator { fid, op: Equal(value.into()) }; + Ok((input, k)) + } + "!=" => { + let k = FilterCondition::Operator { fid, op: NotEqual(value.into()) }; + Ok((input, k)) + } + ">" | "<" | "<=" | ">=" => { + let k = match *op.fragment() { + ">" => FilterCondition::Operator { fid, op: GreaterThan(value.into()) }, + "<" => FilterCondition::Operator { fid, op: LowerThan(value.into()) }, + "<=" => FilterCondition::Operator { fid, op: LowerThanOrEqual(value.into()) }, + ">=" => FilterCondition::Operator { fid, op: GreaterThanOrEqual(value.into()) }, + _ => unreachable!(), + }; + Ok((input, k)) + } + _ => unreachable!(), + } +} + +/// to = value value TO value +fn parse_to(input: Span) -> IResult<Span, FilterCondition> { + let (input, (key, from, _, to)) = + tuple((ws(|c| parse_value(c)), ws(|c| parse_value(c)), tag("TO"), ws(|c| parse_value(c))))( + input, + )?; + + Ok(( + input, + FilterCondition::Operator { + fid: key.into(), + op: Between { from: from.into(), to: to.into() }, + }, + )) +} + +/// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +fn parse_geo_radius(input: Span) -> IResult<Span, FilterCondition> { + let err_msg_args_incomplete = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; + /* + TODO + let err_msg_latitude_invalid = + "_geoRadius. Latitude must be contained between -90 and 90 degrees."; + + let err_msg_longitude_invalid = + "_geoRadius.
Longitude must be contained between -180 and 180 degrees."; + */ + + let parsed = preceded::<_, _, _, _, _, _>( + // TODO: forbid spaces between _geoRadius and parenthesis + ws(tag("_geoRadius")), + delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')')), + )(input); + + let (input, args): (Span, Vec) = match parsed { + Ok(e) => e, + Err(_e) => { + return Err(nom::Err::Failure(nom::error::Error::add_context( + input, + err_msg_args_incomplete, + nom::error::Error::from_char(input, '('), + ))); + } + }; + + if args.len() != 3 { + let e = nom::error::Error::from_char(input, '('); + return Err(nom::Err::Failure(nom::error::Error::add_context( + input, + err_msg_args_incomplete, + e, + ))); + } + + let res = FilterCondition::GeoLowerThan { + point: [args[0].into(), args[1].into()], + radius: args[2].into(), + }; + Ok((input, res)) +} + +/// primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius +fn parse_primary(input: Span) -> IResult { + alt(( + delimited(ws(char('(')), |c| parse_expression(c), ws(char(')'))), + |c| parse_condition(c), + |c| parse_to(c), + |c| parse_geo_radius(c), + ))(input) +} + +/// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +fn parse_value(input: Span) -> IResult { + // singleQuoted = "'" .* all but quotes "'" + let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); + // doubleQuoted = "\"" (word | spaces)* "\"" + let quoted_key = |input| take_till(|c: char| c == '"')(input); + // word = (alphanumeric | _ | - | .)+ + let word = |input| take_while1(is_key_component)(input); + + alt(( + ws(delimited(char('\''), simple_quoted_key, char('\''))), + ws(delimited(char('"'), quoted_key, char('"'))), + ws(word), + ))(input) +} + +fn is_key_component(c: char) -> bool { + c.is_alphanumeric() || ['_', '-', '.'].contains(&c) +} + +/// expression = or +pub fn parse_expression(input: Span) -> IResult { + parse_or(input) +} + +#[cfg(test)] +mod tests { + use big_s::S; + use maplit::hashset; + + use super::*; + + /// Create a raw [Token]. 
You must specify the string that appear BEFORE your element followed by your element + fn rtok<'a>(before: &'a str, value: &'a str) -> Token<'a> { + // if the string is empty we still need to return 1 for the line number + let lines = before.is_empty().then(|| 1).unwrap_or_else(|| before.lines().count()); + let offset = before.chars().count(); + unsafe { Span::new_from_raw_offset(offset, lines as u32, value, ()) }.into() + } + + #[test] + fn parse() { + use FilterCondition as Fc; + + // new_from_raw_offset is unsafe + let test_case = [ + // simple test + ( + "channel = Ponce", + Fc::Operator { + fid: rtok("", "channel"), + op: Operator::Equal(rtok("channel = ", "Ponce")), + }, + ), + ( + "subscribers = 12", + Fc::Operator { + fid: rtok("", "subscribers"), + op: Operator::Equal(rtok("subscribers = ", "12")), + }, + ), + // test all the quotes and simple quotes + ( + "channel = 'Mister Mv'", + Fc::Operator { + fid: rtok("", "channel"), + op: Operator::Equal(rtok("channel = '", "Mister Mv")), + }, + ), + ( + "channel = \"Mister Mv\"", + Fc::Operator { + fid: rtok("", "channel"), + op: Operator::Equal(rtok("channel = \"", "Mister Mv")), + }, + ), + ( + "'dog race' = Borzoi", + Fc::Operator { + fid: rtok("'", "dog race"), + op: Operator::Equal(rtok("'dog race' = ", "Borzoi")), + }, + ), + ( + "\"dog race\" = Chusky", + Fc::Operator { + fid: rtok("\"", "dog race"), + op: Operator::Equal(rtok("\"dog race\" = ", "Chusky")), + }, + ), + ( + "\"dog race\" = \"Bernese Mountain\"", + Fc::Operator { + fid: rtok("\"", "dog race"), + op: Operator::Equal(rtok("\"dog race\" = \"", "Bernese Mountain")), + }, + ), + ( + "'dog race' = 'Bernese Mountain'", + Fc::Operator { + fid: rtok("'", "dog race"), + op: Operator::Equal(rtok("'dog race' = '", "Bernese Mountain")), + }, + ), + ( + "\"dog race\" = 'Bernese Mountain'", + Fc::Operator { + fid: rtok("\"", "dog race"), + op: Operator::Equal(rtok("\"dog race\" = \"", "Bernese Mountain")), + }, + ), + // test all the operators + ( + "channel != ponce", + Fc::Operator { + fid: rtok("", "channel"), + op: Operator::NotEqual(rtok("channel != ", "ponce")), + }, + ), + ( + "NOT channel = ponce", + Fc::Operator { + fid: rtok("NOT ", "channel"), + op: Operator::NotEqual(rtok("NOT channel = ", "ponce")), + }, + ), + ( + "subscribers < 1000", + Fc::Operator { + fid: rtok("", "subscribers"), + op: Operator::LowerThan(rtok("subscribers < ", "1000")), + }, + ), + ( + "subscribers > 1000", + Fc::Operator { + fid: rtok("", "subscribers"), + op: Operator::GreaterThan(rtok("subscribers > ", "1000")), + }, + ), + ( + "subscribers <= 1000", + Fc::Operator { + fid: rtok("", "subscribers"), + op: Operator::LowerThanOrEqual(rtok("subscribers <= ", "1000")), + }, + ), + ( + "subscribers >= 1000", + Fc::Operator { + fid: rtok("", "subscribers"), + op: Operator::GreaterThanOrEqual(rtok("subscribers >= ", "1000")), + }, + ), + ( + "NOT subscribers < 1000", + Fc::Operator { + fid: rtok("NOT ", "subscribers"), + op: Operator::GreaterThanOrEqual(rtok("NOT subscribers < ", "1000")), + }, + ), + ( + "NOT subscribers > 1000", + Fc::Operator { + fid: rtok("NOT ", "subscribers"), + op: Operator::LowerThanOrEqual(rtok("NOT subscribers > ", "1000")), + }, + ), + ( + "NOT subscribers <= 1000", + Fc::Operator { + fid: rtok("NOT ", "subscribers"), + op: Operator::GreaterThan(rtok("NOT subscribers <= ", "1000")), + }, + ), + ( + "NOT subscribers >= 1000", + Fc::Operator { + fid: rtok("NOT ", "subscribers"), + op: Operator::LowerThan(rtok("NOT subscribers >= ", "1000")), + }, + ), + ( + "subscribers 100 
TO 1000", + Fc::Operator { + fid: rtok("", "subscribers"), + op: Operator::Between { + from: rtok("subscribers ", "100"), + to: rtok("subscribers 100 TO ", "1000"), + }, + }, + ), + ( + "NOT subscribers 100 TO 1000", + Fc::Or( + Fc::Operator { + fid: rtok("NOT ", "subscribers"), + op: Operator::LowerThan(rtok("NOT subscribers ", "100")), + } + .into(), + Fc::Operator { + fid: rtok("NOT ", "subscribers"), + op: Operator::GreaterThan(rtok("NOT subscribers 100 TO ", "1000")), + } + .into(), + ), + ), + ( + "_geoRadius(12, 13, 14)", + Fc::GeoLowerThan { + point: [rtok("_geoRadius(", "12"), rtok("_geoRadius(12, ", "13")], + radius: rtok("_geoRadius(12, 13, ", "14"), + }, + ), + ( + "NOT _geoRadius(12, 13, 14)", + Fc::GeoGreaterThan { + point: [rtok("NOT _geoRadius(", "12"), rtok("NOT _geoRadius(12, ", "13")], + radius: rtok("NOT _geoRadius(12, 13, ", "14"), + }, + ), + // test simple `or` and `and` + ( + "channel = ponce AND 'dog race' != 'bernese mountain'", + Fc::And( + Fc::Operator { + fid: rtok("", "channel"), + op: Operator::Equal(rtok("channel = ", "ponce")), + } + .into(), + Fc::Operator { + fid: rtok("channel = ponce AND '", "dog race"), + op: Operator::NotEqual(rtok( + "channel = ponce AND 'dog race' != '", + "bernese mountain", + )), + } + .into(), + ), + ), + ( + "channel = ponce OR 'dog race' != 'bernese mountain'", + Fc::Or( + Fc::Operator { + fid: rtok("", "channel"), + op: Operator::Equal(rtok("channel = ", "ponce")), + } + .into(), + Fc::Operator { + fid: rtok("channel = ponce OR '", "dog race"), + op: Operator::NotEqual(rtok( + "channel = ponce OR 'dog race' != '", + "bernese mountain", + )), + } + .into(), + ), + ), + ( + "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000", + Fc::Or( + Fc::And( + Fc::Operator { + fid: rtok("", "channel"), + op: Operator::Equal(rtok("channel = ", "ponce")), + } + .into(), + Fc::Operator { + fid: rtok("channel = ponce AND '", "dog race"), + op: Operator::NotEqual(rtok( + "channel = ponce AND 'dog race' != '", + "bernese mountain", + )), + } + .into(), + ) + .into(), + Fc::Operator { + fid: rtok( + "channel = ponce AND 'dog race' != 'bernese mountain' OR ", + "subscribers", + ), + op: Operator::GreaterThan(rtok( + "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > ", + "1000", + )), + } + .into(), + ), + ), + // test parenthesis + ( + "channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )", + Fc::And( + Fc::Operator { fid: rtok("", "channel"), op: Operator::Equal(rtok("channel = ", "ponce")) }.into(), + Fc::Or( + Fc::Operator { fid: rtok("channel = ponce AND ( '", "dog race"), op: Operator::NotEqual(rtok("channel = ponce AND ( 'dog race' != '", "bernese mountain"))}.into(), + Fc::Operator { fid: rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Operator::GreaterThan(rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(), + ).into()), + ), + ( + "(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)", + Fc::And( + Fc::Or( + Fc::And( + Fc::Operator { fid: rtok("(", "channel"), op: Operator::Equal(rtok("(channel = ", "ponce")) }.into(), + Fc::Operator { fid: rtok("(channel = ponce AND '", "dog race"), op: Operator::NotEqual(rtok("(channel = ponce AND 'dog race' != '", "bernese mountain")) }.into(), + ).into(), + Fc::Operator { fid: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Operator::GreaterThan(rtok("(channel 
= ponce AND 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(), + ).into(), + Fc::GeoLowerThan { point: [rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(", "12"), rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, ", "13")], radius: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, ", "14") }.into() + ) + ) + ]; + + for (input, expected) in test_case { + let result = Fc::parse(input); + + assert!( + result.is_ok(), + "Filter `{:?}` was supposed to be parsed but failed with the following error: `{}`", + expected, + result.unwrap_err() + ); + let filter = result.unwrap().1; + assert_eq!(filter, expected, "Filter `{}` failed.", input); + } + } + + #[test] + fn name() { + use FilterCondition as Fc; + + // new_from_raw_offset is unsafe + let test_case = [ + // simple test + ( + "channel=Ponce", + Fc::Operator { + fid: rtok("", "channel"), + op: Operator::Equal(rtok("channel = ", "Ponce")), + }, + ), + ]; + + for (input, expected) in test_case { + let result = Fc::parse(input); + + assert!( + result.is_ok(), + "Filter `{:?}` was supposed to be parsed but failed with the following error: `{}`", + expected, + result.unwrap_err() + ); + let filter = result.unwrap().1; + assert_eq!(filter, expected, "Filter `{}` failed.", input); + } + } +} From 01dedde1c9e1e2195d4b72252cf8153fdb743aa9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Fri, 22 Oct 2021 01:59:38 +0200 Subject: [PATCH 1092/1889] update some names and move some parser out of the lib.rs --- filter_parser/Cargo.toml | 5 - filter_parser/src/condition.rs | 94 ++++++ filter_parser/src/lib.rs | 355 +++++++-------------- filter_parser/src/value.rs | 71 +++++ milli/Cargo.toml | 1 + milli/src/search/facet/filter_condition.rs | 22 +- milli/src/search/facet/filter_parser.rs | 52 +-- milli/src/search/mod.rs | 3 +- 8 files changed, 318 insertions(+), 285 deletions(-) create mode 100644 filter_parser/src/condition.rs create mode 100644 filter_parser/src/value.rs diff --git a/filter_parser/Cargo.toml b/filter_parser/Cargo.toml index d8a522b1b..80767d5c4 100644 --- a/filter_parser/Cargo.toml +++ b/filter_parser/Cargo.toml @@ -8,8 +8,3 @@ edition = "2021" [dependencies] nom = "7.0.0" nom_locate = "4.0.0" - -[dev-dependencies] -big_s = "1.0.2" -maplit = "1.0.2" -rand = "0.8.3" diff --git a/filter_parser/src/condition.rs b/filter_parser/src/condition.rs new file mode 100644 index 000000000..5a1bb62be --- /dev/null +++ b/filter_parser/src/condition.rs @@ -0,0 +1,94 @@ +//! BNF grammar: +//! +//! ```text +//! condition = value ("==" | ">" ...) value +//! to = value value TO value +//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +//! singleQuoted = "'" .* all but quotes "'" +//! doubleQuoted = "\"" (word | spaces)* "\"" +//! word = (alphanumeric | _ | - | .)+ +//! geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +//! 
``` + +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::sequence::tuple; +use nom::IResult; +use Condition::*; + +use crate::{parse_value, ws, FilterCondition, Span, Token}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Condition<'a> { + GreaterThan(Token<'a>), + GreaterThanOrEqual(Token<'a>), + Equal(Token<'a>), + NotEqual(Token<'a>), + LowerThan(Token<'a>), + LowerThanOrEqual(Token<'a>), + Between { from: Token<'a>, to: Token<'a> }, +} + +impl<'a> Condition<'a> { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). + pub fn negate(self) -> (Self, Option) { + match self { + GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThanOrEqual(n) => (LowerThan(n), None), + Equal(s) => (NotEqual(s), None), + NotEqual(s) => (Equal(s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), + } + } +} + +/// condition = value ("==" | ">" ...) value +pub fn parse_condition(input: Span) -> IResult { + let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); + let (input, (key, op, value)) = + tuple((|c| parse_value(c), operator, |c| parse_value(c)))(input)?; + + let fid = key; + + // TODO + match *op.fragment() { + "=" => { + let k = FilterCondition::Condition { fid, op: Equal(value) }; + Ok((input, k)) + } + "!=" => { + let k = FilterCondition::Condition { fid, op: NotEqual(value) }; + Ok((input, k)) + } + ">" | "<" | "<=" | ">=" => { + let k = match *op.fragment() { + ">" => FilterCondition::Condition { fid, op: GreaterThan(value) }, + "<" => FilterCondition::Condition { fid, op: LowerThan(value) }, + "<=" => FilterCondition::Condition { fid, op: LowerThanOrEqual(value) }, + ">=" => FilterCondition::Condition { fid, op: GreaterThanOrEqual(value) }, + _ => unreachable!(), + }; + Ok((input, k)) + } + _ => unreachable!(), + } +} + +/// to = value value TO value +pub fn parse_to(input: Span) -> IResult { + let (input, (key, from, _, to)) = + tuple((ws(|c| parse_value(c)), ws(|c| parse_value(c)), tag("TO"), ws(|c| parse_value(c))))( + input, + )?; + + Ok(( + input, + FilterCondition::Condition { + fid: key.into(), + op: Between { from: from.into(), to: to.into() }, + }, + )) +} diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 6e6f5a1e6..096a9e26e 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -1,4 +1,3 @@ -#![allow(unused_imports)] //! BNF grammar: //! //! ```text @@ -7,8 +6,8 @@ //! and = not (~ "AND" not)* //! not = ("NOT" | "!") not | primary //! primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius -//! to = value value TO value //! condition = value ("==" | ">" ...) value +//! to = value value TO value //! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" (word | spaces)* "\"" @@ -16,61 +15,24 @@ //! geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) //! 
``` -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum FilterCondition<'a> { - Operator { fid: Token<'a>, op: Operator<'a> }, - Or(Box, Box), - And(Box, Box), - GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, - GeoGreaterThan { point: [Token<'a>; 2], radius: Token<'a> }, - Empty, -} - -impl<'a> FilterCondition<'a> { - pub fn negate(self) -> FilterCondition<'a> { - use FilterCondition::*; - - match self { - Operator { fid, op } => match op.negate() { - (op, None) => Operator { fid, op }, - (a, Some(b)) => { - Or(Operator { fid: fid.clone(), op: a }.into(), Operator { fid, op: b }.into()) - } - }, - Or(a, b) => And(a.negate().into(), b.negate().into()), - And(a, b) => Or(a.negate().into(), b.negate().into()), - Empty => Empty, - GeoLowerThan { point, radius } => GeoGreaterThan { point, radius }, - GeoGreaterThan { point, radius } => GeoLowerThan { point, radius }, - } - } - - pub fn parse(input: &'a str) -> IResult { - let span = Span::new(input); - parse_expression(span) - } -} - -use std::collections::HashSet; +mod condition; +mod value; use std::fmt::Debug; -use std::result::Result as StdResult; +pub use condition::{parse_condition, parse_to, Condition}; use nom::branch::alt; -use nom::bytes::complete::{tag, take_till, take_while1}; +use nom::bytes::complete::tag; use nom::character::complete::{char, multispace0}; use nom::combinator::map; -use nom::error::{ContextError, ErrorKind, ParseError, VerboseError}; +use nom::error::{ContextError, ParseError}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; -use nom::sequence::{delimited, preceded, tuple}; +use nom::sequence::{delimited, preceded}; use nom::IResult; use nom_locate::LocatedSpan; +pub(crate) use value::parse_value; -use self::Operator::*; - -pub enum FilterError { - AttributeNotFilterable(String), -} +type Span<'a> = LocatedSpan<&'a str>; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Token<'a> { @@ -90,42 +52,49 @@ impl<'a> From> for Token<'a> { } } -type Span<'a> = LocatedSpan<&'a str>; - #[derive(Debug, Clone, PartialEq, Eq)] -pub enum Operator<'a> { - GreaterThan(Token<'a>), - GreaterThanOrEqual(Token<'a>), - Equal(Token<'a>), - NotEqual(Token<'a>), - LowerThan(Token<'a>), - LowerThanOrEqual(Token<'a>), - Between { from: Token<'a>, to: Token<'a> }, +pub enum FilterCondition<'a> { + Condition { fid: Token<'a>, op: Condition<'a> }, + Or(Box, Box), + And(Box, Box), + GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, + GeoGreaterThan { point: [Token<'a>; 2], radius: Token<'a> }, + Empty, } -impl<'a> Operator<'a> { - /// This method can return two operations in case it must express - /// an OR operation for the between case (i.e. `TO`). 
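The `ws` helper added above is the one piece of glue every rule relies on: each sub-parser trims its own surrounding blanks, so the grammar rules never have to mention whitespace explicitly. Here is a minimal standalone version of the same combinator, specialized to plain `&str` input instead of the crate's located `Span` alias (nom 7 API; the `alpha1` inner parser and the test input are invented for illustration):

```rust
use nom::character::complete::{alpha1, multispace0};
use nom::sequence::delimited;
use nom::IResult;

// Strip optional whitespace before and after whatever `inner` recognizes.
fn ws<'a, O>(
    inner: impl FnMut(&'a str) -> IResult<&'a str, O>,
) -> impl FnMut(&'a str) -> IResult<&'a str, O> {
    delimited(multispace0, inner, multispace0)
}

fn main() {
    let mut parser = ws(alpha1);
    // Both sides are trimmed; the rest of the input is handed back untouched.
    assert_eq!(parser("  channel  = Ponce"), Ok(("= Ponce", "channel")));
}
```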
- pub fn negate(self) -> (Self, Option) { +impl<'a> FilterCondition<'a> { + pub fn negate(self) -> FilterCondition<'a> { + use FilterCondition::*; + match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), - GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(s) => (NotEqual(s), None), - NotEqual(s) => (Equal(s), None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), + Condition { fid, op } => match op.negate() { + (op, None) => Condition { fid, op }, + (a, Some(b)) => Or( + Condition { fid: fid.clone(), op: a }.into(), + Condition { fid, op: b }.into(), + ), + }, + Or(a, b) => And(a.negate().into(), b.negate().into()), + And(a, b) => Or(a.negate().into(), b.negate().into()), + Empty => Empty, + GeoLowerThan { point, radius } => GeoGreaterThan { point, radius }, + GeoGreaterThan { point, radius } => GeoLowerThan { point, radius }, } } + + pub fn parse(input: &'a str) -> IResult { + let span = Span::new(input); + parse_expression(span) + } } -pub trait FilterParserError<'a>: - nom::error::ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug -{ +// remove OPTIONAL whitespaces before AND after the the provided parser +fn ws<'a, O>( + inner: impl FnMut(Span<'a>) -> IResult, +) -> impl FnMut(Span<'a>) -> IResult { + delimited(multispace0, inner, multispace0) } -impl<'a> FilterParserError<'a> for VerboseError<&'a str> {} - /// and = not (~ "AND" not)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; @@ -153,60 +122,6 @@ fn parse_not(input: Span) -> IResult { }))(input) } -fn ws<'a, O>( - inner: impl FnMut(Span<'a>) -> IResult, -) -> impl FnMut(Span<'a>) -> IResult { - delimited(multispace0, inner, multispace0) -} - -/// condition = value ("==" | ">" ...) value -fn parse_condition(input: Span) -> IResult { - let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); - let (input, (key, op, value)) = - tuple((|c| parse_value(c), operator, |c| parse_value(c)))(input)?; - - let fid = key.into(); - - // TODO - match *op.fragment() { - "=" => { - let k = FilterCondition::Operator { fid, op: Equal(value.into()) }; - Ok((input, k)) - } - "!=" => { - let k = FilterCondition::Operator { fid, op: NotEqual(value.into()) }; - Ok((input, k)) - } - ">" | "<" | "<=" | ">=" => { - let k = match *op.fragment() { - ">" => FilterCondition::Operator { fid, op: GreaterThan(value.into()) }, - "<" => FilterCondition::Operator { fid, op: LowerThan(value.into()) }, - "<=" => FilterCondition::Operator { fid, op: LowerThanOrEqual(value.into()) }, - ">=" => FilterCondition::Operator { fid, op: GreaterThanOrEqual(value.into()) }, - _ => unreachable!(), - }; - Ok((input, k)) - } - _ => unreachable!(), - } -} - -/// to = value value TO value -fn parse_to(input: Span) -> IResult { - let (input, (key, from, _, to)) = - tuple((ws(|c| parse_value(c)), ws(|c| parse_value(c)), tag("TO"), ws(|c| parse_value(c))))( - input, - )?; - - Ok(( - input, - FilterCondition::Operator { - fid: key.into(), - op: Between { from: from.into(), to: to.into() }, - }, - )) -} - /// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) fn parse_geo_radius(input: Span) -> IResult { let err_msg_args_incomplete = "_geoRadius. 
The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; @@ -262,40 +177,17 @@ fn parse_primary(input: Span) -> IResult { ))(input) } -/// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* -fn parse_value(input: Span) -> IResult { - // singleQuoted = "'" .* all but quotes "'" - let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); - // doubleQuoted = "\"" (word | spaces)* "\"" - let quoted_key = |input| take_till(|c: char| c == '"')(input); - // word = (alphanumeric | _ | - | .)+ - let word = |input| take_while1(is_key_component)(input); - - alt(( - ws(delimited(char('\''), simple_quoted_key, char('\''))), - ws(delimited(char('"'), quoted_key, char('"'))), - ws(word), - ))(input) -} - -fn is_key_component(c: char) -> bool { - c.is_alphanumeric() || ['_', '-', '.'].contains(&c) -} - /// expression = or pub fn parse_expression(input: Span) -> IResult { parse_or(input) } #[cfg(test)] -mod tests { - use big_s::S; - use maplit::hashset; - +pub mod tests { use super::*; /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element - fn rtok<'a>(before: &'a str, value: &'a str) -> Token<'a> { + pub fn rtok<'a>(before: &'a str, value: &'a str) -> Token<'a> { // if the string is empty we still need to return 1 for the line number let lines = before.is_empty().then(|| 1).unwrap_or_else(|| before.lines().count()); let offset = before.chars().count(); @@ -306,149 +198,148 @@ mod tests { fn parse() { use FilterCondition as Fc; - // new_from_raw_offset is unsafe let test_case = [ // simple test ( "channel = Ponce", - Fc::Operator { + Fc::Condition { fid: rtok("", "channel"), - op: Operator::Equal(rtok("channel = ", "Ponce")), + op: Condition::Equal(rtok("channel = ", "Ponce")), }, ), ( "subscribers = 12", - Fc::Operator { + Fc::Condition { fid: rtok("", "subscribers"), - op: Operator::Equal(rtok("subscribers = ", "12")), + op: Condition::Equal(rtok("subscribers = ", "12")), }, ), // test all the quotes and simple quotes ( "channel = 'Mister Mv'", - Fc::Operator { + Fc::Condition { fid: rtok("", "channel"), - op: Operator::Equal(rtok("channel = '", "Mister Mv")), + op: Condition::Equal(rtok("channel = '", "Mister Mv")), }, ), ( "channel = \"Mister Mv\"", - Fc::Operator { + Fc::Condition { fid: rtok("", "channel"), - op: Operator::Equal(rtok("channel = \"", "Mister Mv")), + op: Condition::Equal(rtok("channel = \"", "Mister Mv")), }, ), ( "'dog race' = Borzoi", - Fc::Operator { + Fc::Condition { fid: rtok("'", "dog race"), - op: Operator::Equal(rtok("'dog race' = ", "Borzoi")), + op: Condition::Equal(rtok("'dog race' = ", "Borzoi")), }, ), ( "\"dog race\" = Chusky", - Fc::Operator { + Fc::Condition { fid: rtok("\"", "dog race"), - op: Operator::Equal(rtok("\"dog race\" = ", "Chusky")), + op: Condition::Equal(rtok("\"dog race\" = ", "Chusky")), }, ), ( "\"dog race\" = \"Bernese Mountain\"", - Fc::Operator { + Fc::Condition { fid: rtok("\"", "dog race"), - op: Operator::Equal(rtok("\"dog race\" = \"", "Bernese Mountain")), + op: Condition::Equal(rtok("\"dog race\" = \"", "Bernese Mountain")), }, ), ( "'dog race' = 'Bernese Mountain'", - Fc::Operator { + Fc::Condition { fid: rtok("'", "dog race"), - op: Operator::Equal(rtok("'dog race' = '", "Bernese Mountain")), + op: Condition::Equal(rtok("'dog race' = '", "Bernese Mountain")), }, ), ( "\"dog race\" = 'Bernese Mountain'", - Fc::Operator { + Fc::Condition { fid: rtok("\"", "dog race"), - op: Operator::Equal(rtok("\"dog race\" = \"", "Bernese 
Mountain")), + op: Condition::Equal(rtok("\"dog race\" = \"", "Bernese Mountain")), }, ), // test all the operators ( "channel != ponce", - Fc::Operator { + Fc::Condition { fid: rtok("", "channel"), - op: Operator::NotEqual(rtok("channel != ", "ponce")), + op: Condition::NotEqual(rtok("channel != ", "ponce")), }, ), ( "NOT channel = ponce", - Fc::Operator { + Fc::Condition { fid: rtok("NOT ", "channel"), - op: Operator::NotEqual(rtok("NOT channel = ", "ponce")), + op: Condition::NotEqual(rtok("NOT channel = ", "ponce")), }, ), ( "subscribers < 1000", - Fc::Operator { + Fc::Condition { fid: rtok("", "subscribers"), - op: Operator::LowerThan(rtok("subscribers < ", "1000")), + op: Condition::LowerThan(rtok("subscribers < ", "1000")), }, ), ( "subscribers > 1000", - Fc::Operator { + Fc::Condition { fid: rtok("", "subscribers"), - op: Operator::GreaterThan(rtok("subscribers > ", "1000")), + op: Condition::GreaterThan(rtok("subscribers > ", "1000")), }, ), ( "subscribers <= 1000", - Fc::Operator { + Fc::Condition { fid: rtok("", "subscribers"), - op: Operator::LowerThanOrEqual(rtok("subscribers <= ", "1000")), + op: Condition::LowerThanOrEqual(rtok("subscribers <= ", "1000")), }, ), ( "subscribers >= 1000", - Fc::Operator { + Fc::Condition { fid: rtok("", "subscribers"), - op: Operator::GreaterThanOrEqual(rtok("subscribers >= ", "1000")), + op: Condition::GreaterThanOrEqual(rtok("subscribers >= ", "1000")), }, ), ( "NOT subscribers < 1000", - Fc::Operator { + Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Operator::GreaterThanOrEqual(rtok("NOT subscribers < ", "1000")), + op: Condition::GreaterThanOrEqual(rtok("NOT subscribers < ", "1000")), }, ), ( "NOT subscribers > 1000", - Fc::Operator { + Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Operator::LowerThanOrEqual(rtok("NOT subscribers > ", "1000")), + op: Condition::LowerThanOrEqual(rtok("NOT subscribers > ", "1000")), }, ), ( "NOT subscribers <= 1000", - Fc::Operator { + Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Operator::GreaterThan(rtok("NOT subscribers <= ", "1000")), + op: Condition::GreaterThan(rtok("NOT subscribers <= ", "1000")), }, ), ( "NOT subscribers >= 1000", - Fc::Operator { + Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Operator::LowerThan(rtok("NOT subscribers >= ", "1000")), + op: Condition::LowerThan(rtok("NOT subscribers >= ", "1000")), }, ), ( "subscribers 100 TO 1000", - Fc::Operator { + Fc::Condition { fid: rtok("", "subscribers"), - op: Operator::Between { + op: Condition::Between { from: rtok("subscribers ", "100"), to: rtok("subscribers 100 TO ", "1000"), }, @@ -457,14 +348,14 @@ mod tests { ( "NOT subscribers 100 TO 1000", Fc::Or( - Fc::Operator { + Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Operator::LowerThan(rtok("NOT subscribers ", "100")), + op: Condition::LowerThan(rtok("NOT subscribers ", "100")), } .into(), - Fc::Operator { + Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Operator::GreaterThan(rtok("NOT subscribers 100 TO ", "1000")), + op: Condition::GreaterThan(rtok("NOT subscribers 100 TO ", "1000")), } .into(), ), @@ -487,14 +378,14 @@ mod tests { ( "channel = ponce AND 'dog race' != 'bernese mountain'", Fc::And( - Fc::Operator { + Fc::Condition { fid: rtok("", "channel"), - op: Operator::Equal(rtok("channel = ", "ponce")), + op: Condition::Equal(rtok("channel = ", "ponce")), } .into(), - Fc::Operator { + Fc::Condition { fid: rtok("channel = ponce AND '", "dog race"), - op: Operator::NotEqual(rtok( + op: Condition::NotEqual(rtok( "channel = 
ponce AND 'dog race' != '", "bernese mountain", )), @@ -505,14 +396,14 @@ mod tests { ( "channel = ponce OR 'dog race' != 'bernese mountain'", Fc::Or( - Fc::Operator { + Fc::Condition { fid: rtok("", "channel"), - op: Operator::Equal(rtok("channel = ", "ponce")), + op: Condition::Equal(rtok("channel = ", "ponce")), } .into(), - Fc::Operator { + Fc::Condition { fid: rtok("channel = ponce OR '", "dog race"), - op: Operator::NotEqual(rtok( + op: Condition::NotEqual(rtok( "channel = ponce OR 'dog race' != '", "bernese mountain", )), @@ -524,14 +415,14 @@ mod tests { "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000", Fc::Or( Fc::And( - Fc::Operator { + Fc::Condition { fid: rtok("", "channel"), - op: Operator::Equal(rtok("channel = ", "ponce")), + op: Condition::Equal(rtok("channel = ", "ponce")), } .into(), - Fc::Operator { + Fc::Condition { fid: rtok("channel = ponce AND '", "dog race"), - op: Operator::NotEqual(rtok( + op: Condition::NotEqual(rtok( "channel = ponce AND 'dog race' != '", "bernese mountain", )), @@ -539,12 +430,12 @@ mod tests { .into(), ) .into(), - Fc::Operator { + Fc::Condition { fid: rtok( "channel = ponce AND 'dog race' != 'bernese mountain' OR ", "subscribers", ), - op: Operator::GreaterThan(rtok( + op: Condition::GreaterThan(rtok( "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > ", "1000", )), @@ -556,10 +447,10 @@ mod tests { ( "channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )", Fc::And( - Fc::Operator { fid: rtok("", "channel"), op: Operator::Equal(rtok("channel = ", "ponce")) }.into(), + Fc::Condition { fid: rtok("", "channel"), op: Condition::Equal(rtok("channel = ", "ponce")) }.into(), Fc::Or( - Fc::Operator { fid: rtok("channel = ponce AND ( '", "dog race"), op: Operator::NotEqual(rtok("channel = ponce AND ( 'dog race' != '", "bernese mountain"))}.into(), - Fc::Operator { fid: rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Operator::GreaterThan(rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(), + Fc::Condition { fid: rtok("channel = ponce AND ( '", "dog race"), op: Condition::NotEqual(rtok("channel = ponce AND ( 'dog race' != '", "bernese mountain"))}.into(), + Fc::Condition { fid: rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Condition::GreaterThan(rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(), ).into()), ), ( @@ -567,10 +458,10 @@ mod tests { Fc::And( Fc::Or( Fc::And( - Fc::Operator { fid: rtok("(", "channel"), op: Operator::Equal(rtok("(channel = ", "ponce")) }.into(), - Fc::Operator { fid: rtok("(channel = ponce AND '", "dog race"), op: Operator::NotEqual(rtok("(channel = ponce AND 'dog race' != '", "bernese mountain")) }.into(), + Fc::Condition { fid: rtok("(", "channel"), op: Condition::Equal(rtok("(channel = ", "ponce")) }.into(), + Fc::Condition { fid: rtok("(channel = ponce AND '", "dog race"), op: Condition::NotEqual(rtok("(channel = ponce AND 'dog race' != '", "bernese mountain")) }.into(), ).into(), - Fc::Operator { fid: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Operator::GreaterThan(rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(), + Fc::Condition { fid: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Condition::GreaterThan(rtok("(channel = ponce AND 'dog race' != 
'bernese mountain' OR subscribers > ", "1000")) }.into(), ).into(), Fc::GeoLowerThan { point: [rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(", "12"), rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, ", "13")], radius: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, ", "14") }.into() ) @@ -590,34 +481,4 @@ mod tests { assert_eq!(filter, expected, "Filter `{}` failed.", input); } } - - #[test] - fn name() { - use FilterCondition as Fc; - - // new_from_raw_offset is unsafe - let test_case = [ - // simple test - ( - "channel=Ponce", - Fc::Operator { - fid: rtok("", "channel"), - op: Operator::Equal(rtok("channel = ", "Ponce")), - }, - ), - ]; - - for (input, expected) in test_case { - let result = Fc::parse(input); - - assert!( - result.is_ok(), - "Filter `{:?}` was supposed to be parsed but failed with the following error: `{}`", - expected, - result.unwrap_err() - ); - let filter = result.unwrap().1; - assert_eq!(filter, expected, "Filter `{}` failed.", input); - } - } } diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs new file mode 100644 index 000000000..c36becf7e --- /dev/null +++ b/filter_parser/src/value.rs @@ -0,0 +1,71 @@ +use nom::branch::alt; +use nom::bytes::complete::{take_till, take_while1}; +use nom::character::complete::char; +use nom::sequence::delimited; +use nom::IResult; + +use crate::{ws, Span, Token}; + +/// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +pub fn parse_value(input: Span) -> IResult { + // singleQuoted = "'" .* all but quotes "'" + let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); + // doubleQuoted = "\"" (word | spaces)* "\"" + let quoted_key = |input| take_till(|c: char| c == '"')(input); + // word = (alphanumeric | _ | - | .)+ + let word = |input| take_while1(is_key_component)(input); + + alt(( + ws(delimited(char('\''), simple_quoted_key, char('\''))), + ws(delimited(char('"'), quoted_key, char('"'))), + ws(word), + ))(input) + .map(|(s, t)| (s, t.into())) +} + +fn is_key_component(c: char) -> bool { + c.is_alphanumeric() || ['_', '-', '.'].contains(&c) +} + +#[cfg(test)] +pub mod tests { + use super::*; + use crate::tests::rtok; + + #[test] + fn name() { + let test_case = [ + ("channel", rtok("", "channel")), + (".private", rtok("", ".private")), + ("I-love-kebab", rtok("", "I-love-kebab")), + ("but_snakes_are_also_good", rtok("", "but_snakes_are_also_good")), + ("parens(", rtok("", "parens")), + ("parens)", rtok("", "parens")), + ("not!", rtok("", "not")), + (" channel", rtok(" ", "channel")), + ("channel ", rtok("", "channel")), + ("'channel'", rtok("'", "channel")), + ("\"channel\"", rtok("\"", "channel")), + ("'cha)nnel'", rtok("'", "cha)nnel")), + ("'cha\"nnel'", rtok("'", "cha\"nnel")), + ("\"cha'nnel\"", rtok("\"", "cha'nnel")), + ("\" some spaces \"", rtok("\"", " some spaces ")), + ("\"cha'nnel\"", rtok("'", "cha'nnel")), + ("\"cha'nnel\"", rtok("'", "cha'nnel")), + ]; + + for (input, expected) in test_case { + let input = Span::new(input); + let result = parse_value(input); + + assert!( + result.is_ok(), + "Filter `{:?}` was supposed to be parsed but failed with the following error: `{}`", + expected, + result.unwrap_err() + ); + let value = result.unwrap().1; + assert_eq!(value, expected, "Filter `{}` failed.", input); + } + } +} diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..63fd0d984 100644 --- 
a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -40,6 +40,7 @@ uuid = { version = "0.8.2", features = ["v4"] } # facet filter parser nom = "7.0.0" +nom_locate = "4.0.0" # documents words self-join itertools = "0.10.0" diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 4fedeee69..5c57adb88 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -16,20 +16,20 @@ use crate::heed_codec::facet::{ }; use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; -#[derive(Debug, Clone, PartialEq)] -pub enum FilterCondition { - Operator(FieldId, Operator), +#[derive(Debug, Clone)] +pub enum FilterCondition<'a> { + Operator(FieldId, Operator<'a>), Or(Box, Box), And(Box, Box), Empty, } -impl FilterCondition { - pub fn from_array( +impl<'a> FilterCondition<'a> { + pub fn from_array( rtxn: &heed::RoTxn, index: &Index, array: I, - ) -> Result> + ) -> Result>> where I: IntoIterator>, J: IntoIterator, @@ -73,8 +73,8 @@ impl FilterCondition { pub fn from_str( rtxn: &heed::RoTxn, index: &Index, - expression: &str, - ) -> Result { + expression: &'a str, + ) -> Result> { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; let ctx = @@ -93,7 +93,7 @@ impl FilterCondition { } } } - pub fn negate(self) -> FilterCondition { + pub fn negate(self) -> FilterCondition<'a> { match self { Operator(fid, op) => match op.negate() { (op, None) => Operator(fid, op), @@ -106,7 +106,7 @@ impl FilterCondition { } } -impl FilterCondition { +impl<'a> FilterCondition<'a> { /// Aggregates the documents ids that are part of the specified range automatically /// going deeper through the levels. fn explore_facet_number_levels( @@ -221,7 +221,7 @@ impl FilterCondition { numbers_db: heed::Database, strings_db: heed::Database, field_id: FieldId, - operator: &Operator, + operator: &Operator<'a>, ) -> Result { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs index 47189841a..c25d523aa 100644 --- a/milli/src/search/facet/filter_parser.rs +++ b/milli/src/search/facet/filter_parser.rs @@ -28,25 +28,38 @@ use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, tuple}; use nom::IResult; +use nom_locate::LocatedSpan; use self::Operator::*; use super::FilterCondition; use crate::{FieldId, FieldsIdsMap}; -#[derive(Debug, Clone, PartialEq)] -pub enum Operator { - GreaterThan(f64), - GreaterThanOrEqual(f64), - Equal(Option, String), - NotEqual(Option, String), - LowerThan(f64), - LowerThanOrEqual(f64), - Between(f64, f64), - GeoLowerThan([f64; 2], f64), - GeoGreaterThan([f64; 2], f64), +pub enum FilterError { + AttributeNotFilterable(String), } -impl Operator { +#[derive(Debug, Clone, PartialEq, Eq)] +struct Token<'a> { + pub position: Span<'a>, + pub inner: &'a str, +} + +type Span<'a> = LocatedSpan<&'a str>; + +#[derive(Debug, Clone)] +pub enum Operator<'a> { + GreaterThan(Token<'a>), + GreaterThanOrEqual(Token<'a>), + Equal(Option>, Token<'a>), + NotEqual(Option>, Token<'a>), + LowerThan(Token<'a>), + LowerThanOrEqual(Token<'a>), + Between(Token<'a>, Token<'a>), + GeoLowerThan([Token<'a>; 2], Token<'a>), + GeoGreaterThan([Token<'a>; 2], Token<'a>), +} + +impl<'a> Operator<'a> { /// This method can return 
two operations in case it must express /// an OR operation for the between case (i.e. `TO`). pub fn negate(self) -> (Self, Option) { @@ -180,16 +193,13 @@ impl<'a> ParseContext<'a> { where E: FilterParserError<'a>, { - let error = match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - if !self.filterable_fields.contains(key) { - return error; - } match self.fields_ids_map.id(key) { - Some(fid) => Ok(fid), - None => error, + Some(fid) if self.filterable_fields.contains(key) => Ok(fid), + _ => Err(nom::Err::Failure(E::add_context( + input, + "Attribute is not filterable", + E::from_char(input, 'T'), + ))), } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 9b76ca851..8cd7f1a34 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -34,7 +34,8 @@ mod query_tree; pub struct Search<'a> { query: Option, - filter: Option, + // this should be linked to the String in the query + filter: Option>, offset: usize, limit: usize, sort_criteria: Option>, From c27870e76511c1f109b31ccb2ccafb5dc07dcfc3 Mon Sep 17 00:00:00 2001 From: Tamo Date: Fri, 22 Oct 2021 14:33:18 +0200 Subject: [PATCH 1093/1889] integrate a first version without any error handling --- filter_parser/src/condition.rs | 8 +- filter_parser/src/lib.rs | 60 +- filter_parser/src/value.rs | 7 +- milli/Cargo.toml | 2 +- milli/src/lib.rs | 2 +- milli/src/search/facet/filter_condition.rs | 547 +++++++++++-- milli/src/search/facet/filter_parser.rs | 891 --------------------- milli/src/search/facet/mod.rs | 3 +- milli/src/search/mod.rs | 6 +- 9 files changed, 507 insertions(+), 1019 deletions(-) delete mode 100644 milli/src/search/facet/filter_parser.rs diff --git a/filter_parser/src/condition.rs b/filter_parser/src/condition.rs index 5a1bb62be..75ee8c6f7 100644 --- a/filter_parser/src/condition.rs +++ b/filter_parser/src/condition.rs @@ -12,6 +12,7 @@ use nom::branch::alt; use nom::bytes::complete::tag; +use nom::error::ParseError; use nom::sequence::tuple; use nom::IResult; use Condition::*; @@ -46,14 +47,15 @@ impl<'a> Condition<'a> { } /// condition = value ("==" | ">" ...) 
value -pub fn parse_condition(input: Span) -> IResult { +pub fn parse_condition<'a, E: ParseError>>( + input: Span<'a>, +) -> IResult, FilterCondition, E> { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); let (input, (key, op, value)) = tuple((|c| parse_value(c), operator, |c| parse_value(c)))(input)?; let fid = key; - // TODO match *op.fragment() { "=" => { let k = FilterCondition::Condition { fid, op: Equal(value) }; @@ -78,7 +80,7 @@ pub fn parse_condition(input: Span) -> IResult { } /// to = value value TO value -pub fn parse_to(input: Span) -> IResult { +pub fn parse_to<'a, E: ParseError>>(input: Span<'a>) -> IResult { let (input, (key, from, _, to)) = tuple((ws(|c| parse_value(c)), ws(|c| parse_value(c)), tag("TO"), ws(|c| parse_value(c))))( input, diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 096a9e26e..bb826872f 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -27,12 +27,12 @@ use nom::combinator::map; use nom::error::{ContextError, ParseError}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; -use nom::sequence::{delimited, preceded}; +use nom::sequence::{delimited, preceded, tuple}; use nom::IResult; use nom_locate::LocatedSpan; pub(crate) use value::parse_value; -type Span<'a> = LocatedSpan<&'a str>; +pub type Span<'a> = LocatedSpan<&'a str>; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Token<'a> { @@ -82,21 +82,22 @@ impl<'a> FilterCondition<'a> { } } - pub fn parse(input: &'a str) -> IResult { + pub fn parse>>(input: &'a str) -> Result { let span = Span::new(input); - parse_expression(span) + // handle error + Ok(parse_expression::<'a, E>(span).map(|(_rem, output)| output).ok().unwrap()) } } // remove OPTIONAL whitespaces before AND after the the provided parser -fn ws<'a, O>( - inner: impl FnMut(Span<'a>) -> IResult, -) -> impl FnMut(Span<'a>) -> IResult { +fn ws<'a, O, E: ParseError>>( + inner: impl FnMut(Span<'a>) -> IResult, +) -> impl FnMut(Span<'a>) -> IResult { delimited(multispace0, inner, multispace0) } /// and = not (~ "AND" not)* -fn parse_or(input: Span) -> IResult { +fn parse_or<'a, E: ParseError>>(input: Span<'a>) -> IResult { let (input, lhs) = parse_and(input)?; let (input, ors) = many0(preceded(ws(tag("OR")), |c| parse_and(c)))(input)?; @@ -106,7 +107,7 @@ fn parse_or(input: Span) -> IResult { Ok((input, expr)) } -fn parse_and(input: Span) -> IResult { +fn parse_and<'a, E: ParseError>>(input: Span<'a>) -> IResult { let (input, lhs) = parse_not(input)?; let (input, ors) = many0(preceded(ws(tag("AND")), |c| parse_not(c)))(input)?; let expr = ors @@ -116,15 +117,17 @@ fn parse_and(input: Span) -> IResult { } /// not = ("NOT" | "!") not | primary -fn parse_not(input: Span) -> IResult { +fn parse_not<'a, E: ParseError>>(input: Span<'a>) -> IResult { alt((map(preceded(alt((tag("!"), tag("NOT"))), |c| parse_not(c)), |e| e.negate()), |c| { parse_primary(c) }))(input) } /// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) -fn parse_geo_radius(input: Span) -> IResult { - let err_msg_args_incomplete = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; +fn parse_geo_radius<'a, E: ParseError>>( + input: Span<'a>, +) -> IResult, FilterCondition, E> { + // let err_msg_args_incomplete = "_geoRadius. 
The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; /* TODO let err_msg_latitude_invalid = @@ -134,30 +137,25 @@ fn parse_geo_radius(input: Span) -> IResult { "_geoRadius. Longitude must be contained between -180 and 180 degrees."; */ + // we want to forbid space BEFORE the _geoRadius but not after let parsed = preceded::<_, _, _, _, _, _>( - // TODO: forbid spaces between _geoRadius and parenthesis - ws(tag("_geoRadius")), + tuple((multispace0, tag("_geoRadius"))), delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')')), )(input); - let (input, args): (Span, Vec) = match parsed { - Ok(e) => e, - Err(_e) => { - return Err(nom::Err::Failure(nom::error::Error::add_context( - input, - err_msg_args_incomplete, - nom::error::Error::from_char(input, '('), - ))); - } - }; + let (input, args): (Span, Vec) = parsed?; if args.len() != 3 { + // TODO + panic!("todo"); + /* let e = nom::error::Error::from_char(input, '('); return Err(nom::Err::Failure(nom::error::Error::add_context( input, err_msg_args_incomplete, e, ))); + */ } let res = FilterCondition::GeoLowerThan { @@ -168,7 +166,9 @@ fn parse_geo_radius(input: Span) -> IResult { } /// primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius -fn parse_primary(input: Span) -> IResult { +fn parse_primary<'a, E: ParseError>>( + input: Span<'a>, +) -> IResult { alt(( delimited(ws(char('(')), |c| parse_expression(c), ws(char(')'))), |c| parse_condition(c), @@ -178,12 +178,16 @@ fn parse_primary(input: Span) -> IResult { } /// expression = or -pub fn parse_expression(input: Span) -> IResult { +pub fn parse_expression<'a, E: ParseError>>( + input: Span<'a>, +) -> IResult { parse_or(input) } #[cfg(test)] pub mod tests { + use nom::error::Error; + use super::*; /// Create a raw [Token]. 
You must specify the string that appear BEFORE your element followed by your element @@ -469,7 +473,7 @@ pub mod tests { ]; for (input, expected) in test_case { - let result = Fc::parse(input); + let result = Fc::parse::>(input); assert!( result.is_ok(), @@ -477,7 +481,7 @@ pub mod tests { expected, result.unwrap_err() ); - let filter = result.unwrap().1; + let filter = result.unwrap(); assert_eq!(filter, expected, "Filter `{}` failed.", input); } } diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs index c36becf7e..1497aaddd 100644 --- a/filter_parser/src/value.rs +++ b/filter_parser/src/value.rs @@ -1,13 +1,14 @@ use nom::branch::alt; use nom::bytes::complete::{take_till, take_while1}; use nom::character::complete::char; +use nom::error::ParseError; use nom::sequence::delimited; use nom::IResult; use crate::{ws, Span, Token}; /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* -pub fn parse_value(input: Span) -> IResult { +pub fn parse_value<'a, E: ParseError>>(input: Span<'a>) -> IResult, Token, E> { // singleQuoted = "'" .* all but quotes "'" let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); // doubleQuoted = "\"" (word | spaces)* "\"" @@ -29,6 +30,8 @@ fn is_key_component(c: char) -> bool { #[cfg(test)] pub mod tests { + use nom::error::Error; + use super::*; use crate::tests::rtok; @@ -56,7 +59,7 @@ pub mod tests { for (input, expected) in test_case { let input = Span::new(input); - let result = parse_value(input); + let result = parse_value::>(input); assert!( result.is_ok(), diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 63fd0d984..3fc53492f 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -39,8 +39,8 @@ tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } # facet filter parser +filter_parser = { path = "../filter_parser" } nom = "7.0.0" -nom_locate = "4.0.0" # documents words self-join itertools = "0.10.0" diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6fe5947f5..27453bf36 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -34,7 +34,7 @@ pub use self::heed_codec::{ RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, }; pub use self::index::Index; -pub use self::search::{FacetDistribution, FilterCondition, MatchingWords, Search, SearchResult}; +pub use self::search::{FacetDistribution, Filter, MatchingWords, Search, SearchResult}; pub type Result = std::result::Result; diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 5c57adb88..50caf4eac 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -2,13 +2,12 @@ use std::fmt::Debug; use std::ops::Bound::{self, Excluded, Included}; use either::Either; +use filter_parser::{Condition, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; use log::debug; use nom::error::{convert_error, VerboseError}; use roaring::RoaringBitmap; -use self::FilterCondition::*; -use super::filter_parser::{Operator, ParseContext}; use super::FacetNumberRange; use crate::error::{Error, UserError}; use crate::heed_codec::facet::{ @@ -17,24 +16,19 @@ use crate::heed_codec::facet::{ use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; #[derive(Debug, Clone)] -pub enum FilterCondition<'a> { - Operator(FieldId, Operator<'a>), - Or(Box, Box), - And(Box, Box), - Empty, +pub struct Filter<'a> { + condition: FilterCondition<'a>, } -impl<'a> FilterCondition<'a> { - pub fn from_array( +impl<'a> Filter<'a> { + pub fn from_array( 
rtxn: &heed::RoTxn, index: &Index, array: I, ) -> Result>> where - I: IntoIterator>, - J: IntoIterator, - A: AsRef, - B: AsRef, + I: IntoIterator>, + J: IntoIterator, { let mut ands: Option = None; @@ -43,24 +37,32 @@ impl<'a> FilterCondition<'a> { Either::Left(array) => { let mut ors = None; for rule in array { - let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; + let condition = + FilterCondition::parse::>(rule.as_ref()).unwrap(); ors = match ors.take() { - Some(ors) => Some(Or(Box::new(ors), Box::new(condition))), + Some(ors) => { + Some(FilterCondition::Or(Box::new(ors), Box::new(condition))) + } None => Some(condition), }; } if let Some(rule) = ors { ands = match ands.take() { - Some(ands) => Some(And(Box::new(ands), Box::new(rule))), + Some(ands) => { + Some(FilterCondition::And(Box::new(ands), Box::new(rule))) + } None => Some(rule), }; } } Either::Right(rule) => { - let condition = FilterCondition::from_str(rtxn, index, rule.as_ref())?; + let condition = + FilterCondition::parse::>(rule.as_ref()).unwrap(); ands = match ands.take() { - Some(ands) => Some(And(Box::new(ands), Box::new(condition))), + Some(ands) => { + Some(FilterCondition::And(Box::new(ands), Box::new(condition))) + } None => Some(condition), }; } @@ -70,17 +72,14 @@ impl<'a> FilterCondition<'a> { Ok(ands) } - pub fn from_str( - rtxn: &heed::RoTxn, - index: &Index, - expression: &'a str, - ) -> Result> { + pub fn from_str(rtxn: &heed::RoTxn, index: &Index, expression: &'a str) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - let ctx = - ParseContext { fields_ids_map: &fields_ids_map, filterable_fields: &filterable_fields }; - match ctx.parse_expression::>(expression) { - Ok((_, fc)) => Ok(fc), + // TODO TAMO + let condition = FilterCondition::parse::>(expression).ok().unwrap(); + /* + let condition = match FilterCondition::parse::>(expression) { + Ok(fc) => Ok(fc), Err(e) => { let ve = match e { nom::Err::Error(x) => x, @@ -88,25 +87,16 @@ impl<'a> FilterCondition<'a> { _ => unreachable!(), }; Err(Error::UserError(UserError::InvalidFilter { - input: convert_error(expression, ve).to_string(), + input: convert_error(Span::new(expression), ve).to_string(), })) } - } - } - pub fn negate(self) -> FilterCondition<'a> { - match self { - Operator(fid, op) => match op.negate() { - (op, None) => Operator(fid, op), - (a, Some(b)) => Or(Box::new(Operator(fid, a)), Box::new(Operator(fid, b))), - }, - Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), - And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), - Empty => Empty, - } + }; + */ + Ok(Self { condition }) } } -impl<'a> FilterCondition<'a> { +impl<'a> Filter<'a> { /// Aggregates the documents ids that are part of the specified range automatically /// going deeper through the levels. fn explore_facet_number_levels( @@ -221,20 +211,33 @@ impl<'a> FilterCondition<'a> { numbers_db: heed::Database, strings_db: heed::Database, field_id: FieldId, - operator: &Operator<'a>, + operator: &Condition<'a>, ) -> Result { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. 
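A note on the `from_array` rewrite above: the folding rule is that each `Either::Left` group is OR-ed internally, while the resulting groups are AND-ed together at the top level. Below is a minimal standalone sketch of that rule, with an illustrative `Expr` enum standing in for `FilterCondition`; only the `either` crate is assumed, and none of these names are the crate's real API.

```rust
use either::Either;

// Illustrative stand-in for `FilterCondition`.
#[derive(Debug)]
enum Expr {
    Rule(String),
    Or(Box<Expr>, Box<Expr>),
    And(Box<Expr>, Box<Expr>),
}

fn combine(groups: Vec<Either<Vec<&str>, &str>>) -> Option<Expr> {
    let mut ands: Option<Expr> = None;
    for group in groups {
        let expr = match group {
            // an inner array is OR-ed together...
            Either::Left(rules) => rules
                .into_iter()
                .map(|r| Expr::Rule(r.to_string()))
                .reduce(|acc, e| Expr::Or(Box::new(acc), Box::new(e))),
            // ...while a lone rule is taken as-is
            Either::Right(rule) => Some(Expr::Rule(rule.to_string())),
        };
        // every top-level entry is AND-ed onto the accumulator
        if let Some(expr) = expr {
            ands = Some(match ands.take() {
                Some(acc) => Expr::And(Box::new(acc), Box::new(expr)),
                None => expr,
            });
        }
    }
    ands
}

fn main() {
    // equivalent to: channel = gotaga AND (timestamp = 44 OR channel != ponce)
    let filter = combine(vec![
        Either::Right("channel = gotaga"),
        Either::Left(vec!["timestamp = 44", "channel != ponce"]),
    ]);
    println!("{filter:?}");
}
```

As in the patch, an empty inner array simply leaves the accumulator untouched rather than producing an empty condition.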
+ // TODO TAMO: return good error when we can't parse a span let (left, right) = match operator { - Operator::GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), - Operator::GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), - Operator::Equal(number, string) => { + Condition::GreaterThan(val) => { + (Excluded(val.inner.parse::().unwrap()), Included(f64::MAX)) + } + Condition::GreaterThanOrEqual(val) => { + (Included(val.inner.parse::().unwrap()), Included(f64::MAX)) + } + Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.inner.parse().unwrap())), + Condition::LowerThanOrEqual(val) => { + (Included(f64::MIN), Included(val.inner.parse().unwrap())) + } + Condition::Between { from, to } => { + (Included(from.inner.parse::().unwrap()), Included(to.inner.parse().unwrap())) + } + Condition::Equal(val) => { let (_original_value, string_docids) = - strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); + strings_db.get(rtxn, &(field_id, val.inner))?.unwrap_or_default(); + let number = val.inner.parse::().ok(); let number_docids = match number { Some(n) => { - let n = Included(*n); + let n = Included(n); let mut output = RoaringBitmap::new(); Self::explore_facet_number_levels( rtxn, @@ -251,50 +254,49 @@ impl<'a> FilterCondition<'a> { }; return Ok(string_docids | number_docids); } - Operator::NotEqual(number, string) => { + Condition::NotEqual(val) => { + let number = val.inner.parse::().ok(); let all_numbers_ids = if number.is_some() { index.number_faceted_documents_ids(rtxn, field_id)? } else { RoaringBitmap::new() }; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; - let operator = Operator::Equal(*number, string.clone()); + let operator = Condition::Equal(val.clone()); let docids = Self::evaluate_operator( rtxn, index, numbers_db, strings_db, field_id, &operator, )?; return Ok((all_numbers_ids | all_strings_ids) - docids); - } - Operator::LowerThan(val) => (Included(f64::MIN), Excluded(*val)), - Operator::LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), - Operator::Between(left, right) => (Included(*left), Included(*right)), - Operator::GeoLowerThan(base_point, distance) => { - let rtree = match index.geo_rtree(rtxn)? { - Some(rtree) => rtree, - None => return Ok(RoaringBitmap::new()), - }; + } /* + Condition::GeoLowerThan(base_point, distance) => { + let rtree = match index.geo_rtree(rtxn)? 
{ + Some(rtree) => rtree, + None => return Ok(RoaringBitmap::new()), + }; - let result = rtree - .nearest_neighbor_iter(base_point) - .take_while(|point| { - distance_between_two_points(base_point, point.geom()) < *distance - }) - .map(|point| point.data) - .collect(); + let result = rtree + .nearest_neighbor_iter(base_point) + .take_while(|point| { + distance_between_two_points(base_point, point.geom()) < *distance + }) + .map(|point| point.data) + .collect(); - return Ok(result); - } - Operator::GeoGreaterThan(point, distance) => { - let result = Self::evaluate_operator( - rtxn, - index, - numbers_db, - strings_db, - field_id, - &Operator::GeoLowerThan(point.clone(), *distance), - )?; - let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; - return Ok(geo_faceted_doc_ids - result); - } + return Ok(result); + } + Condition::GeoGreaterThan(point, distance) => { + let result = Self::evaluate_operator( + rtxn, + index, + numbers_db, + strings_db, + field_id, + &Condition::GeoLowerThan(point.clone(), *distance), + )?; + let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; + return Ok(geo_faceted_doc_ids - result); + } + */ }; // Ask for the biggest value that can exist for this specific field, if it exists @@ -326,21 +328,390 @@ impl<'a> FilterCondition<'a> { let numbers_db = index.facet_id_f64_docids; let strings_db = index.facet_id_string_docids; - match self { - Operator(fid, op) => { - Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op) + match &self.condition { + FilterCondition::Condition { fid, op } => { + // TODO: parse fid + let _ = fid; + let fid = 42; + Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) } - Or(lhs, rhs) => { - let lhs = lhs.evaluate(rtxn, index)?; - let rhs = rhs.evaluate(rtxn, index)?; + FilterCondition::Or(lhs, rhs) => { + let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?; + let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; Ok(lhs | rhs) } - And(lhs, rhs) => { - let lhs = lhs.evaluate(rtxn, index)?; - let rhs = rhs.evaluate(rtxn, index)?; + FilterCondition::And(lhs, rhs) => { + let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?; + let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; Ok(lhs & rhs) } Empty => Ok(RoaringBitmap::new()), } } } + +impl<'a> From> for Filter<'a> { + fn from(fc: FilterCondition<'a>) -> Self { + Self { condition: fc } + } +} + +#[cfg(test)] +mod tests { + use big_s::S; + use either::Either; + use heed::EnvOpenOptions; + use maplit::hashset; + + use super::*; + use crate::update::Settings; + use crate::Index; + + #[test] + fn number() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut map = index.fields_ids_map(&wtxn).unwrap(); + map.insert("timestamp"); + index.put_fields_ids_map(&mut wtxn, &map).unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { "timestamp".into() }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. 
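Once each leaf condition has been resolved to a bitmap of document ids, the `evaluate` match above reduces the boolean tree to plain set algebra: `Or` is a union, `And` an intersection, and `NotEqual` a difference against every document carrying the field. A minimal sketch with the `roaring` crate (the ids here are made up):

```rust
use roaring::RoaringBitmap;

fn main() {
    let lhs: RoaringBitmap = (0u32..6).collect();
    let rhs: RoaringBitmap = (4u32..10).collect();

    // OR and AND nodes become union and intersection of the document ids
    assert_eq!((&lhs | &rhs).len(), 10);
    assert_eq!((&lhs & &rhs).len(), 2);

    // `field != x` is "every document that has the field" minus `field = x`
    let all_with_field: RoaringBitmap = (0u32..10).collect();
    let equal: RoaringBitmap = [2u32, 5].into_iter().collect();
    assert_eq!((&all_with_field - &equal).len(), 8);
}
```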
+ let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); + let expected = FilterCondition::Operator(0, Between(22.0, 44.0)); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); + let expected = FilterCondition::Or( + Box::new(FilterCondition::Operator(0, LowerThan(22.0))), + Box::new(FilterCondition::Operator(0, GreaterThan(44.0))), + ); + assert_eq!(condition, expected); + } + + #[test] + fn compare() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp"), S("id")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") ,S("id")}); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); + let expected = FilterCondition::Operator(0, LowerThan(20.0)); + assert_eq!(condition, expected); + + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "id < 200").unwrap(); + let expected = FilterCondition::Operator(2, LowerThan(200.0)); + assert_eq!(condition, expected); + } + + #[test] + fn parentheses() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. 
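The `NOT timestamp 22 TO 44` expectation above follows from the negation table implemented by `Operator::negate` in the parser module: every comparison flips, and a range is the one case that needs two operators, which are then OR-ed together. A self-contained sketch of that table, with an `Op` enum standing in for the real operator type:

```rust
#[derive(Debug, Clone, PartialEq)]
enum Op {
    GreaterThan(f64),
    GreaterThanOrEqual(f64),
    LowerThan(f64),
    LowerThanOrEqual(f64),
    Between(f64, f64),
}

// Negation can need a second operator: the two results are OR-ed together.
fn negate(op: Op) -> (Op, Option<Op>) {
    use Op::*;
    match op {
        GreaterThan(n) => (LowerThanOrEqual(n), None),
        GreaterThanOrEqual(n) => (LowerThan(n), None),
        LowerThan(n) => (GreaterThanOrEqual(n), None),
        LowerThanOrEqual(n) => (GreaterThan(n), None),
        Between(from, to) => (LowerThan(from), Some(GreaterThan(to))),
    }
}

fn main() {
    // NOT (22 TO 44)  ==>  < 22 OR > 44
    assert_eq!(
        negate(Op::Between(22.0, 44.0)),
        (Op::LowerThan(22.0), Some(Op::GreaterThan(44.0)))
    );
}
```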
+ let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = FilterCondition::Or( + Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(FilterCondition::And( + Box::new(FilterCondition::Operator(1, Between(22.0, 44.0))), + Box::new(FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = FilterCondition::Or( + Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(FilterCondition::Or( + Box::new(FilterCondition::Or( + Box::new(FilterCondition::Operator(1, LowerThan(22.0))), + Box::new(FilterCondition::Operator(1, GreaterThan(44.0))), + )), + Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + } + + #[test] + fn from_array() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Simple array with Left + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, _, _, &str>( + &rtxn, + &index, + vec![Either::Left(["channel = mv"])], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = mv").unwrap(); + assert_eq!(condition, expected); + + // Simple array with Right + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( + &rtxn, + &index, + vec![Either::Right("channel = mv")], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = mv").unwrap(); + assert_eq!(condition, expected); + + // Array with Left and escaped quote + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, _, _, &str>( + &rtxn, + &index, + vec![Either::Left(["channel = \"Mister Mv\""])], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = \"Mister Mv\"").unwrap(); + assert_eq!(condition, expected); + + // Array with Right and escaped quote + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( + &rtxn, + &index, + vec![Either::Right("channel = \"Mister Mv\"")], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = \"Mister Mv\"").unwrap(); + assert_eq!(condition, expected); + + // Array with Left and escaped simple quote + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, _, _, &str>( + &rtxn, + &index, + vec![Either::Left(["channel = 'Mister Mv'"])], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = 'Mister Mv'").unwrap(); + assert_eq!(condition, expected); + + // 
Array with Right and escaped simple quote + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( + &rtxn, + &index, + vec![Either::Right("channel = 'Mister Mv'")], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "channel = 'Mister Mv'").unwrap(); + assert_eq!(condition, expected); + + // Simple with parenthesis + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array::<_, _, _, &str>( + &rtxn, + &index, + vec![Either::Left(["(channel = mv)"])], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str(&rtxn, &index, "(channel = mv)").unwrap(); + assert_eq!(condition, expected); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array( + &rtxn, + &index, + vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga AND (timestamp = 44 OR channel != ponce)", + ) + .unwrap(); + assert_eq!(condition, expected); + } + + #[test] + fn geo_radius() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // basic test + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); + let expected = FilterCondition::Operator(0, GeoLowerThan([12., 13.0005], 2000.)); + assert_eq!(condition, expected); + + // test the negation of the GeoLowerThan + let condition = + FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); + let expected = FilterCondition::Operator(0, GeoGreaterThan([50., 18.], 2000.500)); + assert_eq!(condition, expected); + + // composition of multiple operations + let condition = FilterCondition::from_str( + &rtxn, + &index, + "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", + ) + .unwrap(); + let expected = FilterCondition::Or( + Box::new(FilterCondition::And( + Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), + Box::new(FilterCondition::Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), + )), + Box::new(FilterCondition::Operator(1, LowerThanOrEqual(10.))), + ); + assert_eq!(condition, expected); + } + + #[test] + fn geo_radius_error() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have enough parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have too many parameters + let result = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!( + error.to_string().contains("Latitude must be contained between -90 and 90 degrees."), + "{}", + error.to_string() + ); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); + } +} diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs deleted file mode 100644 index c25d523aa..000000000 --- a/milli/src/search/facet/filter_parser.rs +++ /dev/null @@ -1,891 +0,0 @@ -//! BNF grammar: -//! -//! ```text -//! expression = or -//! or = and (~ "OR" ~ and) -//! and = not (~ "AND" not)* -//! not = ("NOT" | "!") not | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius -//! to = value value TO value -//! condition = value ("==" | ">" ...) value -//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* -//! singleQuoted = "'" .* all but quotes "'" -//! doubleQuoted = "\"" (word | spaces)* "\"" -//! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) -//! 
``` - -use std::collections::HashSet; -use std::fmt::Debug; -use std::result::Result as StdResult; - -use nom::branch::alt; -use nom::bytes::complete::{tag, take_till, take_while1}; -use nom::character::complete::{char, multispace0}; -use nom::combinator::map; -use nom::error::{ContextError, ErrorKind, VerboseError}; -use nom::multi::{many0, separated_list1}; -use nom::number::complete::recognize_float; -use nom::sequence::{delimited, preceded, tuple}; -use nom::IResult; -use nom_locate::LocatedSpan; - -use self::Operator::*; -use super::FilterCondition; -use crate::{FieldId, FieldsIdsMap}; - -pub enum FilterError { - AttributeNotFilterable(String), -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct Token<'a> { - pub position: Span<'a>, - pub inner: &'a str, -} - -type Span<'a> = LocatedSpan<&'a str>; - -#[derive(Debug, Clone)] -pub enum Operator<'a> { - GreaterThan(Token<'a>), - GreaterThanOrEqual(Token<'a>), - Equal(Option>, Token<'a>), - NotEqual(Option>, Token<'a>), - LowerThan(Token<'a>), - LowerThanOrEqual(Token<'a>), - Between(Token<'a>, Token<'a>), - GeoLowerThan([Token<'a>; 2], Token<'a>), - GeoGreaterThan([Token<'a>; 2], Token<'a>), -} - -impl<'a> Operator<'a> { - /// This method can return two operations in case it must express - /// an OR operation for the between case (i.e. `TO`). - pub fn negate(self) -> (Self, Option) { - match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), - GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(n, s) => (NotEqual(n, s), None), - NotEqual(n, s) => (Equal(n, s), None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), - GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), - GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), - } - } -} - -pub trait FilterParserError<'a>: - nom::error::ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug -{ -} - -impl<'a> FilterParserError<'a> for VerboseError<&'a str> {} - -pub struct ParseContext<'a> { - pub fields_ids_map: &'a FieldsIdsMap, - pub filterable_fields: &'a HashSet, -} - -impl<'a> ParseContext<'a> { - /// and = not (~ "AND" not)* - fn parse_or(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, lhs) = self.parse_and(input)?; - let (input, ors) = - many0(preceded(self.ws(tag("OR")), |c| Self::parse_and(self, c)))(input)?; - - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_and(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, lhs) = self.parse_not(input)?; - let (input, ors) = many0(preceded(self.ws(tag("AND")), |c| self.parse_not(c)))(input)?; - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - /// not = ("NOT" | "!") not | primary - fn parse_not(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - alt(( - map(preceded(alt((tag("!"), tag("NOT"))), |c| self.parse_not(c)), |e| e.negate()), - |c| self.parse_primary(c), - ))(input) - } - - fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> - where - F: FnMut(&'a str) -> IResult<&'a str, O, E>, - E: FilterParserError<'a>, - { - delimited(multispace0, inner, multispace0) - } - - /// 
condition = value ("==" | ">" ...) value - fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); - let (input, (key, op, value)) = - tuple((|c| self.parse_value(c), operator, |c| self.parse_value(c)))(input)?; - - let fid = self.parse_fid(input, key)?; - let r: StdResult>> = self.parse_numeric(value); - match op { - "=" => { - let k = - FilterCondition::Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())); - Ok((input, k)) - } - "!=" => { - let k = FilterCondition::Operator( - fid, - NotEqual(r.ok(), value.to_string().to_lowercase()), - ); - Ok((input, k)) - } - ">" | "<" | "<=" | ">=" => { - let numeric: f64 = self.parse_numeric(value)?; - let k = match op { - ">" => FilterCondition::Operator(fid, GreaterThan(numeric)), - "<" => FilterCondition::Operator(fid, LowerThan(numeric)), - "<=" => FilterCondition::Operator(fid, LowerThanOrEqual(numeric)), - ">=" => FilterCondition::Operator(fid, GreaterThanOrEqual(numeric)), - _ => unreachable!(), - }; - Ok((input, k)) - } - _ => unreachable!(), - } - } - - fn parse_numeric(&'a self, input: &'a str) -> StdResult> - where - E: FilterParserError<'a>, - T: std::str::FromStr, - { - match input.parse::() { - Ok(n) => Ok(n), - Err(_) => match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }, - } - } - - fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> - where - E: FilterParserError<'a>, - { - match self.fields_ids_map.id(key) { - Some(fid) if self.filterable_fields.contains(key) => Ok(fid), - _ => Err(nom::Err::Failure(E::add_context( - input, - "Attribute is not filterable", - E::from_char(input, 'T'), - ))), - } - } - - /// to = value value TO value - fn parse_to(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, (key, from, _, to)) = tuple(( - self.ws(|c| self.parse_value(c)), - self.ws(|c| self.parse_value(c)), - tag("TO"), - self.ws(|c| self.parse_value(c)), - ))(input)?; - - let fid = self.parse_fid(input, key)?; - let numeric_from: f64 = self.parse_numeric(from)?; - let numeric_to: f64 = self.parse_numeric(to)?; - let res = FilterCondition::Operator(fid, Between(numeric_from, numeric_to)); - - Ok((input, res)) - } - - /// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) - fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let err_msg_args_incomplete = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; - let err_msg_latitude_invalid = - "_geoRadius. Latitude must be contained between -90 and 90 degrees."; - - let err_msg_longitude_invalid = - "_geoRadius. 
Longitude must be contained between -180 and 180 degrees."; - - let parsed = preceded::<_, _, _, E, _, _>( - // TODO: forbid spaces between _geoRadius and parenthesis - self.ws(tag("_geoRadius")), - delimited( - char('('), - separated_list1(tag(","), self.ws(|c| recognize_float(c))), - char(')'), - ), - )(input); - - let (input, args): (&str, Vec<&str>) = match parsed { - Ok(e) => e, - Err(_e) => { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_args_incomplete, - E::from_char(input, '('), - ))); - } - }; - - if args.len() != 3 { - let e = E::from_char(input, '('); - return Err(nom::Err::Failure(E::add_context(input, err_msg_args_incomplete, e))); - } - let lat = self.parse_numeric(args[0])?; - let lng = self.parse_numeric(args[1])?; - let dis = self.parse_numeric(args[2])?; - - let fid = match self.fields_ids_map.id("_geo") { - Some(fid) => fid, - // TODO send an error - None => return Ok((input, FilterCondition::Empty)), - }; - - if !(-90.0..=90.0).contains(&lat) { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_latitude_invalid, - E::from_char(input, '('), - ))); - } else if !(-180.0..=180.0).contains(&lng) { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_longitude_invalid, - E::from_char(input, '('), - ))); - } - - let res = FilterCondition::Operator(fid, GeoLowerThan([lat, lng], dis)); - Ok((input, res)) - } - - /// primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius - fn parse_primary(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - alt(( - delimited(self.ws(char('(')), |c| self.parse_expression(c), self.ws(char(')'))), - |c| self.parse_condition(c), - |c| self.parse_to(c), - |c| self.parse_geo_radius(c), - ))(input) - } - - /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* - fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: FilterParserError<'a>, - { - // singleQuoted = "'" .* all but quotes "'" - let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); - // doubleQuoted = "\"" (word | spaces)* "\"" - let quoted_key = |input| take_till(|c: char| c == '"')(input); - // word = (alphanumeric | _ | - | .)+ - let word = |input| take_while1(Self::is_key_component)(input); - - alt(( - self.ws(delimited(char('\''), simple_quoted_key, char('\''))), - self.ws(delimited(char('"'), quoted_key, char('"'))), - self.ws(word), - ))(input) - } - - fn is_key_component(c: char) -> bool { - c.is_alphanumeric() || ['_', '-', '.'].contains(&c) - } - - /// expression = or - pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - self.parse_or(input) - } -} - -#[cfg(test)] -mod tests { - use big_s::S; - use either::Either; - use heed::EnvOpenOptions; - use maplit::hashset; - - use super::*; - use crate::update::Settings; - use crate::Index; - - #[test] - fn string() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. 
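For reference, the `value` rule removed above boils down to three alternatives tried in order: single-quoted, double-quoted, then a bare word of key components. A stripped-down version over plain `&str` (nom 7, default error type, and without the surrounding `ws` whitespace handling):

```rust
use nom::branch::alt;
use nom::bytes::complete::{take_till, take_while1};
use nom::character::complete::char;
use nom::sequence::delimited;
use nom::IResult;

// word = (alphanumeric | _ | - | .)+
fn is_key_component(c: char) -> bool {
    c.is_alphanumeric() || ['_', '-', '.'].contains(&c)
}

/// value = singleQuoted | doubleQuoted | word
fn value(input: &str) -> IResult<&str, &str> {
    alt((
        delimited(char('\''), take_till(|c: char| c == '\''), char('\'')),
        delimited(char('"'), take_till(|c: char| c == '"'), char('"')),
        take_while1(is_key_component),
    ))(input)
}

fn main() {
    assert_eq!(value("'Mister Mv' = ponce"), Ok((" = ponce", "Mister Mv")));
    assert_eq!(value("dog.race = borzoi"), Ok((" = borzoi", "dog.race")));
}
```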
- let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("channel"); - map.insert("dog race"); - map.insert("subscribers"); - map.insert("_geo"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields( - hashset! { S("channel"), S("dog race"), S("subscribers"), S("_geo") }, - ); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - use FilterCondition as Fc; - let test_case = [ - // simple test - ( - Fc::from_str(&rtxn, &index, "channel = Ponce"), - Fc::Operator(0, Operator::Equal(None, S("ponce"))), - ), - ( - Fc::from_str(&rtxn, &index, "subscribers = 12"), - Fc::Operator(2, Operator::Equal(Some(12.), S("12"))), - ), - // test all the quotes and simple quotes - ( - Fc::from_str(&rtxn, &index, "channel = 'Mister Mv'"), - Fc::Operator(0, Operator::Equal(None, S("mister mv"))), - ), - ( - Fc::from_str(&rtxn, &index, "channel = \"Mister Mv\""), - Fc::Operator(0, Operator::Equal(None, S("mister mv"))), - ), - ( - Fc::from_str(&rtxn, &index, "'dog race' = Borzoi"), - Fc::Operator(1, Operator::Equal(None, S("borzoi"))), - ), - ( - Fc::from_str(&rtxn, &index, "\"dog race\" = Chusky"), - Fc::Operator(1, Operator::Equal(None, S("chusky"))), - ), - ( - Fc::from_str(&rtxn, &index, "\"dog race\" = \"Bernese Mountain\""), - Fc::Operator(1, Operator::Equal(None, S("bernese mountain"))), - ), - ( - Fc::from_str(&rtxn, &index, "'dog race' = 'Bernese Mountain'"), - Fc::Operator(1, Operator::Equal(None, S("bernese mountain"))), - ), - ( - Fc::from_str(&rtxn, &index, "\"dog race\" = 'Bernese Mountain'"), - Fc::Operator(1, Operator::Equal(None, S("bernese mountain"))), - ), - // test all the operators - ( - Fc::from_str(&rtxn, &index, "channel != ponce"), - Fc::Operator(0, Operator::NotEqual(None, S("ponce"))), - ), - ( - Fc::from_str(&rtxn, &index, "NOT channel = ponce"), - Fc::Operator(0, Operator::NotEqual(None, S("ponce"))), - ), - ( - Fc::from_str(&rtxn, &index, "subscribers < 1000"), - Fc::Operator(2, Operator::LowerThan(1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "subscribers > 1000"), - Fc::Operator(2, Operator::GreaterThan(1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "subscribers <= 1000"), - Fc::Operator(2, Operator::LowerThanOrEqual(1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "subscribers >= 1000"), - Fc::Operator(2, Operator::GreaterThanOrEqual(1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "NOT subscribers < 1000"), - Fc::Operator(2, Operator::GreaterThanOrEqual(1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "NOT subscribers > 1000"), - Fc::Operator(2, Operator::LowerThanOrEqual(1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "NOT subscribers <= 1000"), - Fc::Operator(2, Operator::GreaterThan(1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "NOT subscribers >= 1000"), - Fc::Operator(2, Operator::LowerThan(1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "subscribers 100 TO 1000"), - Fc::Operator(2, Operator::Between(100., 1000.)), - ), - ( - Fc::from_str(&rtxn, &index, "NOT subscribers 100 TO 1000"), - Fc::Or( - Box::new(Fc::Operator(2, Operator::LowerThan(100.))), - Box::new(Fc::Operator(2, Operator::GreaterThan(1000.))), - ), - ), - ( - Fc::from_str(&rtxn, &index, "_geoRadius(12, 13, 14)"), - Fc::Operator(3, Operator::GeoLowerThan([12., 13.], 14.)), - ), - ( - Fc::from_str(&rtxn, &index, "NOT _geoRadius(12, 13, 14)"), - Fc::Operator(3, Operator::GeoGreaterThan([12., 13.], 14.)), - ), - 
// test simple `or` and `and` - ( - Fc::from_str(&rtxn, &index, "channel = ponce AND 'dog race' != 'bernese mountain'"), - Fc::And( - Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), - Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), - ), - ), - ( - Fc::from_str(&rtxn, &index, "channel = ponce OR 'dog race' != 'bernese mountain'"), - Fc::Or( - Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), - Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), - ), - ), - ( - Fc::from_str( - &rtxn, - &index, - "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000", - ), - Fc::Or( - Box::new(Fc::And( - Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), - Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), - )), - Box::new(Fc::Operator(2, Operator::GreaterThan(1000.))), - ), - ), - // test parenthesis - ( - Fc::from_str( - &rtxn, - &index, - "channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )", - ), - Fc::And( - Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), - Box::new(Fc::Or( - Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), - Box::new(Fc::Operator(2, Operator::GreaterThan(1000.))), - ))), - ), - ( - Fc::from_str( - &rtxn, - &index, - "(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)", - ), - Fc::And( - Box::new(Fc::Or( - Box::new(Fc::And( - Box::new(Fc::Operator(0, Operator::Equal(None, S("ponce")))), - Box::new(Fc::Operator(1, Operator::NotEqual(None, S("bernese mountain")))), - )), - Box::new(Fc::Operator(2, Operator::GreaterThan(1000.))), - )), - Box::new(Fc::Operator(3, Operator::GeoLowerThan([12., 13.], 14.)))) - ), - ]; - - for (result, expected) in test_case { - assert!( - result.is_ok(), - "Filter {:?} was supposed to be parsed but failed with the following error: `{}`", - expected, - result.unwrap_err() - ); - let filter = result.unwrap(); - assert_eq!(filter, expected,); - } - } - - #[test] - fn number() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("timestamp"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { "timestamp".into() }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. 
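The precedence asserted by these test cases, where `a AND b OR c` groups as `(a AND b) OR c`, falls out of the grammar layering `expression = or`, `or = and ("OR" and)*`, `and = not ("AND" not)*`: each level folds its operands left to right before yielding to the looser operator. A minimal standalone illustration of that layering (plain `&str` input, single-word leaves, nom 7; `leaf` stands in for `condition | to | geoRadius`):

```rust
use nom::bytes::complete::{tag, take_while1};
use nom::character::complete::multispace0;
use nom::multi::many0;
use nom::sequence::{delimited, preceded};
use nom::IResult;

#[derive(Debug)]
enum Expr {
    Leaf(String),
    Or(Box<Expr>, Box<Expr>),
    And(Box<Expr>, Box<Expr>),
}

fn ws<'a, O>(
    inner: impl FnMut(&'a str) -> IResult<&'a str, O>,
) -> impl FnMut(&'a str) -> IResult<&'a str, O> {
    delimited(multispace0, inner, multispace0)
}

// leaf stands in for the real `condition | to | geoRadius` alternatives
fn leaf(input: &str) -> IResult<&str, Expr> {
    let (input, word) = ws(take_while1(|c: char| c.is_alphanumeric()))(input)?;
    Ok((input, Expr::Leaf(word.to_string())))
}

/// and = leaf ("AND" leaf)*
fn and(input: &str) -> IResult<&str, Expr> {
    let (input, lhs) = leaf(input)?;
    let (input, rest) = many0(preceded(ws(tag("AND")), leaf))(input)?;
    Ok((input, rest.into_iter().fold(lhs, |l, r| Expr::And(Box::new(l), Box::new(r)))))
}

/// or = and ("OR" and)*
fn or(input: &str) -> IResult<&str, Expr> {
    let (input, lhs) = and(input)?;
    let (input, rest) = many0(preceded(ws(tag("OR")), and))(input)?;
    Ok((input, rest.into_iter().fold(lhs, |l, r| Expr::Or(Box::new(l), Box::new(r)))))
}

fn main() {
    // OR binds loosest, so this prints Or(And(Leaf("a"), Leaf("b")), Leaf("c"))
    println!("{:?}", or("a AND b OR c").unwrap().1);
}
```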
- let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Operator(0, Between(22.0, 44.0)); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, LowerThan(22.0))), - Box::new(FilterCondition::Operator(0, GreaterThan(44.0))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn compare() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp"), S("id")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") ,S("id")}); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); - let expected = FilterCondition::Operator(0, LowerThan(20.0)); - assert_eq!(condition, expected); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "id < 200").unwrap(); - let expected = FilterCondition::Operator(2, LowerThan(200.0)); - assert_eq!(condition, expected); - } - - #[test] - fn parentheses() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. 
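At evaluation time, each numeric operator asserted in these tests becomes a pair of range bounds over the facet numbers database, mirroring the bound construction in `evaluate_operator` earlier in this series; a compact sketch of that mapping:

```rust
use std::ops::Bound::{self, Excluded, Included};

// Sketch of the operator-to-range mapping used at evaluation time.
fn to_range(op: &str, n: f64) -> Option<(Bound<f64>, Bound<f64>)> {
    Some(match op {
        ">" => (Excluded(n), Included(f64::MAX)),
        ">=" => (Included(n), Included(f64::MAX)),
        "<" => (Included(f64::MIN), Excluded(n)),
        "<=" => (Included(f64::MIN), Included(n)),
        _ => return None,
    })
}

fn main() {
    // `subscribers > 1000` scans the facet range (1000, f64::MAX];
    // `x TO y` would use (Included(x), Included(y)) on both ends.
    assert_eq!(to_range(">", 1000.0), Some((Excluded(1000.0), Included(f64::MAX))));
}
```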
- let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(1, Between(22.0, 44.0))), - Box::new(FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Operator(1, LowerThan(22.0))), - Box::new(FilterCondition::Operator(1, GreaterThan(44.0))), - )), - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - } - - #[test] - fn from_array() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Simple array with Left - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, _, _, &str>( - &rtxn, - &index, - vec![Either::Left(["channel = mv"])], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = mv").unwrap(); - assert_eq!(condition, expected); - - // Simple array with Right - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( - &rtxn, - &index, - vec![Either::Right("channel = mv")], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = mv").unwrap(); - assert_eq!(condition, expected); - - // Array with Left and escaped quote - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, _, _, &str>( - &rtxn, - &index, - vec![Either::Left(["channel = \"Mister Mv\""])], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = \"Mister Mv\"").unwrap(); - assert_eq!(condition, expected); - - // Array with Right and escaped quote - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( - &rtxn, - &index, - vec![Either::Right("channel = \"Mister Mv\"")], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = \"Mister Mv\"").unwrap(); - assert_eq!(condition, expected); - - // Array with Left and escaped simple quote - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, _, _, &str>( - &rtxn, - &index, - vec![Either::Left(["channel = 'Mister Mv'"])], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = 'Mister Mv'").unwrap(); - assert_eq!(condition, expected); - - // 
Array with Right and escaped simple quote - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( - &rtxn, - &index, - vec![Either::Right("channel = 'Mister Mv'")], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = 'Mister Mv'").unwrap(); - assert_eq!(condition, expected); - - // Simple with parenthesis - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, _, _, &str>( - &rtxn, - &index, - vec![Either::Left(["(channel = mv)"])], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "(channel = mv)").unwrap(); - assert_eq!(condition, expected); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array( - &rtxn, - &index, - vec![ - Either::Right("channel = gotaga"), - Either::Left(vec!["timestamp = 44", "channel != ponce"]), - ], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga AND (timestamp = 44 OR channel != ponce)", - ) - .unwrap(); - assert_eq!(condition, expected); - } - - #[test] - fn geo_radius() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - // basic test - let condition = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); - let expected = FilterCondition::Operator(0, GeoLowerThan([12., 13.0005], 2000.)); - assert_eq!(condition, expected); - - // test the negation of the GeoLowerThan - let condition = - FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); - let expected = FilterCondition::Operator(0, GeoGreaterThan([50., 18.], 2000.500)); - assert_eq!(condition, expected); - - // composition of multiple operations - let condition = FilterCondition::from_str( - &rtxn, - &index, - "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), - Box::new(FilterCondition::Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), - )), - Box::new(FilterCondition::Operator(1, LowerThanOrEqual(10.))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn geo_radius_error() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have enough parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius have too many parameters - let result = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!( - error.to_string().contains("Latitude must be contained between -90 and 90 degrees."), - "{}", - error.to_string() - ); - - // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - } -} diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 3efa0262f..d6f276fbb 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,10 +1,9 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; -pub use self::filter_condition::FilterCondition; +pub use self::filter_condition::Filter; mod facet_distribution; mod facet_number; mod facet_string; mod filter_condition; -mod filter_parser; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 8cd7f1a34..a31ead1ec 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -14,7 +14,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition}; +pub use 
self::facet::{FacetDistribution, FacetNumberIter, Filter}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; @@ -35,7 +35,7 @@ mod query_tree; pub struct Search<'a> { query: Option, // this should be linked to the String in the query - filter: Option>, + filter: Option>, offset: usize, limit: usize, sort_criteria: Option>, @@ -97,7 +97,7 @@ impl<'a> Search<'a> { self } - pub fn filter(&mut self, condition: FilterCondition) -> &mut Search<'a> { + pub fn filter(&mut self, condition: Filter<'a>) -> &mut Search<'a> { self.filter = Some(condition); self } From d6ba84ea99721a919cedc0e6a44ecbc992e4a983 Mon Sep 17 00:00:00 2001 From: Tamo Date: Fri, 22 Oct 2021 15:09:56 +0200 Subject: [PATCH 1094/1889] re introduce the special error type to be able to add context to the errors --- filter_parser/src/condition.rs | 7 ++-- filter_parser/src/lib.rs | 59 ++++++++++++---------------------- filter_parser/src/value.rs | 5 ++- 3 files changed, 25 insertions(+), 46 deletions(-) diff --git a/filter_parser/src/condition.rs b/filter_parser/src/condition.rs index 75ee8c6f7..b8d0e1efc 100644 --- a/filter_parser/src/condition.rs +++ b/filter_parser/src/condition.rs @@ -12,12 +12,11 @@ use nom::branch::alt; use nom::bytes::complete::tag; -use nom::error::ParseError; use nom::sequence::tuple; use nom::IResult; use Condition::*; -use crate::{parse_value, ws, FilterCondition, Span, Token}; +use crate::{parse_value, ws, FPError, FilterCondition, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { @@ -47,7 +46,7 @@ impl<'a> Condition<'a> { } /// condition = value ("==" | ">" ...) value -pub fn parse_condition<'a, E: ParseError>>( +pub fn parse_condition<'a, E: FPError<'a>>( input: Span<'a>, ) -> IResult, FilterCondition, E> { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); @@ -80,7 +79,7 @@ pub fn parse_condition<'a, E: ParseError>>( } /// to = value value TO value -pub fn parse_to<'a, E: ParseError>>(input: Span<'a>) -> IResult { +pub fn parse_to<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { let (input, (key, from, _, to)) = tuple((ws(|c| parse_value(c)), ws(|c| parse_value(c)), tag("TO"), ws(|c| parse_value(c))))( input, diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index bb826872f..007817655 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -24,16 +24,22 @@ use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{char, multispace0}; use nom::combinator::map; -use nom::error::{ContextError, ParseError}; +use nom::error::{ContextError, Error, VerboseError}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, tuple}; -use nom::IResult; +use nom::{Finish, IResult}; use nom_locate::LocatedSpan; pub(crate) use value::parse_value; pub type Span<'a> = LocatedSpan<&'a str>; +pub trait FilterParserError<'a>: nom::error::ParseError> + ContextError> {} +impl<'a> FilterParserError<'a> for VerboseError> {} +impl<'a> FilterParserError<'a> for Error> {} + +use FilterParserError as FPError; + #[derive(Debug, Clone, PartialEq, Eq)] pub struct Token<'a> { pub position: Span<'a>, @@ -82,22 +88,21 @@ impl<'a> FilterCondition<'a> { } } - pub fn parse>>(input: &'a str) -> Result { + pub fn parse>(input: &'a str) -> Result { let span = Span::new(input); - // handle error - Ok(parse_expression::<'a, E>(span).map(|(_rem, output)| output).ok().unwrap()) + 
parse_expression::<'a, E>(span).finish().map(|(_rem, output)| output) } } // remove OPTIONAL whitespaces before AND after the the provided parser -fn ws<'a, O, E: ParseError>>( +fn ws<'a, O, E: FPError<'a>>( inner: impl FnMut(Span<'a>) -> IResult, ) -> impl FnMut(Span<'a>) -> IResult { delimited(multispace0, inner, multispace0) } /// and = not (~ "AND" not)* -fn parse_or<'a, E: ParseError>>(input: Span<'a>) -> IResult { +fn parse_or<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { let (input, lhs) = parse_and(input)?; let (input, ors) = many0(preceded(ws(tag("OR")), |c| parse_and(c)))(input)?; @@ -107,7 +112,7 @@ fn parse_or<'a, E: ParseError>>(input: Span<'a>) -> IResult>>(input: Span<'a>) -> IResult { +fn parse_and<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { let (input, lhs) = parse_not(input)?; let (input, ors) = many0(preceded(ws(tag("AND")), |c| parse_not(c)))(input)?; let expr = ors @@ -117,25 +122,15 @@ fn parse_and<'a, E: ParseError>>(input: Span<'a>) -> IResult>>(input: Span<'a>) -> IResult { +fn parse_not<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { alt((map(preceded(alt((tag("!"), tag("NOT"))), |c| parse_not(c)), |e| e.negate()), |c| { parse_primary(c) }))(input) } /// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) -fn parse_geo_radius<'a, E: ParseError>>( - input: Span<'a>, -) -> IResult, FilterCondition, E> { - // let err_msg_args_incomplete = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; - /* - TODO - let err_msg_latitude_invalid = - "_geoRadius. Latitude must be contained between -90 and 90 degrees."; - - let err_msg_longitude_invalid = - "_geoRadius. Longitude must be contained between -180 and 180 degrees."; - */ +fn parse_geo_radius<'a, E: FPError<'a>>(input: Span<'a>) -> IResult, FilterCondition, E> { + let err_msg_args_incomplete = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; // we want to forbid space BEFORE the _geoRadius but not after let parsed = preceded::<_, _, _, _, _, _>( @@ -146,16 +141,8 @@ fn parse_geo_radius<'a, E: ParseError>>( let (input, args): (Span, Vec) = parsed?; if args.len() != 3 { - // TODO - panic!("todo"); - /* - let e = nom::error::Error::from_char(input, '('); - return Err(nom::Err::Failure(nom::error::Error::add_context( - input, - err_msg_args_incomplete, - e, - ))); - */ + let e = E::from_char(input, '('); + return Err(nom::Err::Failure(E::add_context(input, err_msg_args_incomplete, e))); } let res = FilterCondition::GeoLowerThan { @@ -166,9 +153,7 @@ fn parse_geo_radius<'a, E: ParseError>>( } /// primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius -fn parse_primary<'a, E: ParseError>>( - input: Span<'a>, -) -> IResult { +fn parse_primary<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { alt(( delimited(ws(char('(')), |c| parse_expression(c), ws(char(')'))), |c| parse_condition(c), @@ -178,16 +163,12 @@ fn parse_primary<'a, E: ParseError>>( } /// expression = or -pub fn parse_expression<'a, E: ParseError>>( - input: Span<'a>, -) -> IResult { +pub fn parse_expression<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { parse_or(input) } #[cfg(test)] pub mod tests { - use nom::error::Error; - use super::*; /// Create a raw [Token]. 
You must specify the string that appears BEFORE your element, followed by your element

diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs
index 1497aaddd..5b3a8dfd1 100644
--- a/filter_parser/src/value.rs
+++ b/filter_parser/src/value.rs
@@ -1,14 +1,13 @@
 use nom::branch::alt;
 use nom::bytes::complete::{take_till, take_while1};
 use nom::character::complete::char;
-use nom::error::ParseError;
 use nom::sequence::delimited;
 use nom::IResult;

-use crate::{ws, Span, Token};
+use crate::{ws, FPError, Span, Token};

 /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS*
-pub fn parse_value<'a, E: ParseError<Span<'a>>>(input: Span<'a>) -> IResult<Span<'a>, Token, E> {
+pub fn parse_value<'a, E: FPError<'a>>(input: Span<'a>) -> IResult<Span<'a>, Token, E> {
     // singleQuoted = "'" .* all but quotes "'"
     let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input);
     // doubleQuoted = "\"" (word | spaces)* "\""

From efb2f8b3254213bf919084ff6f42d922a3f7a68a Mon Sep 17 00:00:00 2001
From: Tamo
Date: Fri, 22 Oct 2021 16:38:35 +0200
Subject: [PATCH 1095/1889] convert the errors

---
 filter_parser/Cargo.toml                   |  1 +
 filter_parser/src/lib.rs                   |  4 +-
 milli/Cargo.toml                           |  1 +
 milli/src/search/facet/filter_condition.rs | 53 +++++++++------------
 4 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/filter_parser/Cargo.toml b/filter_parser/Cargo.toml
index 80767d5c4..2bdb3316a 100644
--- a/filter_parser/Cargo.toml
+++ b/filter_parser/Cargo.toml
@@ -8,3 +8,4 @@ edition = "2021"
 [dependencies]
 nom = "7.0.0"
 nom_locate = "4.0.0"
+nom-greedyerror = "0.4.0"

diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs
index 007817655..7153c5361 100644
--- a/filter_parser/src/lib.rs
+++ b/filter_parser/src/lib.rs
@@ -24,17 +24,19 @@
 use nom::branch::alt;
 use nom::bytes::complete::tag;
 use nom::character::complete::{char, multispace0};
 use nom::combinator::map;
-use nom::error::{ContextError, Error, VerboseError};
+use nom::error::{ContextError, Error, ErrorKind, VerboseError};
 use nom::multi::{many0, separated_list1};
 use nom::number::complete::recognize_float;
 use nom::sequence::{delimited, preceded, tuple};
 use nom::{Finish, IResult};
+use nom_greedyerror::GreedyError;
 use nom_locate::LocatedSpan;
 pub(crate) use value::parse_value;

 pub type Span<'a> = LocatedSpan<&'a str>;

 pub trait FilterParserError<'a>: nom::error::ParseError<Span<'a>> + ContextError<Span<'a>> {}
+impl<'a> FilterParserError<'a> for GreedyError<Span<'a>, ErrorKind> {}
 impl<'a> FilterParserError<'a> for VerboseError<Span<'a>> {}
 impl<'a> FilterParserError<'a> for Error<Span<'a>> {}

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 3fc53492f..1aaeed008 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -41,6 +41,7 @@ uuid = { version = "0.8.2", features = ["v4"] }
 # facet filter parser
 filter_parser = { path = "../filter_parser" }
 nom = "7.0.0"
+nom-greedyerror = "0.4.0"

 # documents words self-join
 itertools = "0.10.0"

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index 50caf4eac..fca35ff4d 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -2,10 +2,11 @@
 use std::fmt::Debug;
 use std::ops::Bound::{self, Excluded, Included};

 use either::Either;
-use filter_parser::{Condition, FilterCondition, Span, Token};
+use filter_parser::{Condition, FilterCondition, FilterParserError, Span, Token};
 use heed::types::DecodeIgnore;
 use log::debug;
-use nom::error::{convert_error, VerboseError};
+use nom::error::{ErrorKind, VerboseError};
+use nom_greedyerror::{convert_error,
GreedyError};
 use roaring::RoaringBitmap;

 use super::FacetNumberRange;
@@ -20,12 +21,14 @@
 pub struct Filter<'a> {
     condition: FilterCondition<'a>,
 }

+impl<'a> From<VerboseError<Span<'a>>> for Error {
+    fn from(nom_error: VerboseError<Span<'a>>) -> Self {
+        UserError::InvalidFilter { input: nom_error.to_string() }.into()
+    }
+}
+
 impl<'a> Filter<'a> {
-    pub fn from_array<I, J>(
-        rtxn: &heed::RoTxn,
-        index: &Index,
-        array: I,
-    ) -> Result<Option<FilterCondition>>
+    pub fn from_array<I, J>(array: I) -> Result<Option<Self>>
     where
         I: IntoIterator<Item = Either<J, &'a str>>,
         J: IntoIterator<Item = &'a str>,
@@ -37,8 +40,7 @@ impl<'a> Filter<'a> {
             Either::Left(array) => {
                 let mut ors = None;
                 for rule in array {
-                    let condition =
-                        FilterCondition::parse::<VerboseError<Span>>(rule.as_ref()).unwrap();
+                    let condition = Self::from_str(rule.as_ref())?.condition;
                     ors = match ors.take() {
                         Some(ors) => {
                             Some(FilterCondition::Or(Box::new(ors), Box::new(condition)))
@@ -57,8 +59,7 @@
                 }
             }
             Either::Right(rule) => {
-                let condition =
-                    FilterCondition::parse::<VerboseError<Span>>(rule.as_ref()).unwrap();
+                let condition = Self::from_str(rule.as_ref())?.condition;
                 ands = match ands.take() {
                     Some(ands) => {
                         Some(FilterCondition::And(Box::new(ands), Box::new(condition)))
@@ -69,29 +70,16 @@
             }
         }

-        Ok(ands)
+        Ok(ands.map(|ands| Self { condition: ands }))
     }

-    pub fn from_str(rtxn: &heed::RoTxn, index: &Index, expression: &'a str) -> Result<Self> {
-        let fields_ids_map = index.fields_ids_map(rtxn)?;
-        let filterable_fields = index.filterable_fields(rtxn)?;
-        // TODO TAMO
-        let condition = FilterCondition::parse::<VerboseError<Span>>(expression).ok().unwrap();
-        /*
-        let condition = match FilterCondition::parse::<VerboseError<Span>>(expression) {
+    pub fn from_str(expression: &'a str) -> Result<Self> {
+        let condition = match FilterCondition::parse::<GreedyError<Span, ErrorKind>>(expression) {
             Ok(fc) => Ok(fc),
-            Err(e) => {
-                let ve = match e {
-                    nom::Err::Error(x) => x,
-                    nom::Err::Failure(x) => x,
-                    _ => unreachable!(),
-                };
-                Err(Error::UserError(UserError::InvalidFilter {
-                    input: convert_error(Span::new(expression), ve).to_string(),
-                }))
-            }
-        };
-        */
+            Err(e) => Err(Error::UserError(UserError::InvalidFilter {
+                input: convert_error(Span::new(expression), e).to_string(),
+            })),
+        }?;
         Ok(Self { condition })
     }
 }
@@ -345,7 +333,8 @@ impl<'a> Filter<'a> {
                 let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?;
                 Ok(lhs & rhs)
             }
-            Empty => Ok(RoaringBitmap::new()),
+            FilterCondition::Empty => Ok(RoaringBitmap::new()),
+            _ => panic!("do the geosearch"),
         }
     }
 }

From 6c9165b6a8161544c76c1a3d63867a72e5983115 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Fri, 22 Oct 2021 16:52:13 +0200
Subject: [PATCH 1096/1889] provide a helper to parse the token but not handle the errors

---
 milli/src/search/facet/filter_condition.rs | 24 ++++++++++------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index fca35ff4d..2ba5a023e 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -1,5 +1,6 @@
 use std::fmt::Debug;
 use std::ops::Bound::{self, Excluded, Included};
+use std::str::FromStr;

 use either::Either;
 use filter_parser::{Condition, FilterCondition, FilterParserError, Span, Token};
@@ -27,6 +28,10 @@ impl<'a> From<VerboseError<Span<'a>>> for Error {
     }
 }

+fn parse<T: FromStr>(tok: &Token) -> Result<T> {
+    Ok(tok.inner.parse().ok().unwrap())
+}
+
 impl<'a> Filter<'a> {
     pub fn from_array<I, J>(array: I) -> Result<Option<Self>>
     where
@@ -206,19 +211,11 @@ impl<'a> Filter<'a> {
         // field id and the level.
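[Editor's aside] The hunk below funnels every comparison operator through the new fallible `parse` helper into a pair of `std::ops::Bound`s. A minimal, self-contained sketch of that mapping follows; the `Cond` enum and `to_range` function are illustrative stand-ins, not the crate's actual API:

use std::ops::Bound::{self, Excluded, Included};

// Illustrative stand-in for the parser's `Condition` variants.
enum Cond {
    GreaterThan(f64),
    LowerThan(f64),
    Between(f64, f64),
}

// Every comparison becomes a (left, right) pair of bounds over the full f64
// axis, so a single range scan over the facet database answers any of them.
fn to_range(cond: &Cond) -> (Bound<f64>, Bound<f64>) {
    match cond {
        Cond::GreaterThan(v) => (Excluded(*v), Included(f64::MAX)),
        Cond::LowerThan(v) => (Included(f64::MIN), Excluded(*v)),
        Cond::Between(from, to) => (Included(*from), Included(*to)),
    }
}

fn main() {
    // `x > 2` scans the half-open interval (2, f64::MAX].
    let (left, _right) = to_range(&Cond::GreaterThan(2.0));
    assert!(matches!(left, Excluded(v) if v == 2.0));
}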
// TODO TAMO: return good error when we can't parse a span
         let (left, right) = match operator {
-            Condition::GreaterThan(val) => {
-                (Excluded(val.inner.parse::<f64>().unwrap()), Included(f64::MAX))
-            }
-            Condition::GreaterThanOrEqual(val) => {
-                (Included(val.inner.parse::<f64>().unwrap()), Included(f64::MAX))
-            }
-            Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.inner.parse().unwrap())),
-            Condition::LowerThanOrEqual(val) => {
-                (Included(f64::MIN), Included(val.inner.parse().unwrap()))
-            }
-            Condition::Between { from, to } => {
-                (Included(from.inner.parse::<f64>().unwrap()), Included(to.inner.parse().unwrap()))
-            }
+            Condition::GreaterThan(val) => (Excluded(parse(val)?), Included(f64::MAX)),
+            Condition::GreaterThanOrEqual(val) => (Included(parse(val)?), Included(f64::MAX)),
+            Condition::LowerThan(val) => (Included(f64::MIN), Excluded(parse(val)?)),
+            Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(parse(val)?)),
+            Condition::Between { from, to } => (Included(parse(from)?), Included(parse(to)?)),
             Condition::Equal(val) => {
                 let (_original_value, string_docids) =
                     strings_db.get(rtxn, &(field_id, val.inner))?.unwrap_or_default();
                 let number = val.inner.parse::<f64>().ok();
                 let number_docids = match number {
                     Some(n) => {
@@ -334,6 +331,7 @@ impl<'a> Filter<'a> {
                 Ok(lhs & rhs)
             }
             FilterCondition::Empty => Ok(RoaringBitmap::new()),
+            // TODO: TAMO
             _ => panic!("do the geosearch"),
         }
     }

From e25ca9776fbf8e9bead564c8bd9803f7eafd2a7e Mon Sep 17 00:00:00 2001
From: Tamo
Date: Fri, 22 Oct 2021 17:23:22 +0200
Subject: [PATCH 1097/1889] start updating the exposed function to make other modules happy

---
 cli/src/main.rs                            |  2 +-
 http-ui/src/main.rs                        |  7 ++++---
 milli/src/facet/mod.rs                     |  2 ++
 milli/src/lib.rs                           |  4 +++-
 milli/src/search/facet/filter_condition.rs | 17 +++++++++++------
 milli/src/search/facet/mod.rs              |  2 ++
 milli/src/search/mod.rs                    |  2 +-
 7 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/cli/src/main.rs b/cli/src/main.rs
index b84ff3243..cae4d081f 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -262,7 +262,7 @@ impl Search {
         }

         if let Some(ref filter) = self.filter {
-            let condition = milli::FilterCondition::from_str(&txn, &index, filter)?;
+            let condition = milli::Filter::from_str(filter)?;
             search.filter(condition);
         }

diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index d27c6d5bb..e3f8f0317 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -24,7 +24,8 @@
 use milli::documents::DocumentBatchReader;
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder};
 use milli::{
-    obkv_to_json, CompressionType, FilterCondition, Index, MatchingWords, SearchResult, SortError,
+    obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatchingWords,
+    SearchResult, SortError,
 };
 use once_cell::sync::OnceCell;
 use rayon::ThreadPool;
@@ -739,7 +740,7 @@ async fn main() -> anyhow::Result<()> {
         let filters = match query.filters {
             Some(condition) if !condition.trim().is_empty() => {
-                Some(FilterCondition::from_str(&rtxn, &index, &condition).unwrap())
+                Some(MilliFilter::from_str(&condition).unwrap())
             }
             _otherwise => None,
         };
@@ -747,7 +748,7 @@ async fn main() -> anyhow::Result<()> {
         let facet_filters = match query.facet_filters {
             Some(array) => {
                 let eithers = array.into_iter().map(Into::into);
-                FilterCondition::from_array(&rtxn, &index, eithers).unwrap()
+                MilliFilter::from_array(eithers).unwrap()
             }
             _otherwise => None,
         };

diff --git a/milli/src/facet/mod.rs b/milli/src/facet/mod.rs
index 274d2588d..aaa7a65ce 100644
--- a/milli/src/facet/mod.rs
+++ b/milli/src/facet/mod.rs
@@ -2,5 +2,7 @@
 mod facet_type;
 mod facet_value;
 pub mod value_encoding;

+pub use filter_parser::{Condition, FilterCondition, FilterParserError, Span, Token};
+
 pub use self::facet_type::FacetType;
 pub use self::facet_value::FacetValue;

diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 27453bf36..e2ecb060c 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -34,7 +34,9 @@ pub use self::heed_codec::{
     RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec,
 };
 pub use self::index::Index;
-pub use self::search::{FacetDistribution, Filter, MatchingWords, Search, SearchResult};
+pub use self::search::{
+    Condition, FacetDistribution, Filter, FilterCondition, MatchingWords, Search, SearchResult,
+};

 pub type Result<T> = std::result::Result<T, error::Error>;

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index 2ba5a023e..29be3edf4 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -3,7 +3,7 @@
 use std::ops::Bound::{self, Excluded, Included};
 use std::str::FromStr;

 use either::Either;
-use filter_parser::{Condition, FilterCondition, FilterParserError, Span, Token};
+pub use filter_parser::{Condition, FilterCondition, FilterParserError, Span, Token};
 use heed::types::DecodeIgnore;
 use log::debug;
 use nom::error::{ErrorKind, VerboseError};
@@ -209,7 +209,7 @@ impl<'a> Filter<'a> {
         // Make sure we always bound the ranges with the field id and the level,
         // as the facets values are all in the same database and prefixed by the
         // field id and the level.
-        // TODO TAMO: return good error when we can't parse a span
+
         let (left, right) = match operator {
             Condition::GreaterThan(val) => (Excluded(parse(val)?), Included(f64::MAX)),
             Condition::GreaterThanOrEqual(val) => (Included(parse(val)?), Included(f64::MAX)),
@@ -315,10 +315,15 @@ impl<'a> Filter<'a> {
         match &self.condition {
             FilterCondition::Condition { fid, op } => {
-                // TODO: parse fid
-                let _ = fid;
-                let fid = 42;
-                Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op)
+                let filterable_fields = index.fields_ids_map(rtxn)?;
+                if let Some(fid) = filterable_fields.id(fid.inner) {
+                    Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op)
+                } else {
+                    // TODO TAMO: update the error message
+                    return Err(UserError::InvalidFilter {
+                        input: format!("Bad filter, available filters are {:?}", filterable_fields),
+                    })?;
+                }
             }
             FilterCondition::Or(lhs, rhs) => {
                 let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?;

diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs
index d6f276fbb..c0b692de7 100644
--- a/milli/src/search/facet/mod.rs
+++ b/milli/src/search/facet/mod.rs
@@ -1,3 +1,5 @@
+pub use filter_parser::{Condition, FilterCondition};
+
 pub use self::facet_distribution::FacetDistribution;
 pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange};
 pub use self::facet_string::FacetStringIter;

diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index a31ead1ec..f52dd06f0 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -14,7 +14,7 @@
 use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;

-pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
+pub use self::facet::{Condition, FacetDistribution, FacetNumberIter, Filter, FilterCondition};
 pub use self::matching_words::MatchingWords;
 use self::query_tree::QueryTreeBuilder;
 use crate::error::UserError;
From 4e113bbf1b61acd480c858087fdfeb6f7be2143c Mon Sep 17 00:00:00 2001
From: Tamo
Date: Fri, 22 Oct 2021 17:49:08 +0200
Subject: [PATCH 1098/1889] handle the case of empty input

---
 filter_parser/src/lib.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs
index 7153c5361..4623f9387 100644
--- a/filter_parser/src/lib.rs
+++ b/filter_parser/src/lib.rs
@@ -91,6 +91,9 @@ impl<'a> FilterCondition<'a> {
     }

     pub fn parse<E: FPError<'a>>(input: &'a str) -> Result<Self, E> {
+        if input.trim().is_empty() {
+            return Ok(Self::Empty);
+        }
         let span = Span::new(input);
         parse_expression::<'a, E>(span).finish().map(|(_rem, output)| output)
     }

From 7cd9109e2fbea5e032c40cc33201cf9e3a15c130 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Fri, 22 Oct 2021 17:50:15 +0200
Subject: [PATCH 1099/1889] lowercase value extracted from Token

---
 milli/src/search/facet/filter_condition.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index 29be3edf4..01132dce0 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -217,8 +217,9 @@ impl<'a> Filter<'a> {
             Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(parse(val)?)),
             Condition::Between { from, to } => (Included(parse(from)?), Included(parse(to)?)),
             Condition::Equal(val) => {
-                let (_original_value, string_docids) =
-                    strings_db.get(rtxn, &(field_id, val.inner))?.unwrap_or_default();
+                let (_original_value, string_docids) = strings_db
+                    .get(rtxn, &(field_id, &val.inner.to_lowercase()))?
+                    .unwrap_or_default();
                 let number = val.inner.parse::<f64>().ok();
@@ -316,7 +317,7 @@ impl<'a> Filter<'a> {
         match &self.condition {
             FilterCondition::Condition { fid, op } => {
                 let filterable_fields = index.fields_ids_map(rtxn)?;
-                if let Some(fid) = filterable_fields.id(fid.inner) {
+                if let Some(fid) = filterable_fields.id(&fid.inner.to_lowercase()) {
                     Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op)
                 } else {
                     // TODO TAMO: update the error message

From 3942b3732f21ff97409355d85b6dfb80dbff85d4 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Fri, 22 Oct 2021 18:03:39 +0200
Subject: [PATCH 1100/1889] re-implement the geosearch

---
 milli/src/search/facet/filter_condition.rs | 61 ++++++++++------------
 1 file changed, 29 insertions(+), 32 deletions(-)

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index 01132dce0..fb7ce2ec7 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -253,36 +253,7 @@ impl<'a> Filter<'a> {
                     rtxn, index, numbers_db, strings_db, field_id, &operator,
                 )?;
                 return Ok((all_numbers_ids | all_strings_ids) - docids);
-            } /*
-            Condition::GeoLowerThan(base_point, distance) => {
-                let rtree = match index.geo_rtree(rtxn)?
{ - Some(rtree) => rtree, - None => return Ok(RoaringBitmap::new()), - }; - - let result = rtree - .nearest_neighbor_iter(base_point) - .take_while(|point| { - distance_between_two_points(base_point, point.geom()) < *distance - }) - .map(|point| point.data) - .collect(); - - return Ok(result); - } - Condition::GeoGreaterThan(point, distance) => { - let result = Self::evaluate_operator( - rtxn, - index, - numbers_db, - strings_db, - field_id, - &Condition::GeoLowerThan(point.clone(), *distance), - )?; - let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; - return Ok(geo_faceted_doc_ids - result); - } - */ + } }; // Ask for the biggest value that can exist for this specific field, if it exists @@ -337,8 +308,34 @@ impl<'a> Filter<'a> { Ok(lhs & rhs) } FilterCondition::Empty => Ok(RoaringBitmap::new()), - // TODO: TAMO - _ => panic!("do the geosearch"), + FilterCondition::GeoLowerThan { point, radius } => { + let base_point = [parse(&point[0])?, parse(&point[1])?]; + let radius = parse(&radius)?; + let rtree = match index.geo_rtree(rtxn)? { + Some(rtree) => rtree, + None => return Ok(RoaringBitmap::new()), + }; + + let result = rtree + .nearest_neighbor_iter(&base_point) + .take_while(|point| { + distance_between_two_points(&base_point, point.geom()) < radius + }) + .map(|point| point.data) + .collect(); + + return Ok(result); + } + FilterCondition::GeoGreaterThan { point, radius } => { + let result = Self::evaluate( + &FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() } + .into(), + rtxn, + index, + )?; + let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; + return Ok(geo_faceted_doc_ids - result); + } } } } From c8d03046bfec58b02a50ac310550c3312b8804c0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Fri, 22 Oct 2021 18:08:18 +0200 Subject: [PATCH 1101/1889] add a check on the fid in the geosearch --- milli/src/search/facet/filter_condition.rs | 40 ++++++++++++++-------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index fb7ce2ec7..6f9c4849e 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -309,22 +309,32 @@ impl<'a> Filter<'a> { } FilterCondition::Empty => Ok(RoaringBitmap::new()), FilterCondition::GeoLowerThan { point, radius } => { - let base_point = [parse(&point[0])?, parse(&point[1])?]; - let radius = parse(&radius)?; - let rtree = match index.geo_rtree(rtxn)? { - Some(rtree) => rtree, - None => return Ok(RoaringBitmap::new()), - }; + let filterable_fields = index.fields_ids_map(rtxn)?; + if filterable_fields.id("_geo").is_some() { + let base_point = [parse(&point[0])?, parse(&point[1])?]; + // TODO TAMO: ensure lat is between -90 and 90 + // TODO TAMO: ensure lng is between -180 and 180 + let radius = parse(&radius)?; + let rtree = match index.geo_rtree(rtxn)? 
{ + Some(rtree) => rtree, + None => return Ok(RoaringBitmap::new()), + }; - let result = rtree - .nearest_neighbor_iter(&base_point) - .take_while(|point| { - distance_between_two_points(&base_point, point.geom()) < radius - }) - .map(|point| point.data) - .collect(); + let result = rtree + .nearest_neighbor_iter(&base_point) + .take_while(|point| { + distance_between_two_points(&base_point, point.geom()) < radius + }) + .map(|point| point.data) + .collect(); - return Ok(result); + Ok(result) + } else { + // TODO TAMO: update the error message + return Err(UserError::InvalidFilter { + input: format!("You tried to use _geo in a filter, you probably wanted to use _geoRadius"), + })?; + } } FilterCondition::GeoGreaterThan { point, radius } => { let result = Self::evaluate( @@ -334,7 +344,7 @@ impl<'a> Filter<'a> { index, )?; let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; - return Ok(geo_faceted_doc_ids - result); + Ok(geo_faceted_doc_ids - result) } } } From 1327807caad1df55a28b68f1dfd9d8699d9c6426 Mon Sep 17 00:00:00 2001 From: Tamo Date: Fri, 22 Oct 2021 19:00:33 +0200 Subject: [PATCH 1102/1889] add some error messages --- milli/src/search/facet/filter_condition.rs | 39 +++++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 6f9c4849e..42b3fc52d 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -29,7 +29,18 @@ impl<'a> From>> for Error { } fn parse(tok: &Token) -> Result { - Ok(tok.inner.parse().ok().unwrap()) + match tok.inner.parse::() { + Ok(t) => Ok(t), + Err(_e) => Err(UserError::InvalidFilter { + input: format!( + "Could not parse `{}` at line {} and offset {}", + tok.inner, + tok.position.location_line(), + tok.position.get_column() + ), + } + .into()), + } } impl<'a> Filter<'a> { @@ -291,10 +302,28 @@ impl<'a> Filter<'a> { if let Some(fid) = filterable_fields.id(&fid.inner.to_lowercase()) { Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) } else { - // TODO TAMO: update the error message - return Err(UserError::InvalidFilter { - input: format!("Bad filter, available filters are {:?}", filterable_fields), - })?; + match fid.inner { + // TODO update the error messages according to the spec + "_geo" => { + return Err(UserError::InvalidFilter { input: format!("Tried to use _geo in a filter, you probably wanted to use _geoRadius(latitude, longitude, radius)") })?; + } + "_geoDistance" => { + return Err(UserError::InvalidFilter { + input: format!("Reserved field _geoDistance"), + })?; + } + fid if fid.starts_with("_geoPoint(") => { + return Err(UserError::InvalidFilter { input: format!("_geoPoint only available in sort. 
You wanted to use _geoRadius") })?; + } + fid => { + return Err(UserError::InvalidFilter { + input: format!( + "Bad filter {}, available filters are {:?}", + fid, filterable_fields + ), + })?; + } + } } } FilterCondition::Or(lhs, rhs) => { From 8d70b01714acdca6e27dccbf08e6c42829973ccd Mon Sep 17 00:00:00 2001 From: marin postma Date: Wed, 20 Oct 2021 21:26:52 +0200 Subject: [PATCH 1103/1889] optimize document deserialization --- milli/src/documents/builder.rs | 142 ++++++- milli/src/documents/mod.rs | 20 +- milli/src/documents/serde.rs | 532 +++++------------------- milli/src/search/distinct/mod.rs | 3 +- milli/src/update/index_documents/mod.rs | 3 +- milli/tests/search/mod.rs | 7 +- milli/tests/search/query_criteria.rs | 3 +- 7 files changed, 241 insertions(+), 469 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index ba1319eff..98213edd7 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,13 @@ +use std::collections::BTreeMap; use std::io; use byteorder::{BigEndian, WriteBytesExt}; -use serde::ser::Serialize; +use serde::Deserializer; +use serde_json::Value; -use super::serde::DocumentSerializer; +use crate::FieldId; + +use super::serde::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary @@ -24,7 +28,12 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// builder.finish().unwrap(); /// ``` pub struct DocumentBatchBuilder { - serializer: DocumentSerializer, + inner: ByteCounter, + index: DocumentsBatchIndex, + obkv_buffer: Vec, + value_buffer: Vec, + values: BTreeMap, + count: usize, } impl DocumentBatchBuilder { @@ -34,27 +43,33 @@ impl DocumentBatchBuilder { // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; - let serializer = - DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true }; + let this = Self { + inner: writer, + index, + obkv_buffer: Vec::new(), + value_buffer: Vec::new(), + values: BTreeMap::new(), + count: 0, + }; - Ok(Self { serializer }) + Ok(this) } /// Returns the number of documents that have been written to the builder. pub fn len(&self) -> usize { - self.serializer.count + self.count } /// This method must be called after the document addition is terminated. It will put the /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. pub fn finish(self) -> Result<(), Error> { - let DocumentSerializer { - writer: ByteCounter { mut writer, count: offset }, + let Self { + inner: ByteCounter { mut writer, count: offset }, index, count, .. - } = self.serializer; + } = self; let meta = DocumentsMetadata { count, index }; @@ -68,13 +83,106 @@ impl DocumentBatchBuilder { Ok(()) } - /// Adds documents to the builder. - /// - /// The internal index is updated with the fields found - /// in the documents. Document must either be a map or a sequences of map, anything else will - /// fail. - pub fn add_documents(&mut self, document: T) -> Result<(), Error> { - document.serialize(&mut self.serializer)?; + + /// Extends the builder with json documents from a reader. 
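+    ///
+    /// (Editor's note: a rough usage sketch mirroring the tests below.)
+    ///
+    /// ```ignore
+    /// let mut writer = std::io::Cursor::new(Vec::new());
+    /// let mut builder = DocumentBatchBuilder::new(&mut writer)?;
+    /// builder.extend_from_json(r#"{"id": 1, "field": "hello!"}"#.as_bytes())?;
+    /// builder.finish()?;
+    /// ```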
+ pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { + let mut de = serde_json::Deserializer::from_reader(reader); + + let mut visitor = DocumentVisitor { + inner: &mut self.inner, + index: &mut self.index, + obkv_buffer: &mut self.obkv_buffer, + value_buffer: &mut self.value_buffer, + values: &mut self.values, + count: &mut self.count, + }; + + de.deserialize_any(&mut visitor).unwrap(); + Ok(()) } } + +#[cfg(test)] +mod test { + use std::io::Cursor; + + use crate::documents::DocumentBatchReader; + + use super::*; + + #[test] + fn add_single_documents_json() { + let mut cursor = Cursor::new(Vec::new()); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let json = serde_json::json!({ + "id": 1, + "field": "hello!", + }); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + let json = serde_json::json!({ + "blabla": false, + "field": "hello!", + "id": 1, + }); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + assert_eq!(builder.len(), 2); + + builder.finish().unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 2); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 3); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } + + #[test] + fn add_documents_seq_json() { + let mut cursor = Cursor::new(Vec::new()); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let json = serde_json::json!([{ + "id": 1, + "field": "hello!", + },{ + "blabla": false, + "field": "hello!", + "id": 1, + } + ]); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + assert_eq!(builder.len(), 2); + + builder.finish().unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 2); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 3); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } +} diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index f79c210fe..ce0539c24 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -92,7 +92,8 @@ macro_rules! 
documents { let documents = serde_json::json!($data); let mut writer = std::io::Cursor::new(Vec::new()); let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - builder.add_documents(documents).unwrap(); + let documents = serde_json::to_vec(&documents).unwrap(); + builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); builder.finish().unwrap(); writer.set_position(0); @@ -124,7 +125,8 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(json).unwrap(); + todo!(); + //builder.add_documents(json).unwrap(); builder.finish().unwrap(); @@ -153,8 +155,9 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(doc1).unwrap(); - builder.add_documents(doc2).unwrap(); + todo!(); + //builder.add_documents(doc1).unwrap(); + //builder.add_documents(doc2).unwrap(); builder.finish().unwrap(); @@ -182,7 +185,8 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(docs).unwrap(); + todo!(); + //builder.add_documents(docs).unwrap(); builder.finish().unwrap(); @@ -210,11 +214,13 @@ mod test { { "tata": "hello" }, ]]); - assert!(builder.add_documents(docs).is_err()); + todo!(); + //assert!(builder.add_documents(docs).is_err()); let docs = json!("hello"); - assert!(builder.add_documents(docs).is_err()); + todo!(); + //assert!(builder.add_documents(docs).is_err()); } #[test] diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 036ec246a..0d02fff6c 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -1,474 +1,128 @@ use std::collections::BTreeMap; -use std::convert::TryInto; use std::io::Cursor; -use std::{fmt, io}; +use std::io::Write; +use std::fmt; -use byteorder::{BigEndian, WriteBytesExt}; -use obkv::KvWriter; -use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer}; +use byteorder::WriteBytesExt; +use serde::Deserialize; +use serde::de::DeserializeSeed; +use serde::de::MapAccess; +use serde::de::SeqAccess; +use serde::de::Visitor; use serde_json::Value; -use super::{ByteCounter, DocumentsBatchIndex, Error}; +use super::{ByteCounter, DocumentsBatchIndex}; use crate::FieldId; -pub struct DocumentSerializer { - pub writer: ByteCounter, - pub buffer: Vec, - pub index: DocumentsBatchIndex, - pub count: usize, - pub allow_seq: bool, -} +struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); -impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer { - type Ok = (); +impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { + type Value = FieldId; - type Error = Error; - - type SerializeSeq = SeqSerializer<'a, W>; - type SerializeTuple = Impossible<(), Self::Error>; - type SerializeTupleStruct = Impossible<(), Self::Error>; - type SerializeTupleVariant = Impossible<(), Self::Error>; - type SerializeMap = MapSerializer<'a, &'a mut ByteCounter>; - type SerializeStruct = Impossible<(), Self::Error>; - type SerializeStructVariant = Impossible<(), Self::Error>; - fn serialize_map(self, _len: Option) -> Result { - self.buffer.clear(); - let cursor = io::Cursor::new(&mut self.buffer); - self.count += 1; - let map_serializer = MapSerializer { - map: KvWriter::new(cursor), - index: &mut self.index, - writer: &mut self.writer, - mapped_documents: BTreeMap::new(), - }; - - Ok(map_serializer) - } - - fn serialize_seq(self, _len: Option) -> Result { - if self.allow_seq { - // Only allow sequence of documents of depth 1. 
- self.allow_seq = false; - Ok(SeqSerializer { serializer: self }) - } else { - Err(Error::InvalidDocumentFormat) - } - } - - fn serialize_bool(self, _v: bool) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i8(self, _v: i8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i16(self, _v: i16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i32(self, _v: i32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i64(self, _v: i64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u8(self, _v: u8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u16(self, _v: u16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u32(self, _v: u32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u64(self, _v: u64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f32(self, _v: f32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f64(self, _v: f64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_char(self, _v: char) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_str(self, _v: &str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_none(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_some(self, _value: &T) -> Result + fn deserialize(self, deserializer: D) -> Result where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) + D: serde::Deserializer<'de> { + deserializer.deserialize_str(self) } } -pub struct SeqSerializer<'a, W> { - serializer: &'a mut DocumentSerializer, +impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { + type Value = FieldId; + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, { + let field_id = match self.0.get_by_right(v) { + Some(field_id) => *field_id, + None => { + let field_id = self.0.len() as FieldId; + self.0.insert(field_id, v.to_string()); + field_id + } + }; + + 
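+        // (Editor's note) at this point `field_id` is either the id already
+        // interned for this field name or a freshly allocated dense one, so the
+        // same key used across documents always resolves to the same id.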
Ok(field_id) + } + + fn expecting(&self, _formatter: &mut fmt::Formatter) -> fmt::Result { + todo!() + } } -impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> { - type Ok = (); - type Error = Error; +struct ValueDeserializer; - fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> +impl<'de> DeserializeSeed<'de> for ValueDeserializer { + type Value = serde_json::Value; + + fn deserialize(self, deserializer: D) -> Result where - T: Serialize, + D: serde::Deserializer<'de> { + serde_json::Value::deserialize(deserializer) + } +} + +pub struct DocumentVisitor<'a, W> { + pub inner: &'a mut ByteCounter, + pub index: &'a mut DocumentsBatchIndex, + pub obkv_buffer: &'a mut Vec, + pub value_buffer: &'a mut Vec, + pub values: &'a mut BTreeMap, + pub count: &'a mut usize, +} + +impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { + /// This Visitor value is nothing, since it write the value to a file. + type Value = (); + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, { - value.serialize(&mut *self.serializer)?; + while let Some(_) = seq.next_element_seed(&mut *self)? { } + Ok(()) } - fn end(self) -> Result { - Ok(()) - } -} - -pub struct MapSerializer<'a, W> { - map: KvWriter>, FieldId>, - index: &'a mut DocumentsBatchIndex, - writer: W, - mapped_documents: BTreeMap, -} - -/// This implementation of SerializeMap uses serilialize_entry instead of seriliaze_key and -/// serialize_value, therefore these to methods remain unimplemented. -impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { - type Ok = (); - type Error = Error; - - fn serialize_key(&mut self, _key: &T) -> Result<(), Self::Error> { - unreachable!() - } - - fn serialize_value(&mut self, _value: &T) -> Result<(), Self::Error> { - unreachable!() - } - - fn end(mut self) -> Result { - let mut buf = Vec::new(); - for (key, value) in self.mapped_documents { - buf.clear(); - let mut cursor = Cursor::new(&mut buf); - serde_json::to_writer(&mut cursor, &value).map_err(Error::JsonError)?; - self.map.insert(key, cursor.into_inner()).map_err(Error::Io)?; + fn visit_map(self, mut map: A) -> Result + where + A: MapAccess<'de>, + { + while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer).unwrap() { + self.values.insert(key, value); } - let data = self.map.into_inner().map_err(Error::Io)?.into_inner(); - let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?; + self.obkv_buffer.clear(); + let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); + for (key, value) in self.values.iter() { + self.value_buffer.clear(); + // This is guaranteed to work + serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value).unwrap(); + obkv.insert(*key, &self.value_buffer).unwrap(); + } - self.writer.write_u32::(data_len).map_err(Error::Io)?; - self.writer.write_all(&data).map_err(Error::Io)?; + let reader = obkv.into_inner().unwrap().into_inner(); + + self.inner.write_u32::(reader.len() as u32).unwrap(); + self.inner.write_all(reader).unwrap(); + + *self.count += 1; Ok(()) } - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where - K: Serialize, - V: Serialize, - { - let field_serializer = FieldSerializer { index: &mut self.index }; - let field_id: FieldId = key.serialize(field_serializer)?; - - let value = serde_json::to_value(value).map_err(Error::JsonError)?; - - self.mapped_documents.insert(field_id, value); - - Ok(()) + fn expecting(&self, f: &mut 
fmt::Formatter) -> fmt::Result { + write!(f, "a documents, or a sequence of documents.") } } -struct FieldSerializer<'a> { - index: &'a mut DocumentsBatchIndex, -} +impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> +where W: Write, +{ + type Value = (); -impl<'a> serde::Serializer for FieldSerializer<'a> { - type Ok = FieldId; - - type Error = Error; - - type SerializeSeq = Impossible; - type SerializeTuple = Impossible; - type SerializeTupleStruct = Impossible; - type SerializeTupleVariant = Impossible; - type SerializeMap = Impossible; - type SerializeStruct = Impossible; - type SerializeStructVariant = Impossible; - - fn serialize_str(self, ws: &str) -> Result { - let field_id = match self.index.get_by_right(ws) { - Some(field_id) => *field_id, - None => { - let field_id = self.index.len() as FieldId; - self.index.insert(field_id, ws.to_string()); - field_id - } - }; - - Ok(field_id) - } - - fn serialize_bool(self, _v: bool) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i8(self, _v: i8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i16(self, _v: i16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i32(self, _v: i32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i64(self, _v: i64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u8(self, _v: u8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u16(self, _v: u16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u32(self, _v: u32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u64(self, _v: u64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f32(self, _v: f32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f64(self, _v: f64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_char(self, _v: char) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_none(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_some(self, _value: &T) -> Result + fn deserialize(self, deserializer: D) -> Result where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn 
serialize_map(self, _len: Option) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } -} - -impl serde::ser::Error for Error { - fn custom(msg: T) -> Self { - Error::Custom(msg.to_string()) + D: serde::Deserializer<'de> { + deserializer.deserialize_map(self) } } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index deb51a053..5639c53fa 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -68,7 +68,8 @@ mod test { "txts": sample_txts[..(rng.gen_range(0..3))], "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - builder.add_documents(doc).unwrap(); + todo!() + //builder.add_documents(doc).unwrap(); } builder.finish().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 92bcab0e9..58e21a615 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -877,7 +877,8 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(big_object).unwrap(); + todo!(); + //builder.add_documents(big_object).unwrap(); builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index cda0da617..d62b8ec31 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,9 +61,10 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - documents_builder.add_documents(doc.unwrap()).unwrap(); - } + todo!(); + //for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + //documents_builder.add_documents(doc.unwrap()).unwrap(); + //} documents_builder.finish().unwrap(); cursor.set_position(0); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f3b04c4fa..3fb36b1d5 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -409,7 +409,8 @@ fn criteria_ascdesc() { "age": age, }); - batch_builder.add_documents(json).unwrap(); + todo!(); + //batch_builder.add_documents(json).unwrap(); }); batch_builder.finish().unwrap(); From 0f86d6b28f333b2f75439ef3483b0f8f2f59c824 Mon Sep 17 00:00:00 2001 From: marin postma Date: Thu, 21 Oct 2021 11:05:16 +0200 Subject: [PATCH 1104/1889] implement csv serialization --- milli/Cargo.toml | 1 + milli/src/documents/builder.rs | 103 +++++++++++++++++- milli/src/documents/mod.rs | 33 +++++- milli/src/documents/serde.rs | 14 +-- milli/src/update/index_documents/transform.rs | 8 +- 5 files changed, 142 insertions(+), 17 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..709f8d865 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -47,6 +47,7 @@ itertools = "0.10.0" # logging log = "0.4.14" logging_timer = "1.0.0" +csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" diff --git a/milli/src/documents/builder.rs 
b/milli/src/documents/builder.rs index 98213edd7..719580b4a 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,5 +1,8 @@ use std::collections::BTreeMap; +use std::collections::HashMap; use std::io; +use std::io::Cursor; +use std::io::Write; use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; @@ -38,7 +41,7 @@ pub struct DocumentBatchBuilder { impl DocumentBatchBuilder { pub fn new(writer: W) -> Result { - let index = DocumentsBatchIndex::new(); + let index = DocumentsBatchIndex::default(); let mut writer = ByteCounter::new(writer); // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; @@ -101,6 +104,79 @@ impl DocumentBatchBuilder { Ok(()) } + + /// Extends the builder with json documents from a reader. + /// + /// This method can be only called once and is mutually exclusive with extend from json. This + /// is because the fields in a csv are always guaranteed to come in order, and permits some + /// optimizations. + /// + /// From csv takes care to call finish in the end. + pub fn from_csv(mut self, reader: R) -> Result<(), Error> { + + // Ensure that this is the first and only addition made with this builder + debug_assert!(self.index.is_empty()); + + let mut records = csv::Reader::from_reader(reader); + + let headers = records + .headers() + .unwrap() + .into_iter() + .map(parse_csv_header) + .map(|(k, t)| (self.index.insert(&k), t)) + .collect::>(); + + let records = records.into_records(); + + dbg!(&headers); + for record in records { + match record { + Ok(record) => { + let mut writer = obkv::KvWriter::new(Cursor::new(&mut self.obkv_buffer)); + for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { + let value = match ty { + AllowedType::Number => value.parse::().map(Value::from).unwrap(), + AllowedType::String => Value::String(value.to_string()), + }; + + serde_json::to_writer(Cursor::new(&mut self.value_buffer), dbg!(&value)).unwrap(); + writer.insert(*fid, &self.value_buffer)?; + self.value_buffer.clear(); + } + + self.inner.write_u32::(self.obkv_buffer.len() as u32)?; + self.inner.write_all(&self.obkv_buffer)?; + + self.obkv_buffer.clear(); + self.count += 1; + }, + Err(_) => panic!(), + } + } + + self.finish()?; + + Ok(()) + } +} + +#[derive(Debug)] +enum AllowedType { + String, + Number, +} + +fn parse_csv_header(header: &str) -> (String, AllowedType) { + // if there are several separators we only split on the last one. + match header.rsplit_once(':') { + Some((field_name, field_type)) => match field_type { + "string" => (field_name.to_string(), AllowedType::String), + "number" => (field_name.to_string(), AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. 
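+        // (Editor's note) e.g. `id:number` yields a number field named `id`,
+        // while a header without a recognized type suffix, like `title` or
+        // `id:blabla`, is kept whole as a string field.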
+ _otherwise => (header.to_string(), AllowedType::String), + }, + None => (header.to_string(), AllowedType::String), + } } #[cfg(test)] @@ -185,4 +261,29 @@ mod test { assert!(reader.next_document_with_index().unwrap().is_none()); } + + #[test] + fn add_documents_csv() { + let mut cursor = Cursor::new(Vec::new()); + let builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let csv = "id:number,field:string\n1,hello!\n2,blabla"; + + builder.from_csv(Cursor::new(csv.as_bytes())).unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + dbg!(reader.len()); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 2); + assert_eq!(document.iter().count(), 2); + + let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(document.iter().count(), 2); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index ce0539c24..9f6ebd3de 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -17,7 +17,38 @@ pub use reader::DocumentBatchReader; use crate::FieldId; /// A bidirectional map that links field ids to their name in a document batch. -pub type DocumentsBatchIndex = BiHashMap; +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct DocumentsBatchIndex(pub BiHashMap); + +impl DocumentsBatchIndex { + /// Insert the field in the map, or return it's field id if it doesn't already exists. + pub fn insert(&mut self, field: &str) -> FieldId { + match self.0.get_by_right(field) { + Some(field_id) => *field_id, + None => { + let field_id = self.0.len() as FieldId; + self.0.insert(field_id, field.to_string()); + field_id + } + } + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn iter(&self) -> impl Iterator { + self.0.iter() + } + + pub fn get_id(&self, id: FieldId) -> Option<&String> { + self.0.get_by_left(&id) + } +} #[derive(Debug, Serialize, Deserialize)] struct DocumentsMetadata { diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 0d02fff6c..86fb68534 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -31,17 +31,9 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { fn visit_str(self, v: &str) -> Result where - E: serde::de::Error, { - let field_id = match self.0.get_by_right(v) { - Some(field_id) => *field_id, - None => { - let field_id = self.0.len() as FieldId; - self.0.insert(field_id, v.to_string()); - field_id - } - }; - - Ok(field_id) + E: serde::de::Error, + { + Ok(self.0.insert(v)) } fn expecting(&self, _formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c0c88abed..5af1eda72 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -75,7 +75,7 @@ fn create_fields_mapping( .collect() } -fn find_primary_key(index: &bimap::BiHashMap) -> Option<&str> { +fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { index .iter() .sorted_by_key(|(k, _)| *k) @@ -179,7 +179,7 @@ impl Transform<'_, '_> { if !self.autogenerate_docids { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.get_by_left(&key).cloned(); + let key = addition_index.get_id(key).cloned(); let value = 
serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { @@ -544,7 +544,7 @@ mod test { mod primary_key_inference { use bimap::BiHashMap; - use crate::update::index_documents::transform::find_primary_key; + use crate::{documents::DocumentsBatchIndex, update::index_documents::transform::find_primary_key}; #[test] fn primary_key_infered_on_first_field() { @@ -557,7 +557,7 @@ mod test { map.insert(4, "fakeId".to_string()); map.insert(0, "realId".to_string()); - assert_eq!(find_primary_key(&map), Some("realId")); + assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); } } } From 2e62925a6e5f6ee864410322758b32c39a8f1a38 Mon Sep 17 00:00:00 2001 From: marin postma Date: Sun, 24 Oct 2021 14:41:36 +0200 Subject: [PATCH 1105/1889] fix tests --- milli/src/documents/builder.rs | 51 +++++++++---------------- milli/src/documents/mod.rs | 28 ++++++++------ milli/src/documents/serde.rs | 1 + milli/src/index.rs | 1 + milli/src/search/distinct/mod.rs | 5 ++- milli/src/update/index_documents/mod.rs | 4 +- milli/tests/search/mod.rs | 10 +++-- milli/tests/search/query_criteria.rs | 4 +- 8 files changed, 49 insertions(+), 55 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 719580b4a..8c70910b5 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,5 +1,4 @@ use std::collections::BTreeMap; -use std::collections::HashMap; use std::io; use std::io::Cursor; use std::io::Write; @@ -18,18 +17,6 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to /// iterate other the documents. -/// -/// ## example: -/// ``` -/// use milli::documents::DocumentBatchBuilder; -/// use serde_json::json; -/// use std::io::Cursor; -/// -/// let mut writer = Cursor::new(Vec::new()); -/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); -/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap(); -/// builder.finish().unwrap(); -/// ``` pub struct DocumentBatchBuilder { inner: ByteCounter, index: DocumentsBatchIndex, @@ -100,7 +87,7 @@ impl DocumentBatchBuilder { count: &mut self.count, }; - de.deserialize_any(&mut visitor).unwrap(); + de.deserialize_any(&mut visitor).map_err(Error::JsonError)?; Ok(()) } @@ -112,10 +99,11 @@ impl DocumentBatchBuilder { /// optimizations. /// /// From csv takes care to call finish in the end. 
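+    /// (Editor's note: after this change `from_csv` acts as a constructor; a
+    /// rough sketch mirroring the test below:
+    /// `DocumentBatchBuilder::from_csv(csv_reader, &mut writer)?.finish()?;`)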
- pub fn from_csv(mut self, reader: R) -> Result<(), Error> { + pub fn from_csv(reader: R, writer: W) -> Result { + let mut this = Self::new(writer)?; // Ensure that this is the first and only addition made with this builder - debug_assert!(self.index.is_empty()); + debug_assert!(this.index.is_empty()); let mut records = csv::Reader::from_reader(reader); @@ -124,40 +112,37 @@ impl DocumentBatchBuilder { .unwrap() .into_iter() .map(parse_csv_header) - .map(|(k, t)| (self.index.insert(&k), t)) - .collect::>(); + .map(|(k, t)| (this.index.insert(&k), t)) + .collect::>(); let records = records.into_records(); - dbg!(&headers); for record in records { match record { Ok(record) => { - let mut writer = obkv::KvWriter::new(Cursor::new(&mut self.obkv_buffer)); + let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { AllowedType::Number => value.parse::().map(Value::from).unwrap(), AllowedType::String => Value::String(value.to_string()), }; - serde_json::to_writer(Cursor::new(&mut self.value_buffer), dbg!(&value)).unwrap(); - writer.insert(*fid, &self.value_buffer)?; - self.value_buffer.clear(); + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value).unwrap(); + writer.insert(*fid, &this.value_buffer)?; + this.value_buffer.clear(); } - self.inner.write_u32::(self.obkv_buffer.len() as u32)?; - self.inner.write_all(&self.obkv_buffer)?; + this.inner.write_u32::(this.obkv_buffer.len() as u32)?; + this.inner.write_all(&this.obkv_buffer)?; - self.obkv_buffer.clear(); - self.count += 1; + this.obkv_buffer.clear(); + this.count += 1; }, Err(_) => panic!(), } } - self.finish()?; - - Ok(()) + Ok(this) } } @@ -265,18 +250,16 @@ mod test { #[test] fn add_documents_csv() { let mut cursor = Cursor::new(Vec::new()); - let builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let csv = "id:number,field:string\n1,hello!\n2,blabla"; - builder.from_csv(Cursor::new(csv.as_bytes())).unwrap(); + let builder = DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); + builder.finish().unwrap(); cursor.set_position(0); let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - dbg!(reader.len()); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); assert_eq!(index.len(), 2); assert_eq!(document.iter().count(), 2); diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 9f6ebd3de..8a8b87794 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -135,6 +135,8 @@ macro_rules! 
documents { #[cfg(test)] mod test { + use std::io::Cursor; + use serde_json::{json, Value}; use super::*; @@ -151,13 +153,14 @@ mod test { "bool": true }); + let json = serde_json::to_vec(&json).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(json).unwrap(); + builder.extend_from_json(Cursor::new(json)).unwrap(); builder.finish().unwrap(); @@ -181,14 +184,16 @@ mod test { "toto": false, }); + let doc1 = serde_json::to_vec(&doc1).unwrap(); + let doc2 = serde_json::to_vec(&doc2).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(doc1).unwrap(); - //builder.add_documents(doc2).unwrap(); + builder.extend_from_json(Cursor::new(doc1)).unwrap(); + builder.extend_from_json(Cursor::new(doc2)).unwrap(); builder.finish().unwrap(); @@ -211,13 +216,14 @@ mod test { { "tata": "hello" }, ]); + let docs = serde_json::to_vec(&docs).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(docs).unwrap(); + builder.extend_from_json(Cursor::new(docs)).unwrap(); builder.finish().unwrap(); @@ -245,13 +251,13 @@ mod test { { "tata": "hello" }, ]]); - todo!(); - //assert!(builder.add_documents(docs).is_err()); + let docs = serde_json::to_vec(&docs).unwrap(); + assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); let docs = json!("hello"); + let docs = serde_json::to_vec(&docs).unwrap(); - todo!(); - //assert!(builder.add_documents(docs).is_err()); + assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); } #[test] diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 86fb68534..2466ed373 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -98,6 +98,7 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { self.inner.write_all(reader).unwrap(); *self.count += 1; + self.values.clear(); Ok(()) } diff --git a/milli/src/index.rs b/milli/src/index.rs index 6ce693fbe..fe89fe734 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -953,6 +953,7 @@ pub(crate) mod tests { { "id": 1, "name": "kevin", "has_dog": true }, { "id": 2, "name": "bob" } ]); + let mut wtxn = index.write_txn().unwrap(); let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 5639c53fa..11f6379e3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -68,8 +68,9 @@ mod test { "txts": sample_txts[..(rng.gen_range(0..3))], "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - todo!() - //builder.add_documents(doc).unwrap(); + + let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); + builder.extend_from_json(doc).unwrap(); } builder.finish().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 58e21a615..17c778060 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -877,8 +877,8 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(big_object).unwrap(); + let big_object = 
Cursor::new(serde_json::to_vec(&big_object).unwrap()); + builder.extend_from_json(big_object).unwrap(); builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index d62b8ec31..e8fb3fdfa 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,10 +61,12 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); - todo!(); - //for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - //documents_builder.add_documents(doc.unwrap()).unwrap(); - //} + + for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); + documents_builder.extend_from_json(doc).unwrap(); + } + documents_builder.finish().unwrap(); cursor.set_position(0); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 3fb36b1d5..e5dde049c 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -409,8 +409,8 @@ fn criteria_ascdesc() { "age": age, }); - todo!(); - //batch_builder.add_documents(json).unwrap(); + let json = Cursor::new(serde_json::to_vec(&json).unwrap()); + batch_builder.extend_from_json(json).unwrap(); }); batch_builder.finish().unwrap(); From 53c79e85f2a0acd4559c11eb01f27914ca1c5ccb Mon Sep 17 00:00:00 2001 From: marin postma Date: Sun, 24 Oct 2021 15:39:56 +0200 Subject: [PATCH 1106/1889] document errors --- milli/src/documents/builder.rs | 20 +++++++------------ milli/src/documents/mod.rs | 30 +++++++++++++++++++++++++--- milli/src/documents/serde.rs | 36 ++++++++++++++++++++++------------ 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 8c70910b5..a7b0c90a4 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -87,18 +87,13 @@ impl DocumentBatchBuilder { count: &mut self.count, }; - de.deserialize_any(&mut visitor).map_err(Error::JsonError)?; - - Ok(()) + de.deserialize_any(&mut visitor).map_err(Error::JsonError)? } - /// Extends the builder with json documents from a reader. + /// Creates a builder from a reader of CSV documents. /// - /// This method can be only called once and is mutually exclusive with extend from json. This - /// is because the fields in a csv are always guaranteed to come in order, and permits some - /// optimizations. - /// - /// From csv takes care to call finish in the end. + /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform + /// optimisations, and extending from another CSV is not allowed. pub fn from_csv(reader: R, writer: W) -> Result { let mut this = Self::new(writer)?; @@ -108,8 +103,7 @@ impl DocumentBatchBuilder { let mut records = csv::Reader::from_reader(reader); let headers = records - .headers() - .unwrap() + .headers()? 
.into_iter() .map(parse_csv_header) .map(|(k, t)| (this.index.insert(&k), t)) @@ -123,11 +117,11 @@ impl DocumentBatchBuilder { let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from).unwrap(), + AllowedType::Number => value.parse::().map(Value::from)?, AllowedType::String => Value::String(value.to_string()), }; - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value).unwrap(); + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; writer.insert(*fid, &this.value_buffer)?; this.value_buffer.clear(); } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 8a8b87794..bbc114480 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -7,7 +7,9 @@ mod builder; mod reader; mod serde; -use std::{fmt, io}; +use std::num::ParseFloatError; +use std::io; +use std::fmt::{self, Debug}; use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; @@ -81,14 +83,22 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { + ParseFloat(std::num::ParseFloatError), InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), + CsvError(csv::Error), Serialize(bincode::Error), Io(io::Error), DocumentTooLarge, } +impl From for Error { + fn from(e: csv::Error) -> Self { + Self::CsvError(e) + } +} + impl From for Error { fn from(other: io::Error) -> Self { Self::Io(other) @@ -101,15 +111,29 @@ impl From for Error { } } +impl From for Error { + fn from(other: serde_json::Error) -> Self { + Self::JsonError(other) + } +} + +impl From for Error { + fn from(other: ParseFloatError) -> Self { + Self::ParseFloat(other) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { + Error::ParseFloat(e) => write!(f, "{}", e), Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), - Error::Io(e) => e.fmt(f), + Error::Io(e) => write!(f, "{}", e), Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), - Error::Serialize(e) => e.fmt(f), + Error::Serialize(e) => write!(f, "{}", e), + Error::CsvError(e) => write!(f, "{}", e), } } } diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 2466ed373..4dfdca2c7 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -11,9 +11,19 @@ use serde::de::SeqAccess; use serde::de::Visitor; use serde_json::Value; +use super::Error; use super::{ByteCounter, DocumentsBatchIndex}; use crate::FieldId; +macro_rules! tri { + ($e:expr) => { + match $e { + Ok(r) => r, + Err(e) => return Ok(Err(e.into())), + } + }; +} + struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { @@ -36,8 +46,8 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { Ok(self.0.insert(v)) } - fn expecting(&self, _formatter: &mut fmt::Formatter) -> fmt::Result { - todo!() + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "a string") } } @@ -64,22 +74,22 @@ pub struct DocumentVisitor<'a, W> { impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { /// This Visitor value is nothing, since it write the value to a file. 
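///
/// After this change the visitor's associated `Value` is itself a `Result`:
/// serde only lets `visit_*` methods fail with the deserializer's own error
/// type, so domain errors (I/O, obkv, JSON) are smuggled out as `Ok(Err(_))`
/// by the `tri!` macro above and unwrapped by the caller. A minimal,
/// self-contained sketch of that pattern (the `MyError` type here is
/// illustrative, not part of this patch):
///
/// ```
/// #[derive(Debug)]
/// struct MyError;
///
/// impl From<std::num::ParseIntError> for MyError {
///     fn from(_: std::num::ParseIntError) -> Self { MyError }
/// }
///
/// macro_rules! tri {
///     ($e:expr) => {
///         match $e {
///             Ok(r) => r,
///             Err(e) => return Ok(Err(e.into())),
///         }
///     };
/// }
///
/// // The outer `Result` belongs to the framework, the inner one to us.
/// fn visit(input: &str) -> Result<Result<u32, MyError>, ()> {
///     let n = tri!(input.parse::<u32>()); // early-returns `Ok(Err(MyError))`
///     Ok(Ok(n))
/// }
///
/// assert!(matches!(visit("42"), Ok(Ok(42))));
/// assert!(matches!(visit("nope"), Ok(Err(MyError))));
/// ```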
- type Value = (); + type Value = Result<(), Error>; fn visit_seq(self, mut seq: A) -> Result where A: SeqAccess<'de>, { - while let Some(_) = seq.next_element_seed(&mut *self)? { } + while let Some(v) = seq.next_element_seed(&mut *self)? { tri!(v) } - Ok(()) + Ok(Ok(())) } fn visit_map(self, mut map: A) -> Result where A: MapAccess<'de>, { - while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer).unwrap() { + while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? { self.values.insert(key, value); } @@ -88,19 +98,19 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { for (key, value) in self.values.iter() { self.value_buffer.clear(); // This is guaranteed to work - serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value).unwrap(); - obkv.insert(*key, &self.value_buffer).unwrap(); + tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); + tri!(obkv.insert(*key, &self.value_buffer)); } - let reader = obkv.into_inner().unwrap().into_inner(); + let reader = tri!(obkv.into_inner()).into_inner(); - self.inner.write_u32::(reader.len() as u32).unwrap(); - self.inner.write_all(reader).unwrap(); + tri!(self.inner.write_u32::(reader.len() as u32)); + tri!(self.inner.write_all(reader)); *self.count += 1; self.values.clear(); - Ok(()) + Ok(Ok(())) } fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -111,7 +121,7 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> where W: Write, { - type Value = (); + type Value = Result<(), Error>; fn deserialize(self, deserializer: D) -> Result where From 430e9b13d3523a17a5354a7a2f8d955a3097a008 Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 09:48:53 +0200 Subject: [PATCH 1107/1889] add csv builder tests --- milli/src/documents/builder.rs | 324 ++++++++++++++++-- milli/src/documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 2 +- 3 files changed, 297 insertions(+), 31 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index a7b0c90a4..d54d9639c 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -54,12 +54,7 @@ impl DocumentBatchBuilder { /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. pub fn finish(self) -> Result<(), Error> { - let Self { - inner: ByteCounter { mut writer, count: offset }, - index, - count, - .. - } = self; + let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; let meta = DocumentsMetadata { count, index }; @@ -73,7 +68,6 @@ impl DocumentBatchBuilder { Ok(()) } - /// Extends the builder with json documents from a reader. pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { let mut de = serde_json::Deserializer::from_reader(reader); @@ -95,7 +89,6 @@ impl DocumentBatchBuilder { /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform /// optimisations, and extending from another CSV is not allowed. 
pub fn from_csv(reader: R, writer: W) -> Result { - let mut this = Self::new(writer)?; // Ensure that this is the first and only addition made with this builder debug_assert!(this.index.is_empty()); @@ -112,28 +105,24 @@ impl DocumentBatchBuilder { let records = records.into_records(); for record in records { - match record { - Ok(record) => { - let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); - for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { - let value = match ty { - AllowedType::Number => value.parse::().map(Value::from)?, - AllowedType::String => Value::String(value.to_string()), - }; + let record = record?; + let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); + for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { + let value = match ty { + AllowedType::Number => value.parse::().map(Value::from)?, + AllowedType::String => Value::String(value.to_string()), + }; - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; - writer.insert(*fid, &this.value_buffer)?; - this.value_buffer.clear(); - } - - this.inner.write_u32::(this.obkv_buffer.len() as u32)?; - this.inner.write_all(&this.obkv_buffer)?; - - this.obkv_buffer.clear(); - this.count += 1; - }, - Err(_) => panic!(), + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; + writer.insert(*fid, &this.value_buffer)?; + this.value_buffer.clear(); } + + this.inner.write_u32::(this.obkv_buffer.len() as u32)?; + this.inner.write_all(&this.obkv_buffer)?; + + this.obkv_buffer.clear(); + this.count += 1; } Ok(this) @@ -162,10 +151,25 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) { mod test { use std::io::Cursor; + use serde_json::{json, Map}; + use crate::documents::DocumentBatchReader; use super::*; + fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { + let mut map = Map::new(); + + for (fid, value) in obkv.iter() { + let field_name = index.name(fid).unwrap().clone(); + let value: Value = serde_json::from_slice(value).unwrap(); + + map.insert(field_name, value); + } + + Value::Object(map) + } + #[test] fn add_single_documents_json() { let mut cursor = Cursor::new(Vec::new()); @@ -247,7 +251,8 @@ mod test { let csv = "id:number,field:string\n1,hello!\n2,blabla"; - let builder = DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); + let builder = + DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); builder.finish().unwrap(); cursor.set_position(0); @@ -263,4 +268,265 @@ mod test { assert!(reader.next_document_with_index().unwrap().is_none()); } + + #[test] + fn simple_csv_document() { + let documents = r#"city,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } + + #[test] + fn coma_in_field() { + let documents = r#"city,country,pop +"Boston","United, States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let 
mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United, States", + "pop": "4628910", + }) + ); + } + + #[test] + fn quote_in_field() { + let documents = r#"city,country,pop +"Boston","United"" States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United\" States", + "pop": "4628910", + }) + ); + } + + #[test] + fn integer_in_field() { + let documents = r#"city,country,pop:number +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.0, + }) + ); + } + + #[test] + fn float_in_field() { + let documents = r#"city,country,pop:number +"Boston","United States","4628910.01""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.01, + }) + ); + } + + #[test] + fn several_colon_in_header() { + let documents = r#"city:love:string,country:state,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:love": "Boston", + "country:state": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn ending_by_colon_in_header() { + let documents = r#"city:,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn starting_by_colon_in_header() { + let documents = r#":city,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = 
DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + ":city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[ignore] + #[test] + fn starting_by_colon_in_header2() { + let documents = r#":string,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + + assert!(reader.next_document_with_index().is_err()); + } + + #[test] + fn double_colon_in_header() { + let documents = r#"city::string,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn bad_type_in_header() { + let documents = r#"city,country:number,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } + + #[test] + fn bad_column_count1() { + let documents = r#"city,country,pop +"Boston","United States","4628910", "too much""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } + + #[test] + fn bad_column_count2() { + let documents = r#"city,country,pop +"Boston","United States""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index bbc114480..b61a3326b 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -47,7 +47,7 @@ impl DocumentsBatchIndex { self.0.iter() } - pub fn get_id(&self, id: FieldId) -> Option<&String> { + pub fn name(&self, id: FieldId) -> Option<&String> { self.0.get_by_left(&id) } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 5af1eda72..eac60effb 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -179,7 +179,7 @@ impl Transform<'_, '_> { if !self.autogenerate_docids { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.get_id(key).cloned(); + let key = addition_index.name(key).cloned(); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { From 3fcccc31b5885029499056adaaf42853ab45d1ce Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 10:26:28 +0200 Subject: [PATCH 1108/1889] add document builder example --- milli/src/documents/builder.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index d54d9639c..144dcdfa9 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -16,7 +16,20 @@ use super::{ByteCounter, DocumentsBatchIndex, 
DocumentsMetadata, Error}; /// format used by milli. /// /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to -/// iterate other the documents. +/// iterate over the documents. +/// +/// ## example: +/// ``` +/// use milli::documents::DocumentBatchBuilder; +/// use serde_json::json; +/// use std::io::Cursor; +/// +/// let json = r##"{"id": 1, "name": "foo"}"##; +/// let mut writer = Cursor::new(Vec::new()); +/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); +/// builder.extend_from_json(Cursor::new(json.as_bytes())).unwrap(); +/// builder.finish().unwrap(); +/// ``` pub struct DocumentBatchBuilder { inner: ByteCounter, index: DocumentsBatchIndex, From 679fe18b17f7f1458cf336f8299be20ad2aca98f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Oct 2021 11:52:17 +0200 Subject: [PATCH 1109/1889] Update version for the next release (v0.19.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index b3a15f100..341b8eb7c 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index a7af6fb9b..310388e01 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 2701c36d7..726fa9c5f 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 01309797d..12f474f12 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.18.1" +version = "0.19.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index dd74ad7b6..d9441984e 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" From 208903dddea1567f2c2c3afe719f0f512b9d0fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Oct 2021 11:58:00 +0200 Subject: [PATCH 1110/1889] Revert "Replacing pest with nom " --- milli/Cargo.toml | 3 +- milli/src/error.rs | 11 +- milli/src/lib.rs | 3 + milli/src/search/facet/filter_condition.rs | 708 ++++++++++++++++++++- milli/src/search/facet/filter_parser.rs | 622 ------------------ milli/src/search/facet/grammar.pest | 33 + milli/src/search/facet/mod.rs | 5 +- milli/src/search/facet/parser.rs | 12 + milli/src/search/mod.rs | 3 +- 9 files changed, 736 insertions(+), 664 deletions(-) delete mode 100644 milli/src/search/facet/filter_parser.rs create mode 100644 milli/src/search/facet/grammar.pest create mode 100644 milli/src/search/facet/parser.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..3792204e9 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -39,7 +39,8 @@ tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } # facet filter parser -nom = "7.0.0" +pest 
= { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } +pest_derive = "2.1.0" # documents words self-join itertools = "0.10.0" diff --git a/milli/src/error.rs b/milli/src/error.rs index c0ce101c8..1f1cc5264 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,6 +7,7 @@ use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; +use crate::search::ParserRule; use crate::{CriterionError, DocumentId, FieldId, SortError}; pub type Object = Map; @@ -58,9 +59,9 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, + InvalidFilter(pest::error::Error), + InvalidFilterAttribute(pest::error::Error), InvalidGeoField { document_id: Value, object: Value }, - InvalidFilter { input: String }, - InvalidSortName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, @@ -207,7 +208,6 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InvalidFilter { input } => write!(f, "parser error {}", input), Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), @@ -220,6 +220,7 @@ impl fmt::Display for UserError { name_list ) } + Self::InvalidFilter(error) => error.fmt(f), Self::InvalidGeoField { document_id, object } => write!( f, "the document with the id: {} contains an invalid _geo field: {}", @@ -235,9 +236,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco json ) } - Self::InvalidSortName { name } => { - write!(f, "Invalid syntax for the sort parameter: {}", name) - } + Self::InvalidFilterAttribute(error) => error.fmt(f), Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6fe5947f5..781cedb2c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate pest_derive; + #[macro_use] pub mod documents; diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 4fedeee69..f1055b2f8 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,20 +1,60 @@ +use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Excluded, Included}; +use std::result::Result as StdResult; +use std::str::FromStr; use either::Either; use heed::types::DecodeIgnore; +use itertools::Itertools; use log::debug; -use nom::error::{convert_error, VerboseError}; +use pest::error::{Error as PestError, ErrorVariant}; +use pest::iterators::{Pair, Pairs}; +use pest::Parser; use roaring::RoaringBitmap; use self::FilterCondition::*; -use super::filter_parser::{Operator, ParseContext}; +use self::Operator::*; +use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; -use crate::error::{Error, UserError}; +use crate::error::UserError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; -use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{ + 
distance_between_two_points, CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result, +}; + +#[derive(Debug, Clone, PartialEq)] +pub enum Operator { + GreaterThan(f64), + GreaterThanOrEqual(f64), + Equal(Option, String), + NotEqual(Option, String), + LowerThan(f64), + LowerThanOrEqual(f64), + Between(f64, f64), + GeoLowerThan([f64; 2], f64), + GeoGreaterThan([f64; 2], f64), +} + +impl Operator { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). + fn negate(self) -> (Self, Option) { + match self { + GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThanOrEqual(n) => (LowerThan(n), None), + Equal(n, s) => (NotEqual(n, s), None), + NotEqual(n, s) => (Equal(n, s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), + GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), + } + } +} #[derive(Debug, Clone, PartialEq)] pub enum FilterCondition { @@ -36,7 +76,7 @@ impl FilterCondition { A: AsRef, B: AsRef, { - let mut ands: Option = None; + let mut ands = None; for either in array { match either { @@ -77,23 +117,41 @@ impl FilterCondition { ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - let ctx = - ParseContext { fields_ids_map: &fields_ids_map, filterable_fields: &filterable_fields }; - match ctx.parse_expression::>(expression) { - Ok((_, fc)) => Ok(fc), - Err(e) => { - let ve = match e { - nom::Err::Error(x) => x, - nom::Err::Failure(x) => x, - _ => unreachable!(), - }; - Err(Error::UserError(UserError::InvalidFilter { - input: convert_error(expression, ve).to_string(), - })) - } - } + let lexed = + FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; + FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } - pub fn negate(self) -> FilterCondition { + + fn from_pairs( + fim: &FieldsIdsMap, + ff: &HashSet, + expression: Pairs, + ) -> Result { + PREC_CLIMBER.climb( + expression, + |pair: Pair| match pair.as_rule() { + Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), + Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), + Rule::eq => Ok(Self::equal(fim, ff, pair)?), + Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), + Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), + Rule::less => Ok(Self::lower_than(fim, ff, pair)?), + Rule::between => Ok(Self::between(fim, ff, pair)?), + Rule::geo_radius => Ok(Self::geo_radius(fim, ff, pair)?), + Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), + Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), + Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), + _ => unreachable!(), + }, + |lhs: Result, op: Pair, rhs: Result| match op.as_rule() { + Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), + Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), + _ => unreachable!(), + }, + ) + } + + fn negate(self) -> FilterCondition { match self { Operator(fid, op) => match op.negate() { (op, None) => Operator(fid, op), @@ -104,6 +162,189 @@ impl FilterCondition { Empty => Empty, } } + + fn geo_radius( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + if !filterable_fields.contains("_geo") { + return 
Err(UserError::InvalidFilterAttribute(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `_geo` is not filterable, available filterable attributes are: {}", + filterable_fields.iter().join(", "), + ), + }, + item.as_span(), + )))?; + } + let mut items = item.into_inner(); + let fid = match fields_ids_map.id("_geo") { + Some(fid) => fid, + None => return Ok(Empty), + }; + let parameters_item = items.next().unwrap(); + // We don't need more than 3 parameters, but to handle errors correctly we are still going + // to extract the first 4 parameters + let param_span = parameters_item.as_span(); + let parameters = parameters_item + .into_inner() + .take(4) + .map(|param| (param.clone(), param.as_span())) + .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) + .collect::, _>>() + .map_err(UserError::InvalidFilter)?; + if parameters.len() != 3 { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), + }, + // we want to point to the last parameters and if there was no parameters we + // point to the parenthesis + parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), + )))?; + } + let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); + if !(-90.0..=90.0).contains(&lat.0) { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("Latitude must be contained between -90 and 90 degrees."), + }, + lat.1.clone(), + )))?; + } else if !(-180.0..=180.0).contains(&lng.0) { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("Longitude must be contained between -180 and 180 degrees."), + }, + lng.1.clone(), + )))?; + } + Ok(Operator(fid, GeoLowerThan([lat.0, lng.0], distance))) + } + + fn between( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let (lresult, _) = pest_parse(items.next().unwrap()); + let (rresult, _) = pest_parse(items.next().unwrap()); + + let lvalue = lresult.map_err(UserError::InvalidFilter)?; + let rvalue = rresult.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, Between(lvalue, rvalue))) + } + + fn equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, svalue) = pest_parse(value); + + let svalue = svalue.to_lowercase(); + Ok(Operator(fid, Equal(result.ok(), svalue))) + } + + fn greater_than( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? 
+ { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, GreaterThan(value))) + } + + fn greater_than_or_equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, GreaterThanOrEqual(value))) + } + + fn lower_than( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, LowerThan(value))) + } + + fn lower_than_or_equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, LowerThanOrEqual(value))) + } } impl FilterCondition { @@ -227,9 +468,9 @@ impl FilterCondition { // as the facets values are all in the same database and prefixed by the // field id and the level. let (left, right) = match operator { - Operator::GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), - Operator::GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), - Operator::Equal(number, string) => { + GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), + GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), + Equal(number, string) => { let (_original_value, string_docids) = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { @@ -251,23 +492,23 @@ impl FilterCondition { }; return Ok(string_docids | number_docids); } - Operator::NotEqual(number, string) => { + NotEqual(number, string) => { let all_numbers_ids = if number.is_some() { index.number_faceted_documents_ids(rtxn, field_id)? 
} else { RoaringBitmap::new() }; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; - let operator = Operator::Equal(*number, string.clone()); + let operator = Equal(*number, string.clone()); let docids = Self::evaluate_operator( rtxn, index, numbers_db, strings_db, field_id, &operator, )?; return Ok((all_numbers_ids | all_strings_ids) - docids); } - Operator::LowerThan(val) => (Included(f64::MIN), Excluded(*val)), - Operator::LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), - Operator::Between(left, right) => (Included(*left), Included(*right)), - Operator::GeoLowerThan(base_point, distance) => { + LowerThan(val) => (Included(f64::MIN), Excluded(*val)), + LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), + Between(left, right) => (Included(*left), Included(*right)), + GeoLowerThan(base_point, distance) => { let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, None => return Ok(RoaringBitmap::new()), @@ -283,14 +524,14 @@ impl FilterCondition { return Ok(result); } - Operator::GeoGreaterThan(point, distance) => { + GeoGreaterThan(point, distance) => { let result = Self::evaluate_operator( rtxn, index, numbers_db, strings_db, field_id, - &Operator::GeoLowerThan(point.clone(), *distance), + &GeoLowerThan(point.clone(), *distance), )?; let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; return Ok(geo_faceted_doc_ids - result); @@ -344,3 +585,406 @@ impl FilterCondition { } } } + +/// Retrieve the field id base on the pest value. +/// +/// Returns an error if the given value is not filterable. +/// +/// Returns Ok(None) if the given value is filterable, but is not yet ascociated to a field_id. +/// +/// The pest pair is simply a string associated with a span, a location to highlight in +/// the error message. +fn field_id( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + items: &mut Pairs, +) -> StdResult, PestError> { + // lexing ensures that we at least have a key + let key = items.next().unwrap(); + if key.as_rule() == Rule::reserved { + let message = match key.as_str() { + key if key.starts_with("_geoPoint") => { + format!( + "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. \ + Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", + ) + } + key @ "_geo" => { + format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression. \ + Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", + key + ) + } + key => format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression.", + key + ), + }; + return Err(PestError::new_from_span(ErrorVariant::CustomError { message }, key.as_span())); + } + + if !filterable_fields.contains(key.as_str()) { + return Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` is not filterable, available filterable attributes are: {}.", + key.as_str(), + filterable_fields.iter().join(", "), + ), + }, + key.as_span(), + )); + } + + Ok(fields_ids_map.id(key.as_str())) +} + +/// Tries to parse the pest pair into the type `T` specified, always returns +/// the original string that we tried to parse. +/// +/// Returns the parsing error associated with the span if the conversion fails. 
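///
/// Call sites in this file pair the returned result with an explicit error
/// conversion; the comparison parsers above all follow this shape (excerpted
/// from them, not standalone):
///
/// ```ignore
/// let (result, _svalue) = pest_parse(value);
/// let value = result.map_err(UserError::InvalidFilter)?;
/// ```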
+fn pest_parse(pair: Pair) -> (StdResult>, String) +where + T: FromStr, + T::Err: ToString, +{ + let result = match pair.as_str().parse::() { + Ok(value) => Ok(value), + Err(e) => Err(PestError::::new_from_span( + ErrorVariant::CustomError { message: e.to_string() }, + pair.as_span(), + )), + }; + + (result, pair.as_str().to_string()) +} + +#[cfg(test)] +mod tests { + use big_s::S; + use heed::EnvOpenOptions; + use maplit::hashset; + + use super::*; + use crate::update::Settings; + + #[test] + fn string() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut map = index.fields_ids_map(&wtxn).unwrap(); + map.insert("channel"); + index.put_fields_ids_map(&mut wtxn, &map).unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { S("channel") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); + let expected = Operator(0, Operator::Equal(None, S("ponce"))); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); + let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); + let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); + assert_eq!(condition, expected); + } + + #[test] + fn number() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut map = index.fields_ids_map(&wtxn).unwrap(); + map.insert("timestamp"); + index.put_fields_ids_map(&mut wtxn, &map).unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { "timestamp".into() }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); + let expected = Operator(0, Between(22.0, 44.0)); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); + let expected = + Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0)))); + assert_eq!(condition, expected); + } + + #[test] + fn parentheses() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! 
{ S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = Or( + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(And( + Box::new(Operator(1, Between(22.0, 44.0))), + Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = Or( + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(Or( + Box::new(Or( + Box::new(Operator(1, LowerThan(22.0))), + Box::new(Operator(1, GreaterThan(44.0))), + )), + Box::new(Operator(0, Operator::Equal(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + } + + #[test] + fn reserved_field_names() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let rtxn = index.read_txn().unwrap(); + + let error = FilterCondition::from_str(&rtxn, &index, "_geo = 12").unwrap_err(); + assert!(error + .to_string() + .contains("`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + + let error = + FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoDistance` is a reserved keyword and thus can't be used as a filter expression."), + "{}", + error.to_string() + ); + + let error = FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + + let error = + FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + } + + #[test] + fn geo_radius() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. 
+ let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // _geo is not filterable + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 12, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("attribute `_geo` is not filterable, available filterable attributes are:"),); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // basic test + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); + assert_eq!(condition, expected); + + // basic test with latitude and longitude at the max angle + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(90, 180, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([90., 180.], 2000.)); + assert_eq!(condition, expected); + + // basic test with latitude and longitude at the min angle + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90, -180, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([-90., -180.], 2000.)); + assert_eq!(condition, expected); + + // test the negation of the GeoLowerThan + let condition = + FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); + let expected = Operator(0, GeoGreaterThan([50., 18.], 2000.500)); + assert_eq!(condition, expected); + + // composition of multiple operations + let condition = FilterCondition::from_str( + &rtxn, + &index, + "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", + ) + .unwrap(); + let expected = Or( + Box::new(And( + Box::new(Operator(0, GeoGreaterThan([1., 2.], 300.))), + Box::new(Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), + )), + Box::new(Operator(1, LowerThanOrEqual(10.))), + ); + assert_eq!(condition, expected); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have enough parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have too many parameters + let result = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter 
expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); + } + + #[test] + fn from_array() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array( + &rtxn, + &index, + vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga AND (timestamp = 44 OR channel != ponce)", + ) + .unwrap(); + assert_eq!(condition, expected); + } +} diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs deleted file mode 100644 index 4d8a54987..000000000 --- a/milli/src/search/facet/filter_parser.rs +++ /dev/null @@ -1,622 +0,0 @@ -use std::collections::HashSet; -use std::fmt::Debug; -use std::result::Result as StdResult; - -use nom::branch::alt; -use nom::bytes::complete::{tag, take_while1}; -use nom::character::complete::{char, multispace0}; -use nom::combinator::map; -use nom::error::{ContextError, ErrorKind, VerboseError}; -use nom::multi::{many0, separated_list1}; -use nom::sequence::{delimited, preceded, tuple}; -use nom::IResult; - -use self::Operator::*; -use super::FilterCondition; -use crate::{FieldId, FieldsIdsMap}; -#[derive(Debug, Clone, PartialEq)] -pub enum Operator { - GreaterThan(f64), - GreaterThanOrEqual(f64), - Equal(Option, String), - NotEqual(Option, String), - LowerThan(f64), - LowerThanOrEqual(f64), - Between(f64, f64), - GeoLowerThan([f64; 2], f64), - GeoGreaterThan([f64; 2], f64), -} - -impl Operator { - /// This method can return two operations in case it must express - /// an OR operation for the between case (i.e. `TO`). 
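///
/// In effect `negate` applies De Morgan-style rewriting, with `Between` as the
/// one case that fans out into two operators: as the `number` test above
/// expects, `NOT timestamp 22 TO 44` becomes `timestamp < 22 OR timestamp > 44`.
/// A small standalone check of that identity over inclusive bounds:
///
/// ```
/// fn between(x: f64, lo: f64, hi: f64) -> bool { lo <= x && x <= hi }
/// fn negated(x: f64, lo: f64, hi: f64) -> bool { x < lo || hi < x }
///
/// for x in [21.0, 22.0, 33.0, 44.0, 45.0] {
///     assert_eq!(!between(x, 22.0, 44.0), negated(x, 22.0, 44.0));
/// }
/// ```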
- pub fn negate(self) -> (Self, Option) { - match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), - GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(n, s) => (NotEqual(n, s), None), - NotEqual(n, s) => (Equal(n, s), None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), - GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), - GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), - } - } -} - -pub trait FilterParserError<'a>: - nom::error::ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug -{ -} - -impl<'a> FilterParserError<'a> for VerboseError<&'a str> {} - -pub struct ParseContext<'a> { - pub fields_ids_map: &'a FieldsIdsMap, - pub filterable_fields: &'a HashSet, -} - -impl<'a> ParseContext<'a> { - fn parse_or(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, lhs) = self.parse_and(input)?; - let (input, ors) = many0(preceded(self.ws(tag("OR")), |c| Self::parse_or(self, c)))(input)?; - - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_and(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, lhs) = self.parse_not(input)?; - let (input, ors) = - many0(preceded(self.ws(tag("AND")), |c| Self::parse_and(self, c)))(input)?; - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_not(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - alt(( - map( - preceded(alt((self.ws(tag("!")), self.ws(tag("NOT")))), |c| { - Self::parse_condition_expression(self, c) - }), - |e| e.negate(), - ), - |c| Self::parse_condition_expression(self, c), - ))(input) - } - - fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> - where - F: Fn(&'a str) -> IResult<&'a str, O, E>, - E: FilterParserError<'a>, - { - delimited(multispace0, inner, multispace0) - } - - fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let operator = alt((tag("<="), tag(">="), tag(">"), tag("="), tag("<"), tag("!="))); - let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( - input, - ); - let (input, (key, op, value)) = match k { - Ok(o) => o, - Err(e) => { - return Err(e); - } - }; - - let fid = self.parse_fid(input, key)?; - let r: StdResult>> = self.parse_numeric(value); - let k = match op { - "=" => FilterCondition::Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())), - "!=" => { - FilterCondition::Operator(fid, NotEqual(r.ok(), value.to_string().to_lowercase())) - } - ">" | "<" | "<=" | ">=" => return self.parse_numeric_unary_condition(op, fid, value), - _ => unreachable!(), - }; - Ok((input, k)) - } - - fn parse_numeric(&'a self, input: &'a str) -> StdResult> - where - E: FilterParserError<'a>, - T: std::str::FromStr, - { - match input.parse::() { - Ok(n) => Ok(n), - Err(_) => { - return match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - } - } - } - - fn parse_numeric_unary_condition( - &'a self, - input: &'a str, 
- fid: u16, - value: &'a str, - ) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let numeric: f64 = self.parse_numeric(value)?; - let k = match input { - ">" => FilterCondition::Operator(fid, GreaterThan(numeric)), - "<" => FilterCondition::Operator(fid, LowerThan(numeric)), - "<=" => FilterCondition::Operator(fid, LowerThanOrEqual(numeric)), - ">=" => FilterCondition::Operator(fid, GreaterThanOrEqual(numeric)), - _ => unreachable!(), - }; - Ok((input, k)) - } - - fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> - where - E: FilterParserError<'a>, - { - let error = match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - if !self.filterable_fields.contains(key) { - return error; - } - match self.fields_ids_map.id(key) { - Some(fid) => Ok(fid), - None => error, - } - } - - fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, (key, from, _, to)) = tuple(( - self.ws(|c| self.parse_key(c)), - self.ws(|c| self.parse_key(c)), - tag("TO"), - self.ws(|c| self.parse_key(c)), - ))(input)?; - - let fid = self.parse_fid(input, key)?; - let numeric_from: f64 = self.parse_numeric(from)?; - let numeric_to: f64 = self.parse_numeric(to)?; - let res = FilterCondition::Operator(fid, Between(numeric_from, numeric_to)); - - Ok((input, res)) - } - - fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let err_msg_args_incomplete= "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; - let err_msg_latitude_invalid = - "_geoRadius. Latitude must be contained between -90 and 90 degrees."; - - let err_msg_longitude_invalid = - "_geoRadius. 
Longitude must be contained between -180 and 180 degrees."; - - let (input, args): (&str, Vec<&str>) = match preceded( - tag("_geoRadius"), - delimited( - char('('), - separated_list1(tag(","), self.ws(|c| self.parse_value::(c))), - char(')'), - ), - )(input) - { - Ok(e) => e, - Err(_e) => { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_args_incomplete, - E::from_char(input, '('), - ))); - } - }; - - if args.len() != 3 { - let e = E::from_char(input, '('); - return Err(nom::Err::Failure(E::add_context(input, err_msg_args_incomplete, e))); - } - let lat = self.parse_numeric(args[0])?; - let lng = self.parse_numeric(args[1])?; - let dis = self.parse_numeric(args[2])?; - - let fid = match self.fields_ids_map.id("_geo") { - Some(fid) => fid, - None => return Ok((input, FilterCondition::Empty)), - }; - - if !(-90.0..=90.0).contains(&lat) { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_latitude_invalid, - E::from_char(input, '('), - ))); - } else if !(-180.0..=180.0).contains(&lng) { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_longitude_invalid, - E::from_char(input, '('), - ))); - } - - let res = FilterCondition::Operator(fid, GeoLowerThan([lat, lng], dis)); - Ok((input, res)) - } - - fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let l1 = |c| self.parse_simple_condition(c); - let l2 = |c| self.parse_range_condition(c); - let l3 = |c| self.parse_geo_radius(c); - alt((l1, l2, l3))(input) - } - - fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - alt(( - delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), - |c| Self::parse_condition(self, c), - ))(input) - } - - fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: FilterParserError<'a>, - { - let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) - } - - fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: FilterParserError<'a>, - { - let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) - } - - fn is_key_component(c: char) -> bool { - c.is_alphanumeric() || ['_', '-', '.'].contains(&c) - } - - pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - self.parse_or(input) - } -} - -#[cfg(test)] -mod tests { - use big_s::S; - use either::Either; - use heed::EnvOpenOptions; - use maplit::hashset; - - use super::*; - use crate::update::Settings; - use crate::Index; - - #[test] - fn string() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("channel"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { S("channel") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. 
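-        // Note: the parser lowercases string values, so filtering on `Ponce` is
-        // expected to produce the condition `Equal(None, "ponce")`.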
- let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::Equal(None, S("ponce"))); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); - } - - #[test] - fn number() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("timestamp"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { "timestamp".into() }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Operator(0, Between(22.0, 44.0)); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, LowerThan(22.0))), - Box::new(FilterCondition::Operator(0, GreaterThan(44.0))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn compare() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp"), S("id")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") ,S("id")}); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); - let expected = FilterCondition::Operator(0, LowerThan(20.0)); - assert_eq!(condition, expected); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "id < 200").unwrap(); - let expected = FilterCondition::Operator(2, LowerThan(200.0)); - assert_eq!(condition, expected); - } - - #[test] - fn parentheses() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! 
{ S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(1, Between(22.0, 44.0))), - Box::new(FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Operator(1, LowerThan(22.0))), - Box::new(FilterCondition::Operator(1, GreaterThan(44.0))), - )), - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - } - - #[test] - fn from_array() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array( - &rtxn, - &index, - vec![ - Either::Right("channel = gotaga"), - Either::Left(vec!["timestamp = 44", "channel != ponce"]), - ], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga AND (timestamp = 44 OR channel != ponce)", - ) - .unwrap(); - assert_eq!(condition, expected); - } - #[test] - fn geo_radius() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - // basic test - let condition = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); - let expected = FilterCondition::Operator(0, GeoLowerThan([12., 13.0005], 2000.)); - assert_eq!(condition, expected); - - // test the negation of the GeoLowerThan - let condition = - FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); - let expected = FilterCondition::Operator(0, GeoGreaterThan([50., 18.], 2000.500)); - assert_eq!(condition, expected); - - // composition of multiple operations - let condition = FilterCondition::from_str( - &rtxn, - &index, - "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), - Box::new(FilterCondition::Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), - )), - Box::new(FilterCondition::Operator(1, LowerThanOrEqual(10.))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn geo_radius_error() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have enough parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius have too many parameters - let result = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); - assert!(result.is_err()); - let error = 
result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - } -} diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest new file mode 100644 index 000000000..8bfdeb667 --- /dev/null +++ b/milli/src/search/facet/grammar.pest @@ -0,0 +1,33 @@ +key = _{reserved | quoted | word } +value = _{quoted | word } +quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP } +string = {char*} +word = ${(LETTER | NUMBER | "_" | "-" | ".")+} + +char = _{ !(PEEK | "\\") ~ ANY + | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") + | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} + +reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" } +// we deliberately choose to allow empty parameters to generate more specific error message later +parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} +condition = _{between | eq | greater | less | geq | leq | neq} +between = {key ~ value ~ "TO" ~ value} +geq = {key ~ ">=" ~ value} +leq = {key ~ "<=" ~ value} +neq = {key ~ "!=" ~ value} +eq = {key ~ "=" ~ value} +greater = {key ~ ">" ~ value} +less = {key ~ "<" ~ value} +geo_radius = {"_geoRadius" ~ parameters } + +prgm = {SOI ~ expr ~ EOI} +expr = _{ ( term ~ (operation ~ term)* ) } +term = { ("(" ~ expr ~ ")") | condition | not | geo_radius } +operation = _{ and | or } +and = {"AND"} +or = {"OR"} + +not = {"NOT" ~ term} + +WHITESPACE = _{ " " } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 3efa0262f..ddf710e32 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,10 +1,11 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; -pub use self::filter_condition::FilterCondition; +pub use self::filter_condition::{FilterCondition, Operator}; +pub(crate) use self::parser::Rule as ParserRule; mod facet_distribution; mod facet_number; mod facet_string; mod filter_condition; -mod filter_parser; +mod parser; diff --git a/milli/src/search/facet/parser.rs b/milli/src/search/facet/parser.rs new file mode 100644 index 000000000..1bff27cfb --- /dev/null +++ b/milli/src/search/facet/parser.rs @@ -0,0 +1,12 @@ +use once_cell::sync::Lazy; +use pest::prec_climber::{Assoc, Operator, PrecClimber}; + +pub static PREC_CLIMBER: Lazy> = Lazy::new(|| { + use Assoc::*; + use Rule::*; + pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)]) +}); + +#[derive(Parser)] +#[grammar = "search/facet/grammar.pest"] +pub struct FilterParser; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 9b76ca851..bec059d46 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -14,7 +14,8 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use 
self::facet::{FacetDistribution, FacetNumberIter, FilterCondition}; +pub(crate) use self::facet::ParserRule; +pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; From f9445c1d9059a77274f70508e2d0c896d9404bbb Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 16:07:57 +0200 Subject: [PATCH 1111/1889] return float parsing error context in csv --- milli/src/documents/builder.rs | 11 +++++++---- milli/src/documents/mod.rs | 14 ++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 144dcdfa9..c2b0e01cc 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -115,14 +115,17 @@ impl DocumentBatchBuilder { .map(|(k, t)| (this.index.insert(&k), t)) .collect::>(); - let records = records.into_records(); - - for record in records { + for (i, record) in records.into_records().enumerate() { let record = record?; let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from)?, + AllowedType::Number => value.parse::().map(Value::from).map_err(|error| Error::ParseFloat { + error, + // +1 for the header offset. + line: i + 1, + value: value.to_string(), + })?, AllowedType::String => Value::String(value.to_string()), }; diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index b61a3326b..6f653d825 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -83,7 +83,11 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { - ParseFloat(std::num::ParseFloatError), + ParseFloat { + error: std::num::ParseFloatError, + line: usize, + value: String, + }, InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), @@ -117,16 +121,10 @@ impl From for Error { } } -impl From for Error { - fn from(other: ParseFloatError) -> Self { - Self::ParseFloat(other) - } -} - impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Error::ParseFloat(e) => write!(f, "{}", e), + Error::ParseFloat { error, line, value} => write!(f, "Error parsing number {:?} at line {}: {}", value, line, error), Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), From baddd80069b26d66421ce68a7a2e004e148646b0 Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 17:38:32 +0200 Subject: [PATCH 1112/1889] implement review suggestions --- benchmarks/benches/utils.rs | 32 +- cli/src/main.rs | 28 +- http-ui/src/documents_from_csv.rs | 285 ------------------ http-ui/src/main.rs | 28 +- milli/src/documents/builder.rs | 54 ++-- milli/src/documents/mod.rs | 17 +- milli/src/documents/serde.rs | 39 +-- milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/index_documents/transform.rs | 3 +- 9 files changed, 89 insertions(+), 400 deletions(-) delete mode 100644 http-ui/src/documents_from_csv.rs diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 24f5d5343..dbe8fffad 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,7 +1,7 @@ #![allow(dead_code)] use 
std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, Cursor, Read, Seek}; +use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; @@ -146,44 +146,34 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader anyhow::Result> { +fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? > 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; Ok(writer.into_inner()) } -fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { +fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) } -fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { +fn documents_from_csv(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let iter = CSVDocumentDeserializer::from_reader(reader)?; - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/cli/src/main.rs b/cli/src/main.rs index b84ff3243..8e28d4a25 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,5 +1,5 @@ use std::fs::File; -use std::io::{stdin, Cursor, Read}; +use std::io::{stdin, BufRead, BufReader, Cursor, Read}; use std::path::PathBuf; use std::str::FromStr; @@ -9,7 +9,6 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; -use serde_json::{Map, Value}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -202,11 +201,11 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? 
> 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; @@ -217,8 +216,7 @@ fn documents_from_json(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) @@ -226,17 +224,7 @@ fn documents_from_json(reader: impl Read) -> Result> { fn documents_from_csv(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let mut records = csv::Reader::from_reader(reader); - let iter = records.deserialize::>(); - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/http-ui/src/documents_from_csv.rs b/http-ui/src/documents_from_csv.rs deleted file mode 100644 index 2b62f23c2..000000000 --- a/http-ui/src/documents_from_csv.rs +++ /dev/null @@ -1,285 +0,0 @@ -use std::io::{Read, Result as IoResult}; -use std::num::ParseFloatError; - -use serde_json::{Map, Value}; - -enum AllowedType { - String, - Number, -} - -fn parse_csv_header(header: &str) -> (String, AllowedType) { - // if there are several separators we only split on the last one. - match header.rsplit_once(':') { - Some((field_name, field_type)) => match field_type { - "string" => (field_name.to_string(), AllowedType::String), - "number" => (field_name.to_string(), AllowedType::Number), - // we may return an error in this case. - _otherwise => (header.to_string(), AllowedType::String), - }, - None => (header.to_string(), AllowedType::String), - } -} - -pub struct CSVDocumentDeserializer -where - R: Read, -{ - documents: csv::StringRecordsIntoIter, - headers: Vec<(String, AllowedType)>, -} - -impl CSVDocumentDeserializer { - pub fn from_reader(reader: R) -> IoResult { - let mut records = csv::Reader::from_reader(reader); - - let headers = records.headers()?.into_iter().map(parse_csv_header).collect(); - - Ok(Self { documents: records.into_records(), headers }) - } -} - -impl Iterator for CSVDocumentDeserializer { - type Item = anyhow::Result>; - - fn next(&mut self) -> Option { - let csv_document = self.documents.next()?; - - match csv_document { - Ok(csv_document) => { - let mut document = Map::new(); - - for ((field_name, field_type), value) in - self.headers.iter().zip(csv_document.into_iter()) - { - let parsed_value: Result = match field_type { - AllowedType::Number => { - value.parse::().map(Value::from).map_err(Into::into) - } - AllowedType::String => Ok(Value::String(value.to_string())), - }; - - match parsed_value { - Ok(value) => drop(document.insert(field_name.to_string(), value)), - Err(_e) => { - return Some(Err(anyhow::anyhow!( - "Value '{}' is not a valid number", - value - ))) - } - } - } - - Some(Ok(document)) - } - Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))), - } - } -} - -#[cfg(test)] -mod test { - use serde_json::json; - - use super::*; - - #[test] - fn simple_csv_document() { - let documents = r#"city,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": 
"Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn coma_in_field() { - let documents = r#"city,country,pop -"Boston","United, States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United, States", - "pop": "4628910", - }) - ); - } - - #[test] - fn quote_in_field() { - let documents = r#"city,country,pop -"Boston","United"" States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United\" States", - "pop": "4628910", - }) - ); - } - - #[test] - fn integer_in_field() { - let documents = r#"city,country,pop:number -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United States", - "pop": 4628910.0, - }) - ); - } - - #[test] - fn float_in_field() { - let documents = r#"city,country,pop:number -"Boston","United States","4628910.01""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United States", - "pop": 4628910.01, - }) - ); - } - - #[test] - fn several_double_dot_in_header() { - let documents = r#"city:love:string,country:state,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:love": "Boston", - "country:state": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn ending_by_double_dot_in_header() { - let documents = r#"city:,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn starting_by_double_dot_in_header() { - let documents = r#":city,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - ":city": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn starting_by_double_dot_in_header2() { - let documents = r#":string,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn double_double_dot_in_header() { - let documents = r#"city::string,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn 
bad_type_in_header() { - let documents = r#"city,country:number,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } - - #[test] - fn bad_column_count1() { - let documents = r#"city,country,pop -"Boston","United States","4628910", "too much""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } - - #[test] - fn bad_column_count2() { - let documents = r#"city,country,pop -"Boston","United States""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } -} diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index d27c6d5bb..9e9fe4a2b 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,10 +1,9 @@ -mod documents_from_csv; mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; -use std::io::Cursor; +use std::io::{BufRead, BufReader, Cursor}; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -39,7 +38,6 @@ use warp::http::Response; use warp::Filter; use self::update_store::UpdateStore; -use crate::documents_from_csv::CSVDocumentDeserializer; #[cfg(target_os = "linux")] #[global_allocator] @@ -1041,11 +1039,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? 
> 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; @@ -1056,8 +1054,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) @@ -1065,16 +1062,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let iter = CSVDocumentDeserializer::from_reader(reader)?; - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index c2b0e01cc..6ba890b79 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,16 +1,14 @@ use std::collections::BTreeMap; use std::io; -use std::io::Cursor; -use std::io::Write; +use std::io::{Cursor, Write}; use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; use serde_json::Value; -use crate::FieldId; - use super::serde::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; +use crate::FieldId; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary /// format used by milli. @@ -27,7 +25,7 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// let json = r##"{"id": 1, "name": "foo"}"##; /// let mut writer = Cursor::new(Vec::new()); /// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); -/// builder.extend_from_json(Cursor::new(json.as_bytes())).unwrap(); +/// builder.extend_from_json(&mut json.as_bytes()).unwrap(); /// builder.finish().unwrap(); /// ``` pub struct DocumentBatchBuilder { @@ -46,16 +44,14 @@ impl DocumentBatchBuilder { // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; - let this = Self { + Ok(Self { inner: writer, index, obkv_buffer: Vec::new(), value_buffer: Vec::new(), values: BTreeMap::new(), count: 0, - }; - - Ok(this) + }) } /// Returns the number of documents that have been written to the builder. @@ -117,27 +113,31 @@ impl DocumentBatchBuilder { for (i, record) in records.into_records().enumerate() { let record = record?; - let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); + this.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from).map_err(|error| Error::ParseFloat { - error, - // +1 for the header offset. - line: i + 1, - value: value.to_string(), - })?, + AllowedType::Number => { + value.parse::().map(Value::from).map_err(|error| { + Error::ParseFloat { + error, + // +1 for the header offset. + line: i + 1, + value: value.to_string(), + } + })? 
+                    }
                     AllowedType::String => Value::String(value.to_string()),
                 };
 
+                this.value_buffer.clear();
                 serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
                 writer.insert(*fid, &this.value_buffer)?;
-                this.value_buffer.clear();
             }
 
             this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
             this.inner.write_all(&this.obkv_buffer)?;
-            this.obkv_buffer.clear();
 
             this.count += 1;
         }
@@ -156,7 +156,8 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) {
     match header.rsplit_once(':') {
         Some((field_name, field_type)) => match field_type {
             "string" => (field_name.to_string(), AllowedType::String),
-            "number" => (field_name.to_string(), AllowedType::Number), // if the pattern isn't reconized, we keep the whole field.
+            "number" => (field_name.to_string(), AllowedType::Number),
+            // if the pattern isn't recognized, we keep the whole field.
            _otherwise => (header.to_string(), AllowedType::String),
         },
         None => (header.to_string(), AllowedType::String),
@@ -169,9 +170,8 @@ mod test {
 
     use serde_json::{json, Map};
 
-    use crate::documents::DocumentBatchReader;
-
     use super::*;
+    use crate::documents::DocumentBatchReader;
 
     fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
         let mut map = Map::new();
@@ -525,7 +525,9 @@ mod test {
 "Boston","United States","4628910""#;
 
         let mut buf = Vec::new();
-        assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
     }
 
     #[test]
@@ -534,7 +536,9 @@ mod test {
 "Boston","United States","4628910", "too much""#;
 
         let mut buf = Vec::new();
-        assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
     }
 
     #[test]
@@ -543,6 +547,8 @@ mod test {
 "Boston","United States""#;
 
         let mut buf = Vec::new();
-        assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err());
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
     }
 }
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index 6f653d825..14d97ee7d 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -7,9 +7,8 @@ mod builder;
 mod reader;
 mod serde;
 
-use std::num::ParseFloatError;
-use std::io;
 use std::fmt::{self, Debug};
+use std::io;
 
 use ::serde::{Deserialize, Serialize};
 use bimap::BiHashMap;
@@ -24,7 +23,7 @@ pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
 
 impl DocumentsBatchIndex {
     /// Insert the field in the map, or return its field id if it already exists.
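     /// For instance, calling `insert("title")` twice returns the same `FieldId`
     /// both times; only the first call allocates a new id.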
- pub fn insert(&mut self, field: &str) -> FieldId { + pub fn insert(&mut self, field: &str) -> FieldId { match self.0.get_by_right(field) { Some(field_id) => *field_id, None => { @@ -43,7 +42,7 @@ impl DocumentsBatchIndex { self.0.len() } - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> bimap::hash::Iter { self.0.iter() } @@ -83,11 +82,7 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { - ParseFloat { - error: std::num::ParseFloatError, - line: usize, - value: String, - }, + ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), @@ -124,7 +119,9 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Error::ParseFloat { error, line, value} => write!(f, "Error parsing number {:?} at line {}: {}", value, line, error), + Error::ParseFloat { error, line, value } => { + write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) + } Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 4dfdca2c7..d57bf1ffb 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -1,18 +1,13 @@ use std::collections::BTreeMap; -use std::io::Cursor; -use std::io::Write; use std::fmt; +use std::io::{Cursor, Write}; use byteorder::WriteBytesExt; +use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; use serde::Deserialize; -use serde::de::DeserializeSeed; -use serde::de::MapAccess; -use serde::de::SeqAccess; -use serde::de::Visitor; use serde_json::Value; -use super::Error; -use super::{ByteCounter, DocumentsBatchIndex}; +use super::{ByteCounter, DocumentsBatchIndex, Error}; use crate::FieldId; macro_rules! tri { @@ -31,8 +26,9 @@ impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - deserializer.deserialize_str(self) + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(self) } } @@ -43,7 +39,7 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { where E: serde::de::Error, { - Ok(self.0.insert(v)) + Ok(self.0.insert(v)) } fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -58,8 +54,9 @@ impl<'de> DeserializeSeed<'de> for ValueDeserializer { fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - serde_json::Value::deserialize(deserializer) + D: serde::Deserializer<'de>, + { + serde_json::Value::deserialize(deserializer) } } @@ -80,7 +77,9 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { where A: SeqAccess<'de>, { - while let Some(v) = seq.next_element_seed(&mut *self)? { tri!(v) } + while let Some(v) = seq.next_element_seed(&mut *self)? { + tri!(v) + } Ok(Ok(())) } @@ -89,7 +88,9 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { where A: MapAccess<'de>, { - while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? { + while let Some((key, value)) = + map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? 
+ { self.values.insert(key, value); } @@ -119,13 +120,15 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { } impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> -where W: Write, +where + W: Write, { type Value = Result<(), Error>; fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - deserializer.deserialize_map(self) + D: serde::Deserializer<'de>, + { + deserializer.deserialize_map(self) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 17c778060..440546b10 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -906,8 +906,9 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); + let big_object = serde_json::to_string(&big_object).unwrap(); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(big_object).unwrap(); + builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index eac60effb..08aa72d35 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -544,7 +544,8 @@ mod test { mod primary_key_inference { use bimap::BiHashMap; - use crate::{documents::DocumentsBatchIndex, update::index_documents::transform::find_primary_key}; + use crate::documents::DocumentsBatchIndex; + use crate::update::index_documents::transform::find_primary_key; #[test] fn primary_key_infered_on_first_field() { From 9b8ab40d80c0fdf6cfeb95270ebd7bdbb4cf386a Mon Sep 17 00:00:00 2001 From: marin postma Date: Tue, 26 Oct 2021 11:35:49 +0200 Subject: [PATCH 1113/1889] remove search folder --- search/Cargo.toml | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 search/Cargo.toml diff --git a/search/Cargo.toml b/search/Cargo.toml deleted file mode 100644 index d9441984e..000000000 --- a/search/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "search" -version = "0.19.0" -authors = ["Clément Renault "] -edition = "2018" - -[dependencies] -anyhow = "1.0.38" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } -log = "0.4.14" -milli = { path = "../milli" } -serde_json = "1.0.62" -stderrlog = "0.5.1" -structopt = { version = "0.3.21", default-features = false } - -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2" From 3599df77f05f4e89a28c0160411e95a840e0b227 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 26 Oct 2021 17:49:35 +0200 Subject: [PATCH 1114/1889] Change some error messages --- milli/src/asc_desc.rs | 12 +- milli/src/criterion.rs | 14 ++- milli/src/error.rs | 112 ++++++++++++------ milli/src/search/facet/filter_condition.rs | 104 ++++++---------- milli/src/update/index_documents/transform.rs | 6 +- milli/src/update/settings.rs | 8 +- 6 files changed, 138 insertions(+), 118 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index 00f65a459..c0a277c0c 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -28,12 +28,12 @@ impl fmt::Display for AscDescError { write!(f, "Longitude must be contained between -180 and 180 degrees.",) } Self::InvalidSyntax { name } => { - write!(f, "invalid asc/desc syntax for {}.", name) + write!(f, 
"invalid asc/desc syntax for `{}`.", name) } Self::ReservedKeyword { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a asc/desc rule.", + "`{}` is a reserved keyword and thus can't be used as a asc/desc rule.", name ) } @@ -192,18 +192,18 @@ impl fmt::Display for SortError { Self::BadGeoPointUsage { name } => { write!( f, - "invalid syntax for the `_geoPoint` parameter: `{}`. \ + "Invalid syntax for the `_geoPoint` parameter: `{}`. \ Usage: `_geoPoint(latitude, longitude):asc`.", name ) } Self::InvalidName { name } => { - write!(f, "invalid syntax for the sort parameter `{}`.", name) + write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name) } Self::ReservedName { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a sort expression.", + "`{}` is a reserved keyword and thus can't be used as a sort expression.", name ) } @@ -211,7 +211,7 @@ impl fmt::Display for SortError { write!( f, "`{}` is a reserved keyword and thus can't be used as a sort expression. \ - Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.", + Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.", name, ) } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index aff7fcf68..0586fcc0f 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -19,21 +19,25 @@ impl fmt::Display for CriterionError { match self { Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name), Self::ReservedName { name } => { - write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name) + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a ranking rule", + name + ) } Self::ReservedNameForSort { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a ranking rule. \ -{} can only be used for sorting at search time", + "`{}` is a reserved keyword and thus can't be used as a ranking rule. \ +`{}` can only be used for sorting at search time", name, name ) } Self::ReservedNameForFilter { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a ranking rule. \ -{} can only be used for filtering at search time", + "`{}` is a reserved keyword and thus can't be used as a ranking rule. 
\ +`{}` can only be used for filtering at search time", name, name ) } diff --git a/milli/src/error.rs b/milli/src/error.rs index 1f1cc5264..a4125d117 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -59,23 +59,28 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, - InvalidFilter(pest::error::Error), - InvalidFilterAttribute(pest::error::Error), + InvalidFilter(FilterError), InvalidGeoField { document_id: Value, object: Value }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, MaxDatabaseSizeReached, - MissingDocumentId { document: Object }, + MissingDocumentId { primary_key: String, document: Object }, MissingPrimaryKey, NoSpaceLeftOnDevice, - PrimaryKeyCannotBeChanged, - PrimaryKeyCannotBeReset, + PrimaryKeyCannotBeChanged(String), SerdeJson(serde_json::Error), SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, } +#[derive(Debug)] +pub enum FilterError { + InvalidAttribute { field: String, valid_fields: HashSet }, + ReservedKeyword { field: String, context: Option }, + Syntax(pest::error::Error), +} + impl From for Error { fn from(error: io::Error) -> Error { // TODO must be improved and more precise @@ -160,6 +165,12 @@ impl From for Error { } } +impl From for Error { + fn from(error: FilterError) -> Error { + Error::UserError(UserError::InvalidFilter(error)) + } +} + impl From for Error { fn from(error: SerializationError) -> Error { Error::InternalError(InternalError::Serialization(error)) @@ -169,7 +180,7 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InternalError(error) => write!(f, "internal: {}", error), + Self::InternalError(error) => write!(f, "internal: {}.", error), Self::IoError(error) => error.fmt(f), Self::UserError(error) => error.fmt(f), } @@ -182,15 +193,15 @@ impl fmt::Display for InternalError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::DatabaseMissingEntry { db_name, key } => { - write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) + write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) } Self::FieldIdMapMissingEntry(error) => error.fmt(f), Self::Fst(error) => error.fmt(f), Self::GrenadInvalidCompressionType => { - f.write_str("invalid compression type have been specified to grenad") + f.write_str("Invalid compression type have been specified to grenad.") } Self::IndexingMergingKeys { process } => { - write!(f, "invalid merge while processing {}", process) + write!(f, "Invalid merge while processing {}.", process) } Self::Serialization(error) => error.fmt(f), Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), @@ -208,67 +219,100 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), + Self::AttributeLimitReached => f.write_str("Maximum number of attributes reached."), Self::CriterionError(error) => write!(f, "{}", error), - Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), + Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), Self::InvalidFacetsDistribution { invalid_facets_name } => { let name_list = invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", "); 
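                // e.g. with a single unfilterable field `title`, this renders as:
                // "Invalid facet distribution, the fields `title` are not set as filterable."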
write!( f, - "invalid facet distribution, the fields {} are not set as filterable", + "Invalid facet distribution, the fields `{}` are not set as filterable.", name_list ) } Self::InvalidFilter(error) => error.fmt(f), Self::InvalidGeoField { document_id, object } => write!( f, - "the document with the id: {} contains an invalid _geo field: {}", + "The document with the id: `{}` contains an invalid _geo field: `{}`.", document_id, object ), Self::InvalidDocumentId { document_id } => { - let json = serde_json::to_string(document_id).unwrap(); + let document_id = match document_id { + Value::String(id) => id.clone(), + _ => document_id.to_string(), + }; write!( f, - "document identifier is invalid {}, \ -a document id can be of type integer or string \ -only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)", - json + "Document identifier `{}` is invalid. \ +A document identifier can be of type integer or string, \ +only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", + document_id ) } - Self::InvalidFilterAttribute(error) => error.fmt(f), Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); write!( f, - "Attribute {} is not sortable, available sortable attributes are: {}", + "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.", field, valid_names ) } Self::SortRankingRuleMissing => f.write_str( - "You must specify where \"sort\" is listed in the \ -rankingRules setting to use the sort parameter at search time", + "The sort ranking rule must be specified in the \ +ranking rules settings to use the sort parameter at search time.", ), - Self::MissingDocumentId { document } => { + Self::MissingDocumentId { primary_key, document } => { let json = serde_json::to_string(document).unwrap(); - write!(f, "document doesn't have an identifier {}", json) + write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) } - Self::MissingPrimaryKey => f.write_str("missing primary key"), - Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), + Self::MissingPrimaryKey => f.write_str("Missing primary key."), + Self::MaxDatabaseSizeReached => f.write_str("Maximum database size reached."), // TODO where can we find it instead of writing the text ourselves? 
- Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), - Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), - Self::PrimaryKeyCannotBeChanged => { - f.write_str("primary key cannot be changed if the database contains documents") - } - Self::PrimaryKeyCannotBeReset => { - f.write_str("primary key cannot be reset if the database contains documents") + Self::NoSpaceLeftOnDevice => f.write_str("No space left on device."), + Self::InvalidStoreFile => f.write_str("Store file is not a valid database file."), + Self::PrimaryKeyCannotBeChanged(primary_key) => { + write!(f, "Index already has a primary key: `{}`.", primary_key) } Self::SerdeJson(error) => error.fmt(f), Self::SortError(error) => write!(f, "{}", error), Self::UnknownInternalDocumentId { document_id } => { - write!(f, "an unknown internal document id have been used ({})", document_id) + write!(f, "An unknown internal document id have been used: `{}`.", document_id) + } + } + } +} + +impl fmt::Display for FilterError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::InvalidAttribute { field, valid_fields } => write!( + f, + "Attribute `{}` is not filterable. Available filterable attributes are: `{}`.", + field, + valid_fields + .clone() + .into_iter() + .reduce(|left, right| left + "`, `" + &right) + .unwrap_or_default() + ), + Self::ReservedKeyword { field, context: Some(context) } => { + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a filter expression. {}", + field, context + ) + } + Self::ReservedKeyword { field, context: None } => { + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a filter expression.", + field + ) + } + Self::Syntax(syntax_helper) => { + write!(f, "Invalid syntax for the filter parameter: `{}`.", syntax_helper) } } } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f1055b2f8..3378054d4 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -6,7 +6,6 @@ use std::str::FromStr; use either::Either; use heed::types::DecodeIgnore; -use itertools::Itertools; use log::debug; use pest::error::{Error as PestError, ErrorVariant}; use pest::iterators::{Pair, Pairs}; @@ -17,7 +16,7 @@ use self::FilterCondition::*; use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; -use crate::error::UserError; +use crate::error::FilterError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; @@ -117,8 +116,7 @@ impl FilterCondition { ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - let lexed = - FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; + let lexed = FilterParser::parse(Rule::prgm, expression).map_err(FilterError::Syntax)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } @@ -169,15 +167,11 @@ impl FilterCondition { item: Pair, ) -> Result { if !filterable_fields.contains("_geo") { - return Err(UserError::InvalidFilterAttribute(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `_geo` is not filterable, available filterable attributes are: {}", - filterable_fields.iter().join(", "), - ), - }, - item.as_span(), - )))?; + return Err(FilterError::InvalidAttribute { + field: "_geo".to_string(), + valid_fields: 
filterable_fields.clone(), + } + .into()); } let mut items = item.into_inner(); let fid = match fields_ids_map.id("_geo") { @@ -194,27 +188,27 @@ impl FilterCondition { .map(|param| (param.clone(), param.as_span())) .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) .collect::, _>>() - .map_err(UserError::InvalidFilter)?; + .map_err(FilterError::Syntax)?; if parameters.len() != 3 { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), }, // we want to point to the last parameters and if there was no parameters we // point to the parenthesis parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), - )))?; + )).into()); } let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); if !(-90.0..=90.0).contains(&lat.0) { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("Latitude must be contained between -90 and 90 degrees."), }, lat.1.clone(), )))?; } else if !(-180.0..=180.0).contains(&lng.0) { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("Longitude must be contained between -180 and 180 degrees."), }, @@ -230,9 +224,7 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; @@ -240,8 +232,8 @@ impl FilterCondition { let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); - let lvalue = lresult.map_err(UserError::InvalidFilter)?; - let rvalue = rresult.map_err(UserError::InvalidFilter)?; + let lvalue = lresult.map_err(FilterError::Syntax)?; + let rvalue = rresult.map_err(FilterError::Syntax)?; Ok(Operator(fid, Between(lvalue, rvalue))) } @@ -252,9 +244,7 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; @@ -272,16 +262,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, GreaterThan(value))) } @@ -292,16 +280,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? 
{ Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, GreaterThanOrEqual(value))) } @@ -312,16 +298,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, LowerThan(value))) } @@ -332,16 +316,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, LowerThanOrEqual(value))) } @@ -598,43 +580,27 @@ fn field_id( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, items: &mut Pairs, -) -> StdResult, PestError> { +) -> StdResult, FilterError> { // lexing ensures that we at least have a key let key = items.next().unwrap(); if key.as_rule() == Rule::reserved { - let message = match key.as_str() { + return match key.as_str() { key if key.starts_with("_geoPoint") => { - format!( - "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. \ - Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", - ) + Err(FilterError::ReservedKeyword { field: "_geoPoint".to_string(), context: Some("Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.".to_string()) }) } - key @ "_geo" => { - format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression. 
\ - Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", - key - ) + "_geo" => { + Err(FilterError::ReservedKeyword { field: "_geo".to_string(), context: Some("Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.".to_string()) }) } - key => format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression.", - key - ), + key => + Err(FilterError::ReservedKeyword { field: key.to_string(), context: None }), }; - return Err(PestError::new_from_span(ErrorVariant::CustomError { message }, key.as_span())); } if !filterable_fields.contains(key.as_str()) { - return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` is not filterable, available filterable attributes are: {}.", - key.as_str(), - filterable_fields.iter().join(", "), - ), - }, - key.as_span(), - )); + return Err(FilterError::InvalidAttribute { + field: key.as_str().to_string(), + valid_fields: filterable_fields.clone(), + }); } Ok(fields_ids_map.id(key.as_str())) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 08aa72d35..855fb8db9 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -187,7 +187,11 @@ impl Transform<'_, '_> { } } - return Err(UserError::MissingDocumentId { document: json }.into()); + return Err(UserError::MissingDocumentId { + primary_key: primary_key_name, + document: json, + } + .into()); } let uuid = diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index dee63c726..94875a079 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -465,7 +465,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_primary_key(self.wtxn, primary_key)?; Ok(()) } else { - Err(UserError::PrimaryKeyCannotBeChanged.into()) + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) } } Setting::Reset => { @@ -473,7 +474,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.delete_primary_key(self.wtxn)?; Ok(()) } else { - Err(UserError::PrimaryKeyCannotBeReset.into()) + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) } } Setting::NotSet => Ok(()), @@ -1105,7 +1107,7 @@ mod tests { builder.reset_primary_key(); let err = builder.execute(|_, _| ()).unwrap_err(); - assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset))); + assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); wtxn.abort().unwrap(); // But if we clear the database... 
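The patch above replaces the two pest-based variants `InvalidFilter` and `InvalidFilterAttribute` with a single dedicated `FilterError` enum, wired into the existing hierarchy through a `From<FilterError> for Error` impl so filter-parsing code can bubble errors up with `?` or `.into()`. Below is a minimal, self-contained sketch of that pattern; the `Error`/`UserError` stand-ins are simplified, and the `Syntax` variant holds a plain `String` here instead of a `pest::error::Error<Rule>` to keep the sketch dependency-free:

```rust
use std::collections::HashSet;
use std::fmt;

// Simplified stand-ins for milli's error hierarchy (sketch only).
#[derive(Debug)]
enum Error {
    UserError(UserError),
}

#[derive(Debug)]
enum UserError {
    InvalidFilter(FilterError),
}

// Dedicated filter error, mirroring the variants added by the patch.
// `Syntax` is simplified to a String here (assumption for the sketch).
#[derive(Debug)]
enum FilterError {
    InvalidAttribute { field: String, valid_fields: HashSet<String> },
    ReservedKeyword { field: String, context: Option<String> },
    Syntax(String),
}

// The From impl is what lets parsing code write `return Err(err.into())`
// (or use `?`) and still surface a top-level `Error::UserError`.
impl From<FilterError> for Error {
    fn from(error: FilterError) -> Error {
        Error::UserError(UserError::InvalidFilter(error))
    }
}

impl fmt::Display for FilterError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::InvalidAttribute { field, valid_fields } => {
                // HashSet iteration order is unspecified, so the sketch sorts
                // the names for deterministic messages before joining them.
                let mut fields: Vec<_> = valid_fields.iter().cloned().collect();
                fields.sort();
                write!(
                    f,
                    "Attribute `{}` is not filterable. Available filterable attributes are: `{}`.",
                    field,
                    fields.join("`, `")
                )
            }
            Self::ReservedKeyword { field, context: Some(context) } => write!(
                f,
                "`{}` is a reserved keyword and thus can't be used as a filter expression. {}",
                field, context
            ),
            Self::ReservedKeyword { field, context: None } => write!(
                f,
                "`{}` is a reserved keyword and thus can't be used as a filter expression.",
                field
            ),
            Self::Syntax(detail) => {
                write!(f, "Invalid syntax for the filter parameter: `{}`.", detail)
            }
        }
    }
}

fn check_attribute(field: &str, filterable: &HashSet<String>) -> Result<(), Error> {
    if !filterable.contains(field) {
        // `.into()` converts FilterError -> Error through the From impl.
        return Err(FilterError::InvalidAttribute {
            field: field.to_string(),
            valid_fields: filterable.clone(),
        }
        .into());
    }
    Ok(())
}

fn main() {
    let filterable: HashSet<String> = vec!["price".to_string()].into_iter().collect();
    if let Err(Error::UserError(UserError::InvalidFilter(error))) =
        check_attribute("_geo", &filterable)
    {
        // Prints: Attribute `_geo` is not filterable.
        // Available filterable attributes are: `price`.
        println!("{}", error);
    }
}
```

Note that the explicit sort is only needed because a `HashSet` iterates in unspecified order; patch 1117 below switches `valid_fields` to a `BTreeSet`, which addresses the same determinism concern without it.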
From 2be755ce75d8b1e64c11bf1801a1f3474beb3362 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 27 Oct 2021 19:50:41 +0200 Subject: [PATCH 1115/1889] Lower error check, already check in meilisearch --- milli/src/asc_desc.rs | 46 ---------------------- milli/src/search/facet/filter_condition.rs | 43 ++------------------ 2 files changed, 4 insertions(+), 85 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index c0a277c0c..8d4973c2f 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -318,50 +318,4 @@ mod tests { ); } } - - #[test] - fn sort_error_message() { - let errors = [ - ( - AscDescError::InvalidSyntax { name: S("truc:machin") }, - S("invalid syntax for the sort parameter `truc:machin`."), - ), - ( - AscDescError::InvalidSyntax { name: S("hello:world") }, - S("invalid syntax for the sort parameter `hello:world`."), - ), - ( - AscDescError::ReservedKeyword { name: S("_geo") }, - S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), - ), - ( - AscDescError::ReservedKeyword { name: S("_geoDistance") }, - S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.") - ), - ( - AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") }, - S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), - ), - ( - AscDescError::InvalidLatitude, - S("Latitude must be contained between -90 and 90 degrees."), - ), - ( - AscDescError::InvalidLongitude, - S("Longitude must be contained between -180 and 180 degrees."), - ), - ]; - - for (asc_desc_error, expected_message) in errors { - let sort_error = SortError::from(asc_desc_error); - assert_eq!( - sort_error.to_string(), - expected_message, - "was expecting {} for the error {:?} but instead got {}", - expected_message, - sort_error, - sort_error.to_string() - ); - } - } } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 3378054d4..f8d40aefb 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -755,39 +755,13 @@ mod tests { let index = Index::new(options, &path).unwrap(); let rtxn = index.read_txn().unwrap(); - let error = FilterCondition::from_str(&rtxn, &index, "_geo = 12").unwrap_err(); - assert!(error - .to_string() - .contains("`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, "_geo = 12").is_err()); - let error = - FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoDistance` is a reserved keyword and thus can't be used as a filter expression."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).is_err()); - let error = FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).is_err()); - let error = - FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).is_err()); } #[test] @@ -804,15 +778,6 @@ mod tests { builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - let rtxn = index.read_txn().unwrap(); - // _geo is not filterable - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 12, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("attribute `_geo` is not filterable, available filterable attributes are:"),); - let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); From 183d3dada7d50c2bc8a129805e8dfae4bb3255dd Mon Sep 17 00:00:00 2001 From: marin postma Date: Thu, 28 Oct 2021 10:33:04 +0200 Subject: [PATCH 1116/1889] return document count from builder --- milli/src/documents/builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 6ba890b79..f95fa9190 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -62,7 +62,7 @@ impl DocumentBatchBuilder { /// This method must be called after the document addition is terminated. It will put the /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. - pub fn finish(self) -> Result<(), Error> { + pub fn finish(self) -> Result { let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; let meta = DocumentsMetadata { count, index }; @@ -74,7 +74,7 @@ impl DocumentBatchBuilder { writer.flush()?; - Ok(()) + Ok(count) } /// Extends the builder with json documents from a reader. 
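Patch 1116 above changes the batch builder's `finish` to return the number of added documents instead of `()`. The sketch below, built from hypothetical types and an assumed header layout (a `u64` metadata offset reserved at the head of the file, not milli's actual on-disk format), illustrates the pattern the doc comment describes: stream the payload, append the metadata at the end, back-patch the metadata offset into the header, and hand the count back to the caller:

```rust
use std::io::{self, Cursor, Seek, SeekFrom, Write};

// Hypothetical stand-in for the document batch builder (sketch only).
struct BatchBuilder<W: Write + Seek> {
    writer: W,
    count: usize,
}

impl<W: Write + Seek> BatchBuilder<W> {
    fn new(mut writer: W) -> io::Result<Self> {
        // Reserve room for the metadata offset at the beginning of the file.
        writer.write_all(&0u64.to_be_bytes())?;
        Ok(Self { writer, count: 0 })
    }

    fn add_document(&mut self, doc: &[u8]) -> io::Result<()> {
        self.writer.write_all(doc)?;
        self.count += 1;
        Ok(())
    }

    /// Must be called once the document addition is terminated: appends the
    /// metadata, back-patches its offset into the header, and returns the
    /// number of documents written.
    fn finish(mut self) -> io::Result<usize> {
        let meta_offset = self.writer.stream_position()?;
        // The "metadata" here is just the count; milli's builder serializes
        // a richer DocumentsMetadata { count, index } struct at this point.
        self.writer.write_all(&(self.count as u64).to_be_bytes())?;
        self.writer.seek(SeekFrom::Start(0))?;
        self.writer.write_all(&meta_offset.to_be_bytes())?;
        self.writer.flush()?;
        Ok(self.count)
    }
}

fn main() -> io::Result<()> {
    let mut builder = BatchBuilder::new(Cursor::new(Vec::new()))?;
    builder.add_document(br#"{"id": 1}"#)?;
    builder.add_document(br#"{"id": 2}"#)?;
    let count = builder.finish()?;
    assert_eq!(count, 2);
    Ok(())
}
```

Returning the count from `finish` costs nothing, since the builder already tracks it for the metadata, and it spares callers from counting additions on their side.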
From ed6db196810f78632758fc386f8a7f5f6cd6f357 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 28 Oct 2021 11:18:32 +0200 Subject: [PATCH 1117/1889] Fix PR comments --- milli/src/error.rs | 8 ++++---- milli/src/search/facet/filter_condition.rs | 22 +++++++++++++++------- milli/src/search/mod.rs | 4 ++-- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index a4125d117..be8458ce6 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::convert::Infallible; use std::error::Error as StdError; use std::{fmt, io, str}; @@ -58,10 +58,10 @@ pub enum UserError { CriterionError(CriterionError), DocumentLimitReached, InvalidDocumentId { document_id: Value }, - InvalidFacetsDistribution { invalid_facets_name: HashSet }, + InvalidFacetsDistribution { invalid_facets_name: BTreeSet }, InvalidFilter(FilterError), InvalidGeoField { document_id: Value, object: Value }, - InvalidSortableAttribute { field: String, valid_fields: HashSet }, + InvalidSortableAttribute { field: String, valid_fields: BTreeSet }, SortRankingRuleMissing, InvalidStoreFile, MaxDatabaseSizeReached, @@ -76,7 +76,7 @@ pub enum UserError { #[derive(Debug)] pub enum FilterError { - InvalidAttribute { field: String, valid_fields: HashSet }, + InvalidAttribute { field: String, valid_fields: BTreeSet }, ReservedKeyword { field: String, context: Option }, Syntax(pest::error::Error), } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f8d40aefb..cd7bcdc4f 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -169,7 +169,7 @@ impl FilterCondition { if !filterable_fields.contains("_geo") { return Err(FilterError::InvalidAttribute { field: "_geo".to_string(), - valid_fields: filterable_fields.clone(), + valid_fields: filterable_fields.into_iter().cloned().collect(), } .into()); } @@ -192,7 +192,7 @@ impl FilterCondition { if parameters.len() != 3 { return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { - message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), + message: format!("The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"), }, // we want to point to the last parameters and if there was no parameters we // point to the parenthesis @@ -599,7 +599,7 @@ fn field_id( if !filterable_fields.contains(key.as_str()) { return Err(FilterError::InvalidAttribute { field: key.as_str().to_string(), - valid_fields: filterable_fields.clone(), + valid_fields: filterable_fields.into_iter().cloned().collect(), }); } @@ -829,26 +829,34 @@ mod tests { let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); assert!(result.is_err()); let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + assert!(error.to_string().contains( + "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)" + )); // georadius don't have any parameters let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); assert!(result.is_err()); let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + assert!(error.to_string().contains( + "The _geoRadius filter 
expect three arguments: _geoRadius(latitude, longitude, radius)" + )); // georadius don't have enough parameters let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); assert!(result.is_err()); let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + assert!(error.to_string().contains( + "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)" + )); // georadius have too many parameters let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); assert!(result.is_err()); let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + assert!(error.to_string().contains( + "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)" + )); // georadius have a bad latitude let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index bec059d46..aa2544091 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -151,13 +151,13 @@ impl<'a> Search<'a> { Member::Field(ref field) if !sortable_fields.contains(field) => { return Err(UserError::InvalidSortableAttribute { field: field.to_string(), - valid_fields: sortable_fields, + valid_fields: sortable_fields.into_iter().collect(), })? } Member::Geo(_) if !sortable_fields.contains("_geo") => { return Err(UserError::InvalidSortableAttribute { field: "_geo".to_string(), - valid_fields: sortable_fields, + valid_fields: sortable_fields.into_iter().collect(), })? } _ => (), From 9f1e0d2a49447f106277b8a07e0bba65370b47c8 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 28 Oct 2021 14:47:17 +0200 Subject: [PATCH 1118/1889] Refine asc/desc error messages --- milli/src/asc_desc.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index 8d4973c2f..f07e1ded8 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -28,7 +28,7 @@ impl fmt::Display for AscDescError { write!(f, "Longitude must be contained between -180 and 180 degrees.",) } Self::InvalidSyntax { name } => { - write!(f, "invalid asc/desc syntax for `{}`.", name) + write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name) } Self::ReservedKeyword { name } => { write!( @@ -192,8 +192,8 @@ impl fmt::Display for SortError { Self::BadGeoPointUsage { name } => { write!( f, - "Invalid syntax for the `_geoPoint` parameter: `{}`. 
\ - Usage: `_geoPoint(latitude, longitude):asc`.", + "Invalid syntax for the geo parameter: expected expression formated like \ + `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.", name ) } From 056ff13c4d981a37fce66b6f1c7eee6a375920d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 28 Oct 2021 14:52:57 +0200 Subject: [PATCH 1119/1889] Update version for the next release (v0.20.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 341b8eb7c..2e71dfb21 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.19.0" +version = "0.20.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 310388e01..824771ce7 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.19.0" +version = "0.20.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 726fa9c5f..461c22de0 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.19.0" +version = "0.20.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 209c8b1f7..375cbf0dc 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.19.0" +version = "0.20.0" authors = ["Kerollmops "] edition = "2018" From 76a2adb7c38b1ca15b7f5868de8dbe360a58f281 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Nov 2021 17:35:17 +0100 Subject: [PATCH 1120/1889] re-enable the tests in the parser and start the creation of an error type --- filter_parser/src/condition.rs | 15 +-- filter_parser/src/lib.rs | 103 ++++++++++++++++----- filter_parser/src/value.rs | 2 +- milli/src/search/facet/filter_condition.rs | 2 +- 4 files changed, 86 insertions(+), 36 deletions(-) diff --git a/filter_parser/src/condition.rs b/filter_parser/src/condition.rs index b8d0e1efc..c7a9a85a0 100644 --- a/filter_parser/src/condition.rs +++ b/filter_parser/src/condition.rs @@ -3,20 +3,16 @@ //! ```text //! condition = value ("==" | ">" ...) value //! to = value value TO value -//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* -//! singleQuoted = "'" .* all but quotes "'" -//! doubleQuoted = "\"" (word | spaces)* "\"" -//! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) //! 
``` use nom::branch::alt; use nom::bytes::complete::tag; +use nom::combinator::cut; use nom::sequence::tuple; use nom::IResult; use Condition::*; -use crate::{parse_value, ws, FPError, FilterCondition, Span, Token}; +use crate::{parse_value, FPError, FilterCondition, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { @@ -50,8 +46,7 @@ pub fn parse_condition<'a, E: FPError<'a>>( input: Span<'a>, ) -> IResult, FilterCondition, E> { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); - let (input, (key, op, value)) = - tuple((|c| parse_value(c), operator, |c| parse_value(c)))(input)?; + let (input, (key, op, value)) = tuple((|c| parse_value(c), operator, cut(parse_value)))(input)?; let fid = key; @@ -81,9 +76,7 @@ pub fn parse_condition<'a, E: FPError<'a>>( /// to = value value TO value pub fn parse_to<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { let (input, (key, from, _, to)) = - tuple((ws(|c| parse_value(c)), ws(|c| parse_value(c)), tag("TO"), ws(|c| parse_value(c))))( - input, - )?; + tuple((|c| parse_value(c), |c| parse_value(c), tag("TO"), cut(parse_value)))(input)?; Ok(( input, diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 4623f9387..5b8107b82 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -1,44 +1,50 @@ //! BNF grammar: //! //! ```text +//! filter = expression ~ EOF //! expression = or //! or = and (~ "OR" ~ and) //! and = not (~ "AND" not)* //! not = ("NOT" | "!") not | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius +//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to //! condition = value ("==" | ">" ...) value //! to = value value TO value //! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" (word | spaces)* "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +//! geoRadius = WS* ~ "_geoRadius(" ~ float ~ "," ~ float ~ "," float ~ ")" +//! ``` +//! +//! Other BNF grammar used to handle some specific errors: +//! ```text +//! geoPoint = WS* ~ "_geoPoint(" ~ (float ~ ",")* ~ ")" //! 
``` mod condition; +mod error; mod value; + use std::fmt::Debug; pub use condition::{parse_condition, parse_to, Condition}; +pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{char, multispace0}; -use nom::combinator::map; -use nom::error::{ContextError, Error, ErrorKind, VerboseError}; +use nom::combinator::{cut, eof, map}; +use nom::error::{ContextError, ParseError}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; -use nom::sequence::{delimited, preceded, tuple}; +use nom::sequence::{delimited, preceded, terminated, tuple}; use nom::{Finish, IResult}; -use nom_greedyerror::GreedyError; use nom_locate::LocatedSpan; pub(crate) use value::parse_value; -pub type Span<'a> = LocatedSpan<&'a str>; +pub type Span<'a> = LocatedSpan<&'a str, &'a str>; -pub trait FilterParserError<'a>: nom::error::ParseError> + ContextError> {} -impl<'a> FilterParserError<'a> for GreedyError, ErrorKind> {} -impl<'a> FilterParserError<'a> for VerboseError> {} -impl<'a> FilterParserError<'a> for Error> {} +pub trait FilterParserError<'a>: ParseError> + ContextError> {} +impl<'a, T> FilterParserError<'a> for T where T: ParseError> + ContextError> {} use FilterParserError as FPError; @@ -94,8 +100,8 @@ impl<'a> FilterCondition<'a> { if input.trim().is_empty() { return Ok(Self::Empty); } - let span = Span::new(input); - parse_expression::<'a, E>(span).finish().map(|(_rem, output)| output) + let span = Span::new_extra(input, input); + parse_filter::<'a, E>(span).finish().map(|(_rem, output)| output) } } @@ -109,7 +115,7 @@ fn ws<'a, O, E: FPError<'a>>( /// and = not (~ "AND" not)* fn parse_or<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { let (input, lhs) = parse_and(input)?; - let (input, ors) = many0(preceded(ws(tag("OR")), |c| parse_and(c)))(input)?; + let (input, ors) = many0(preceded(ws(tag("OR")), cut(parse_and)))(input)?; let expr = ors .into_iter() @@ -119,7 +125,7 @@ fn parse_or<'a, E: FPError<'a>>(input: Span<'a>) -> IResult>(input: Span<'a>) -> IResult { let (input, lhs) = parse_not(input)?; - let (input, ors) = many0(preceded(ws(tag("AND")), |c| parse_not(c)))(input)?; + let (input, ors) = many0(preceded(ws(tag("AND")), cut(parse_not)))(input)?; let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); @@ -128,9 +134,10 @@ fn parse_and<'a, E: FPError<'a>>(input: Span<'a>) -> IResult>(input: Span<'a>) -> IResult { - alt((map(preceded(alt((tag("!"), tag("NOT"))), |c| parse_not(c)), |e| e.negate()), |c| { - parse_primary(c) - }))(input) + alt(( + map(preceded(alt((tag("!"), tag("NOT"))), cut(parse_not)), |e| e.negate()), + cut(parse_primary), + ))(input) } /// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) @@ -140,7 +147,7 @@ fn parse_geo_radius<'a, E: FPError<'a>>(input: Span<'a>) -> IResult, Fi // we want to forbid space BEFORE the _geoRadius but not after let parsed = preceded::<_, _, _, _, _, _>( tuple((multispace0, tag("_geoRadius"))), - delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')')), + cut(delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')'))), )(input); let (input, args): (Span, Vec) = parsed?; @@ -157,13 +164,13 @@ fn parse_geo_radius<'a, E: FPError<'a>>(input: Span<'a>) -> IResult, Fi Ok((input, res)) } -/// primary = (WS* ~ "(" expression ")" ~ WS*) | condition | to | geoRadius +/// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to fn 
parse_primary<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { alt(( - delimited(ws(char('(')), |c| parse_expression(c), ws(char(')'))), + delimited(ws(char('(')), cut(parse_expression), cut(ws(char(')')))), + |c| parse_geo_radius(c), |c| parse_condition(c), |c| parse_to(c), - |c| parse_geo_radius(c), ))(input) } @@ -172,6 +179,11 @@ pub fn parse_expression<'a, E: FPError<'a>>(input: Span<'a>) -> IResult>(input: Span<'a>) -> IResult { + terminated(parse_expression, eof)(input) +} + #[cfg(test)] pub mod tests { use super::*; @@ -181,7 +193,8 @@ pub mod tests { // if the string is empty we still need to return 1 for the line number let lines = before.is_empty().then(|| 1).unwrap_or_else(|| before.lines().count()); let offset = before.chars().count(); - unsafe { Span::new_from_raw_offset(offset, lines as u32, value, ()) }.into() + // the extra field is not checked in the tests so we can set it to nothing + unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into() } #[test] @@ -471,4 +484,48 @@ pub mod tests { assert_eq!(filter, expected, "Filter `{}` failed.", input); } } + + #[test] + fn error() { + use FilterCondition as Fc; + + let result = Fc::parse::>("test = truc OR truc"); + assert!(result.is_err()); + + let test_case = [ + // simple test + ("OR", "An error occured"), + ("AND", "An error occured"), + ("channel = Ponce OR", "An error occured"), + ("channel = Ponce = 12", "An error occured"), + ("_geoRadius = 12", "An error occured"), + ("_geoPoint(12, 13, 14)", "An error occured"), + ("_geo = _geoRadius(12, 13, 14)", "An error occured"), + ]; + + for (input, expected) in test_case { + let result = Fc::parse::>(input); + + assert!( + result.is_err(), + "Filter `{:?}` wasn't supposed to be parsed but it did with the following result: `{:?}`", + expected, + result.unwrap() + ); + let filter = result.unwrap_err().to_string(); + assert_eq!(filter, expected, "Filter `{:?}` was supposed to return the following error: `{}`, but instead returned `{}`.", input, filter, expected); + } + } + + /* + #[test] + fn bidule() { + use FilterCondition as Fc; + + let result = Fc::parse::>("test = truc OR truc"); + dbg!(result); + + assert!(false); + } + */ } diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs index 5b3a8dfd1..55c9aec23 100644 --- a/filter_parser/src/value.rs +++ b/filter_parser/src/value.rs @@ -57,7 +57,7 @@ pub mod tests { ]; for (input, expected) in test_case { - let input = Span::new(input); + let input = Span::new_extra(input, input); let result = parse_value::>(input); assert!( diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 42b3fc52d..b61cd451b 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -93,7 +93,7 @@ impl<'a> Filter<'a> { let condition = match FilterCondition::parse::>(expression) { Ok(fc) => Ok(fc), Err(e) => Err(Error::UserError(UserError::InvalidFilter { - input: convert_error(Span::new(expression), e).to_string(), + input: convert_error(Span::new_extra(expression, expression), e).to_string(), })), }?; Ok(Self { condition }) From 5d3af5f2732f878fa3cfa758d5deff8f01a47d2f Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 2 Nov 2021 20:27:07 +0100 Subject: [PATCH 1121/1889] remove all genericity in favor of my custom error type --- filter_parser/src/condition.rs | 9 +++----- filter_parser/src/lib.rs | 41 +++++++++++++++------------------- filter_parser/src/value.rs | 24 +++++++++----------- 3 files changed, 32 insertions(+), 42 
deletions(-) diff --git a/filter_parser/src/condition.rs b/filter_parser/src/condition.rs index c7a9a85a0..faacceb72 100644 --- a/filter_parser/src/condition.rs +++ b/filter_parser/src/condition.rs @@ -9,10 +9,9 @@ use nom::branch::alt; use nom::bytes::complete::tag; use nom::combinator::cut; use nom::sequence::tuple; -use nom::IResult; use Condition::*; -use crate::{parse_value, FPError, FilterCondition, Span, Token}; +use crate::{parse_value, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { @@ -42,9 +41,7 @@ impl<'a> Condition<'a> { } /// condition = value ("==" | ">" ...) value -pub fn parse_condition<'a, E: FPError<'a>>( - input: Span<'a>, -) -> IResult, FilterCondition, E> { +pub fn parse_condition(input: Span) -> IResult { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); let (input, (key, op, value)) = tuple((|c| parse_value(c), operator, cut(parse_value)))(input)?; @@ -74,7 +71,7 @@ pub fn parse_condition<'a, E: FPError<'a>>( } /// to = value value TO value -pub fn parse_to<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { +pub fn parse_to(input: Span) -> IResult { let (input, (key, from, _, to)) = tuple((|c| parse_value(c), |c| parse_value(c), tag("TO"), cut(parse_value)))(input)?; diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 5b8107b82..86c6cd79c 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -37,16 +37,13 @@ use nom::error::{ContextError, ParseError}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, terminated, tuple}; -use nom::{Finish, IResult}; +use nom::Finish; use nom_locate::LocatedSpan; pub(crate) use value::parse_value; pub type Span<'a> = LocatedSpan<&'a str, &'a str>; -pub trait FilterParserError<'a>: ParseError> + ContextError> {} -impl<'a, T> FilterParserError<'a> for T where T: ParseError> + ContextError> {} - -use FilterParserError as FPError; +type IResult<'a, Ret> = nom::IResult, Ret, Error<'a>>; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Token<'a> { @@ -96,24 +93,22 @@ impl<'a> FilterCondition<'a> { } } - pub fn parse>(input: &'a str) -> Result { + pub fn parse(input: &'a str) -> Result { if input.trim().is_empty() { return Ok(Self::Empty); } let span = Span::new_extra(input, input); - parse_filter::<'a, E>(span).finish().map(|(_rem, output)| output) + parse_filter(span).finish().map(|(_rem, output)| output) } } // remove OPTIONAL whitespaces before AND after the the provided parser -fn ws<'a, O, E: FPError<'a>>( - inner: impl FnMut(Span<'a>) -> IResult, -) -> impl FnMut(Span<'a>) -> IResult { +fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) -> IResult { delimited(multispace0, inner, multispace0) } /// and = not (~ "AND" not)* -fn parse_or<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { +fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; let (input, ors) = many0(preceded(ws(tag("OR")), cut(parse_and)))(input)?; @@ -123,7 +118,7 @@ fn parse_or<'a, E: FPError<'a>>(input: Span<'a>) -> IResult>(input: Span<'a>) -> IResult { +fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; let (input, ors) = many0(preceded(ws(tag("AND")), cut(parse_not)))(input)?; let expr = ors @@ -133,7 +128,7 @@ fn parse_and<'a, E: FPError<'a>>(input: Span<'a>) -> IResult>(input: Span<'a>) -> IResult { +fn parse_not(input: Span) -> IResult { alt(( map(preceded(alt((tag("!"), tag("NOT"))), 
cut(parse_not)), |e| e.negate()), cut(parse_primary), @@ -141,7 +136,7 @@ fn parse_not<'a, E: FPError<'a>>(input: Span<'a>) -> IResult>(input: Span<'a>) -> IResult, FilterCondition, E> { +fn parse_geo_radius(input: Span) -> IResult { let err_msg_args_incomplete = "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; // we want to forbid space BEFORE the _geoRadius but not after @@ -153,8 +148,8 @@ fn parse_geo_radius<'a, E: FPError<'a>>(input: Span<'a>) -> IResult, Fi let (input, args): (Span, Vec) = parsed?; if args.len() != 3 { - let e = E::from_char(input, '('); - return Err(nom::Err::Failure(E::add_context(input, err_msg_args_incomplete, e))); + let e = Error::from_char(input, '('); + return Err(nom::Err::Failure(Error::add_context(input, err_msg_args_incomplete, e))); } let res = FilterCondition::GeoLowerThan { @@ -165,7 +160,7 @@ fn parse_geo_radius<'a, E: FPError<'a>>(input: Span<'a>) -> IResult, Fi } /// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to -fn parse_primary<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { +fn parse_primary(input: Span) -> IResult { alt(( delimited(ws(char('(')), cut(parse_expression), cut(ws(char(')')))), |c| parse_geo_radius(c), @@ -175,12 +170,12 @@ fn parse_primary<'a, E: FPError<'a>>(input: Span<'a>) -> IResult>(input: Span<'a>) -> IResult { +pub fn parse_expression(input: Span) -> IResult { parse_or(input) } /// filter = expression ~ EOF -pub fn parse_filter<'a, E: FPError<'a>>(input: Span<'a>) -> IResult { +pub fn parse_filter(input: Span) -> IResult { terminated(parse_expression, eof)(input) } @@ -472,7 +467,7 @@ pub mod tests { ]; for (input, expected) in test_case { - let result = Fc::parse::>(input); + let result = Fc::parse(input); assert!( result.is_ok(), @@ -489,22 +484,22 @@ pub mod tests { fn error() { use FilterCondition as Fc; - let result = Fc::parse::>("test = truc OR truc"); + let result = Fc::parse("test = truc OR truc"); assert!(result.is_err()); let test_case = [ // simple test + ("channel = Ponce = 12", "An error occured"), ("OR", "An error occured"), ("AND", "An error occured"), ("channel = Ponce OR", "An error occured"), - ("channel = Ponce = 12", "An error occured"), ("_geoRadius = 12", "An error occured"), ("_geoPoint(12, 13, 14)", "An error occured"), ("_geo = _geoRadius(12, 13, 14)", "An error occured"), ]; for (input, expected) in test_case { - let result = Fc::parse::>(input); + let result = Fc::parse(input); assert!( result.is_err(), diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs index 55c9aec23..7c708aa73 100644 --- a/filter_parser/src/value.rs +++ b/filter_parser/src/value.rs @@ -2,25 +2,25 @@ use nom::branch::alt; use nom::bytes::complete::{take_till, take_while1}; use nom::character::complete::char; use nom::sequence::delimited; -use nom::IResult; -use crate::{ws, FPError, Span, Token}; +use crate::{ws, Error, IResult, Span, Token}; /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* -pub fn parse_value<'a, E: FPError<'a>>(input: Span<'a>) -> IResult, Token, E> { +pub fn parse_value(input: Span) -> IResult { // singleQuoted = "'" .* all but quotes "'" - let simple_quoted_key = |input| take_till(|c: char| c == '\'')(input); + let simple_quoted = |input| take_till(|c: char| c == '\'')(input); // doubleQuoted = "\"" (word | spaces)* "\"" - let quoted_key = |input| take_till(|c: char| c == '"')(input); + let double_quoted = |input| take_till(|c: char| c == '"')(input); // word = (alphanumeric | _ | - | .)+ let word 
= |input| take_while1(is_key_component)(input); - alt(( - ws(delimited(char('\''), simple_quoted_key, char('\''))), - ws(delimited(char('"'), quoted_key, char('"'))), - ws(word), - ))(input) + ws(alt(( + delimited(char('\''), simple_quoted, char('\'')), + delimited(char('"'), double_quoted, char('"')), + word, + )))(input) .map(|(s, t)| (s, t.into())) + .map_err(|e| e.map(|_| Error::expected_value(input))) } fn is_key_component(c: char) -> bool { @@ -29,8 +29,6 @@ fn is_key_component(c: char) -> bool { #[cfg(test)] pub mod tests { - use nom::error::Error; - use super::*; use crate::tests::rtok; @@ -58,7 +56,7 @@ pub mod tests { for (input, expected) in test_case { let input = Span::new_extra(input, input); - let result = parse_value::>(input); + let result = parse_value(input); assert!( result.is_ok(), From 0c0038488c4881c4b5dd8d1c8e8ba59ea7d5737c Mon Sep 17 00:00:00 2001 From: many Date: Wed, 3 Nov 2021 11:24:06 +0100 Subject: [PATCH 1122/1889] Change last error messages --- milli/src/error.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index be8458ce6..59744a32e 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -232,11 +232,21 @@ impl fmt::Display for UserError { ) } Self::InvalidFilter(error) => error.fmt(f), - Self::InvalidGeoField { document_id, object } => write!( - f, - "The document with the id: `{}` contains an invalid _geo field: `{}`.", - document_id, object - ), + Self::InvalidGeoField { document_id, object } => { + let document_id = match document_id { + Value::String(id) => id.clone(), + _ => document_id.to_string(), + }; + let object = match object { + Value::String(id) => id.clone(), + _ => object.to_string(), + }; + write!( + f, + "The document with the id: `{}` contains an invalid _geo field: `{}`.", + document_id, object + ) + }, Self::InvalidDocumentId { document_id } => { let document_id = match document_id { Value::String(id) => id.clone(), @@ -268,10 +278,9 @@ ranking rules settings to use the sort parameter at search time.", write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) } Self::MissingPrimaryKey => f.write_str("Missing primary key."), - Self::MaxDatabaseSizeReached => f.write_str("Maximum database size reached."), - // TODO where can we find it instead of writing the text ourselves? - Self::NoSpaceLeftOnDevice => f.write_str("No space left on device."), - Self::InvalidStoreFile => f.write_str("Store file is not a valid database file."), + Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."), + Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. 
Consider increasing the size of the disk/partition."), + Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."), Self::PrimaryKeyCannotBeChanged(primary_key) => { write!(f, "Index already has a primary key: `{}`.", primary_key) } From 702589104d45e5c1f51eac2ecaa269727ebf34a8 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 3 Nov 2021 14:20:01 +0100 Subject: [PATCH 1123/1889] Update version for the next release (v0.20.1) --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 24fb214b9..44f7d110b 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.1.0" +version = "0.20.1" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 2e71dfb21..da26a8baa 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.20.0" +version = "0.20.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 824771ce7..43980009a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.20.0" +version = "0.20.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 461c22de0..c96ff094c 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.20.0" +version = "0.20.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 375cbf0dc..fc45b5355 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.20.0" +version = "0.20.1" authors = ["Kerollmops "] edition = "2018" From 7b3bac46a0cc6e58e1ee5456f6aa1550cdf2852a Mon Sep 17 00:00:00 2001 From: many Date: Thu, 4 Nov 2021 13:19:32 +0100 Subject: [PATCH 1124/1889] Change Attribute and Ranking rules errors --- milli/src/criterion.rs | 2 +- milli/src/error.rs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 0586fcc0f..aca2f95b5 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -17,7 +17,7 @@ pub enum CriterionError { impl fmt::Display for CriterionError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name), + Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. 
Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name), Self::ReservedName { name } => { write!( f, diff --git a/milli/src/error.rs b/milli/src/error.rs index 59744a32e..9e8ad515d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -219,7 +219,7 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::AttributeLimitReached => f.write_str("Maximum number of attributes reached."), + Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), Self::InvalidFacetsDistribution { invalid_facets_name } => { @@ -277,7 +277,7 @@ ranking rules settings to use the sort parameter at search time.", let json = serde_json::to_string(document).unwrap(); write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) } - Self::MissingPrimaryKey => f.write_str("Missing primary key."), + Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index."), Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."), Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."), Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."), From 743ed9f57f04decf5fb6b0d43b0a5a609cbebf3c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 4 Nov 2021 14:04:21 +0100 Subject: [PATCH 1125/1889] Bump milli version --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 44f7d110b..eb03842ca 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.20.1" +version = "0.20.2" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index da26a8baa..5b33d2a4f 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 43980009a..04e1c708a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index c96ff094c..645bc4cdd 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index fc45b5355..5aa04e569 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.20.1" +version = "0.20.2" authors = ["Kerollmops "] edition = "2018" From 54aec7ac5f541b0b5a160e3a790a4688613f0d8b Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 14:22:35 +0100 
Subject: [PATCH 1126/1889] update the filter parser and some code for the fuzzer --- filter_parser/fuzz/.gitignore | 3 + filter_parser/fuzz/Cargo.toml | 25 +++ filter_parser/fuzz/corpus/parse/test_1 | 1 + filter_parser/fuzz/corpus/parse/test_10 | 1 + filter_parser/fuzz/corpus/parse/test_11 | 1 + filter_parser/fuzz/corpus/parse/test_12 | 1 + filter_parser/fuzz/corpus/parse/test_13 | 1 + filter_parser/fuzz/corpus/parse/test_14 | 1 + filter_parser/fuzz/corpus/parse/test_15 | 1 + filter_parser/fuzz/corpus/parse/test_16 | 1 + filter_parser/fuzz/corpus/parse/test_17 | 1 + filter_parser/fuzz/corpus/parse/test_18 | 1 + filter_parser/fuzz/corpus/parse/test_19 | 1 + filter_parser/fuzz/corpus/parse/test_2 | 1 + filter_parser/fuzz/corpus/parse/test_20 | 1 + filter_parser/fuzz/corpus/parse/test_21 | 1 + filter_parser/fuzz/corpus/parse/test_22 | 1 + filter_parser/fuzz/corpus/parse/test_23 | 1 + filter_parser/fuzz/corpus/parse/test_24 | 1 + filter_parser/fuzz/corpus/parse/test_25 | 1 + filter_parser/fuzz/corpus/parse/test_26 | 1 + filter_parser/fuzz/corpus/parse/test_27 | 1 + filter_parser/fuzz/corpus/parse/test_28 | 1 + filter_parser/fuzz/corpus/parse/test_29 | 1 + filter_parser/fuzz/corpus/parse/test_3 | 1 + filter_parser/fuzz/corpus/parse/test_30 | 1 + filter_parser/fuzz/corpus/parse/test_31 | 1 + filter_parser/fuzz/corpus/parse/test_32 | 1 + filter_parser/fuzz/corpus/parse/test_33 | 1 + filter_parser/fuzz/corpus/parse/test_34 | 1 + filter_parser/fuzz/corpus/parse/test_35 | 1 + filter_parser/fuzz/corpus/parse/test_36 | 1 + filter_parser/fuzz/corpus/parse/test_37 | 1 + filter_parser/fuzz/corpus/parse/test_38 | 1 + filter_parser/fuzz/corpus/parse/test_39 | 1 + filter_parser/fuzz/corpus/parse/test_4 | 1 + filter_parser/fuzz/corpus/parse/test_40 | 1 + filter_parser/fuzz/corpus/parse/test_41 | 1 + filter_parser/fuzz/corpus/parse/test_42 | 1 + filter_parser/fuzz/corpus/parse/test_43 | 1 + filter_parser/fuzz/corpus/parse/test_5 | 1 + filter_parser/fuzz/corpus/parse/test_6 | 1 + filter_parser/fuzz/corpus/parse/test_7 | 1 + filter_parser/fuzz/corpus/parse/test_8 | 1 + filter_parser/fuzz/corpus/parse/test_9 | 1 + filter_parser/fuzz/fuzz_targets/parse.rs | 13 ++ filter_parser/src/error.rs | 195 +++++++++++++++++++++++ filter_parser/src/lib.rs | 117 +++++++++----- filter_parser/src/main.rs | 11 ++ filter_parser/src/value.rs | 50 ++++-- 50 files changed, 406 insertions(+), 51 deletions(-) create mode 100644 filter_parser/fuzz/.gitignore create mode 100644 filter_parser/fuzz/Cargo.toml create mode 100644 filter_parser/fuzz/corpus/parse/test_1 create mode 100644 filter_parser/fuzz/corpus/parse/test_10 create mode 100644 filter_parser/fuzz/corpus/parse/test_11 create mode 100644 filter_parser/fuzz/corpus/parse/test_12 create mode 100644 filter_parser/fuzz/corpus/parse/test_13 create mode 100644 filter_parser/fuzz/corpus/parse/test_14 create mode 100644 filter_parser/fuzz/corpus/parse/test_15 create mode 100644 filter_parser/fuzz/corpus/parse/test_16 create mode 100644 filter_parser/fuzz/corpus/parse/test_17 create mode 100644 filter_parser/fuzz/corpus/parse/test_18 create mode 100644 filter_parser/fuzz/corpus/parse/test_19 create mode 100644 filter_parser/fuzz/corpus/parse/test_2 create mode 100644 filter_parser/fuzz/corpus/parse/test_20 create mode 100644 filter_parser/fuzz/corpus/parse/test_21 create mode 100644 filter_parser/fuzz/corpus/parse/test_22 create mode 100644 filter_parser/fuzz/corpus/parse/test_23 create mode 100644 filter_parser/fuzz/corpus/parse/test_24 create mode 100644 
filter_parser/fuzz/corpus/parse/test_25 create mode 100644 filter_parser/fuzz/corpus/parse/test_26 create mode 100644 filter_parser/fuzz/corpus/parse/test_27 create mode 100644 filter_parser/fuzz/corpus/parse/test_28 create mode 100644 filter_parser/fuzz/corpus/parse/test_29 create mode 100644 filter_parser/fuzz/corpus/parse/test_3 create mode 100644 filter_parser/fuzz/corpus/parse/test_30 create mode 100644 filter_parser/fuzz/corpus/parse/test_31 create mode 100644 filter_parser/fuzz/corpus/parse/test_32 create mode 100644 filter_parser/fuzz/corpus/parse/test_33 create mode 100644 filter_parser/fuzz/corpus/parse/test_34 create mode 100644 filter_parser/fuzz/corpus/parse/test_35 create mode 100644 filter_parser/fuzz/corpus/parse/test_36 create mode 100644 filter_parser/fuzz/corpus/parse/test_37 create mode 100644 filter_parser/fuzz/corpus/parse/test_38 create mode 100644 filter_parser/fuzz/corpus/parse/test_39 create mode 100644 filter_parser/fuzz/corpus/parse/test_4 create mode 100644 filter_parser/fuzz/corpus/parse/test_40 create mode 100644 filter_parser/fuzz/corpus/parse/test_41 create mode 100644 filter_parser/fuzz/corpus/parse/test_42 create mode 100644 filter_parser/fuzz/corpus/parse/test_43 create mode 100644 filter_parser/fuzz/corpus/parse/test_5 create mode 100644 filter_parser/fuzz/corpus/parse/test_6 create mode 100644 filter_parser/fuzz/corpus/parse/test_7 create mode 100644 filter_parser/fuzz/corpus/parse/test_8 create mode 100644 filter_parser/fuzz/corpus/parse/test_9 create mode 100644 filter_parser/fuzz/fuzz_targets/parse.rs create mode 100644 filter_parser/src/error.rs create mode 100644 filter_parser/src/main.rs diff --git a/filter_parser/fuzz/.gitignore b/filter_parser/fuzz/.gitignore new file mode 100644 index 000000000..a0925114d --- /dev/null +++ b/filter_parser/fuzz/.gitignore @@ -0,0 +1,3 @@ +target +corpus +artifacts diff --git a/filter_parser/fuzz/Cargo.toml b/filter_parser/fuzz/Cargo.toml new file mode 100644 index 000000000..33e604e73 --- /dev/null +++ b/filter_parser/fuzz/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "filter_parser-fuzz" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.filter_parser] +path = ".." 
+ +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "parse" +path = "fuzz_targets/parse.rs" +test = false +doc = false diff --git a/filter_parser/fuzz/corpus/parse/test_1 b/filter_parser/fuzz/corpus/parse/test_1 new file mode 100644 index 000000000..2523a328e --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_1 @@ -0,0 +1 @@ +channel = Ponce diff --git a/filter_parser/fuzz/corpus/parse/test_10 b/filter_parser/fuzz/corpus/parse/test_10 new file mode 100644 index 000000000..d0e9f1e51 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_10 @@ -0,0 +1 @@ +channel != ponce diff --git a/filter_parser/fuzz/corpus/parse/test_11 b/filter_parser/fuzz/corpus/parse/test_11 new file mode 100644 index 000000000..ca3db9223 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_11 @@ -0,0 +1 @@ +NOT channel = ponce diff --git a/filter_parser/fuzz/corpus/parse/test_12 b/filter_parser/fuzz/corpus/parse/test_12 new file mode 100644 index 000000000..325f848c1 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_12 @@ -0,0 +1 @@ +subscribers < 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_13 b/filter_parser/fuzz/corpus/parse/test_13 new file mode 100644 index 000000000..ca7b96f30 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_13 @@ -0,0 +1 @@ +subscribers > 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_14 b/filter_parser/fuzz/corpus/parse/test_14 new file mode 100644 index 000000000..f72f48bdb --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_14 @@ -0,0 +1 @@ +subscribers <= 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_15 b/filter_parser/fuzz/corpus/parse/test_15 new file mode 100644 index 000000000..75073fc74 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_15 @@ -0,0 +1 @@ +subscribers >= 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_16 b/filter_parser/fuzz/corpus/parse/test_16 new file mode 100644 index 000000000..bdd39241b --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_16 @@ -0,0 +1 @@ +NOT subscribers < 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_17 b/filter_parser/fuzz/corpus/parse/test_17 new file mode 100644 index 000000000..4487643e4 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_17 @@ -0,0 +1 @@ +NOT subscribers > 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_18 b/filter_parser/fuzz/corpus/parse/test_18 new file mode 100644 index 000000000..150604012 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_18 @@ -0,0 +1 @@ +NOT subscribers <= 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_19 b/filter_parser/fuzz/corpus/parse/test_19 new file mode 100644 index 000000000..11bc15103 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_19 @@ -0,0 +1 @@ +NOT subscribers >= 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_2 b/filter_parser/fuzz/corpus/parse/test_2 new file mode 100644 index 000000000..8ac19cad4 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_2 @@ -0,0 +1 @@ +subscribers = 12 diff --git a/filter_parser/fuzz/corpus/parse/test_20 b/filter_parser/fuzz/corpus/parse/test_20 new file mode 100644 index 000000000..f52ad8ff2 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_20 @@ -0,0 +1 @@ +subscribers 100 TO 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_21 b/filter_parser/fuzz/corpus/parse/test_21 new file mode 100644 index 000000000..e86e6b89d --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_21 @@ -0,0 +1 @@ +NOT subscribers 100 TO 1000 diff --git 
a/filter_parser/fuzz/corpus/parse/test_22 b/filter_parser/fuzz/corpus/parse/test_22 new file mode 100644 index 000000000..8ceeb6c1a --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_22 @@ -0,0 +1 @@ +_geoRadius(12, 13, 14) diff --git a/filter_parser/fuzz/corpus/parse/test_23 b/filter_parser/fuzz/corpus/parse/test_23 new file mode 100644 index 000000000..614effb98 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_23 @@ -0,0 +1 @@ +NOT _geoRadius(12, 13, 14) diff --git a/filter_parser/fuzz/corpus/parse/test_24 b/filter_parser/fuzz/corpus/parse/test_24 new file mode 100644 index 000000000..2b8b39279 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_24 @@ -0,0 +1 @@ +channel = ponce AND 'dog race' != 'bernese mountain' diff --git a/filter_parser/fuzz/corpus/parse/test_25 b/filter_parser/fuzz/corpus/parse/test_25 new file mode 100644 index 000000000..8f6fef74a --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_25 @@ -0,0 +1 @@ +channel = ponce OR 'dog race' != 'bernese mountain' diff --git a/filter_parser/fuzz/corpus/parse/test_26 b/filter_parser/fuzz/corpus/parse/test_26 new file mode 100644 index 000000000..5134b354d --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_26 @@ -0,0 +1 @@ +channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_27 b/filter_parser/fuzz/corpus/parse/test_27 new file mode 100644 index 000000000..b63559b9f --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_27 @@ -0,0 +1 @@ +channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 ) diff --git a/filter_parser/fuzz/corpus/parse/test_28 b/filter_parser/fuzz/corpus/parse/test_28 new file mode 100644 index 000000000..5bc97fb2b --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_28 @@ -0,0 +1 @@ +(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14) diff --git a/filter_parser/fuzz/corpus/parse/test_29 b/filter_parser/fuzz/corpus/parse/test_29 new file mode 100644 index 000000000..7713618bb --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_29 @@ -0,0 +1 @@ +channel = Ponce = 12 diff --git a/filter_parser/fuzz/corpus/parse/test_3 b/filter_parser/fuzz/corpus/parse/test_3 new file mode 100644 index 000000000..2533e8fcf --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_3 @@ -0,0 +1 @@ +channel = 'Mister Mv' diff --git a/filter_parser/fuzz/corpus/parse/test_30 b/filter_parser/fuzz/corpus/parse/test_30 new file mode 100644 index 000000000..c35941150 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_30 @@ -0,0 +1 @@ +channel = diff --git a/filter_parser/fuzz/corpus/parse/test_31 b/filter_parser/fuzz/corpus/parse/test_31 new file mode 100644 index 000000000..f7982669f --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_31 @@ -0,0 +1 @@ +channel = 🐻 diff --git a/filter_parser/fuzz/corpus/parse/test_32 b/filter_parser/fuzz/corpus/parse/test_32 new file mode 100644 index 000000000..c4a102dc8 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_32 @@ -0,0 +1 @@ +OR diff --git a/filter_parser/fuzz/corpus/parse/test_33 b/filter_parser/fuzz/corpus/parse/test_33 new file mode 100644 index 000000000..eb80eb4e6 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_33 @@ -0,0 +1 @@ +AND diff --git a/filter_parser/fuzz/corpus/parse/test_34 b/filter_parser/fuzz/corpus/parse/test_34 new file mode 100644 index 000000000..60fc05e7f --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_34 @@ -0,0 +1 @@ +channel Ponce diff --git 
a/filter_parser/fuzz/corpus/parse/test_35 b/filter_parser/fuzz/corpus/parse/test_35 new file mode 100644 index 000000000..4a868f1d8 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_35 @@ -0,0 +1 @@ +channel = Ponce OR diff --git a/filter_parser/fuzz/corpus/parse/test_36 b/filter_parser/fuzz/corpus/parse/test_36 new file mode 100644 index 000000000..d7a0abac7 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_36 @@ -0,0 +1 @@ +_geoRadius diff --git a/filter_parser/fuzz/corpus/parse/test_37 b/filter_parser/fuzz/corpus/parse/test_37 new file mode 100644 index 000000000..44b5105b6 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_37 @@ -0,0 +1 @@ +_geoRadius = 12 diff --git a/filter_parser/fuzz/corpus/parse/test_38 b/filter_parser/fuzz/corpus/parse/test_38 new file mode 100644 index 000000000..ab45b973f --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_38 @@ -0,0 +1 @@ +_geoPoint(12, 13, 14) diff --git a/filter_parser/fuzz/corpus/parse/test_39 b/filter_parser/fuzz/corpus/parse/test_39 new file mode 100644 index 000000000..283095326 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_39 @@ -0,0 +1 @@ +position <= _geoPoint(12, 13, 14) diff --git a/filter_parser/fuzz/corpus/parse/test_4 b/filter_parser/fuzz/corpus/parse/test_4 new file mode 100644 index 000000000..9c2716e79 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_4 @@ -0,0 +1 @@ +channel = "Mister Mv" diff --git a/filter_parser/fuzz/corpus/parse/test_40 b/filter_parser/fuzz/corpus/parse/test_40 new file mode 100644 index 000000000..c4c038c15 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_40 @@ -0,0 +1 @@ +position <= _geoRadius(12, 13, 14) diff --git a/filter_parser/fuzz/corpus/parse/test_41 b/filter_parser/fuzz/corpus/parse/test_41 new file mode 100644 index 000000000..6952aa87e --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_41 @@ -0,0 +1 @@ +channel = 'ponce diff --git a/filter_parser/fuzz/corpus/parse/test_42 b/filter_parser/fuzz/corpus/parse/test_42 new file mode 100644 index 000000000..485d8da96 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_42 @@ -0,0 +1 @@ +channel = "ponce diff --git a/filter_parser/fuzz/corpus/parse/test_43 b/filter_parser/fuzz/corpus/parse/test_43 new file mode 100644 index 000000000..728c8aa22 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_43 @@ -0,0 +1 @@ +channel = mv OR (followers >= 1000 diff --git a/filter_parser/fuzz/corpus/parse/test_5 b/filter_parser/fuzz/corpus/parse/test_5 new file mode 100644 index 000000000..89f5ec8ee --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_5 @@ -0,0 +1 @@ +'dog race' = Borzoi diff --git a/filter_parser/fuzz/corpus/parse/test_6 b/filter_parser/fuzz/corpus/parse/test_6 new file mode 100644 index 000000000..be3e203cb --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_6 @@ -0,0 +1 @@ +"dog race" = Chusky diff --git a/filter_parser/fuzz/corpus/parse/test_7 b/filter_parser/fuzz/corpus/parse/test_7 new file mode 100644 index 000000000..eb77a2875 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_7 @@ -0,0 +1 @@ +"dog race" = "Bernese Mountain" diff --git a/filter_parser/fuzz/corpus/parse/test_8 b/filter_parser/fuzz/corpus/parse/test_8 new file mode 100644 index 000000000..a25477648 --- /dev/null +++ b/filter_parser/fuzz/corpus/parse/test_8 @@ -0,0 +1 @@ +'dog race' = 'Bernese Mountain' diff --git a/filter_parser/fuzz/corpus/parse/test_9 b/filter_parser/fuzz/corpus/parse/test_9 new file mode 100644 index 000000000..c347e68f5 --- /dev/null +++ 
b/filter_parser/fuzz/corpus/parse/test_9 @@ -0,0 +1 @@ +"dog race" = 'Bernese Mountain' diff --git a/filter_parser/fuzz/fuzz_targets/parse.rs b/filter_parser/fuzz/fuzz_targets/parse.rs new file mode 100644 index 000000000..99d4a03a6 --- /dev/null +++ b/filter_parser/fuzz/fuzz_targets/parse.rs @@ -0,0 +1,13 @@ +#![no_main] +use filter_parser::FilterCondition; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + if let Ok(s) = std::str::from_utf8(data) { + // When we are fuzzing the parser we can get stack overflow really easily. + // But since this doesn't happen with a normal build we are just going to limit the fuzzer to 500 characters. + if s.len() < 500 { + let _ = FilterCondition::parse(s); + } + } +}); diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs new file mode 100644 index 000000000..b4155bb51 --- /dev/null +++ b/filter_parser/src/error.rs @@ -0,0 +1,195 @@ +use std::fmt::Display; + +use nom::{Parser, error::{self, ParseError}}; + +use crate::{IResult, Span}; + +pub trait ExtendNomError<E> { + fn is_failure(&self) -> bool; + fn map_err<O: FnOnce(E) -> E>(self, op: O) -> nom::Err<E>; + fn map_fail<O: FnOnce(E) -> E>(self, op: O) -> nom::Err<E>; +} + +impl<E> ExtendNomError<E> for nom::Err<E> { + fn is_failure(&self) -> bool { + matches!(self, Self::Failure(_)) + } + + fn map_err<O: FnOnce(E) -> E>(self, op: O) -> nom::Err<E> { + match self { + e @ Self::Failure(_) => e, + e => e.map(|e| op(e)), + } + } + + fn map_fail<O: FnOnce(E) -> E>(self, op: O) -> nom::Err<E> { + match self { + e @ Self::Error(_) => e, + e => e.map(|e| op(e)), + } + } +} + +/// cut a parser and map the error +pub fn cut_with_err<'a, O>(mut parser: impl FnMut(Span<'a>) -> IResult<O>, mut with: impl FnMut(Error<'a>) -> Error<'a>) -> impl FnMut(Span<'a>) -> IResult<O> { + move |input| match parser.parse(input) { + Err(nom::Err::Error(e)) => Err(nom::Err::Failure(with(e))), + rest => rest, + } +} + +#[derive(Debug)] +pub struct Error<'a> { + context: Span<'a>, + kind: ErrorKind<'a>, +} + +#[derive(Debug)] +pub enum ErrorKind<'a> { + ReservedGeo(&'a str), + Geo, + MisusedGeo, + InvalidPrimary, + ReservedKeyword, + ExpectedEof, + ExpectedValue, + MissingClosingDelimiter(char), + UnexpectedInput(Vec<&'a str>), + Context(&'a str), + Char(char), + Unreachable, +} + +impl<'a> Error<'a> { + pub fn kind(context: Span<'a>, kind: ErrorKind<'a>) -> Self { + Self { context, kind } + } + pub fn char(self) -> char { + match self.kind { + ErrorKind::Char(c) => c, + _ => panic!("Internal filter parser error"), + } + } +} + +impl<'a> ParseError<Span<'a>> for Error<'a> { + fn from_error_kind(input: Span<'a>, kind: error::ErrorKind) -> Self { + let kind = match kind { + error::ErrorKind::Eof => ErrorKind::ExpectedEof, + error::ErrorKind::Tag => ErrorKind::UnexpectedInput(Vec::new()), + error::ErrorKind::MapRes => todo!(), + error::ErrorKind::MapOpt => todo!(), + error::ErrorKind::Alt => todo!(), + error::ErrorKind::IsNot => todo!(), + error::ErrorKind::IsA => todo!(), + error::ErrorKind::SeparatedList => todo!(), + error::ErrorKind::SeparatedNonEmptyList => todo!(), + error::ErrorKind::Many0 => todo!(), + error::ErrorKind::Many1 => todo!(), + error::ErrorKind::ManyTill => todo!(), + error::ErrorKind::Count => todo!(), + error::ErrorKind::TakeUntil => todo!(), + error::ErrorKind::LengthValue => todo!(), + error::ErrorKind::TagClosure => todo!(), + error::ErrorKind::Alpha => todo!(), + error::ErrorKind::Digit => todo!(), + error::ErrorKind::HexDigit => todo!(), + error::ErrorKind::OctDigit => todo!(), + error::ErrorKind::AlphaNumeric => todo!(), + error::ErrorKind::Space => todo!(), +
error::ErrorKind::MultiSpace => todo!(), + error::ErrorKind::LengthValueFn => todo!(), + error::ErrorKind::Switch => todo!(), + error::ErrorKind::TagBits => todo!(), + error::ErrorKind::OneOf => todo!(), + error::ErrorKind::NoneOf => todo!(), + error::ErrorKind::Char => todo!(), + error::ErrorKind::CrLf => todo!(), + error::ErrorKind::RegexpMatch => todo!(), + error::ErrorKind::RegexpMatches => todo!(), + error::ErrorKind::RegexpFind => todo!(), + error::ErrorKind::RegexpCapture => todo!(), + error::ErrorKind::RegexpCaptures => todo!(), + error::ErrorKind::TakeWhile1 => ErrorKind::Unreachable, + error::ErrorKind::Complete => todo!(), + error::ErrorKind::Fix => todo!(), + error::ErrorKind::Escaped => todo!(), + error::ErrorKind::EscapedTransform => todo!(), + error::ErrorKind::NonEmpty => todo!(), + error::ErrorKind::ManyMN => todo!(), + error::ErrorKind::Not => todo!(), + error::ErrorKind::Permutation => todo!(), + error::ErrorKind::Verify => todo!(), + error::ErrorKind::TakeTill1 => todo!(), + error::ErrorKind::TakeWhileMN => todo!(), + error::ErrorKind::TooLarge => todo!(), + error::ErrorKind::Many0Count => todo!(), + error::ErrorKind::Many1Count => todo!(), + error::ErrorKind::Float => todo!(), + error::ErrorKind::Satisfy => todo!(), + error::ErrorKind::Fail => todo!(), + }; + Self { context: input, kind } + } + + fn append(_input: Span<'a>, _kind: error::ErrorKind, other: Self) -> Self { + other + } + + fn from_char(input: Span<'a>, c: char) -> Self { + Self { context: input, kind: ErrorKind::Char(c) } + } +} + +impl<'a> Display for Error<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let input = self.context.fragment(); + + match self.kind { + ErrorKind::ExpectedValue if input.trim().is_empty() => { + writeln!(f, "Was expecting a value but instead got nothing.")? + } + ErrorKind::MissingClosingDelimiter(c) => { + writeln!(f, "Expression `{}` is missing the following closing delemiter: `{}`.", input, c)? + } + ErrorKind::ExpectedValue => { + writeln!(f, "Was expecting a value but instead got `{}`.", input)? + } + ErrorKind::InvalidPrimary if input.trim().is_empty() => { + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing.")? + } + ErrorKind::InvalidPrimary => { + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", input)? + } + ErrorKind::ExpectedEof => { + writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", input)? + } + ErrorKind::Geo => { + writeln!(f, "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`.")? + } + ErrorKind::ReservedGeo(name) => { + writeln!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates.", name)? + } + ErrorKind::MisusedGeo => { + writeln!(f, "The `_geoRadius` filter is an operation and can't be used as a value.")? + } + ErrorKind::Char(c) => { + panic!("Tried to display a char error with `{}`", c) + } + ErrorKind::ReservedKeyword => writeln!(f, "reserved keyword")?, + ErrorKind::UnexpectedInput(ref v) => writeln!(f, "Unexpected input found `{}`, vec: `{:?}`", input, v)?, + ErrorKind::Context(_) => todo!(), + ErrorKind::Unreachable => writeln!( + f, + "Encountered an internal error while parsing your filter. 
Please file an issue" )?, + } + write!( + f, + "{}:{} in `{}`.", + self.context.location_line(), + self.context.get_utf8_column(), + self.context.extra, + ) + } +} diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 86c6cd79c..cb9a13f58 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -20,6 +20,20 @@ //! ```text //! geoPoint = WS* ~ "_geoPoint(" ~ (float ~ ",")* ~ ")" //! ``` +//! +//! Specific errors: +//! ================ +//! - If a user tries to use a geoPoint, as a primary OR as a value, we must throw an error. +//! ```text +//! field = _geoPoint(12, 13, 14) +//! field < 12 AND _geoPoint(1, 2) +//! ``` +//! +//! - If a user tries to use a geoRadius as a value, we must throw an error. +//! ```text +//! field = _geoRadius(12, 13, 14) +//! ``` +//! mod condition; mod error; @@ -28,12 +42,12 @@ mod value; use std::fmt::Debug; pub use condition::{parse_condition, parse_to, Condition}; +use error::{cut_with_err, ExtendNomError}; pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{char, multispace0}; use nom::combinator::{cut, eof, map}; -use nom::error::{ContextError, ParseError}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, terminated, tuple}; @@ -102,14 +116,15 @@ impl<'a> FilterCondition<'a> { } } -// remove OPTIONAL whitespaces before AND after the the provided parser +/// remove OPTIONAL whitespaces before AND after the provided parser. fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult<O>) -> impl FnMut(Span<'a>) -> IResult<O> { delimited(multispace0, inner, multispace0) } -/// and = not (~ "AND" not)* +/// or = and (~ "OR" ~ and)* fn parse_or(input: Span) -> IResult<FilterCondition> { let (input, lhs) = parse_and(input)?; + // if we found an `OR` then we MUST find something next let (input, ors) = many0(preceded(ws(tag("OR")), cut(parse_and)))(input)?; let expr = ors @@ -118,8 +133,10 @@ fn parse_or(input: Span) -> IResult<FilterCondition> { Ok((input, expr)) } +/// and = not (~ "AND" not)* fn parse_and(input: Span) -> IResult<FilterCondition> { let (input, lhs) = parse_not(input)?; + // if we found an `AND` then we MUST find something next let (input, ors) = many0(preceded(ws(tag("AND")), cut(parse_not)))(input)?; let expr = ors .into_iter() @@ -128,28 +145,29 @@ fn parse_and(input: Span) -> IResult<FilterCondition> { } /// not = ("NOT" | "!") not | primary +/// We can have multiple consecutive not, e.g. `NOT NOT channel = mv`. +/// If we parse a `NOT` or `!` we MUST parse something after it. fn parse_not(input: Span) -> IResult<FilterCondition> { - alt(( - map(preceded(alt((tag("!"), tag("NOT"))), cut(parse_not)), |e| e.negate()), - cut(parse_primary), - ))(input) + alt((map(preceded(alt((tag("!"), tag("NOT"))), cut(parse_not)), |e| e.negate()), parse_primary))( + input, + ) } /// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +/// If we parse `_geoRadius` we MUST parse the rest of the expression. fn parse_geo_radius(input: Span) -> IResult<FilterCondition> { - let err_msg_args_incomplete = "_geoRadius.
The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; - // we want to forbid space BEFORE the _geoRadius but not after - let parsed = preceded::<_, _, _, _, _, _>( + let parsed = preceded( tuple((multispace0, tag("_geoRadius"))), + // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure cut(delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')'))), - )(input); + )(input) + .map_err(|e| e.map(|_| Error::kind(input, ErrorKind::Geo))); - let (input, args): (Span, Vec<Span>) = parsed?; + let (input, args) = parsed?; if args.len() != 3 { - let e = Error::from_char(input, '('); - return Err(nom::Err::Failure(Error::add_context(input, err_msg_args_incomplete, e))); + return Err(nom::Err::Failure(Error::kind(input, ErrorKind::Geo))); } let res = FilterCondition::GeoLowerThan { @@ -159,14 +177,39 @@ fn parse_geo_radius(input: Span) -> IResult<FilterCondition> { Ok((input, res)) } +/// geoPoint = WS* ~ "_geoPoint(float ~ "," ~ float ~ "," float) +fn parse_geo_point(input: Span) -> IResult<FilterCondition> { + // we want to forbid space BEFORE the _geoPoint but not after + tuple(( + multispace0, + tag("_geoPoint"), + // if we were able to parse `_geoPoint` we are going to return a Failure whatever happens next. + cut(delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')'))), + ))(input) + .map_err(|e| e.map(|_| Error::kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; + // if we succeeded we still return a Failure because geoPoints are not allowed + Err(nom::Err::Failure(Error::kind(input, ErrorKind::ReservedGeo("_geoPoint")))) +} + /// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to fn parse_primary(input: Span) -> IResult<FilterCondition> { alt(( - delimited(ws(char('(')), cut(parse_expression), cut(ws(char(')')))), + // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis + delimited( + ws(char('(')), + cut(parse_expression), + cut_with_err(ws(char(')')), |c| { + Error::kind(input, ErrorKind::MissingClosingDelimiter(c.char())) + }), + ), |c| parse_geo_radius(c), |c| parse_condition(c), |c| parse_to(c), + // the next lines are only for error handling and are written at the end to have the least possible performance impact + |c| parse_geo_point(c), ))(input) + // if the inner parsers did not match enough information to return an accurate error + .map_err(|e| e.map_err(|_| Error::kind(input, ErrorKind::InvalidPrimary))) } /// expression = or @@ -484,18 +527,24 @@ pub mod tests { fn error() { use FilterCondition as Fc; - let result = Fc::parse("test = truc OR truc"); - assert!(result.is_err()); - let test_case = [ // simple test - ("channel = Ponce = 12", "An error occured"), - ("OR", "An error occured"), - ("AND", "An error occured"), - ("channel = Ponce OR", "An error occured"), - ("_geoRadius = 12", "An error occured"), - ("_geoPoint(12, 13, 14)", "An error occured"), - ("_geo = _geoRadius(12, 13, 14)", "An error occured"), + ("channel = Ponce = 12", "Found unexpected characters at the end of the filter: `= 12`.
You probably forgot an `OR` or an `AND` rule."), + ("channel = ", "Was expecting a value but instead got nothing."), + ("channel = 🐻", "Was expecting a value but instead got `🐻`."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing."), + ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), + ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), + ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), + ("position <= _geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), + ("position <= _geoRadius(12, 13, 14)", "The `_geoRadius` filter is an operation and can't be used as a value."), + ("channel = 'ponce", "Expression `'ponce` is missing the following closing delemiter: `'`."), + ("channel = \"ponce", "Expression `\"ponce` is missing the following closing delemiter: `\"`."), + ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delemiter: `)`."), + ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. 
You probably forgot an `OR` or an `AND` rule."), ]; for (input, expected) in test_case { @@ -503,24 +552,12 @@ pub mod tests { assert!( result.is_err(), - "Filter `{:?}` wasn't supposed to be parsed but it did with the following result: `{:?}`", - expected, + "Filter `{}` wasn't supposed to be parsed but it did with the following result: `{:?}`", + input, result.unwrap() ); let filter = result.unwrap_err().to_string(); - assert_eq!(filter, expected, "Filter `{:?}` was supposed to return the following error: `{}`, but instead returned `{}`.", input, filter, expected); + assert!(filter.starts_with(expected), "Filter `{:?}` was supposed to return the following error:\n{}\n, but instead returned\n{}\n.", input, expected, filter); } } - - /* - #[test] - fn bidule() { - use FilterCondition as Fc; - - let result = Fc::parse::>("test = truc OR truc"); - dbg!(result); - - assert!(false); - } - */ } diff --git a/filter_parser/src/main.rs b/filter_parser/src/main.rs new file mode 100644 index 000000000..4158a2063 --- /dev/null +++ b/filter_parser/src/main.rs @@ -0,0 +1,11 @@ +fn main() { + let input = std::env::args().nth(1).expect("You must provide a filter to test"); + + println!("Trying to execute the following filter:\n{}\n\n", input); + + if let Err(e) = filter_parser::FilterCondition::parse(&input) { + println!("{}", e.to_string()); + } else { + println!("✅ Valid filter"); + } +} diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs index 7c708aa73..5f4677a2e 100644 --- a/filter_parser/src/value.rs +++ b/filter_parser/src/value.rs @@ -1,12 +1,29 @@ use nom::branch::alt; -use nom::bytes::complete::{take_till, take_while1}; -use nom::character::complete::char; -use nom::sequence::delimited; +use nom::bytes::complete::{take_till, take_while, take_while1}; +use nom::character::complete::{char, multispace0}; +use nom::combinator::cut; +use nom::sequence::{delimited, terminated}; -use crate::{ws, Error, IResult, Span, Token}; +use crate::error::ExtendNomError; +use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span, Token}; /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* pub fn parse_value(input: Span) -> IResult<Token> { + // before anything we want to check if the user is misusing a geo expression + let err = parse_geo_point(input).unwrap_err(); + if err.is_failure() { + return Err(err); + } + match parse_geo_radius(input) { + Ok(_) => return Err(nom::Err::Failure(Error::kind(input, ErrorKind::MisusedGeo))), + // if we encountered a failure it means the user badly wrote a _geoRadius filter. + // But instead of showing him how to fix his syntax we are going to tell him he should not use this filter as a value.
+ + Err(e) if e.is_failure() => { + return Err(nom::Err::Failure(Error::kind(input, ErrorKind::MisusedGeo))) + } + _ => (), + } + // singleQuoted = "'" .* all but quotes "'" let simple_quoted = |input| take_till(|c: char| c == '\'')(input); // doubleQuoted = "\"" (word | spaces)* "\"" @@ -14,13 +31,23 @@ pub fn parse_value(input: Span) -> IResult<Token> { // word = (alphanumeric | _ | - | .)+ let word = |input| take_while1(is_key_component)(input); - ws(alt(( - delimited(char('\''), simple_quoted, char('\'')), - delimited(char('"'), double_quoted, char('"')), - word, - )))(input) + // we want to remove the space before entering the alt because if we don't, + // when we create the errors from the output of the alt we have spaces everywhere + let (input, _) = take_while(char::is_whitespace)(input)?; + + terminated( + alt(( + delimited(char('\''), simple_quoted, cut(char('\''))), + delimited(char('"'), double_quoted, cut(char('"'))), + word, + )), + multispace0, + )(input) .map(|(s, t)| (s, t.into())) - .map_err(|e| e.map(|_| Error::expected_value(input))) + // if we found nothing in the alt it means the user did not input any value + .map_err(|e| e.map_err(|_| Error::kind(input, ErrorKind::ExpectedValue))) + // if we encountered a failure it means the user really tried to input a value, but had an unmatched quote + .map_err(|e| e.map_fail(|c| Error::kind(input, ErrorKind::MissingClosingDelimiter(c.char())))) } fn is_key_component(c: char) -> bool { @@ -38,12 +65,13 @@ pub mod tests { ("channel", rtok("", "channel")), (".private", rtok("", ".private")), ("I-love-kebab", rtok("", "I-love-kebab")), - ("but_snakes_are_also_good", rtok("", "but_snakes_are_also_good")), + ("but_snakes_is_also_good", rtok("", "but_snakes_is_also_good")), ("parens(", rtok("", "parens")), ("parens)", rtok("", "parens")), ("not!", rtok("", "not")), (" channel", rtok(" ", "channel")), ("channel ", rtok("", "channel")), + (" channel ", rtok(" ", "channel")), ("'channel'", rtok("'", "channel")), ("\"channel\"", rtok("\"", "channel")), ("'cha)nnel'", rtok("'", "cha)nnel")), From b165c77fa79ded0f9ef8ebbee444e4a0dcc429bb Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 14:39:02 +0100 Subject: [PATCH 1127/1889] add a smol README --- filter_parser/README.md | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 filter_parser/README.md diff --git a/filter_parser/README.md b/filter_parser/README.md new file mode 100644 index 000000000..3ba9d8f23 --- /dev/null +++ b/filter_parser/README.md @@ -0,0 +1,35 @@ +# Filter parser + +This workspace is dedicated to the parsing of the MeiliSearch filters. + +Most of the code and explanation are in the [src/lib.rs]. In particular, see the BNF of the filters at the top of this file. + +The parser uses [nom](https://docs.rs/nom/) to do most of its work and [nom-locate](https://docs.rs/nom_locate/) to keep track of what we were doing when we encountered an error. + +## CLI +A simple main is provided to quick-test if a filter can be parsed or not without bringing in milli. +It takes one argument and tries to parse it. +``` +cargo run -- 'field = value' # success +cargo run -- 'field = "doggo' # error => missing closing delimiter " +``` + +## Fuzz +The workspace has been fuzzed with [cargo-fuzz](https://rust-fuzz.github.io/book/cargo-fuzz.html). + +### Setup +You'll need rust-nightly to execute the fuzzer.
+ +``` +cargo install cargo-fuzz +``` + +### Run +``` +cargo fuzz run parse +``` + +## What to do if you find a bug in the parser + +- Write a test at the end of the [src/lib.rs] to ensure it never happens again. +- Add a file in [fuzz/corpus/parse/] with your filter to help the fuzzer find new bugs. From d0fe9dea6177746b77633553be606cdcae601216 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 14:43:36 +0100 Subject: [PATCH 1128/1889] update the readme --- filter_parser/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/filter_parser/README.md b/filter_parser/README.md index 3ba9d8f23..44ffdada3 100644 --- a/filter_parser/README.md +++ b/filter_parser/README.md @@ -2,7 +2,7 @@ This workspace is dedicated to the parsing of the MeiliSearch filters. -Most of the code and explanation are in the [src/lib.rs]. In particular, see the BNF of the filters at the top of this file. +Most of the code and explanation are in the [`lib.rs`](./src/lib.rs). In particular, see the BNF of the filters at the top of this file. The parser uses [nom](https://docs.rs/nom/) to do most of its work and [nom-locate](https://docs.rs/nom_locate/) to keep track of what we were doing when we encountered an error. @@ -31,5 +31,5 @@ cargo fuzz run parse ## What to do if you find a bug in the parser -- Write a test at the end of the [src/lib.rs] to ensure it never happens again. -- Add a file in [fuzz/corpus/parse/] with your filter to help the fuzzer find new bugs. +- Write a test at the end of the [`lib.rs`](./src/lib.rs) to ensure it never happens again. +- Add a file in [the corpus directory](./fuzz/corpus/parse/) with your filter to help the fuzzer find new bugs. Since this directory is going to be heavily polluted by the execution of the fuzzer it's in the gitignore and you'll need to force-add your new test.
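As a side note on the bug-reporting workflow the README above describes, here is a minimal sketch of what such a regression test could look like, assuming the `FilterCondition::parse` API from this patch series and the error wording as it stands after the later "fix typo" patch; the test name is hypothetical:

```rust
// Hedged sketch, in the style of the `error()` test from
// filter_parser/src/lib.rs earlier in this series.
// `starts_with` is used because `Display` for `Error` appends the
// position (line:column) of the failure after the message itself.
#[test]
fn regression_unmatched_quote() {
    let result = FilterCondition::parse("channel = 'ponce");
    assert!(result.is_err());
    let message = result.unwrap_err().to_string();
    assert!(message.starts_with(
        "Expression `'ponce` is missing the following closing delimiter: `'`."
    ));
}
```

A matching corpus seed for this case already exists in the series: `fuzz/corpus/parse/test_41` contains exactly `channel = 'ponce`.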
From b1a0110a47faf8362ee66513d121e45e1c28957d Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 14:48:39 +0100 Subject: [PATCH 1129/1889] update the main --- filter_parser/src/main.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/filter_parser/src/main.rs b/filter_parser/src/main.rs index 4158a2063..a3e4cab28 100644 --- a/filter_parser/src/main.rs +++ b/filter_parser/src/main.rs @@ -1,11 +1,16 @@ fn main() { let input = std::env::args().nth(1).expect("You must provide a filter to test"); - println!("Trying to execute the following filter:\n{}\n\n", input); + println!("Trying to execute the following filter:\n{}\n", input); - if let Err(e) = filter_parser::FilterCondition::parse(&input) { - println!("{}", e.to_string()); - } else { - println!("✅ Valid filter"); + match filter_parser::FilterCondition::parse(&input) { + Ok(filter) => { + println!("✅ Valid filter"); + println!("{:#?}", filter); + } + Err(e) => { + println!("❎ Invalid filter"); + println!("{}", e.to_string()); + } } } From a58bc5bebbef23550b46ba456c2db3271f6cb2f9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 15:02:36 +0100 Subject: [PATCH 1130/1889] update milli with the new parser_filter --- milli/Cargo.toml | 3 --- milli/src/facet/mod.rs | 2 -- milli/src/lib.rs | 5 ++--- milli/src/search/facet/filter_condition.rs | 16 +++------------- milli/src/search/facet/mod.rs | 2 -- milli/src/search/mod.rs | 2 +- 6 files changed, 6 insertions(+), 24 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1aaeed008..6913178b0 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -38,10 +38,7 @@ smallvec = "1.6.1" tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } -# facet filter parser filter_parser = { path = "../filter_parser" } -nom = "7.0.0" -nom-greedyerror = "0.4.0" # documents words self-join itertools = "0.10.0" diff --git a/milli/src/facet/mod.rs b/milli/src/facet/mod.rs index aaa7a65ce..274d2588d 100644 --- a/milli/src/facet/mod.rs +++ b/milli/src/facet/mod.rs @@ -2,7 +2,5 @@ mod facet_type; mod facet_value; pub mod value_encoding; -pub use filter_parser::{Condition, FilterCondition, FilterParserError, Span, Token}; - pub use self::facet_type::FacetType; pub use self::facet_value::FacetValue; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index e2ecb060c..044d74ec1 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -17,6 +17,7 @@ use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; +pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; use serde_json::{Map, Value}; @@ -34,9 +35,7 @@ pub use self::heed_codec::{ RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, }; pub use self::index::Index; -pub use self::search::{ - Condition, FacetDistribution, Filter, FilterCondition, MatchingWords, Search, SearchResult, -}; +pub use self::search::{FacetDistribution, Filter, MatchingWords, Search, SearchResult}; pub type Result<T> = std::result::Result<T, Error>; diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index b61cd451b..bb342fa27 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -3,11 +3,9 @@ use std::ops::Bound::{self, Excluded, Included}; use std::str::FromStr; use either::Either; -pub use filter_parser::{Condition, FilterCondition, FilterParserError, Span, Token}; +pub use filter_parser::{Condition, Error as
FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; use log::debug; -use nom::error::{ErrorKind, VerboseError}; -use nom_greedyerror::{convert_error, GreedyError}; use roaring::RoaringBitmap; use super::FacetNumberRange; @@ -22,12 +20,6 @@ pub struct Filter<'a> { condition: FilterCondition<'a>, } -impl<'a> From<VerboseError<Span<'a>>> for Error { - fn from(nom_error: VerboseError<Span<'a>>) -> Self { - UserError::InvalidFilter { input: nom_error.to_string() }.into() - } -} - fn parse<T: FromStr>(tok: &Token) -> Result<T> { match tok.inner.parse::<T>() { Ok(t) => Ok(t), @@ -90,11 +82,9 @@ impl<'a> Filter<'a> { } pub fn from_str(expression: &'a str) -> Result<Self> { - let condition = match FilterCondition::parse::>(expression) { + let condition = match FilterCondition::parse(expression) { Ok(fc) => Ok(fc), - Err(e) => Err(Error::UserError(UserError::InvalidFilter { - input: convert_error(Span::new_extra(expression, expression), e).to_string(), - })), + Err(e) => Err(Error::UserError(UserError::InvalidFilter { input: e.to_string() })), }?; Ok(Self { condition }) } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index c0b692de7..d6f276fbb 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,5 +1,3 @@ -pub use filter_parser::{Condition, FilterCondition}; - pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f52dd06f0..a31ead1ec 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -14,7 +14,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{Condition, FacetDistribution, FacetNumberIter, Filter, FilterCondition}; +pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; From 07a5ffb04c3e3e241d7bf4b9f0e1d6c5bf5ddeb4 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 15:52:22 +0100 Subject: [PATCH 1131/1889] update http-ui --- http-ui/src/main.rs | 24 ++++++++++++++-------- milli/src/search/facet/filter_condition.rs | 6 ++++++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index e3f8f0317..e84c94e50 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -738,31 +738,37 @@ async fn main() -> anyhow::Result<()> { search.query(query); } - let filters = match query.filters { + let filters = match query.filters.as_ref() { Some(condition) if !condition.trim().is_empty() => { - Some(MilliFilter::from_str(&condition).unwrap()) + Some(MilliFilter::from_str(condition).unwrap()) } _otherwise => None, }; - let facet_filters = match query.facet_filters { + let facet_filters = match query.facet_filters.as_ref() { Some(array) => { - let eithers = array.into_iter().map(Into::into); + let eithers = array.iter().map(|either| match either { + UntaggedEither::Left(l) => { + Either::Left(l.iter().map(|s| s.as_str()).collect::<Vec<_>>()) + } + UntaggedEither::Right(r) => Either::Right(r.as_str()), + }); MilliFilter::from_array(eithers).unwrap() } _otherwise => None, }; let condition = match (filters, facet_filters) { - (Some(filters), Some(facet_filters)) => { - Some(FilterCondition::And(Box::new(filters), Box::new(facet_filters))) - } - (Some(condition), None) | (None, Some(condition)) =>
Some(condition), + (Some(filters), Some(facet_filters)) => Some(FilterCondition::And( + Box::new(filters.into()), + Box::new(facet_filters.into()), + )), + (Some(condition), None) | (None, Some(condition)) => Some(condition.into()), _otherwise => None, }; if let Some(condition) = condition { - search.filter(condition); + search.filter(condition.into()); } if let Some(limit) = query.limit { diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index bb342fa27..d0c32c8f4 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -35,6 +35,12 @@ fn parse<T: FromStr>(tok: &Token) -> Result<T> { } } +impl<'a> From<Filter<'a>> for FilterCondition<'a> { + fn from(f: Filter<'a>) -> Self { + f.condition + } +} + impl<'a> Filter<'a> { pub fn from_array<I, J>(array: I) -> Result<Option<Self>> where From 72a90712037607688a04b56254089056a89de602 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 16:03:52 +0100 Subject: [PATCH 1132/1889] fix typo --- filter_parser/src/error.rs | 2 +- filter_parser/src/lib.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs index b4155bb51..f92200882 100644 --- a/filter_parser/src/error.rs +++ b/filter_parser/src/error.rs @@ -150,7 +150,7 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got nothing.")? } ErrorKind::MissingClosingDelimiter(c) => { - writeln!(f, "Expression `{}` is missing the following closing delemiter: `{}`.", input, c)? + writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", input, c)? } ErrorKind::ExpectedValue => { writeln!(f, "Was expecting a value but instead got `{}`.", input)? diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index cb9a13f58..e6f8a75d1 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -541,9 +541,9 @@ pub mod tests { ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), ("position <= _geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), ("position <= _geoRadius(12, 13, 14)", "The `_geoRadius` filter is an operation and can't be used as a value."), - ("channel = 'ponce", "Expression `'ponce` is missing the following closing delemiter: `'`."), + ("channel = 'ponce", "Expression `'ponce` is missing the following closing delimiter: `'`."), - ("channel = \"ponce", "Expression `\"ponce` is missing the following closing delemiter: `\"`."), + ("channel = \"ponce", "Expression `\"ponce` is missing the following closing delimiter: `\"`."), - ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delemiter: `)`."), + ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`.
You probably forgot an `OR` or an `AND` rule."), ]; From 3e5550c910068df31b9b5adc42c603938a49eddd Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 16:12:17 +0100 Subject: [PATCH 1133/1889] clean the errors --- filter_parser/src/error.rs | 87 ++++++++------------------------------ 1 file changed, 18 insertions(+), 69 deletions(-) diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs index f92200882..fbfbbe30b 100644 --- a/filter_parser/src/error.rs +++ b/filter_parser/src/error.rs @@ -1,6 +1,7 @@ use std::fmt::Display; -use nom::{Parser, error::{self, ParseError}}; +use nom::error::{self, ParseError}; +use nom::Parser; use crate::{IResult, Span}; @@ -31,11 +32,14 @@ impl<E> ExtendNomError<E> for nom::Err<E> { } /// cut a parser and map the error -pub fn cut_with_err<'a, O>(mut parser: impl FnMut(Span<'a>) -> IResult<O>, mut with: impl FnMut(Error<'a>) -> Error<'a>) -> impl FnMut(Span<'a>) -> IResult<O> { - move |input| match parser.parse(input) { - Err(nom::Err::Error(e)) => Err(nom::Err::Failure(with(e))), - rest => rest, - } +pub fn cut_with_err<'a, O>( + mut parser: impl FnMut(Span<'a>) -> IResult<O>, + mut with: impl FnMut(Error<'a>) -> Error<'a>, +) -> impl FnMut(Span<'a>) -> IResult<O> { + move |input| match parser.parse(input) { + Err(nom::Err::Error(e)) => Err(nom::Err::Failure(with(e))), + rest => rest, + } } #[derive(Debug)] @@ -50,14 +54,12 @@ pub enum ErrorKind<'a> { Geo, MisusedGeo, InvalidPrimary, - ReservedKeyword, ExpectedEof, ExpectedValue, MissingClosingDelimiter(char), - UnexpectedInput(Vec<&'a str>), - Context(&'a str), Char(char), - Unreachable, + InternalError(error::ErrorKind), + External(String), } impl<'a> Error<'a> { @@ -68,66 +70,15 @@ impl<'a> Error<'a> { match self.kind { ErrorKind::Char(c) => c, _ => panic!("Internal filter parser error"), - } } + } } impl<'a> ParseError<Span<'a>> for Error<'a> { fn from_error_kind(input: Span<'a>, kind: error::ErrorKind) -> Self { let kind = match kind { error::ErrorKind::Eof => ErrorKind::ExpectedEof, - error::ErrorKind::Tag => ErrorKind::UnexpectedInput(Vec::new()), - error::ErrorKind::MapRes => todo!(), - error::ErrorKind::MapOpt => todo!(), - error::ErrorKind::Alt => todo!(), - error::ErrorKind::IsNot => todo!(), - error::ErrorKind::IsA => todo!(), - error::ErrorKind::SeparatedList => todo!(), - error::ErrorKind::SeparatedNonEmptyList => todo!(), - error::ErrorKind::Many0 => todo!(), - error::ErrorKind::Many1 => todo!(), - error::ErrorKind::ManyTill => todo!(), - error::ErrorKind::Count => todo!(), - error::ErrorKind::TakeUntil => todo!(), - error::ErrorKind::LengthValue => todo!(), - error::ErrorKind::TagClosure => todo!(), - error::ErrorKind::Alpha => todo!(), - error::ErrorKind::Digit => todo!(), - error::ErrorKind::HexDigit => todo!(), - error::ErrorKind::OctDigit => todo!(), - error::ErrorKind::AlphaNumeric => todo!(), - error::ErrorKind::Space => todo!(), - error::ErrorKind::MultiSpace => todo!(), - error::ErrorKind::LengthValueFn => todo!(), - error::ErrorKind::Switch => todo!(), - error::ErrorKind::TagBits => todo!(), - error::ErrorKind::OneOf => todo!(), - error::ErrorKind::NoneOf => todo!(), - error::ErrorKind::Char => todo!(), - error::ErrorKind::CrLf => todo!(), - error::ErrorKind::RegexpMatch => todo!(), - error::ErrorKind::RegexpMatches => todo!(), - error::ErrorKind::RegexpFind => todo!(), - error::ErrorKind::RegexpCapture => todo!(), - error::ErrorKind::RegexpCaptures => todo!(), - error::ErrorKind::TakeWhile1 => ErrorKind::Unreachable, - error::ErrorKind::Complete => todo!(), - error::ErrorKind::Fix => todo!(),
- error::ErrorKind::Escaped => todo!(), - error::ErrorKind::EscapedTransform => todo!(), - error::ErrorKind::NonEmpty => todo!(), - error::ErrorKind::ManyMN => todo!(), - error::ErrorKind::Not => todo!(), - error::ErrorKind::Permutation => todo!(), - error::ErrorKind::Verify => todo!(), - error::ErrorKind::TakeTill1 => todo!(), - error::ErrorKind::TakeWhileMN => todo!(), - error::ErrorKind::TooLarge => todo!(), - error::ErrorKind::Many0Count => todo!(), - error::ErrorKind::Many1Count => todo!(), - error::ErrorKind::Float => todo!(), - error::ErrorKind::Satisfy => todo!(), - error::ErrorKind::Fail => todo!(), + kind => ErrorKind::InternalError(kind), }; Self { context: input, kind } } @@ -149,7 +100,7 @@ impl<'a> Display for Error<'a> { ErrorKind::ExpectedValue if input.trim().is_empty() => { writeln!(f, "Was expecting a value but instead got nothing.")? } - ErrorKind::MissingClosingDelimiter(c) => { + ErrorKind::MissingClosingDelimiter(c) => { writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", input, c)? @@ -176,13 +127,11 @@ impl<'a> Display for Error<'a> { ErrorKind::Char(c) => { panic!("Tried to display a char error with `{}`", c) } - ErrorKind::ReservedKeyword => writeln!(f, "reserved keyword")?, - ErrorKind::UnexpectedInput(ref v) => writeln!(f, "Unexpected input found `{}`, vec: `{:?}`", input, v)?, - ErrorKind::Context(_) => todo!(), - ErrorKind::Unreachable => writeln!( + ErrorKind::InternalError(kind) => writeln!( f, - "Encountered an internal error while parsing your filter. Please file an issue" + "Encountered an internal `{:?}` error while parsing your filter. Please file an issue", kind )?, + ErrorKind::External(ref error) => writeln!(f, "{}", error)?, } write!( f, From 7328ffb0340a7f092b251f345be1c6339dcb7431 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 16:20:53 +0100 Subject: [PATCH 1134/1889] stop panicking in case of internal error --- filter_parser/fuzz/fuzz_targets/parse.rs | 11 ++++++++--- filter_parser/src/error.rs | 7 ++++++- filter_parser/src/lib.rs | 12 ++++++------ filter_parser/src/value.rs | 10 ++++++---- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/filter_parser/fuzz/fuzz_targets/parse.rs b/filter_parser/fuzz/fuzz_targets/parse.rs index 99d4a03a6..6d0069c15 100644 --- a/filter_parser/fuzz/fuzz_targets/parse.rs +++ b/filter_parser/fuzz/fuzz_targets/parse.rs @@ -1,13 +1,18 @@ #![no_main] -use filter_parser::FilterCondition; +use filter_parser::{ErrorKind, FilterCondition}; use libfuzzer_sys::fuzz_target; fuzz_target!(|data: &[u8]| { if let Ok(s) = std::str::from_utf8(data) { - // When we are fuzzing the parser we can get stack overflow really easily. + // When we are fuzzing the parser we can get a stack overflow very easily. // But since this doesn't happen with a normal build we are just going to limit the fuzzer to 500 characters.
if s.len() < 500 { - let _ = FilterCondition::parse(s); + match FilterCondition::parse(s) { + Err(e) if matches!(e.kind(), ErrorKind::InternalError(_)) => { + panic!("Found an internal error: `{:?}`", e) + } + _ => (), + } } } }); diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs index fbfbbe30b..a0ea2efac 100644 --- a/filter_parser/src/error.rs +++ b/filter_parser/src/error.rs @@ -63,9 +63,14 @@ pub enum ErrorKind<'a> { } impl<'a> Error<'a> { - pub fn kind(context: Span<'a>, kind: ErrorKind<'a>) -> Self { + pub fn kind(&self) -> &ErrorKind<'a> { + &self.kind + } + + pub fn new_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> Self { Self { context, kind } } + pub fn char(self) -> char { match self.kind { ErrorKind::Char(c) => c, diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 9335ef185..31aa973ab 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -162,12 +162,12 @@ fn parse_geo_radius(input: Span) -> IResult<FilterCondition> { // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure cut(delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')'))), )(input) - .map_err(|e| e.map(|_| Error::kind(input, ErrorKind::Geo))); + .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::Geo))); let (input, args) = parsed?; if args.len() != 3 { - return Err(nom::Err::Failure(Error::kind(input, ErrorKind::Geo))); + return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::Geo))); } let res = FilterCondition::GeoLowerThan { @@ -186,9 +186,9 @@ fn parse_geo_point(input: Span) -> IResult<FilterCondition> { // if we were able to parse `_geoPoint` we are going to return a Failure whatever happens next. cut(delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')'))), ))(input) - .map_err(|e| e.map(|_| Error::kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; + .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; // if we succeeded we still return a Failure because geoPoints are not allowed - Err(nom::Err::Failure(Error::kind(input, ErrorKind::ReservedGeo("_geoPoint")))) + Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } /// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to fn parse_primary(input: Span) -> IResult<FilterCondition> { alt(( - delimited( ws(char('(')), cut(parse_expression), cut_with_err(ws(char(')')), |c| { - Error::kind(input, ErrorKind::MissingClosingDelimiter(c.char())) + Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(c.char())) }), ), |c| parse_geo_radius(c), @@ -209,7 +209,7 @@ fn parse_primary(input: Span) -> IResult<FilterCondition> { |c| parse_geo_point(c), ))(input) // if the inner parsers did not match enough information to return an accurate error - .map_err(|e| e.map_err(|_| Error::kind(input, ErrorKind::InvalidPrimary))) + .map_err(|e| e.map_err(|_| Error::new_from_kind(input, ErrorKind::InvalidPrimary))) } /// expression = or diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs index 5f4677a2e..79fc00acd 100644 --- a/filter_parser/src/value.rs +++ b/filter_parser/src/value.rs @@ -15,11 +15,11 @@ pub fn parse_value(input: Span) -> IResult<Token> { return Err(err); } match parse_geo_radius(input) { - Ok(_) => return Err(nom::Err::Failure(Error::kind(input, ErrorKind::MisusedGeo))), + Ok(_) => return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeo))), // if we encountered a failure it means the user badly wrote a
_geoRadius filter. // But instead of showing him how to fix his syntax we are going to tell him he should not use this filter as a value. + Err(e) if e.is_failure() => { + return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeo))) } _ => (), } @@ -45,9 +45,11 @@ pub fn parse_value(input: Span) -> IResult<Token> { )(input) .map(|(s, t)| (s, t.into())) // if we found nothing in the alt it means the user did not input any value - .map_err(|e| e.map_err(|_| Error::kind(input, ErrorKind::ExpectedValue))) + .map_err(|e| e.map_err(|_| Error::new_from_kind(input, ErrorKind::ExpectedValue))) // if we encountered a failure it means the user really tried to input a value, but had an unmatched quote - .map_err(|e| e.map_fail(|c| Error::kind(input, ErrorKind::MissingClosingDelimiter(c.char())))) + .map_err(|e| { + e.map_fail(|c| Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(c.char()))) + }) } fn is_key_component(c: char) -> bool { From a58bc5bebbef23550b46ba456c2db3271f6cb2f9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 4 Nov 2021 17:24:55 +0100 Subject: [PATCH 1135/1889] recreate most filter error except for the geosearch --- filter_parser/src/error.rs | 4 + filter_parser/src/lib.rs | 4 + milli/src/error.rs | 4 +- milli/src/search/facet/filter_condition.rs | 93 +++++++++++++++------- 4 files changed, 73 insertions(+), 32 deletions(-) diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs index a0ea2efac..a1bbac47a 100644 --- a/filter_parser/src/error.rs +++ b/filter_parser/src/error.rs @@ -71,6 +71,10 @@ impl<'a> Error<'a> { Self { context, kind } } + pub fn new_from_external(context: Span<'a>, error: impl std::error::Error) -> Self { + Self::new_from_kind(context, ErrorKind::External(error.to_string())) + } + pub fn char(self) -> char { match self.kind { ErrorKind::Char(c) => c, diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 31aa973ab..e6f8a75d1 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -69,6 +69,10 @@ impl<'a> Token<'a> { pub fn new(position: Span<'a>) -> Self { Self { position, inner: &position } } + + pub fn as_external_error(&self, error: impl std::error::Error) -> Error<'a> { + Error::new_from_external(self.position, error) + } } impl<'a> From<Span<'a>> for Token<'a> { diff --git a/milli/src/error.rs b/milli/src/error.rs index c0ce101c8..3d744da5c 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -59,7 +59,7 @@ pub enum UserError { InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, InvalidGeoField { document_id: Value, object: Value }, - InvalidFilter { input: String }, + InvalidFilter(String), InvalidSortName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, @@ -207,7 +207,7 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InvalidFilter { input } => write!(f, "parser error {}", input), + Self::InvalidFilter(input) => write!(f, "{}", input), Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 13622a134 100644 ---
diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index d0c32c8f4..13622a134 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -1,4 +1,4 @@
-use std::fmt::Debug;
+use std::fmt::{Debug, Display};
 use std::ops::Bound::{self, Excluded, Included};
 use std::str::FromStr;
 
@@ -20,18 +20,50 @@ pub struct Filter<'a> {
     condition: FilterCondition<'a>,
 }
 
-fn parse<T: FromStr>(tok: &Token) -> Result<T> {
+#[derive(Debug)]
+enum FilterError<'a> {
+    AttributeNotFilterable { attribute: &'a str, filterable: String },
+    BadGeo(&'a str),
+    Reserved(&'a str),
+}
+impl<'a> std::error::Error for FilterError<'a> {}
+
+impl<'a> Display for FilterError<'a> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::AttributeNotFilterable { attribute, filterable } => write!(
+                f,
+                "Attribute `{}` is not filterable. Available filterable attributes are: `{}`.",
+                attribute, filterable,
+            ),
+            Self::Reserved(keyword) => write!(
+                f,
+                "`{}` is a reserved keyword and thus can't be used as a filter expression.",
+                keyword
+            ),
+            Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword),
+        }
+    }
+}
+
+impl<'a> From<FPError<'a>> for Error {
+    fn from(error: FPError<'a>) -> Self {
+        Self::UserError(UserError::InvalidFilter(error.to_string()))
+    }
+}
+
+fn parse<T>(tok: &Token) -> Result<T>
+where
+    T: FromStr,
+    T::Err: std::error::Error,
+{
     match tok.inner.parse::<T>() {
         Ok(t) => Ok(t),
-        Err(_e) => Err(UserError::InvalidFilter {
-            input: format!(
-                "Could not parse `{}` at line {} and offset {}",
-                tok.inner,
-                tok.position.location_line(),
-                tok.position.get_column()
-            ),
+        Err(e) => {
+            Err(UserError::InvalidFilter(FPError::new_from_external(tok.position, e).to_string())
+                .into())
         }
-        .into()),
     }
 }
@@ -90,7 +122,7 @@ impl<'a> Filter<'a> {
     pub fn from_str(expression: &'a str) -> Result<Self> {
         let condition = match FilterCondition::parse(expression) {
             Ok(fc) => Ok(fc),
-            Err(e) => Err(Error::UserError(UserError::InvalidFilter { input: e.to_string() })),
+            Err(e) => Err(Error::UserError(UserError::InvalidFilter(e.to_string()))),
         }?;
         Ok(Self { condition })
     }
@@ -299,25 +331,26 @@ impl<'a> Filter<'a> {
             Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op)
         } else {
             match fid.inner {
-                // TODO update the error messages according to the spec
-                "_geo" => {
-                    return Err(UserError::InvalidFilter { input: format!("Tried to use _geo in a filter, you probably wanted to use _geoRadius(latitude, longitude, radius)") })?;
+                attribute @ "_geo" => {
+                    return Err(fid.as_external_error(FilterError::BadGeo(attribute)))?;
                 }
-                "_geoDistance" => {
-                    return Err(UserError::InvalidFilter {
-                        input: format!("Reserved field _geoDistance"),
-                    })?;
+                attribute if attribute.starts_with("_geoPoint(") => {
+                    return Err(fid.as_external_error(FilterError::BadGeo("_geoPoint")))?;
                 }
-                fid if fid.starts_with("_geoPoint(") => {
-                    return Err(UserError::InvalidFilter { input: format!("_geoPoint only available in sort. You wanted to use _geoRadius") })?;
+                attribute @ "_geoDistance" => {
+                    return Err(fid.as_external_error(FilterError::Reserved(attribute)))?;
                 }
-                fid => {
-                    return Err(UserError::InvalidFilter {
-                        input: format!(
-                            "Bad filter {}, available filters are {:?}",
-                            fid, filterable_fields
-                        ),
-                    })?;
+                attribute => {
+                    return Err(fid.as_external_error(
+                        FilterError::AttributeNotFilterable {
+                            attribute,
+                            filterable: filterable_fields
+                                .iter()
+                                .map(|(_, s)| s)
+                                .collect::<Vec<_>>()
+                                .join(" "),
+                        },
+                    ))?;
                 }
             }
         }
@@ -356,9 +389,9 @@ impl<'a> Filter<'a> {
             Ok(result)
         } else {
             // TODO TAMO: update the error message
-            return Err(UserError::InvalidFilter {
-                input: format!("You tried to use _geo in a filter, you probably wanted to use _geoRadius"),
-            })?;
+            return Err(UserError::InvalidFilter(format!(
+                "You tried to use _geo in a filter, you probably wanted to use _geoRadius"
+            )))?;
         }
     }
     FilterCondition::GeoGreaterThan { point, radius } => {
From 76d961cc7720bfb5e0dc3ca321948dea2e8b73b9 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Thu, 4 Nov 2021 17:42:06 +0100
Subject: [PATCH 1136/1889] implements the last errors

---
 milli/src/search/facet/filter_condition.rs | 30 +++++++++++++++++-----
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index 13622a134..83873285f 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -24,6 +24,8 @@ pub struct Filter<'a> {
 enum FilterError<'a> {
     AttributeNotFilterable { attribute: &'a str, filterable: String },
     BadGeo(&'a str),
+    BadGeoLat(f64),
+    BadGeoLng(f64),
     Reserved(&'a str),
 }
 impl<'a> std::error::Error for FilterError<'a> {}
@@ -43,6 +45,8 @@ impl<'a> Display for FilterError<'a> {
                 keyword
             ),
             Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword),
+            Self::BadGeoLat(lat) => write!(f, "Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ", lat),
+            Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Latitude must be contained between -180 and 180 degrees. ", lng),
         }
     }
 }
@@ -369,9 +373,17 @@ impl<'a> Filter<'a> {
             FilterCondition::GeoLowerThan { point, radius } => {
                 let filterable_fields = index.fields_ids_map(rtxn)?;
                 if filterable_fields.id("_geo").is_some() {
-                    let base_point = [parse(&point[0])?, parse(&point[1])?];
-                    // TODO TAMO: ensure lat is between -90 and 90
-                    // TODO TAMO: ensure lng is between -180 and 180
+                    let base_point: [f64; 2] = [parse(&point[0])?, parse(&point[1])?];
+                    if !(-90.0..=90.0).contains(&base_point[0]) {
+                        return Err(
+                            point[0].as_external_error(FilterError::BadGeoLat(base_point[0]))
+                        )?;
+                    }
+                    if !(-180.0..=180.0).contains(&base_point[1]) {
+                        return Err(
+                            point[1].as_external_error(FilterError::BadGeoLng(base_point[1]))
+                        )?;
+                    }
                     let radius = parse(&radius)?;
                     let rtree = match index.geo_rtree(rtxn)? {
                        Some(rtree) => rtree,
                         None => return Ok(RoaringBitmap::new()),
@@ -388,10 +400,14 @@ impl<'a> Filter<'a> {
 
             Ok(result)
         } else {
-            // TODO TAMO: update the error message
-            return Err(UserError::InvalidFilter(format!(
-                "You tried to use _geo in a filter, you probably wanted to use _geoRadius"
-            )))?;
+            return Err(point[0].as_external_error(FilterError::AttributeNotFilterable {
+                attribute: "_geo",
+                filterable: filterable_fields
+                    .iter()
+                    .map(|(_, s)| s)
+                    .collect::<Vec<_>>()
+                    .join(" "),
+            }))?;
         }
     }
     FilterCondition::GeoGreaterThan { point, radius } => {
From 27a6a26b4be13936f52172d31a9da4407dbc24d8 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Fri, 5 Nov 2021 10:46:54 +0100
Subject: [PATCH 1137/1889] makes the parse function part of the filter_parser

---
 filter_parser/src/lib.rs                   |  9 +++++++
 milli/src/search/facet/filter_condition.rs | 29 ++++++----------------
 2 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs
index 31aa973ab..d09744196 100644
--- a/filter_parser/src/lib.rs
+++ b/filter_parser/src/lib.rs
@@ -40,6 +40,7 @@ mod error;
 mod value;
 
 use std::fmt::Debug;
+use std::str::FromStr;
 
 pub use condition::{parse_condition, parse_to, Condition};
 use error::{cut_with_err, ExtendNomError};
@@ -73,6 +74,14 @@ impl<'a> Token<'a> {
     pub fn as_external_error(&self, error: impl std::error::Error) -> Error<'a> {
         Error::new_from_external(self.position, error)
     }
+
+    pub fn parse<T>(&self) -> Result<T, Error>
+    where
+        T: FromStr,
+        T::Err: std::error::Error,
+    {
+        self.inner.parse().map_err(|e| self.as_external_error(e))
+    }
 }
 
 impl<'a> From<Span<'a>> for Token<'a> {
diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index 83873285f..164e9aed5 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -1,6 +1,5 @@
 use std::fmt::{Debug, Display};
 use std::ops::Bound::{self, Excluded, Included};
-use std::str::FromStr;
 
 use either::Either;
 pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token};
@@ -57,20 +56,6 @@ impl<'a> From<FPError<'a>> for Error {
     }
 }
 
-fn parse<T>(tok: &Token) -> Result<T>
-where
-    T: FromStr,
-    T::Err: std::error::Error,
-{
-    match tok.inner.parse::<T>() {
-        Ok(t) => Ok(t),
-        Err(e) => {
-            Err(UserError::InvalidFilter(FPError::new_from_external(tok.position, e).to_string())
-                .into())
-        }
-    }
-}
-
 impl<'a> From<Filter<'a>> for FilterCondition<'a> {
     fn from(f: Filter<'a>) -> Self {
         f.condition
@@ -254,11 +239,11 @@ impl<'a> Filter<'a> {
         // field id and the level.
 
         let (left, right) = match operator {
-            Condition::GreaterThan(val) => (Excluded(parse(val)?), Included(f64::MAX)),
-            Condition::GreaterThanOrEqual(val) => (Included(parse(val)?), Included(f64::MAX)),
-            Condition::LowerThan(val) => (Included(f64::MIN), Excluded(parse(val)?)),
-            Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(parse(val)?)),
-            Condition::Between { from, to } => (Included(parse(from)?), Included(parse(to)?)),
+            Condition::GreaterThan(val) => (Excluded(val.parse()?), Included(f64::MAX)),
+            Condition::GreaterThanOrEqual(val) => (Included(val.parse()?), Included(f64::MAX)),
+            Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)),
+            Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)),
+            Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)),
             Condition::Equal(val) => {
                 let (_original_value, string_docids) = strings_db
                     .get(rtxn, &(field_id, &val.inner.to_lowercase()))?
@@ -373,7 +358,7 @@ impl<'a> Filter<'a> {
             FilterCondition::GeoLowerThan { point, radius } => {
                 let filterable_fields = index.fields_ids_map(rtxn)?;
                 if filterable_fields.id("_geo").is_some() {
-                    let base_point: [f64; 2] = [parse(&point[0])?, parse(&point[1])?];
+                    let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?];
                     if !(-90.0..=90.0).contains(&base_point[0]) {
                         return Err(
                             point[0].as_external_error(FilterError::BadGeoLat(base_point[0]))
@@ -384,7 +369,7 @@ impl<'a> Filter<'a> {
                             point[1].as_external_error(FilterError::BadGeoLng(base_point[1]))
                         )?;
                     }
-                    let radius = parse(&radius)?;
+                    let radius = radius.parse()?;
                     let rtree = match index.geo_rtree(rtxn)? {
                         Some(rtree) => rtree,
                         None => return Ok(RoaringBitmap::new()),
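Making `parse` a method on `Token` gives every call site a position-aware error for free, as the `Condition` arms above now show with `val.parse()?`. A minimal sketch of the new helper in isolation, assuming the `Token::parse` signature added in this patch:

```rust
use filter_parser::Token;

// Any `FromStr` target works; on failure the underlying error is wrapped
// as an external error anchored at the token's span in the filter string.
fn print_radius(token: &Token) {
    match token.parse::<f64>() {
        Ok(radius) => println!("radius: {} meters", radius),
        Err(e) => eprintln!("{}", e),
    }
}
```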
From 070ec9bd97c48cf55519463d8faa4ca8f4cb6f26 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Fri, 5 Nov 2021 17:45:20 +0100
Subject: [PATCH 1138/1889] small update on the README

---
 filter_parser/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/filter_parser/README.md b/filter_parser/README.md
index 44ffdada3..0999b4340 100644
--- a/filter_parser/README.md
+++ b/filter_parser/README.md
@@ -25,8 +25,9 @@ cargo install cargo-fuzz
 ```
 
 ### Run
+When the fuzzer runs the filter parser, it triggers a stack overflow very quickly. We can avoid this problem by limiting the `max_len` of [libfuzzer](https://llvm.org/docs/LibFuzzer.html) to 500 characters.
 ```
-cargo fuzz run parse
+cargo fuzz run parse -- -max_len=500
 ```
 
 ## What to do if you find a bug in the parser
From b249989befa31a9baf0e340f20706f065f79f129 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Sat, 6 Nov 2021 01:32:12 +0100
Subject: [PATCH 1139/1889] fix most of the tests

---
 milli/src/search/facet/filter_condition.rs | 351 ++++-----------------
 milli/src/update/delete_documents.rs       |   4 +-
 milli/src/update/settings.rs               |   5 +-
 milli/tests/search/filters.rs              |   8 +-
 4 files changed, 66 insertions(+), 302 deletions(-)

diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index 164e9aed5..acab64171 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -14,7 +14,7 @@ use crate::heed_codec::facet::{
 };
 use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result};
 
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Filter<'a> {
     condition: FilterCondition<'a>,
 }
@@ -45,7 +45,7 @@ impl<'a> Display for FilterError<'a> {
                 keyword
             ),
             Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword),
             Self::BadGeoLat(lat) => write!(f, "Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ", lat),
-            Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Latitude must be contained between -180 and 180 degrees. ", lng),
+            Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ", lng),
         }
     }
 }
@@ -426,275 +426,64 @@ mod tests {
     use crate::update::Settings;
     use crate::Index;
 
-    #[test]
-    fn number() {
-        let path = tempfile::tempdir().unwrap();
-        let mut options = EnvOpenOptions::new();
-        options.map_size(10 * 1024 * 1024); // 10 MB
-        let index = Index::new(options, &path).unwrap();
-
-        // Set the filterable fields to be the channel.
- let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("timestamp"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { "timestamp".into() }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Operator(0, Between(22.0, 44.0)); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, LowerThan(22.0))), - Box::new(FilterCondition::Operator(0, GreaterThan(44.0))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn compare() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp"), S("id")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") ,S("id")}); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); - let expected = FilterCondition::Operator(0, LowerThan(20.0)); - assert_eq!(condition, expected); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "id < 200").unwrap(); - let expected = FilterCondition::Operator(2, LowerThan(200.0)); - assert_eq!(condition, expected); - } - - #[test] - fn parentheses() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. 
- let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(1, Between(22.0, 44.0))), - Box::new(FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Operator(1, LowerThan(22.0))), - Box::new(FilterCondition::Operator(1, GreaterThan(44.0))), - )), - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - } - #[test] fn from_array() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - // Simple array with Left - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, _, _, &str>( - &rtxn, - &index, - vec![Either::Left(["channel = mv"])], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = mv").unwrap(); + let condition = Filter::from_array(vec![Either::Left(["channel = mv"])]).unwrap().unwrap(); + let expected = Filter::from_str("channel = mv").unwrap(); assert_eq!(condition, expected); // Simple array with Right - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( - &rtxn, - &index, - vec![Either::Right("channel = mv")], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = mv").unwrap(); + let condition = Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = mv")]) + .unwrap() + .unwrap(); + let expected = Filter::from_str("channel = mv").unwrap(); assert_eq!(condition, expected); // Array with Left and escaped quote - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, _, _, &str>( - &rtxn, - &index, - vec![Either::Left(["channel = \"Mister Mv\""])], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = \"Mister Mv\"").unwrap(); + let condition = + Filter::from_array(vec![Either::Left(["channel = \"Mister Mv\""])]).unwrap().unwrap(); + let expected = Filter::from_str("channel = \"Mister Mv\"").unwrap(); assert_eq!(condition, expected); // Array with Right and escaped quote - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( - &rtxn, - &index, - vec![Either::Right("channel = \"Mister Mv\"")], - ) - .unwrap() - .unwrap(); - let expected = 
FilterCondition::from_str(&rtxn, &index, "channel = \"Mister Mv\"").unwrap(); + let condition = + Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = \"Mister Mv\"")]) + .unwrap() + .unwrap(); + let expected = Filter::from_str("channel = \"Mister Mv\"").unwrap(); assert_eq!(condition, expected); // Array with Left and escaped simple quote - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, _, _, &str>( - &rtxn, - &index, - vec![Either::Left(["channel = 'Mister Mv'"])], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = 'Mister Mv'").unwrap(); + let condition = + Filter::from_array(vec![Either::Left(["channel = 'Mister Mv'"])]).unwrap().unwrap(); + let expected = Filter::from_str("channel = 'Mister Mv'").unwrap(); assert_eq!(condition, expected); // Array with Right and escaped simple quote - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, Option<&str>, _, _>( - &rtxn, - &index, - vec![Either::Right("channel = 'Mister Mv'")], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "channel = 'Mister Mv'").unwrap(); + let condition = + Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = 'Mister Mv'")]) + .unwrap() + .unwrap(); + let expected = Filter::from_str("channel = 'Mister Mv'").unwrap(); assert_eq!(condition, expected); // Simple with parenthesis - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array::<_, _, _, &str>( - &rtxn, - &index, - vec![Either::Left(["(channel = mv)"])], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str(&rtxn, &index, "(channel = mv)").unwrap(); + let condition = + Filter::from_array(vec![Either::Left(["(channel = mv)"])]).unwrap().unwrap(); + let expected = Filter::from_str("(channel = mv)").unwrap(); assert_eq!(condition, expected); // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array( - &rtxn, - &index, - vec![ - Either::Right("channel = gotaga"), - Either::Left(vec!["timestamp = 44", "channel != ponce"]), - ], - ) + let condition = Filter::from_array(vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ]) .unwrap() .unwrap(); - let expected = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga AND (timestamp = 44 OR channel != ponce)", - ) - .unwrap(); - assert_eq!(condition, expected); - } - - #[test] - fn geo_radius() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - // basic test - let condition = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); - let expected = FilterCondition::Operator(0, GeoLowerThan([12., 13.0005], 2000.)); - assert_eq!(condition, expected); - - // test the negation of the GeoLowerThan - let condition = - FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); - let expected = FilterCondition::Operator(0, GeoGreaterThan([50., 18.], 2000.500)); - assert_eq!(condition, expected); - - // composition of multiple operations - let condition = FilterCondition::from_str( - &rtxn, - &index, - "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), - Box::new(FilterCondition::Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), - )), - Box::new(FilterCondition::Operator(1, LowerThanOrEqual(10.))), - ); + let expected = + Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)").unwrap(); + println!("\nExpecting: {:#?}\nGot: {:#?}\n", expected, condition); assert_eq!(condition, expected); } @@ -715,62 +504,40 @@ mod tests { let rtxn = index.read_txn().unwrap(); - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have enough parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius have too many parameters - let result = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); + // georadius have a bad latitude + let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!( - error.to_string().contains("Latitude must be contained between -90 and 90 degrees."), + error.to_string().starts_with( + "Bad latitude `-100`. Latitude must be contained between -90 and 90 degrees." 
+ ), "{}", error.to_string() ); // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); + let filter = Filter::from_str("_geoRadius(-90.0000001, 150, 10)").unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().contains( + "Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees." + )); // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); + let filter = Filter::from_str("_geoRadius(-10, 250, 10)").unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!( + error.to_string().contains( + "Bad longitude `250`. Longitude must be contained between -180 and 180 degrees." + ), + "{}", + error.to_string(), + ); // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); + let filter = Filter::from_str("_geoRadius(-10, 180.000001, 10)").unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().contains( + "Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees." + )); } } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 207aed63c..e1a658218 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -567,7 +567,7 @@ mod tests { use super::*; use crate::update::{IndexDocuments, Settings}; - use crate::FilterCondition; + use crate::Filter; #[test] fn delete_documents_with_numbers_as_primary_key() { @@ -667,7 +667,7 @@ mod tests { builder.delete_external_id("1_4"); builder.execute().unwrap(); - let filter = FilterCondition::from_str(&wtxn, &index, "label = sign").unwrap(); + let filter = Filter::from_str("label = sign").unwrap(); let results = index.search(&wtxn).filter(filter).execute().unwrap(); assert!(results.documents_ids.is_empty()); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index dee63c726..f25bceb7b 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -524,7 +524,7 @@ mod tests { use super::*; use crate::error::Error; use crate::update::IndexDocuments; - use crate::{Criterion, FilterCondition, SearchResult}; + use crate::{Criterion, Filter, SearchResult}; #[test] fn set_and_reset_searchable_fields() { @@ -1066,7 +1066,8 @@ mod tests { wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); - FilterCondition::from_str(&rtxn, &index, "toto = 32").unwrap_err(); + let filter = Filter::from_str("toto = 32").unwrap(); + let _ = filter.evaluate(&rtxn, &index).unwrap_err(); } #[test] diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index d992a8e95..99063f9f6 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -1,5 +1,5 @@ use either::{Either, Left, Right}; -use milli::{Criterion, FilterCondition, Search, SearchResult}; +use milli::{Criterion, Filter, 
Search, SearchResult}; use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -13,11 +13,7 @@ macro_rules! test_filter { let rtxn = index.read_txn().unwrap(); let filter_conditions = - FilterCondition::from_array::, &str>>, _, _, _>( - &rtxn, &index, $filter, - ) - .unwrap() - .unwrap(); + Filter::from_array::, &str>>, _>($filter).unwrap().unwrap(); let mut search = Search::new(&rtxn, &index); search.query(search::TEST_QUERY); From 075d9c97c079143ce2e8ccfe810f9402a35c2623 Mon Sep 17 00:00:00 2001 From: Tamo Date: Sat, 6 Nov 2021 16:02:27 +0100 Subject: [PATCH 1140/1889] re-implement the equality between tokens to only compare the inner value --- filter_parser/src/lib.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index d09744196..a1d66819f 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -60,12 +60,18 @@ pub type Span<'a> = LocatedSpan<&'a str, &'a str>; type IResult<'a, Ret> = nom::IResult, Ret, Error<'a>>; -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, Eq)] pub struct Token<'a> { pub position: Span<'a>, pub inner: &'a str, } +impl<'a> PartialEq for Token<'a> { + fn eq(&self, other: &Self) -> bool { + self.inner == other.inner + } +} + impl<'a> Token<'a> { pub fn new(position: Span<'a>) -> Self { Self { position, inner: &position } From 5c01e9bf7cdbcc221011bec721e6ae685ca83017 Mon Sep 17 00:00:00 2001 From: Tamo Date: Sat, 6 Nov 2021 16:03:49 +0100 Subject: [PATCH 1141/1889] fix the benchmarks --- benchmarks/benches/utils.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 24f5d5343..00bd4e72a 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -9,7 +9,7 @@ use criterion::BenchmarkId; use heed::EnvOpenOptions; use milli::documents::DocumentBatchReader; use milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder}; -use milli::{FilterCondition, Index}; +use milli::{Filter, Index}; use serde_json::{Map, Value}; pub struct Conf<'a> { @@ -117,7 +117,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let mut search = index.search(&rtxn); search.query(query).optional_words(conf.optional_words); if let Some(filter) = conf.filter { - let filter = FilterCondition::from_str(&rtxn, &index, filter).unwrap(); + let filter = Filter::from_str(filter).unwrap(); search.filter(filter); } if let Some(sort) = &conf.sort { From e5af3ac65c183a3e7aa7d4eca6b07f3ee4d537c6 Mon Sep 17 00:00:00 2001 From: Tamo Date: Sat, 6 Nov 2021 16:37:55 +0100 Subject: [PATCH 1142/1889] rename the filter_condition.rs to filter.rs --- milli/src/search/facet/{filter_condition.rs => filter.rs} | 0 milli/src/search/facet/mod.rs | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename milli/src/search/facet/{filter_condition.rs => filter.rs} (100%) diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter.rs similarity index 100% rename from milli/src/search/facet/filter_condition.rs rename to milli/src/search/facet/filter.rs diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index d6f276fbb..c8f91352b 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,9 +1,9 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; -pub use 
self::filter_condition::Filter; +pub use self::filter::Filter; mod facet_distribution; mod facet_number; mod facet_string; -mod filter_condition; +mod filter; From 7483c7513a6804184016fcc82117d4bcf0f1ff59 Mon Sep 17 00:00:00 2001 From: Tamo Date: Sun, 7 Nov 2021 01:52:19 +0100 Subject: [PATCH 1143/1889] fix the filterable fields --- milli/src/search/facet/filter.rs | 70 ++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index acab64171..a26c41736 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -26,6 +26,7 @@ enum FilterError<'a> { BadGeoLat(f64), BadGeoLng(f64), Reserved(&'a str), + InternalError, } impl<'a> std::error::Error for FilterError<'a> {} @@ -46,6 +47,7 @@ impl<'a> Display for FilterError<'a> { Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword), Self::BadGeoLat(lat) => write!(f, "Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ", lat), Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ", lng), + Self::InternalError => write!(f, "Internal error while executing this filter."), } } } @@ -315,9 +317,14 @@ impl<'a> Filter<'a> { match &self.condition { FilterCondition::Condition { fid, op } => { - let filterable_fields = index.fields_ids_map(rtxn)?; - if let Some(fid) = filterable_fields.id(&fid.inner.to_lowercase()) { - Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) + let filterable_fields = index.filterable_fields(rtxn)?; + if filterable_fields.contains(&fid.inner.to_lowercase()) { + let field_ids_map = index.fields_ids_map(rtxn)?; + if let Some(fid) = field_ids_map.id(fid.inner) { + Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) + } else { + return Err(fid.as_external_error(FilterError::InternalError))?; + } } else { match fid.inner { attribute @ "_geo" => { @@ -334,8 +341,7 @@ impl<'a> Filter<'a> { FilterError::AttributeNotFilterable { attribute, filterable: filterable_fields - .iter() - .map(|(_, s)| s) + .into_iter() .collect::>() .join(" "), }, @@ -356,8 +362,8 @@ impl<'a> Filter<'a> { } FilterCondition::Empty => Ok(RoaringBitmap::new()), FilterCondition::GeoLowerThan { point, radius } => { - let filterable_fields = index.fields_ids_map(rtxn)?; - if filterable_fields.id("_geo").is_some() { + let filterable_fields = index.filterable_fields(rtxn)?; + if filterable_fields.contains("_geo") { let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?]; if !(-90.0..=90.0).contains(&base_point[0]) { return Err( @@ -387,11 +393,7 @@ impl<'a> Filter<'a> { } else { return Err(point[0].as_external_error(FilterError::AttributeNotFilterable { attribute: "_geo", - filterable: filterable_fields - .iter() - .map(|(_, s)| s) - .collect::>() - .join(" "), + filterable: filterable_fields.into_iter().collect::>().join(" "), }))?; } } @@ -487,6 +489,50 @@ mod tests { assert_eq!(condition, expected); } + #[test] + fn not_filterable() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let rtxn = index.read_txn().unwrap(); + let filter = Filter::from_str("_geoRadius(42, 150, 10)").unwrap(); + let error = 
filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().starts_with( + "Attribute `_geo` is not filterable. Available filterable attributes are: ``." + )); + + let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().starts_with( + "Attribute `dog` is not filterable. Available filterable attributes are: ``." + )); + drop(rtxn); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("title")]); + builder.set_filterable_fields(hashset! { S("title") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().starts_with( + "Attribute `_geo` is not filterable. Available filterable attributes are: `title`." + )); + + let filter = Filter::from_str("name = 12").unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().starts_with( + "Attribute `name` is not filterable. Available filterable attributes are: `title`." + )); + } + #[test] fn geo_radius_error() { let path = tempfile::tempdir().unwrap(); From 959ca66125c9f70c20e6f54b44c49ff51a3014ff Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 8 Nov 2021 15:30:26 +0100 Subject: [PATCH 1144/1889] improve the error diagnostic when parsing values --- filter_parser/src/error.rs | 4 +++ filter_parser/src/lib.rs | 1 + filter_parser/src/value.rs | 59 ++++++++++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs index a1bbac47a..d52b17200 100644 --- a/filter_parser/src/error.rs +++ b/filter_parser/src/error.rs @@ -67,6 +67,10 @@ impl<'a> Error<'a> { &self.kind } + pub fn context(&self) -> &Span<'a> { + &self.context + } + pub fn new_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> Self { Self { context, kind } } diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index a1d66819f..7db80888b 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -551,6 +551,7 @@ pub mod tests { ("channel = Ponce = 12", "Found unexpected characters at the end of the filter: `= 12`. 
You probably forgot an `OR` or an `AND` rule."),
             ("channel = ", "Was expecting a value but instead got nothing."),
             ("channel = 🐻", "Was expecting a value but instead got `🐻`."),
+            ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."),
             ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `OR`."),
             ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `AND`."),
             ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `channel Ponce`."),
diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs
index 79fc00acd..4c769fe5f 100644
--- a/filter_parser/src/value.rs
+++ b/filter_parser/src/value.rs
@@ -9,7 +9,10 @@ use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span,
 
 /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS*
 pub fn parse_value(input: Span) -> IResult<Token> {
-    // before anything we want to check if the user is misusing a geo expression
+    // to get a better diagnostic message we are going to strip the leading whitespace from the input right now
+    let (input, _) = take_while(char::is_whitespace)(input)?;
+
+    // then, we want to check if the user is misusing a geo expression
     let err = parse_geo_point(input).unwrap_err();
     if err.is_failure() {
         return Err(err);
@@ -29,23 +32,30 @@ pub fn parse_value(input: Span) -> IResult<Token> {
     // doubleQuoted = "\"" (word | spaces)* "\""
     let double_quoted = |input| take_till(|c: char| c == '"')(input);
     // word = (alphanumeric | _ | - | .)+
-    let word = |input| take_while1(is_key_component)(input);
+    let word = take_while1(is_key_component);
+    // this parser is only used when an error is encountered and it parses the
+    // largest string possible that does not contain any “language” syntax.
+    // If we try to parse `name = 🦀 AND language = rust` we want to return an
+    // error saying we could not parse `🦀`, not that no value was found or that
+    // we could not parse `🦀 AND language = rust`.
     // we want to remove the space before entering the alt because if we don't,
     // when we create the errors from the output of the alt we have spaces everywhere
-    let (input, _) = take_while(char::is_whitespace)(input)?;
+    let error_word = take_till::<_, _, Error>(is_syntax_component);
 
     terminated(
         alt((
-            delimited(char('\''), simple_quoted, cut(char('\''))),
-            delimited(char('"'), double_quoted, cut(char('"'))),
+            delimited(char('\''), cut(simple_quoted), cut(char('\''))),
+            delimited(char('"'), cut(double_quoted), cut(char('"'))),
             word,
         )),
         multispace0,
     )(input)
     .map(|(s, t)| (s, t.into()))
-    // if we found nothing in the alt it means the user did not input any value
-    .map_err(|e| e.map_err(|_| Error::new_from_kind(input, ErrorKind::ExpectedValue)))
+    // if we found nothing in the alt it means the user specified something that was not recognized as a value
+    .map_err(|e: nom::Err<Error>| {
+        e.map_err(|_| Error::new_from_kind(error_word(input).unwrap().1, ErrorKind::ExpectedValue))
+    })
     // if we encountered a failure it means the user really tried to input a value, but had an unmatched quote
     .map_err(|e| {
         e.map_fail(|c| Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(c.char())))
     })
 }
 
 fn is_key_component(c: char) -> bool {
     c.is_alphanumeric() || ['_', '-', '.'].contains(&c)
 }
 
+fn is_syntax_component(c: char) -> bool {
+    c.is_whitespace() || ['(', ')', '=', '<', '>', '!'].contains(&c)
+}
+
 #[cfg(test)]
-pub mod tests {
+pub mod test {
+    use nom::Finish;
+
     use super::*;
     use crate::tests::rtok;
@@ -82,6 +98,7 @@ pub mod tests {
             ("\" some spaces \"", rtok("\"", " some spaces ")),
             ("\"cha'nnel\"", rtok("'", "cha'nnel")),
             ("\"cha'nnel\"", rtok("'", "cha'nnel")),
+            ("I'm tamo", rtok("'m tamo", "I")),
         ];
 
         for (input, expected) in test_case {
@@ -98,4 +115,30 @@ pub mod tests {
             assert_eq!(value, expected, "Filter `{}` failed.", input);
         }
     }
+
+    #[test]
+    fn diagnostic() {
+        let test_case = [
+            ("🦀", "🦀"),
+            (" 🦀", "🦀"),
+            ("🦀 AND crab = truc", "🦀"),
+            ("🦀_in_name", "🦀_in_name"),
+            (" (name = ...", ""),
+        ];
+
+        for (input, expected) in test_case {
+            let input = Span::new_extra(input, input);
+            let result = parse_value(input);
+
+            assert!(
+                result.is_err(),
+                "Filter `{}` wasn’t supposed to be parsed but it did with the following result: `{:?}`",
+                expected,
+                result.unwrap()
+            );
+            // get the inner string referenced in the error
+            let value = *result.finish().unwrap_err().context().fragment();
+            assert_eq!(value, expected, "Filter `{}` was supposed to fail with the following value: `{}`, but it failed with: `{}`.", input, expected, value);
+        }
+    }
 }
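The `error_word` parser above exists purely for diagnostics: when nothing matches, the input is re-scanned and the largest run free of filter syntax is reported, so the error points at `🦀` rather than at `🦀 AND language = rust`. A stand-alone sketch of the same idea with plain nom (toy types, not the crate's own definitions):

```rust
use nom::bytes::complete::take_till;
use nom::IResult;

// Anything that looks like filter syntax ends the diagnostic word,
// mirroring the `is_syntax_component` predicate introduced above.
fn is_syntax_component(c: char) -> bool {
    c.is_whitespace() || ['(', ')', '=', '<', '>', '!'].contains(&c)
}

fn error_word(input: &str) -> IResult<&str, &str> {
    take_till(is_syntax_component)(input)
}

fn main() {
    let (_rest, word) = error_word("🦀 AND language = rust").unwrap();
    assert_eq!(word, "🦀");
}
```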
From 21d115dcbbc5e20d68b3483ece1f5089b9ad5618 Mon Sep 17 00:00:00 2001
From: Irevoire
Date: Mon, 8 Nov 2021 17:53:41 +0100
Subject: [PATCH 1145/1889] remove greedy-error

---
 filter_parser/Cargo.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/filter_parser/Cargo.toml b/filter_parser/Cargo.toml
index 2bdb3316a..80767d5c4 100644
--- a/filter_parser/Cargo.toml
+++ b/filter_parser/Cargo.toml
@@ -8,4 +8,3 @@ edition = "2021"
 [dependencies]
 nom = "7.0.0"
 nom_locate = "4.0.0"
-nom-greedyerror = "0.4.0"
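The next patch drops wrappers like `|c| parse_value(c)` in favour of passing the function directly: a plain `fn` item already implements nom's `Parser` trait, so the closure adds noise without changing behaviour. A tiny self-contained illustration (toy parser, not the crate's code):

```rust
use nom::bytes::complete::tag;
use nom::sequence::tuple;
use nom::IResult;

fn word(input: &str) -> IResult<&str, &str> {
    tag("hello")(input)
}

fn main() {
    // Both forms parse identically; the closure is redundant.
    let verbose: IResult<&str, _> = tuple((|c| word(c), tag(" "), |c| word(c)))("hello hello");
    let direct: IResult<&str, _> = tuple((word, tag(" "), word))("hello hello");
    assert_eq!(verbose.is_ok(), direct.is_ok());
}
```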
From 15bd14297efbcf5ae901ab0e73ad6201473485fa Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 9 Nov 2021 00:45:46 +0100
Subject: [PATCH 1146/1889] Remove useless closure

Co-authored-by: marin
---
 filter_parser/src/condition.rs | 5 ++---
 filter_parser/src/value.rs     | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/filter_parser/src/condition.rs b/filter_parser/src/condition.rs
index faacceb72..b58a9f9f9 100644
--- a/filter_parser/src/condition.rs
+++ b/filter_parser/src/condition.rs
@@ -43,9 +43,8 @@ impl<'a> Condition<'a> {
 /// condition = value ("==" | ">" ...) value
 pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
     let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("=")));
-    let (input, (key, op, value)) = tuple((|c| parse_value(c), operator, cut(parse_value)))(input)?;
+    let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?;
 
-    let fid = key;
     match *op.fragment() {
         "=" => {
@@ -73,7 +72,7 @@ pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
 /// to = value value TO value
 pub fn parse_to(input: Span) -> IResult<FilterCondition> {
     let (input, (key, from, _, to)) =
-        tuple((|c| parse_value(c), |c| parse_value(c), tag("TO"), cut(parse_value)))(input)?;
+        tuple((parse_value, parse_value, tag("TO"), cut(parse_value)))(input)?;
 
     Ok((
         input,
diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs
index 4c769fe5f..d82eda008 100644
--- a/filter_parser/src/value.rs
+++ b/filter_parser/src/value.rs
@@ -28,9 +28,9 @@ pub fn parse_value(input: Span) -> IResult<Token> {
     }
 
     // singleQuoted = "'" .* all but quotes "'"
-    let simple_quoted = |input| take_till(|c: char| c == '\'')(input);
+    let simple_quoted = take_till(|c: char| c == '\'');
     // doubleQuoted = "\"" (word | spaces)* "\""
-    let double_quoted = |input| take_till(|c: char| c == '"')(input);
+    let double_quoted = take_till(|c: char| c == '"');
     // word = (alphanumeric | _ | - | .)+
     let word = take_while1(is_key_component);
From ef0d5a824093d2365ee635e061980e446b65e4d6 Mon Sep 17 00:00:00 2001
From: Irevoire
Date: Tue, 9 Nov 2021 00:49:13 +0100
Subject: [PATCH 1147/1889] flatten a match

---
 filter_parser/src/condition.rs | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/filter_parser/src/condition.rs b/filter_parser/src/condition.rs
index b58a9f9f9..cff2f2fdd 100644
--- a/filter_parser/src/condition.rs
+++ b/filter_parser/src/condition.rs
@@ -45,28 +45,17 @@ pub fn parse_condition(input: Span) -> IResult<FilterCondition> {
     let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("=")));
     let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?;
 
-    match *op.fragment() {
-        "=" => {
-            let k = FilterCondition::Condition { fid, op: Equal(value) };
-            Ok((input, k))
-        }
-        "!=" => {
-            let k = FilterCondition::Condition { fid, op: NotEqual(value) };
-            Ok((input, k))
-        }
-        ">" | "<" | "<=" | ">=" => {
-            let k = match *op.fragment() {
-                ">" => FilterCondition::Condition { fid, op: GreaterThan(value) },
-                "<" => FilterCondition::Condition { fid, op: LowerThan(value) },
-                "<=" => FilterCondition::Condition { fid, op: LowerThanOrEqual(value) },
-                ">=" => FilterCondition::Condition { fid, op: GreaterThanOrEqual(value) },
-                _ => unreachable!(),
-            };
-            Ok((input, k))
-        }
+    let condition = match *op.fragment() {
+        "=" => FilterCondition::Condition { fid, op: Equal(value) },
+        "!=" => FilterCondition::Condition { fid, op: NotEqual(value) },
+        ">" => FilterCondition::Condition { fid, op: GreaterThan(value) },
+        "<" => FilterCondition::Condition { fid, op: LowerThan(value) },
+        "<=" => FilterCondition::Condition { fid, op: LowerThanOrEqual(value) },
+        ">=" => FilterCondition::Condition { fid, op: GreaterThanOrEqual(value) },
         _ => unreachable!(),
-    }
+    };
+
+    Ok((input, condition))
 }
 
 /// to = value value TO value
From ea52aff6dc5e0630d2cf0681186b59ffe8eb533d Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 9 Nov 2021 00:50:15 +0100
Subject: [PATCH 
1148/1889] Rename the ExtendNomError trait to NomErrorExt Co-authored-by: marin --- filter_parser/src/error.rs | 4 ++-- filter_parser/src/lib.rs | 2 +- filter_parser/src/value.rs | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs index d52b17200..b162fb554 100644 --- a/filter_parser/src/error.rs +++ b/filter_parser/src/error.rs @@ -5,13 +5,13 @@ use nom::Parser; use crate::{IResult, Span}; -pub trait ExtendNomError { +pub trait NomErrorExt { fn is_failure(&self) -> bool; fn map_err E>(self, op: O) -> nom::Err; fn map_fail E>(self, op: O) -> nom::Err; } -impl ExtendNomError for nom::Err { +impl NomErrorExt for nom::Err { fn is_failure(&self) -> bool { matches!(self, Self::Failure(_)) } diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 7db80888b..c4091fa86 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -43,7 +43,7 @@ use std::fmt::Debug; use std::str::FromStr; pub use condition::{parse_condition, parse_to, Condition}; -use error::{cut_with_err, ExtendNomError}; +use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs index d82eda008..6f7952ebd 100644 --- a/filter_parser/src/value.rs +++ b/filter_parser/src/value.rs @@ -4,7 +4,7 @@ use nom::character::complete::{char, multispace0}; use nom::combinator::cut; use nom::sequence::{delimited, terminated}; -use crate::error::ExtendNomError; +use crate::error::NomErrorExt; use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span, Token}; /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* From 6515838d35ad510af9ade79245278044db96c331 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 00:57:46 +0100 Subject: [PATCH 1149/1889] improve the readability of the _geoPoint thingy in the value --- filter_parser/src/value.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs index 6f7952ebd..b716dab66 100644 --- a/filter_parser/src/value.rs +++ b/filter_parser/src/value.rs @@ -13,9 +13,12 @@ pub fn parse_value(input: Span) -> IResult { let (input, _) = take_while(char::is_whitespace)(input)?; // then, we want to check if the user is misusing a geo expression - let err = parse_geo_point(input).unwrap_err(); - if err.is_failure() { - return Err(err); + // This expression can’t finish without error. + // We want to return an error in case of failure. 
+ if let Err(err) = parse_geo_point(input) { + if err.is_failure() { + return Err(err); + } } match parse_geo_radius(input) { Ok(_) => return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeo))), From 9c36e497d9bbfd9da2a8a6cbe5776128da413683 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 9 Nov 2021 00:58:23 +0100 Subject: [PATCH 1150/1889] Rename the key_component into a value_component Co-authored-by: marin --- filter_parser/src/value.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/filter_parser/src/value.rs b/filter_parser/src/value.rs index b716dab66..b9d929ab0 100644 --- a/filter_parser/src/value.rs +++ b/filter_parser/src/value.rs @@ -35,7 +35,7 @@ pub fn parse_value(input: Span) -> IResult { // doubleQuoted = "\"" (word | spaces)* "\"" let double_quoted = take_till(|c: char| c == '"'); // word = (alphanumeric | _ | - | .)+ - let word = take_while1(is_key_component); + let word = take_while1(is_value_component); // this parser is only used when an error is encountered and it parse the // largest string possible that do not contain any “language” syntax. @@ -65,7 +65,7 @@ pub fn parse_value(input: Span) -> IResult { }) } -fn is_key_component(c: char) -> bool { +fn is_value_component(c: char) -> bool { c.is_alphanumeric() || ['_', '-', '.'].contains(&c) } From bc9daf90410ca09abdb84e0fdc03b74c68d35ce4 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 9 Nov 2021 01:00:42 +0100 Subject: [PATCH 1151/1889] update the bnf Co-authored-by: marin --- filter_parser/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index c4091fa86..40e1fb3d4 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -11,7 +11,7 @@ //! to = value value TO value //! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* //! singleQuoted = "'" .* all but quotes "'" -//! doubleQuoted = "\"" (word | spaces)* "\"" +//! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ //! geoRadius = WS* ~ "_geoRadius(" ~ float ~ "," ~ float ~ "," float ~ ")" //! 
``` From cf98bf37d0994239f059a595fff60762f496f776 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 9 Nov 2021 01:03:02 +0100 Subject: [PATCH 1152/1889] Simplify some closure Co-authored-by: marin --- filter_parser/src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 40e1fb3d4..71e04af03 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -179,7 +179,7 @@ fn parse_geo_radius(input: Span) -> IResult { let parsed = preceded( tuple((multispace0, tag("_geoRadius"))), // if we were able to parse `_geoRadius` and can't parse the rest of the input we returns a failure - cut(delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')'))), + cut(delimited(char('('), separated_list1(tag(","), ws(recognize_float)), char(')'))), )(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::Geo))); @@ -221,11 +221,11 @@ fn parse_primary(input: Span) -> IResult { Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(c.char())) }), ), - |c| parse_geo_radius(c), - |c| parse_condition(c), - |c| parse_to(c), + parse_geo_radius, + parse_condition, + parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact - |c| parse_geo_point(c), + parse_geo_point, ))(input) // if the inner parsers did not match enough information to return an accurate error .map_err(|e| e.map_err(|_| Error::new_from_kind(input, ErrorKind::InvalidPrimary))) From 18eb4b9c51a2979b9170b1eb7c48c4a0742133ac Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 01:04:50 +0100 Subject: [PATCH 1153/1889] fix spaces in the bnf --- filter_parser/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 71e04af03..be9ed9370 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -13,7 +13,7 @@ //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" ~ float ~ "," ~ float ~ "," float ~ ")" +//! geoRadius = WS* ~ "_geoRadius(" ~ WS* ~ float ~ WS* ~ "," ~ WS* ~ float ~ WS* ~ "," float ~ WS* ~ ")" //! ``` //! //! 
Other BNF grammar used to handle some specific errors: From 2c6d08c5197c48c6dec72e0960adb0ff34f43049 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 9 Nov 2021 01:06:03 +0100 Subject: [PATCH 1154/1889] Simplify the tokens to only wrap one span and no inner value Co-authored-by: marin --- filter_parser/src/lib.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index be9ed9370..014a008b1 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -61,24 +61,21 @@ pub type Span<'a> = LocatedSpan<&'a str, &'a str>; type IResult<'a, Ret> = nom::IResult, Ret, Error<'a>>; #[derive(Debug, Clone, Eq)] -pub struct Token<'a> { - pub position: Span<'a>, - pub inner: &'a str, -} +pub struct Token<'a>(Span<'a>); impl<'a> PartialEq for Token<'a> { fn eq(&self, other: &Self) -> bool { - self.inner == other.inner + self.0.fragment() == other.0.fragment() } } impl<'a> Token<'a> { pub fn new(position: Span<'a>) -> Self { - Self { position, inner: &position } + Self(position) } pub fn as_external_error(&self, error: impl std::error::Error) -> Error<'a> { - Error::new_from_external(self.position, error) + Error::new_from_external(self.0, error) } pub fn parse(&self) -> Result @@ -86,13 +83,13 @@ impl<'a> Token<'a> { T: FromStr, T::Err: std::error::Error, { - self.inner.parse().map_err(|e| self.as_external_error(e)) + self.0.parse().map_err(|e| self.as_external_error(e)) } } impl<'a> From> for Token<'a> { fn from(span: Span<'a>) -> Self { - Self { inner: &span, position: span } + Self(span) } } From 9b24f83456d02954d0b5fcd6ab081cf61738cda3 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 10:27:29 +0100 Subject: [PATCH 1155/1889] in case of error return a range of chars position instead of one line and column --- filter_parser/src/error.rs | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs index b162fb554..4580cde4f 100644 --- a/filter_parser/src/error.rs +++ b/filter_parser/src/error.rs @@ -146,12 +146,8 @@ impl<'a> Display for Error<'a> { )?, ErrorKind::External(ref error) => writeln!(f, "{}", error)?, } - write!( - f, - "{}:{} in `{}`.", - self.context.location_line(), - self.context.get_utf8_column(), - self.context.extra, - ) + let base_column = self.context.get_utf8_column(); + let size = self.context.fragment().chars().count(); + write!(f, "{}:{} in `{}`.", base_column, base_column + size, self.context.extra,) } } From a211a9cdcdb309a4e3ef7eb03d14a7b877cc40bf Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 11:19:30 +0100 Subject: [PATCH 1156/1889] update the error format so it can be easily parsed by someone else --- filter_parser/src/error.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/filter_parser/src/error.rs b/filter_parser/src/error.rs index 4580cde4f..401b8d7f3 100644 --- a/filter_parser/src/error.rs +++ b/filter_parser/src/error.rs @@ -109,30 +109,34 @@ impl<'a> Display for Error<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let input = self.context.fragment(); + // When printing our error message we want to escape all `\n` to be sure we keep our format with the + // first line being the diagnostic and the second line being the incriminated filter. 
+ let escaped_input = input.escape_debug(); + match self.kind { ErrorKind::ExpectedValue if input.trim().is_empty() => { writeln!(f, "Was expecting a value but instead got nothing.")? } ErrorKind::MissingClosingDelimiter(c) => { - writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", input, c)? + writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? } ErrorKind::ExpectedValue => { - writeln!(f, "Was expecting a value but instead got `{}`.", input)? + writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { - writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", input)? + writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? } ErrorKind::Geo => { writeln!(f, "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`.")? } ErrorKind::ReservedGeo(name) => { - writeln!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates.", name)? + writeln!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates.", name.escape_debug())? } ErrorKind::MisusedGeo => { writeln!(f, "The `_geoRadius` filter is an operation and can't be used as a value.")? 
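For reference, the `escape_debug` call added above comes from the standard library (`str::escape_debug`); a small self-contained sketch of its effect on quotes and newlines:

```
fn main() {
    let input = "channel = \"ponce\nmv\"";
    // escape_debug renders newlines, quotes, and other special characters
    // as backslash escapes, so the incriminated filter cannot break the
    // two-line error format described in the comment above.
    println!("Expression `{}` is invalid.", input.escape_debug());
    // Prints: Expression `channel = \"ponce\nmv\"` is invalid.
}
```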
@@ -148,6 +152,7 @@ impl<'a> Display for Error<'a> { } let base_column = self.context.get_utf8_column(); let size = self.context.fragment().chars().count(); - write!(f, "{}:{} in `{}`.", base_column, base_column + size, self.context.extra,) + + write!(f, "{}:{} {}", base_column, base_column + size, self.context.extra) } } From 0ea0146e048e09ba2c3f4690c1c1697534e6cf69 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 11:34:10 +0100 Subject: [PATCH 1157/1889] implement deref &str on the tokens --- filter_parser/src/lib.rs | 9 +++++++++ milli/src/search/facet/filter.rs | 16 ++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/filter_parser/src/lib.rs b/filter_parser/src/lib.rs index 014a008b1..6276023a9 100644 --- a/filter_parser/src/lib.rs +++ b/filter_parser/src/lib.rs @@ -40,6 +40,7 @@ mod error; mod value; use std::fmt::Debug; +use std::ops::Deref; use std::str::FromStr; pub use condition::{parse_condition, parse_to, Condition}; @@ -63,6 +64,14 @@ type IResult<'a, Ret> = nom::IResult<Span<'a>, Ret, Error<'a>>; #[derive(Debug, Clone, Eq)] pub struct Token<'a>(Span<'a>); +impl<'a> Deref for Token<'a> { + type Target = &'a str; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + impl<'a> PartialEq for Token<'a> { fn eq(&self, other: &Self) -> bool { self.0.fragment() == other.0.fragment() diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index a26c41736..ec2c0b3eb 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,5 +1,6 @@ use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; +use std::ops::Deref; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; @@ -247,10 +248,9 @@ impl<'a> Filter<'a> { Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), Condition::Equal(val) => { - let (_original_value, string_docids) = strings_db - .get(rtxn, &(field_id, &val.inner.to_lowercase()))? - .unwrap_or_default(); - let number = val.inner.parse::<f64>().ok(); + let (_original_value, string_docids) = + strings_db.get(rtxn, &(field_id, &val.to_lowercase()))?.unwrap_or_default(); + let number = val.parse::<f64>().ok(); let number_docids = match number { Some(n) => { let n = Included(n); @@ -271,7 +271,7 @@ impl<'a> Filter<'a> { return Ok(string_docids | number_docids); } Condition::NotEqual(val) => { - let number = val.inner.parse::<f64>().ok(); + let number = val.parse::<f64>().ok(); let all_numbers_ids = if number.is_some() { index.number_faceted_documents_ids(rtxn, field_id)?
} else { @@ -318,15 +318,15 @@ impl<'a> Filter<'a> { match &self.condition { FilterCondition::Condition { fid, op } => { let filterable_fields = index.filterable_fields(rtxn)?; - if filterable_fields.contains(&fid.inner.to_lowercase()) { + if filterable_fields.contains(&fid.to_lowercase()) { let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.inner) { + if let Some(fid) = field_ids_map.id(&fid) { Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) } else { return Err(fid.as_external_error(FilterError::InternalError))?; } } else { - match fid.inner { + match *fid.deref() { attribute @ "_geo" => { return Err(fid.as_external_error(FilterError::BadGeo(attribute)))?; } From f28600031d6edc3be6ab1709f45d14e9c07b5ccb Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 9 Nov 2021 16:16:28 +0100 Subject: [PATCH 1158/1889] Rename the filter_parser crate into filter-parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- Cargo.toml | 2 +- {filter_parser => filter-parser}/Cargo.toml | 2 +- {filter_parser => filter-parser}/README.md | 0 {filter_parser => filter-parser}/fuzz/Cargo.toml | 4 ++-- {filter_parser => filter-parser}/fuzz/corpus/parse/test_1 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_10 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_11 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_12 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_13 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_14 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_15 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_16 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_17 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_18 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_19 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_2 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_20 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_21 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_22 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_23 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_24 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_25 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_26 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_27 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_28 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_29 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_3 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_30 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_31 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_32 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_33 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_34 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_35 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_36 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_37 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_38 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_39 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_4 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_40 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_41 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_42 | 0 
{filter_parser => filter-parser}/fuzz/corpus/parse/test_43 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_5 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_6 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_7 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_8 | 0 {filter_parser => filter-parser}/fuzz/corpus/parse/test_9 | 0 {filter_parser => filter-parser}/fuzz/fuzz_targets/parse.rs | 0 {filter_parser => filter-parser}/src/condition.rs | 0 {filter_parser => filter-parser}/src/error.rs | 0 {filter_parser => filter-parser}/src/lib.rs | 0 {filter_parser => filter-parser}/src/main.rs | 0 {filter_parser => filter-parser}/src/value.rs | 0 filter_parser/fuzz/.gitignore | 3 --- milli/Cargo.toml | 2 +- 55 files changed, 5 insertions(+), 8 deletions(-) rename {filter_parser => filter-parser}/Cargo.toml (89%) rename {filter_parser => filter-parser}/README.md (100%) rename {filter_parser => filter-parser}/fuzz/Cargo.toml (85%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_1 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_10 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_11 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_12 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_13 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_14 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_15 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_16 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_17 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_18 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_19 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_2 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_20 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_21 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_22 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_23 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_24 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_25 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_26 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_27 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_28 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_29 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_3 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_30 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_31 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_32 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_33 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_34 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_35 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_36 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_37 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_38 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_39 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_4 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_40 (100%) rename {filter_parser => 
filter-parser}/fuzz/corpus/parse/test_41 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_42 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_43 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_5 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_6 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_7 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_8 (100%) rename {filter_parser => filter-parser}/fuzz/corpus/parse/test_9 (100%) rename {filter_parser => filter-parser}/fuzz/fuzz_targets/parse.rs (100%) rename {filter_parser => filter-parser}/src/condition.rs (100%) rename {filter_parser => filter-parser}/src/error.rs (100%) rename {filter_parser => filter-parser}/src/lib.rs (100%) rename {filter_parser => filter-parser}/src/main.rs (100%) rename {filter_parser => filter-parser}/src/value.rs (100%) delete mode 100644 filter_parser/fuzz/.gitignore diff --git a/Cargo.toml b/Cargo.toml index 5d2d47713..6b3e12f07 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["milli", "filter_parser", "http-ui", "benchmarks", "infos", "helpers", "cli"] +members = ["milli", "filter-parser", "http-ui", "benchmarks", "infos", "helpers", "cli"] default-members = ["milli"] [profile.dev] diff --git a/filter_parser/Cargo.toml b/filter-parser/Cargo.toml similarity index 89% rename from filter_parser/Cargo.toml rename to filter-parser/Cargo.toml index 80767d5c4..ee44bcb7f 100644 --- a/filter_parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "filter_parser" +name = "filter-parser" version = "0.1.0" edition = "2021" diff --git a/filter_parser/README.md b/filter-parser/README.md similarity index 100% rename from filter_parser/README.md rename to filter-parser/README.md diff --git a/filter_parser/fuzz/Cargo.toml b/filter-parser/fuzz/Cargo.toml similarity index 85% rename from filter_parser/fuzz/Cargo.toml rename to filter-parser/fuzz/Cargo.toml index 33e604e73..246276f2c 100644 --- a/filter_parser/fuzz/Cargo.toml +++ b/filter-parser/fuzz/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "filter_parser-fuzz" +name = "filter-parser-fuzz" version = "0.0.0" authors = ["Automatically generated"] publish = false @@ -11,7 +11,7 @@ cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4" -[dependencies.filter_parser] +[dependencies.filter-parser] path = ".." 
# Prevent this from interfering with workspaces diff --git a/filter_parser/fuzz/corpus/parse/test_1 b/filter-parser/fuzz/corpus/parse/test_1 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_1 rename to filter-parser/fuzz/corpus/parse/test_1 diff --git a/filter_parser/fuzz/corpus/parse/test_10 b/filter-parser/fuzz/corpus/parse/test_10 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_10 rename to filter-parser/fuzz/corpus/parse/test_10 diff --git a/filter_parser/fuzz/corpus/parse/test_11 b/filter-parser/fuzz/corpus/parse/test_11 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_11 rename to filter-parser/fuzz/corpus/parse/test_11 diff --git a/filter_parser/fuzz/corpus/parse/test_12 b/filter-parser/fuzz/corpus/parse/test_12 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_12 rename to filter-parser/fuzz/corpus/parse/test_12 diff --git a/filter_parser/fuzz/corpus/parse/test_13 b/filter-parser/fuzz/corpus/parse/test_13 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_13 rename to filter-parser/fuzz/corpus/parse/test_13 diff --git a/filter_parser/fuzz/corpus/parse/test_14 b/filter-parser/fuzz/corpus/parse/test_14 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_14 rename to filter-parser/fuzz/corpus/parse/test_14 diff --git a/filter_parser/fuzz/corpus/parse/test_15 b/filter-parser/fuzz/corpus/parse/test_15 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_15 rename to filter-parser/fuzz/corpus/parse/test_15 diff --git a/filter_parser/fuzz/corpus/parse/test_16 b/filter-parser/fuzz/corpus/parse/test_16 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_16 rename to filter-parser/fuzz/corpus/parse/test_16 diff --git a/filter_parser/fuzz/corpus/parse/test_17 b/filter-parser/fuzz/corpus/parse/test_17 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_17 rename to filter-parser/fuzz/corpus/parse/test_17 diff --git a/filter_parser/fuzz/corpus/parse/test_18 b/filter-parser/fuzz/corpus/parse/test_18 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_18 rename to filter-parser/fuzz/corpus/parse/test_18 diff --git a/filter_parser/fuzz/corpus/parse/test_19 b/filter-parser/fuzz/corpus/parse/test_19 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_19 rename to filter-parser/fuzz/corpus/parse/test_19 diff --git a/filter_parser/fuzz/corpus/parse/test_2 b/filter-parser/fuzz/corpus/parse/test_2 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_2 rename to filter-parser/fuzz/corpus/parse/test_2 diff --git a/filter_parser/fuzz/corpus/parse/test_20 b/filter-parser/fuzz/corpus/parse/test_20 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_20 rename to filter-parser/fuzz/corpus/parse/test_20 diff --git a/filter_parser/fuzz/corpus/parse/test_21 b/filter-parser/fuzz/corpus/parse/test_21 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_21 rename to filter-parser/fuzz/corpus/parse/test_21 diff --git a/filter_parser/fuzz/corpus/parse/test_22 b/filter-parser/fuzz/corpus/parse/test_22 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_22 rename to filter-parser/fuzz/corpus/parse/test_22 diff --git a/filter_parser/fuzz/corpus/parse/test_23 b/filter-parser/fuzz/corpus/parse/test_23 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_23 rename to filter-parser/fuzz/corpus/parse/test_23 diff --git 
a/filter_parser/fuzz/corpus/parse/test_24 b/filter-parser/fuzz/corpus/parse/test_24 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_24 rename to filter-parser/fuzz/corpus/parse/test_24 diff --git a/filter_parser/fuzz/corpus/parse/test_25 b/filter-parser/fuzz/corpus/parse/test_25 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_25 rename to filter-parser/fuzz/corpus/parse/test_25 diff --git a/filter_parser/fuzz/corpus/parse/test_26 b/filter-parser/fuzz/corpus/parse/test_26 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_26 rename to filter-parser/fuzz/corpus/parse/test_26 diff --git a/filter_parser/fuzz/corpus/parse/test_27 b/filter-parser/fuzz/corpus/parse/test_27 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_27 rename to filter-parser/fuzz/corpus/parse/test_27 diff --git a/filter_parser/fuzz/corpus/parse/test_28 b/filter-parser/fuzz/corpus/parse/test_28 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_28 rename to filter-parser/fuzz/corpus/parse/test_28 diff --git a/filter_parser/fuzz/corpus/parse/test_29 b/filter-parser/fuzz/corpus/parse/test_29 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_29 rename to filter-parser/fuzz/corpus/parse/test_29 diff --git a/filter_parser/fuzz/corpus/parse/test_3 b/filter-parser/fuzz/corpus/parse/test_3 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_3 rename to filter-parser/fuzz/corpus/parse/test_3 diff --git a/filter_parser/fuzz/corpus/parse/test_30 b/filter-parser/fuzz/corpus/parse/test_30 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_30 rename to filter-parser/fuzz/corpus/parse/test_30 diff --git a/filter_parser/fuzz/corpus/parse/test_31 b/filter-parser/fuzz/corpus/parse/test_31 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_31 rename to filter-parser/fuzz/corpus/parse/test_31 diff --git a/filter_parser/fuzz/corpus/parse/test_32 b/filter-parser/fuzz/corpus/parse/test_32 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_32 rename to filter-parser/fuzz/corpus/parse/test_32 diff --git a/filter_parser/fuzz/corpus/parse/test_33 b/filter-parser/fuzz/corpus/parse/test_33 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_33 rename to filter-parser/fuzz/corpus/parse/test_33 diff --git a/filter_parser/fuzz/corpus/parse/test_34 b/filter-parser/fuzz/corpus/parse/test_34 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_34 rename to filter-parser/fuzz/corpus/parse/test_34 diff --git a/filter_parser/fuzz/corpus/parse/test_35 b/filter-parser/fuzz/corpus/parse/test_35 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_35 rename to filter-parser/fuzz/corpus/parse/test_35 diff --git a/filter_parser/fuzz/corpus/parse/test_36 b/filter-parser/fuzz/corpus/parse/test_36 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_36 rename to filter-parser/fuzz/corpus/parse/test_36 diff --git a/filter_parser/fuzz/corpus/parse/test_37 b/filter-parser/fuzz/corpus/parse/test_37 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_37 rename to filter-parser/fuzz/corpus/parse/test_37 diff --git a/filter_parser/fuzz/corpus/parse/test_38 b/filter-parser/fuzz/corpus/parse/test_38 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_38 rename to filter-parser/fuzz/corpus/parse/test_38 diff --git a/filter_parser/fuzz/corpus/parse/test_39 
b/filter-parser/fuzz/corpus/parse/test_39 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_39 rename to filter-parser/fuzz/corpus/parse/test_39 diff --git a/filter_parser/fuzz/corpus/parse/test_4 b/filter-parser/fuzz/corpus/parse/test_4 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_4 rename to filter-parser/fuzz/corpus/parse/test_4 diff --git a/filter_parser/fuzz/corpus/parse/test_40 b/filter-parser/fuzz/corpus/parse/test_40 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_40 rename to filter-parser/fuzz/corpus/parse/test_40 diff --git a/filter_parser/fuzz/corpus/parse/test_41 b/filter-parser/fuzz/corpus/parse/test_41 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_41 rename to filter-parser/fuzz/corpus/parse/test_41 diff --git a/filter_parser/fuzz/corpus/parse/test_42 b/filter-parser/fuzz/corpus/parse/test_42 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_42 rename to filter-parser/fuzz/corpus/parse/test_42 diff --git a/filter_parser/fuzz/corpus/parse/test_43 b/filter-parser/fuzz/corpus/parse/test_43 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_43 rename to filter-parser/fuzz/corpus/parse/test_43 diff --git a/filter_parser/fuzz/corpus/parse/test_5 b/filter-parser/fuzz/corpus/parse/test_5 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_5 rename to filter-parser/fuzz/corpus/parse/test_5 diff --git a/filter_parser/fuzz/corpus/parse/test_6 b/filter-parser/fuzz/corpus/parse/test_6 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_6 rename to filter-parser/fuzz/corpus/parse/test_6 diff --git a/filter_parser/fuzz/corpus/parse/test_7 b/filter-parser/fuzz/corpus/parse/test_7 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_7 rename to filter-parser/fuzz/corpus/parse/test_7 diff --git a/filter_parser/fuzz/corpus/parse/test_8 b/filter-parser/fuzz/corpus/parse/test_8 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_8 rename to filter-parser/fuzz/corpus/parse/test_8 diff --git a/filter_parser/fuzz/corpus/parse/test_9 b/filter-parser/fuzz/corpus/parse/test_9 similarity index 100% rename from filter_parser/fuzz/corpus/parse/test_9 rename to filter-parser/fuzz/corpus/parse/test_9 diff --git a/filter_parser/fuzz/fuzz_targets/parse.rs b/filter-parser/fuzz/fuzz_targets/parse.rs similarity index 100% rename from filter_parser/fuzz/fuzz_targets/parse.rs rename to filter-parser/fuzz/fuzz_targets/parse.rs diff --git a/filter_parser/src/condition.rs b/filter-parser/src/condition.rs similarity index 100% rename from filter_parser/src/condition.rs rename to filter-parser/src/condition.rs diff --git a/filter_parser/src/error.rs b/filter-parser/src/error.rs similarity index 100% rename from filter_parser/src/error.rs rename to filter-parser/src/error.rs diff --git a/filter_parser/src/lib.rs b/filter-parser/src/lib.rs similarity index 100% rename from filter_parser/src/lib.rs rename to filter-parser/src/lib.rs diff --git a/filter_parser/src/main.rs b/filter-parser/src/main.rs similarity index 100% rename from filter_parser/src/main.rs rename to filter-parser/src/main.rs diff --git a/filter_parser/src/value.rs b/filter-parser/src/value.rs similarity index 100% rename from filter_parser/src/value.rs rename to filter-parser/src/value.rs diff --git a/filter_parser/fuzz/.gitignore b/filter_parser/fuzz/.gitignore deleted file mode 100644 index a0925114d..000000000 --- 
a/filter_parser/fuzz/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -target -corpus -artifacts diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 36e63916c..90bd1f926 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -38,7 +38,7 @@ smallvec = "1.6.1" tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } -filter_parser = { path = "../filter_parser" } +filter-parser = { path = "../filter-parser" } # documents words self-join itertools = "0.10.0" From 99197387af040700b7360a48a25628d3ad7eaf28 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 16:25:53 +0100 Subject: [PATCH 1159/1889] fix the test with the new escaped format --- filter-parser/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 6276023a9..073057b76 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -567,8 +567,8 @@ pub mod tests { ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), ("position <= _geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), ("position <= _geoRadius(12, 13, 14)", "The `_geoRadius` filter is an operation and can't be used as a value."), - ("channel = 'ponce", "Expression `'ponce` is missing the following closing delimiter: `'`."), - ("channel = \"ponce", "Expression `\"ponce` is missing the following closing delimiter: `\"`."), + ("channel = 'ponce", "Expression `\\'ponce` is missing the following closing delimiter: `'`."), + ("channel = \"ponce", "Expression `\\\"ponce` is missing the following closing delimiter: `\"`."), ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule."), ]; From 73df873f44adeb2d483c975e9ecbb3791e40de3f Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 16:40:05 +0100 Subject: [PATCH 1160/1889] fix typos --- filter-parser/README.md | 3 +-- filter-parser/src/lib.rs | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/filter-parser/README.md b/filter-parser/README.md index 0999b4340..dfbc03d07 100644 --- a/filter-parser/README.md +++ b/filter-parser/README.md @@ -33,5 +33,4 @@ cargo fuzz run parse -- -max_len=500 ## What to do if you find a bug in the parser - Write a test at the end of the [`lib.rs`](./src/lib.rs) to ensure it never happens again. -- Add a file in [the corpus directory](./fuzz/corpus/parse/) with your filter to help the fuzzer finding new bug. Since this directory is going to be heavily polluted by the execution of the fuzzer it's in the gitignore and you'll need to force push your new test. - Since this directory is going to be heavily polluted by the execution of the fuzzer it's in the gitignore and you'll need to force add your new test. +- Add a file in [the corpus directory](./fuzz/corpus/parse/) with your filter to help the fuzzer find new bugs. Since this directory is going to be heavily polluted by the execution of the fuzzer it's in the gitignore and you'll need to force push your new test. 
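To make the README's first step concrete, a regression test appended to `lib.rs` could look like the following hypothetical sketch (the exact test helpers in the crate may differ):

```
// Hypothetical regression test for a previously problematic filter; it
// assumes the `FilterCondition::parse` entry point used by the existing
// tests in lib.rs.
#[test]
fn unclosed_quote_returns_an_error() {
    let result = FilterCondition::parse("channel = 'ponce");
    assert!(result.is_err());
}
```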
diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 073057b76..3e34e4d96 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -141,7 +141,7 @@ impl<'a> FilterCondition<'a> { } } -/// remove OPTIONAL whitespaces before AND after the the provided parser. +/// remove OPTIONAL whitespaces before AND after the provided parser. fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult<O>) -> impl FnMut(Span<'a>) -> IResult<O> { delimited(multispace0, inner, multispace0) } @@ -184,7 +184,7 @@ fn parse_geo_radius(input: Span) -> IResult<FilterCondition> { // we want to forbid space BEFORE the _geoRadius but not after let parsed = preceded( tuple((multispace0, tag("_geoRadius"))), - // if we were able to parse `_geoRadius` and can't parse the rest of the input we returns a failure + // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure cut(delimited(char('('), separated_list1(tag(","), ws(recognize_float)), char(')'))), )(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::Geo))); @@ -212,7 +212,7 @@ fn parse_geo_point(input: Span) -> IResult<FilterCondition> { cut(delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')'))), ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; - // if we succeeded we still returns a Failure because geoPoints are not allowed + // if we succeeded we still return a `Failure` because geoPoints are not allowed Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } From 519d6b2bf3081ace12a8cbf9ac369831b2df7fc6 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 16:47:54 +0100 Subject: [PATCH 1161/1889] remove the `!` syntax for the not --- filter-parser/src/lib.rs | 10 ++++------ filter-parser/src/value.rs | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 3e34e4d96..ed36b1bf4 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -5,7 +5,7 @@ //! expression = or //! or = and (~ "OR" ~ and) //! and = not (~ "AND" not)* -//! not = ("NOT" | "!") not | primary +//! not = ("NOT" ~ not) | primary //! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to //! condition = value ("==" | ">" ...) value //! to = value value TO value @@ -169,13 +169,11 @@ fn parse_and(input: Span) -> IResult<FilterCondition> { Ok((input, expr)) } -/// not = ("NOT" | "!") not | primary +/// not = ("NOT" ~ not) | primary /// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. -/// If we parse a `NOT` or `!` we MUST parse something behind. +/// If we parse a `NOT` we MUST parse something behind.
fn parse_not(input: Span) -> IResult<FilterCondition> { - alt((map(preceded(alt((tag("!"), tag("NOT"))), cut(parse_not)), |e| e.negate()), parse_primary))( - input, - ) + alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } /// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index b9d929ab0..936305837 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -70,7 +70,7 @@ fn is_value_component(c: char) -> bool { } fn is_syntax_component(c: char) -> bool { - c.is_whitespace() || ['(', ')', '=', '<', '>', '!'].contains(&c) + c.is_whitespace() || ['(', ')', '=', '<', '>'].contains(&c) } #[cfg(test)] From bff48681d2ef798e2df555bc0e91e7c4bcca1184 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 9 Nov 2021 17:05:36 +0100 Subject: [PATCH 1162/1889] Re-order the operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- filter-parser/src/condition.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index cff2f2fdd..abd549534 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -46,12 +46,12 @@ pub fn parse_condition(input: Span) -> IResult<FilterCondition> { let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?; let condition = match *op.fragment() { - "=" => FilterCondition::Condition { fid, op: Equal(value) }, - "!=" => FilterCondition::Condition { fid, op: NotEqual(value) }, - ">" => FilterCondition::Condition { fid, op: GreaterThan(value) }, - "<" => FilterCondition::Condition { fid, op: LowerThan(value) }, "<=" => FilterCondition::Condition { fid, op: LowerThanOrEqual(value) }, ">=" => FilterCondition::Condition { fid, op: GreaterThanOrEqual(value) }, + "!=" => FilterCondition::Condition { fid, op: NotEqual(value) }, + "<" => FilterCondition::Condition { fid, op: LowerThan(value) }, + ">" => FilterCondition::Condition { fid, op: GreaterThan(value) }, + "=" => FilterCondition::Condition { fid, op: Equal(value) }, _ => unreachable!(), }; From 7c3017734a15421e2b061e5ce1bafac60fa5e9ea Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 9 Nov 2021 17:08:04 +0100 Subject: [PATCH 1163/1889] re-ignore the ! symbol when generating a good error message --- filter-parser/src/value.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 936305837..b9d929ab0 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -70,7 +70,7 @@ fn is_value_component(c: char) -> bool { } fn is_syntax_component(c: char) -> bool { - c.is_whitespace() || ['(', ')', '=', '<', '>'].contains(&c) + c.is_whitespace() || ['(', ')', '=', '<', '>', '!'].contains(&c) } #[cfg(test)] From 721fc294bef5563c86b183875413cad18fe2c9ba Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Tue, 9 Nov 2021 20:19:49 +0100 Subject: [PATCH 1164/1889] improve document deletion returned meta returns both the remaining number of documents and the number of deleted documents.
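A caller-side sketch of the richer return type introduced below; apart from the `DocumentDeletionResult` fields taken from the patch, the surrounding setup (`wtxn`, `index`) is assumed, mirroring the tests later in this series:

```
// Hypothetical caller: `wtxn` is an open write transaction on `index`, and
// the trailing `1` is the update id still required before PATCH 1166.
let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap();
builder.delete_external_id("1_4");
let result = builder.execute().unwrap();
// execute() now reports both sides of the operation instead of a bare count.
println!(
    "deleted {} documents, {} remaining",
    result.deleted_documents, result.remaining_documents
);
```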
--- milli/src/update/delete_documents.rs | 34 +++++++++++++++++++++------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e1a658218..d9c3fba14 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -6,6 +6,7 @@ use fst::IntoStreamer; use heed::types::ByteSlice; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; use serde_json::Value; use super::ClearDocuments; @@ -25,6 +26,12 @@ pub struct DeleteDocuments<'t, 'u, 'i> { update_id: u64, } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DocumentDeletionResult { + pub deleted_documents: u64, + pub remaining_documents: u64, +} + impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, @@ -56,26 +63,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { Some(docid) } - pub fn execute(self) -> Result<u64> { + pub fn execute(self) -> Result<DocumentDeletionResult> { self.index.set_updated_at(self.wtxn, &Utc::now())?; // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; + let current_documents_ids_len = documents_ids.len(); // We can and must stop removing documents in a database that is empty. if documents_ids.is_empty() { - return Ok(0); + return Ok(DocumentDeletionResult { + deleted_documents: 0, + remaining_documents: current_documents_ids_len, + }); } // We remove the documents ids that we want to delete // from the documents in the database and write them back. - let current_documents_ids_len = documents_ids.len(); documents_ids -= &self.documents_ids; self.index.put_documents_ids(self.wtxn, &documents_ids)?; // We can execute a ClearDocuments operation when the number of documents // to delete is exactly the number of documents in the database. if current_documents_ids_len == self.documents_ids.len() { + let remaining_documents = + ClearDocuments::new(self.wtxn, self.index, self.update_id).execute()?; + return Ok(DocumentDeletionResult { + deleted_documents: current_documents_ids_len, + remaining_documents, + }); } let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -86,11 +101,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } })?; - // If we can't find the id of the primary key it means that the database - // is empty and it should be safe to return that we deleted 0 documents. + // Since we already checked if the DB was empty, if we can't find the primary key, then + // something is wrong, and we must return an error.
let id_field = match fields_ids_map.id(primary_key) { Some(field) => field, - None => return Ok(0), + None => return Err(UserError::MissingPrimaryKey.into()), }; let Index { @@ -439,7 +454,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { )?; } - Ok(self.documents_ids.len()) + Ok(DocumentDeletionResult { + deleted_documents: self.documents_ids.len(), + remaining_documents: documents_ids.len(), + }) } } From 09b4281cff5f72821461a04f92be4c4c52686fb8 Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 10 Nov 2021 14:08:36 +0100 Subject: [PATCH 1165/1889] improve document addition returned meta --- milli/src/update/index_documents/mod.rs | 31 ++++++++++++++++--------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 440546b10..cb3c1a75c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -35,9 +35,12 @@ static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct DocumentAdditionResult { - pub nb_documents: usize, + /// The number of documents that were indexed during the update + pub indexed_documents: u64, + /// The total number of documents in the index after the update + pub number_of_documents: u64, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] @@ -137,7 +140,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { { // Early return when there is no document to add if reader.is_empty() { - return Ok(DocumentAdditionResult { nb_documents: 0 }); + return Ok(DocumentAdditionResult { + indexed_documents: 0, + number_of_documents: self.index.number_of_documents(self.wtxn)?, + }); } self.index.set_updated_at(self.wtxn, &Utc::now())?; @@ -157,16 +163,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }; let output = transform.read_documents(reader, progress_callback)?; - let nb_documents = output.documents_count; + let indexed_documents = output.documents_count as u64; info!("Update transformed in {:.02?}", before_transform.elapsed()); - self.execute_raw(output, progress_callback)?; - Ok(DocumentAdditionResult { nb_documents }) - } + let number_of_documents = self.execute_raw(output, progress_callback)?; + Ok(DocumentAdditionResult { indexed_documents, number_of_documents }) + } + /// Returns the total number of documents in the index after the update.
#[logging_timer::time("IndexDocuments::{}")] - pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result<()> + pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result where F: Fn(UpdateIndexingStep) + Sync, { @@ -294,7 +301,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { debug!("documents to delete {:?}", replaced_documents_ids); deletion_builder.delete_documents(&replaced_documents_ids); let deleted_documents_count = deletion_builder.execute()?; - debug!("{} documents actually deleted", deleted_documents_count); + debug!("{} documents actually deleted", deleted_documents_count.deleted_documents); } let index_documents_ids = self.index.documents_ids(self.wtxn)?; @@ -325,7 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { if is_merged_database { databases_seen += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: databases_seen, + databases_seen, total_databases: TOTAL_POSTING_DATABASE_COUNT, }); } @@ -343,7 +350,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; - self.execute_prefix_databases(progress_callback) + self.execute_prefix_databases(progress_callback)?; + + Ok(all_documents_ids.len()) } #[logging_timer::time("IndexDocuments::{}")] From 6eb47ab792391667bfb9927db979469116a5f48a Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 3 Nov 2021 13:12:01 +0100 Subject: [PATCH 1166/1889] remove update_id in UpdateBuilder --- benchmarks/benches/indexing.rs | 48 ++++---- benchmarks/benches/utils.rs | 8 +- cli/src/main.rs | 8 +- http-ui/src/main.rs | 6 +- milli/src/index.rs | 12 +- milli/src/search/distinct/mod.rs | 8 +- milli/src/search/facet/filter.rs | 8 +- milli/src/update/clear_documents.rs | 13 +- milli/src/update/delete_documents.rs | 36 +++--- milli/src/update/facets.rs | 8 +- milli/src/update/index_documents/mod.rs | 114 +++++++++--------- milli/src/update/settings.rs | 152 +++++++++++------------- milli/src/update/update_builder.rs | 14 +-- milli/src/update/words_prefixes_fst.rs | 10 +- milli/tests/search/distinct.rs | 4 +- milli/tests/search/mod.rs | 8 +- milli/tests/search/query_criteria.rs | 16 +-- 17 files changed, 217 insertions(+), 256 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index e4657d5b6..93a57091a 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -39,7 +39,7 @@ fn indexing_songs_default(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); @@ -61,17 +61,17 @@ fn indexing_songs_default(c: &mut Criterion) { .map(|s| s.to_string()) .collect(); builder.set_filterable_fields(faceted_fields); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); index }, move |index| { - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let builder = update_builder.index_documents(&mut wtxn, &index); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); 
index.prepare_for_closing().wait(); @@ -88,7 +88,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); @@ -107,17 +107,17 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let faceted_fields = ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect(); builder.set_filterable_fields(faceted_fields); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); index }, move |index| { - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let builder = update_builder.index_documents(&mut wtxn, &index); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -134,7 +134,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); @@ -149,17 +149,17 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); builder.set_searchable_fields(searchable_fields); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); index }, move |index| { - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let builder = update_builder.index_documents(&mut wtxn, &index); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -176,7 +176,7 @@ fn indexing_wiki(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); @@ -190,18 +190,18 @@ fn indexing_wiki(c: &mut Criterion) { // there is NO faceted fields at all - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); index }, move |index| { - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.index_documents(&mut wtxn, &index); builder.enable_autogenerate_docids(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -218,7 +218,7 @@ fn indexing_movies_default(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); @@ -237,17 +237,17 @@ fn 
indexing_movies_default(c: &mut Criterion) { ["released_date", "genres"].iter().map(|s| s.to_string()).collect(); builder.set_filterable_fields(faceted_fields); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); index }, move |index| { - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let builder = update_builder.index_documents(&mut wtxn, &index); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -264,7 +264,7 @@ fn indexing_geo(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); @@ -288,17 +288,17 @@ fn indexing_geo(c: &mut Criterion) { ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); builder.set_sortable_fields(sortable_fields); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); index }, move |index| { - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let builder = update_builder.index_documents(&mut wtxn, &index); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 655275967..1b1d9be8c 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -65,7 +65,7 @@ pub fn base_setup(conf: &Conf) -> Index { options.max_readers(10); let index = Index::new(options, conf.database_name).unwrap(); - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); @@ -84,10 +84,10 @@ pub fn base_setup(conf: &Conf) -> Index { (conf.configure)(&mut builder); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); - let update_builder = UpdateBuilder::new(0); + let update_builder = UpdateBuilder::new(); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.index_documents(&mut wtxn, &index); if let None = conf.primary_key { @@ -96,7 +96,7 @@ pub fn base_setup(conf: &Conf) -> Index { let documents = documents_from(conf.dataset, conf.dataset_format); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); index diff --git a/cli/src/main.rs b/cli/src/main.rs index 5e11dc3fb..44c197de6 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -122,7 +122,7 @@ impl DocumentAddition { println!("Adding {} documents to the index.", reader.len()); let mut txn = index.env.write_txn()?; - let mut addition = milli::update::IndexDocuments::new(&mut txn, &index, 0); + let mut addition = milli::update::IndexDocuments::new(&mut txn, &index); if self.update_documents { addition.index_documents_method(milli::update::IndexDocumentsMethod::UpdateDocuments); @@ -146,7 +146,7 @@ impl 
DocumentAddition { progesses.join().unwrap(); }); - let result = addition.execute(reader, |step, _| indexing_callback(step, &bars))?; + let result = addition.execute(reader, |step| indexing_callback(step, &bars))?; txn.commit()?; @@ -292,7 +292,7 @@ impl SettingsUpdate { fn perform(&self, index: milli::Index) -> Result<()> { let mut txn = index.env.write_txn()?; - let mut update = milli::update::Settings::new(&mut txn, &index, 0); + let mut update = milli::update::Settings::new(&mut txn, &index); update.log_every_n(100); if let Some(ref filterable_attributes) = self.filterable_attributes { @@ -315,7 +315,7 @@ impl SettingsUpdate { progesses.join().unwrap(); }); - update.execute(|step, _| indexing_callback(step, &bars))?; + update.execute(|step| indexing_callback(step, &bars))?; txn.commit()?; Ok(()) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 8efd8ed69..4bd8815a5 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -343,7 +343,7 @@ async fn main() -> anyhow::Result<()> { // the type hint is necessary: https://github.com/rust-lang/rust/issues/32600 move |update_id, meta, content: &_| { // We prepare the update by using the update builder. - let mut update_builder = UpdateBuilder::new(update_id); + let mut update_builder = UpdateBuilder::new(); if let Some(max_nb_chunks) = indexer_opt_cloned.max_nb_chunks { update_builder.max_nb_chunks(max_nb_chunks); } @@ -393,7 +393,7 @@ async fn main() -> anyhow::Result<()> { let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; - let result = builder.execute(documents, |indexing_step, update_id| { + let result = builder.execute(documents, |indexing_step| { let (current, total) = match indexing_step { RemapDocumentAddition { documents_seen } => (documents_seen, None), ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { @@ -494,7 +494,7 @@ async fn main() -> anyhow::Result<()> { Setting::NotSet => (), } - let result = builder.execute(|indexing_step, update_id| { + let result = builder.execute(|indexing_step| { let (current, total) = match indexing_step { RemapDocumentAddition { documents_seen } => (documents_seen, None), ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { diff --git a/milli/src/index.rs b/milli/src/index.rs index fe89fe734..2f51b8c6b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -908,8 +908,8 @@ pub(crate) mod tests { { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -927,13 +927,13 @@ pub(crate) mod tests { // we add all the documents a second time. 
we are supposed to get the same // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); + let builder = IndexDocuments::new(&mut wtxn, &index); let content = documents!([ { "id": 1, "name": "kevin" }, { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 } ]); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -955,8 +955,8 @@ pub(crate) mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 11f6379e3..3d36ed2a3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -84,19 +84,19 @@ mod test { let mut txn = index.write_txn().unwrap(); // set distinct and faceted attributes for the index. - let builder = UpdateBuilder::new(0); + let builder = UpdateBuilder::new(); let mut update = builder.settings(&mut txn, &index); update.set_distinct_field(distinct.to_string()); - update.execute(|_, _| ()).unwrap(); + update.execute(|_| ()).unwrap(); // add documents to the index - let builder = UpdateBuilder::new(1); + let builder = UpdateBuilder::new(); let mut addition = builder.index_documents(&mut txn, &index); addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); let reader = crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); - addition.execute(reader, |_, _| ()).unwrap(); + addition.execute(reader, |_| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index ec2c0b3eb..e994f36d9 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -512,10 +512,10 @@ mod tests { // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_searchable_fields(vec![S("title")]); builder.set_filterable_fields(hashset! { S("title") }); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -542,10 +542,10 @@ mod tests { // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index a820c2a49..5be3bc23d 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -6,16 +6,11 @@ use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - _update_id: u64, } impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> ClearDocuments<'t, 'u, 'i> { - ClearDocuments { wtxn, index, _update_id: update_id } + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> ClearDocuments<'t, 'u, 'i> { + ClearDocuments { wtxn, index } } pub fn execute(self) -> Result { @@ -97,10 +92,10 @@ mod tests { { "id": 1, "name": "kevina" }, { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } ]); - IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); + IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); // Clear all documents from the database. - let builder = ClearDocuments::new(&mut wtxn, &index, 1); + let builder = ClearDocuments::new(&mut wtxn, &index); assert_eq!(builder.execute().unwrap(), 3); wtxn.commit().unwrap(); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index d9c3fba14..2fd3e084e 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -23,7 +23,6 @@ pub struct DeleteDocuments<'t, 'u, 'i> { index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, documents_ids: RoaringBitmap, - update_id: u64, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -36,7 +35,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - update_id: u64, ) -> Result> { let external_documents_ids = index.external_documents_ids(wtxn)?.into_static(); @@ -45,7 +43,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index, external_documents_ids, documents_ids: RoaringBitmap::new(), - update_id, }) } @@ -85,8 +82,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We can execute a ClearDocuments operation when the number of documents // to delete is exactly the number of documents in the database. if current_documents_ids_len == self.documents_ids.len() { - let remaining_documents = - ClearDocuments::new(self.wtxn, self.index, self.update_id).execute()?; + let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; return Ok(DocumentDeletionResult { deleted_documents: current_documents_ids_len, remaining_documents, @@ -600,11 +596,11 @@ mod tests { { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_document(0); builder.delete_document(1); builder.delete_document(2); @@ -630,11 +626,11 @@ mod tests { { "mysuperid": 1, "name": "kevina" }, { "mysuperid": 2, "name": "benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); // Delete not all of the documents but some of them. - let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("0"); builder.delete_external_id("1"); builder.execute().unwrap(); @@ -650,10 +646,10 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_primary_key(S("docid")); builder.set_filterable_fields(hashset! { S("label") }); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); let content = documents!([ {"docid":"1_4","label":"sign"}, @@ -677,11 +673,11 @@ mod tests { {"docid":"1_68","label":"design"}, {"docid":"1_69","label":"geometry"} ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); // Delete not all of the documents but some of them. - let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("1_4"); builder.execute().unwrap(); @@ -700,11 +696,11 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_primary_key(S("id")); builder.set_filterable_fields(hashset!(S("_geo"))); builder.set_sortable_fields(hashset!(S("_geo"))); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); let content = documents!([ {"id":"1","city":"Lille", "_geo": { "lat": 50.629973371633746, "lng": 3.0569447399419570 } }, @@ -730,7 +726,7 @@ mod tests { ]); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); + IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); let ids_to_delete: Vec = external_ids_to_delete @@ -739,7 +735,7 @@ mod tests { .collect(); // Delete some documents. 
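// An illustrative aside (not part of the patches themselves): with the
// `update_id` argument gone, the deletion flow these tests exercise boils
// down to the shape below, where `execute` returns the
// `DocumentDeletionResult` shown in the hunk above:
//
//     let mut builder = DeleteDocuments::new(&mut wtxn, &index)?;
//     builder.delete_external_id("5");
//     let result = builder.execute()?;
//     let _ = (result.deleted_documents, result.remaining_documents);
//
// The deletion below follows exactly this shape.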
- let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); external_ids_to_delete.iter().for_each(|id| drop(builder.delete_external_id(id))); builder.execute().unwrap(); diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 9b7d6d42c..a2f17cba3 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -27,15 +27,10 @@ pub struct Facets<'t, 'u, 'i> { pub(crate) chunk_compression_level: Option, level_group_size: NonZeroUsize, min_level_size: NonZeroUsize, - _update_id: u64, } impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> Facets<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { Facets { wtxn, index, @@ -43,7 +38,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { chunk_compression_level: None, level_group_size: NonZeroUsize::new(4).unwrap(), min_level_size: NonZeroUsize::new(5).unwrap(), - _update_id: update_id, } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index cb3c1a75c..b0c0a5362 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -80,14 +80,12 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { words_positions_min_level_size: Option, update_method: IndexDocumentsMethod, autogenerate_docids: bool, - update_id: u64, } impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - update_id: u64, ) -> IndexDocuments<'t, 'u, 'i, 'a> { IndexDocuments { wtxn, @@ -107,7 +105,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { words_positions_min_level_size: None, update_method: IndexDocumentsMethod::ReplaceDocuments, autogenerate_docids: false, - update_id, max_positions_per_attributes: None, } } @@ -136,7 +133,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { ) -> Result where R: Read + Seek, - F: Fn(UpdateIndexingStep, u64) + Sync, + F: Fn(UpdateIndexingStep) + Sync, { // Early return when there is no document to add if reader.is_empty() { @@ -148,8 +145,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.set_updated_at(self.wtxn, &Utc::now())?; let before_transform = Instant::now(); - let update_id = self.update_id; - let progress_callback = |step| progress_callback(step, update_id); let transform = Transform { rtxn: &self.wtxn, index: self.index, @@ -162,7 +157,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { autogenerate_docids: self.autogenerate_docids, }; - let output = transform.read_documents(reader, progress_callback)?; + let output = transform.read_documents(reader, &progress_callback)?; let indexed_documents = output.documents_count as u64; info!("Update transformed in {:.02?}", before_transform.elapsed()); @@ -295,7 +290,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_level: self.chunk_compression_level, thread_pool: self.thread_pool, max_positions_per_attributes: self.max_positions_per_attributes, - update_id: self.update_id, }; let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; debug!("documents to delete {:?}", replaced_documents_ids); @@ -364,7 +358,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut databases_seen = MERGED_DATABASE_COUNT; // Run the facets update operation. 
- let mut builder = Facets::new(self.wtxn, self.index, self.update_id); + let mut builder = Facets::new(self.wtxn, self.index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; if let Some(value) = self.facet_level_group_size { @@ -382,7 +376,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { }); // Run the words prefixes update operation. - let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); if let Some(value) = self.words_prefix_threshold { builder.threshold(value); } @@ -475,8 +469,8 @@ mod tests { { "id": 2, "name": "kevina" }, { "id": 3, "name": "benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -488,8 +482,8 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "name": "updated kevin" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -505,8 +499,8 @@ mod tests { { "id": 2, "name": "updated kevina" }, { "id": 3, "name": "updated benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 2); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -531,9 +525,9 @@ mod tests { { "id": 1, "name": "kevina" }, { "id": 1, "name": "benoit" } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is only 1 document now. @@ -557,9 +551,9 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "age": 25 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 1 document. @@ -596,8 +590,8 @@ mod tests { { "name": "kevina" }, { "name": "benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - assert!(builder.execute(content, |_, _| ()).is_err()); + let builder = IndexDocuments::new(&mut wtxn, &index); + assert!(builder.execute(content, |_| ()).is_err()); wtxn.commit().unwrap(); // Check that there is no document. 
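// A minimal sketch (an assumption about caller code, not taken from the
// patch): since `execute` now hands the progress callback a single
// `UpdateIndexingStep`, a caller that still wants to correlate progress with
// its own update id captures that id in the closure instead of receiving it
// from milli. Names mirror the surrounding tests.
let update_id: u64 = 42; // tracked by the caller, no longer threaded through milli
let content = documents!([ { "id": 1, "name": "kevin" } ]);
let builder = IndexDocuments::new(&mut wtxn, &index);
builder
    .execute(content, move |step| log::debug!("update {}: {:?}", update_id, step))
    .unwrap();
// The `|_| ()` closures used throughout these tests are the degenerate form
// of this callback.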
@@ -621,9 +615,9 @@ mod tests { { "name": "kevina" }, { "name": "benoit" } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -639,8 +633,8 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -676,8 +670,8 @@ mod tests { { "id": 2, "name": "kevina" }, { "id": 3, "name": "benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -689,9 +683,9 @@ mod tests { // Second we send 1 document without specifying the id. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "new kevin" } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 4 documents now. @@ -711,8 +705,8 @@ mod tests { // First we send 0 documents and only headers. let mut wtxn = index.write_txn().unwrap(); let content = documents!([]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is no documents. @@ -733,16 +727,16 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - assert!(builder.execute(content, |_, _| ()).is_err()); + let builder = IndexDocuments::new(&mut wtxn, &index); + assert!(builder.execute(content, |_| ()).is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. let content = documents!([ { "id": 32, "name": "kevin" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 1 document now. @@ -766,8 +760,8 @@ mod tests { { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 1 documents now. 
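// For context on the "brume bleue" rejection in the hunk above: external
// document ids must be non-empty strings of ASCII alphanumerics, hyphens,
// and underscores. A sketch of that rule (the helper name is illustrative,
// not milli's actual API):
fn is_valid_external_id(id: &str) -> bool {
    !id.is_empty() && id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
}
// "brume bleue" fails because of the space, while the numeric id 32 passes.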
@@ -805,13 +799,13 @@ mod tests { { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); let documents = documents!([ { @@ -821,7 +815,7 @@ mod tests { } ]); - builder.execute(documents, |_, _| ()).unwrap(); + builder.execute(documents, |_| ()).unwrap(); wtxn.commit().unwrap(); } @@ -839,12 +833,12 @@ mod tests { { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); + IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); // Delete not all of the documents but some of them. - let mut builder = DeleteDocuments::new(&mut wtxn, &index, 1).unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("30"); builder.execute().unwrap(); @@ -854,7 +848,7 @@ mod tests { let content = documents!([ { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); + IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); @@ -862,7 +856,7 @@ mod tests { let content = documents!([ { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap(); + IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); } @@ -892,8 +886,8 @@ mod tests { cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); } @@ -922,8 +916,8 @@ mod tests { cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); @@ -975,8 +969,8 @@ mod tests { }, ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); } @@ -996,8 +990,8 @@ mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - 
builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 4 document now. @@ -1008,8 +1002,8 @@ mod tests { let content = documents!([]); let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 4 document now. @@ -1025,8 +1019,8 @@ mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index, 2); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that there is 4 document now. @@ -1048,8 +1042,8 @@ mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 0326b5144..9c270ed71 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -85,7 +85,6 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, pub(crate) max_positions_per_attributes: Option, - update_id: u64, searchable_fields: Setting>, displayed_fields: Setting>, @@ -99,11 +98,7 @@ pub struct Settings<'a, 't, 'u, 'i> { } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - update_id: u64, - ) -> Settings<'a, 't, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Settings<'a, 't, 'u, 'i> { Settings { wtxn, index, @@ -123,7 +118,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { distinct_field: Setting::NotSet, synonyms: Setting::NotSet, primary_key: Setting::NotSet, - update_id, max_positions_per_attributes: None, } } @@ -207,11 +201,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where - F: Fn(UpdateIndexingStep, u64) + Sync, + F: Fn(UpdateIndexingStep) + Sync, { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let update_id = self.update_id; - let cb = |step| cb(step, update_id); // if the settings are set before any document update, we don't need to do anything, and // will set the primary key during the first document addition. if self.index.number_of_documents(&self.wtxn)? == 0 { @@ -242,11 +234,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { )?; // We clear the full database (words-fst, documents ids and documents content). - ClearDocuments::new(self.wtxn, self.index, self.update_id).execute()?; + ClearDocuments::new(self.wtxn, self.index).execute()?; // We index the generated `TransformOutput` which must contain // all the documents with fields in the newly defined searchable order. 
- let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index, self.update_id); + let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index); indexing_builder.log_every_n = self.log_every_n; indexing_builder.max_nb_chunks = self.max_nb_chunks; indexing_builder.max_memory = self.max_memory; @@ -484,7 +476,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { pub fn execute(mut self, progress_callback: F) -> Result<()> where - F: Fn(UpdateIndexingStep, u64) + Sync, + F: Fn(UpdateIndexingStep) + Sync, { self.index.set_updated_at(self.wtxn, &Utc::now())?; @@ -543,15 +535,15 @@ mod tests { { "id": 2, "name": "kevina", "age": 21}, { "id": 3, "name": "benoit", "age": 34 } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 0); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 1); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_searchable_fields(vec!["name".into()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the searchable field is correctly set to "name" only. @@ -571,9 +563,9 @@ mod tests { // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 2); + let mut builder = Settings::new(&mut wtxn, &index); builder.reset_searchable_fields(); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the searchable field have been reset and documents are found now. @@ -600,18 +592,18 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // In the same transaction we change the displayed fields to be only the "age". // We also change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 1); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_displayed_fields(vec!["age".into()]); builder.set_searchable_fields(vec!["name".into()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -622,9 +614,9 @@ mod tests { // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 2); + let mut builder = Settings::new(&mut wtxn, &index); builder.reset_searchable_fields(); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields always contains only the "age" field. 
@@ -647,9 +639,9 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -672,14 +664,14 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); // In the same transaction we change the displayed fields to be only the age. - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_displayed_fields(vec!["age".into()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to only the "age" field. @@ -690,9 +682,9 @@ mod tests { // We reset the fields ids to become `None`, the default value. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.reset_displayed_fields(); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -710,9 +702,9 @@ mod tests { // Set the filterable fields to be the age. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_filterable_fields(hashset! { S("age") }); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); // Then index some documents. let content = documents!([ @@ -720,9 +712,9 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set. @@ -757,9 +749,9 @@ mod tests { { "name": "benoit", "age": 35 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -782,11 +774,11 @@ mod tests { // Set the filterable fields to be the age. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); // Don't display the generated `id` field. builder.set_displayed_fields(vec![S("name")]); builder.set_criteria(vec![S("age:asc")]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); // Then index some documents. 
let content = documents!([ @@ -794,9 +786,9 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Run an empty query just to ensure that the search results are ordered. @@ -824,11 +816,11 @@ mod tests { // Set the filterable fields to be the age. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); // Don't display the generated `id` field. builder.set_displayed_fields(vec![S("name"), S("age")]); builder.set_distinct_field(S("age")); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); // Then index some documents. let content = documents!([ @@ -840,9 +832,9 @@ mod tests { { "name": "bernie", "age": 34 }, { "name": "ben", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Run an empty query just to ensure that the search results are ordered. @@ -867,9 +859,9 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // Ensure there is no stop_words by default @@ -892,15 +884,15 @@ mod tests { { "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, { "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); // In the same transaction we provide some stop_words - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); let set = btreeset! 
{ "i".to_string(), "the".to_string(), "are".to_string() }; builder.set_stop_words(set.clone()); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Ensure stop_words are effectively stored @@ -928,9 +920,9 @@ mod tests { // now we'll reset the stop_words and ensure it's None let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.reset_stop_words(); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -966,18 +958,18 @@ mod tests { { "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, { "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.enable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); // In the same transaction provide some synonyms - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_synonyms(hashmap! { "blini".to_string() => vec!["crepes".to_string()], "super like".to_string() => vec!["love".to_string()], "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()] }); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Ensure synonyms are effectively stored @@ -995,9 +987,9 @@ mod tests { // Reset the synonyms let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.reset_synonyms(); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // Ensure synonyms are reset @@ -1023,11 +1015,11 @@ mod tests { // Set all the settings except searchable let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_displayed_fields(vec!["hello".to_string()]); builder.set_filterable_fields(hashset! { S("age"), S("toto") }); builder.set_criteria(vec!["toto:asc".to_string()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); // check the output @@ -1040,9 +1032,9 @@ mod tests { // We set toto and age as searchable to force reordering of the fields let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 1); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1060,11 +1052,11 @@ mod tests { // Set all the settings except searchable let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_displayed_fields(vec!["hello".to_string()]); // It is only Asc(toto), there is a facet database but it is denied to filter with toto. 
builder.set_criteria(vec!["toto:asc".to_string()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1081,10 +1073,10 @@ mod tests { // Set the primary key settings let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_primary_key(S("mykey")); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); // Then index some documents with the "mykey" primary key. @@ -1097,29 +1089,29 @@ mod tests { { "mykey": 6, "name": "bernie", "age": 34 }, { "mykey": 7, "name": "ben", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + let mut builder = IndexDocuments::new(&mut wtxn, &index); builder.disable_autogenerate_docids(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // We now try to reset the primary key let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.reset_primary_key(); - let err = builder.execute(|_, _| ()).unwrap_err(); + let err = builder.execute(|_| ()).unwrap_err(); assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); wtxn.abort().unwrap(); // But if we clear the database... let mut wtxn = index.write_txn().unwrap(); - let builder = ClearDocuments::new(&mut wtxn, &index, 0); + let builder = ClearDocuments::new(&mut wtxn, &index); builder.execute().unwrap(); // ...we can change the primary key - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_primary_key(S("myid")); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); } @@ -1132,9 +1124,9 @@ mod tests { // Set the genres setting let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_filterable_fields(hashset! 
{ S("genres") }); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); let content = documents!([ { @@ -1155,8 +1147,8 @@ mod tests { "release_date": 819676800 } ]); - let builder = IndexDocuments::new(&mut wtxn, &index, 1); - builder.execute(content, |_, _| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); // We now try to reset the primary key diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 20ec28e06..6e892a356 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -13,11 +13,10 @@ pub struct UpdateBuilder<'a> { pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, pub(crate) max_positions_per_attributes: Option, - pub(crate) update_id: u64, } impl<'a> UpdateBuilder<'a> { - pub fn new(update_id: u64) -> UpdateBuilder<'a> { + pub fn new() -> UpdateBuilder<'a> { UpdateBuilder { log_every_n: None, max_nb_chunks: None, @@ -27,7 +26,6 @@ impl<'a> UpdateBuilder<'a> { chunk_compression_level: None, thread_pool: None, max_positions_per_attributes: None, - update_id, } } @@ -68,7 +66,7 @@ impl<'a> UpdateBuilder<'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, ) -> ClearDocuments<'t, 'u, 'i> { - ClearDocuments::new(wtxn, index, self.update_id) + ClearDocuments::new(wtxn, index) } pub fn delete_documents<'t, 'u, 'i>( @@ -76,7 +74,7 @@ impl<'a> UpdateBuilder<'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, ) -> Result> { - DeleteDocuments::new(wtxn, index, self.update_id) + DeleteDocuments::new(wtxn, index) } pub fn index_documents<'t, 'u, 'i>( @@ -84,7 +82,7 @@ impl<'a> UpdateBuilder<'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, ) -> IndexDocuments<'t, 'u, 'i, 'a> { - let mut builder = IndexDocuments::new(wtxn, index, self.update_id); + let mut builder = IndexDocuments::new(wtxn, index); builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; @@ -103,7 +101,7 @@ impl<'a> UpdateBuilder<'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, ) -> Settings<'a, 't, 'u, 'i> { - let mut builder = Settings::new(wtxn, index, self.update_id); + let mut builder = Settings::new(wtxn, index); builder.log_every_n = self.log_every_n; builder.max_nb_chunks = self.max_nb_chunks; @@ -122,7 +120,7 @@ impl<'a> UpdateBuilder<'a> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, ) -> Facets<'t, 'u, 'i> { - let mut builder = Facets::new(wtxn, index, self.update_id); + let mut builder = Facets::new(wtxn, index); builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index eaaacc26f..49406deb5 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -10,22 +10,14 @@ pub struct WordsPrefixesFst<'t, 'u, 'i> { index: &'i Index, threshold: u32, max_prefix_length: usize, - _update_id: u64, } impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - update_id: u64, ) -> WordsPrefixesFst<'t, 'u, 'i> { - WordsPrefixesFst { - wtxn, - index, - threshold: 100, - max_prefix_length: 4, - _update_id: update_id, - } + WordsPrefixesFst { wtxn, index, threshold: 100, max_prefix_length: 4 } } /// Set the number of words required to make a prefix be part of the words prefixes diff --git 
a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index f044756eb..da7251389 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -16,9 +16,9 @@ macro_rules! test_distinct { // update distinct attribute let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_distinct_field(S(stringify!($distinct))); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index e8fb3fdfa..d1467fd72 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -32,7 +32,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); let criteria = criteria.iter().map(|c| c.to_string()).collect(); builder.set_criteria(criteria); @@ -51,10 +51,10 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("america") => vec![S("the united states")], }); builder.set_searchable_fields(vec![S("title"), S("description")]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); // index documents - let mut builder = UpdateBuilder::new(0); + let mut builder = UpdateBuilder::new(); builder.max_memory(10 * 1024 * 1024); // 10MiB let mut builder = builder.index_documents(&mut wtxn, &index); builder.enable_autogenerate_docids(); @@ -73,7 +73,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { // index documents let content = DocumentBatchReader::from_reader(cursor).unwrap(); - builder.execute(content, |_, _| ()).unwrap(); + builder.execute(content, |_| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index e5dde049c..8968eff90 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -341,9 +341,9 @@ fn criteria_mixup() { eprintln!("Testing with criteria order: {:?}", &criteria); //update criteria let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_criteria(criteria.iter().map(ToString::to_string).collect()); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); let mut rtxn = index.read_txn().unwrap(); @@ -376,16 +376,16 @@ fn criteria_ascdesc() { let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_sortable_fields(hashset! 
{ S("name"), S("age"), }); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); // index documents - let mut builder = UpdateBuilder::new(0); + let mut builder = UpdateBuilder::new(); builder.max_memory(10 * 1024 * 1024); // 10MiB let mut builder = builder.index_documents(&mut wtxn, &index); builder.enable_autogenerate_docids(); @@ -419,7 +419,7 @@ fn criteria_ascdesc() { let reader = DocumentBatchReader::from_reader(cursor).unwrap(); - builder.execute(reader, |_, _| ()).unwrap(); + builder.execute(reader, |_| ()).unwrap(); wtxn.commit().unwrap(); @@ -430,9 +430,9 @@ fn criteria_ascdesc() { eprintln!("Testing with criterion: {:?}", &criterion); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); + let mut builder = Settings::new(&mut wtxn, &index); builder.set_criteria(vec![criterion.to_string()]); - builder.execute(|_, _| ()).unwrap(); + builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); let mut rtxn = index.read_txn().unwrap(); From 64ef5869d72c6b962c20b55740e2260d042520d9 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Nov 2021 16:56:05 +0100 Subject: [PATCH 1167/1889] Update tokenizer v0.2.6 --- http-ui/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 04e1c708a..d807c4923 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" } memmap2 = "0.5.0" milli = { path = "../milli" } once_cell = "1.5.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 90bd1f926..790b52647 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.5" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" } memmap2 = "0.5.0" obkv = "0.2.0" once_cell = "1.5.2" From 35f949963821a1c6ae43cb1a660b087d5ea36c0c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 18 Nov 2021 16:57:12 +0100 Subject: [PATCH 1168/1889] Export tokenizer from milli --- milli/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 044d74ec1..9e7bb8966 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -20,6 +20,7 @@ use std::hash::BuildHasherDefault; pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; +pub use meilisearch_tokenizer as tokenizer; use serde_json::{Map, Value}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; From 6e977dd8e84cdc1681d14c76a0dac0719de7394b Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Mon, 22 Nov 2021 15:19:09 +0100 Subject: [PATCH 1169/1889] change visibility of DocumentDeletionResult --- milli/src/update/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 
3b6edb0a3..3dd8abd28 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,6 +1,6 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; -pub use self::delete_documents::DeleteDocuments; +pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facets::Facets; pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod}; pub use self::settings::{Setting, Settings}; From 8970246bc4ddab7fc82a08e6043fe2859f5af243 Mon Sep 17 00:00:00 2001 From: many Date: Mon, 22 Nov 2021 18:16:54 +0100 Subject: [PATCH 1170/1889] Sort positions before iterating over them during word pair proximity extraction --- .../extract/extract_word_pair_proximity_docids.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 982799a65..f3667694a 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -53,7 +53,9 @@ pub fn extract_word_pair_proximity_docids( } let word = word.to_string(); - let mut iter = read_u32_ne_bytes(value).collect::>().into_iter(); + let mut positions: Vec<_> = read_u32_ne_bytes(value).collect(); + positions.sort_unstable(); + let mut iter = positions.into_iter(); if let Some(position) = iter.next() { document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); } From 1b3923b5cefb69d7357b7626a7c72fda93d4c1db Mon Sep 17 00:00:00 2001 From: many Date: Mon, 29 Nov 2021 12:17:59 +0100 Subject: [PATCH 1171/1889] Update all packages to 0.21.0 --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index eb03842ca..e50bf8e55 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.20.2" +version = "0.21.0" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 5b33d2a4f..e2c3d44f9 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.20.2" +version = "0.21.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index d807c4923..593dba3e5 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.20.2" +version = "0.21.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 645bc4cdd..f00fa0d24 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.20.2" +version = "0.21.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 790b52647..a3d8cf627 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.20.2" +version = "0.21.0" authors = ["Kerollmops "] edition = "2018" From 57502fcf6a8f7b3c846502e20ad9785b5ab12a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 6 Dec 2021 17:35:20 +0100 Subject: [PATCH 1172/1889] 
Introduce the depth method on FilterCondition --- filter-parser/src/lib.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index ed36b1bf4..8b72c69ee 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -113,6 +113,17 @@ pub enum FilterCondition<'a> { } impl<'a> FilterCondition<'a> { + pub fn depth(&self) -> usize { + match self { + FilterCondition::Condition { .. } => 1, + FilterCondition::Or(left, right) => 1 + left.depth().max(right.depth()), + FilterCondition::And(left, right) => 1 + left.depth().max(right.depth()), + FilterCondition::GeoLowerThan { .. } => 1, + FilterCondition::GeoGreaterThan { .. } => 1, + FilterCondition::Empty => 0, + } + } + pub fn negate(self) -> FilterCondition<'a> { use FilterCondition::*; @@ -584,4 +595,10 @@ pub mod tests { assert!(filter.starts_with(expected), "Filter `{:?}` was supposed to return the following error:\n{}\n, but instead returned\n{}\n.", input, expected, filter); } } + + #[test] + fn depth() { + let filter = FilterCondition::parse("account_ids=1 OR account_ids=2 OR account_ids=3 OR account_ids=4 OR account_ids=5 OR account_ids=6").unwrap(); + assert_eq!(filter.depth(), 6); + } } From 49c2db948561529cdd2ffb68d6bae5ec01e29f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 7 Dec 2021 15:16:29 +0100 Subject: [PATCH 1173/1889] Change the depth function to return the token depth --- filter-parser/src/lib.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 8b72c69ee..3ef31c3c8 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -113,14 +113,19 @@ pub enum FilterCondition<'a> { } impl<'a> FilterCondition<'a> { - pub fn depth(&self) -> usize { + /// Returns the first token found at the specified depth, `None` if no token at this depth. + pub fn token_at_depth(&self, depth: usize) -> Option<&Token> { match self { - FilterCondition::Condition { .. } => 1, - FilterCondition::Or(left, right) => 1 + left.depth().max(right.depth()), - FilterCondition::And(left, right) => 1 + left.depth().max(right.depth()), - FilterCondition::GeoLowerThan { .. } => 1, - FilterCondition::GeoGreaterThan { .. } => 1, - FilterCondition::Empty => 0, + FilterCondition::Condition { fid, .. } if depth == 0 => Some(fid), + FilterCondition::Or(left, right) => { + left.token_at_depth(depth - 1).or_else(|| right.token_at_depth(depth - 1)) + } + FilterCondition::And(left, right) => { + left.token_at_depth(depth - 1).or_else(|| right.token_at_depth(depth - 1)) + } + FilterCondition::GeoLowerThan { point: [point, _], .. } if depth == 0 => Some(point), + FilterCondition::GeoGreaterThan { point: [point, _], .. 
} if depth == 0 => Some(point), + _ => None, } } @@ -599,6 +604,6 @@ pub mod tests { #[test] fn depth() { let filter = FilterCondition::parse("account_ids=1 OR account_ids=2 OR account_ids=3 OR account_ids=4 OR account_ids=5 OR account_ids=6").unwrap(); - assert_eq!(filter.depth(), 6); + assert!(filter.token_at_depth(5).is_some()); } } From 90f49eab6d671c45bf7107868dc41c872aad4a49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 7 Dec 2021 16:32:48 +0100 Subject: [PATCH 1174/1889] Check the filter max depth limit and reject the invalid ones --- milli/src/search/facet/filter.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index e994f36d9..5a88c14dc 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -15,6 +15,9 @@ use crate::heed_codec::facet::{ }; use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; +/// The maximum number of filters the filter AST can process. +const MAX_FILTER_DEPTH: usize = 1000; + #[derive(Debug, Clone, PartialEq, Eq)] pub struct Filter<'a> { condition: FilterCondition<'a>, @@ -27,6 +30,7 @@ enum FilterError<'a> { BadGeoLat(f64), BadGeoLng(f64), Reserved(&'a str), + TooDeep, InternalError, } impl<'a> std::error::Error for FilterError<'a> {} @@ -40,6 +44,10 @@ impl<'a> Display for FilterError<'a> { attribute, filterable, ), + Self::TooDeep => write!(f, + "Too many filter conditions, can't process more than {} filters.", + MAX_FILTER_DEPTH + ), Self::Reserved(keyword) => write!( f, "`{}` is a reserved keyword and thus can't be used as a filter expression.", @@ -108,6 +116,10 @@ impl<'a> Filter<'a> { } } + if let Some(token) = ands.as_ref().and_then(|fc| fc.token_at_depth(MAX_FILTER_DEPTH)) { + return Err(token.as_external_error(FilterError::TooDeep).into()); + } + Ok(ands.map(|ands| Self { condition: ands })) } From 32bd9f091fd74b50e4f3623d42a471d2ebb23241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 7 Dec 2021 17:20:11 +0100 Subject: [PATCH 1175/1889] Detect the filters that are too deep and return an error --- filter-parser/src/lib.rs | 6 +++-- milli/src/search/facet/filter.rs | 40 ++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 3ef31c3c8..0e49e00e9 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -118,10 +118,12 @@ impl<'a> FilterCondition<'a> { match self { FilterCondition::Condition { fid, .. } if depth == 0 => Some(fid), FilterCondition::Or(left, right) => { - left.token_at_depth(depth - 1).or_else(|| right.token_at_depth(depth - 1)) + let depth = depth.saturating_sub(1); + right.token_at_depth(depth).or_else(|| left.token_at_depth(depth)) } FilterCondition::And(left, right) => { - left.token_at_depth(depth - 1).or_else(|| right.token_at_depth(depth - 1)) + let depth = depth.saturating_sub(1); + right.token_at_depth(depth).or_else(|| left.token_at_depth(depth)) } FilterCondition::GeoLowerThan { point: [point, _], .. } if depth == 0 => Some(point), FilterCondition::GeoGreaterThan { point: [point, _], .. 
} if depth == 0 => Some(point), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 5a88c14dc..2c78816fb 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -128,6 +128,11 @@ impl<'a> Filter<'a> { Ok(fc) => Ok(fc), Err(e) => Err(Error::UserError(UserError::InvalidFilter(e.to_string()))), }?; + + if let Some(token) = condition.token_at_depth(MAX_FILTER_DEPTH) { + return Err(token.as_external_error(FilterError::TooDeep).into()); + } + Ok(Self { condition }) } } @@ -431,6 +436,8 @@ impl<'a> From<FilterCondition<'a>> for Filter<'a> { #[cfg(test)] mod tests { + use std::fmt::Write; + use big_s::S; use either::Either; use heed::EnvOpenOptions; @@ -598,4 +605,37 @@ mod tests { "Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees." )); } + + #[test] + fn filter_depth() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the searchable and filterable fields to be `account_ids`. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_searchable_fields(vec![S("account_ids")]); + builder.set_filterable_fields(hashset! { S("account_ids") }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Generates a big (2 MiB) filter with too many ORs. + let typical_filter = "account_ids=14361 OR "; + let mut filter_string = String::with_capacity(typical_filter.len() * 14360); + for i in 1..=14361 { + let _ = write!(&mut filter_string, "account_ids={}", i); + if i != 14361 { + let _ = write!(&mut filter_string, " OR "); + } + } + + let error = Filter::from_str(&filter_string).unwrap_err(); + assert!( + error.to_string().starts_with("Too many filter conditions"), + "{}", + error.to_string() + ); + } } From ef59762d8ea933170f8545e52203e1b4ab157b53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 7 Dec 2021 17:36:45 +0100 Subject: [PATCH 1176/1889] Limit the max filter depth to 2000 --- milli/src/search/facet/filter.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 2c78816fb..9d9d16de5 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -16,7 +16,7 @@ use crate::heed_codec::facet::{ }; use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; /// The maximum number of filters the filter AST can process.
-const MAX_FILTER_DEPTH: usize = 1000; +const MAX_FILTER_DEPTH: usize = 2000; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Filter<'a> { From ef59762d8ea933170f8545e52203e1b4ab157b53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Dec 2021 11:13:12 +0100 Subject: [PATCH 1177/1889] Prefer returning None instead of the Empty Filter state --- filter-parser/src/lib.rs | 12 ++--- milli/src/search/facet/filter.rs | 74 +++++++++++++++------------- milli/src/update/delete_documents.rs | 2 +- milli/src/update/settings.rs | 2 +- 4 files changed, 47 insertions(+), 43 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 0e49e00e9..4c5e03c82 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -109,7 +109,6 @@ pub enum FilterCondition<'a> { And(Box, Box), GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, GeoGreaterThan { point: [Token<'a>; 2], radius: Token<'a> }, - Empty, } impl<'a> FilterCondition<'a> { @@ -144,18 +143,17 @@ impl<'a> FilterCondition<'a> { }, Or(a, b) => And(a.negate().into(), b.negate().into()), And(a, b) => Or(a.negate().into(), b.negate().into()), - Empty => Empty, GeoLowerThan { point, radius } => GeoGreaterThan { point, radius }, GeoGreaterThan { point, radius } => GeoLowerThan { point, radius }, } } - pub fn parse(input: &'a str) -> Result { + pub fn parse(input: &'a str) -> Result, Error> { if input.trim().is_empty() { - return Ok(Self::Empty); + return Ok(None); } let span = Span::new_extra(input, input); - parse_filter(span).finish().map(|(_rem, output)| output) + parse_filter(span).finish().map(|(_rem, output)| Some(output)) } } @@ -560,7 +558,7 @@ pub mod tests { result.unwrap_err() ); let filter = result.unwrap(); - assert_eq!(filter, expected, "Filter `{}` failed.", input); + assert_eq!(filter, Some(expected), "Filter `{}` failed.", input); } } @@ -605,7 +603,7 @@ pub mod tests { #[test] fn depth() { - let filter = FilterCondition::parse("account_ids=1 OR account_ids=2 OR account_ids=3 OR account_ids=4 OR account_ids=5 OR account_ids=6").unwrap(); + let filter = FilterCondition::parse("account_ids=1 OR account_ids=2 OR account_ids=3 OR account_ids=4 OR account_ids=5 OR account_ids=6").unwrap().unwrap(); assert!(filter.token_at_depth(5).is_some()); } } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 9d9d16de5..88815d884 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -86,13 +86,15 @@ impl<'a> Filter<'a> { Either::Left(array) => { let mut ors = None; for rule in array { - let condition = Self::from_str(rule.as_ref())?.condition; - ors = match ors.take() { - Some(ors) => { - Some(FilterCondition::Or(Box::new(ors), Box::new(condition))) - } - None => Some(condition), - }; + if let Some(filter) = Self::from_str(rule.as_ref())? { + let condition = filter.condition; + ors = match ors.take() { + Some(ors) => { + Some(FilterCondition::Or(Box::new(ors), Box::new(condition))) + } + None => Some(condition), + }; + } } if let Some(rule) = ors { @@ -105,13 +107,15 @@ impl<'a> Filter<'a> { } } Either::Right(rule) => { - let condition = Self::from_str(rule.as_ref())?.condition; - ands = match ands.take() { - Some(ands) => { - Some(FilterCondition::And(Box::new(ands), Box::new(condition))) - } - None => Some(condition), - }; + if let Some(filter) = Self::from_str(rule.as_ref())? 
{ + let condition = filter.condition; + ands = match ands.take() { + Some(ands) => { + Some(FilterCondition::And(Box::new(ands), Box::new(condition))) + } + None => Some(condition), + }; + } } } } @@ -123,9 +127,10 @@ impl<'a> Filter<'a> { Ok(ands.map(|ands| Self { condition: ands })) } - pub fn from_str(expression: &'a str) -> Result { + pub fn from_str(expression: &'a str) -> Result> { let condition = match FilterCondition::parse(expression) { - Ok(fc) => Ok(fc), + Ok(Some(fc)) => Ok(fc), + Ok(None) => return Ok(None), Err(e) => Err(Error::UserError(UserError::InvalidFilter(e.to_string()))), }?; @@ -133,7 +138,7 @@ impl<'a> Filter<'a> { return Err(token.as_external_error(FilterError::TooDeep).into()); } - Ok(Self { condition }) + Ok(Some(Self { condition })) } } @@ -377,7 +382,6 @@ impl<'a> Filter<'a> { let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; Ok(lhs & rhs) } - FilterCondition::Empty => Ok(RoaringBitmap::new()), FilterCondition::GeoLowerThan { point, radius } => { let filterable_fields = index.filterable_fields(rtxn)?; if filterable_fields.contains("_geo") { @@ -451,20 +455,20 @@ mod tests { fn from_array() { // Simple array with Left let condition = Filter::from_array(vec![Either::Left(["channel = mv"])]).unwrap().unwrap(); - let expected = Filter::from_str("channel = mv").unwrap(); + let expected = Filter::from_str("channel = mv").unwrap().unwrap(); assert_eq!(condition, expected); // Simple array with Right let condition = Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = mv")]) .unwrap() .unwrap(); - let expected = Filter::from_str("channel = mv").unwrap(); + let expected = Filter::from_str("channel = mv").unwrap().unwrap(); assert_eq!(condition, expected); // Array with Left and escaped quote let condition = Filter::from_array(vec![Either::Left(["channel = \"Mister Mv\""])]).unwrap().unwrap(); - let expected = Filter::from_str("channel = \"Mister Mv\"").unwrap(); + let expected = Filter::from_str("channel = \"Mister Mv\"").unwrap().unwrap(); assert_eq!(condition, expected); // Array with Right and escaped quote @@ -472,13 +476,13 @@ mod tests { Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = \"Mister Mv\"")]) .unwrap() .unwrap(); - let expected = Filter::from_str("channel = \"Mister Mv\"").unwrap(); + let expected = Filter::from_str("channel = \"Mister Mv\"").unwrap().unwrap(); assert_eq!(condition, expected); // Array with Left and escaped simple quote let condition = Filter::from_array(vec![Either::Left(["channel = 'Mister Mv'"])]).unwrap().unwrap(); - let expected = Filter::from_str("channel = 'Mister Mv'").unwrap(); + let expected = Filter::from_str("channel = 'Mister Mv'").unwrap().unwrap(); assert_eq!(condition, expected); // Array with Right and escaped simple quote @@ -486,13 +490,13 @@ mod tests { Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = 'Mister Mv'")]) .unwrap() .unwrap(); - let expected = Filter::from_str("channel = 'Mister Mv'").unwrap(); + let expected = Filter::from_str("channel = 'Mister Mv'").unwrap().unwrap(); assert_eq!(condition, expected); // Simple with parenthesis let condition = Filter::from_array(vec![Either::Left(["(channel = mv)"])]).unwrap().unwrap(); - let expected = Filter::from_str("(channel = mv)").unwrap(); + let expected = Filter::from_str("(channel = mv)").unwrap().unwrap(); assert_eq!(condition, expected); // Test that the facet condition is correctly generated. 
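A note on the new contract above: `Filter::from_str` now returns a `Result` wrapping an `Option`, so a blank expression becomes `Ok(None)` instead of a dedicated `Empty` AST variant, and every caller must decide explicitly what "no filter" means (hence the `.unwrap().unwrap()` chains in the tests around this change). A minimal sketch of that calling pattern, with a simplified stand-in for `Filter::from_str` (the names and the `String` error type are illustrative, not the real API):
```
// Sketch: an Option-returning parser instead of an `Empty` AST variant.
// `parse_filter` stands in for `Filter::from_str`; the error type is simplified.
fn parse_filter(expression: &str) -> Result<Option<String>, String> {
    if expression.trim().is_empty() {
        // No filter at all: not an error, and not an "empty" filter node.
        return Ok(None);
    }
    Ok(Some(expression.to_string()))
}

fn main() -> Result<(), String> {
    // The `?` handles parse errors, the `if let` handles absence,
    // which is exactly why the tests unwrap twice.
    if let Some(filter) = parse_filter("  ")? {
        println!("applying filter: {}", filter);
    } else {
        println!("nothing to filter on"); // this branch runs for blank input
    }
    Ok(())
}
```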
@@ -503,7 +507,9 @@ mod tests { .unwrap() .unwrap(); let expected = - Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)").unwrap(); + Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)") + .unwrap() + .unwrap(); println!("\nExpecting: {:#?}\nGot: {:#?}\n", expected, condition); assert_eq!(condition, expected); } @@ -516,13 +522,13 @@ mod tests { let index = Index::new(options, &path).unwrap(); let rtxn = index.read_txn().unwrap(); - let filter = Filter::from_str("_geoRadius(42, 150, 10)").unwrap(); + let filter = Filter::from_str("_geoRadius(42, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!(error.to_string().starts_with( "Attribute `_geo` is not filterable. Available filterable attributes are: ``." )); - let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap(); + let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!(error.to_string().starts_with( "Attribute `dog` is not filterable. Available filterable attributes are: ``." @@ -539,13 +545,13 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap(); + let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!(error.to_string().starts_with( "Attribute `_geo` is not filterable. Available filterable attributes are: `title`." )); - let filter = Filter::from_str("name = 12").unwrap(); + let filter = Filter::from_str("name = 12").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!(error.to_string().starts_with( "Attribute `name` is not filterable. Available filterable attributes are: `title`." @@ -570,7 +576,7 @@ mod tests { let rtxn = index.read_txn().unwrap(); // georadius have a bad latitude - let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap(); + let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!( error.to_string().starts_with( @@ -581,14 +587,14 @@ mod tests { ); // georadius have a bad latitude - let filter = Filter::from_str("_geoRadius(-90.0000001, 150, 10)").unwrap(); + let filter = Filter::from_str("_geoRadius(-90.0000001, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!(error.to_string().contains( "Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees." )); // georadius have a bad longitude - let filter = Filter::from_str("_geoRadius(-10, 250, 10)").unwrap(); + let filter = Filter::from_str("_geoRadius(-10, 250, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!( error.to_string().contains( @@ -599,7 +605,7 @@ mod tests { ); // georadius have a bad longitude - let filter = Filter::from_str("_geoRadius(-10, 180.000001, 10)").unwrap(); + let filter = Filter::from_str("_geoRadius(-10, 180.000001, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!(error.to_string().contains( "Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees." 
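Stepping back to the depth limit exercised by the `filter_depth` test: `token_at_depth` only needs to answer whether any node sits at the cut-off depth, which a short recursive walk over the binary AST can do without measuring the whole tree. A rough sketch of that idea on a simplified AST (the real implementation lives in `filter-parser` and returns the offending `Token`; this shape is an assumption for illustration):
```
// Sketch of a depth guard over a binary filter AST (simplified).
enum Condition {
    Leaf(&'static str),
    Or(Box<Condition>, Box<Condition>),
    And(Box<Condition>, Box<Condition>),
}

impl Condition {
    /// Returns Some(()) if any node sits `depth` levels below `self`.
    fn node_at_depth(&self, depth: usize) -> Option<()> {
        match self {
            _ if depth == 0 => Some(()),
            Condition::Or(a, b) | Condition::And(a, b) => {
                a.node_at_depth(depth - 1).or_else(|| b.node_at_depth(depth - 1))
            }
            Condition::Leaf(_) => None,
        }
    }
}

fn main() {
    use Condition::*;
    // A chain `a OR b OR c` nests as e.g. Or(Or(a, b), c): deepest node at depth 2.
    let ast = Or(Box::new(Or(Box::new(Leaf("a")), Box::new(Leaf("b")))), Box::new(Leaf("c")));
    assert!(ast.node_at_depth(2).is_some());
    assert!(ast.node_at_depth(3).is_none());
    // A filter chaining thousands of ORs therefore trips MAX_FILTER_DEPTH.
}
```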
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 2fd3e084e..ed87132bd 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -681,7 +681,7 @@ mod tests { builder.delete_external_id("1_4"); builder.execute().unwrap(); - let filter = Filter::from_str("label = sign").unwrap(); + let filter = Filter::from_str("label = sign").unwrap().unwrap(); let results = index.search(&wtxn).filter(filter).execute().unwrap(); assert!(results.documents_ids.is_empty()); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 9c270ed71..fff5eb0fa 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1060,7 +1060,7 @@ mod tests { wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); - let filter = Filter::from_str("toto = 32").unwrap(); + let filter = Filter::from_str("toto = 32").unwrap().unwrap(); let _ = filter.evaluate(&rtxn, &index).unwrap_err(); } From 65519bc04ba573f5465d9ad9ed864117eedf2872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Dec 2021 11:14:51 +0100 Subject: [PATCH 1178/1889] Test that empty filters return a None --- milli/src/search/facet/filter.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 88815d884..642a32472 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -644,4 +644,10 @@ mod tests { error.to_string() ); } + + #[test] + fn empty_filter() { + let option = Filter::from_str(" ").unwrap(); + assert_eq!(option, None); + } } From 25faef67d052d52eeb9d1ef4844238df162a01fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Dec 2021 11:15:16 +0100 Subject: [PATCH 1179/1889] Remove the database setup in the filter_depth test --- milli/src/search/facet/filter.rs | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 642a32472..6ece17eb4 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -614,19 +614,6 @@ mod tests { #[test] fn filter_depth() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); - builder.set_searchable_fields(vec![S("account_ids")]); - builder.set_filterable_fields(hashset! { S("account_ids") }); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - // generates a big (2 MiB) filter with too much of ORs. 
let tipic_filter = "account_ids=14361 OR "; let mut filter_string = String::with_capacity(tipic_filter.len() * 14360); From 1c6c89f34561c62a05b1f4575e9b6f2c060a579f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Dec 2021 11:50:12 +0100 Subject: [PATCH 1180/1889] Fix the binaries that use the new optional filters --- cli/src/main.rs | 5 +++-- http-ui/src/main.rs | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 44c197de6..b3c18244d 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -250,8 +250,9 @@ impl Search { } if let Some(ref filter) = self.filter { - let condition = milli::Filter::from_str(filter)?; - search.filter(condition); + if let Some(condition) = milli::Filter::from_str(filter)? { + search.filter(condition); + } } if let Some(offset) = self.offset { diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 4bd8815a5..75a9012c6 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -738,7 +738,7 @@ async fn main() -> anyhow::Result<()> { let filters = match query.filters.as_ref() { Some(condition) if !condition.trim().is_empty() => { - Some(MilliFilter::from_str(condition).unwrap()) + MilliFilter::from_str(condition).unwrap() } _otherwise => None, }; From 94011bb9a8cba5dd8c99bae463a1762472a7e6f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 9 Dec 2021 12:14:16 +0100 Subject: [PATCH 1181/1889] Fix the benchmarks to work with optional filters --- benchmarks/benches/utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 1b1d9be8c..df5a7b828 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -117,7 +117,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let mut search = index.search(&rtxn); search.query(query).optional_words(conf.optional_words); if let Some(filter) = conf.filter { - let filter = Filter::from_str(filter).unwrap(); + let filter = Filter::from_str(filter).unwrap().unwrap(); search.filter(filter); } if let Some(sort) = &conf.sort { From d671d6f0f1852aac4c47ad40bb05974e4d745130 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 Dec 2021 19:27:34 +0100 Subject: [PATCH 1182/1889] remove an unused file --- milli/src/update/index_documents/store.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 milli/src/update/index_documents/store.rs diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs deleted file mode 100644 index e69de29bb..000000000 From 98a365aaae53e2d543f65f5261811a23dad65660 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 14 Dec 2021 12:21:24 +0100 Subject: [PATCH 1183/1889] store the geopoint in three dimensions --- milli/src/lib.rs | 17 ++++++++++++++++- milli/src/search/criteria/geo.rs | 8 +++++--- milli/src/search/facet/filter.rs | 12 ++++++++---- milli/src/update/delete_documents.rs | 6 +++--- milli/src/update/index_documents/typed_chunk.rs | 9 +++++++-- 5 files changed, 39 insertions(+), 13 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 9e7bb8966..bb4f47e47 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -54,7 +54,11 @@ pub type FieldId = u16; pub type Position = u32; pub type RelativePosition = u16; pub type FieldDistribution = BTreeMap; -pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>; + +/// A GeoPoint is a point in cartesian plan, called xyz_point in the code. 
Its metadata +/// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point +/// expressed in terms of latitude and longitude. +pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>; pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; @@ -168,6 +172,17 @@ pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { a.haversine_distance_to(&b).meters() } +/// Converts a point expressed in terms of latitude and longitude to a point in +/// Cartesian coordinates expressed in terms of x, y and z. +pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] { + let [lat, lng] = coord.map(|f| f.to_radians()); + let x = lat.cos() * lng.cos(); + let y = lat.cos() * lng.sin(); + let z = lat.sin(); + + [x, y, z] +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index de6de8912..e3bda51de 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -5,7 +5,7 @@ use rstar::RTree; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::{GeoPoint, Index, Result}; +use crate::{lat_lng_to_xyz, GeoPoint, Index, Result}; pub struct Geo<'t> { index: &'t Index, @@ -132,10 +132,12 @@ fn geo_point( point: [f64; 2], ascending: bool, ) -> Box> { + let point = lat_lng_to_xyz(&point); + let mut results = Vec::new(); for point in rtree.nearest_neighbor_iter(&point) { - if candidates.remove(point.data) { - results.push(std::iter::once(point.data).collect()); + if candidates.remove(point.data.0) { + results.push(std::iter::once(point.data.0).collect()); if candidates.is_empty() { break; } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 6ece17eb4..551fa0d4e 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -13,7 +13,9 @@ use crate::error::{Error, UserError}; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; -use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{ + distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, +}; /// The maximum number of filters the filter AST can process.
const MAX_FILTER_DEPTH: usize = 2000; @@ -402,12 +404,14 @@ impl<'a> Filter<'a> { None => return Ok(RoaringBitmap::new()), }; + let xyz_base_point = lat_lng_to_xyz(&base_point); + let result = rtree - .nearest_neighbor_iter(&base_point) + .nearest_neighbor_iter(&xyz_base_point) .take_while(|point| { - distance_between_two_points(&base_point, point.geom()) < radius + distance_between_two_points(&base_point, &point.data.1) < radius }) - .map(|point| point.data) + .map(|point| point.data.0) .collect(); Ok(result) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index ed87132bd..4c41cbd53 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -395,9 +395,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree .iter() - .filter(|&point| self.documents_ids.contains(point.data)) + .filter(|&point| self.documents_ids.contains(point.data.0)) .cloned() - .map(|point| (point, point.data)) + .map(|point| (point, point.data.0)) .unzip(); points_to_remove.iter().for_each(|point| { rtree.remove(&point); @@ -747,7 +747,7 @@ mod tests { let all_geo_ids = rtree.iter().map(|point| point.data).collect::>(); let all_geo_documents = index - .documents(&rtxn, all_geo_ids.iter().copied()) + .documents(&rtxn, all_geo_ids.iter().map(|(id, _)| id).copied()) .unwrap() .iter() .map(|(id, _)| *id) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b24a03ff6..7f0cfcab3 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -12,7 +12,10 @@ use super::helpers::{ }; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::into_clonable_grenad; -use crate::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result}; +use crate::{ + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, + Result, +}; pub(crate) enum TypedChunk { DocidWordPositions(grenad::Reader), @@ -192,7 +195,9 @@ pub(crate) fn write_typed_chunk_into_index( let (lat, tail) = helpers::try_split_array_at::(value).unwrap(); let (lng, _) = helpers::try_split_array_at::(tail).unwrap(); let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; - rtree.insert(GeoPoint::new(point, docid)); + let xyz_point = lat_lng_to_xyz(&point); + + rtree.insert(GeoPoint::new(xyz_point, (docid, point))); geo_faceted_docids.insert(docid); } index.put_geo_rtree(wtxn, &rtree)?; From 02a21fd30915f1f2c3361501924fde1c2c5f208b Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 20 Dec 2021 16:18:15 +0100 Subject: [PATCH 1184/1889] Handle the escapes of quote in the filters --- filter-parser/src/error.rs | 6 +- filter-parser/src/lib.rs | 26 +++-- filter-parser/src/value.rs | 188 ++++++++++++++++++++++++++++++++++--- 3 files changed, 197 insertions(+), 23 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 401b8d7f3..dc13861a1 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -56,6 +56,7 @@ pub enum ErrorKind<'a> { InvalidPrimary, ExpectedEof, ExpectedValue, + MalformedValue, MissingClosingDelimiter(char), Char(char), InternalError(error::ErrorKind), @@ -82,7 +83,7 @@ impl<'a> Error<'a> { pub fn char(self) -> char { match self.kind { ErrorKind::Char(c) => c, - _ => panic!("Internal filter parser error"), + error => 
panic!("Internal filter parser error: {:?}", error), } } } @@ -117,6 +118,9 @@ impl<'a> Display for Error<'a> { ErrorKind::ExpectedValue if input.trim().is_empty() => { writeln!(f, "Was expecting a value but instead got nothing.")? } + ErrorKind::MalformedValue => { + writeln!(f, "Malformed value: `{}`.", escaped_input)? + } ErrorKind::MissingClosingDelimiter(c) => { writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 4c5e03c82..07ee57a99 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -62,29 +62,39 @@ pub type Span<'a> = LocatedSpan<&'a str, &'a str>; type IResult<'a, Ret> = nom::IResult, Ret, Error<'a>>; #[derive(Debug, Clone, Eq)] -pub struct Token<'a>(Span<'a>); +pub struct Token<'a> { + /// The token in the original input, it should be used when possible. + span: Span<'a>, + /// If you need to modify the original input you can use the `value` field + /// to store your modified input. + value: Option, +} impl<'a> Deref for Token<'a> { type Target = &'a str; fn deref(&self) -> &Self::Target { - &self.0 + &self.span } } impl<'a> PartialEq for Token<'a> { fn eq(&self, other: &Self) -> bool { - self.0.fragment() == other.0.fragment() + self.span.fragment() == other.span.fragment() } } impl<'a> Token<'a> { - pub fn new(position: Span<'a>) -> Self { - Self(position) + pub fn new(span: Span<'a>, value: Option) -> Self { + Self { span, value } + } + + pub fn value(&self) -> &str { + self.value.as_ref().map_or(&self.span, |value| value) } pub fn as_external_error(&self, error: impl std::error::Error) -> Error<'a> { - Error::new_from_external(self.0, error) + Error::new_from_external(self.span, error) } pub fn parse(&self) -> Result @@ -92,13 +102,13 @@ impl<'a> Token<'a> { T: FromStr, T::Err: std::error::Error, { - self.0.parse().map_err(|e| self.as_external_error(e)) + self.span.parse().map_err(|e| self.as_external_error(e)) } } impl<'a> From> for Token<'a> { fn from(span: Span<'a>) -> Self { - Self(span) + Self { span, value: None } } } diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index b9d929ab0..d2ca1c932 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -7,8 +7,54 @@ use nom::sequence::{delimited, terminated}; use crate::error::NomErrorExt; use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span, Token}; +/// This function goes through all chacaters in the [Span], if it finds any escaped character (`\`). +/// It generate a new string with all `\` removed from the [Span]. +fn unescape(buf: Span, char_to_escape: char) -> String { + let to_escape = format!("\\{}", char_to_escape); + buf.replace(&to_escape, &char_to_escape.to_string()) +} + +use nom::{InputIter, InputLength, InputTake, Slice}; + +/// Parse a value in quote. If it encounter an escaped quote it'll unescape it. 
+fn quoted_by(quote: char, input: Span) -> IResult { + // empty fields / values are valid in json + if input.is_empty() { + return Ok((input.slice(input.input_len()..), input.into())); + } + + let mut escaped = false; + let mut i = input.iter_indices(); + + while let Some((idx, c)) = i.next() { + match c { + c if c == quote => { + let (rem, output) = input.take_split(idx); + return Ok((rem, Token::new(output, escaped.then(|| unescape(output, quote))))); + } + '\\' => { + if let Some((_, c)) = i.next() { + escaped |= c == quote; + } else { + return Err(nom::Err::Error(Error::new_from_kind( + input, + ErrorKind::MalformedValue, + ))); + } + } + // if it was preceeded by a `\` or if it was anything else we can continue to advance + _ => (), + } + } + + Ok(( + input.slice(input.input_len()..), + Token::new(input, escaped.then(|| unescape(input, quote))), + )) +} + /// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* -pub fn parse_value(input: Span) -> IResult { +pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; @@ -30,12 +76,10 @@ pub fn parse_value(input: Span) -> IResult { _ => (), } - // singleQuoted = "'" .* all but quotes "'" - let simple_quoted = take_till(|c: char| c == '\''); - // doubleQuoted = "\"" (word | spaces)* "\"" - let double_quoted = take_till(|c: char| c == '"'); // word = (alphanumeric | _ | - | .)+ - let word = take_while1(is_value_component); + let word = |input: Span<'a>| -> IResult> { + take_while1(is_value_component)(input).map(|(s, t)| (s, t.into())) + }; // this parser is only used when an error is encountered and it parse the // largest string possible that do not contain any “language” syntax. 
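A note on the escaping contract implemented by `quoted_by` above: only a backslash followed by the quote character is rewritten, and the `escaped` flag (and therefore the allocation of a new `String` in the token's `value` field) is only set when the escaped character is the quote itself, so values without escapes stay borrowed. A standalone illustration of that unescaping rule, mirroring the `unescape` helper shown above but on plain `&str` instead of `Span`:
```
// Standalone illustration of the unescape rule used by `quoted_by`:
// only `\<quote>` is rewritten; every other backslash survives untouched.
fn unescape(buf: &str, char_to_escape: char) -> String {
    let to_escape = format!("\\{}", char_to_escape);
    buf.replace(&to_escape, &char_to_escape.to_string())
}

fn main() {
    // The escaped double quotes are unescaped...
    assert_eq!(unescape(r#"Hello \"World\""#, '"'), r#"Hello "World""#);
    // ...but backslashes in front of anything else are preserved.
    assert_eq!(unescape(r#"back\\slash \n"#, '"'), r#"back\\slash \n"#);
}
```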
@@ -48,20 +92,27 @@ pub fn parse_value(input: Span) -> IResult { terminated( alt(( - delimited(char('\''), cut(simple_quoted), cut(char('\''))), - delimited(char('"'), cut(double_quoted), cut(char('"'))), + delimited(char('\''), cut(|input| quoted_by('\'', input)), cut(char('\''))), + delimited(char('"'), cut(|input| quoted_by('"', input)), cut(char('"'))), word, )), multispace0, )(input) - .map(|(s, t)| (s, t.into())) + // .map(|(s, t)| (s, t.into())) // if we found nothing in the alt it means the user specified something that was not recognized as a value .map_err(|e: nom::Err| { e.map_err(|_| Error::new_from_kind(error_word(input).unwrap().1, ErrorKind::ExpectedValue)) }) - // if we found encountered a failure it means the user really tried to input a value, but had an unmatched quote .map_err(|e| { - e.map_fail(|c| Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(c.char()))) + e.map_fail(|failure| { + // if we found encountered a char failure it means the user had an unmatched quote + if matches!(failure.kind(), ErrorKind::Char(_)) { + Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(failure.char())) + } else { + // else we let the failure untouched + failure + } + }) }) } @@ -81,7 +132,7 @@ pub mod test { use crate::tests::rtok; #[test] - fn name() { + fn test_span() { let test_case = [ ("channel", rtok("", "channel")), (".private", rtok("", ".private")), @@ -102,6 +153,7 @@ pub mod test { ("\"cha'nnel\"", rtok("'", "cha'nnel")), ("\"cha'nnel\"", rtok("'", "cha'nnel")), ("I'm tamo", rtok("'m tamo", "I")), + ("\"I'm \\\"super\\\" tamo\"", rtok("\"", "I'm \\\"super\\\" tamo")), ]; for (input, expected) in test_case { @@ -114,8 +166,116 @@ pub mod test { expected, result.unwrap_err() ); - let value = result.unwrap().1; - assert_eq!(value, expected, "Filter `{}` failed.", input); + let token = result.unwrap().1; + assert_eq!(token, expected, "Filter `{}` failed.", input); + } + } + + #[test] + fn test_escape_inside_double_quote() { + // (input, remaining, expected output token, output value) + let test_case = [ + ("aaaa", "", rtok("", "aaaa"), "aaaa"), + (r#"aa"aa"#, r#""aa"#, rtok("", "aa"), "aa"), + (r#"aa\"aa"#, r#""#, rtok("", r#"aa\"aa"#), r#"aa"aa"#), + (r#"aa\\\aa"#, r#""#, rtok("", r#"aa\\\aa"#), r#"aa\\\aa"#), + (r#"aa\\"\aa"#, r#""\aa"#, rtok("", r#"aa\\"#), r#"aa\\"#), + (r#"aa\\\"\aa"#, r#""#, rtok("", r#"aa\\\"\aa"#), r#"aa\\"\aa"#), + (r#"\"\""#, r#""#, rtok("", r#"\"\""#), r#""""#), + ]; + + for (input, remaining, expected_tok, expected_val) in test_case { + let span = Span::new_extra(input, ""); + let result = quoted_by('"', span); + assert!(result.is_ok()); + + let (rem, output) = result.unwrap(); + assert_eq!(rem.to_string(), remaining); + assert_eq!(output, expected_tok); + assert_eq!(output.value(), expected_val.to_string()); + } + } + + #[test] + fn test_unescape() { + // double quote + assert_eq!( + unescape(Span::new_extra(r#"Hello \"World\""#, ""), '"'), + r#"Hello "World""#.to_string() + ); + assert_eq!( + unescape(Span::new_extra(r#"Hello \\\"World\\\""#, ""), '"'), + r#"Hello \\"World\\""#.to_string() + ); + // simple quote + assert_eq!( + unescape(Span::new_extra(r#"Hello \'World\'"#, ""), '\''), + r#"Hello 'World'"#.to_string() + ); + assert_eq!( + unescape(Span::new_extra(r#"Hello \\\'World\\\'"#, ""), '\''), + r#"Hello \\'World\\'"#.to_string() + ); + } + + #[test] + fn test_value() { + let test_case = [ + // (input, expected value, if a string was generated to hold the new value) + ("channel", "channel", false), + // All the base 
test, no escaped string should be generated + (".private", ".private", false), + ("I-love-kebab", "I-love-kebab", false), + ("but_snakes_is_also_good", "but_snakes_is_also_good", false), + ("parens(", "parens", false), + ("parens)", "parens", false), + ("not!", "not", false), + (" channel", "channel", false), + ("channel ", "channel", false), + (" channel ", "channel", false), + ("'channel'", "channel", false), + ("\"channel\"", "channel", false), + ("'cha)nnel'", "cha)nnel", false), + ("'cha\"nnel'", "cha\"nnel", false), + ("\"cha'nnel\"", "cha'nnel", false), + ("\" some spaces \"", " some spaces ", false), + ("\"cha'nnel\"", "cha'nnel", false), + ("\"cha'nnel\"", "cha'nnel", false), + ("I'm tamo", "I", false), + // escaped thing but not quote + (r#""\\""#, r#"\\"#, false), + (r#""\\\\\\""#, r#"\\\\\\"#, false), + (r#""aa\\aa""#, r#"aa\\aa"#, false), + // with double quote + (r#""Hello \"world\"""#, r#"Hello "world""#, true), + (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true), + (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true), + (r#""\"\"""#, r#""""#, true), + // with simple quote + (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true), + (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true), + (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true), + (r#"'\'\''"#, r#"''"#, true), + ]; + + for (input, expected, escaped) in test_case { + let input = Span::new_extra(input, input); + let result = parse_value(input); + + assert!( + result.is_ok(), + "Filter `{:?}` was supposed to be parsed but failed with the following error: `{}`", + expected, + result.unwrap_err() + ); + let token = result.unwrap().1; + assert_eq!( + token.value.is_some(), + escaped, + "Filter `{}` was not supposed to be escaped", + input + ); + assert_eq!(token.value(), expected, "Filter `{}` failed.", input); } } From 6a1216bd5102fc8e533bb6b57dc5352d7af5de92 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 Dec 2021 18:50:35 +0100 Subject: [PATCH 1185/1889] Integrate telegraf into our CI --- .../{benchmarks.yml => manual_benchmarks.yml} | 0 ...rks_indexing.yml => push_benchmarks_indexing.yml} | 12 +++++++++--- ...search_geo.yml => push_benchmarks_search_geo.yml} | 12 +++++++++--- ...ch_songs.yml => push_benchmarks_search_songs.yml} | 12 +++++++++--- ...arch_wiki.yml => push_benchmarks_search_wiki.yml} | 12 +++++++++--- 5 files changed, 36 insertions(+), 12 deletions(-) rename .github/workflows/{benchmarks.yml => manual_benchmarks.yml} (100%) rename .github/workflows/{cron_benchmarks_indexing.yml => push_benchmarks_indexing.yml} (88%) rename .github/workflows/{cron_benchmarks_search_geo.yml => push_benchmarks_search_geo.yml} (88%) rename .github/workflows/{cron_benchmarks_search_songs.yml => push_benchmarks_search_songs.yml} (88%) rename .github/workflows/{cron_benchmarks_search_wiki.yml => push_benchmarks_search_wiki.yml} (87%) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/manual_benchmarks.yml similarity index 100% rename from .github/workflows/benchmarks.yml rename to .github/workflows/manual_benchmarks.yml diff --git a/.github/workflows/cron_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml similarity index 88% rename from .github/workflows/cron_benchmarks_indexing.yml rename to .github/workflows/push_benchmarks_indexing.yml index 452966194..4a0dd6a10 100644 --- a/.github/workflows/cron_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -1,10 +1,12 @@ -name: Benchmarks indexing (cron) +name: Benchmarks indexing (push) on: - 
schedule: - - cron: "30 0 * * FRI" # every friday at 00:30 + push: + branches: + - main env: + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} BENCH_NAME: "indexing" jobs: @@ -61,6 +63,10 @@ jobs: source: ${{ steps.file.outputs.basename }}.json out_dir: critcmp_results + # Upload benchmarks to influxdb + - name: Upload ${{ steps.file.outputs.basename }}.json to influxDB + run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug + # Helper - name: 'README: compare with another benchmark' run: | diff --git a/.github/workflows/cron_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml similarity index 88% rename from .github/workflows/cron_benchmarks_search_geo.yml rename to .github/workflows/push_benchmarks_search_geo.yml index 642b5018e..ef7c05cd6 100644 --- a/.github/workflows/cron_benchmarks_search_geo.yml +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -1,11 +1,13 @@ -name: Benchmarks search geo (cron) +name: Benchmarks search geo (push) on: - schedule: - - cron: "30 18 * * FRI" # every friday at 18:30 + push: + branches: + - main env: BENCH_NAME: "search_geo" + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} jobs: benchmarks: @@ -61,6 +63,10 @@ jobs: source: ${{ steps.file.outputs.basename }}.json out_dir: critcmp_results + # Upload benchmarks to influxdb + - name: Upload ${{ steps.file.outputs.basename }}.json to influxDB + run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug + # Helper - name: 'README: compare with another benchmark' run: | diff --git a/.github/workflows/cron_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml similarity index 88% rename from .github/workflows/cron_benchmarks_search_songs.yml rename to .github/workflows/push_benchmarks_search_songs.yml index d15cc7ab6..82d764434 100644 --- a/.github/workflows/cron_benchmarks_search_songs.yml +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -1,11 +1,13 @@ -name: Benchmarks search songs (cron) +name: Benchmarks search songs (push) on: - schedule: - - cron: "30 08 * * FRI" # every friday at 08:30 + push: + branches: + - main env: BENCH_NAME: "search_songs" + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} jobs: benchmarks: @@ -61,6 +63,10 @@ jobs: source: ${{ steps.file.outputs.basename }}.json out_dir: critcmp_results + # Upload benchmarks to influxdb + - name: Upload ${{ steps.file.outputs.basename }}.json to influxDB + run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug + # Helper - name: 'README: compare with another benchmark' run: | diff --git a/.github/workflows/cron_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml similarity index 87% rename from .github/workflows/cron_benchmarks_search_wiki.yml rename to .github/workflows/push_benchmarks_search_wiki.yml index c73e8c037..efb18d86d 100644 --- a/.github/workflows/cron_benchmarks_search_wiki.yml +++ b/.github/workflows/push_benchmarks_search_wiki.yml @@ -1,11 +1,13 @@ -name: Benchmarks search wikipedia articles (cron) +name: Benchmarks search wikipedia articles (push) on: - schedule: - - cron: "30 16 * * FRI" # every friday at 16:30 (it’s snacky snack-time!) 
+ push: + branches: + - main env: BENCH_NAME: "search_wiki" + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} jobs: benchmarks: @@ -61,6 +63,10 @@ jobs: source: ${{ steps.file.outputs.basename }}.json out_dir: critcmp_results + # Upload benchmarks to influxdb + - name: Upload ${{ steps.file.outputs.basename }}.json to influxDB + run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug + # Helper - name: 'README: compare with another benchmark' run: | From 9bdcd42b9bdf3aea6e283e81febf5767acd6d4de Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jan 2022 13:02:15 +0100 Subject: [PATCH 1186/1889] reintroduce the gitignore for the fuzzer --- filter-parser/fuzz/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 filter-parser/fuzz/.gitignore diff --git a/filter-parser/fuzz/.gitignore b/filter-parser/fuzz/.gitignore new file mode 100644 index 000000000..cb73742e4 --- /dev/null +++ b/filter-parser/fuzz/.gitignore @@ -0,0 +1,2 @@ +/corpus/ +/artifacts/ From 3d99686f7ab71bb83655b45d377e9b45202727dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 4 Jan 2022 16:01:01 +0100 Subject: [PATCH 1187/1889] Change self-hosted label by benchmarks --- .github/workflows/manual_benchmarks.yml | 2 +- .github/workflows/push_benchmarks_indexing.yml | 2 +- .github/workflows/push_benchmarks_search_geo.yml | 2 +- .github/workflows/push_benchmarks_search_songs.yml | 2 +- .github/workflows/push_benchmarks_search_wiki.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml index a857618d0..19b071f57 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/manual_benchmarks.yml @@ -14,7 +14,7 @@ env: jobs: benchmarks: name: Run and upload benchmarks - runs-on: self-hosted + runs-on: benchmarks steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml index 4a0dd6a10..0806f5646 100644 --- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -12,7 +12,7 @@ env: jobs: benchmarks: name: Run and upload benchmarks - runs-on: self-hosted + runs-on: benchmarks steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml index ef7c05cd6..cdf9264a7 100644 --- a/.github/workflows/push_benchmarks_search_geo.yml +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -12,7 +12,7 @@ env: jobs: benchmarks: name: Run and upload benchmarks - runs-on: self-hosted + runs-on: benchmarks steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml index 82d764434..cb2eddd46 100644 --- a/.github/workflows/push_benchmarks_search_songs.yml +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -12,7 +12,7 @@ env: jobs: benchmarks: name: Run and upload benchmarks - runs-on: self-hosted + runs-on: benchmarks steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml index efb18d86d..71eb89c97 100644 --- a/.github/workflows/push_benchmarks_search_wiki.yml +++ 
b/.github/workflows/push_benchmarks_search_wiki.yml @@ -12,7 +12,7 @@ env: jobs: benchmarks: name: Run and upload benchmarks - runs-on: self-hosted + runs-on: benchmarks steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 From 3c7ea1d298ca8dac991da7f006b9a4bf2a25f9d1 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 10 Jan 2022 15:14:32 +0100 Subject: [PATCH 1188/1889] Apply code suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- filter-parser/src/value.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index d2ca1c932..ec7c93656 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -3,19 +3,18 @@ use nom::bytes::complete::{take_till, take_while, take_while1}; use nom::character::complete::{char, multispace0}; use nom::combinator::cut; use nom::sequence::{delimited, terminated}; +use nom::{InputIter, InputLength, InputTake, Slice}; use crate::error::NomErrorExt; use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span, Token}; -/// This function goes through all chacaters in the [Span], if it finds any escaped character (`\`). -/// It generate a new string with all `\` removed from the [Span]. +/// This function goes through all characters in the [Span] if it finds any escaped character (`\`). +/// It generates a new string with all `\` removed from the [Span]. fn unescape(buf: Span, char_to_escape: char) -> String { let to_escape = format!("\\{}", char_to_escape); buf.replace(&to_escape, &char_to_escape.to_string()) } -use nom::{InputIter, InputLength, InputTake, Slice}; - /// Parse a value in quote. If it encounter an escaped quote it'll unescape it. 
fn quoted_by(quote: char, input: Span) -> IResult { // empty fields / values are valid in json @@ -98,7 +97,6 @@ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { )), multispace0, )(input) - // .map(|(s, t)| (s, t.into())) // if we found nothing in the alt it means the user specified something that was not recognized as a value .map_err(|e: nom::Err| { e.map_err(|_| Error::new_from_kind(error_word(input).unwrap().1, ErrorKind::ExpectedValue)) From 0fcde35a20a933963f867b0abb2e471a729dd822 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 10 Jan 2022 15:53:44 +0100 Subject: [PATCH 1189/1889] Update filter-parser/src/value.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- filter-parser/src/value.rs | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index ec7c93656..84dd21902 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -26,24 +26,20 @@ fn quoted_by(quote: char, input: Span) -> IResult { let mut i = input.iter_indices(); while let Some((idx, c)) = i.next() { - match c { - c if c == quote => { - let (rem, output) = input.take_split(idx); - return Ok((rem, Token::new(output, escaped.then(|| unescape(output, quote))))); + if c == quote { + let (rem, output) = input.take_split(idx); + return Ok((rem, Token::new(output, escaped.then(|| unescape(output, quote))))); + } else if c == '\\' { + if let Some((_, c)) = i.next() { + escaped |= c == quote; + } else { + return Err(nom::Err::Error(Error::new_from_kind( + input, + ErrorKind::MalformedValue, + ))); } - '\\' => { - if let Some((_, c)) = i.next() { - escaped |= c == quote; - } else { - return Err(nom::Err::Error(Error::new_from_kind( - input, - ErrorKind::MalformedValue, - ))); - } - } - // if it was preceeded by a `\` or if it was anything else we can continue to advance - _ => (), } + // if it was preceeded by a `\` or if it was anything else we can continue to advance } Ok(( From 92804f6f459bb1b5defa46de8f90923fa3b78d94 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 10 Jan 2022 15:59:04 +0100 Subject: [PATCH 1190/1889] apply clippy suggestions --- filter-parser/src/condition.rs | 8 +------- filter-parser/src/error.rs | 4 ++-- filter-parser/src/lib.rs | 2 +- filter-parser/src/main.rs | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index abd549534..264787055 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -63,11 +63,5 @@ pub fn parse_to(input: Span) -> IResult { let (input, (key, from, _, to)) = tuple((parse_value, parse_value, tag("TO"), cut(parse_value)))(input)?; - Ok(( - input, - FilterCondition::Condition { - fid: key.into(), - op: Between { from: from.into(), to: to.into() }, - }, - )) + Ok((input, FilterCondition::Condition { fid: key, op: Between { from, to } })) } diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index dc13861a1..ddf7bea47 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -19,14 +19,14 @@ impl NomErrorExt for nom::Err { fn map_err E>(self, op: O) -> nom::Err { match self { e @ Self::Failure(_) => e, - e => e.map(|e| op(e)), + e => e.map(op), } } fn map_fail E>(self, op: O) -> nom::Err { match self { e @ Self::Error(_) => e, - e => e.map(|e| op(e)), + e => e.map(op), } } } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 
07ee57a99..bad7dbc64 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -233,7 +233,7 @@ fn parse_geo_point(input: Span) -> IResult<FilterCondition> { multispace0, tag("_geoPoint"), // if we were able to parse `_geoPoint` we are going to return a Failure whatever happens next. - cut(delimited(char('('), separated_list1(tag(","), ws(|c| recognize_float(c))), char(')'))), + cut(delimited(char('('), separated_list1(tag(","), ws(recognize_float)), char(')'))), ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; // if we succeeded we still return a `Failure` because geoPoints are not allowed diff --git a/filter-parser/src/main.rs b/filter-parser/src/main.rs index a3e4cab28..15ab86188 100644 --- a/filter-parser/src/main.rs +++ b/filter-parser/src/main.rs @@ -10,7 +10,7 @@ fn main() { } Err(e) => { println!("❎ Invalid filter"); - println!("{}", e.to_string()); + println!("{}", e); } } } From e1053989c016f66b14f26997d0ff44b0331007b4 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 12 Jan 2022 17:57:54 +0100 Subject: [PATCH 1191/1889] add a fuzzer on milli --- milli/README.md | 26 ++++++++++ milli/fuzz/.gitignore | 2 + milli/fuzz/Cargo.toml | 36 ++++++++++++++ milli/fuzz/fuzz_targets/indexing.rs | 76 +++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+) create mode 100644 milli/README.md create mode 100644 milli/fuzz/.gitignore create mode 100644 milli/fuzz/Cargo.toml create mode 100644 milli/fuzz/fuzz_targets/indexing.rs diff --git a/milli/README.md b/milli/README.md new file mode 100644 index 000000000..7479eff45 --- /dev/null +++ b/milli/README.md @@ -0,0 +1,26 @@ +# Milli + +## Fuzzing milli + +Currently you can only fuzz the indexation. +To execute the fuzzer run: +``` +cargo fuzz run indexing +``` + +To execute the fuzzer on multiple threads you can also run: +``` +cargo fuzz run -j4 indexing +``` + +Since the fuzzer is going to create a lot of temporary files to let milli index its documents +I would also recommend executing it on a ramdisk. +Here is how to set up a ramdisk on Linux: +``` +sudo mount -t tmpfs none path/to/your/ramdisk +``` +And then set the [TMPDIR](https://doc.rust-lang.org/std/env/fn.temp_dir.html) environment variable +to make the fuzzer create its files in it: +``` +export TMPDIR=path/to/your/ramdisk +``` diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore new file mode 100644 index 000000000..cb73742e4 --- /dev/null +++ b/milli/fuzz/.gitignore @@ -0,0 +1,2 @@ +/corpus/ +/artifacts/ diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml new file mode 100644 index 000000000..04b329600 --- /dev/null +++ b/milli/fuzz/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "milli-fuzz" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } +serde_json = { version = "1.0.62", features = ["preserve_order"] } +anyhow = "1.0" +tempfile = "3.3" +arbitrary-json = { path = "../../../arbitrary-json" } + +[target.'cfg(target_os = "linux")'.dependencies] +jemallocator = "0.3.2" + +[dependencies.milli] +path = ".."
+ + # Prevent this from interfering with workspaces + [workspace] + members = ["."] + + [profile.release] + debug = true + + [[bin]] + name = "indexing" + path = "fuzz_targets/indexing.rs" + test = false + doc = false diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs new file mode 100644 index 000000000..179ccf757 --- /dev/null +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -0,0 +1,76 @@ +#![no_main] + +use std::io::{BufWriter, Cursor, Read, Seek, Write}; + +use anyhow::{bail, Result}; +use arbitrary_json::ArbitraryValue; +use heed::EnvOpenOptions; +use libfuzzer_sys::fuzz_target; +use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::update::UpdateBuilder; +use milli::Index; +use serde_json::Value; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +/// Reads JSON from input and writes an obkv batch to writer. +pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> { + let writer = BufWriter::new(writer); + let mut builder = DocumentBatchBuilder::new(writer)?; + builder.extend_from_json(input)?; + + if builder.len() == 0 { + bail!("Empty payload"); + } + + let count = builder.finish()?; + + Ok(count) +} + +fn index_documents( + index: &mut milli::Index, + documents: DocumentBatchReader<Cursor<Vec<u8>>>, +) -> Result<()> { + let update_builder = UpdateBuilder::new(); + let mut wtxn = index.write_txn()?; + let builder = update_builder.index_documents(&mut wtxn, &index); + + builder.execute(documents, |_| ())?; + wtxn.commit()?; + Ok(()) +} + +fn create_index() -> Result<Index> { + let dir = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.max_readers(1); + Ok(Index::new(options, dir.path())?) +} + +fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| { + if let Ok(mut index) = create_index() { + for batch in batches { + let documents: Vec<Value> = + batch.into_iter().map(|value| serde_json::Value::from(value)).collect(); + let json = Value::Array(documents); + let json = serde_json::to_string(&json).unwrap(); + + let mut documents = Cursor::new(Vec::new()); + + // We ignore all badly generated documents + if let Ok(_count) = read_json(json.as_bytes(), &mut documents) { + let documents = DocumentBatchReader::from_reader(documents).unwrap(); + match index_documents(&mut index, documents) { + // Err(e @ InternalError(_) | e @ IoError(_)) => panic!("{:?}", e), + _ => (), + } + } + } + + index.prepare_for_closing().wait(); + } +}); From c94952e25d33b3750da86aa00bede852133a1915 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 12 Jan 2022 18:30:11 +0100 Subject: [PATCH 1192/1889] update the readme + dependencies --- milli/README.md | 4 ++-- milli/fuzz/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/README.md b/milli/README.md index 7479eff45..56db42a86 100644 --- a/milli/README.md +++ b/milli/README.md @@ -5,12 +5,12 @@ Currently you can only fuzz the indexation.
To execute the fuzzer run: ``` -cargo fuzz run indexing +cargo +nightly fuzz run indexing ``` To execute the fuzzer on multiple threads you can also run: ``` -cargo fuzz run -j4 indexing +cargo +nightly fuzz run -j4 indexing ``` Since the fuzzer is going to create a lot of temporary files to let milli index its documents diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml index 04b329600..3386ddaf9 100644 --- a/milli/fuzz/Cargo.toml +++ b/milli/fuzz/Cargo.toml @@ -14,7 +14,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } serde_json = { version = "1.0.62", features = ["preserve_order"] } anyhow = "1.0" tempfile = "3.3" -arbitrary-json = { git = "https://github.com/irevoire/arbitrary-json" } +arbitrary-json = { git = "https://github.com/irevoire/arbitrary-json" } From b22c80106f6766f413e16f1576a0c532b7c120ce Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 13 Jan 2022 15:35:24 +0100 Subject: [PATCH 1193/1889] add some settings to the fuzzed milli and use the published version of arbitrary json --- milli/fuzz/Cargo.toml | 3 ++- milli/fuzz/fuzz_targets/indexing.rs | 36 ++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml index 3386ddaf9..0456e7098 100644 --- a/milli/fuzz/Cargo.toml +++ b/milli/fuzz/Cargo.toml @@ -9,12 +9,13 @@ edition = "2018" cargo-fuzz = true [dependencies] +arbitrary = "1.0" libfuzzer-sys = "0.4" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } serde_json = { version = "1.0.62", features = ["preserve_order"] } anyhow = "1.0" tempfile = "3.3" -arbitrary-json = { git = "https://github.com/irevoire/arbitrary-json" } +arbitrary-json = "0.1.0" [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 179ccf757..9b63983fb 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -1,5 +1,6 @@ #![no_main] +use std::collections::HashSet; use std::io::{BufWriter, Cursor, Read, Seek, Write}; use anyhow::{bail, Result}; @@ -46,9 +47,38 @@ fn index_documents( fn create_index() -> Result<Index> { let dir = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.map_size(10 * 1024 * 1024 * 1024); // 10 GB options.max_readers(1); - Ok(Index::new(options, dir.path())?)
+ let index = Index::new(options, dir.path())?; + + let update_builder = UpdateBuilder::new(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update_builder.settings(&mut wtxn, &index); + + let displayed_fields = + ["id", "title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields: HashSet = + ["released-timestamp", "duration-float", "genre", "country", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_filterable_fields(faceted_fields.clone()); + builder.set_sortable_fields(faceted_fields); + + builder.set_distinct_field("same".to_string()); + + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + Ok(index) } fuzz_target!(|batches: Vec>| { @@ -63,9 +93,9 @@ fuzz_target!(|batches: Vec>| { // We ignore all badly generated documents if let Ok(_count) = read_json(json.as_bytes(), &mut documents) { + documents.rewind().unwrap(); let documents = DocumentBatchReader::from_reader(documents).unwrap(); match index_documents(&mut index, documents) { - // Err(e @ InternalError(_) | e @ IoError(_)) => panic!("{:?}", e), _ => (), } } From 0605c0ac682993233e98b70cdd553564abee7ebd Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 13 Jan 2022 18:51:08 +0100 Subject: [PATCH 1194/1889] apply review comments --- milli/fuzz/fuzz_targets/indexing.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 9b63983fb..327df09d1 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -91,13 +91,13 @@ fuzz_target!(|batches: Vec>| { let mut documents = Cursor::new(Vec::new()); - // We ignore all badly generated documents - if let Ok(_count) = read_json(json.as_bytes(), &mut documents) { + // We ignore all malformed documents + if let Ok(_) = read_json(json.as_bytes(), &mut documents) { documents.rewind().unwrap(); let documents = DocumentBatchReader::from_reader(documents).unwrap(); - match index_documents(&mut index, documents) { - _ => (), - } + // A lot of errors can come out of milli and we don't know which ones are normal or not + // so we are only going to look for the unexpected panics. + let _ = index_documents(&mut index, documents); } } From 30247d70cd3da8f85c960d14f5af291257328a2e Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Fri, 17 Dec 2021 22:53:34 +0530 Subject: [PATCH 1195/1889] Fix search highlight for non-unicode chars The `matching_bytes` function takes a `&Token` now and: - gets the number of bytes to highlight (unchanged). - uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight. In essence, the `matching_bytes` function returns the number of matching grapheme clusters instead of bytes. Should this function be renamed then? 
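To make the byte versus grapheme distinction concrete: a single user-perceived character can span several bytes, so counting highlight length in bytes can cut a character in half. A small aside using the same `unicode-segmentation` crate this patch pulls in (a standalone sketch, not part of the patch itself):
```
// Aside: why the highlight boundary is counted in grapheme clusters, not bytes.
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    let word = "héllo"; // 'é' is two bytes in UTF-8 but one grapheme cluster
    assert_eq!(word.len(), 6); // byte length
    assert_eq!(word.graphemes(true).count(), 5); // user-perceived length

    // Highlighting the first two "characters" safely, as the HTTP UI now does:
    let highlighted: String = word.graphemes(true).take(2).collect();
    assert_eq!(highlighted, "hé");
}
```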
Added proper highlighting in the HTTP UI: - requires dependency on `unicode-segmentation` to extract grapheme clusters from tokens - `<em>` tag is put around only the matched part - before this change, the entire word was highlighted even if only a part of it matched --- http-ui/Cargo.toml | 1 + http-ui/src/main.rs | 23 ++++++++++++++++------- milli/src/search/matching_words.rs | 14 +++++++++----- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 593dba3e5..79c784fdd 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -17,6 +17,7 @@ once_cell = "1.5.2" rayon = "1.5.0" structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] } tempfile = "3.2.0" +unicode-segmentation = "1.6.0" # http server askama = "0.10.5" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 75a9012c6..386f10cb4 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -34,6 +34,7 @@ use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; use tokio::sync::broadcast; +use unicode_segmentation::UnicodeSegmentation; use warp::filters::ws::Message; use warp::http::Response; use warp::Filter; @@ -160,13 +161,21 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed = self.analyzer.analyze(&old_string); for (word, token) in analyzed.reconstruct() { if token.is_word() { - let to_highlight = matching_words.matching_bytes(token.text()).is_some(); - if to_highlight { - string.push_str("<em>") - } - string.push_str(word); - if to_highlight { - string.push_str("</em>") + let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0); + if chars_to_highlight > 0 { + let graphemes = word.graphemes(true); + let chars = graphemes.clone().into_iter(); + + string.push_str("<em>"); + string.push_str( + chars.take(chars_to_highlight).collect::<String>().as_str(), + ); + string.push_str("</em>"); + + let chars = graphemes.into_iter().skip(chars_to_highlight); + string.push_str(chars.collect::<String>().as_str()); + } else { + string.push_str(word); } } else { string.push_str(word); diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 37754a782..b22335658 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::{Index, IndexMut}; use levenshtein_automata::{Distance, DFA}; +use meilisearch_tokenizer::Token; use super::build_dfa; use crate::search::query_tree::{Operation, Query}; @@ -33,15 +34,18 @@ impl MatchingWords { } /// Returns the number of matching bytes if the word matches one of the query words.
- pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> { + pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> { self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| { - match dfa.eval(word_to_highlight) { + match dfa.eval(word_to_highlight.text()) { Distance::Exact(t) if t <= *typo => { if *is_prefix { - let len = bytes_to_highlight(word_to_highlight, query_word); - Some(len) + let len = bytes_to_highlight(word_to_highlight.text(), query_word); + Some(word_to_highlight.num_graphemes_from_bytes(len)) } else { - Some(word_to_highlight.len()) + Some( + word_to_highlight + .num_graphemes_from_bytes(word_to_highlight.text().len()), + ) } } _otherwise => None, From e752bd06f7f5a2e0221a484029d31b1f705155dc Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Fri, 17 Dec 2021 23:26:06 +0530 Subject: [PATCH 1196/1889] Fix matching_words tests to compile successfully The tests still fail due to a bug in https://github.com/meilisearch/tokenizer/pull/59 --- milli/src/search/matching_words.rs | 66 ++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 7 deletions(-) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index b22335658..6df2e0121 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -182,8 +182,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize { #[cfg(test)] mod tests { + use std::borrow::Cow; use std::str::from_utf8; + use meilisearch_tokenizer::TokenKind; + use super::*; use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::MatchingWords; @@ -273,12 +276,61 @@ mod tests { let matching_words = MatchingWords::from_query_tree(&query_tree); - assert_eq!(matching_words.matching_bytes("word"), Some(3)); - assert_eq!(matching_words.matching_bytes("nyc"), None); - assert_eq!(matching_words.matching_bytes("world"), Some(5)); - assert_eq!(matching_words.matching_bytes("splitted"), Some(5)); - assert_eq!(matching_words.matching_bytes("thisnew"), None); - assert_eq!(matching_words.matching_bytes("borld"), Some(5)); - assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4)); + assert_eq!(matching_words.matching_bytes(&Token{ + kind: TokenKind::Word, + word: Cow::Borrowed("word"), + byte_start: 0, + char_index: 0, + byte_end: "word".len(), + char_map: None, + }), Some(3)); + assert_eq!(matching_words.matching_bytes(&Token{ + kind: TokenKind::Word, + word: Cow::Borrowed("nyc"), + byte_start: 0, + char_index: 0, + byte_end: "nyc".len(), + char_map: None, + }), None); + assert_eq!(matching_words.matching_bytes(&Token{ + kind: TokenKind::Word, + word: Cow::Borrowed("world"), + byte_start: 0, + char_index: 0, + byte_end: "world".len(), + char_map: None, + }), Some(5)); + assert_eq!(matching_words.matching_bytes(&Token{ + kind: TokenKind::Word, + word: Cow::Borrowed("splitted"), + byte_start: 0, + char_index: 0, + byte_end: "splitted".len(), + char_map: None, + }), Some(5)); + assert_eq!(matching_words.matching_bytes(&Token{ + kind: TokenKind::Word, + word: Cow::Borrowed("thisnew"), + byte_start: 0, + char_index: 0, + byte_end: "thisnew".len(), + char_map: None, + }), None); + assert_eq!(matching_words.matching_bytes(&Token{ + kind: TokenKind::Word, + word: Cow::Borrowed("borld"), + byte_start: 0, + char_index: 0, + byte_end: "borld".len(), + char_map: None, + }), Some(5)); + assert_eq!(matching_words.matching_bytes(&Token{ + kind: TokenKind::Word, + word: Cow::Borrowed("wordsplit"), + byte_start: 0, + char_index: 0, + byte_end: "wordsplit".len(), +
char_map: None, + }), Some(4)); } } From c10f58b7bdbf7ccf958c24e1cb628963af060972 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Mon, 17 Jan 2022 13:02:00 +0530 Subject: [PATCH 1197/1889] Update tokenizer to v0.2.7 --- http-ui/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 79c784fdd..7406a1c1b 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,7 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" } memmap2 = "0.5.0" milli = { path = "../milli" } once_cell = "1.5.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index a3d8cf627..3d77654eb 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" } memmap2 = "0.5.0" obkv = "0.2.0" once_cell = "1.5.2" From 5ab505be33256c1d338d9abeef2f78633c4cd89a Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Mon, 17 Jan 2022 13:02:55 +0530 Subject: [PATCH 1198/1889] Fix highlight by replacing num_graphemes_from_bytes num_graphemes_from_bytes has been renamed in the tokenizer to num_chars_from_bytes. Highlight now works correctly! 
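For illustration, a rough standalone sketch (not part of this patch) of the byte-to-char conversion the renamed method performs; the real `Token::num_chars_from_bytes` may additionally account for normalization via the token's `char_map`, which this sketch assumes does not apply:

fn num_chars_from_bytes(text: &str, num_bytes: usize) -> usize {
    // Count the chars whose starting byte offset falls inside the matched prefix.
    text.char_indices().take_while(|(i, _)| *i < num_bytes).count()
}

fn main() {
    assert_eq!(num_chars_from_bytes("héllo", 3), 2); // "h" + "é" ("é" is 2 bytes)
    assert_eq!(num_chars_from_bytes("word", 3), 3); // pure ASCII: bytes == chars
}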
--- milli/src/search/matching_words.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 6df2e0121..74ff14382 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -40,11 +40,11 @@ impl MatchingWords { Distance::Exact(t) if t <= *typo => { if *is_prefix { let len = bytes_to_highlight(word_to_highlight.text(), query_word); - Some(word_to_highlight.num_graphemes_from_bytes(len)) + Some(word_to_highlight.num_chars_from_bytes(len)) } else { Some( word_to_highlight - .num_graphemes_from_bytes(word_to_highlight.text().len()), + .num_chars_from_bytes(word_to_highlight.text().len()), ) } } From 2d7607734eaf8baf343da07290170d0eadec4e53 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Mon, 17 Jan 2022 13:04:33 +0530 Subject: [PATCH 1199/1889] Run cargo fmt on matching_words.rs --- milli/src/search/matching_words.rs | 138 ++++++++++++++++------------- 1 file changed, 78 insertions(+), 60 deletions(-) diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 74ff14382..67bdefb37 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -42,10 +42,7 @@ impl MatchingWords { let len = bytes_to_highlight(word_to_highlight.text(), query_word); Some(word_to_highlight.num_chars_from_bytes(len)) } else { - Some( - word_to_highlight - .num_chars_from_bytes(word_to_highlight.text().len()), - ) + Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len())) } } _otherwise => None, @@ -276,61 +273,82 @@ mod tests { let matching_words = MatchingWords::from_query_tree(&query_tree); - assert_eq!(matching_words.matching_bytes(&Token{ - kind: TokenKind::Word, - word: Cow::Borrowed("word"), - byte_start: 0, - char_index: 0, - byte_end: "word".len(), - char_map: None, - }), Some(3)); - assert_eq!(matching_words.matching_bytes(&Token{ - kind: TokenKind::Word, - word: Cow::Borrowed("nyc"), - byte_start: 0, - char_index: 0, - byte_end: "nyc".len(), - char_map: None, - }), None); - assert_eq!(matching_words.matching_bytes(&Token{ - kind: TokenKind::Word, - word: Cow::Borrowed("world"), - byte_start: 0, - char_index: 0, - byte_end: "world".len(), - char_map: None, - }), Some(5)); - assert_eq!(matching_words.matching_bytes(&Token{ - kind: TokenKind::Word, - word: Cow::Borrowed("splitted"), - byte_start: 0, - char_index: 0, - byte_end: "splitted".len(), - char_map: None, - }), Some(5)); - assert_eq!(matching_words.matching_bytes(&Token{ - kind: TokenKind::Word, - word: Cow::Borrowed("thisnew"), - byte_start: 0, - char_index: 0, - byte_end: "thisnew".len(), - char_map: None, - }), None); - assert_eq!(matching_words.matching_bytes(&Token{ - kind: TokenKind::Word, - word: Cow::Borrowed("borld"), - byte_start: 0, - char_index: 0, - byte_end: "borld".len(), - char_map: None, - }), Some(5)); - assert_eq!(matching_words.matching_bytes(&Token{ - kind: TokenKind::Word, - word: Cow::Borrowed("wordsplit"), - byte_start: 0, - char_index: 0, - byte_end: "wordsplit".len(), - char_map: None, - }), Some(4)); + assert_eq!( + matching_words.matching_bytes(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("word"), + byte_start: 0, + char_index: 0, + byte_end: "word".len(), + char_map: None, + }), + Some(3) + ); + assert_eq!( + matching_words.matching_bytes(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("nyc"), + byte_start: 0, + char_index: 0, + byte_end: "nyc".len(), + char_map: None, + }), + None + ); + 
assert_eq!( + matching_words.matching_bytes(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("world"), + byte_start: 0, + char_index: 0, + byte_end: "world".len(), + char_map: None, + }), + Some(5) + ); + assert_eq!( + matching_words.matching_bytes(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("splitted"), + byte_start: 0, + char_index: 0, + byte_end: "splitted".len(), + char_map: None, + }), + Some(5) + ); + assert_eq!( + matching_words.matching_bytes(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("thisnew"), + byte_start: 0, + char_index: 0, + byte_end: "thisnew".len(), + char_map: None, + }), + None + ); + assert_eq!( + matching_words.matching_bytes(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("borld"), + byte_start: 0, + char_index: 0, + byte_end: "borld".len(), + char_map: None, + }), + Some(5) + ); + assert_eq!( + matching_words.matching_bytes(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("wordsplit"), + byte_start: 0, + char_index: 0, + byte_end: "wordsplit".len(), + char_map: None, + }), + Some(4) + ); } } From c0313f3026cb8577c4035fd72394772842dec4b4 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Mon, 17 Jan 2022 13:10:44 +0530 Subject: [PATCH 1200/1889] Use chars for highlight instead of graphemes Tokenizer v0.2.7 uses chars instead of graphemes for matching bytes. `unicode-segmentation` dependency isn't needed anymore. Also, oxidised the highlight code :) Co-authored-by: many --- http-ui/Cargo.toml | 1 - http-ui/src/main.rs | 27 ++++++++++++--------------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 7406a1c1b..f45a85753 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -17,7 +17,6 @@ once_cell = "1.5.2" rayon = "1.5.0" structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] } tempfile = "3.2.0" -unicode-segmentation = "1.6.0" # http server askama = "0.10.5" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 386f10cb4..6502bf83a 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -34,7 +34,6 @@ use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; use tokio::sync::broadcast; -use unicode_segmentation::UnicodeSegmentation; use warp::filters::ws::Message; use warp::http::Response; use warp::Filter; @@ -161,21 +160,19 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed = self.analyzer.analyze(&old_string); for (word, token) in analyzed.reconstruct() { if token.is_word() { - let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0); - if chars_to_highlight > 0 { - let graphemes = word.graphemes(true); - let chars = graphemes.clone().into_iter(); + match matching_words.matching_bytes(&token) { + Some(chars_to_highlight) => { + let mut chars = word.chars(); - string.push_str("<mark>"); - string.push_str( - chars.take(chars_to_highlight).collect::<String>().as_str(), - ); - string.push_str("</mark>"); - - let chars = graphemes.into_iter().skip(chars_to_highlight); - string.push_str(chars.collect::<String>().as_str()); - } else { - string.push_str(word); + string.push_str("<mark>"); + // push the part to highlight + string.extend(chars.by_ref().take(chars_to_highlight)); + string.push_str("</mark>"); + // push the suffix after highlight + string.extend(chars); + } + // no highlight + None => string.push_str(word), } } else { string.push_str(word); From d1ac40ea14309d8f2f123748a3277fc04d12dfd9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 17 Jan 2022 13:51:46 +0100 Subject: [PATCH 1201/1889]
fix(filter): Fix two bugs. - Stop lowercasing the field when looking in the field id map - When a field id does not exist it means there are currently zero documents containing this field, thus we return an empty RoaringBitmap instead of throwing an internal error --- milli/src/search/facet/filter.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 551fa0d4e..3ca8c3984 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -33,7 +33,6 @@ enum FilterError<'a> { BadGeoLng(f64), Reserved(&'a str), TooDeep, - InternalError, } impl<'a> std::error::Error for FilterError<'a> {} @@ -58,7 +57,6 @@ impl<'a> Display for FilterError<'a> { Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword), Self::BadGeoLat(lat) => write!(f, "Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ", lat), Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ", lng), - Self::InternalError => write!(f, "Internal error while executing this filter."), } } } @@ -342,12 +340,12 @@ impl<'a> Filter<'a> { match &self.condition { FilterCondition::Condition { fid, op } => { let filterable_fields = index.filterable_fields(rtxn)?; - if filterable_fields.contains(&fid.to_lowercase()) { + if filterable_fields.contains(fid.value()) { let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(&fid) { + if let Some(fid) = field_ids_map.id(fid.value()) { Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) } else { - return Err(fid.as_external_error(FilterError::InternalError))?; + return Ok(RoaringBitmap::new()); } } else { match *fid.deref() { From 367f403693498f12129ab5821e92bb3316ce4efe Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 17 Jan 2022 16:32:16 +0100 Subject: [PATCH 1202/1889] bump milli --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index e50bf8e55..e07cd9037 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.21.0" +version = "0.22.0" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index e2c3d44f9..c091ccd4c 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.21.0" +version = "0.22.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index f45a85753..94ba4beff 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.21.0" +version = "0.22.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index f00fa0d24..9359e76ce 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.21.0" +version = "0.22.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3d77654eb..6b830c29e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6
@@ [package] name = "milli" -version = "0.21.0" +version = "0.22.0" authors = ["Kerollmops "] edition = "2018" From 01968d7ca76daecd533ec6da74cc9a234c725c75 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 18 Jan 2022 11:40:30 +0100 Subject: [PATCH 1203/1889] ensure we get no documents and no error when filtering on an empty db --- milli/src/search/facet/filter.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 3ca8c3984..b4b4b80b7 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -453,6 +453,32 @@ mod tests { use crate::update::Settings; use crate::Index; + #[test] + fn empty_db() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_searchable_fields(vec![S("PrIcE")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("PrIcE") }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let filter = Filter::from_str("PrIcE < 1000").unwrap().unwrap(); + let bitmap = filter.evaluate(&rtxn, &index).unwrap(); + assert!(bitmap.is_empty()); + + let filter = Filter::from_str("NOT PrIcE >= 1000").unwrap().unwrap(); + let bitmap = filter.evaluate(&rtxn, &index).unwrap(); + assert!(bitmap.is_empty()); + } + #[test] fn from_array() { // Simple array with Left From 0c84a4029816a8d53e0b3b6fe61dee2db38060d2 Mon Sep 17 00:00:00 2001 From: Marin Postma Date: Wed, 8 Dec 2021 14:12:07 +0100 Subject: [PATCH 1204/1889] document batch support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit reusable transform rework update api add indexer config fix tests review changes Co-authored-by: Clément Renault fmt --- benchmarks/benches/indexing.rs | 83 ++-- benchmarks/benches/utils.rs | 25 +- cli/src/main.rs | 38 +- http-ui/src/main.rs | 377 +++++++------- milli/src/index.rs | 23 +- milli/src/search/distinct/mod.rs | 21 +- milli/src/search/facet/filter.rs | 11 +- milli/src/update/clear_documents.rs | 8 +- milli/src/update/delete_documents.rs | 37 +- milli/src/update/index_documents/mod.rs | 464 ++++++++++-------- milli/src/update/index_documents/transform.rs | 177 ++++--- milli/src/update/indexer_config.rs | 29 ++ milli/src/update/mod.rs | 8 +- milli/src/update/settings.rs | 245 +++++---- milli/src/update/update_builder.rs | 130 ----- milli/tests/search/distinct.rs | 3 +- milli/tests/search/mod.rs | 16 +- milli/tests/search/query_criteria.rs | 20 +- 18 files changed, 912 insertions(+), 803 deletions(-) create mode 100644 milli/src/update/indexer_config.rs delete mode 100644 milli/src/update/update_builder.rs diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 93a57091a..a84998b12 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -6,7 +6,7 @@ use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; use heed::EnvOpenOptions; -use milli::update::UpdateBuilder; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::Index; #[cfg(target_os = "linux")] @@ -39,9 +39,9 @@ fn indexing_songs_default(c: &mut Criterion) { move || { let index = setup_index(); 
- let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key("id".to_owned()); let displayed_fields = @@ -66,12 +66,15 @@ fn indexing_songs_default(c: &mut Criterion) { index }, move |index| { - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = update_builder.index_documents(&mut wtxn, &index); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.execute(documents, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -88,9 +91,9 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key("id".to_owned()); let displayed_fields = @@ -112,12 +115,16 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { index }, move |index| { - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = update_builder.index_documents(&mut wtxn, &index); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.execute(documents, |_| ()).unwrap(); + + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -134,9 +141,9 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key("id".to_owned()); let displayed_fields = @@ -154,12 +161,15 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { index }, move |index| { - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = update_builder.index_documents(&mut wtxn, &index); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.execute(documents, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -176,9 +186,9 @@ fn indexing_wiki(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let 
mut builder = update_builder.settings(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key("id".to_owned()); let displayed_fields = @@ -195,13 +205,16 @@ fn indexing_wiki(c: &mut Criterion) { index }, move |index| { - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); - builder.enable_autogenerate_docids(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.execute(documents, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -218,9 +231,9 @@ fn indexing_movies_default(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key("id".to_owned()); let displayed_fields = ["title", "poster", "overview", "release_date", "genres"] @@ -242,12 +255,15 @@ fn indexing_movies_default(c: &mut Criterion) { index }, move |index| { - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = update_builder.index_documents(&mut wtxn, &index); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.execute(documents, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); index.prepare_for_closing().wait(); @@ -264,9 +280,9 @@ fn indexing_geo(c: &mut Criterion) { move || { let index = setup_index(); - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key("geonameid".to_owned()); let displayed_fields = @@ -293,12 +309,15 @@ fn indexing_geo(c: &mut Criterion) { index }, move |index| { - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = update_builder.index_documents(&mut wtxn, &index); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.execute(documents, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index df5a7b828..383587ef8 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -8,7 +8,9 @@ use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; use milli::documents::DocumentBatchReader; -use 
milli::update::{IndexDocumentsMethod, Settings, UpdateBuilder}; +use milli::update::{ + IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, +}; use milli::{Filter, Index}; use serde_json::{Map, Value}; @@ -65,9 +67,9 @@ pub fn base_setup(conf: &Conf) -> Index { options.max_readers(10); let index = Index::new(options, conf.database_name).unwrap(); - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); if let Some(primary_key) = conf.primary_key { builder.set_primary_key(primary_key.to_string()); @@ -87,16 +89,19 @@ pub fn base_setup(conf: &Conf) -> Index { builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.index_documents(&mut wtxn, &index); - if let None = conf.primary_key { - builder.enable_autogenerate_docids(); - } + let indexing_config = IndexDocumentsConfig { + autogenerate_docids: conf.primary_key.is_none(), + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let documents = documents_from(conf.dataset, conf.dataset_format); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - builder.execute(documents, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + + builder.execute().unwrap(); wtxn.commit().unwrap(); index diff --git a/cli/src/main.rs b/cli/src/main.rs index b3c18244d..1edc171b0 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -9,6 +9,7 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; +use milli::update::{IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -122,18 +123,18 @@ impl DocumentAddition { println!("Adding {} documents to the index.", reader.len()); let mut txn = index.env.write_txn()?; - let mut addition = milli::update::IndexDocuments::new(&mut txn, &index); - - if self.update_documents { - addition.index_documents_method(milli::update::IndexDocumentsMethod::UpdateDocuments); - } - - addition.log_every_n(100); - - if self.autogen_docids { - addition.enable_autogenerate_docids() - } + let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; + let update_method = if self.update_documents { + IndexDocumentsMethod::UpdateDocuments + } else { + IndexDocumentsMethod::ReplaceDocuments + }; + let indexing_config = IndexDocumentsConfig { + update_method, + autogenerate_docids: self.autogen_docids, + ..Default::default() + }; let mut bars = Vec::new(); let progesses = MultiProgress::new(); for _ in 0..4 { @@ -141,12 +142,20 @@ impl DocumentAddition { let bar = progesses.add(bar); bars.push(bar); } + let mut addition = milli::update::IndexDocuments::new( + &mut txn, + &index, + &config, + indexing_config, + |step| indexing_callback(step, &bars), + ); + addition.add_documents(reader)?; std::thread::spawn(move || { progesses.join().unwrap(); }); - let result = addition.execute(reader, |step| indexing_callback(step, &bars))?; + let result = addition.execute()?; 
txn.commit()?; @@ -293,8 +302,9 @@ impl SettingsUpdate { fn perform(&self, index: milli::Index) -> Result<()> { let mut txn = index.env.write_txn()?; - let mut update = milli::update::Settings::new(&mut txn, &index); - update.log_every_n(100); + let config = IndexerConfig { log_every_n: Some(100), ..Default::default() }; + + let mut update = milli::update::Settings::new(&mut txn, &index, &config); if let Some(ref filterable_attributes) = self.filterable_attributes { if !filterable_attributes.is_empty() { diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 6502bf83a..039a6c2ae 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -21,13 +21,14 @@ use heed::EnvOpenOptions; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use milli::documents::DocumentBatchReader; use milli::update::UpdateIndexingStep::*; -use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder}; +use milli::update::{ + ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, +}; use milli::{ obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatchingWords, SearchResult, SortError, }; use once_cell::sync::OnceCell; -use rayon::ThreadPool; use serde::{Deserialize, Serialize}; use serde_json::{Map, Value}; use structopt::StructOpt; @@ -44,7 +45,7 @@ use self::update_store::UpdateStore; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; -static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new(); +static GLOBAL_CONFIG: OnceCell<IndexerConfig> = OnceCell::new(); #[derive(Debug, StructOpt)] /// The HTTP main server of the milli project. @@ -327,7 +328,19 @@ async fn main() -> anyhow::Result<()> { // Setup the global thread pool let jobs = opt.indexer.indexing_jobs.unwrap_or(0); let pool = rayon::ThreadPoolBuilder::new().num_threads(jobs).build()?; - GLOBAL_THREAD_POOL.set(pool).unwrap(); + + let config = IndexerConfig { + max_nb_chunks: opt.indexer.max_nb_chunks, + chunk_compression_level: opt.indexer.chunk_compression_level, + max_positions_per_attributes: opt.indexer.max_positions_per_attributes, + thread_pool: Some(pool), + log_every_n: Some(opt.indexer.log_every_n), + max_memory: Some(opt.indexer.max_memory.get_bytes() as usize), + chunk_compression_type: opt.indexer.chunk_compression_type.unwrap_or(CompressionType::None), + ..Default::default() + }; + + GLOBAL_CONFIG.set(config).unwrap(); // Open the LMDB database. let index = Index::new(options, &opt.database)?; @@ -342,209 +355,207 @@ async fn main() -> anyhow::Result<()> { let (update_status_sender, _) = broadcast::channel(100); let update_status_sender_cloned = update_status_sender.clone(); let index_cloned = index.clone(); - let indexer_opt_cloned = opt.indexer.clone(); let update_store = UpdateStore::open( update_store_options, update_store_path, // the type hint is necessary: https://github.com/rust-lang/rust/issues/32600 move |update_id, meta, content: &_| { // We prepare the update by using the update builder.
- let mut update_builder = UpdateBuilder::new(); - if let Some(max_nb_chunks) = indexer_opt_cloned.max_nb_chunks { - update_builder.max_nb_chunks(max_nb_chunks); - } - if let Some(chunk_compression_level) = indexer_opt_cloned.chunk_compression_level { - update_builder.chunk_compression_level(chunk_compression_level); - } - if let Some(max_pos_per_attributes) = indexer_opt_cloned.max_positions_per_attributes { - update_builder.max_positions_per_attributes(max_pos_per_attributes); - } - update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap()); - update_builder.log_every_n(indexer_opt_cloned.log_every_n); - update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); - update_builder.chunk_compression_type( - indexer_opt_cloned.chunk_compression_type.unwrap_or(CompressionType::None), - ); let before_update = Instant::now(); // we extract the update type and execute the update itself. - let result: anyhow::Result<()> = - (|| match meta { - UpdateMeta::DocumentsAddition { method, format, encoding } => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.index_documents(&mut wtxn, &index_cloned); - builder.enable_autogenerate_docids(); + let result: anyhow::Result<()> = (|| match meta { + UpdateMeta::DocumentsAddition { method, format, encoding } => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let update_method = match method.as_str() { + "replace" => IndexDocumentsMethod::ReplaceDocuments, + "update" => IndexDocumentsMethod::UpdateDocuments, + otherwise => panic!("invalid indexing method {:?}", otherwise), + }; + let indexing_config = IndexDocumentsConfig { + update_method, + autogenerate_docids: true, + ..Default::default() + }; - match method.as_str() { - "replace" => builder - .index_documents_method(IndexDocumentsMethod::ReplaceDocuments), - "update" => builder - .index_documents_method(IndexDocumentsMethod::UpdateDocuments), - otherwise => panic!("invalid indexing method {:?}", otherwise), + let indexing_callback = |indexing_step| { + let (current, total) = match indexing_step { + RemapDocumentAddition { documents_seen } => (documents_seen, None), + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + IndexDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) + } + MergeDataIntoFinalDatabase { databases_seen, total_databases } => { + (databases_seen, Some(total_databases)) + } }; - - let reader = match encoding.as_deref() { - Some("gzip") => Box::new(GzDecoder::new(content)), - None => Box::new(content) as Box, - otherwise => panic!("invalid encoding format {:?}", otherwise), - }; - - let documents = match format.as_str() { - "csv" => documents_from_csv(reader)?, - "json" => documents_from_json(reader)?, - "jsonl" => documents_from_jsonl(reader)?, - otherwise => panic!("invalid update format {:?}", otherwise), - }; - - let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; - - let result = builder.execute(documents, |indexing_step| { - let (current, total) = match indexing_step { - RemapDocumentAddition { documents_seen } => (documents_seen, None), - ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { - (documents_seen, Some(total_documents)) - } - IndexDocuments { documents_seen, total_documents } => { - (documents_seen, Some(total_documents)) - } - MergeDataIntoFinalDatabase { 
databases_seen, total_databases } => { - (databases_seen, Some(total_databases)) - } - }; - let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { - update_id, - meta: UpdateMetaProgress::DocumentsAddition { - step: indexing_step.step(), - total_steps: indexing_step.number_of_steps(), - current, - total, - }, - }); + let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { + update_id, + meta: UpdateMetaProgress::DocumentsAddition { + step: indexing_step.step(), + total_steps: indexing_step.number_of_steps(), + current, + total, + }, }); + }; - match result { - Ok(_) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), - } + let mut builder = milli::update::IndexDocuments::new( + &mut wtxn, + &index_cloned, + GLOBAL_CONFIG.get().unwrap(), + indexing_config, + indexing_callback, + ); + + let reader = match encoding.as_deref() { + Some("gzip") => Box::new(GzDecoder::new(content)), + None => Box::new(content) as Box, + otherwise => panic!("invalid encoding format {:?}", otherwise), + }; + + let documents = match format.as_str() { + "csv" => documents_from_csv(reader)?, + "json" => documents_from_json(reader)?, + "jsonl" => documents_from_jsonl(reader)?, + otherwise => panic!("invalid update format {:?}", otherwise), + }; + + let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; + + builder.add_documents(documents)?; + + let result = builder.execute(); + + match result { + Ok(_) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), } - UpdateMeta::ClearDocuments => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let builder = update_builder.clear_documents(&mut wtxn, &index_cloned); + } + UpdateMeta::ClearDocuments => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let builder = ClearDocuments::new(&mut wtxn, &index_cloned); - match builder.execute() { - Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), - } + match builder.execute() { + Ok(_count) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), } - UpdateMeta::Settings(settings) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.settings(&mut wtxn, &index_cloned); + } + UpdateMeta::Settings(settings) => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = milli::update::Settings::new( + &mut wtxn, + &index_cloned, + GLOBAL_CONFIG.get().unwrap(), + ); - // We transpose the settings JSON struct into a real setting update. - match settings.searchable_attributes { - Setting::Set(searchable_attributes) => { - builder.set_searchable_fields(searchable_attributes) + // We transpose the settings JSON struct into a real setting update. + match settings.searchable_attributes { + Setting::Set(searchable_attributes) => { + builder.set_searchable_fields(searchable_attributes) + } + Setting::Reset => builder.reset_searchable_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.displayed_attributes { + Setting::Set(displayed_attributes) => { + builder.set_displayed_fields(displayed_attributes) + } + Setting::Reset => builder.reset_displayed_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. 
+ match settings.filterable_attributes { + Setting::Set(filterable_attributes) => { + builder.set_filterable_fields(filterable_attributes) + } + Setting::Reset => builder.reset_filterable_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.sortable_attributes { + Setting::Set(sortable_attributes) => { + builder.set_sortable_fields(sortable_attributes) + } + Setting::Reset => builder.reset_sortable_fields(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.criteria { + Setting::Set(criteria) => builder.set_criteria(criteria), + Setting::Reset => builder.reset_criteria(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.stop_words { + Setting::Set(stop_words) => builder.set_stop_words(stop_words), + Setting::Reset => builder.reset_stop_words(), + Setting::NotSet => (), + } + + // We transpose the settings JSON struct into a real setting update. + match settings.synonyms { + Setting::Set(synonyms) => builder.set_synonyms(synonyms), + Setting::Reset => builder.reset_synonyms(), + Setting::NotSet => (), + } + + let result = builder.execute(|indexing_step| { + let (current, total) = match indexing_step { + RemapDocumentAddition { documents_seen } => (documents_seen, None), + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) } - Setting::Reset => builder.reset_searchable_fields(), - Setting::NotSet => (), - } - - // We transpose the settings JSON struct into a real setting update. - match settings.displayed_attributes { - Setting::Set(displayed_attributes) => { - builder.set_displayed_fields(displayed_attributes) + IndexDocuments { documents_seen, total_documents } => { + (documents_seen, Some(total_documents)) } - Setting::Reset => builder.reset_displayed_fields(), - Setting::NotSet => (), - } - - // We transpose the settings JSON struct into a real setting update. - match settings.filterable_attributes { - Setting::Set(filterable_attributes) => { - builder.set_filterable_fields(filterable_attributes) + MergeDataIntoFinalDatabase { databases_seen, total_databases } => { + (databases_seen, Some(total_databases)) } - Setting::Reset => builder.reset_filterable_fields(), - Setting::NotSet => (), - } - - // We transpose the settings JSON struct into a real setting update. - match settings.sortable_attributes { - Setting::Set(sortable_attributes) => { - builder.set_sortable_fields(sortable_attributes) - } - Setting::Reset => builder.reset_sortable_fields(), - Setting::NotSet => (), - } - - // We transpose the settings JSON struct into a real setting update. - match settings.criteria { - Setting::Set(criteria) => builder.set_criteria(criteria), - Setting::Reset => builder.reset_criteria(), - Setting::NotSet => (), - } - - // We transpose the settings JSON struct into a real setting update. - match settings.stop_words { - Setting::Set(stop_words) => builder.set_stop_words(stop_words), - Setting::Reset => builder.reset_stop_words(), - Setting::NotSet => (), - } - - // We transpose the settings JSON struct into a real setting update. 
- match settings.synonyms { - Setting::Set(synonyms) => builder.set_synonyms(synonyms), - Setting::Reset => builder.reset_synonyms(), - Setting::NotSet => (), - } - - let result = builder.execute(|indexing_step| { - let (current, total) = match indexing_step { - RemapDocumentAddition { documents_seen } => (documents_seen, None), - ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { - (documents_seen, Some(total_documents)) - } - IndexDocuments { documents_seen, total_documents } => { - (documents_seen, Some(total_documents)) - } - MergeDataIntoFinalDatabase { databases_seen, total_databases } => { - (databases_seen, Some(total_databases)) - } - }; - let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { - update_id, - meta: UpdateMetaProgress::DocumentsAddition { - step: indexing_step.step(), - total_steps: indexing_step.number_of_steps(), - current, - total, - }, - }); + }; + let _ = update_status_sender_cloned.send(UpdateStatus::Progressing { + update_id, + meta: UpdateMetaProgress::DocumentsAddition { + step: indexing_step.step(), + total_steps: indexing_step.number_of_steps(), + current, + total, + }, }); + }); - match result { - Ok(_count) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), - } + match result { + Ok(_count) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), } - UpdateMeta::Facets(levels) => { - // We must use the write transaction of the update here. - let mut wtxn = index_cloned.write_txn()?; - let mut builder = update_builder.facets(&mut wtxn, &index_cloned); - if let Some(value) = levels.level_group_size { - builder.level_group_size(value); - } - if let Some(value) = levels.min_level_size { - builder.min_level_size(value); - } - match builder.execute() { - Ok(()) => wtxn.commit().map_err(Into::into), - Err(e) => Err(e.into()), - } + } + UpdateMeta::Facets(levels) => { + // We must use the write transaction of the update here. + let mut wtxn = index_cloned.write_txn()?; + let mut builder = milli::update::Facets::new(&mut wtxn, &index_cloned); + if let Some(value) = levels.level_group_size { + builder.level_group_size(value); } - })(); + if let Some(value) = levels.min_level_size { + builder.min_level_size(value); + } + match builder.execute() { + Ok(()) => wtxn.commit().map_err(Into::into), + Err(e) => Err(e.into()), + } + } + })(); let meta = match result { Ok(()) => { diff --git a/milli/src/index.rs b/milli/src/index.rs index 2f51b8c6b..70081dfb0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -867,7 +867,7 @@ pub(crate) mod tests { use maplit::btreemap; use tempfile::TempDir; - use crate::update::IndexDocuments; + use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig}; use crate::Index; pub(crate) struct TempIndex { @@ -908,8 +908,13 @@ pub(crate) mod tests { { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -927,13 +932,15 @@ pub(crate) mod tests { // we add all the documents a second time. 
we are supposed to get the same // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); let content = documents!([ { "id": 1, "name": "kevin" }, { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 } ]); - builder.execute(content, |_| ()).unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); @@ -955,8 +962,10 @@ pub(crate) mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 3d36ed2a3..965423886 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -38,7 +38,9 @@ mod test { use crate::documents::{DocumentBatchBuilder, DocumentBatchReader}; use crate::index::tests::TempIndex; use crate::index::Index; - use crate::update::{IndexDocumentsMethod, UpdateBuilder}; + use crate::update::{ + IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, + }; use crate::{DocumentId, FieldId, BEU32}; static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents); @@ -84,19 +86,24 @@ mod test { let mut txn = index.write_txn().unwrap(); // set distinct and faceted attributes for the index. - let builder = UpdateBuilder::new(); - let mut update = builder.settings(&mut txn, &index); + let config = IndexerConfig::default(); + let mut update = Settings::new(&mut txn, &index, &config); update.set_distinct_field(distinct.to_string()); update.execute(|_| ()).unwrap(); // add documents to the index - let builder = UpdateBuilder::new(); - let mut addition = builder.index_documents(&mut txn, &index); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + let mut addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()); - addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); let reader = crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); + + addition.add_documents(reader).unwrap(); + addition.execute().unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index b4b4b80b7..edc86d0ca 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -450,7 +450,7 @@ mod tests { use maplit::hashset; use super::*; - use crate::update::Settings; + use crate::update::{IndexerConfig, Settings}; use crate::Index; #[test] @@ -461,8 +461,9 @@ mod tests { let index = Index::new(options, &path).unwrap(); // Set the filterable fields to be the channel.
+ let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_searchable_fields(vec![S("PrIcE")]); // to keep the fields order builder.set_filterable_fields(hashset! { S("PrIcE") }); builder.execute(|_| ()).unwrap(); @@ -563,9 +564,10 @@ mod tests { )); drop(rtxn); + let config = IndexerConfig::default(); // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_searchable_fields(vec![S("title")]); builder.set_filterable_fields(hashset! { S("title") }); builder.execute(|_| ()).unwrap(); @@ -593,9 +595,10 @@ mod tests { options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); // Set the filterable fields to be the channel. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); builder.execute(|_| ()).unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 5be3bc23d..8c9178d4e 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -77,7 +77,7 @@ mod tests { use heed::EnvOpenOptions; use super::*; - use crate::update::IndexDocuments; + use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig}; #[test] fn clear_documents() { @@ -92,7 +92,11 @@ mod tests { { "id": 1, "name": "kevina" }, { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } ]); - IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let config = IndexerConfig::default(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); // Clear all documents from the database. let builder = ClearDocuments::new(&mut wtxn, &index); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 4c41cbd53..19f1d9f42 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -580,7 +580,7 @@ mod tests { use maplit::hashset; use super::*; - use crate::update::{IndexDocuments, Settings}; + use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::Filter; #[test] @@ -596,8 +596,11 @@ mod tests { { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. 
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); @@ -626,8 +629,12 @@ mod tests { { "mysuperid": 1, "name": "kevina" }, { "mysuperid": 2, "name": "benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); // Delete not all of the documents but some of them. let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); @@ -646,7 +653,8 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key(S("docid")); builder.set_filterable_fields(hashset! { S("label") }); builder.execute(|_| ()).unwrap(); @@ -673,8 +681,12 @@ mod tests { {"docid":"1_68","label":"design"}, {"docid":"1_69","label":"geometry"} ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); // Delete not all of the documents but some of them. let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); @@ -696,7 +708,8 @@ mod tests { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key(S("id")); builder.set_filterable_fields(hashset!(S("_geo"))); builder.set_sortable_fields(hashset!(S("_geo"))); @@ -726,7 +739,11 @@ mod tests { ]); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); let ids_to_delete: Vec<u32> = external_ids_to_delete diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b0c0a5362..4fbb75d5f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -7,13 +7,9 @@ use std::collections::HashSet; use std::io::{Read, Seek}; use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; -use std::time::Instant; -use chrono::Utc; use crossbeam_channel::{Receiver, Sender}; -use grenad::{self, CompressionType}; -use log::{debug, info}; -use rayon::ThreadPool; +use log::debug; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; @@ -26,8 +22,8 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; use crate::update::{ - Facets, UpdateBuilder, UpdateIndexingStep,
WordPrefixDocids, WordPrefixPairProximityDocids, - WordPrefixPositionDocids, WordsPrefixesFst, + self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, + WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result}; @@ -55,120 +51,116 @@ pub enum IndexDocumentsMethod { UpdateDocuments, } +impl Default for IndexDocumentsMethod { + fn default() -> Self { + Self::ReplaceDocuments + } +} + #[derive(Debug, Copy, Clone)] pub enum WriteMethod { Append, GetMergePut, } -pub struct IndexDocuments<'t, 'u, 'i, 'a> { +pub struct IndexDocuments<'t, 'u, 'i, 'a, F> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - pub(crate) log_every_n: Option, - pub(crate) documents_chunk_size: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) thread_pool: Option<&'a ThreadPool>, - pub(crate) max_positions_per_attributes: Option, - facet_level_group_size: Option, - facet_min_level_size: Option, - words_prefix_threshold: Option, - max_prefix_length: Option, - words_positions_level_group_size: Option, - words_positions_min_level_size: Option, - update_method: IndexDocumentsMethod, - autogenerate_docids: bool, + config: IndexDocumentsConfig, + indexer_config: &'a IndexerConfig, + transform: Option>, + progress: F, + added_documents: u64, } -impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { +#[derive(Default, Debug, Clone)] +pub struct IndexDocumentsConfig { + pub facet_level_group_size: Option, + pub facet_min_level_size: Option, + pub words_prefix_threshold: Option, + pub max_prefix_length: Option, + pub words_positions_level_group_size: Option, + pub words_positions_min_level_size: Option, + pub update_method: IndexDocumentsMethod, + pub autogenerate_docids: bool, +} + +impl<'t, 'u, 'i, 'a, F> IndexDocuments<'t, 'u, 'i, 'a, F> +where + F: Fn(UpdateIndexingStep) + Sync, +{ pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - ) -> IndexDocuments<'t, 'u, 'i, 'a> { + indexer_config: &'a IndexerConfig, + config: IndexDocumentsConfig, + progress: F, + ) -> IndexDocuments<'t, 'u, 'i, 'a, F> { + let transform = Some(Transform::new( + &index, + indexer_config, + config.update_method, + config.autogenerate_docids, + )); + IndexDocuments { + transform, + config, + indexer_config, + progress, wtxn, index, - log_every_n: None, - documents_chunk_size: None, - max_nb_chunks: None, - max_memory: None, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - thread_pool: None, - facet_level_group_size: None, - facet_min_level_size: None, - words_prefix_threshold: None, - max_prefix_length: None, - words_positions_level_group_size: None, - words_positions_min_level_size: None, - update_method: IndexDocumentsMethod::ReplaceDocuments, - autogenerate_docids: false, - max_positions_per_attributes: None, + added_documents: 0, } } - pub fn log_every_n(&mut self, n: usize) { - self.log_every_n = Some(n); - } - - pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) { - self.update_method = method; - } - - pub fn enable_autogenerate_docids(&mut self) { - self.autogenerate_docids = true; - } - - pub fn disable_autogenerate_docids(&mut self) { - self.autogenerate_docids = false; - } - - #[logging_timer::time("IndexDocuments::{}")] - pub fn execute( - self, - reader: DocumentBatchReader, - progress_callback: F, - ) -> Result + /// Adds a batch of documents to the current builder. 
+ ///
+ /// Since the documents are progressively added to the writer, a failure will cause a stale
+ /// builder, and the builder must be discarded.
+ ///
+ /// Returns the number of documents added to the builder.
+ pub fn add_documents(&mut self, reader: DocumentBatchReader) -> Result
where
R: Read + Seek,
- F: Fn(UpdateIndexingStep) + Sync,
{
// Early return when there is no document to add
if reader.is_empty() {
- return Ok(DocumentAdditionResult {
- indexed_documents: 0,
- number_of_documents: self.index.number_of_documents(self.wtxn)?,
- });
+ return Ok(0);
}
- self.index.set_updated_at(self.wtxn, &Utc::now())?;
- let before_transform = Instant::now();
- let transform = Transform {
- rtxn: &self.wtxn,
- index: self.index,
- log_every_n: self.log_every_n,
- chunk_compression_type: self.chunk_compression_type,
- chunk_compression_level: self.chunk_compression_level,
- max_nb_chunks: self.max_nb_chunks,
- max_memory: self.max_memory,
- index_documents_method: self.update_method,
- autogenerate_docids: self.autogenerate_docids,
- };
+ let indexed_documents = self
+ .transform
+ .as_mut()
+ .expect("Invalid document addition state")
+ .read_documents(reader, self.wtxn, &self.progress)?
+ as u64;
- let output = transform.read_documents(reader, &progress_callback)?;
+ self.added_documents += indexed_documents;
+
+ Ok(indexed_documents)
+ }
+
+ #[logging_timer::time("IndexDocuments::{}")]
+ pub fn execute(mut self) -> Result {
+ if self.added_documents == 0 {
+ let number_of_documents = self.index.number_of_documents(self.wtxn)?;
+ return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
+ }
+ let output = self
+ .transform
+ .take()
+ .expect("Invalid document addition state")
+ .output_from_sorter(self.wtxn, &self.progress)?;
let indexed_documents = output.documents_count as u64;
-
- info!("Update transformed in {:.02?}", before_transform.elapsed());
-
- let number_of_documents = self.execute_raw(output, progress_callback)?;
+ let number_of_documents = self.execute_raw(output)?;
Ok(DocumentAdditionResult { indexed_documents, number_of_documents })
}
+ /// Returns the total number of documents in the index after the update.
#[logging_timer::time("IndexDocuments::{}")]
- pub fn execute_raw(self, output: TransformOutput, progress_callback: F) -> Result
+ pub fn execute_raw(self, output: TransformOutput) -> Result
where
F: Fn(UpdateIndexingStep) + Sync,
{
@@ -188,8 +180,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
let backup_pool;
- let pool = match self.thread_pool {
- Some(pool) => pool,
+ let pool = match self.indexer_config.thread_pool {
+ Some(ref pool) => pool,
#[cfg(not(test))]
None => {
// We initialize a backup pool with the default
@@ -237,22 +229,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
};
let stop_words = self.index.stop_words(self.wtxn)?;
- // let stop_words = stop_words.as_ref();
// Run extraction pipeline in parallel.
pool.install(|| {
let params = GrenadParameters {
- chunk_compression_type: self.chunk_compression_type,
- chunk_compression_level: self.chunk_compression_level,
- max_memory: self.max_memory,
- max_nb_chunks: self.max_nb_chunks, // default value, may be chosen.
+ chunk_compression_type: self.indexer_config.chunk_compression_type,
+ chunk_compression_level: self.indexer_config.chunk_compression_level,
+ max_memory: self.indexer_config.max_memory,
+ max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen.
};
// split obkv file into several chunks
let chunk_iter = grenad_obkv_into_chunks(
documents_file,
params.clone(),
- self.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB
+ self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB
);
let result = chunk_iter.map(|chunk_iter| {
@@ -266,7 +257,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
primary_key_id,
geo_field_id,
stop_words,
- self.max_positions_per_attributes,
+ self.indexer_config.max_positions_per_attributes,
)
});
@@ -281,17 +272,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// We delete the documents that this document addition replaces. This way we are
// able to simply insert all the documents even if they already exist in the database.
if !replaced_documents_ids.is_empty() {
- let update_builder = UpdateBuilder {
- log_every_n: self.log_every_n,
- max_nb_chunks: self.max_nb_chunks,
- max_memory: self.max_memory,
- documents_chunk_size: self.documents_chunk_size,
- chunk_compression_type: self.chunk_compression_type,
- chunk_compression_level: self.chunk_compression_level,
- thread_pool: self.thread_pool,
- max_positions_per_attributes: self.max_positions_per_attributes,
- };
- let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?;
+ let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?;
debug!("documents to delete {:?}", replaced_documents_ids);
deletion_builder.delete_documents(&replaced_documents_ids);
let deleted_documents_count = deletion_builder.execute()?;
@@ -303,7 +284,7 @@
let mut final_documents_ids = RoaringBitmap::new();
let mut databases_seen = 0;
- progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+ (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
@@ -314,7 +295,7 @@
if !docids.is_empty() {
final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
- progress_callback(UpdateIndexingStep::IndexDocuments {
+ (self.progress)(UpdateIndexingStep::IndexDocuments {
documents_seen: documents_seen_count as usize,
total_documents: documents_count,
});
@@ -325,7 +306,7 @@
}
if is_merged_database {
databases_seen += 1;
- progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+ (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
databases_seen,
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
@@ -344,98 +325,95 @@
let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids;
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
- self.execute_prefix_databases(progress_callback)?;
+ self.execute_prefix_databases()?;
Ok(all_documents_ids.len())
}
#[logging_timer::time("IndexDocuments::{}")]
- pub fn execute_prefix_databases(self, progress_callback: F) -> Result<()>
- where
- F: Fn(UpdateIndexingStep) + Sync,
- {
+ pub fn execute_prefix_databases(self) -> Result<()> {
// Merged databases have already been indexed; we start from this count.
let mut databases_seen = MERGED_DATABASE_COUNT;
// Run the facets update operation.
let mut builder = Facets::new(self.wtxn, self.index); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - if let Some(value) = self.facet_level_group_size { + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + if let Some(value) = self.config.facet_level_group_size { builder.level_group_size(value); } - if let Some(value) = self.facet_min_level_size { + if let Some(value) = self.config.facet_min_level_size { builder.min_level_size(value); } builder.execute()?; databases_seen += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: databases_seen, + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, total_databases: TOTAL_POSTING_DATABASE_COUNT, }); // Run the words prefixes update operation. let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); - if let Some(value) = self.words_prefix_threshold { + if let Some(value) = self.config.words_prefix_threshold { builder.threshold(value); } - if let Some(value) = self.max_prefix_length { + if let Some(value) = self.config.max_prefix_length { builder.max_prefix_length(value); } builder.execute()?; databases_seen += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: databases_seen, + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, total_databases: TOTAL_POSTING_DATABASE_COUNT, }); // Run the word prefix docids update operation. let mut builder = WordPrefixDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.max_nb_chunks = self.max_nb_chunks; - builder.max_memory = self.max_memory; + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; builder.execute()?; databases_seen += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: databases_seen, + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, total_databases: TOTAL_POSTING_DATABASE_COUNT, }); // Run the word prefix pair proximity docids update operation. let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.max_nb_chunks = self.max_nb_chunks; - builder.max_memory = self.max_memory; + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; builder.execute()?; databases_seen += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: databases_seen, + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, total_databases: TOTAL_POSTING_DATABASE_COUNT, }); // Run the words prefix position docids update operation. 
let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.max_nb_chunks = self.max_nb_chunks; - builder.max_memory = self.max_memory; - if let Some(value) = self.words_positions_level_group_size { + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; + if let Some(value) = self.config.words_positions_level_group_size { builder.level_group_size(value); } - if let Some(value) = self.words_positions_min_level_size { + if let Some(value) = self.config.words_positions_min_level_size { builder.min_level_size(value); } builder.execute()?; databases_seen += 1; - progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen: databases_seen, + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, total_databases: TOTAL_POSTING_DATABASE_COUNT, }); @@ -469,8 +447,13 @@ mod tests { { "id": 2, "name": "kevina" }, { "id": 3, "name": "benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -482,8 +465,10 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "name": "updated kevin" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -499,8 +484,8 @@ mod tests { { "id": 2, "name": "updated kevina" }, { "id": 3, "name": "updated benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -525,9 +510,15 @@ mod tests { { "id": 1, "name": "kevina" }, { "id": 1, "name": "benoit" } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::UpdateDocuments, + ..Default::default() + }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is only 1 document now. @@ -551,9 +542,9 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. 
let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "age": 25 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); - builder.execute(content, |_| ()).unwrap(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 1 document. @@ -590,8 +581,10 @@ mod tests { { "name": "kevina" }, { "name": "benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - assert!(builder.execute(content, |_| ()).is_err()); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + assert!(builder.add_documents(content).is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -615,9 +608,13 @@ mod tests { { "name": "kevina" }, { "name": "benoit" } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -633,8 +630,9 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -670,8 +668,11 @@ mod tests { { "id": 2, "name": "kevina" }, { "id": 3, "name": "benoit" } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is 3 documents now. @@ -683,9 +684,11 @@ mod tests { // Second we send 1 document without specifying the id. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "new kevin" } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is 4 documents now. @@ -705,8 +708,11 @@ mod tests { // First we send 0 documents and only headers. 
let mut wtxn = index.write_txn().unwrap();
let content = documents!([]);
- let builder = IndexDocuments::new(&mut wtxn, &index);
- builder.execute(content, |_| ()).unwrap();
+ let config = IndexerConfig::default();
+ let indexing_config = IndexDocumentsConfig::default();
+ let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
// Check that there are no documents.
@@ -727,16 +733,20 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
// There is a space in the document id.
let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]);
- let builder = IndexDocuments::new(&mut wtxn, &index);
- assert!(builder.execute(content, |_| ()).is_err());
+ let config = IndexerConfig::default();
+ let indexing_config = IndexDocumentsConfig::default();
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ assert!(builder.add_documents(content).is_err());
wtxn.commit().unwrap();
// First we send 1 document with a valid id.
let mut wtxn = index.write_txn().unwrap();
// This time there is no space in the document id.
let content = documents!([ { "id": 32, "name": "kevin" } ]);
- let builder = IndexDocuments::new(&mut wtxn, &index);
- builder.execute(content, |_| ()).unwrap();
+ let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
// Check that there is 1 document now.
@@ -760,8 +770,11 @@ mod tests {
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
]);
- let builder = IndexDocuments::new(&mut wtxn, &index);
- builder.execute(content, |_| ()).unwrap();
+ let config = IndexerConfig::default();
+ let indexing_config = IndexDocumentsConfig::default();
+ let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
// Check that there is 1 document now.
@@ -799,14 +812,22 @@ mod tests {
{ "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K.
Rowling", "genre": "fantasy" }, { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); - builder.execute(documents, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments); + let indexing_config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::UpdateDocuments, + ..Default::default() + }; + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let documents = documents!([ { "id": 2, @@ -815,7 +836,8 @@ mod tests { } ]); - builder.execute(documents, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -833,7 +855,12 @@ mod tests { { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); @@ -848,15 +875,22 @@ mod tests { let content = documents!([ { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); let content = documents!([ { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - IndexDocuments::new(&mut wtxn, &index).execute(content, |_| ()).unwrap(); + + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -886,8 +920,12 @@ mod tests { cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -916,8 +954,12 @@ mod tests { cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index); - 
builder.execute(content, |_| ()).unwrap();
+ let config = IndexerConfig::default();
+ let indexing_config = IndexDocumentsConfig::default();
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
@@ -969,8 +1011,12 @@ mod tests {
},
]);
- let builder = IndexDocuments::new(&mut wtxn, &index);
- builder.execute(content, |_| ()).unwrap();
+ let config = IndexerConfig::default();
+ let indexing_config = IndexDocumentsConfig::default();
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
}
@@ -990,8 +1036,12 @@ mod tests {
]);
let mut wtxn = index.write_txn().unwrap();
- let builder = IndexDocuments::new(&mut wtxn, &index);
- builder.execute(content, |_| ()).unwrap();
+ let config = IndexerConfig::default();
+ let indexing_config = IndexDocumentsConfig::default();
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
// Check that there are 4 documents now.
@@ -1002,8 +1052,12 @@
let content = documents!([]);
let mut wtxn = index.write_txn().unwrap();
- let builder = IndexDocuments::new(&mut wtxn, &index);
- builder.execute(content, |_| ()).unwrap();
+ let config = IndexerConfig::default();
+ let indexing_config = IndexDocumentsConfig::default();
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
// Check that there are 4 documents now.
@@ -1019,8 +1073,12 @@
]);
let mut wtxn = index.write_txn().unwrap();
- let builder = IndexDocuments::new(&mut wtxn, &index);
- builder.execute(content, |_| ()).unwrap();
+ let config = IndexerConfig::default();
+ let indexing_config = IndexDocumentsConfig::default();
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
// Check that there are 4 documents now.
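All of these test migrations land on the same two-phase shape: construct the builder once with a shared `IndexerConfig` and a per-update `IndexDocumentsConfig`, feed it one or more batches through `add_documents`, then finalize with a single `execute`. A minimal sketch of the intended flow, assuming an open `index`; `batch_a` and `batch_b` are illustrative names for any `DocumentBatchReader` values:

    let config = IndexerConfig::default();
    let indexing_config = IndexDocumentsConfig::default();
    let mut wtxn = index.write_txn().unwrap();
    // The builder borrows the long-lived indexer config; per-update options move in by value.
    let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
    // Batches accumulate in the internal Transform sorter until execute() merges them.
    builder.add_documents(batch_a).unwrap();
    builder.add_documents(batch_b).unwrap();
    let result = builder.execute().unwrap();
    // result.indexed_documents / result.number_of_documents report what happened.
    wtxn.commit().unwrap();

As the doc comment on `add_documents` warns, a failed call leaves the builder stale, so the builder should be discarded rather than retried.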
@@ -1042,8 +1100,12 @@ mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 855fb8db9..f5fb1ec01 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -5,7 +5,6 @@ use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::time::Instant; -use grenad::CompressionType; use itertools::Itertools; use log::info; use roaring::RoaringBitmap; @@ -14,7 +13,7 @@ use serde_json::{Map, Value}; use super::helpers::{ create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, }; -use super::IndexDocumentsMethod; +use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; @@ -40,16 +39,14 @@ pub struct TransformOutput { /// Outputs the new `FieldsIdsMap`, the new `UsersIdsDocumentsIds` map, the new documents ids, /// the replaced documents ids, the number of documents in this update and the file /// containing all those documents. -pub struct Transform<'t, 'i> { - pub rtxn: &'t heed::RoTxn<'i>, +pub struct Transform<'a, 'i> { pub index: &'i Index, - pub log_every_n: Option, - pub chunk_compression_type: CompressionType, - pub chunk_compression_level: Option, - pub max_nb_chunks: Option, - pub max_memory: Option, - pub index_documents_method: IndexDocumentsMethod, + indexer_settings: &'a IndexerConfig, pub autogenerate_docids: bool, + pub index_documents_method: IndexDocumentsMethod, + + sorter: grenad::Sorter, + documents_count: usize, } /// Create a mapping between the field ids found in the document batch and the one that were @@ -84,56 +81,73 @@ fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { .map(String::as_str) } -impl Transform<'_, '_> { - pub fn read_documents( - self, - mut reader: DocumentBatchReader, - progress_callback: F, - ) -> Result - where - R: Read + Seek, - F: Fn(UpdateIndexingStep) + Sync, - { - let fields_index = reader.index(); - let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?; - - let alternative_name = self - .index - .primary_key(self.rtxn)? - .or_else(|| find_primary_key(fields_index)) - .map(String::from); - - let (primary_key_id, primary_key_name) = compute_primary_key_pair( - self.index.primary_key(self.rtxn)?, - &mut fields_ids_map, - alternative_name, - self.autogenerate_docids, - )?; - +impl<'a, 'i> Transform<'a, 'i> { + pub fn new( + index: &'i Index, + indexer_settings: &'a IndexerConfig, + index_documents_method: IndexDocumentsMethod, + autogenerate_docids: bool, + ) -> Self { // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. 
- let merge_function = match self.index_documents_method { + let merge_function = match index_documents_method { IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv, IndexDocumentsMethod::UpdateDocuments => merge_obkvs, }; // We initialize the sorter with the user indexing settings. - let mut sorter = create_sorter( + let sorter = create_sorter( merge_function, - self.chunk_compression_type, - self.chunk_compression_level, - self.max_nb_chunks, - self.max_memory, + indexer_settings.chunk_compression_type, + indexer_settings.chunk_compression_level, + indexer_settings.max_nb_chunks, + indexer_settings.max_memory, ); + Transform { + index, + indexer_settings, + autogenerate_docids, + sorter, + documents_count: 0, + index_documents_method, + } + } + + pub fn read_documents( + &mut self, + mut reader: DocumentBatchReader, + wtxn: &mut heed::RwTxn, + progress_callback: F, + ) -> Result + where + R: Read + Seek, + F: Fn(UpdateIndexingStep) + Sync, + { + let fields_index = reader.index(); + let mut fields_ids_map = self.index.fields_ids_map(wtxn)?; + let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?; + + let alternative_name = self + .index + .primary_key(wtxn)? + .or_else(|| find_primary_key(fields_index)) + .map(String::from); + + let (primary_key_id, primary_key_name) = compute_primary_key_pair( + self.index.primary_key(wtxn)?, + &mut fields_ids_map, + alternative_name, + self.autogenerate_docids, + )?; + let mut obkv_buffer = Vec::new(); let mut documents_count = 0; let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, &[u8])> = Vec::new(); while let Some((addition_index, document)) = reader.next_document_with_index()? { let mut field_buffer_cache = drop_and_reuse(field_buffer); - if self.log_every_n.map_or(false, |len| documents_count % len == 0) { + if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { documents_seen: documents_count, }); @@ -214,7 +228,7 @@ impl Transform<'_, '_> { } // We use the extracted/generated user id as the key for this document. - sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?; + self.sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?; documents_count += 1; progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -230,38 +244,40 @@ impl Transform<'_, '_> { documents_seen: documents_count, }); + self.index.put_fields_ids_map(wtxn, &fields_ids_map)?; + self.index.put_primary_key(wtxn, &primary_key_name)?; + self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. - self.output_from_sorter( - sorter, - primary_key_name, - fields_ids_map, - documents_count, - progress_callback, - ) + Ok(documents_count) } /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. 
- fn output_from_sorter(
+ pub(crate) fn output_from_sorter(
self,
- sorter: grenad::Sorter,
- primary_key: String,
- fields_ids_map: FieldsIdsMap,
- approximate_number_of_documents: usize,
+ wtxn: &mut heed::RwTxn,
progress_callback: F,
) -> Result
where
F: Fn(UpdateIndexingStep) + Sync,
{
- let mut external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
- let documents_ids = self.index.documents_ids(self.rtxn)?;
- let mut field_distribution = self.index.field_distribution(self.rtxn)?;
+ let primary_key = self
+ .index
+ .primary_key(&wtxn)?
+ .ok_or(Error::UserError(UserError::MissingPrimaryKey))?
+ .to_string();
+ let fields_ids_map = self.index.fields_ids_map(wtxn)?;
+ let approximate_number_of_documents = self.documents_count;
+
+ let mut external_documents_ids = self.index.external_documents_ids(wtxn).unwrap();
+ let documents_ids = self.index.documents_ids(wtxn)?;
+ let mut field_distribution = self.index.field_distribution(wtxn)?;
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
// consume sorter, in order to free the internal allocation, before creating a new one.
- let mut iter = sorter.into_merger_iter()?;
+ let mut iter = self.sorter.into_merger_iter()?;
// Once we have sorted and deduplicated the documents we write them into a final file.
let mut final_sorter = create_sorter(
@@ -272,10 +288,10 @@ impl Transform<'_, '_> {
Err(InternalError::IndexingMergingKeys { process: "documents" }.into())
}
},
- self.chunk_compression_type,
- self.chunk_compression_level,
- self.max_nb_chunks,
- self.max_memory,
+ self.indexer_settings.chunk_compression_type,
+ self.indexer_settings.chunk_compression_level,
+ self.indexer_settings.max_nb_chunks,
+ self.indexer_settings.max_memory,
);
let mut new_external_documents_ids_builder = fst::MapBuilder::memory();
let mut replaced_documents_ids = RoaringBitmap::new();
@@ -285,7 +301,7 @@ impl Transform<'_, '_> {
// While we write into the final file we get or generate the internal document ids.
let mut documents_count = 0;
while let Some((external_id, update_obkv)) = iter.next()? {
- if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
+ if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
documents_seen: documents_count,
total_documents: approximate_number_of_documents,
});
@@ -299,7 +315,7 @@ impl Transform<'_, '_> {
replaced_documents_ids.insert(docid);
let key = BEU32::new(docid);
- let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
+ let base_obkv = self.index.documents.get(wtxn, &key)?.ok_or(
InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
@@ -359,8 +375,11 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?;
- let mut writer =
- create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
+ let mut writer = create_writer(
+ self.indexer_settings.chunk_compression_type,
+ self.indexer_settings.chunk_compression_level,
+ file,
+ )?;
// Once we have written all the documents into the final sorter, we write the documents
// into this writer, extract the file and reset the seek to be able to read it again.
@@ -392,22 +411,28 @@ impl Transform<'_, '_> {
// TODO this can be done in parallel by using the rayon `ThreadPool`.
pub fn remap_index_documents( self, - primary_key: String, + wtxn: &mut heed::RwTxn, old_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, ) -> Result { - let field_distribution = self.index.field_distribution(self.rtxn)?; - let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; - let documents_ids = self.index.documents_ids(self.rtxn)?; + // There already has been a document addition, the primary key should be set by now. + let primary_key = + self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string(); + let field_distribution = self.index.field_distribution(wtxn)?; + let external_documents_ids = self.index.external_documents_ids(wtxn)?; + let documents_ids = self.index.documents_ids(wtxn)?; let documents_count = documents_ids.len() as usize; // We create a final writer to write the new documents in order from the sorter. let file = tempfile::tempfile()?; - let mut writer = - create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?; + let mut writer = create_writer( + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + file, + )?; let mut obkv_buffer = Vec::new(); - for result in self.index.documents.iter(self.rtxn)? { + for result in self.index.documents.iter(wtxn)? { let (docid, obkv) = result?; let docid = docid.get(); diff --git a/milli/src/update/indexer_config.rs b/milli/src/update/indexer_config.rs new file mode 100644 index 000000000..af7211f90 --- /dev/null +++ b/milli/src/update/indexer_config.rs @@ -0,0 +1,29 @@ +use grenad::CompressionType; +use rayon::ThreadPool; + +#[derive(Debug)] +pub struct IndexerConfig { + pub log_every_n: Option, + pub max_nb_chunks: Option, + pub documents_chunk_size: Option, + pub max_memory: Option, + pub chunk_compression_type: CompressionType, + pub chunk_compression_level: Option, + pub thread_pool: Option, + pub max_positions_per_attributes: Option, +} + +impl Default for IndexerConfig { + fn default() -> Self { + Self { + log_every_n: None, + max_nb_chunks: None, + documents_chunk_size: None, + max_memory: None, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + thread_pool: None, + max_positions_per_attributes: None, + } + } +} diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 3dd8abd28..965ed4fd2 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,9 +2,11 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facets::Facets; -pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod}; +pub use self::index_documents::{ + DocumentAdditionResult, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, +}; +pub use self::indexer_config::IndexerConfig; pub use self::settings::{Setting, Settings}; -pub use self::update_builder::UpdateBuilder; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; @@ -16,8 +18,8 @@ mod clear_documents; mod delete_documents; mod facets; mod index_documents; +mod indexer_config; mod settings; -mod update_builder; mod update_step; mod word_prefix_docids; mod word_prefix_pair_proximity_docids; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index fff5eb0fa..91ef187f5 100644 --- 
a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -2,15 +2,15 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::result::Result as StdResult; use chrono::Utc; -use grenad::CompressionType; use itertools::Itertools; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; -use rayon::ThreadPool; use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use super::index_documents::{IndexDocumentsConfig, Transform}; +use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; -use crate::update::index_documents::{IndexDocumentsMethod, Transform}; +use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, Result}; @@ -77,14 +77,8 @@ impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting { pub struct Settings<'a, 't, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, - pub(crate) log_every_n: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - pub(crate) documents_chunk_size: Option, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) thread_pool: Option<&'a ThreadPool>, - pub(crate) max_positions_per_attributes: Option, + + indexer_config: &'a IndexerConfig, searchable_fields: Setting>, displayed_fields: Setting>, @@ -98,17 +92,14 @@ pub struct Settings<'a, 't, 'u, 'i> { } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Settings<'a, 't, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + indexer_config: &'a IndexerConfig, + ) -> Settings<'a, 't, 'u, 'i> { Settings { wtxn, index, - log_every_n: None, - max_nb_chunks: None, - max_memory: None, - documents_chunk_size: None, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - thread_pool: None, searchable_fields: Setting::NotSet, displayed_fields: Setting::NotSet, filterable_fields: Setting::NotSet, @@ -118,14 +109,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { distinct_field: Setting::NotSet, synonyms: Setting::NotSet, primary_key: Setting::NotSet, - max_positions_per_attributes: None, + indexer_config, } } - pub fn log_every_n(&mut self, n: usize) { - self.log_every_n = Some(n); - } - pub fn reset_searchable_fields(&mut self) { self.searchable_fields = Setting::Reset; } @@ -210,25 +197,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { return Ok(()); } - let transform = Transform { - rtxn: &self.wtxn, - index: self.index, - log_every_n: self.log_every_n, - chunk_compression_type: self.chunk_compression_type, - chunk_compression_level: self.chunk_compression_level, - max_nb_chunks: self.max_nb_chunks, - max_memory: self.max_memory, - index_documents_method: IndexDocumentsMethod::ReplaceDocuments, - autogenerate_docids: false, - }; - - // There already has been a document addition, the primary key should be set by now. - let primary_key = - self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?; + let transform = Transform::new( + &self.index, + &self.indexer_config, + IndexDocumentsMethod::ReplaceDocuments, + false, + ); // We remap the documents fields based on the new `FieldsIdsMap`. 
let output = transform.remap_index_documents( - primary_key.to_string(), + self.wtxn, old_fields_ids_map, fields_ids_map.clone(), )?; @@ -238,16 +216,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // We index the generated `TransformOutput` which must contain // all the documents with fields in the newly defined searchable order. - let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index); - indexing_builder.log_every_n = self.log_every_n; - indexing_builder.max_nb_chunks = self.max_nb_chunks; - indexing_builder.max_memory = self.max_memory; - indexing_builder.documents_chunk_size = self.documents_chunk_size; - indexing_builder.chunk_compression_type = self.chunk_compression_type; - indexing_builder.chunk_compression_level = self.chunk_compression_level; - indexing_builder.thread_pool = self.thread_pool; - indexing_builder.max_positions_per_attributes = self.max_positions_per_attributes; - indexing_builder.execute_raw(output, &cb)?; + let indexing_builder = IndexDocuments::new( + self.wtxn, + self.index, + &self.indexer_config, + IndexDocumentsConfig::default(), + &cb, + ); + indexing_builder.execute_raw(output)?; Ok(()) } @@ -535,13 +511,17 @@ mod tests { { "id": 2, "name": "kevina", "age": 21}, { "id": 3, "name": "benoit", "age": 34 } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_searchable_fields(vec!["name".into()]); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -563,7 +543,7 @@ mod tests { // We change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.reset_searchable_fields(); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -592,15 +572,19 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // In the same transaction we change the displayed fields to be only the "age". // We also change the searchable fields to be the "name" field only. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_displayed_fields(vec!["age".into()]); builder.set_searchable_fields(vec!["name".into()]); builder.execute(|_| ()).unwrap(); @@ -614,7 +598,7 @@ mod tests { // We change the searchable fields to be the "name" field only. 
let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.reset_searchable_fields(); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -639,9 +623,13 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -664,12 +652,16 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); // In the same transaction we change the displayed fields to be only the age. - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_displayed_fields(vec!["age".into()]); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -682,7 +674,7 @@ mod tests { // We reset the fields ids to become `None`, the default value. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.reset_displayed_fields(); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -700,9 +692,11 @@ mod tests { options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + // Set the filterable fields to be the age. let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_filterable_fields(hashset! { S("age") }); builder.execute(|_| ()).unwrap(); @@ -712,9 +706,12 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set. 
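The settings builder follows the same convention: its tuning knobs moved into the shared `IndexerConfig`, which `Settings::new` now borrows as a third argument. A minimal sketch of the updated call site, assuming an open `index` (the field chosen is illustrative):

    let config = IndexerConfig::default();
    let mut wtxn = index.write_txn().unwrap();
    // The extra `&config` argument is the only signature change; the setters are untouched.
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    builder.set_filterable_fields(hashset! { S("age") });
    builder.execute(|_| ()).unwrap();
    wtxn.commit().unwrap();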
@@ -749,9 +746,12 @@ mod tests {
{ "name": "benoit", "age": 35 }
]);
- let mut builder = IndexDocuments::new(&mut wtxn, &index);
- builder.enable_autogenerate_docids();
- builder.execute(content, |_| ()).unwrap();
+ let indexing_config =
+ IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
@@ -771,10 +771,11 @@ mod tests {
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
+ let config = IndexerConfig::default();
// Set the criteria to sort on the age.
let mut wtxn = index.write_txn().unwrap();
- let mut builder = Settings::new(&mut wtxn, &index);
+ let mut builder = Settings::new(&mut wtxn, &index, &config);
// Don't display the generated `id` field.
builder.set_displayed_fields(vec![S("name")]);
builder.set_criteria(vec![S("age:asc")]);
@@ -786,9 +787,12 @@ mod tests {
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
- let mut builder = IndexDocuments::new(&mut wtxn, &index);
- builder.enable_autogenerate_docids();
- builder.execute(content, |_| ()).unwrap();
+ let indexing_config =
+ IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
// Run an empty query just to ensure that the search results are ordered.
@@ -813,10 +817,11 @@ mod tests {
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
+ let config = IndexerConfig::default();
// Set the distinct field to be the age.
let mut wtxn = index.write_txn().unwrap();
- let mut builder = Settings::new(&mut wtxn, &index);
+ let mut builder = Settings::new(&mut wtxn, &index, &config);
// Don't display the generated `id` field.
builder.set_displayed_fields(vec![S("name"), S("age")]);
builder.set_distinct_field(S("age"));
@@ -832,9 +837,12 @@ mod tests {
{ "name": "bernie", "age": 34 },
{ "name": "ben", "age": 34 }
]);
- let mut builder = IndexDocuments::new(&mut wtxn, &index);
- builder.enable_autogenerate_docids();
- builder.execute(content, |_| ()).unwrap();
+ let indexing_config =
+ IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
wtxn.commit().unwrap();
// Run an empty query just to ensure that the search results are ordered.
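Because `IndexerConfig` is now plain data with public fields and a `Default` implementation (see the new `indexer_config.rs` above), one tuned instance can be built once and shared across document additions and settings updates, covering what `UpdateBuilder` used to do. A sketch with purely illustrative values:

    let config = IndexerConfig {
        max_memory: Some(512 * 1024 * 1024), // cap sorter memory at 512 MiB (illustrative)
        log_every_n: Some(100_000),          // progress reporting granularity (illustrative)
        ..Default::default()
    };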
@@ -859,9 +867,13 @@ mod tests { { "name": "kevina", "age": 21 }, { "name": "benoit", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Ensure there is no stop_words by default @@ -884,12 +896,16 @@ mod tests { { "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, { "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); // In the same transaction we provide some stop_words - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); let set = btreeset! { "i".to_string(), "the".to_string(), "are".to_string() }; builder.set_stop_words(set.clone()); builder.execute(|_| ()).unwrap(); @@ -920,7 +936,7 @@ mod tests { // now we'll reset the stop_words and ensure it's None let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.reset_stop_words(); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -958,12 +974,16 @@ mod tests { { "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, { "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.enable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); // In the same transaction provide some synonyms - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_synonyms(hashmap! 
{ "blini".to_string() => vec!["crepes".to_string()], "super like".to_string() => vec!["love".to_string()], @@ -987,7 +1007,7 @@ mod tests { // Reset the synonyms let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.reset_synonyms(); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -1012,10 +1032,11 @@ mod tests { let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); // Set all the settings except searchable let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_displayed_fields(vec!["hello".to_string()]); builder.set_filterable_fields(hashset! { S("age"), S("toto") }); builder.set_criteria(vec!["toto:asc".to_string()]); @@ -1032,7 +1053,7 @@ mod tests { // We set toto and age as searchable to force reordering of the fields let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -1049,10 +1070,11 @@ mod tests { let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); // Set all the settings except searchable let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_displayed_fields(vec!["hello".to_string()]); // It is only Asc(toto), there is a facet database but it is denied to filter with toto. 
builder.set_criteria(vec!["toto:asc".to_string()]); @@ -1070,10 +1092,11 @@ mod tests { let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); // Set the primary key settings let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key(S("mykey")); builder.execute(|_| ()).unwrap(); @@ -1089,14 +1112,17 @@ mod tests { { "mykey": 6, "name": "bernie", "age": 34 }, { "mykey": 7, "name": "ben", "age": 34 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index); - builder.disable_autogenerate_docids(); - builder.execute(content, |_| ()).unwrap(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // We now try to reset the primary key let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.reset_primary_key(); let err = builder.execute(|_| ()).unwrap_err(); @@ -1109,7 +1135,7 @@ mod tests { builder.execute().unwrap(); // ...we can change the primary key - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key(S("myid")); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -1121,10 +1147,11 @@ mod tests { let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); // Set the genres setting let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_filterable_fields(hashset! 
{ S("genres") }); builder.execute(|_| ()).unwrap(); @@ -1147,8 +1174,12 @@ mod tests { "release_date": 819676800 } ]); - let builder = IndexDocuments::new(&mut wtxn, &index); - builder.execute(content, |_| ()).unwrap(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // We now try to reset the primary key diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs deleted file mode 100644 index 6e892a356..000000000 --- a/milli/src/update/update_builder.rs +++ /dev/null @@ -1,130 +0,0 @@ -use grenad::CompressionType; -use rayon::ThreadPool; - -use super::{ClearDocuments, DeleteDocuments, Facets, IndexDocuments, Settings}; -use crate::{Index, Result}; - -pub struct UpdateBuilder<'a> { - pub(crate) log_every_n: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) documents_chunk_size: Option, - pub(crate) max_memory: Option, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) thread_pool: Option<&'a ThreadPool>, - pub(crate) max_positions_per_attributes: Option, -} - -impl<'a> UpdateBuilder<'a> { - pub fn new() -> UpdateBuilder<'a> { - UpdateBuilder { - log_every_n: None, - max_nb_chunks: None, - documents_chunk_size: None, - max_memory: None, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - thread_pool: None, - max_positions_per_attributes: None, - } - } - - pub fn log_every_n(&mut self, log_every_n: usize) { - self.log_every_n = Some(log_every_n); - } - - pub fn max_nb_chunks(&mut self, max_nb_chunks: usize) { - self.max_nb_chunks = Some(max_nb_chunks); - } - - pub fn max_memory(&mut self, max_memory: usize) { - self.max_memory = Some(max_memory); - } - - pub fn documents_chunk_size(&mut self, documents_chunk_size: usize) { - self.documents_chunk_size = Some(documents_chunk_size); - } - - pub fn chunk_compression_type(&mut self, chunk_compression_type: CompressionType) { - self.chunk_compression_type = chunk_compression_type; - } - - pub fn chunk_compression_level(&mut self, chunk_compression_level: u32) { - self.chunk_compression_level = Some(chunk_compression_level); - } - - pub fn thread_pool(&mut self, thread_pool: &'a ThreadPool) { - self.thread_pool = Some(thread_pool); - } - - pub fn max_positions_per_attributes(&mut self, max_positions_per_attributes: u32) { - self.max_positions_per_attributes = Some(max_positions_per_attributes); - } - - pub fn clear_documents<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> ClearDocuments<'t, 'u, 'i> { - ClearDocuments::new(wtxn, index) - } - - pub fn delete_documents<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> Result> { - DeleteDocuments::new(wtxn, index) - } - - pub fn index_documents<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> IndexDocuments<'t, 'u, 'i, 'a> { - let mut builder = IndexDocuments::new(wtxn, index); - - builder.log_every_n = self.log_every_n; - builder.max_nb_chunks = self.max_nb_chunks; - builder.max_memory = self.max_memory; - builder.documents_chunk_size = self.documents_chunk_size; - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.thread_pool = 
self.thread_pool; - builder.max_positions_per_attributes = self.max_positions_per_attributes; - - builder - } - - pub fn settings<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> Settings<'a, 't, 'u, 'i> { - let mut builder = Settings::new(wtxn, index); - - builder.log_every_n = self.log_every_n; - builder.max_nb_chunks = self.max_nb_chunks; - builder.max_memory = self.max_memory; - builder.documents_chunk_size = self.documents_chunk_size; - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - builder.thread_pool = self.thread_pool; - builder.max_positions_per_attributes = self.max_positions_per_attributes; - - builder - } - - pub fn facets<'t, 'u, 'i>( - self, - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> Facets<'t, 'u, 'i> { - let mut builder = Facets::new(wtxn, index); - - builder.chunk_compression_type = self.chunk_compression_type; - builder.chunk_compression_level = self.chunk_compression_level; - - builder - } -} diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index da7251389..631618f73 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -16,7 +16,8 @@ macro_rules! test_distinct { // update distinct attribute let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let config = milli::update::IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_distinct_field(S(stringify!($distinct))); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index d1467fd72..31d53b666 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -7,7 +7,7 @@ use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; -use milli::update::{Settings, UpdateBuilder}; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use serde::Deserialize; use slice_group_by::GroupBy; @@ -31,8 +31,9 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); let criteria = criteria.iter().map(|c| c.to_string()).collect(); builder.set_criteria(criteria); @@ -54,10 +55,10 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.execute(|_| ()).unwrap(); // index documents - let mut builder = UpdateBuilder::new(); - builder.max_memory(10 * 1024 * 1024); // 10MiB - let mut builder = builder.index_documents(&mut wtxn, &index); - builder.enable_autogenerate_docids(); + let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; + let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); @@ -73,7 +74,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) 
-> Index { // index documents let content = DocumentBatchReader::from_reader(cursor).unwrap(); - builder.execute(content, |_| ()).unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 8968eff90..0dcbf660e 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -6,7 +6,7 @@ use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; -use milli::update::{Settings, UpdateBuilder}; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use rand::Rng; use Criterion::*; @@ -337,11 +337,12 @@ fn criteria_mixup() { ] }; + let config = IndexerConfig::default(); for criteria in criteria_mix { eprintln!("Testing with criteria order: {:?}", &criteria); //update criteria let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.iter().map(ToString::to_string).collect()); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); @@ -375,8 +376,9 @@ fn criteria_ascdesc() { let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_sortable_fields(hashset! { S("name"), @@ -385,10 +387,9 @@ fn criteria_ascdesc() { builder.execute(|_| ()).unwrap(); // index documents - let mut builder = UpdateBuilder::new(); - builder.max_memory(10 * 1024 * 1024); // 10MiB - let mut builder = builder.index_documents(&mut wtxn, &index); - builder.enable_autogenerate_docids(); + let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; + let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); let mut cursor = Cursor::new(Vec::new()); let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); @@ -419,7 +420,8 @@ fn criteria_ascdesc() { let reader = DocumentBatchReader::from_reader(cursor).unwrap(); - builder.execute(reader, |_| ()).unwrap(); + builder.add_documents(reader).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -430,7 +432,7 @@ fn criteria_ascdesc() { eprintln!("Testing with criterion: {:?}", &criterion); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); + let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(vec![criterion.to_string()]); builder.execute(|_| ()).unwrap(); wtxn.commit().unwrap(); From fb51d511be18f103851d9d2e72bb8a2a5b7bc387 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 25 Jan 2022 12:08:47 +0100 Subject: [PATCH 1205/1889] fix(fuzzer): fix the fuzzer after #430 --- milli/fuzz/fuzz_targets/indexing.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 327df09d1..fc51f969a 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -8,7 +8,7 @@ use arbitrary_json::ArbitraryValue; use 
heed::EnvOpenOptions; use libfuzzer_sys::fuzz_target; use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; -use milli::update::UpdateBuilder; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::Index; use serde_json::Value; @@ -35,11 +35,14 @@ fn index_documents( index: &mut milli::Index, documents: DocumentBatchReader<Cursor<Vec<u8>>>, ) -> Result<()> { - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn()?; - let builder = update_builder.index_documents(&mut wtxn, &index); - builder.execute(documents, |_| ())?; + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + builder.add_documents(documents)?; + builder.execute().unwrap(); + wtxn.commit()?; Ok(()) } @@ -51,9 +54,10 @@ fn create_index() -> Result<Index> { options.max_readers(1); let index = Index::new(options, dir.path())?; - let update_builder = UpdateBuilder::new(); + let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = update_builder.settings(&mut wtxn, &index); + + let mut builder = Settings::new(&mut wtxn, &index, &config); let displayed_fields = ["id", "title", "album", "artist", "genre", "country", "released", "duration"] From b5f01b52c76509290fa92e6f26952726cd608891 Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 20 Jan 2022 18:31:47 +0100 Subject: [PATCH 1206/1889] cli improvements --- cli/src/main.rs | 202 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 178 insertions(+), 24 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 1edc171b0..9d807e8c6 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,7 +1,9 @@ +use std::collections::BTreeMap; use std::fs::File; -use std::io::{stdin, BufRead, BufReader, Cursor, Read}; +use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; use std::str::FromStr; +use std::time::Instant; use byte_unit::Byte; use eyre::Result; @@ -10,6 +12,8 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; +use milli::Index; +use serde_json::{Map, Value}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -32,9 +36,108 @@ struct Cli { #[derive(Debug, StructOpt)] enum Command { - DocumentAddition(DocumentAddition), + Documents { + #[structopt(subcommand)] + cmd: Documents, + }, Search(Search), - SettingsUpdate(SettingsUpdate), + Settings { + #[structopt(subcommand)] + cmd: Settings, + }, } + +impl Performer for Command { + fn perform(self, index: Index) -> Result<()> { + match self { + Command::Documents { cmd } => cmd.perform(index), + Command::Search(cmd) => cmd.perform(index), + Command::Settings { cmd } => cmd.perform(index), + } + } +} + +#[derive(Debug, StructOpt)] +enum Settings { + Update(SettingsUpdate), + Show, +} + +impl Settings { + fn show(&self, index: Index) -> Result<()> { + let txn = index.read_txn()?; + let displayed_attributes = index + .displayed_fields(&txn)? + .map(|fields| fields.into_iter().map(String::from).collect()); + + let searchable_attributes: Option<Vec<String>> = index + .searchable_fields(&txn)?
+ .map(|fields| fields.into_iter().map(String::from).collect()); + + let filterable_attributes: Vec<_> = index.filterable_fields(&txn)?.into_iter().collect(); + + let sortable_attributes: Vec<_> = index.sortable_fields(&txn)?.into_iter().collect(); + + let criteria: Vec<_> = index.criteria(&txn)?.into_iter().map(|c| c.to_string()).collect(); + + let stop_words = index + .stop_words(&txn)? + .map(|stop_words| -> Result<Vec<String>> { + Ok(stop_words.stream().into_strs()?.into_iter().collect()) + }) + .transpose()? + .unwrap_or_else(Vec::new); + let distinct_field = index.distinct_field(&txn)?.map(String::from); + + // in milli, each word in the synonyms map was split on its separator. Since we lost + // this information we are going to put a space between words. + let synonyms: BTreeMap<_, Vec<_>> = index + .synonyms(&txn)? + .iter() + .map(|(key, values)| { + (key.join(" "), values.iter().map(|value| value.join(" ")).collect()) + }) + .collect(); + + println!( + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\n", + displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), + searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), + filterable_attributes.join("\n\t"), + sortable_attributes.join("\n\t"), + criteria.join("\n\t"), + stop_words.join("\n\t"), + distinct_field.unwrap_or_default(), + synonyms.into_iter().map(|(k, v)| format!("\n\t{}:\n{:?}", k, v)).collect::<String>(), + ); + Ok(()) + } +} + +impl Performer for Settings { + fn perform(self, index: Index) -> Result<()> { + match self { + Settings::Update(update) => update.perform(index), + Settings::Show => self.show(index), + } + } +} + +#[derive(Debug, StructOpt)] +enum Documents { + Add(DocumentAddition), +} + +impl Performer for Documents { + fn perform(self, index: Index) -> Result<()> { + match self { + Self::Add(addition) => addition.perform(index), + } + } +} + +trait Performer { + fn perform(self, index: Index) -> Result<()>; } fn setup(opt: &Cli) -> Result<()> { @@ -56,11 +159,7 @@ fn main() -> Result<()> { options.map_size(command.index_size.get_bytes() as usize); let index = milli::Index::new(options, command.index_path)?; - match command.subcommand { - Command::DocumentAddition(addition) => addition.perform(index)?, - Command::Search(search) => search.perform(index)?, - Command::SettingsUpdate(update) => update.perform(index)?, - } + command.subcommand.perform(index)?; Ok(()) } @@ -100,8 +199,8 @@ struct DocumentAddition { update_documents: bool, } -impl DocumentAddition { - fn perform(&self, index: milli::Index) -> Result<()> { +impl Performer for DocumentAddition { + fn perform(self, index: milli::Index) -> Result<()> { let reader: Box<dyn Read> = match self.path { Some(ref path) => { let file = File::open(path)?; @@ -247,29 +346,88 @@ struct Search { offset: Option<usize>, #[structopt(short, long)] limit: Option<usize>, + #[structopt(short, long, conflicts_with = "query")] + interactive: bool, +} + +impl Performer for Search { + fn perform(self, index: milli::Index) -> Result<()> { + if self.interactive { + let stdin = std::io::stdin(); + let mut lines = stdin.lock().lines(); + loop { + eprint!("> "); + std::io::stdout().flush()?; + match lines.next() { + Some(Ok(line)) => { + let now = Instant::now(); + let jsons = Self::perform_single_search( + &index, + &Some(line), + &self.filter, + &self.offset, + &self.limit, + )?; + + let time = now.elapsed(); + + let hits =
serde_json::to_string_pretty(&jsons)?; + + println!("{}", hits); + eprintln!("found {} results in {:.02?}", jsons.len(), time); + } + _ => break, + } + } + } else { + let now = Instant::now(); + let jsons = Self::perform_single_search( + &index, + &self.query, + &self.filter, + &self.offset, + &self.limit, + )?; + + let time = now.elapsed(); + + let hits = serde_json::to_string_pretty(&jsons)?; + + println!("{}", hits); + eprintln!("found {} results in {:.02?}", jsons.len(), time); + } + + Ok(()) + } } impl Search { - fn perform(&self, index: milli::Index) -> Result<()> { + fn perform_single_search( + index: &milli::Index, + query: &Option<String>, + filter: &Option<String>, + offset: &Option<usize>, + limit: &Option<usize>, + ) -> Result<Vec<Map<String, Value>>> { let txn = index.env.read_txn()?; let mut search = index.search(&txn); - if let Some(ref query) = self.query { + if let Some(ref query) = query { search.query(query); } - if let Some(ref filter) = self.filter { + if let Some(ref filter) = filter { if let Some(condition) = milli::Filter::from_str(filter)? { search.filter(condition); } } - if let Some(offset) = self.offset { - search.offset(offset); + if let Some(offset) = offset { + search.offset(*offset); } - if let Some(limit) = self.limit { - search.limit(limit); + if let Some(limit) = limit { + search.limit(*limit); } let result = search.execute()?; @@ -284,11 +442,7 @@ impl Search { jsons.push(json); } - let hits = serde_json::to_string_pretty(&jsons)?; - - println!("{}", hits); - - Ok(()) + Ok(jsons) } } @@ -298,8 +452,8 @@ struct SettingsUpdate { filterable_attributes: Option<Vec<String>>, } -impl SettingsUpdate { - fn perform(&self, index: milli::Index) -> Result<()> { +impl Performer for SettingsUpdate { + fn perform(self, index: milli::Index) -> Result<()> { let mut txn = index.env.write_txn()?; let config = IndexerConfig { log_every_n: Some(100), ..Default::default() }; From e3c34684c6d9708621ecc4a5f1473f55d1b97119 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 12 Jan 2022 15:22:06 +0100 Subject: [PATCH 1207/1889] Fix a bug where we were skipping most of the prefix pairs --- .../word_prefix_pair_proximity_docids.rs | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index eb098a91f..8180cefd4 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -64,28 +64,26 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { ); let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - let prefix_fst_keys = prefix_fst.into_stream().into_bytes(); - let prefix_fst_keys: Vec<_> = prefix_fst_keys - .as_slice() - .linear_group_by_key(|x| std::str::from_utf8(&x).unwrap().chars().nth(0).unwrap()) - .collect(); + let prefix_fst_keys = prefix_fst.into_stream().into_strs()?; + let prefix_fst_keys: Vec<_> = + prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); let mut db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?; let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[Vec<u8>]> = None; + let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); while let Some(((w1, w2, prox), data)) = db.next().transpose()?
{ current_prefixes = match current_prefixes.take() { - Some(prefixes) if w2.as_bytes().starts_with(&prefixes[0]) => Some(prefixes), + Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), _otherwise => { write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, self.threshold, )?; - prefix_fst_keys.iter().find(|prefixes| w2.as_bytes().starts_with(&prefixes[0])) + prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) } }; @@ -93,9 +91,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { buffer.clear(); buffer.extend_from_slice(w1.as_bytes()); buffer.push(0); - for prefix in prefixes.iter().filter(|prefix| w2.as_bytes().starts_with(prefix)) { + for prefix in prefixes.iter().filter(|prefix| w2.starts_with(prefix.as_str())) { buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix); + buffer.extend_from_slice(prefix.as_bytes()); buffer.push(prox); match prefixes_cache.get_mut(&buffer) { @@ -135,17 +133,13 @@ fn write_prefixes_in_sorter( sorter: &mut grenad::Sorter<MergeFn>, min_word_per_prefix: u32, ) -> Result<()> { - for (i, (key, data_slices)) in prefixes.drain().enumerate() { + for (key, data_slices) in prefixes.drain() { // if the number of words prefixed by the prefix is higher than the threshold, // we insert it in the sorter. if data_slices.len() > min_word_per_prefix as usize { for data in data_slices { sorter.insert(&key, data)?; } - // if the first prefix isn't elligible for insertion, - // then the other prefixes can't be elligible. - } else if i == 0 { - break; } } From 23ea3ad738f8d1c22f08acd1b01cac3c9aefde45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 12 Jan 2022 15:23:46 +0100 Subject: [PATCH 1208/1889] Remove the useless threshold when computing the word prefix pair proximity --- .../word_prefix_pair_proximity_docids.rs | 24 ++----------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 8180cefd4..1227ac08e 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -18,7 +18,6 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { pub(crate) chunk_compression_level: Option<u32>, pub(crate) max_nb_chunks: Option<usize>, pub(crate) max_memory: Option<usize>, - threshold: u32, } impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { @@ -33,21 +32,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { chunk_compression_level: None, max_nb_chunks: None, max_memory: None, - threshold: 100, } } - /// Set the number of words required to make a prefix be part of the words prefixes - /// database. If a word prefix is supposed to match more than this number of words in the - /// dictionnary, therefore this prefix is added to the words prefixes datastructures. - /// - /// Default value is 100. This value must be higher than 50 and will be clamped - /// to these bound otherwise.
- pub fn threshold(&mut self, value: u32) -> &mut Self { - self.threshold = value.max(50); - self - } - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -81,7 +68,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - self.threshold, )?; prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) } @@ -109,7 +95,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - self.threshold, )?; drop(prefix_fst); @@ -131,15 +116,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { fn write_prefixes_in_sorter( prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>, sorter: &mut grenad::Sorter<MergeFn>, - min_word_per_prefix: u32, ) -> Result<()> { for (key, data_slices) in prefixes.drain() { - // if the number of words prefixed by the prefix is higher than the threshold, - // we insert it in the sorter. - if data_slices.len() > min_word_per_prefix as usize { - for data in data_slices { - sorter.insert(&key, data)?; - } + for data in data_slices { + sorter.insert(&key, data)?; } } From 1514dfa1b7c630657790c4380ac6ae46ebd125f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 12 Jan 2022 15:40:51 +0100 Subject: [PATCH 1209/1889] Introduce a max proximity parameter to the word prefix pair proximity update --- .../update/word_prefix_pair_proximity_docids.rs | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 1227ac08e..b177e683d 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -18,6 +18,7 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { pub(crate) chunk_compression_level: Option<u32>, pub(crate) max_nb_chunks: Option<usize>, pub(crate) max_memory: Option<usize>, + max_proximity: u8, } impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { @@ -32,9 +33,21 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { chunk_compression_level: None, max_nb_chunks: None, max_memory: None, + max_proximity: 4, } } + /// Set the maximum proximity required to make a prefix be part of the words prefixes + /// database. If two words are two far from the threshold the associated documents will + /// not be part of the prefix database. + /// + /// Default value is 4. This value must be lower or equal than 4 and will be clamped + /// to these bound otherwise. + pub fn max_proximity(&mut self, value: u8) -> &mut Self { + self.max_proximity = value.max(7); + self + } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -62,6 +75,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); while let Some(((w1, w2, prox), data)) = db.next().transpose()?
{ + if prox > self.max_proximity { + continue; + } + current_prefixes = match current_prefixes.take() { Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), _otherwise => { From f04cd198866b49d67887981a2fd8f058aec1bbdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 12 Jan 2022 16:14:53 +0100 Subject: [PATCH 1210/1889] Introduce a max prefix length parameter to the word prefix pair proximity update --- .../word_prefix_pair_proximity_docids.rs | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index b177e683d..808a0d8e4 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -19,6 +19,7 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { pub(crate) max_nb_chunks: Option<usize>, pub(crate) max_memory: Option<usize>, max_proximity: u8, + max_prefix_length: usize, } impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { @@ -34,6 +35,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { max_nb_chunks: None, max_memory: None, max_proximity: 4, + max_prefix_length: 2, } } @@ -48,6 +50,17 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self } + /// Set the maximum length the prefix of a word pair is allowed to have be part of the words + /// prefixes database. If two words are two far from the threshold the associated documents + /// will not be part of the prefix database. + /// + /// Default value is 4. This value must be lower or equal than 4 and will be clamped + /// to this bound otherwise. + pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value; + self + } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute(self) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -94,15 +107,17 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { buffer.clear(); buffer.extend_from_slice(w1.as_bytes()); buffer.push(0); - for prefix in prefixes.iter().filter(|prefix| w2.starts_with(prefix.as_str())) { - buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.push(prox); + for prefix in prefixes.iter() { + if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) { + buffer.truncate(w1.len() + 1); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.push(prox); - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data), - None => { - prefixes_cache.insert(buffer.clone(), vec![data]); + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data), + None => { + prefixes_cache.insert(buffer.clone(), vec![data]); + } } } } From  Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 26 Jan 2022 11:28:11 +0100 Subject: [PATCH 1211/1889] Apply suggestions from code review Co-authored-by: Many --- milli/src/update/word_prefix_pair_proximity_docids.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 808a0d8e4..2dc00fb90 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -40,22 +40,21 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i>
{ } /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are two far from the threshold the associated documents will + /// database. If two words are too far from the threshold the associated documents will /// not be part of the prefix database. /// - /// Default value is 4. This value must be lower or equal than 4 and will be clamped + /// Default value is 4. This value must be lower or equal than 7 and will be clamped /// to this bound otherwise. pub fn max_proximity(&mut self, value: u8) -> &mut Self { self.max_proximity = value.max(7); self } - /// Set the maximum length the prefix of a word pair is allowed to have be part of the words - /// prefixes database. If two words are two far from the threshold the associated documents + /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words + /// prefixes database. If the prefix length is higher than the threshold, the associated documents /// will not be part of the prefix database. /// - /// Default value is 4. This value must be lower or equal than 4 and will be clamped - /// to this bound otherwise. + /// Default value is 2. pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { self.max_prefix_length = value; self From 0d282e3cc584ad2389a8c26e1c2f832e412561de Mon Sep 17 00:00:00 2001 From: meili-bot <74670311+meili-bot@users.noreply.github.com> Date: Wed, 26 Jan 2022 16:33:16 +0100 Subject: [PATCH 1212/1889] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4df258585..cbd9f4c35 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ ## Introduction -This repository contains the core engine used in [MeiliSearch]. +This repository contains the core engine used in [meilisearch]. -It contains a library that can manage one and only one index. MeiliSearch +It contains a library that can manage one and only one index. meilisearch manages the multi-index itself. Milli is unable to store updates in a store: it is the job of something else above and this is why it is only able to process one update at a time. @@ -65,5 +65,5 @@ To enable the hook, run the following command from the root of the project: cp script/pre-commit .git/hooks/pre-commit ``` -[MeiliSearch]: https://github.com/MeiliSearch/MeiliSearch +[meilisearch]: https://github.com/meilisearch/meilisearch [flamegraph]: https://github.com/flamegraph-rs/flamegraph From de808a391aeaf5fc9e195fc90ee55505ffff55b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 26 Jan 2022 17:47:26 +0100 Subject: [PATCH 1213/1889] Replace meilisearch by Meilisearch --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index cbd9f4c35..aa8770159 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ ## Introduction -This repository contains the core engine used in [meilisearch]. +This repository contains the core engine used in [Meilisearch]. -It contains a library that can manage one and only one index. meilisearch +It contains a library that can manage one and only one index. Meilisearch manages the multi-index itself. Milli is unable to store updates in a store: it is the job of something else above and this is why it is only able to process one update at a time. 
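[Editor's note on the `max_proximity` setter reviewed in the patch above: the doc comment promises the value is clamped to an upper bound of 7, yet `value.max(7)` raises every input to at least 7 rather than capping it. A clamp matching the documented behaviour would presumably read as follows; this is a hypothetical fix, not part of these patches:]

```rust
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
    // Cap the proximity at 7 instead of flooring it there.
    self.max_proximity = value.min(7);
    self
}
```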
@@ -65,5 +65,5 @@ To enable the hook, run the following command from the root of the project: cp script/pre-commit .git/hooks/pre-commit ``` -[meilisearch]: https://github.com/meilisearch/meilisearch +[Meilisearch]: https://github.com/meilisearch/meilisearch [flamegraph]: https://github.com/flamegraph-rs/flamegraph From 0f213f22029f234a5a630e3dcea24f165e27e5c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 26 Jan 2022 17:49:55 +0100 Subject: [PATCH 1214/1889] Replace MeiliSearch by Meilisearch --- filter-parser/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter-parser/README.md b/filter-parser/README.md index dfbc03d07..b4ddda3d3 100644 --- a/filter-parser/README.md +++ b/filter-parser/README.md @@ -1,6 +1,6 @@ # Filter parser -This workspace is dedicated to the parsing of the MeiliSearch filters. +This workspace is dedicated to the parsing of the Meilisearch filters. Most of the code and explanation are in the [`lib.rs`](./src/lib.rs). Especially, the BNF of the filters at the top of this file. From d28f18658e58ac8dab0f9bb7346e70fbf5ac4f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 18 Jan 2022 14:02:24 +0100 Subject: [PATCH 1215/1889] Retrieve the previous version of the words prefixes FST --- milli/src/update/index_documents/mod.rs | 5 ++++- milli/src/update/word_prefix_pair_proximity_docids.rs | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4fbb75d5f..cdea37d54 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -353,6 +353,9 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); + let previous_words_prefixes_fst = + self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?; + // Run the words prefixes update operation. 
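// (Editor's note: the old FST is captured before the rebuild on purpose: the
// follow-up patches in this series diff the old and new prefix sets so that
// only the added and removed prefixes have to be recomputed.)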
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); if let Some(value) = self.config.words_prefix_threshold { @@ -389,7 +392,7 @@ where builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; - builder.execute()?; + builder.execute(&previous_words_prefixes_fst)?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 2dc00fb90..2788d5d35 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -61,7 +61,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute(self) -> Result<()> { + pub fn execute<A: AsRef<[u8]>>(self, old_prefix_fst: &fst::Set<A>) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; From 822f67e9ad1f17b8921c11aefd3d715170c96b70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 18 Jan 2022 14:59:51 +0100 Subject: [PATCH 1216/1889] Bring the newly created word pair proximity docids --- milli/src/update/index_documents/mod.rs | 37 ++++++++++++++++--- .../word_prefix_pair_proximity_docids.rs | 9 ++++- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index cdea37d54..0ef05dba5 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -16,11 +16,12 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; pub use self::helpers::{ create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, MergeFn, + sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; +pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, @@ -282,6 +283,7 @@ where let index_documents_ids = self.index.documents_ids(self.wtxn)?; let index_is_empty = index_documents_ids.len() == 0; let mut final_documents_ids = RoaringBitmap::new(); + let mut word_pair_proximity_docids = Vec::new(); let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -289,9 +291,26 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - for typed_chunk in lmdb_writer_rx { + for result in lmdb_writer_rx { + let typed_chunk = match result? { + TypedChunk::WordPairProximityDocids(chunk) => { + // We extract and mmap our chunk file to be able to get it for next processes. + let mut file = chunk.into_inner(); + let mmap = unsafe { memmap2::Mmap::map(&file)? }; + let cursor_mmap = CursorClonableMmap::new(ClonableMmap::from(mmap)); + let chunk = grenad::Reader::new(cursor_mmap)?; + word_pair_proximity_docids.push(chunk); + + // We reconstruct our typed-chunk back.
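// (Editor's note: producing the chunk leaves the file cursor at the end of
// the file, so it must be rewound before grenad can read it again from the start.)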
+ file.rewind()?; + let chunk = grenad::Reader::new(file)?; + TypedChunk::WordPairProximityDocids(chunk) + } + otherwise => otherwise, + }; + let (docids, is_merged_database) = - write_typed_chunk_into_index(typed_chunk?, &self.index, self.wtxn, index_is_empty)?; + write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); @@ -325,13 +344,19 @@ where let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; - self.execute_prefix_databases()?; + self.execute_prefix_databases(word_pair_proximity_docids)?; Ok(all_documents_ids.len()) } #[logging_timer::time("IndexDocuments::{}")] - pub fn execute_prefix_databases(self) -> Result<()> { + pub fn execute_prefix_databases( + self, + word_pair_proximity_docids: Vec<grenad::Reader<CursorClonableMmap>>, + ) -> Result<()> + where + F: Fn(UpdateIndexingStep) + Sync, + { // Merged databases are already been indexed, we start from this count; let mut databases_seen = MERGED_DATABASE_COUNT; @@ -392,7 +417,7 @@ where builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; - builder.execute(&previous_words_prefixes_fst)?; + builder.execute(word_pair_proximity_docids, &previous_words_prefixes_fst)?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 2788d5d35..6a49a0a63 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -7,7 +7,8 @@ use log::debug; use slice_group_by::GroupBy; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, MergeFn, WriteMethod, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, + MergeFn, WriteMethod, }; use crate::{Index, Result}; @@ -61,7 +62,11 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<A: AsRef<[u8]>>(self, old_prefix_fst: &fst::Set<A>) -> Result<()> { + pub fn execute<A: AsRef<[u8]>>( + self, + new_word_pair_proximity_docids: Vec<grenad::Reader<CursorClonableMmap>>, + old_prefix_fst: &fst::Set<A>, + ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; From c90fa95f93103c11b56d661b851b50a524d29f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 18 Jan 2022 15:23:18 +0100 Subject: [PATCH 1217/1889] Only compute the word prefix pairs on the created word pair proximities --- .../word_prefix_pair_proximity_docids.rs | 63 ++++++++++++++----- 1 file changed, 47 insertions(+), 16 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 6a49a0a63..d35f39c10 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,8 +1,8 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; -use fst::IntoStreamer; +use fst::{IntoStreamer, Streamer}; -use grenad::CompressionType; +use grenad::{CompressionType,
MergerBuilder}; -use heed::types::ByteSlice; +use heed::BytesDecode; use log::debug; use slice_group_by::GroupBy; use crate::update::index_documents::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn, WriteMethod, }; -use crate::{Index, Result}; +use crate::{Index, Result, StrStrU8Codec}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -69,9 +69,12 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; + // We retrieve and merge the created word pair proximities docids entries + // for the newly added documents. + let mut wppd_merger = MergerBuilder::new(merge_cbo_roaring_bitmaps); + wppd_merger.extend(new_word_pair_proximity_docids); + let mut wppd_iter = wppd_merger.build().into_merger_iter()?; - // Here we create a sorter akin to the previous one. let mut word_prefix_pair_proximity_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, self.chunk_compression_type, @@ -85,13 +88,14 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { let prefix_fst_keys: Vec<_> = prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); - let mut db = - self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?; + // We compute the set of prefixes that are no more part of the prefix fst. + let suppr_pw = stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some(((w1, w2, prox), data)) = db.next().transpose()? { + while let Some((key, data)) = wppd_iter.next()? { + let (w1, w2, prox) = StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; if prox > self.max_proximity { continue; } @@ -118,9 +122,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { buffer.push(prox); match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data), + Some(value) => value.push(data.to_owned()), None => { - prefixes_cache.insert(buffer.clone(), vec![data]); + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); } } } @@ -134,15 +138,28 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { )?; drop(prefix_fst); - drop(db); - // We finally write the word prefix pair proximity docids into the LMDB database. + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + let mut iter = + self.index.word_prefix_pair_proximity_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + while let Some(((_, w2, _), _)) = iter.next().transpose()? { + if suppr_pw.contains(w2.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } + } + + drop(iter); + + // We finally write and merge the new word prefix pair proximity docids + // in the LMDB database.
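// (Editor's note: the write method switches from `Append` to `GetMergePut`
// below because the database is no longer cleared first; freshly computed
// entries must be merged with the ones already on disk.)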
sorter_into_lmdb_database( self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), word_prefix_pair_proximity_docids_sorter, merge_cbo_roaring_bitmaps, - WriteMethod::Append, + WriteMethod::GetMergePut, )?; Ok(()) @@ -150,7 +167,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } fn write_prefixes_in_sorter( - prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>, + prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>, sorter: &mut grenad::Sorter<MergeFn>, ) -> Result<()> { for (key, data_slices) in prefixes.drain() { @@ -161,3 +178,17 @@ fn write_prefixes_in_sorter( Ok(()) } + +/// Converts an fst Stream into an HashSet. +fn stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>> +where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, +{ + let mut hashset = HashSet::new(); + let mut stream = stream.into_stream(); + while let Some(value) = stream.next() { + hashset.insert(value.to_owned()); + } + hashset +} From 5404bc02dddaf790b23d71cb1834b53a461c06fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 19 Jan 2022 14:30:03 +0100 Subject: [PATCH 1218/1889] Move the fst_stream_into_hashset method in the helper methods --- .../src/update/index_documents/helpers/mod.rs | 16 +++++++++++++ milli/src/update/index_documents/mod.rs | 5 ++-- .../word_prefix_pair_proximity_docids.rs | 24 ++++--------------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 128288982..4086bfb7f 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -2,9 +2,11 @@ mod clonable_mmap; mod grenad_helpers; mod merge_functions; +use std::collections::HashSet; use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; +use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ create_sorter, create_writer, grenad_obkv_into_chunks, into_clonable_grenad, merge_readers, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, @@ -43,3 +45,17 @@ where pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ { bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) } + +/// Converts an fst Stream into an HashSet.
+pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>> +where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, +{ + let mut hashset = HashSet::new(); + let mut stream = stream.into_stream(); + while let Some(value) = stream.next() { + hashset.insert(value.to_owned()); + } + hashset +} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 0ef05dba5..ae8e28b33 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -15,8 +15,9 @@ use serde::{Deserialize, Serialize}; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; pub use self::helpers::{ - create_sorter, create_writer, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, + create_sorter, create_writer, fst_stream_into_hashset, merge_cbo_roaring_bitmaps, + merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, + ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index d35f39c10..50b7a978b 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,14 +1,14 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; -use fst::{IntoStreamer, Streamer}; +use fst::IntoStreamer; use grenad::{CompressionType, MergerBuilder}; use heed::BytesDecode; use log::debug; use slice_group_by::GroupBy; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, - MergeFn, WriteMethod, + create_sorter, fst_stream_into_hashset, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, + CursorClonableMmap, MergeFn, WriteMethod, }; use crate::{Index, Result, StrStrU8Codec}; @@ -89,7 +89,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); // We compute the set of prefixes that are no more part of the prefix fst. - let suppr_pw = stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); + let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; @@ -178,17 +178,3 @@ fn write_prefixes_in_sorter( Ok(()) } - -/// Converts an fst Stream into an HashSet.
-fn stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>> -where - I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, - S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, -{ - let mut hashset = HashSet::new(); - let mut stream = stream.into_stream(); - while let Some(value) = stream.next() { - hashset.insert(value.to_owned()); - } - hashset -} From 28692f65be2166ec12617fd87306bf896fb07c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 19 Jan 2022 15:02:04 +0100 Subject: [PATCH 1219/1889] Rework the WordPrefixDocids update to compute a subset of the database --- milli/src/update/index_documents/mod.rs | 19 +++- milli/src/update/word_prefix_docids.rs | 87 +++++++++++++++++++------ 2 files changed, 85 insertions(+), 21 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ae8e28b33..77b761e6e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -285,6 +285,7 @@ where let index_is_empty = index_documents_ids.len() == 0; let mut final_documents_ids = RoaringBitmap::new(); let mut word_pair_proximity_docids = Vec::new(); + let mut word_docids = Vec::new(); let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -294,6 +295,19 @@ where for result in lmdb_writer_rx { let typed_chunk = match result? { + TypedChunk::WordDocids(chunk) => { + // We extract and mmap our chunk file to be able to get it for next processes. + let mut file = chunk.into_inner(); + let mmap = unsafe { memmap2::Mmap::map(&file)? }; + let cursor_mmap = CursorClonableMmap::new(ClonableMmap::from(mmap)); + let chunk = grenad::Reader::new(cursor_mmap)?; + word_docids.push(chunk); + + // We reconstruct our typed-chunk back. + file.rewind()?; + let chunk = grenad::Reader::new(file)?; + TypedChunk::WordDocids(chunk) + } TypedChunk::WordPairProximityDocids(chunk) => { // We extract and mmap our chunk file to be able to get it for next processes.
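// (Editor's note: same memory-mapping trick as for the word pair proximity
// chunks: a clonable cursor over the mmap is kept for the later prefix-database
// pass while the underlying file is rewound and turned back into a typed chunk
// for immediate writing.)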
let mut file = chunk.into_inner(); @@ -345,7 +359,7 @@ where let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; - self.execute_prefix_databases(word_pair_proximity_docids)?; + self.execute_prefix_databases(word_docids, word_pair_proximity_docids)?; Ok(all_documents_ids.len()) } @@ -353,6 +367,7 @@ where #[logging_timer::time("IndexDocuments::{}")] pub fn execute_prefix_databases( self, + word_docids: Vec>, word_pair_proximity_docids: Vec>, ) -> Result<()> where @@ -404,7 +419,7 @@ where builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; - builder.execute()?; + builder.execute(word_docids, &previous_words_prefixes_fst)?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 30dabf1ae..0703707f0 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -1,11 +1,12 @@ -use std::str; +use std::collections::HashMap; -use fst::Streamer; -use grenad::CompressionType; -use heed::types::ByteSlice; +use fst::IntoStreamer; +use grenad::{CompressionType, MergerBuilder}; +use slice_group_by::GroupBy; use crate::update::index_documents::{ - create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, + create_sorter, fst_stream_into_hashset, merge_roaring_bitmaps, sorter_into_lmdb_database, + CursorClonableMmap, MergeFn, WriteMethod, }; use crate::{Index, Result}; @@ -34,11 +35,18 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixDocids::{}")] - pub fn execute(self) -> Result<()> { - // Clear the word prefix docids database. - self.index.word_prefix_docids.clear(self.wtxn)?; - + pub fn execute>( + self, + new_word_docids: Vec>, + old_prefix_fst: &fst::Set, + ) -> Result<()> { let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + let prefix_fst_keys = prefix_fst.into_stream().into_strs()?; + let prefix_fst_keys: Vec<_> = + prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); + + // We compute the set of prefixes that are no more part of the prefix fst. + let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. @@ -50,18 +58,46 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.max_memory, ); - // We iterate over all the prefixes and retrieve the corresponding docids. - let mut prefix_stream = prefix_fst.stream(); - while let Some(bytes) = prefix_stream.next() { - let prefix = str::from_utf8(bytes)?; - let db = self.index.word_docids.remap_data_type::(); - for result in db.prefix_iter(self.wtxn, prefix)? { - let (_word, data) = result?; - prefix_docids_sorter.insert(prefix, data)?; + let mut word_docids_merger = MergerBuilder::new(merge_roaring_bitmaps); + word_docids_merger.extend(new_word_docids); + let mut word_docids_iter = word_docids_merger.build().into_merger_iter()?; + + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((word, data)) = word_docids_iter.next()? 
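+        // (each merged word is matched against the single group of prefixes that
+        // shares its first byte, and its docids are batched per prefix in
+        // `prefixes_cache` before being flushed into the sorter)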
{ + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + prefix_fst_keys + .iter() + .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) + } + }; + + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix.as_bytes()) { + match prefixes_cache.get_mut(prefix.as_bytes()) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(prefix.clone().into(), vec![data.to_owned()]); + } + } + } + } } } - drop(prefix_fst); + // We remove all the entries that are no more required in this word prefix docids database. + let mut iter = self.index.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + while let Some((prefix, _)) = iter.next().transpose()? { + if suppr_pw.contains(prefix.as_bytes()) { + unsafe { iter.del_current()? }; + } + } + + drop(iter); // We finally write the word prefix docids into the LMDB database. sorter_into_lmdb_database( @@ -69,9 +105,22 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { *self.index.word_prefix_docids.as_polymorph(), prefix_docids_sorter, merge_roaring_bitmaps, - WriteMethod::Append, + WriteMethod::GetMergePut, )?; Ok(()) } } + +fn write_prefixes_in_sorter( + prefixes: &mut HashMap, Vec>>, + sorter: &mut grenad::Sorter, +) -> Result<()> { + for (key, data_slices) in prefixes.drain() { + for data in data_slices { + sorter.insert(&key, data)?; + } + } + + Ok(()) +} From 2ec854210506c615b4986f53029bae087f8e4a54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 25 Jan 2022 15:48:48 +0100 Subject: [PATCH 1220/1889] Rework the WordPrefixDocids update to compute a subset of the database --- milli/src/update/word_prefix_docids.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 0703707f0..105083d87 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -89,6 +89,21 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } } + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + + // We fetch the docids associated to the newly added word prefix fst only. + let db = self.index.word_docids.remap_data_type::(); + let mut new_prefixes_stream = prefix_fst.op().add(old_prefix_fst).difference(); + while let Some(bytes) = new_prefixes_stream.next() { + let prefix = std::str::from_utf8(bytes)?; + for result in db.prefix_iter(self.wtxn, prefix)? { + let (_word, data) = result?; + prefix_docids_sorter.insert(prefix, data)?; + } + } + + drop(new_prefixes_stream); + // We remove all the entries that are no more required in this word prefix docids database. let mut iter = self.index.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); while let Some((prefix, _)) = iter.next().transpose()? 
{
From d59e5593179accbfe71dda09b482bbed69c8cb1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 20 Jan 2022 17:55:52 +0100
Subject: [PATCH 1221/1889] Fix the computation of the newly added and common
 prefix words

---
 .../src/update/index_documents/helpers/mod.rs | 17 +++++++++++++++-
 milli/src/update/index_documents/mod.rs       |  6 +++---
 milli/src/update/word_prefix_docids.rs        | 20 ++++++++++++-------
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index 4086bfb7f..bbb2b9b95 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -46,7 +46,7 @@ pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
     bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
 }
 
-/// Converts an fst Stream into a HashSet.
+/// Converts an fst Stream into a HashSet of byte strings.
 pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
 where
     I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
@@ -59,3 +59,18 @@ where
     }
     hashset
 }
+
+/// Converts an fst Stream into a Vec of Strings.
+pub fn fst_stream_into_vec<'f, I, S>(stream: I) -> Vec<String>
+where
+    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
+    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
+{
+    let mut strings = Vec::new();
+    let mut stream = stream.into_stream();
+    while let Some(word) = stream.next() {
+        let s = std::str::from_utf8(word).unwrap();
+        strings.push(s.to_owned());
+    }
+    strings
+}
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 77b761e6e..ad3f73d0d 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -15,9 +15,9 @@ use serde::{Deserialize, Serialize};
 use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
 
 pub use self::helpers::{
-    create_sorter, create_writer, fst_stream_into_hashset, merge_cbo_roaring_bitmaps,
-    merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader,
-    ClonableMmap, MergeFn,
+    create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec,
+    merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database,
+    write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs
index 105083d87..1e2996c9b 100644
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@@ -1,12 +1,13 @@
 use std::collections::HashMap;
 
-use fst::IntoStreamer;
+use fst::Streamer;
 use grenad::{CompressionType, MergerBuilder};
+use heed::types::ByteSlice;
 use slice_group_by::GroupBy;
 
 use crate::update::index_documents::{
-    create_sorter, fst_stream_into_hashset, merge_roaring_bitmaps, sorter_into_lmdb_database,
-    CursorClonableMmap, MergeFn, WriteMethod,
+    create_sorter, fst_stream_into_hashset, fst_stream_into_vec, merge_roaring_bitmaps,
+    sorter_into_lmdb_database, CursorClonableMmap, MergeFn, WriteMethod,
 };
 use crate::{Index, Result};
 
@@ -41,9 +42,14 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         old_prefix_fst: &fst::Set<A>,
     ) -> Result<()> {
         let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
-        let prefix_fst_keys = prefix_fst.into_stream().into_strs()?;
-        let prefix_fst_keys: Vec<_> =
-
prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); + + // We retrieve the common words between the previous and new prefix word fst. + let common_prefix_fst_keys = + fst_stream_into_vec(old_prefix_fst.op().add(&prefix_fst).intersection()); + let common_prefix_fst_keys: Vec<_> = common_prefix_fst_keys + .as_slice() + .linear_group_by_key(|x| x.chars().nth(0).unwrap()) + .collect(); // We compute the set of prefixes that are no more part of the prefix fst. let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); @@ -69,7 +75,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; - prefix_fst_keys + common_prefix_fst_keys .iter() .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) } From e760e027376e4b31bdb287ced1239dff25f9f65a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 25 Jan 2022 11:18:20 +0100 Subject: [PATCH 1222/1889] Fix the computation of the newly added and common prefix pair proximity words --- .../word_prefix_pair_proximity_docids.rs | 97 ++++++++++++++++--- 1 file changed, 83 insertions(+), 14 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 50b7a978b..dcc5db614 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,14 +1,14 @@ use std::collections::HashMap; -use fst::IntoStreamer; use grenad::{CompressionType, MergerBuilder}; +use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; use slice_group_by::GroupBy; use crate::update::index_documents::{ - create_sorter, fst_stream_into_hashset, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, - CursorClonableMmap, MergeFn, WriteMethod, + create_sorter, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, + sorter_into_lmdb_database, CursorClonableMmap, MergeFn, WriteMethod, }; use crate::{Index, Result, StrStrU8Codec}; @@ -75,6 +75,27 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { wppd_merger.extend(new_word_pair_proximity_docids); let mut wppd_iter = wppd_merger.build().into_merger_iter()?; + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // We retrieve the common words between the previous and new prefix word fst. + let common_prefix_fst_keys = + fst_stream_into_vec(old_prefix_fst.op().add(&prefix_fst).intersection()); + let common_prefix_fst_keys: Vec<_> = common_prefix_fst_keys + .as_slice() + .linear_group_by_key(|x| x.chars().nth(0).unwrap()) + .collect(); + + // We retrieve the newly added words between the previous and new prefix word fst. + let new_prefix_fst_keys = + fst_stream_into_vec(prefix_fst.op().add(old_prefix_fst).difference()); + let new_prefix_fst_keys: Vec<_> = new_prefix_fst_keys + .as_slice() + .linear_group_by_key(|x| x.chars().nth(0).unwrap()) + .collect(); + + // We compute the set of prefixes that are no more part of the prefix fst. 
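+        // (`difference` keeps the keys of the old prefix fst that are absent from
+        // the new one; the matching entries are purged from the database below.)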
+ let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); + let mut word_prefix_pair_proximity_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, self.chunk_compression_type, @@ -83,14 +104,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.max_memory, ); - let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - let prefix_fst_keys = prefix_fst.into_stream().into_strs()?; - let prefix_fst_keys: Vec<_> = - prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); - - // We compute the set of prefixes that are no more part of the prefix fst. - let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); - + // We compute the prefix docids associated with the common prefixes between + // the old and new word prefix fst. let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); @@ -107,7 +122,57 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, )?; - prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) + common_prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) + } + }; + + if let Some(prefixes) = current_prefixes { + buffer.clear(); + buffer.extend_from_slice(w1.as_bytes()); + buffer.push(0); + for prefix in prefixes.iter() { + if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) { + buffer.truncate(w1.len() + 1); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.push(prox); + + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + } + } + } + } + } + } + + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + )?; + + // We compute the prefix docids associated with the newly added prefixes + // in the new word prefix fst. + let mut db_iter = + self.index.word_pair_proximity_docids.remap_data_type::().iter(self.wtxn)?; + + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? { + if prox > self.max_proximity { + continue; + } + + current_prefixes = match current_prefixes.take() { + Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + )?; + new_prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) } }; @@ -138,11 +203,15 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { )?; drop(prefix_fst); + drop(db_iter); // All of the word prefix pairs in the database that have a w2 // that is contained in the `suppr_pw` set must be removed as well. - let mut iter = - self.index.word_prefix_pair_proximity_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + let mut iter = self + .index + .word_prefix_pair_proximity_docids + .remap_data_type::() + .iter_mut(self.wtxn)?; while let Some(((_, w2, _), _)) = iter.next().transpose()? { if suppr_pw.contains(w2.as_bytes()) { // Delete this entry as the w2 prefix is no more in the words prefix fst. 
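The prefix bookkeeping introduced across these patches always follows the same two-step pattern: stream an fst set operation (difference for the deleted or newly added prefixes, intersection for the common ones) into a collection, then group the keys by their first character so that each word only has to be compared against one small group. A minimal, self-contained sketch of that pattern, assuming the same `fst` and `slice_group_by` crates used above and made-up prefix values:

```rust
use std::collections::HashSet;

use fst::{IntoStreamer, Set, Streamer};
use slice_group_by::GroupBy;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // `Set::from_iter` requires lexicographically sorted input.
    let old_fst = Set::from_iter(vec!["ba", "be", "ca"])?;
    let new_fst = Set::from_iter(vec!["be", "bo", "ca"])?;

    // Keys of the old fst that are absent from the new one: the prefixes
    // whose database entries must be deleted (the `suppr_pw` set above).
    let mut deleted: HashSet<Vec<u8>> = HashSet::new();
    let mut stream = old_fst.op().add(&new_fst).difference().into_stream();
    while let Some(bytes) = stream.next() {
        deleted.insert(bytes.to_owned());
    }
    assert!(deleted.contains(&b"ba"[..]));

    // Keys present in both fsts, grouped by first character so that a word
    // only needs to be checked against the group sharing its first byte.
    let mut common: Vec<String> = Vec::new();
    let mut stream = old_fst.op().add(&new_fst).intersection().into_stream();
    while let Some(bytes) = stream.next() {
        common.push(String::from_utf8(bytes.to_vec())?);
    }
    let groups: Vec<&[String]> =
        common.as_slice().linear_group_by_key(|s| s.chars().next().unwrap()).collect();
    assert_eq!(groups.len(), 2); // ["be"] and ["ca"]

    Ok(())
}
```

The sorted-input requirement holds by construction in the real updates, since both sets come straight from the index's words prefixes fst, whose keys are already ordered.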
From dbba5fd461337cc9ab3b9ba82e9a3bb64c444d73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 25 Jan 2022 11:34:56 +0100 Subject: [PATCH 1223/1889] Create a function to simplify the word prefix pair proximity docids compute --- .../word_prefix_pair_proximity_docids.rs | 132 ++++++++++-------- 1 file changed, 72 insertions(+), 60 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index dcc5db614..f846e8d9e 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -115,36 +115,18 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { continue; } - current_prefixes = match current_prefixes.take() { - Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; - common_prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) - } - }; - - if let Some(prefixes) = current_prefixes { - buffer.clear(); - buffer.extend_from_slice(w1.as_bytes()); - buffer.push(0); - for prefix in prefixes.iter() { - if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) { - buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.push(prox); - - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); - } - } - } - } - } + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + &common_prefix_fst_keys, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } write_prefixes_in_sorter( @@ -165,36 +147,18 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { continue; } - current_prefixes = match current_prefixes.take() { - Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; - new_prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) - } - }; - - if let Some(prefixes) = current_prefixes { - buffer.clear(); - buffer.extend_from_slice(w1.as_bytes()); - buffer.push(0); - for prefix in prefixes.iter() { - if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) { - buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.push(prox); - - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); - } - } - } - } - } + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + &new_prefix_fst_keys, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } write_prefixes_in_sorter( @@ -247,3 +211,51 @@ fn write_prefixes_in_sorter( Ok(()) } + +/// Computes the current prefix based on the previous and the currently iterated value +/// i.e. w1, w2, prox. It also makes sure to follow the `max_prefix_length` setting. +/// +/// Uses the current prefixes values to insert the associated data i.e. RoaringBitmap, +/// into the sorter that will, later, be inserted in the LMDB database. 
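+///
+/// The shared `buffer` is reused between calls so the `word1\0prefix prox` key
+/// can be rebuilt without a fresh allocation for every (w1, w2, prox) entry.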
+fn insert_current_prefix_data_in_sorter<'a>( + buffer: &mut Vec, + current_prefixes: &mut Option<&'a &'a [String]>, + prefixes_cache: &mut HashMap, Vec>>, + word_prefix_pair_proximity_docids_sorter: &mut grenad::Sorter, + prefix_fst_keys: &'a [&'a [std::string::String]], + max_prefix_length: usize, + w1: &str, + w2: &str, + prox: u8, + data: &[u8], +) -> Result<()> { + *current_prefixes = match current_prefixes.take() { + Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter(prefixes_cache, word_prefix_pair_proximity_docids_sorter)?; + prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) + } + }; + + if let Some(prefixes) = current_prefixes { + buffer.clear(); + buffer.extend_from_slice(w1.as_bytes()); + buffer.push(0); + for prefix in prefixes.iter() { + if prefix.len() <= max_prefix_length && w2.starts_with(prefix) { + buffer.truncate(w1.len() + 1); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.push(prox); + + match prefixes_cache.get_mut(buffer.as_slice()) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + } + } + } + } + } + + Ok(()) +} From e9c02173cfd71dcf0acb013c064429fa61a789b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 25 Jan 2022 14:06:45 +0100 Subject: [PATCH 1224/1889] Rework the WordsPrefixPositionDocids update to compute a subset of the database --- milli/src/update/index_documents/mod.rs | 23 +++- .../update/words_prefix_position_docids.rs | 128 +++++++++++++++--- 2 files changed, 128 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ad3f73d0d..7ea5c3816 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -285,6 +285,7 @@ where let index_is_empty = index_documents_ids.len() == 0; let mut final_documents_ids = RoaringBitmap::new(); let mut word_pair_proximity_docids = Vec::new(); + let mut word_position_docids = Vec::new(); let mut word_docids = Vec::new(); let mut databases_seen = 0; @@ -321,6 +322,19 @@ where let chunk = grenad::Reader::new(file)?; TypedChunk::WordPairProximityDocids(chunk) } + TypedChunk::WordPositionDocids(chunk) => { + // We extract and mmap our chunk file to be able to get it for next processes. + let mut file = chunk.into_inner(); + let mmap = unsafe { memmap2::Mmap::map(&file)? }; + let cursor_mmap = CursorClonableMmap::new(ClonableMmap::from(mmap)); + let chunk = grenad::Reader::new(cursor_mmap)?; + word_position_docids.push(chunk); + + // We reconstruct our typed-chunk back. 
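+                    // (the grenad reader consumed the file, so the cursor has to go
+                    // back to the start before a new reader can be built from it)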
+ file.rewind()?; + let chunk = grenad::Reader::new(file)?; + TypedChunk::WordPositionDocids(chunk) + } otherwise => otherwise, }; @@ -359,7 +373,11 @@ where let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; - self.execute_prefix_databases(word_docids, word_pair_proximity_docids)?; + self.execute_prefix_databases( + word_docids, + word_pair_proximity_docids, + word_position_docids, + )?; Ok(all_documents_ids.len()) } @@ -369,6 +387,7 @@ where self, word_docids: Vec>, word_pair_proximity_docids: Vec>, + word_position_docids: Vec>, ) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -453,7 +472,7 @@ where if let Some(value) = self.config.words_positions_min_level_size { builder.min_level_size(value); } - builder.execute()?; + builder.execute(word_position_docids, &previous_words_prefixes_fst)?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index a8346a1cb..b1b8273ef 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -1,17 +1,20 @@ +use std::collections::HashMap; use std::num::NonZeroU32; use std::{cmp, str}; use fst::Streamer; -use grenad::CompressionType; +use grenad::{CompressionType, MergerBuilder}; use heed::types::ByteSlice; -use heed::BytesEncode; +use heed::{BytesDecode, BytesEncode}; use log::debug; +use slice_group_by::GroupBy; use crate::error::SerializationError; use crate::heed_codec::StrBEU32Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, WriteMethod, + create_sorter, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, + sorter_into_lmdb_database, CursorClonableMmap, MergeFn, WriteMethod, }; use crate::{Index, Result}; @@ -54,12 +57,27 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixPositionDocids::{}")] - pub fn execute(self) -> Result<()> { + pub fn execute>( + self, + new_word_position_docids: Vec>, + old_prefix_fst: &fst::Set, + ) -> Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); - self.index.word_prefix_position_docids.clear(self.wtxn)?; + let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - let mut word_prefix_positions_docids_sorter = create_sorter( + // We retrieve the common words between the previous and new prefix word fst. + let common_prefix_fst_keys = + fst_stream_into_vec(old_prefix_fst.op().add(&prefix_fst).intersection()); + let common_prefix_fst_keys: Vec<_> = common_prefix_fst_keys + .as_slice() + .linear_group_by_key(|x| x.chars().nth(0).unwrap()) + .collect(); + + // We compute the set of prefixes that are no more part of the prefix fst. + let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); + + let mut prefix_position_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, @@ -67,39 +85,107 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { self.max_memory, ); - // We insert the word prefix position and - // corresponds to the word-prefix position where the prefixes appears - // in the prefix FST previously constructed. 
- let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + let mut word_position_docids_merger = MergerBuilder::new(merge_cbo_roaring_bitmaps); + word_position_docids_merger.extend(new_word_position_docids); + let mut word_position_docids_iter = + word_position_docids_merger.build().into_merger_iter()?; + + // We fetch all the new common prefixes between the previous and new prefix fst. + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((key, data)) = word_position_docids_iter.next()? { + let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut prefix_position_docids_sorter, + )?; + common_prefix_fst_keys.iter().find(|prefixes| word.starts_with(&prefixes[0])) + } + }; + + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix) { + buffer.clear(); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.extend_from_slice(&pos.to_be_bytes()); + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + } + } + } + } + } + } + + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + + // We fetch the docids associated to the newly added word prefix fst only. let db = self.index.word_position_docids.remap_data_type::(); - // iter over all prefixes in the prefix fst. - let mut word_stream = prefix_fst.stream(); - while let Some(prefix_bytes) = word_stream.next() { + let mut new_prefixes_stream = prefix_fst.op().add(old_prefix_fst).difference(); + while let Some(prefix_bytes) = new_prefixes_stream.next() { let prefix = str::from_utf8(prefix_bytes).map_err(|_| { SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } })?; // iter over all lines of the DB where the key is prefixed by the current prefix. - let mut iter = db + let iter = db .remap_key_type::() - .prefix_iter(self.wtxn, &prefix_bytes)? + .prefix_iter(self.wtxn, prefix_bytes)? .remap_key_type::(); - while let Some(((_word, pos), data)) = iter.next().transpose()? { - let key = (prefix, pos); - let bytes = StrBEU32Codec::bytes_encode(&key).unwrap(); - word_prefix_positions_docids_sorter.insert(bytes, data)?; + for result in iter { + let ((word, pos), data) = result?; + if word.starts_with(prefix) { + let key = (prefix, pos); + let bytes = StrBEU32Codec::bytes_encode(&key).unwrap(); + prefix_position_docids_sorter.insert(bytes, data)?; + } } } + drop(new_prefixes_stream); + + // We remove all the entries that are no more required in this word prefix position + // docids database. + let mut iter = + self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + while let Some(((prefix, _), _)) = iter.next().transpose()? { + if suppr_pw.contains(prefix.as_bytes()) { + unsafe { iter.del_current()? }; + } + } + + drop(iter); + // We finally write all the word prefix position docids into the LMDB database. 
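    // (`GetMergePut` below merges with entries already in place, since the
    // database is no longer cleared at the start of this update)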
sorter_into_lmdb_database( self.wtxn, *self.index.word_prefix_position_docids.as_polymorph(), - word_prefix_positions_docids_sorter, + prefix_position_docids_sorter, merge_cbo_roaring_bitmaps, - WriteMethod::Append, + WriteMethod::GetMergePut, )?; Ok(()) } } + +fn write_prefixes_in_sorter( + prefixes: &mut HashMap, Vec>>, + sorter: &mut grenad::Sorter, +) -> Result<()> { + for (key, data_slices) in prefixes.drain() { + for data in data_slices { + sorter.insert(&key, data)?; + } + } + + Ok(()) +} From 51d1e64b238b73919bd15f564d3f88bfa1901947 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 25 Jan 2022 17:12:12 +0100 Subject: [PATCH 1225/1889] Remove, now useless, the WriteMethod enum --- milli/src/update/facets.rs | 6 +- .../index_documents/helpers/grenad_helpers.rs | 74 ++++++------------- milli/src/update/index_documents/mod.rs | 6 -- milli/src/update/word_prefix_docids.rs | 3 +- .../word_prefix_pair_proximity_docids.rs | 3 +- .../update/words_prefix_position_docids.rs | 3 +- 6 files changed, 28 insertions(+), 67 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index a2f17cba3..19684c6ea 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -15,9 +15,7 @@ use crate::heed_codec::facet::{ FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; -use crate::update::index_documents::{ - create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, -}; +use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; use crate::{FieldId, Index, Result}; pub struct Facets<'t, 'u, 'i> { @@ -120,7 +118,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { *self.index.facet_id_f64_docids.as_polymorph(), facet_number_levels, |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" })?, - WriteMethod::GetMergePut, )?; write_into_lmdb_database( @@ -128,7 +125,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { *self.index.facet_id_string_docids.as_polymorph(), facet_string_levels, |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?, - WriteMethod::GetMergePut, )?; } diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 10662892b..eef067122 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -9,7 +9,6 @@ use log::debug; use super::{ClonableMmap, MergeFn}; use crate::error::InternalError; -use crate::update::index_documents::WriteMethod; use crate::Result; pub type CursorClonableMmap = io::Cursor; @@ -169,34 +168,22 @@ pub fn write_into_lmdb_database( database: heed::PolyDatabase, mut reader: Reader, merge: MergeFn, - method: WriteMethod, ) -> Result<()> { debug!("Writing MTBL stores..."); let before = Instant::now(); - match method { - WriteMethod::Append => { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = reader.next()? { + while let Some((k, v)) = reader.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? { + Some((key, old_val)) if key == k => { + let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; + let val = merge(k, &vals)?; // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; + unsafe { iter.put_current(k, &val)? 
}; } - } - WriteMethod::GetMergePut => { - while let Some((k, v)) = reader.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; - let val = merge(k, &vals)?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; } } } @@ -210,12 +197,11 @@ pub fn sorter_into_lmdb_database( database: heed::PolyDatabase, sorter: Sorter, merge: MergeFn, - method: WriteMethod, ) -> Result<()> { debug!("Writing MTBL sorter..."); let before = Instant::now(); - merger_iter_into_lmdb_database(wtxn, database, sorter.into_merger_iter()?, merge, method)?; + merger_iter_into_lmdb_database(wtxn, database, sorter.into_merger_iter()?, merge)?; debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) @@ -226,34 +212,22 @@ fn merger_iter_into_lmdb_database( database: heed::PolyDatabase, mut sorter: MergerIter, merge: MergeFn, - method: WriteMethod, ) -> Result<()> { - match method { - WriteMethod::Append => { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = sorter.next()? { + while let Some((k, v)) = sorter.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? { + Some((key, old_val)) if key == k => { + let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; + let val = merge(k, &vals).map_err(|_| { + // TODO just wrap this error? + InternalError::IndexingMergingKeys { process: "get-put-merge" } + })?; // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; + unsafe { iter.put_current(k, &val)? }; } - } - WriteMethod::GetMergePut => { - while let Some((k, v)) = sorter.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).map_err(|_| { - // TODO just wrap this error? - InternalError::IndexingMergingKeys { process: "get-put-merge" } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? 
}; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; } } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7ea5c3816..ee80d8ada 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -59,12 +59,6 @@ impl Default for IndexDocumentsMethod { } } -#[derive(Debug, Copy, Clone)] -pub enum WriteMethod { - Append, - GetMergePut, -} - pub struct IndexDocuments<'t, 'u, 'i, 'a, F> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 1e2996c9b..cf50a5b8a 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -7,7 +7,7 @@ use slice_group_by::GroupBy; use crate::update::index_documents::{ create_sorter, fst_stream_into_hashset, fst_stream_into_vec, merge_roaring_bitmaps, - sorter_into_lmdb_database, CursorClonableMmap, MergeFn, WriteMethod, + sorter_into_lmdb_database, CursorClonableMmap, MergeFn, }; use crate::{Index, Result}; @@ -126,7 +126,6 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { *self.index.word_prefix_docids.as_polymorph(), prefix_docids_sorter, merge_roaring_bitmaps, - WriteMethod::GetMergePut, )?; Ok(()) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index f846e8d9e..5b025e4fc 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -8,7 +8,7 @@ use slice_group_by::GroupBy; use crate::update::index_documents::{ create_sorter, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, - sorter_into_lmdb_database, CursorClonableMmap, MergeFn, WriteMethod, + sorter_into_lmdb_database, CursorClonableMmap, MergeFn, }; use crate::{Index, Result, StrStrU8Codec}; @@ -192,7 +192,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { *self.index.word_prefix_pair_proximity_docids.as_polymorph(), word_prefix_pair_proximity_docids_sorter, merge_cbo_roaring_bitmaps, - WriteMethod::GetMergePut, )?; Ok(()) diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index b1b8273ef..178684cf0 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -14,7 +14,7 @@ use crate::heed_codec::StrBEU32Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ create_sorter, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, - sorter_into_lmdb_database, CursorClonableMmap, MergeFn, WriteMethod, + sorter_into_lmdb_database, CursorClonableMmap, MergeFn, }; use crate::{Index, Result}; @@ -170,7 +170,6 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { *self.index.word_prefix_position_docids.as_polymorph(), prefix_position_docids_sorter, merge_cbo_roaring_bitmaps, - WriteMethod::GetMergePut, )?; Ok(()) From fb79c324304ea093ba6e79b04444b0b1907f9511 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 27 Jan 2022 11:00:18 +0100 Subject: [PATCH 1226/1889] Compute the new, common and, deleted prefix words fst once --- milli/src/update/index_documents/mod.rs | 43 +++++++++++++++++-- milli/src/update/word_prefix_docids.rs | 37 +++++----------- .../word_prefix_pair_proximity_docids.rs | 43 ++++++------------- 
.../update/words_prefix_position_docids.rs | 40 ++++++----------- 4 files changed, 75 insertions(+), 88 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ee80d8ada..a31d1875b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -12,6 +12,7 @@ use crossbeam_channel::{Receiver, Sender}; use log::debug; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; +use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; pub use self::helpers::{ @@ -420,6 +421,27 @@ where } builder.execute()?; + let current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // We retrieve the common words between the previous and new prefix word fst. + let common_prefix_fst_words = fst_stream_into_vec( + previous_words_prefixes_fst.op().add(¤t_prefix_fst).intersection(), + ); + let common_prefix_fst_words: Vec<_> = common_prefix_fst_words + .as_slice() + .linear_group_by_key(|x| x.chars().nth(0).unwrap()) + .collect(); + + // We retrieve the newly added words between the previous and new prefix word fst. + let new_prefix_fst_words = fst_stream_into_vec( + current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(), + ); + + // We compute the set of prefixes that are no more part of the prefix fst. + let del_prefix_fst_words = fst_stream_into_hashset( + previous_words_prefixes_fst.op().add(¤t_prefix_fst).difference(), + ); + databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen, @@ -432,7 +454,12 @@ where builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; - builder.execute(word_docids, &previous_words_prefixes_fst)?; + builder.execute( + word_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -446,7 +473,12 @@ where builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; - builder.execute(word_pair_proximity_docids, &previous_words_prefixes_fst)?; + builder.execute( + word_pair_proximity_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -466,7 +498,12 @@ where if let Some(value) = self.config.words_positions_min_level_size { builder.min_level_size(value); } - builder.execute(word_position_docids, &previous_words_prefixes_fst)?; + builder.execute( + word_position_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index cf50a5b8a..624037f8f 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -1,13 +1,10 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; -use fst::Streamer; use grenad::{CompressionType, MergerBuilder}; use heed::types::ByteSlice; -use slice_group_by::GroupBy; use crate::update::index_documents::{ - create_sorter, fst_stream_into_hashset, fst_stream_into_vec, 
merge_roaring_bitmaps, - sorter_into_lmdb_database, CursorClonableMmap, MergeFn, + create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn, }; use crate::{Index, Result}; @@ -36,24 +33,13 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixDocids::{}")] - pub fn execute>( + pub fn execute( self, new_word_docids: Vec>, - old_prefix_fst: &fst::Set, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, ) -> Result<()> { - let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - - // We retrieve the common words between the previous and new prefix word fst. - let common_prefix_fst_keys = - fst_stream_into_vec(old_prefix_fst.op().add(&prefix_fst).intersection()); - let common_prefix_fst_keys: Vec<_> = common_prefix_fst_keys - .as_slice() - .linear_group_by_key(|x| x.chars().nth(0).unwrap()) - .collect(); - - // We compute the set of prefixes that are no more part of the prefix fst. - let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); - // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( @@ -75,7 +61,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; - common_prefix_fst_keys + common_prefix_fst_words .iter() .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) } @@ -99,21 +85,18 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // We fetch the docids associated to the newly added word prefix fst only. let db = self.index.word_docids.remap_data_type::(); - let mut new_prefixes_stream = prefix_fst.op().add(old_prefix_fst).difference(); - while let Some(bytes) = new_prefixes_stream.next() { - let prefix = std::str::from_utf8(bytes)?; + for prefix in new_prefix_fst_words { + let prefix = std::str::from_utf8(prefix.as_bytes())?; for result in db.prefix_iter(self.wtxn, prefix)? { let (_word, data) = result?; prefix_docids_sorter.insert(prefix, data)?; } } - drop(new_prefixes_stream); - // We remove all the entries that are no more required in this word prefix docids database. let mut iter = self.index.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); while let Some((prefix, _)) = iter.next().transpose()? { - if suppr_pw.contains(prefix.as_bytes()) { + if del_prefix_fst_words.contains(prefix.as_bytes()) { unsafe { iter.del_current()? 
}; } } diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 5b025e4fc..530c2867e 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use grenad::{CompressionType, MergerBuilder}; use heed::types::ByteSlice; @@ -7,8 +7,8 @@ use log::debug; use slice_group_by::GroupBy; use crate::update::index_documents::{ - create_sorter, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, - sorter_into_lmdb_database, CursorClonableMmap, MergeFn, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, + MergeFn, }; use crate::{Index, Result, StrStrU8Codec}; @@ -62,40 +62,24 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute>( + pub fn execute( self, new_word_pair_proximity_docids: Vec>, - old_prefix_fst: &fst::Set, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + let new_prefix_fst_words: Vec<_> = + new_prefix_fst_words.linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); + // We retrieve and merge the created word pair proximities docids entries // for the newly added documents. let mut wppd_merger = MergerBuilder::new(merge_cbo_roaring_bitmaps); wppd_merger.extend(new_word_pair_proximity_docids); let mut wppd_iter = wppd_merger.build().into_merger_iter()?; - let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - - // We retrieve the common words between the previous and new prefix word fst. - let common_prefix_fst_keys = - fst_stream_into_vec(old_prefix_fst.op().add(&prefix_fst).intersection()); - let common_prefix_fst_keys: Vec<_> = common_prefix_fst_keys - .as_slice() - .linear_group_by_key(|x| x.chars().nth(0).unwrap()) - .collect(); - - // We retrieve the newly added words between the previous and new prefix word fst. - let new_prefix_fst_keys = - fst_stream_into_vec(prefix_fst.op().add(old_prefix_fst).difference()); - let new_prefix_fst_keys: Vec<_> = new_prefix_fst_keys - .as_slice() - .linear_group_by_key(|x| x.chars().nth(0).unwrap()) - .collect(); - - // We compute the set of prefixes that are no more part of the prefix fst. 
- let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); - let mut word_prefix_pair_proximity_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, self.chunk_compression_type, @@ -120,7 +104,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut current_prefixes, &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - &common_prefix_fst_keys, + common_prefix_fst_words, self.max_prefix_length, w1, w2, @@ -152,7 +136,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut current_prefixes, &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - &new_prefix_fst_keys, + &new_prefix_fst_words, self.max_prefix_length, w1, w2, @@ -166,7 +150,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut word_prefix_pair_proximity_docids_sorter, )?; - drop(prefix_fst); drop(db_iter); // All of the word prefix pairs in the database that have a w2 @@ -177,7 +160,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { .remap_data_type::() .iter_mut(self.wtxn)?; while let Some(((_, w2, _), _)) = iter.next().transpose()? { - if suppr_pw.contains(w2.as_bytes()) { + if del_prefix_fst_words.contains(w2.as_bytes()) { // Delete this entry as the w2 prefix is no more in the words prefix fst. unsafe { iter.del_current()? }; } diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 178684cf0..c992d01ec 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -1,20 +1,18 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::num::NonZeroU32; use std::{cmp, str}; -use fst::Streamer; use grenad::{CompressionType, MergerBuilder}; use heed::types::ByteSlice; use heed::{BytesDecode, BytesEncode}; use log::debug; -use slice_group_by::GroupBy; use crate::error::SerializationError; use crate::heed_codec::StrBEU32Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ - create_sorter, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, - sorter_into_lmdb_database, CursorClonableMmap, MergeFn, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, + MergeFn, }; use crate::{Index, Result}; @@ -57,26 +55,15 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixPositionDocids::{}")] - pub fn execute>( + pub fn execute( self, new_word_position_docids: Vec>, - old_prefix_fst: &fst::Set, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, ) -> Result<()> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); - let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; - - // We retrieve the common words between the previous and new prefix word fst. - let common_prefix_fst_keys = - fst_stream_into_vec(old_prefix_fst.op().add(&prefix_fst).intersection()); - let common_prefix_fst_keys: Vec<_> = common_prefix_fst_keys - .as_slice() - .linear_group_by_key(|x| x.chars().nth(0).unwrap()) - .collect(); - - // We compute the set of prefixes that are no more part of the prefix fst. 
- let suppr_pw = fst_stream_into_hashset(old_prefix_fst.op().add(&prefix_fst).difference()); - let mut prefix_position_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, self.chunk_compression_type, @@ -104,7 +91,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { &mut prefixes_cache, &mut prefix_position_docids_sorter, )?; - common_prefix_fst_keys.iter().find(|prefixes| word.starts_with(&prefixes[0])) + common_prefix_fst_words.iter().find(|prefixes| word.starts_with(&prefixes[0])) } }; @@ -129,16 +116,15 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { // We fetch the docids associated to the newly added word prefix fst only. let db = self.index.word_position_docids.remap_data_type::(); - let mut new_prefixes_stream = prefix_fst.op().add(old_prefix_fst).difference(); - while let Some(prefix_bytes) = new_prefixes_stream.next() { - let prefix = str::from_utf8(prefix_bytes).map_err(|_| { + for prefix_bytes in new_prefix_fst_words { + let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } })?; // iter over all lines of the DB where the key is prefixed by the current prefix. let iter = db .remap_key_type::() - .prefix_iter(self.wtxn, prefix_bytes)? + .prefix_iter(self.wtxn, prefix_bytes.as_bytes())? .remap_key_type::(); for result in iter { let ((word, pos), data) = result?; @@ -150,14 +136,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { } } - drop(new_prefixes_stream); - // We remove all the entries that are no more required in this word prefix position // docids database. let mut iter = self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data(); while let Some(((prefix, _), _)) = iter.next().transpose()? { - if suppr_pw.contains(prefix.as_bytes()) { + if del_prefix_fst_words.contains(prefix.as_bytes()) { unsafe { iter.del_current()? }; } } From d852dc0d2bacede8b9ce3806bba9d3a9a4e29573 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 1 Feb 2022 20:10:16 +0100 Subject: [PATCH 1227/1889] fix phrase search --- cli/src/main.rs | 12 +++++++- milli/src/search/criteria/mod.rs | 49 +++++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 9d807e8c6..11e203f4d 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -448,8 +448,10 @@ impl Search { #[derive(Debug, StructOpt)] struct SettingsUpdate { - #[structopt(short, long)] + #[structopt(long)] filterable_attributes: Option>, + #[structopt(long)] + criteria: Option>, } impl Performer for SettingsUpdate { @@ -468,6 +470,14 @@ impl Performer for SettingsUpdate { } } + if let Some(criteria) = self.criteria { + if !criteria.is_empty() { + update.set_criteria(criteria); + } else { + update.reset_criteria(); + } + } + let mut bars = Vec::new(); let progesses = MultiProgress::new(); for _ in 0..4 { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 0cad7c013..40b426198 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::collections::HashMap; +use itertools::Itertools; use roaring::RoaringBitmap; use self::asc_desc::AscDesc; @@ -318,21 +319,41 @@ pub fn resolve_query_tree<'t>( } Phrase(words) => { let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); - match ctx.word_pair_proximity_docids(left, right, 1)? 
{
-                    Some(pair_docids) => {
-                        if pair_docids.is_empty() {
-                            return Ok(RoaringBitmap::new());
-                        } else if first_loop {
-                            candidates = pair_docids;
-                            first_loop = false;
-                        } else {
-                            candidates &= pair_docids;
-                        }
+            let mut first_iter = true;
+            let winsize = words.len().min(7);
+
+            for win in words.windows(winsize) {
+                // Get all the word pairs and compute their relative distance
+                let dists = win
+                    .iter()
+                    .enumerate()
+                    .cartesian_product(win.iter().enumerate())
+                    .filter(|(x, y)| y > x)
+                    .map(|((pos1, s1), (pos2, s2))| (s1, s2, pos2 - pos1));
+
+                let mut bitmaps = Vec::with_capacity(winsize.pow(2));
+
+                for (s1, s2, d) in dists {
+                    match ctx.word_pair_proximity_docids(s1, s2, d as u8)? {
+                        Some(m) => bitmaps.push(m),
+                        None => return Ok(RoaringBitmap::new()),
+                    }
+                }
+
+                // We sort the bitmaps so that we perform the small intersections first, which is faster.
+                bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len()));
+
+                for bitmap in bitmaps {
+                    if first_iter {
+                        candidates = bitmap;
+                        first_iter = false;
+                    } else {
+                        candidates &= bitmap;
+                    }
+                    // There will be no match, return early
+                    if candidates.is_empty() {
+                        break;
                    }
-                    None => return Ok(RoaringBitmap::new()),
                }
            }
            Ok(candidates)
From 642c01d0dc4f0f55ce6ffaa1e1184da11c698cac Mon Sep 17 00:00:00 2001
From: mpostma
Date: Thu, 20 Jan 2022 18:34:54 +0100
Subject: [PATCH 1228/1889] set max typos on ngram to 1

---
 milli/src/search/query_tree.rs | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 0744231ae..d4cc338c8 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -260,12 +260,12 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<Operation>> {
 
-fn typos(word: String, authorize_typos: bool) -> QueryKind {
+fn typos(word: String, authorize_typos: bool, max_typos: u8) -> QueryKind {
     if authorize_typos {
         match word.chars().count() {
             0..=4 => QueryKind::exact(word),
-            5..=8 => QueryKind::tolerant(1, word),
-            _ => QueryKind::tolerant(2, word),
+            5..=8 => QueryKind::tolerant(1.min(max_typos), word),
+            _ => QueryKind::tolerant(2.min(max_typos), word),
         }
     } else {
         QueryKind::exact(word)
     }
@@ -316,8 +316,10 @@ fn create_query_tree(
         if let Some(child) = split_best_frequency(ctx, &word)?
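            // (`split_best_frequency` proposes the best two-word split of the word,
            // pushed as an alternative alongside the typo-tolerant query below)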
{ children.push(child); } - children - .push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); + children.push(Operation::Query(Query { + prefix, + kind: typos(word, authorize_typos, 2), + })); Ok(Operation::or(false, children)) } // create a CONSECUTIVE operation wrapping all word in the phrase @@ -363,8 +365,9 @@ fn create_query_tree( .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); - let query = - Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; + let query = Query { prefix: is_prefix, kind: typos(concat, true, 1) }; + // let query = + // Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; operations.push(Operation::Query(query)); and_op_children.push(Operation::or(false, operations)); } From 55e6cb9c7b179181e1e131265b0a66da76a76250 Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 20 Jan 2022 18:35:11 +0100 Subject: [PATCH 1229/1889] typos on first letter counts as 2 --- Cargo.toml | 3 +++ milli/src/search/mod.rs | 37 +++++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6b3e12f07..9b97dee88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,3 +18,6 @@ opt-level = 3 opt-level = 3 [profile.test.build-override] opt-level = 3 + +[patch.crates-io] +fst = { path = "/Users/mpostma/Documents/code/rust/fst/" } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7c8722187..6b2e50c94 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -7,7 +7,8 @@ use std::str::Utf8Error; use std::time::Instant; use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; -use fst::{IntoStreamer, Streamer}; +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use log::debug; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; @@ -285,19 +286,39 @@ pub fn word_derivations<'c>( Entry::Vacant(entry) => { let mut derived_words = Vec::new(); let dfa = build_dfa(word, max_typo, is_prefix); - let mut stream = fst.search_with_state(&dfa).into_stream(); + if max_typo == 1 { + let starts = Str::new(get_first(word)); + let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream(); - while let Some((word, state)) = stream.next() { - let word = std::str::from_utf8(word)?; - let distance = dfa.distance(state); - derived_words.push((word.to_string(), distance.to_u8())); + while let Some((word, state)) = stream.next() { + let word = std::str::from_utf8(word)?; + let distance = dfa.distance(state.1); + derived_words.push((word.to_string(), distance.to_u8())); + } + + Ok(entry.insert(derived_words)) + } else { + let mut stream = fst.search_with_state(&dfa).into_stream(); + + while let Some((word, state)) = stream.next() { + let word = std::str::from_utf8(word)?; + let distance = dfa.distance(state); + derived_words.push((word.to_string(), distance.to_u8())); + } + + Ok(entry.insert(derived_words)) } - - Ok(entry.insert(derived_words)) } } } +fn get_first(s: &str) -> &str { + match s.chars().next() { + Some(c) => &s[..c.len_utf8()], + None => s, + } +} + pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { let lev = match typos { 0 => &LEVDIST0, From d0aabde502f3450b8c26a8cd8e6ee0240bd7cf1a Mon Sep 17 00:00:00 2001 From: mpostma Date: Thu, 20 Jan 2022 23:23:07 +0100 Subject: [PATCH 1230/1889] optimize 2 typos case --- milli/src/search/mod.rs | 54 
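// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// The `typos` helper changed by patch 1228 above derives a typo budget from the
// word's character count (0 typos up to 4 chars, 1 typo for 5..=8 chars, 2
// beyond) and now clamps it with `max_typos`, which is how ngrams end up capped
// at a single typo. The policy in isolation, with `word_len` standing in for
// `word.chars().count()`:
fn allowed_typos(word_len: usize, authorize_typos: bool, max_typos: u8) -> u8 {
    if !authorize_typos {
        return 0;
    }
    let by_length: u8 = match word_len {
        0..=4 => 0,
        5..=8 => 1,
        _ => 2,
    };
    by_length.min(max_typos)
}
// e.g. a 12-char ngram built from two query words: allowed_typos(12, true, 1) == 1,
// while a plain 12-char word keeps its full budget: allowed_typos(12, true, 2) == 2.
// ---- end of editor's aside ----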
++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 6b2e50c94..cf596fa7a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -285,29 +285,49 @@ pub fn word_derivations<'c>( Entry::Occupied(entry) => Ok(entry.into_mut()), Entry::Vacant(entry) => { let mut derived_words = Vec::new(); - let dfa = build_dfa(word, max_typo, is_prefix); - if max_typo == 1 { - let starts = Str::new(get_first(word)); - let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream(); + if max_typo == 0 { + if is_prefix { + let prefix = Str::new(word).starts_with(); + let mut stream = fst.search(prefix).into_stream(); - while let Some((word, state)) = stream.next() { - let word = std::str::from_utf8(word)?; - let distance = dfa.distance(state.1); - derived_words.push((word.to_string(), distance.to_u8())); + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word)?; + derived_words.push((word.to_string(), 0)); + } + } else { + let automaton = Str::new(word); + let mut stream = fst.search(automaton).into_stream(); + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word)?; + derived_words.push((word.to_string(), 0)); + } } - - Ok(entry.insert(derived_words)) } else { - let mut stream = fst.search_with_state(&dfa).into_stream(); + if max_typo == 1 { + let dfa = build_dfa(word, 1, is_prefix); + let starts = Str::new(get_first(word)).starts_with(); + let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream(); - while let Some((word, state)) = stream.next() { - let word = std::str::from_utf8(word)?; - let distance = dfa.distance(state); - derived_words.push((word.to_string(), distance.to_u8())); + while let Some((word, state)) = stream.next() { + let word = std::str::from_utf8(word)?; + let distance = dfa.distance(state.1); + derived_words.push((word.to_string(), distance.to_u8())); + } + } else { + let starts = Str::new(get_first(word)).starts_with(); + let first = build_dfa(word, 1, is_prefix).intersection((&starts).complement()); + let second = build_dfa(word, 2, is_prefix).intersection(&starts); + let automaton = first.union(second); + + let mut stream = fst.search(automaton).into_stream(); + + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word)?; + derived_words.push((word.to_string(), 2)); + } } - - Ok(entry.insert(derived_words)) } + Ok(entry.insert(derived_words)) } } } From 7541ab99cdcc0f60fd92895f697a7e628d983b58 Mon Sep 17 00:00:00 2001 From: mpostma Date: Tue, 25 Jan 2022 10:06:27 +0100 Subject: [PATCH 1231/1889] review changes --- Cargo.toml | 3 --- milli/src/search/mod.rs | 18 ++++++------------ milli/src/search/query_tree.rs | 2 -- 3 files changed, 6 insertions(+), 17 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9b97dee88..6b3e12f07 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,3 @@ opt-level = 3 opt-level = 3 [profile.test.build-override] opt-level = 3 - -[patch.crates-io] -fst = { path = "/Users/mpostma/Documents/code/rust/fst/" } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index cf596fa7a..67b86d6bf 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -294,24 +294,18 @@ pub fn word_derivations<'c>( let word = std::str::from_utf8(word)?; derived_words.push((word.to_string(), 0)); } - } else { - let automaton = Str::new(word); - let mut stream = fst.search(automaton).into_stream(); - while let Some(word) = stream.next() { - 
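// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// Patches 1229/1230 above make a typo on the first letter cost the whole
// budget: candidates that keep the query's first character may differ by two
// edits, all other candidates by only one. With the `fst` automaton combinators
// this is a union of two intersections. A standalone sketch reusing the
// `build_dfa` and `get_first` helpers shown above (non-prefix case, recording a
// pessimistic distance like the reviewed version does):
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Set, Streamer};

fn two_typo_derivations(
    words_fst: &Set<Vec<u8>>,
    word: &str,
) -> Result<Vec<(String, u8)>, std::str::Utf8Error> {
    let starts = Str::new(get_first(word)).starts_with();
    // one typo allowed when the first letter changed...
    let first_changed = build_dfa(word, 1, false).intersection((&starts).complement());
    // ...two typos allowed when the first letter is kept
    let first_kept = build_dfa(word, 2, false).intersection(&starts);

    let mut derived = Vec::new();
    let mut stream = words_fst.search(first_changed.union(first_kept)).into_stream();
    while let Some(bytes) = stream.next() {
        derived.push((std::str::from_utf8(bytes)?.to_string(), 2));
    }
    Ok(derived)
}
// ---- end of editor's aside ----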
let word = std::str::from_utf8(word)?; - derived_words.push((word.to_string(), 0)); - } + } else if fst.contains(word) { + derived_words.push((word.to_string(), 0)); } } else { if max_typo == 1 { let dfa = build_dfa(word, 1, is_prefix); let starts = Str::new(get_first(word)).starts_with(); - let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream(); + let mut stream = fst.search(starts.intersection(&dfa)).into_stream(); - while let Some((word, state)) = stream.next() { + while let Some(word) = stream.next() { let word = std::str::from_utf8(word)?; - let distance = dfa.distance(state.1); - derived_words.push((word.to_string(), distance.to_u8())); + derived_words.push((word.to_string(), 1)); } } else { let starts = Str::new(get_first(word)).starts_with(); @@ -335,7 +329,7 @@ pub fn word_derivations<'c>( fn get_first(s: &str) -> &str { match s.chars().next() { Some(c) => &s[..c.len_utf8()], - None => s, + None => panic!("unexpected empty query"), } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index d4cc338c8..355e42663 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -366,8 +366,6 @@ fn create_query_tree( let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); let query = Query { prefix: is_prefix, kind: typos(concat, true, 1) }; - // let query = - // Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; operations.push(Operation::Query(query)); and_op_children.push(Operation::or(false, operations)); } From d59bcea74978ae24d8aa368254df0889ff0eec31 Mon Sep 17 00:00:00 2001 From: Many Date: Thu, 18 Nov 2021 17:04:09 +0100 Subject: [PATCH 1232/1889] Revert "Revert "Change chunk size to 4MiB to fit more the end user usage"" --- milli/src/update/index_documents/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4fbb75d5f..f238c7546 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -243,7 +243,7 @@ where let chunk_iter = grenad_obkv_into_chunks( documents_file, params.clone(), - self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 128), // 128MiB + self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB ); let result = chunk_iter.map(|chunk_iter| { From 9142ba9dd466c4dd2d79dcfc8c056adecd5e79bc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 2 Feb 2022 17:55:13 +0100 Subject: [PATCH 1233/1889] Fix the parsing of ndjson requests to index more than the first line --- http-ui/src/main.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 039a6c2ae..ebfe4b073 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1063,12 +1063,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let mut buf = String::new(); - let mut reader = BufReader::new(reader); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; + for result in BufReader::new(reader).lines() { + let line = result?; + documents.extend_from_json(Cursor::new(line))?; } + documents.finish()?; Ok(writer.into_inner()) From 13de2510474f55851ef770d7671542fec12d47ec Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 3 Feb 2022 15:01:34 +0100 Subject: [PATCH 1234/1889] rewrite word pair distance gathering --- milli/src/search/criteria/mod.rs | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 40b426198..8306f5d0e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::collections::HashMap; -use itertools::Itertools; use roaring::RoaringBitmap; use self::asc_desc::AscDesc; @@ -323,20 +322,16 @@ pub fn resolve_query_tree<'t>( let winsize = words.len().min(7); for win in words.windows(winsize) { - // Get all the word pairs and their compute their relative distance - let dists = win - .iter() - .enumerate() - .cartesian_product(win.iter().enumerate()) - .filter(|(x, y)| y > x) - .map(|((pos1, s1), (pos2, s2))| (s1, s2, pos2 - pos1)); - + // Get all the documents with the matching distance for each word pairs. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - - for (s1, s2, d) in dists { - match ctx.word_pair_proximity_docids(s1, s2, d as u8)? { - Some(m) => bitmaps.push(m), - None => return Ok(RoaringBitmap::new()), + for (offset, s1) in win.iter().enumerate() { + for (dist, s2) in win.iter().skip(offset).enumerate() { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this distance, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } } } From bd2262ceeaf3fa9b87724d8ae56d69f426504ed6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 3 Feb 2022 15:46:11 +0100 Subject: [PATCH 1235/1889] allow null values in csv --- milli/src/documents/builder.rs | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index f95fa9190..2860c4b86 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -118,16 +118,26 @@ impl DocumentBatchBuilder { for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { AllowedType::Number => { - value.parse::().map(Value::from).map_err(|error| { - Error::ParseFloat { - error, - // +1 for the header offset. - line: i + 1, - value: value.to_string(), - } - })? + if value.trim().is_empty() { + Value::Null + } else { + value.trim().parse::().map(Value::from).map_err(|error| { + Error::ParseFloat { + error, + // +1 for the header offset. + line: i + 1, + value: value.to_string(), + } + })? 
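// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// Patch 1227 above resolves a phrase by intersecting, over sliding windows of
// at most 7 words, the docids of every in-window word pair at its exact
// relative distance; patch 1234 then rewrites the pair gathering with plain
// loops. A consolidated sketch of the final shape, assuming a non-empty `words`
// slice and a `pair_docids(left, right, proximity)` lookup standing in for
// `Context::word_pair_proximity_docids`:
use roaring::RoaringBitmap;

fn resolve_phrase_sketch(
    words: &[String],
    mut pair_docids: impl FnMut(&str, &str, u8) -> Option<RoaringBitmap>,
) -> RoaringBitmap {
    let mut candidates = RoaringBitmap::new();
    let mut first_iter = true;
    let winsize = words.len().min(7);

    for win in words.windows(winsize) {
        // Gather the docids of every (left, right, distance) pair in the
        // window; `skip(offset + 1)` steps over the word itself, so adjacent
        // words map to proximity 1, the next pair to proximity 2, and so on.
        let mut bitmaps = Vec::with_capacity(winsize.pow(2));
        for (offset, s1) in win.iter().enumerate() {
            for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
                match pair_docids(s1, s2, dist as u8 + 1) {
                    Some(m) => bitmaps.push(m),
                    // a missing pair means the phrase cannot match at all
                    None => return RoaringBitmap::new(),
                }
            }
        }

        // Intersect the smallest bitmaps first: cheaper, and it shrinks
        // `candidates` as early as possible.
        bitmaps.sort_unstable_by_key(|b| b.len());
        for bitmap in bitmaps {
            if first_iter {
                candidates = bitmap;
                first_iter = false;
            } else {
                candidates &= bitmap;
            }
            if candidates.is_empty() {
                return candidates; // early exit, no document can match
            }
        }
    }
    candidates
}
// ---- end of editor's aside ----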
+ } + } + AllowedType::String => { + if value.is_empty() { + Value::Null + } else { + Value::String(value.to_string()) + } } - AllowedType::String => Value::String(value.to_string()), }; this.value_buffer.clear(); From 1279c38ac957a3295670049c2eb38e176e29528e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Sat, 5 Feb 2022 18:29:11 +0100 Subject: [PATCH 1236/1889] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 1df4fd69c..c26083cf8 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 Meili SAS +Copyright (c) 2021-2022 Meilisearch Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From d03b3ceb58cc546b007cfe9745ef67e04dd094af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 7 Feb 2022 18:39:26 +0100 Subject: [PATCH 1237/1889] Update version for the next release (v0.22.1) --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index e07cd9037..6fa5a2be0 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.22.0" +version = "0.22.1" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index c091ccd4c..399b39bdf 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.22.0" +version = "0.22.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 94ba4beff..767098f0b 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.22.0" +version = "0.22.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 9359e76ce..8fc3a4f06 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.22.0" +version = "0.22.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 6b830c29e..63d0785cb 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.22.0" +version = "0.22.1" authors = ["Kerollmops "] edition = "2018" From 48542ac8fd8fb2009f932645a446a83f3535960d Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 15 Feb 2022 11:41:55 +0100 Subject: [PATCH 1238/1889] get rid of chrono in favor of time --- milli/Cargo.toml | 2 +- milli/src/index.rs | 33 ++++++++++++++++++---------- milli/src/update/clear_documents.rs | 4 ++-- milli/src/update/delete_documents.rs | 4 ++-- milli/src/update/facets.rs | 4 ++-- milli/src/update/settings.rs | 4 ++-- 6 files changed, 30 insertions(+), 21 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 63d0785cb..7957058f8 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,7 +9,6 @@ bimap = { version = "0.6.1", features = ["serde"] } bincode = "1.3.3" bstr = "0.2.15" byteorder = "1.4.2" -chrono = { version = "0.4.19", features = ["serde"] } concat-arrays = "0.1.2" crossbeam-channel = "0.5.1" either = "1.6.1" @@ -36,6 +35,7 @@ slice-group-by = "0.2.6" smallstr = { version = "0.2.0", 
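// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// Patch 1235 above maps empty CSV cells to JSON `null` instead of failing the
// float parse (for number columns) or storing an empty string (for string
// columns). The decision table in isolation, with a simplified error type:
use serde_json::Value;

enum AllowedType {
    Number,
    String,
}

fn csv_cell_to_value(cell: &str, ty: &AllowedType) -> Result<Value, std::num::ParseFloatError> {
    Ok(match ty {
        AllowedType::Number if cell.trim().is_empty() => Value::Null,
        AllowedType::Number => Value::from(cell.trim().parse::<f64>()?),
        AllowedType::String if cell.is_empty() => Value::Null,
        AllowedType::String => Value::String(cell.to_string()),
    })
}
// ---- end of editor's aside ----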
features = ["serde"] } smallvec = "1.6.1" tempfile = "3.2.0" +time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } uuid = { version = "0.8.2", features = ["v4"] } filter-parser = { path = "../filter-parser" } diff --git a/milli/src/index.rs b/milli/src/index.rs index 70081dfb0..568d50ad8 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -3,12 +3,12 @@ use std::collections::{HashMap, HashSet}; use std::mem::size_of; use std::path::Path; -use chrono::{DateTime, Utc}; use heed::flags::Flags; use heed::types::*; use heed::{Database, PolyDatabase, RoTxn, RwTxn}; use roaring::RoaringBitmap; use rstar::RTree; +use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; @@ -156,10 +156,19 @@ impl Index { fn initialize_creation_dates(env: &heed::Env, main: PolyDatabase) -> heed::Result<()> { let mut txn = env.write_txn()?; // The db was just created, we update its metadata with the relevant information. - if main.get::<_, Str, SerdeJson>>(&txn, main_key::CREATED_AT_KEY)?.is_none() { - let now = Utc::now(); - main.put::<_, Str, SerdeJson>>(&mut txn, main_key::UPDATED_AT_KEY, &now)?; - main.put::<_, Str, SerdeJson>>(&mut txn, main_key::CREATED_AT_KEY, &now)?; + if main.get::<_, Str, SerdeJson>(&txn, main_key::CREATED_AT_KEY)?.is_none() + { + let now = OffsetDateTime::now_utc(); + main.put::<_, Str, SerdeJson>( + &mut txn, + main_key::UPDATED_AT_KEY, + &now, + )?; + main.put::<_, Str, SerdeJson>( + &mut txn, + main_key::CREATED_AT_KEY, + &now, + )?; txn.commit()?; } Ok(()) @@ -219,7 +228,7 @@ impl Index { /// Writes the documents primary key, this is the field name that is used to store the id. pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { - self.set_updated_at(wtxn, &Utc::now())?; + self.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key) } @@ -829,10 +838,10 @@ impl Index { } /// Returns the index creation time. - pub fn created_at(&self, rtxn: &RoTxn) -> Result> { + pub fn created_at(&self, rtxn: &RoTxn) -> Result { Ok(self .main - .get::<_, Str, SerdeJson>>(rtxn, main_key::CREATED_AT_KEY)? + .get::<_, Str, SerdeJson>(rtxn, main_key::CREATED_AT_KEY)? .ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, key: Some(main_key::CREATED_AT_KEY), @@ -840,10 +849,10 @@ impl Index { } /// Returns the index last updated time. - pub fn updated_at(&self, rtxn: &RoTxn) -> Result> { + pub fn updated_at(&self, rtxn: &RoTxn) -> Result { Ok(self .main - .get::<_, Str, SerdeJson>>(rtxn, main_key::UPDATED_AT_KEY)? + .get::<_, Str, SerdeJson>(rtxn, main_key::UPDATED_AT_KEY)? 
.ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, key: Some(main_key::UPDATED_AT_KEY), @@ -853,9 +862,9 @@ impl Index { pub(crate) fn set_updated_at( &self, wtxn: &mut RwTxn, - time: &DateTime, + time: &OffsetDateTime, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>>(wtxn, main_key::UPDATED_AT_KEY, &time) + self.main.put::<_, Str, SerdeJson>(wtxn, main_key::UPDATED_AT_KEY, &time) } } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 8c9178d4e..644547b91 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,5 +1,5 @@ -use chrono::Utc; use roaring::RoaringBitmap; +use time::OffsetDateTime; use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; @@ -14,7 +14,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { } pub fn execute(self) -> Result { - self.index.set_updated_at(self.wtxn, &Utc::now())?; + self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; let Index { env: _env, main: _main, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 19f1d9f42..2391bd0e4 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,13 +1,13 @@ use std::collections::btree_map::Entry; use std::collections::HashMap; -use chrono::Utc; use fst::IntoStreamer; use heed::types::ByteSlice; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; +use time::OffsetDateTime; use super::ClearDocuments; use crate::error::{InternalError, SerializationError, UserError}; @@ -61,7 +61,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } pub fn execute(self) -> Result { - self.index.set_updated_at(self.wtxn, &Utc::now())?; + self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; let current_documents_ids_len = documents_ids.len(); diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index a2f17cba3..83a3d2267 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -2,12 +2,12 @@ use std::fs::File; use std::num::{NonZeroU8, NonZeroUsize}; use std::{cmp, mem}; -use chrono::Utc; use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; +use time::OffsetDateTime; use crate::error::InternalError; use crate::heed_codec::facet::{ @@ -53,7 +53,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { #[logging_timer::time("Facets::{}")] pub fn execute(self) -> Result<()> { - self.index.set_updated_at(self.wtxn, &Utc::now())?; + self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; // We get the faceted fields to be able to create the facet levels. 
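// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// Patch 1238 above swaps `chrono::DateTime<Utc>` for `time::OffsetDateTime` in
// the index's created/updated timestamps, with the `serde-well-known` feature
// enabled in Cargo.toml. The index itself stores the value through heed's
// `SerdeJson` codec; as a standalone illustration of the same crate features,
// RFC 3339 (de)serialization can also be requested per field:
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;

#[derive(Serialize, Deserialize)]
struct IndexDates {
    #[serde(with = "time::serde::rfc3339")]
    created_at: OffsetDateTime,
    #[serde(with = "time::serde::rfc3339")]
    updated_at: OffsetDateTime,
}

fn fresh_dates() -> IndexDates {
    let now = OffsetDateTime::now_utc(); // replaces chrono's `Utc::now()`
    IndexDates { created_at: now, updated_at: now }
}
// ---- end of editor's aside ----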
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 91ef187f5..c413f81c3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,10 +1,10 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::result::Result as StdResult; -use chrono::Utc; use itertools::Itertools; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use time::OffsetDateTime; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; @@ -454,7 +454,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { where F: Fn(UpdateIndexingStep) + Sync, { - self.index.set_updated_at(self.wtxn, &Utc::now())?; + self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?; let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; From 84035a27f52eb6f592683f65db25bd974aa3ab8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Tue, 15 Feb 2022 15:52:50 +0100 Subject: [PATCH 1239/1889] Update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index c26083cf8..dbb16251e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021-2022 Meilisearch +Copyright (c) 2021-2022 Meili SAS Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 0defeb268c9bc88a1c445bd82f12b055f95c0a9d Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 16 Feb 2022 13:27:41 +0100 Subject: [PATCH 1240/1889] bump milli --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 6fa5a2be0..ee775eebe 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.22.1" +version = "0.23.0" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 399b39bdf..f1458ae9c 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.22.1" +version = "0.23.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 767098f0b..19445673a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.22.1" +version = "0.23.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 8fc3a4f06..781dfbd79 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.22.1" +version = "0.23.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 7957058f8..b4e3e7695 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.22.1" +version = "0.23.0" authors = ["Kerollmops "] edition = "2018" From f367cc2e75971f512922a1276b00f556553f1daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 16 Feb 2022 15:28:48 +0100 Subject: [PATCH 1241/1889] Finally bump grenad to v0.4.1 --- 
milli/Cargo.toml | 2 +- milli/src/error.rs | 7 ++ milli/src/update/facets.rs | 6 +- .../extract/extract_docid_word_positions.rs | 7 +- .../extract/extract_facet_number_docids.rs | 7 +- .../extract/extract_facet_string_docids.rs | 7 +- .../extract/extract_fid_docid_facet_values.rs | 7 +- .../extract/extract_fid_word_count_docids.rs | 7 +- .../extract/extract_geo_points.rs | 15 +++-- .../extract/extract_word_docids.rs | 7 +- .../extract_word_pair_proximity_docids.rs | 7 +- .../extract/extract_word_position_docids.rs | 7 +- .../index_documents/helpers/grenad_helpers.rs | 66 +++++++++++-------- milli/src/update/index_documents/transform.rs | 14 ++-- .../src/update/index_documents/typed_chunk.rs | 40 ++++++----- milli/src/update/word_prefix_docids.rs | 6 +- .../word_prefix_pair_proximity_docids.rs | 6 +- .../update/words_prefix_position_docids.rs | 6 +- 18 files changed, 130 insertions(+), 94 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 6b830c29e..9197fa818 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -16,7 +16,7 @@ either = "1.6.1" flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" -grenad = { version = "0.3.1", default-features = false, features = ["tempfile"] } +grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" diff --git a/milli/src/error.rs b/milli/src/error.rs index 47c9a5993..dce23582a 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -29,6 +29,7 @@ pub enum InternalError { FieldIdMapMissingEntry(FieldIdMapMissingEntry), Fst(fst::Error), GrenadInvalidCompressionType, + GrenadInvalidFormatVersion, IndexingMergingKeys { process: &'static str }, InvalidDatabaseTyping, RayonThreadPool(ThreadPoolBuildError), @@ -97,6 +98,9 @@ where grenad::Error::InvalidCompressionType => { Error::InternalError(InternalError::GrenadInvalidCompressionType) } + grenad::Error::InvalidFormatVersion => { + Error::InternalError(InternalError::GrenadInvalidFormatVersion) + } } } } @@ -186,6 +190,9 @@ impl fmt::Display for InternalError { Self::GrenadInvalidCompressionType => { f.write_str("Invalid compression type have been specified to grenad.") } + Self::GrenadInvalidFormatVersion => { + f.write_str("Invalid grenad file with an invalid version format.") + } Self::IndexingMergingKeys { process } => { write!(f, "Invalid merge while processing {}.", process) } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 19684c6ea..53305cdee 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -160,8 +160,7 @@ fn compute_facet_number_levels<'t>( // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transfering them. - let mut writer = tempfile::tempfile() - .and_then(|file| create_writer(compression_type, compression_level, file))?; + let mut writer = create_writer(compression_type, compression_level, tempfile::tempfile()?); let level_0_range = { let left = (field_id, 0, f64::MIN, f64::MIN); @@ -279,8 +278,7 @@ fn compute_facet_string_levels<'t>( // It is forbidden to keep a cursor and write in a database at the same time with LMDB // therefore we write the facet levels entries into a grenad file before transfering them. 
- let mut writer = tempfile::tempfile() - .and_then(|file| create_writer(compression_type, compression_level, file))?; + let mut writer = create_writer(compression_type, compression_level, tempfile::tempfile()?); // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index fa1381412..44bf9dbf7 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -18,8 +18,8 @@ use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_A /// Returns the generated internal documents ids and a grenad reader /// with the list of extracted words from the given chunk of documents. #[logging_timer::time] -pub fn extract_docid_word_positions( - mut obkv_documents: grenad::Reader, +pub fn extract_docid_word_positions( + obkv_documents: grenad::Reader, indexer: GrenadParameters, searchable_fields: &Option>, stop_words: Option<&fst::Set<&[u8]>>, @@ -46,7 +46,8 @@ pub fn extract_docid_word_positions( } let analyzer = Analyzer::>::new(AnalyzerConfig::default()); - while let Some((key, value)) = obkv_documents.next()? { + let mut cursor = obkv_documents.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { let document_id = key .try_into() .map(u32::from_be_bytes) diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 5480bd605..fa63d9549 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -14,8 +14,8 @@ use crate::Result; /// Returns a grenad reader with the list of extracted facet numbers and /// documents ids from the given chunk of docid facet number positions. #[logging_timer::time] -pub fn extract_facet_number_docids( - mut docid_fid_facet_number: grenad::Reader, +pub fn extract_facet_number_docids( + docid_fid_facet_number: grenad::Reader, indexer: GrenadParameters, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); @@ -28,7 +28,8 @@ pub fn extract_facet_number_docids( max_memory, ); - while let Some((key_bytes, _)) = docid_fid_facet_number.next()? { + let mut cursor = docid_fid_facet_number.into_cursor()?; + while let Some((key_bytes, _)) = cursor.move_on_next()? { let (field_id, document_id, number) = FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index e08d062cf..8209d817b 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -16,8 +16,8 @@ use crate::{FieldId, Result}; /// Returns a grenad reader with the list of extracted facet strings and /// documents ids from the given chunk of docid facet string positions. 
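// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// The grenad v0.4 migration above applies one mechanical recipe to every
// extractor: readers now require `io::Read + io::Seek`, and iteration goes
// through an owned cursor instead of calling `.next()` on the reader itself.
// The before/after shape on an arbitrary `grenad::Reader`:
use std::io;

fn sum_value_lengths<R: io::Read + io::Seek>(
    reader: grenad::Reader<R>,
) -> Result<u64, grenad::Error> {
    let mut total = 0;
    // v0.3: `while let Some((key, value)) = reader.next()?`
    // v0.4: consume the reader into a cursor, then advance it explicitly
    let mut cursor = reader.into_cursor()?;
    while let Some((_key, value)) = cursor.move_on_next()? {
        total += value.len() as u64;
    }
    Ok(total)
}
// ---- end of editor's aside ----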
#[logging_timer::time] -pub fn extract_facet_string_docids( - mut docid_fid_facet_string: grenad::Reader, +pub fn extract_facet_string_docids( + docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); @@ -32,7 +32,8 @@ pub fn extract_facet_string_docids( let mut key_buffer = Vec::new(); let mut value_buffer = Vec::new(); - while let Some((key, original_value_bytes)) = docid_fid_facet_string.next()? { + let mut cursor = docid_fid_facet_string.into_cursor()?; + while let Some((key, original_value_bytes)) = cursor.move_on_next()? { let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap(); diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index a1bf0b1e3..628636f78 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -16,8 +16,8 @@ use crate::{DocumentId, FieldId, Result}; /// Returns the generated grenad reader containing the docid the fid and the orginal value as key /// and the normalized value as value extracted from the given chunk of documents. #[logging_timer::time] -pub fn extract_fid_docid_facet_values( - mut obkv_documents: grenad::Reader, +pub fn extract_fid_docid_facet_values( + obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, ) -> Result<(grenad::Reader, grenad::Reader)> { @@ -40,7 +40,8 @@ pub fn extract_fid_docid_facet_values( ); let mut key_buffer = Vec::new(); - while let Some((docid_bytes, value)) = obkv_documents.next()? { + let mut cursor = obkv_documents.into_cursor()?; + while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::new(value); for (field_id, field_bytes) in obkv.iter() { diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 4e25cb4f6..85a65ee14 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -18,8 +18,8 @@ use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; /// Returns a grenad reader with the list of extracted field id word counts /// and documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_fid_word_count_docids( - mut docid_word_positions: grenad::Reader, +pub fn extract_fid_word_count_docids( + docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); @@ -36,7 +36,8 @@ pub fn extract_fid_word_count_docids( let mut document_fid_wordcount = HashMap::new(); let mut current_document_id = None; - while let Some((key, value)) = docid_word_positions.next()? { + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ let (document_id_bytes, _word_bytes) = try_split_array_at(key) .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index a36b608ee..e58d351d6 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -10,17 +10,20 @@ use crate::{FieldId, InternalError, Result, UserError}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. /// /// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude) -pub fn extract_geo_points( - mut obkv_documents: grenad::Reader, +pub fn extract_geo_points( + obkv_documents: grenad::Reader, indexer: GrenadParameters, primary_key_id: FieldId, geo_field_id: FieldId, ) -> Result> { - let mut writer = tempfile::tempfile().and_then(|file| { - create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) - })?; + let mut writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); - while let Some((docid_bytes, value)) = obkv_documents.next()? { + let mut cursor = obkv_documents.into_cursor()?; + while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::new(value); let point: Value = match obkv.get(geo_field_id) { Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 6d99fda44..80d68298a 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -17,8 +17,8 @@ use crate::Result; /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_docids( - mut docid_word_positions: grenad::Reader, +pub fn extract_word_docids( + docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); @@ -32,7 +32,8 @@ pub fn extract_word_docids( ); let mut value_buffer = Vec::new(); - while let Some((key, _value)) = docid_word_positions.next()? { + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, _value)) = cursor.move_on_next()? { let (document_id_bytes, word_bytes) = try_split_array_at(key) .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index f3667694a..90349eb93 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -17,8 +17,8 @@ use crate::{DocumentId, Result}; /// Returns a grenad reader with the list of extracted word pairs proximities and /// documents ids from the given chunk of docid word positions. 
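// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// Every extractor above unpacks its keys with `try_split_array_at`, slicing a
// fixed-size big-endian document id off the front of the key. milli's actual
// helper lives elsewhere in the crate and may differ in detail; a hypothetical
// const-generic reimplementation that matches the call sites shown:
use std::convert::TryInto;

fn try_split_array_at<const N: usize>(slice: &[u8]) -> Option<([u8; N], &[u8])> {
    if slice.len() < N {
        return None;
    }
    let (head, tail) = slice.split_at(N);
    Some((head.try_into().ok()?, tail))
}

// e.g. a `docid_word_positions` key: four docid bytes followed by the word
fn split_docid_word(key: &[u8]) -> Option<(u32, &[u8])> {
    let (docid_bytes, word_bytes) = try_split_array_at::<4>(key)?;
    Some((u32::from_be_bytes(docid_bytes), word_bytes))
}
// ---- end of editor's aside ----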
#[logging_timer::time] -pub fn extract_word_pair_proximity_docids( - mut docid_word_positions: grenad::Reader, +pub fn extract_word_pair_proximity_docids( + docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); @@ -35,7 +35,8 @@ pub fn extract_word_pair_proximity_docids( let mut document_word_positions_heap = BinaryHeap::new(); let mut current_document_id = None; - while let Some((key, value)) = docid_word_positions.next()? { + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { let (document_id_bytes, word_bytes) = try_split_array_at(key) .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 4ca8537ac..a4720ba2b 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,8 +14,8 @@ use crate::{DocumentId, Result}; /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_position_docids( - mut docid_word_positions: grenad::Reader, +pub fn extract_word_position_docids( + docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { let max_memory = indexer.max_memory_by_thread(); @@ -29,7 +29,8 @@ pub fn extract_word_position_docids( ); let mut key_buffer = Vec::new(); - while let Some((key, value)) = docid_word_positions.next()? { + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { let (document_id_bytes, word_bytes) = try_split_array_at(key) .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index eef067122..ec4a32755 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -17,7 +17,7 @@ pub fn create_writer( typ: grenad::CompressionType, level: Option, file: R, -) -> io::Result> { +) -> grenad::Writer { let mut builder = grenad::Writer::builder(); builder.compression_type(typ); if let Some(level) = level { @@ -52,10 +52,13 @@ pub fn sorter_into_reader( sorter: grenad::Sorter, indexer: GrenadParameters, ) -> Result> { - let mut writer = tempfile::tempfile().and_then(|file| { - create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) - })?; - sorter.write_into(&mut writer)?; + let mut writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + sorter.write_into_stream_writer(&mut writer)?; + Ok(writer_into_reader(writer)?) 
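// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// Two more v0.4 changes are visible just above: building a `grenad::Writer` is
// now infallible (no more `io::Result` around `create_writer`), and sorters and
// mergers dump through `write_into_stream_writer`. A minimal writer round trip
// mirroring the tempfile-plus-rewind pattern the patch uses, on an in-memory
// buffer instead of a file:
use std::io::Cursor;

fn tiny_grenad_round_trip() -> Result<grenad::Reader<Cursor<Vec<u8>>>, grenad::Error> {
    let mut builder = grenad::Writer::builder();
    builder.compression_type(grenad::CompressionType::None);
    let mut writer = builder.build(Cursor::new(Vec::new())); // infallible in v0.4

    // keys must be inserted in ascending lexicographic order
    writer.insert(b"milli", b"engine")?;
    writer.insert(b"word", b"docids")?;

    // finalize, take the underlying io back, and rewind it before reading
    let mut cursor = writer.into_inner()?;
    cursor.set_position(0);
    grenad::Reader::new(cursor)
}
// ---- end of editor's aside ----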
} @@ -75,20 +78,25 @@ pub unsafe fn into_clonable_grenad( Ok(reader) } -pub fn merge_readers( +pub fn merge_readers( readers: Vec>, merge_fn: MergeFn, indexer: GrenadParameters, ) -> Result> { let mut merger_builder = grenad::MergerBuilder::new(merge_fn); - merger_builder.extend(readers); + for reader in readers { + merger_builder.push(reader.into_cursor()?); + } + let merger = merger_builder.build(); - let mut writer = tempfile::tempfile().and_then(|file| { - create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file) - })?; - merger.write_into(&mut writer)?; - let reader = writer_into_reader(writer)?; - Ok(reader) + let mut writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + merger.write_into_stream_writer(&mut writer)?; + + Ok(writer_into_reader(writer)?) } #[derive(Debug, Clone, Copy)] @@ -125,12 +133,13 @@ impl GrenadParameters { /// The grenad obkv entries are composed of an incremental document id big-endian /// encoded as the key and an obkv object with an `u8` for the field as the key /// and a simple UTF-8 encoded string as the value. -pub fn grenad_obkv_into_chunks( - mut reader: grenad::Reader, +pub fn grenad_obkv_into_chunks( + reader: grenad::Reader, indexer: GrenadParameters, documents_chunk_size: usize, ) -> Result>>> { let mut continue_reading = true; + let mut cursor = reader.into_cursor()?; let indexer_clone = indexer.clone(); let mut transposer = move || { @@ -139,15 +148,13 @@ pub fn grenad_obkv_into_chunks( } let mut current_chunk_size = 0u64; - let mut obkv_documents = tempfile::tempfile().and_then(|file| { - create_writer( - indexer_clone.chunk_compression_type, - indexer_clone.chunk_compression_level, - file, - ) - })?; + let mut obkv_documents = create_writer( + indexer_clone.chunk_compression_type, + indexer_clone.chunk_compression_level, + tempfile::tempfile()?, + ); - while let Some((document_id, obkv)) = reader.next()? { + while let Some((document_id, obkv)) = cursor.move_on_next()? { obkv_documents.insert(document_id, obkv)?; current_chunk_size += document_id.len() as u64 + obkv.len() as u64; @@ -166,13 +173,14 @@ pub fn grenad_obkv_into_chunks( pub fn write_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, - mut reader: Reader, + reader: Reader, merge: MergeFn, ) -> Result<()> { debug!("Writing MTBL stores..."); let before = Instant::now(); - while let Some((k, v)) = reader.next()? { + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; match iter.next().transpose()? { Some((key, old_val)) if key == k => { @@ -201,19 +209,19 @@ pub fn sorter_into_lmdb_database( debug!("Writing MTBL sorter..."); let before = Instant::now(); - merger_iter_into_lmdb_database(wtxn, database, sorter.into_merger_iter()?, merge)?; + merger_iter_into_lmdb_database(wtxn, database, sorter.into_stream_merger_iter()?, merge)?; debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } -fn merger_iter_into_lmdb_database( +fn merger_iter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, - mut sorter: MergerIter, + mut merger_iter: MergerIter, merge: MergeFn, ) -> Result<()> { - while let Some((k, v)) = sorter.next()? { + while let Some((k, v)) = merger_iter.next()? { let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; match iter.next().transpose()? 
{ Some((key, old_val)) if key == k => { diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f5fb1ec01..4ec34c0c6 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -277,7 +277,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); // consume sorter, in order to free the internal allocation, before creating a new one. - let mut iter = self.sorter.into_merger_iter()?; + let mut iter = self.sorter.into_stream_merger_iter()?; // Once we have sort and deduplicated the documents we write them into a final file. let mut final_sorter = create_sorter( @@ -374,16 +374,15 @@ impl<'a, 'i> Transform<'a, 'i> { }); // We create a final writer to write the new documents in order from the sorter. - let file = tempfile::tempfile()?; let mut writer = create_writer( self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, - file, - )?; + tempfile::tempfile()?, + ); // Once we have written all the documents into the final sorter, we write the documents // into this writer, extract the file and reset the seek to be able to read it again. - final_sorter.write_into(&mut writer)?; + final_sorter.write_into_stream_writer(&mut writer)?; let mut documents_file = writer.into_inner()?; documents_file.seek(SeekFrom::Start(0))?; @@ -424,12 +423,11 @@ impl<'a, 'i> Transform<'a, 'i> { let documents_count = documents_ids.len() as usize; // We create a final writer to write the new documents in order from the sorter. - let file = tempfile::tempfile()?; let mut writer = create_writer( self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, - file, - )?; + tempfile::tempfile()?, + ); let mut obkv_buffer = Vec::new(); for result in self.index.documents.iter(wtxn)? { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 7f0cfcab3..3c77de7a1 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use std::convert::TryInto; use std::fs::File; +use std::io; use heed::types::ByteSlice; use heed::{BytesDecode, RwTxn}; @@ -65,8 +66,9 @@ pub(crate) fn write_typed_chunk_into_index( }, )?; } - TypedChunk::Documents(mut obkv_documents_iter) => { - while let Some((key, value)) = obkv_documents_iter.next()? { + TypedChunk::Documents(obkv_documents_iter) => { + let mut cursor = obkv_documents_iter.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { index.documents.remap_types::().put(wtxn, key, value)?; } } @@ -85,7 +87,7 @@ pub(crate) fn write_typed_chunk_into_index( return Ok((documents_ids, is_merged_database)) } TypedChunk::WordDocids(word_docids_iter) => { - let mut word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; + let word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; append_entries_into_database( word_docids_iter.clone(), &index.word_docids, @@ -97,7 +99,8 @@ pub(crate) fn write_typed_chunk_into_index( // create fst from word docids let mut builder = fst::SetBuilder::memory(); - while let Some((word, _value)) = word_docids_iter.next()? { + let mut cursor = word_docids_iter.into_cursor()?; + while let Some((word, _value)) = cursor.move_on_next()? { // This is a lexicographically ordered word position // we use the key to construct the words fst. 
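// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// Because a grenad reader yields its keys in lexicographic order, the word
// docids loop around this aside can feed every key straight into an
// `fst::SetBuilder` with no extra sort. The same construction in isolation:
use fst::SetBuilder;

fn words_fst_from_sorted<'a>(
    sorted_words: impl IntoIterator<Item = &'a str>,
) -> Result<fst::Set<Vec<u8>>, fst::Error> {
    let mut builder = SetBuilder::memory();
    for word in sorted_words {
        // errors if a key arrives out of order or duplicated
        builder.insert(word)?;
    }
    Ok(builder.into_set())
}
// ---- end of editor's aside ----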
builder.insert(word)?; @@ -146,19 +149,21 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::FieldIdDocidFacetNumbers(mut fid_docid_facet_number) => { + TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => { let index_fid_docid_facet_numbers = index.field_id_docid_facet_f64s.remap_types::(); - while let Some((key, value)) = fid_docid_facet_number.next()? { + let mut cursor = fid_docid_facet_number.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { index_fid_docid_facet_numbers.put(wtxn, key, &value)?; } } } - TypedChunk::FieldIdDocidFacetStrings(mut fid_docid_facet_string) => { + TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => { let index_fid_docid_facet_strings = index.field_id_docid_facet_strings.remap_types::(); - while let Some((key, value)) = fid_docid_facet_string.next()? { + let mut cursor = fid_docid_facet_string.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { index_fid_docid_facet_strings.put(wtxn, key, &value)?; } @@ -183,11 +188,12 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::GeoPoints(mut geo_points) => { + TypedChunk::GeoPoints(geo_points) => { let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; - while let Some((key, value)) = geo_points.next()? { + let mut cursor = geo_points.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { // convert the key back to a u32 (4 bytes) let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); @@ -229,7 +235,7 @@ fn merge_cbo_roaring_bitmaps( /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database( - mut data: grenad::Reader, + data: grenad::Reader, database: &heed::Database, wtxn: &mut RwTxn, index_is_empty: bool, @@ -237,14 +243,15 @@ fn write_entries_into_database( merge_values: FM, ) -> Result<()> where - R: std::io::Read, + R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, { let mut buffer = Vec::new(); let database = database.remap_types::(); - while let Some((key, value)) = data.next()? { + let mut cursor = data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { buffer.clear(); let value = if index_is_empty { @@ -270,7 +277,7 @@ where /// All provided entries must be ordered. /// If the index is not empty, write_entries_into_database is called instead. fn append_entries_into_database( - mut data: grenad::Reader, + data: grenad::Reader, database: &heed::Database, wtxn: &mut RwTxn, index_is_empty: bool, @@ -278,7 +285,7 @@ fn append_entries_into_database( merge_values: FM, ) -> Result<()> where - R: std::io::Read, + R: io::Read + io::Seek, FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, { @@ -296,7 +303,8 @@ where let mut buffer = Vec::new(); let mut database = database.iter_mut(wtxn)?.remap_types::(); - while let Some((key, value)) = data.next()? { + let mut cursor = data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ if valid_lmdb_key(key) { buffer.clear(); let value = serialize_value(value, &mut buffer)?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 624037f8f..0bb5edb9a 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -51,8 +51,10 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { ); let mut word_docids_merger = MergerBuilder::new(merge_roaring_bitmaps); - word_docids_merger.extend(new_word_docids); - let mut word_docids_iter = word_docids_merger.build().into_merger_iter()?; + for reader in new_word_docids { + word_docids_merger.push(reader.into_cursor()?); + } + let mut word_docids_iter = word_docids_merger.build().into_stream_merger_iter()?; let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 530c2867e..b498d5850 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -77,8 +77,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // We retrieve and merge the created word pair proximities docids entries // for the newly added documents. let mut wppd_merger = MergerBuilder::new(merge_cbo_roaring_bitmaps); - wppd_merger.extend(new_word_pair_proximity_docids); - let mut wppd_iter = wppd_merger.build().into_merger_iter()?; + for reader in new_word_pair_proximity_docids { + wppd_merger.push(reader.into_cursor()?); + } + let mut wppd_iter = wppd_merger.build().into_stream_merger_iter()?; let mut word_prefix_pair_proximity_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index c992d01ec..9e15f4d6c 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -73,9 +73,11 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { ); let mut word_position_docids_merger = MergerBuilder::new(merge_cbo_roaring_bitmaps); - word_position_docids_merger.extend(new_word_position_docids); + for reader in new_word_position_docids { + word_position_docids_merger.push(reader.into_cursor()?); + } let mut word_position_docids_iter = - word_position_docids_merger.build().into_merger_iter()?; + word_position_docids_merger.build().into_stream_merger_iter()?; // We fetch all the new common prefixes between the previous and new prefix fst. 
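// ---- Editor's aside (illustrative sketch, not part of the patch series) ----
// The three prefix updaters above no longer stream fst differences themselves;
// they receive precomputed common/new/deleted prefix collections (see the
// `del_prefix_fst_words` change at the top of this section). The underlying set
// algebra, as done by milli's `fst_stream_into_vec`/`fst_stream_into_hashset`
// helpers, in a standalone sketch:
use fst::{Set, Streamer};

fn diff_prefix_fsts(old: &Set<Vec<u8>>, new: &Set<Vec<u8>>) -> (Vec<Vec<u8>>, Vec<Vec<u8>>) {
    // prefixes that disappeared: present in `old` but not in `new`
    let mut deleted = Vec::new();
    let mut stream = old.op().add(new).difference();
    while let Some(bytes) = stream.next() {
        deleted.push(bytes.to_vec());
    }

    // prefixes that appeared: present in `new` but not in `old`
    // (use `.intersection()` instead to get the common ones)
    let mut added = Vec::new();
    let mut stream = new.op().add(old).difference();
    while let Some(bytes) = stream.next() {
        added.push(bytes.to_vec());
    }

    (deleted, added)
}
// ---- end of editor's aside ----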
let mut buffer = Vec::new(); From ff8d7a810de935db3f35583e3c3dba34d1ca32a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 16 Feb 2022 15:40:08 +0100 Subject: [PATCH 1242/1889] Change the behavior of the as_cloneable_grenad by taking a ref --- .../src/update/index_documents/extract/mod.rs | 10 ++--- .../index_documents/helpers/grenad_helpers.rs | 8 ++-- .../src/update/index_documents/helpers/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 42 ++++--------------- .../src/update/index_documents/typed_chunk.rs | 4 +- 5 files changed, 21 insertions(+), 45 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 0f04418ed..4c81b9334 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -25,7 +25,7 @@ use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - into_clonable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, + as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, }; use super::{helpers, TypedChunk}; @@ -184,7 +184,7 @@ fn extract_documents_data( grenad::Reader, (grenad::Reader, grenad::Reader), )> { - let documents_chunk = documents_chunk.and_then(|c| unsafe { into_clonable_grenad(c) })?; + let documents_chunk = documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); @@ -217,7 +217,7 @@ fn extract_documents_data( // send docid_word_positions_chunk to DB writer let docid_word_positions_chunk = - unsafe { into_clonable_grenad(docid_word_positions_chunk)? }; + unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; let _ = lmdb_writer_sx .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); @@ -233,7 +233,7 @@ fn extract_documents_data( // send docid_fid_facet_numbers_chunk to DB writer let docid_fid_facet_numbers_chunk = - unsafe { into_clonable_grenad(docid_fid_facet_numbers_chunk)? }; + unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? }; let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( docid_fid_facet_numbers_chunk.clone(), @@ -241,7 +241,7 @@ fn extract_documents_data( // send docid_fid_facet_strings_chunk to DB writer let docid_fid_facet_strings_chunk = - unsafe { into_clonable_grenad(docid_fid_facet_strings_chunk)? }; + unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? 
}; let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( docid_fid_facet_strings_chunk.clone(), diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index ec4a32755..ded74b2af 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -68,11 +68,11 @@ pub fn writer_into_reader(writer: grenad::Writer) -> Result, +pub unsafe fn as_cloneable_grenad( + reader: &grenad::Reader, ) -> Result> { - let file = reader.into_inner(); - let mmap = memmap2::Mmap::map(&file)?; + let file = reader.get_ref(); + let mmap = memmap2::Mmap::map(file)?; let cursor = io::Cursor::new(ClonableMmap::from(mmap)); let reader = grenad::Reader::new(cursor)?; Ok(reader) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index bbb2b9b95..22c1cfd6c 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - create_sorter, create_writer, grenad_obkv_into_chunks, into_clonable_grenad, merge_readers, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, GrenadParameters, }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a31d1875b..c69aae809 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -16,9 +16,9 @@ use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; pub use self::helpers::{ - create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, - merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database, - write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, + as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, + fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, + sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -292,42 +292,18 @@ where for result in lmdb_writer_rx { let typed_chunk = match result? { TypedChunk::WordDocids(chunk) => { - // We extract and mmap our chunk file to be able to get it for next processes. - let mut file = chunk.into_inner(); - let mmap = unsafe { memmap2::Mmap::map(&file)? }; - let cursor_mmap = CursorClonableMmap::new(ClonableMmap::from(mmap)); - let chunk = grenad::Reader::new(cursor_mmap)?; - word_docids.push(chunk); - - // We reconstruct our typed-chunk back. - file.rewind()?; - let chunk = grenad::Reader::new(file)?; + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_docids.push(cloneable_chunk); TypedChunk::WordDocids(chunk) } TypedChunk::WordPairProximityDocids(chunk) => { - // We extract and mmap our chunk file to be able to get it for next processes. - let mut file = chunk.into_inner(); - let mmap = unsafe { memmap2::Mmap::map(&file)? 
}; - let cursor_mmap = CursorClonableMmap::new(ClonableMmap::from(mmap)); - let chunk = grenad::Reader::new(cursor_mmap)?; - word_pair_proximity_docids.push(chunk); - - // We reconstruct our typed-chunk back. - file.rewind()?; - let chunk = grenad::Reader::new(file)?; + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_pair_proximity_docids.push(cloneable_chunk); TypedChunk::WordPairProximityDocids(chunk) } TypedChunk::WordPositionDocids(chunk) => { - // We extract and mmap our chunk file to be able to get it for next processes. - let mut file = chunk.into_inner(); - let mmap = unsafe { memmap2::Mmap::map(&file)? }; - let cursor_mmap = CursorClonableMmap::new(ClonableMmap::from(mmap)); - let chunk = grenad::Reader::new(cursor_mmap)?; - word_position_docids.push(chunk); - - // We reconstruct our typed-chunk back. - file.rewind()?; - let chunk = grenad::Reader::new(file)?; + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_position_docids.push(cloneable_chunk); TypedChunk::WordPositionDocids(chunk) } otherwise => otherwise, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 3c77de7a1..77ea31138 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -12,7 +12,7 @@ use super::helpers::{ CursorClonableMmap, }; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; -use crate::update::index_documents::helpers::into_clonable_grenad; +use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, @@ -87,7 +87,7 @@ pub(crate) fn write_typed_chunk_into_index( return Ok((documents_ids, is_merged_database)) } TypedChunk::WordDocids(word_docids_iter) => { - let word_docids_iter = unsafe { into_clonable_grenad(word_docids_iter) }?; + let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?; append_entries_into_database( word_docids_iter.clone(), &index.word_docids, From 19bfb2649b4161d59960105365041303a105b5ab Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Feb 2022 15:27:51 +0100 Subject: [PATCH 1243/1889] Raise the GitHub CI timeout limit to 72h --- .github/workflows/manual_benchmarks.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml index 19b071f57..456e87168 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/manual_benchmarks.yml @@ -15,6 +15,7 @@ jobs: benchmarks: name: Run and upload benchmarks runs-on: benchmarks + timeout-minutes: 4320 # 72h steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 From ab5247dc6413ac69f09cccb9a5a5a86e6180dea0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 21 Feb 2022 16:30:13 +0100 Subject: [PATCH 1244/1889] Add a new songs benchmark to test multi batch indexing --- benchmarks/benches/indexing.rs | 72 ++++++++++++++++++++++++++++++++++ benchmarks/build.rs | 15 ++++++- 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index a84998b12..8536dabe8 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -83,6 +83,77 @@ fn indexing_songs_default(c: &mut Criterion) { }); } +fn indexing_songs_in_three_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + 
group.bench_function("Indexing songs in three batches with default settings", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = + ["title", "album", "artist", "genre", "country", "released", "duration"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it take. + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(10); @@ -332,6 +403,7 @@ criterion_group!( indexing_songs_default, indexing_songs_without_faceted_numbers, indexing_songs_without_faceted_fields, + indexing_songs_in_three_batches_default, indexing_wiki, indexing_movies_default, indexing_geo diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 2495930bb..90ebf70af 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -11,10 +11,23 @@ use reqwest::IntoUrl; const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets"; const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv"); +const DATASET_SONGS_1_2: (&str, &str) = ("smol-songs-1_2", "csv"); +const DATASET_SONGS_3_4: (&str, &str) = ("smol-songs-3_4", "csv"); +const DATASET_SONGS_4_4: (&str, &str) = ("smol-songs-4_4", "csv"); const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv"); const DATASET_MOVIES: (&str, &str) = ("movies", "json"); const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); +const ALL_DATASETS: &[(&str, &str)] = &[ + DATASET_SONGS, + DATASET_SONGS_1_2, + DATASET_SONGS_3_4, + DATASET_SONGS_4_4, + DATASET_WIKI, + DATASET_MOVIES, + DATASET_GEO, +]; + /// The name of the environment variable used 
to select the path /// of the directory containing the datasets const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; @@ -33,7 +46,7 @@ fn main() -> anyhow::Result<()> { )?; writeln!(manifest_paths_file)?; - for (dataset, extension) in [DATASET_SONGS, DATASET_WIKI, DATASET_MOVIES, DATASET_GEO] { + for (dataset, extension) in ALL_DATASETS { let out_path = out_dir.join(dataset); let out_file = out_path.with_extension(extension); From 8d2e3e4aba86d62df0faf1f08d561c17e55951c0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 21 Feb 2022 17:59:03 +0100 Subject: [PATCH 1245/1889] Add a new wiki benchmark to test multi batch indexing --- benchmarks/benches/indexing.rs | 71 ++++++++++++++++++++++++++++++++++ benchmarks/build.rs | 6 +++ 2 files changed, 77 insertions(+) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 8536dabe8..97eee0a34 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -294,6 +294,76 @@ fn indexing_wiki(c: &mut Criterion) { }); } +fn indexing_wiki_in_three_batches(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing wiki in three batches", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = + ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + // there is NO faceted fields at all + builder.execute(|_| ()).unwrap(); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it take. 
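// Note that Criterion's `iter_with_setup` excludes this closure from
// the measurement: the first half of the dataset (1_2) is indexed as
// part of the setup, while only the two remaining quarters (3_4 and
// 4_4) are indexed inside the timed closure below.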
+ let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_movies_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(10); @@ -405,6 +475,7 @@ criterion_group!( indexing_songs_without_faceted_fields, indexing_songs_in_three_batches_default, indexing_wiki, + indexing_wiki_in_three_batches, indexing_movies_default, indexing_geo ); diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 90ebf70af..66a0a841b 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -15,6 +15,9 @@ const DATASET_SONGS_1_2: (&str, &str) = ("smol-songs-1_2", "csv"); const DATASET_SONGS_3_4: (&str, &str) = ("smol-songs-3_4", "csv"); const DATASET_SONGS_4_4: (&str, &str) = ("smol-songs-4_4", "csv"); const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv"); +const DATASET_WIKI_1_2: (&str, &str) = ("smol-wiki-articles-1_2", "csv"); +const DATASET_WIKI_3_4: (&str, &str) = ("smol-wiki-articles-3_4", "csv"); +const DATASET_WIKI_4_4: (&str, &str) = ("smol-wiki-articles-4_4", "csv"); const DATASET_MOVIES: (&str, &str) = ("movies", "json"); const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); @@ -24,6 +27,9 @@ const ALL_DATASETS: &[(&str, &str)] = &[ DATASET_SONGS_3_4, DATASET_SONGS_4_4, DATASET_WIKI, + DATASET_WIKI_1_2, + DATASET_WIKI_3_4, + DATASET_WIKI_4_4, DATASET_MOVIES, DATASET_GEO, ]; From a820aa11e6f252934eca61e859029deea3013792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Feb 2022 13:47:37 +0100 Subject: [PATCH 1246/1889] Add a new movies benchmark to test multi batch indexing --- benchmarks/benches/indexing.rs | 72 ++++++++++++++++++++++++++++++++++ benchmarks/build.rs | 6 +++ 2 files changed, 78 insertions(+) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 97eee0a34..b9ad7cad9 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -413,6 +413,77 @@ fn indexing_movies_default(c: &mut Criterion) { }); } +fn indexing_movies_in_three_batches(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing movies in three batches", |b| { + b.iter_with_setup( + move || { + let index = 
setup_index(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key("id".to_owned()); + let displayed_fields = ["title", "poster", "overview", "release_date", "genres"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let searchable_fields = + ["title", "overview"].iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = + ["released_date", "genres"].iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(faceted_fields); + + builder.execute(|_| ()).unwrap(); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it take. + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + + let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_geo(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(10); @@ -477,6 +548,7 @@ criterion_group!( indexing_wiki, indexing_wiki_in_three_batches, indexing_movies_default, + indexing_movies_in_three_batches, indexing_geo ); criterion_main!(benches); diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 66a0a841b..906230fd4 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -19,6 +19,9 @@ const DATASET_WIKI_1_2: (&str, &str) = ("smol-wiki-articles-1_2", "csv"); const DATASET_WIKI_3_4: (&str, &str) = ("smol-wiki-articles-3_4", "csv"); const DATASET_WIKI_4_4: (&str, &str) = ("smol-wiki-articles-4_4", "csv"); const DATASET_MOVIES: (&str, &str) = ("movies", "json"); +const DATASET_MOVIES_1_2: (&str, &str) = ("movies-1_2", "json"); +const DATASET_MOVIES_3_4: (&str, &str) = ("movies-3_4", "json"); +const DATASET_MOVIES_4_4: (&str, &str) = ("movies-4_4", "json"); const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); const ALL_DATASETS: &[(&str, &str)] = &[ @@ -31,6 +34,9 @@ const ALL_DATASETS: &[(&str, &str)] = &[ DATASET_WIKI_3_4, DATASET_WIKI_4_4, DATASET_MOVIES, + DATASET_MOVIES_1_2, + DATASET_MOVIES_3_4, + DATASET_MOVIES_4_4, DATASET_GEO, ]; From acfc96525cc975dd1ad932a80a1e6d2df23bc9f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Feb 2022 17:39:24 +0100 Subject: [PATCH 1247/1889] Apply GitHub suggestions --- benchmarks/benches/indexing.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index b9ad7cad9..ee74f2a80 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -116,7 +116,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { builder.execute(|_| ()).unwrap(); // We index only one half of the dataset in the setup part - // as we don't care about the time it take. + // as we don't care about the time it takes. let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = @@ -318,7 +318,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { builder.execute(|_| ()).unwrap(); // We index only one half of the dataset in the setup part - // as we don't care about the time it take. + // as we don't care about the time it takes. let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; @@ -443,7 +443,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { builder.execute(|_| ()).unwrap(); // We index only one half of the dataset in the setup part - // as we don't care about the time it take. + // as we don't care about the time it takes. let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = From 04b1bbf93201c0c872c6a1c10be061158fd85a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Feb 2022 17:28:57 +0100 Subject: [PATCH 1248/1889] Reintroduce appending sorted entries when possible --- .../index_documents/helpers/grenad_helpers.rs | 54 +++++++++---------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index ded74b2af..e0ac3a175 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::time::Instant; -use grenad::{CompressionType, MergerIter, Reader, Sorter}; +use grenad::{CompressionType, Reader, Sorter}; use heed::types::ByteSlice; use log::debug; @@ -209,36 +209,34 @@ pub fn sorter_into_lmdb_database( debug!("Writing MTBL sorter..."); let before = Instant::now(); - merger_iter_into_lmdb_database(wtxn, database, sorter.into_stream_merger_iter()?, merge)?; - - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); - Ok(()) -} - -fn merger_iter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - mut merger_iter: MergerIter, - merge: MergeFn, -) -> Result<()> { - while let Some((k, v)) = merger_iter.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).map_err(|_| { - // TODO just wrap this error? - InternalError::IndexingMergingKeys { process: "get-put-merge" } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + let mut merger_iter = sorter.into_stream_merger_iter()?; + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + while let Some((k, v)) = merger_iter.next()? 
{ + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; + } + } else { + while let Some((k, v)) = merger_iter.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? { + Some((key, old_val)) if key == k => { + let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; + let val = merge(k, &vals).map_err(|_| { + // TODO just wrap this error? + InternalError::IndexingMergingKeys { process: "get-put-merge" } + })?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(k, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } } } } + debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } From 8d26f3040cdb75559a4ec71bad249b54127accbc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 28 Feb 2022 10:14:54 +0100 Subject: [PATCH 1249/1889] Remove a useless grenad file merging --- milli/src/update/index_documents/mod.rs | 24 +++++++++---------- milli/src/update/word_prefix_docids.rs | 13 ++++------ .../word_prefix_pair_proximity_docids.rs | 15 ++++-------- .../update/words_prefix_position_docids.rs | 13 ++++------ 4 files changed, 24 insertions(+), 41 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9b1c73b36..93b86617c 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -279,9 +279,9 @@ where let index_documents_ids = self.index.documents_ids(self.wtxn)?; let index_is_empty = index_documents_ids.len() == 0; let mut final_documents_ids = RoaringBitmap::new(); - let mut word_pair_proximity_docids = Vec::new(); - let mut word_position_docids = Vec::new(); - let mut word_docids = Vec::new(); + let mut word_pair_proximity_docids = None; + let mut word_position_docids = None; + let mut word_docids = None; let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -293,17 +293,17 @@ where let typed_chunk = match result? { TypedChunk::WordDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_docids.push(cloneable_chunk); + word_docids = Some(cloneable_chunk); TypedChunk::WordDocids(chunk) } TypedChunk::WordPairProximityDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; - word_pair_proximity_docids.push(cloneable_chunk); + word_pair_proximity_docids = Some(cloneable_chunk); TypedChunk::WordPairProximityDocids(chunk) } TypedChunk::WordPositionDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? 
}; - word_position_docids.push(cloneable_chunk); + word_position_docids = Some(cloneable_chunk); TypedChunk::WordPositionDocids(chunk) } otherwise => otherwise, @@ -345,9 +345,9 @@ where self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; self.execute_prefix_databases( - word_docids, - word_pair_proximity_docids, - word_position_docids, + word_docids.unwrap(), + word_pair_proximity_docids.unwrap(), + word_position_docids.unwrap(), )?; Ok(all_documents_ids.len()) @@ -356,9 +356,9 @@ where #[logging_timer::time("IndexDocuments::{}")] pub fn execute_prefix_databases( self, - word_docids: Vec>, - word_pair_proximity_docids: Vec>, - word_position_docids: Vec>, + word_docids: grenad::Reader, + word_pair_proximity_docids: grenad::Reader, + word_position_docids: grenad::Reader, ) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 0bb5edb9a..2baaf2f19 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -1,6 +1,6 @@ use std::collections::{HashMap, HashSet}; -use grenad::{CompressionType, MergerBuilder}; +use grenad::CompressionType; use heed::types::ByteSlice; use crate::update::index_documents::{ @@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute( self, - new_word_docids: Vec>, + new_word_docids: grenad::Reader, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -50,15 +50,10 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.max_memory, ); - let mut word_docids_merger = MergerBuilder::new(merge_roaring_bitmaps); - for reader in new_word_docids { - word_docids_merger.push(reader.into_cursor()?); - } - let mut word_docids_iter = word_docids_merger.build().into_stream_merger_iter()?; - + let mut new_word_docids_iter = new_word_docids.into_cursor()?; let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = word_docids_iter.next()? { + while let Some((word, data)) = new_word_docids_iter.move_on_next()? { current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index b498d5850..692dd1568 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,6 +1,6 @@ use std::collections::{HashMap, HashSet}; -use grenad::{CompressionType, MergerBuilder}; +use grenad::CompressionType; use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; @@ -64,7 +64,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute( self, - new_word_pair_proximity_docids: Vec>, + new_word_pair_proximity_docids: grenad::Reader, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -74,14 +74,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { let new_prefix_fst_words: Vec<_> = new_prefix_fst_words.linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); - // We retrieve and merge the created word pair proximities docids entries - // for the newly added documents. 
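// (Merging is no longer needed here: the caller now hands over a
// single grenad::Reader instead of a Vec of readers, so the entries
// can be consumed directly through the reader's cursor.)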
- let mut wppd_merger = MergerBuilder::new(merge_cbo_roaring_bitmaps); - for reader in new_word_pair_proximity_docids { - wppd_merger.push(reader.into_cursor()?); - } - let mut wppd_iter = wppd_merger.build().into_stream_merger_iter()?; - + let mut new_wppd_iter = new_word_pair_proximity_docids.into_cursor()?; let mut word_prefix_pair_proximity_docids_sorter = create_sorter( merge_cbo_roaring_bitmaps, self.chunk_compression_type, @@ -95,7 +88,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = wppd_iter.next()? { + while let Some((key, data)) = new_wppd_iter.move_on_next()? { let (w1, w2, prox) = StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; if prox > self.max_proximity { continue; diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 9e15f4d6c..324516325 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet}; use std::num::NonZeroU32; use std::{cmp, str}; -use grenad::{CompressionType, MergerBuilder}; +use grenad::CompressionType; use heed::types::ByteSlice; use heed::{BytesDecode, BytesEncode}; use log::debug; @@ -57,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixPositionDocids::{}")] pub fn execute( self, - new_word_position_docids: Vec>, + new_word_position_docids: grenad::Reader, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -72,18 +72,13 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { self.max_memory, ); - let mut word_position_docids_merger = MergerBuilder::new(merge_cbo_roaring_bitmaps); - for reader in new_word_position_docids { - word_position_docids_merger.push(reader.into_cursor()?); - } - let mut word_position_docids_iter = - word_position_docids_merger.build().into_stream_merger_iter()?; + let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; // We fetch all the new common prefixes between the previous and new prefix fst. let mut buffer = Vec::new(); let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = word_position_docids_iter.next()? { + while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? 
{ let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; current_prefixes = match current_prefixes.take() { From d5b8b5a2f846d26e7c65f0634dd34bd7da460581 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 28 Feb 2022 16:00:33 +0100 Subject: [PATCH 1250/1889] Replace the ugly unwraps by clean if let Somes --- milli/src/update/index_documents/mod.rs | 100 +++++++++++++----------- 1 file changed, 53 insertions(+), 47 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 93b86617c..2d3004444 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -345,9 +345,9 @@ where self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; self.execute_prefix_databases( - word_docids.unwrap(), - word_pair_proximity_docids.unwrap(), - word_position_docids.unwrap(), + word_docids, + word_pair_proximity_docids, + word_position_docids, )?; Ok(all_documents_ids.len()) @@ -356,9 +356,9 @@ where #[logging_timer::time("IndexDocuments::{}")] pub fn execute_prefix_databases( self, - word_docids: grenad::Reader, - word_pair_proximity_docids: grenad::Reader, - word_position_docids: grenad::Reader, + word_docids: Option>, + word_pair_proximity_docids: Option>, + word_position_docids: Option>, ) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -424,18 +424,20 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - // Run the word prefix docids update operation. - let mut builder = WordPrefixDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - builder.execute( - word_docids, - &new_prefix_fst_words, - &common_prefix_fst_words, - &del_prefix_fst_words, - )?; + if let Some(word_docids) = word_docids { + // Run the word prefix docids update operation. + let mut builder = WordPrefixDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; + builder.execute( + word_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -443,18 +445,20 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - // Run the word prefix pair proximity docids update operation. - let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - builder.execute( - word_pair_proximity_docids, - &new_prefix_fst_words, - &common_prefix_fst_words, - &del_prefix_fst_words, - )?; + if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { + // Run the word prefix pair proximity docids update operation. 
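// Every prefix database update is now guarded by an `if let Some(..)`:
// each builder only runs when the corresponding typed chunk was
// actually produced during indexing, instead of unwrapping an Option
// that stays None when no such chunk came through the channel.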
+ let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; + builder.execute( + word_pair_proximity_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -462,24 +466,26 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - // Run the words prefix position docids update operation. - let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - if let Some(value) = self.config.words_positions_level_group_size { - builder.level_group_size(value); + if let Some(word_position_docids) = word_position_docids { + // Run the words prefix position docids update operation. + let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; + if let Some(value) = self.config.words_positions_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.config.words_positions_min_level_size { + builder.min_level_size(value); + } + builder.execute( + word_position_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; } - if let Some(value) = self.config.words_positions_min_level_size { - builder.min_level_size(value); - } - builder.execute( - word_position_docids, - &new_prefix_fst_words, - &common_prefix_fst_words, - &del_prefix_fst_words, - )?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { From d9ed9de2b0cb4813d234e81d018f21e0bac0c460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 1 Mar 2022 19:45:29 +0100 Subject: [PATCH 1251/1889] Update heed link in cargo toml --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- milli/fuzz/Cargo.toml | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 99a36b740..b48599679 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -14,7 +14,7 @@ csv = "1.1.6" jemallocator = "0.3.2" [dev-dependencies] -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } criterion = { version = "0.3.4", features = ["html_reports"] } [build-dependencies] diff --git a/cli/Cargo.toml b/cli/Cargo.toml index ee775eebe..023cd06f6 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -14,7 +14,7 @@ structopt = "0.3.22" milli = { path = "../milli" } eyre = "0.6.5" color-eyre = "0.5.11" -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", 
"sync-read-txn"] } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } byte-unit = { version = "4.0.12", features = ["serde"] } bimap = "0.6.1" csv = "1.1.6" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index f1458ae9c..dc0f7dc81 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -7,7 +7,7 @@ edition = "2018" [dependencies] anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } stderrlog = "0.5.1" structopt = { version = "0.3.21", default-features = false } diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 19445673a..34b302f03 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -9,7 +9,7 @@ edition = "2018" anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" } memmap2 = "0.5.0" milli = { path = "../milli" } diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 781dfbd79..41c9241ba 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } csv = "1.1.5" -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } roaring = "0.6.6" serde_json = "1.0.62" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3bf641926..43123d53b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ fst = "0.4.5" fxhash = "0.2.1" grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml index 0456e7098..e734936fb 100644 --- a/milli/fuzz/Cargo.toml +++ b/milli/fuzz/Cargo.toml @@ -11,7 +11,7 @@ cargo-fuzz = true [dependencies] arbitrary = "1.0" libfuzzer-sys = "0.4" -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } serde_json = { version = "1.0.62", features = ["preserve_order"] } anyhow = "1.0" tempfile = "3.3" From 66c6d5e1ef10c6de59fab47b1fb5287d820ff594 Mon Sep 17 00:00:00 2001 From: Bruno Casali Date: Fri, 4 Mar 2022 16:34:03 -0300 Subject: [PATCH 1252/1889] Add a new error message when the `valid_fields` is empty > "Attribute `{}` is not sortable. This index doesn't have configured sortable attributes." > "Attribute `{}` is not sortable. Available sortable attributes are: `{}`." 
coexist in the error handling --- milli/src/error.rs | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index dce23582a..e6fbc0605 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -256,11 +256,20 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); - write!( - f, - "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.", - field, valid_names - ) + + if valid_names.is_empty() { + write!( + f, + "Attribute `{}` is not sortable. This index does not have configured sortable attributes.", + field + ) + } else { + write!( + f, + "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.", + field, valid_names + ) + } } Self::SortRankingRuleMissing => f.write_str( "The sort ranking rule must be specified in the \ @@ -320,3 +329,19 @@ impl fmt::Display for SerializationError { } impl StdError for SerializationError {} + +#[test] +fn conditionally_lookup_for_error_message() { + let prefix = "Attribute `name` is not sortable."; + let messages = vec![ + (BTreeSet::new(), "This index does not have configured sortable attributes."), + (BTreeSet::from(["age".to_string()]), "Available sortable attributes are: `age`."), + ]; + + for (list, suffix) in messages { + let err = + UserError::InvalidSortableAttribute { field: "name".to_string(), valid_fields: list }; + + assert_eq!(err.to_string(), format!("{} {}", prefix, suffix)); + } +} From 6cf82ba993fb74364a7b84844a0bf93d2f04495a Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 7 Mar 2022 10:29:52 +0100 Subject: [PATCH 1253/1889] bufread documents --- cli/src/main.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cli/src/main.rs b/cli/src/main.rs index 11e203f4d..065be01f8 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -211,6 +211,8 @@ impl Performer for DocumentAddition { println!("parsing documents..."); + let reader = BufReader::new(reader); + let documents = match self.format { DocumentAdditionFormat::Csv => documents_from_csv(reader)?, DocumentAdditionFormat::Json => documents_from_json(reader)?, From db3a1905de637286d5b37c9a9fbeda87dc74c69f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 7 Mar 2022 10:30:47 +0100 Subject: [PATCH 1254/1889] default db path --- cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 065be01f8..f349325a7 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -23,7 +23,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; #[derive(Debug, StructOpt)] #[structopt(name = "Milli CLI", about = "A simple CLI to manipulate a milli index.")] struct Cli { - #[structopt(short, long)] + #[structopt(short, long, default_value = ".")] index_path: PathBuf, #[structopt(short = "s", long, default_value = "100GiB")] index_size: Byte, From 8bb45956d4bfe5031a8bfaab0b3fcd0b21fd4123 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 7 Mar 2022 14:55:13 +0100 Subject: [PATCH 1255/1889] allow to set the primary key in the cli --- cli/src/main.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index f349325a7..503b02887 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -11,7 +11,7 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use milli::update::UpdateIndexingStep::{ 
ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; -use milli::update::{IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; +use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; use milli::Index; use serde_json::{Map, Value}; use structopt::StructOpt; @@ -191,6 +191,9 @@ struct DocumentAddition { /// Path to the update file, if not present, will read from stdin. #[structopt(short, long)] path: Option, + /// Specify the primary key. + #[structopt(long)] + primary: Option, /// Whether to generate missing document ids. #[structopt(short, long)] autogen_docids: bool, @@ -231,6 +234,12 @@ impl Performer for DocumentAddition { IndexDocumentsMethod::ReplaceDocuments }; + if let Some(primary) = self.primary { + let mut builder = update::Settings::new(&mut txn, &index, &config); + builder.set_primary_key(primary); + builder.execute(|_| ()).unwrap(); + } + let indexing_config = IndexDocumentsConfig { update_method, autogenerate_docids: self.autogen_docids, From 1ae13c137430bc88b9418d382964d362afb4af4e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Mar 2022 18:02:12 +0100 Subject: [PATCH 1256/1889] Avoid iterating on big databases when useless --- milli/src/update/word_prefix_docids.rs | 47 ++++---- .../word_prefix_pair_proximity_docids.rs | 102 +++++++++--------- .../update/words_prefix_position_docids.rs | 60 ++++++----- 3 files changed, 111 insertions(+), 98 deletions(-) diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 2baaf2f19..076816f09 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -50,35 +50,38 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.max_memory, ); - let mut new_word_docids_iter = new_word_docids.into_cursor()?; - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { - current_prefixes = match current_prefixes.take() { - Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; - common_prefix_fst_words - .iter() - .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) - } - }; + if !common_prefix_fst_words.is_empty() { + let mut new_word_docids_iter = new_word_docids.into_cursor()?; + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((word, data)) = new_word_docids_iter.move_on_next()? 
{ + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + common_prefix_fst_words + .iter() + .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) + } + }; - if let Some(prefixes) = current_prefixes { - for prefix in prefixes.iter() { - if word.starts_with(prefix.as_bytes()) { - match prefixes_cache.get_mut(prefix.as_bytes()) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(prefix.clone().into(), vec![data.to_owned()]); + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix.as_bytes()) { + match prefixes_cache.get_mut(prefix.as_bytes()) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache + .insert(prefix.clone().into(), vec![data.to_owned()]); + } } } } } } - } - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + } // We fetch the docids associated to the newly added word prefix fst only. let db = self.index.word_docids.remap_data_type::(); diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 692dd1568..284bb8981 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -83,70 +83,76 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { self.max_memory, ); - // We compute the prefix docids associated with the common prefixes between - // the old and new word prefix fst. - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_wppd_iter.move_on_next()? { - let (w1, w2, prox) = StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - if prox > self.max_proximity { - continue; + if !common_prefix_fst_words.is_empty() { + // We compute the prefix docids associated with the common prefixes between + // the old and new word prefix fst. + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((key, data)) = new_wppd_iter.move_on_next()? { + let (w1, w2, prox) = + StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + if prox > self.max_proximity { + continue; + } + + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + common_prefix_fst_words, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, + write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - common_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, )?; } - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; + if !new_prefix_fst_words.is_empty() { + // We compute the prefix docids associated with the newly added prefixes + // in the new word prefix fst. + let mut db_iter = self + .index + .word_pair_proximity_docids + .remap_data_type::() + .iter(self.wtxn)?; - // We compute the prefix docids associated with the newly added prefixes - // in the new word prefix fst. 
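// The whole scan below is now wrapped in `!new_prefix_fst_words.is_empty()`:
// when the new prefix FST introduced no prefixes, there is no reason
// to iterate over the entire word pair proximity database, which can
// be huge.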
- let mut db_iter = - self.index.word_pair_proximity_docids.remap_data_type::().iter(self.wtxn)?; + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? { + if prox > self.max_proximity { + continue; + } - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? { - if prox > self.max_proximity { - continue; + insert_current_prefix_data_in_sorter( + &mut buffer, + &mut current_prefixes, + &mut prefixes_cache, + &mut word_prefix_pair_proximity_docids_sorter, + &new_prefix_fst_words, + self.max_prefix_length, + w1, + w2, + prox, + data, + )?; } - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, + write_prefixes_in_sorter( &mut prefixes_cache, &mut word_prefix_pair_proximity_docids_sorter, - &new_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, )?; } - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - )?; - - drop(db_iter); - // All of the word prefix pairs in the database that have a w2 // that is contained in the `suppr_pw` set must be removed as well. let mut iter = self diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 324516325..77e9e7c29 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -74,42 +74,46 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; - // We fetch all the new common prefixes between the previous and new prefix fst. - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { - let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + if !common_prefix_fst_words.is_empty() { + // We fetch all the new common prefixes between the previous and new prefix fst. + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? 
{ + let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - current_prefixes = match current_prefixes.take() { - Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut prefix_position_docids_sorter, - )?; - common_prefix_fst_words.iter().find(|prefixes| word.starts_with(&prefixes[0])) - } - }; + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut prefix_position_docids_sorter, + )?; + common_prefix_fst_words + .iter() + .find(|prefixes| word.starts_with(&prefixes[0])) + } + }; - if let Some(prefixes) = current_prefixes { - for prefix in prefixes.iter() { - if word.starts_with(prefix) { - buffer.clear(); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.extend_from_slice(&pos.to_be_bytes()); - match prefixes_cache.get_mut(&buffer) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix) { + buffer.clear(); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.extend_from_slice(&pos.to_be_bytes()); + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + } } } } } } - } - write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + } // We fetch the docids associated to the newly added word prefix fst only. 
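// Prefixes that only exist in the new FST cannot be served from the
// incoming grenad reader, since older documents may match them too;
// their docids are read back from the word position docids database
// that the main indexing step just finished writing.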
let db = self.index.word_position_docids.remap_data_type::(); From 92e2e09434ac744dbe25c8c0bd83666953c39b5b Mon Sep 17 00:00:00 2001 From: psvnl sai kumar Date: Mon, 14 Mar 2022 01:01:58 +0530 Subject: [PATCH 1257/1889] exporting heed to avoid having different versions of Heed in Meilisearch --- milli/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index bb4f47e47..9cae8b254 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -20,6 +20,7 @@ use std::hash::BuildHasherDefault; pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; +pub use heed; pub use meilisearch_tokenizer as tokenizer; use serde_json::{Map, Value}; From 5e08fac729af28a18b2eef313e5ab006bfba1112 Mon Sep 17 00:00:00 2001 From: psvnl sai kumar Date: Mon, 14 Mar 2022 19:22:41 +0530 Subject: [PATCH 1258/1889] fixes for rustfmt pass --- milli/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 9cae8b254..ba2bd9b0f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -20,9 +20,8 @@ use std::hash::BuildHasherDefault; pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; -pub use heed; -pub use meilisearch_tokenizer as tokenizer; use serde_json::{Map, Value}; +pub use {heed, meilisearch_tokenizer as tokenizer}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; pub use self::criterion::{default_criteria, Criterion, CriterionError}; From 288a879411542fa4f74d56dae587f1abd9898218 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 14 Mar 2022 17:00:00 +0100 Subject: [PATCH 1259/1889] Remove three useless dependencies --- http-ui/Cargo.toml | 1 - http-ui/src/main.rs | 2 +- milli/Cargo.toml | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 34b302f03..d8c1775f5 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,6 @@ anyhow = "1.0.38" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } crossbeam-channel = "0.5.0" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" } memmap2 = "0.5.0" milli = { path = "../milli" } once_cell = "1.5.2" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ebfe4b073..b608e79ec 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -18,8 +18,8 @@ use either::Either; use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use milli::documents::DocumentBatchReader; +use milli::tokenizer::{Analyzer, AnalyzerConfig}; use milli::update::UpdateIndexingStep::*; use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 43123d53b..107674db1 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -12,15 +12,12 @@ byteorder = "1.4.2" concat-arrays = "0.1.2" crossbeam-channel = "0.5.1" either = "1.6.1" -flate2 = "1.0.20" fst = "0.4.5" fxhash = "0.2.1" grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } -human_format = "1.0.3" 
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } -linked-hash-map = "0.5.4" meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" } memmap2 = "0.5.0" obkv = "0.2.0" From 63682c2c9a9c0b01a54db7e84b320cd2818a6178 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 14 Mar 2022 17:00:53 +0100 Subject: [PATCH 1260/1889] Upgrade the dependencies --- benchmarks/Cargo.toml | 16 +++++++-------- cli/Cargo.toml | 14 ++++++------- filter-parser/Cargo.toml | 2 +- helpers/Cargo.toml | 6 +++--- http-ui/Cargo.toml | 42 +++++++++++++++++++------------------- infos/Cargo.toml | 12 +++++------ milli/Cargo.toml | 44 ++++++++++++++++++++-------------------- 7 files changed, 68 insertions(+), 68 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b48599679..0cac5e017 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -6,8 +6,8 @@ publish = false [dependencies] milli = { path = "../milli" } -anyhow = "1.0" -serde_json = { version = "1.0.62", features = ["preserve_order"] } +anyhow = "1.0.56" +serde_json = { version = "1.0.79", features = ["preserve_order"] } csv = "1.1.6" [target.'cfg(target_os = "linux")'.dependencies] @@ -15,14 +15,14 @@ jemallocator = "0.3.2" [dev-dependencies] heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } -criterion = { version = "0.3.4", features = ["html_reports"] } +criterion = { version = "0.3.5", features = ["html_reports"] } [build-dependencies] -anyhow = "1.0" -bytes = "1.0" -flate2 = "1.0.20" -convert_case = "0.4" -reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false } +anyhow = "1.0.56" +bytes = "1.1.0" +flate2 = "1.0.22" +convert_case = "0.5.0" +reqwest = { version = "0.11.9", features = ["blocking", "rustls-tls"], default-features = false } [[bench]] name = "search_songs" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 023cd06f6..4378902ca 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -8,15 +8,15 @@ description = "A CLI to interact with a milli index" [dependencies] indicatif = "0.16.2" -serde = "1.0.129" -serde_json = "1.0.66" -structopt = "0.3.22" +serde = "1.0.136" +serde_json = "1.0.79" +structopt = "0.3.26" milli = { path = "../milli" } -eyre = "0.6.5" -color-eyre = "0.5.11" +eyre = "0.6.7" +color-eyre = "0.6.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } -byte-unit = { version = "4.0.12", features = ["serde"] } -bimap = "0.6.1" +byte-unit = { version = "4.0.14", features = ["serde"] } +bimap = "0.6.2" csv = "1.1.6" stderrlog = "0.5.1" diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index ee44bcb7f..ea29404ed 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -6,5 +6,5 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -nom = "7.0.0" +nom = "7.1.0" nom_locate = "4.0.0" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index dc0f7dc81..482750636 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -5,12 +5,12 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] -anyhow = "1.0.38" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } +anyhow = "1.0.56" +byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } 
stderrlog = "0.5.1" -structopt = { version = "0.3.21", default-features = false } +structopt = { version = "0.3.26", default-features = false } [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index d8c1775f5..9dd269970 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -6,42 +6,42 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] -anyhow = "1.0.38" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -crossbeam-channel = "0.5.0" +anyhow = "1.0.56" +byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } +crossbeam-channel = "0.5.2" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } -memmap2 = "0.5.0" +memmap2 = "0.5.3" milli = { path = "../milli" } -once_cell = "1.5.2" -rayon = "1.5.0" -structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] } -tempfile = "3.2.0" +once_cell = "1.10.0" +rayon = "1.5.1" +structopt = { version = "0.3.26", default-features = false, features = ["wrap_help"] } +tempfile = "3.3.0" # http server -askama = "0.10.5" -askama_warp = "0.10.0" -bytes = "0.5.6" +askama = "0.11.1" +askama_warp = "0.12.0" +bytes = "1.1.0" either = "1.6.1" -flate2 = "1.0.20" -futures = "0.3.12" -serde = { version = "1.0.123", features = ["derive"] } -serde_json = { version = "1.0.62", features = ["preserve_order"] } -tokio = { version = "0.2.25", features = ["full"] } -warp = "0.2.5" +flate2 = "1.0.22" +futures = "0.3.21" +serde = { version = "1.0.136", features = ["derive"] } +serde_json = { version = "1.0.79", features = ["preserve_order"] } +tokio = { version = "1.17.0", features = ["full"] } +warp = "0.3.2" # logging log = "0.4.14" stderrlog = "0.5.1" -fst = "0.4.5" +fst = "0.4.7" # Temporary fix for bitvec, remove once fixed. 
(https://github.com/bitvecto-rs/bitvec/issues/105) -funty = "=1.1" -bimap = "0.6.1" +funty = "2.0.0" +bimap = "0.6.2" csv = "1.1.6" [dev-dependencies] maplit = "1.0.2" -serde_test = "1.0.125" +serde_test = "1.0.136" [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 41c9241ba..2863695f0 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -5,15 +5,15 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] -anyhow = "1.0.38" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -csv = "1.1.5" +anyhow = "1.0.56" +byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } +csv = "1.1.6" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } -roaring = "0.6.6" -serde_json = "1.0.62" +roaring = "0.9.0" +serde_json = "1.0.79" stderrlog = "0.5.1" -structopt = { version = "0.3.21", default-features = false } +structopt = { version = "0.3.26", default-features = false } [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 107674db1..ef89e7819 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -5,50 +5,50 @@ authors = ["Kerollmops "] edition = "2018" [dependencies] -bimap = { version = "0.6.1", features = ["serde"] } +bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" -bstr = "0.2.15" -byteorder = "1.4.2" +bstr = "0.2.17" +byteorder = "1.4.3" concat-arrays = "0.1.2" -crossbeam-channel = "0.5.1" +crossbeam-channel = "0.5.2" either = "1.6.1" -fst = "0.4.5" +fst = "0.4.7" fxhash = "0.2.1" grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } -levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" } -memmap2 = "0.5.0" +levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.8" } +memmap2 = "0.5.3" obkv = "0.2.0" -once_cell = "1.5.2" -ordered-float = "2.1.1" -rayon = "1.5.0" -roaring = "0.6.6" -rstar = { version = "0.9.1", features = ["serde"] } -serde = { version = "1.0.123", features = ["derive"] } -serde_json = { version = "1.0.62", features = ["preserve_order"] } -slice-group-by = "0.2.6" -smallstr = { version = "0.2.0", features = ["serde"] } -smallvec = "1.6.1" -tempfile = "3.2.0" +once_cell = "1.10.0" +ordered-float = "2.10.0" +rayon = "1.5.1" +roaring = "0.9.0" +rstar = { version = "0.9.2", features = ["serde"] } +serde = { version = "1.0.136", features = ["derive"] } +serde_json = { version = "1.0.79", features = ["preserve_order"] } +slice-group-by = "0.3.0" +smallstr = { version = "0.3.0", features = ["serde"] } +smallvec = "1.8.0" +tempfile = "3.3.0" time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } uuid = { version = "0.8.2", features = ["v4"] } filter-parser = { path = "../filter-parser" } # documents words self-join -itertools = "0.10.0" +itertools = "0.10.3" # logging log = "0.4.14" -logging_timer = "1.0.0" +logging_timer = "1.1.0" csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" maplit = "1.0.2" -rand = "0.8.3" +rand = "0.8.5" [features] default = [] From 
21ec334dcc056847a5f2bbcb799946bc5c298706 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 14 Mar 2022 17:13:07 +0100 Subject: [PATCH 1261/1889] Fix the compilation error of the dependency versions --- http-ui/Cargo.toml | 1 + http-ui/src/main.rs | 9 ++++--- .../cbo_roaring_bitmap_codec.rs | 25 ++++++++++--------- milli/src/search/criteria/mod.rs | 6 +++-- milli/src/search/query_tree.rs | 3 +-- milli/src/update/delete_documents.rs | 6 ++--- 6 files changed, 27 insertions(+), 23 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 9dd269970..e7ed8455a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -27,6 +27,7 @@ futures = "0.3.21" serde = { version = "1.0.136", features = ["derive"] } serde_json = { version = "1.0.79", features = ["preserve_order"] } tokio = { version = "1.17.0", features = ["full"] } +tokio-stream = { version = "0.1.8", default-features = false, features = ["sync"] } warp = "0.3.2" # logging diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index b608e79ec..26c1034eb 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; -use std::io::{BufRead, BufReader, Cursor}; +use std::io::{BufRead, BufReader, Cursor, Read}; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -35,6 +35,7 @@ use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; use tokio::sync::broadcast; +use tokio_stream::wrappers::BroadcastStream; use warp::filters::ws::Message; use warp::http::Response; use warp::Filter; @@ -885,7 +886,8 @@ async fn main() -> anyhow::Result<()> { let mut file = TFile::from_std(file); while let Some(result) = stream.next().await { - let bytes = result.unwrap().to_bytes(); + let mut bytes = Vec::new(); + result.unwrap().reader().read_to_end(&mut bytes).unwrap(); file.write_all(&bytes[..]).await.unwrap(); } @@ -1004,8 +1006,7 @@ async fn main() -> anyhow::Result<()> { let update_status_receiver = update_status_sender.subscribe(); ws.on_upgrade(|websocket| { // Just echo all updates messages... - update_status_receiver - .into_stream() + BroadcastStream::new(update_status_receiver) .flat_map(|result| match result { Ok(status) => { let msg = serde_json::to_string(&status).unwrap(); diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 519997274..96aee6855 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -82,7 +82,8 @@ impl CboRoaringBitmapCodec { buffer.extend_from_slice(&integer.to_ne_bytes()); } } else { - let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()); + // Integers *must* be ordered here, no matter what. 
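+                // (Since roaring 0.9, `from_sorted_iter` returns a `Result` and fails
+                // on unsorted input: `RoaringBitmap::from_sorted_iter(vec![2u32, 1])`
+                // is an error while `RoaringBitmap::from_sorted_iter(vec![1u32, 2])`
+                // succeeds, hence the `unwrap` below.)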
+ let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); roaring.serialize_into(buffer)?; } } else { @@ -152,25 +153,25 @@ mod tests { let mut buffer = Vec::new(); let small_data = vec![ - RoaringBitmap::from_sorted_iter(1..4), - RoaringBitmap::from_sorted_iter(2..5), - RoaringBitmap::from_sorted_iter(4..6), - RoaringBitmap::from_sorted_iter(1..3), + RoaringBitmap::from_sorted_iter(1..4).unwrap(), + RoaringBitmap::from_sorted_iter(2..5).unwrap(), + RoaringBitmap::from_sorted_iter(4..6).unwrap(), + RoaringBitmap::from_sorted_iter(1..3).unwrap(), ]; let small_data: Vec<_> = small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap(); let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); - let expected = RoaringBitmap::from_sorted_iter(1..6); + let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap(); assert_eq!(bitmap, expected); let medium_data = vec![ - RoaringBitmap::from_sorted_iter(1..4), - RoaringBitmap::from_sorted_iter(2..5), - RoaringBitmap::from_sorted_iter(4..8), - RoaringBitmap::from_sorted_iter(0..3), - RoaringBitmap::from_sorted_iter(7..23), + RoaringBitmap::from_sorted_iter(1..4).unwrap(), + RoaringBitmap::from_sorted_iter(2..5).unwrap(), + RoaringBitmap::from_sorted_iter(4..8).unwrap(), + RoaringBitmap::from_sorted_iter(0..3).unwrap(), + RoaringBitmap::from_sorted_iter(7..23).unwrap(), ]; let medium_data: Vec<_> = @@ -179,7 +180,7 @@ mod tests { CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap(); let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); - let expected = RoaringBitmap::from_sorted_iter(0..23); + let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap(); assert_eq!(bitmap, expected); } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 8306f5d0e..1dbfd2524 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -498,6 +498,7 @@ fn query_pair_proximity_docids( #[cfg(test)] pub mod test { use std::collections::HashMap; + use std::iter; use maplit::hashmap; use rand::rngs::StdRng; @@ -567,7 +568,8 @@ pub mod test { .iter() .enumerate() .map(|(i, w)| { - (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32))) + let bitmap = RoaringBitmap::from_sorted_iter(iter::once(i as u32)).unwrap(); + (w.clone(), bitmap) }) .collect()) } else { @@ -622,7 +624,7 @@ pub mod test { } values.sort_unstable(); - RoaringBitmap::from_sorted_iter(values.into_iter()) + RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() } let word_docids = hashmap! { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 0744231ae..237bb9be2 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -587,8 +587,7 @@ mod test { values.push(rng.gen()); } values.sort_unstable(); - - RoaringBitmap::from_sorted_iter(values.into_iter()) + RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() } TestContext { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 2391bd0e4..402cc61dd 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -186,7 +186,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We create the FST map of the external ids that we must delete. 
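        // (`fst::Set::from_iter` requires its keys to be inserted in lexicographic
        // order, hence the `sort_unstable` just below.)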
external_ids.sort_unstable(); - let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?; + let external_ids_to_delete = fst::Set::from_iter(external_ids)?; // We acquire the current external documents ids map... let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; @@ -209,7 +209,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // the LMDB B-Tree two times but only once. let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; if let Some((key, mut docids)) = iter.next().transpose()? { - if key == word.as_ref() { + if key == word.as_str() { let previous_len = docids.len(); docids -= &self.documents_ids; if docids.is_empty() { @@ -230,7 +230,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { words.iter().filter_map( |(word, must_remove)| { if *must_remove { - Some(word.as_ref()) + Some(word.as_str()) } else { None } From 0c5f4ed7de9a05d456af6245f8990eebcf7e236d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 15 Mar 2022 14:18:29 +0100 Subject: [PATCH 1262/1889] Apply suggestions Co-authored-by: Many --- milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 96aee6855..1bd132974 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -82,7 +82,7 @@ impl CboRoaringBitmapCodec { buffer.extend_from_slice(&integer.to_ne_bytes()); } } else { - // Integers *must* be ordered here, no matter what. + // We can unwrap safely because the vector is sorted upper. 
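+                // (That is, the vector was already sorted further up in this
+                // function, so the `unwrap` cannot fail.)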
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); roaring.serialize_into(buffer)?; } From 08a06b49f00f4a3457b260b779a4f5f43b7d694d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Mar 2022 13:53:26 +0100 Subject: [PATCH 1263/1889] Bump version to 0.23.1 --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 4378902ca..fc0eab435 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.23.0" +version = "0.23.1" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 482750636..1049de3a9 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.23.0" +version = "0.23.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index e7ed8455a..434855fd4 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.23.0" +version = "0.23.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 2863695f0..c6387f431 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.23.0" +version = "0.23.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ef89e7819..f4fb24feb 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.23.0" +version = "0.23.1" authors = ["Kerollmops "] edition = "2018" From d68fe2b3c7600ec6c280693d338cba698cab1f77 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 15 Mar 2022 15:56:07 +0100 Subject: [PATCH 1264/1889] optimize word prefix fst --- milli/src/update/words_prefixes_fst.rs | 39 +++++++++++++------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 49406deb5..0977bc9f0 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -1,7 +1,6 @@ use std::iter::FromIterator; -use std::str; -use fst::Streamer; +use fst::{SetBuilder, Streamer}; use crate::{Index, Result, SmallString32}; @@ -44,43 +43,45 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { pub fn execute(self) -> Result<()> { let words_fst = self.index.words_fst(&self.wtxn)?; - let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); - for n in 1..=self.max_prefix_length { - let mut current_prefix = SmallString32::new(); - let mut current_prefix_count = 0; - let mut builder = fst::SetBuilder::memory(); + let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length]; + let mut current_prefix_count = vec![0; self.max_prefix_length]; + let mut builders: Vec<_> = + std::iter::repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect(); + + let mut stream = words_fst.stream(); + while let Some(bytes) = stream.next() { + for n in 0..self.max_prefix_length { + let current_prefix = &mut current_prefix[n]; + let current_prefix_count = &mut current_prefix_count[n]; + let builder = &mut builders[n]; - let mut stream = words_fst.stream(); - while let Some(bytes) = stream.next() { // We try to get the first n bytes out 
of this string but we only want // to split at valid characters bounds. If we try to split in the middle of // a character we ignore this word and go to the next one. - let word = str::from_utf8(bytes)?; - let prefix = match word.get(..n) { + let word = std::str::from_utf8(bytes)?; + let prefix = match word.get(..=n) { Some(prefix) => prefix, None => continue, }; // This is the first iteration of the loop, // or the current word doesn't starts with the current prefix. - if current_prefix_count == 0 || prefix != current_prefix.as_str() { - current_prefix = SmallString32::from(prefix); - current_prefix_count = 0; + if *current_prefix_count == 0 || prefix != current_prefix.as_str() { + *current_prefix = SmallString32::from(prefix); + *current_prefix_count = 0; } - current_prefix_count += 1; + *current_prefix_count += 1; // There is enough words corresponding to this prefix to add it to the cache. - if current_prefix_count >= self.threshold { + if *current_prefix_count >= self.threshold { builder.insert(prefix)?; } } - - // We construct the final set for prefixes of size n. - prefix_fsts.push(builder.into_set()); } // We merge all of the previously computed prefixes into on final set. + let prefix_fsts: Vec<_> = builders.into_iter().map(|sb| sb.into_set()).collect(); let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); let mut builder = fst::SetBuilder::memory(); builder.extend_stream(op.r#union())?; From d633ac5b9d6c7229d50b1eaacbe57ae9cc5d9ae6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 15 Mar 2022 16:37:22 +0100 Subject: [PATCH 1265/1889] optimize word prefix pair --- .../word_prefix_pair_proximity_docids.rs | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 284bb8981..be0ddf005 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -155,20 +155,20 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // All of the word prefix pairs in the database that have a w2 // that is contained in the `suppr_pw` set must be removed as well. - let mut iter = self - .index - .word_prefix_pair_proximity_docids - .remap_data_type::() - .iter_mut(self.wtxn)?; - while let Some(((_, w2, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(w2.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; + if !del_prefix_fst_words.is_empty() { + let mut iter = self + .index + .word_prefix_pair_proximity_docids + .remap_data_type::() + .iter_mut(self.wtxn)?; + while let Some(((_, w2, _), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(w2.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } } } - drop(iter); - // We finally write and merge the new word prefix pair proximity docids // in the LMDB database. 
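        // (The sorter streams its merged entries out in key order, ready to be
        // merged into the LMDB database below.)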
sorter_into_lmdb_database( From d127c57f2de034378fca1adec7c622744efbbf28 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 15 Mar 2022 17:12:48 +0100 Subject: [PATCH 1266/1889] review edits --- milli/src/update/words_prefixes_fst.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 0977bc9f0..95c9f3b01 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -1,4 +1,5 @@ -use std::iter::FromIterator; +use std::iter::{repeat_with, FromIterator}; +use std::str; use fst::{SetBuilder, Streamer}; @@ -45,8 +46,8 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length]; let mut current_prefix_count = vec![0; self.max_prefix_length]; - let mut builders: Vec<_> = - std::iter::repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect(); + let mut builders = + repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect::>(); let mut stream = words_fst.stream(); while let Some(bytes) = stream.next() { @@ -58,7 +59,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { // We try to get the first n bytes out of this string but we only want // to split at valid characters bounds. If we try to split in the middle of // a character we ignore this word and go to the next one. - let word = std::str::from_utf8(bytes)?; + let word = str::from_utf8(bytes)?; let prefix = match word.get(..=n) { Some(prefix) => prefix, None => continue, From 628c835a220c4b29f12bc23b40ff90ca5292a620 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 2 Feb 2022 18:45:11 +0100 Subject: [PATCH 1267/1889] fix tests --- Cargo.toml | 3 +++ milli/src/search/mod.rs | 30 +++++++++++++++++++--------- milli/src/search/query_tree.rs | 17 +++++++++------- milli/tests/assets/test_set.ndjson | 2 +- milli/tests/search/query_criteria.rs | 1 + 5 files changed, 36 insertions(+), 17 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6b3e12f07..52599b1bd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,3 +18,6 @@ opt-level = 3 opt-level = 3 [profile.test.build-override] opt-level = 3 + +[patch.crates-io] +fst = { git = "https://github.com/MarinPostma/fst.git", rev = "e6c606b7507e8cb5e502d1609f9b909b8690bac5" } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 67b86d6bf..bfe5e023c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -70,6 +70,7 @@ impl<'a> Search<'a> { pub fn offset(&mut self, offset: usize) -> &mut Search<'a> { self.offset = offset; + self } @@ -301,23 +302,34 @@ pub fn word_derivations<'c>( if max_typo == 1 { let dfa = build_dfa(word, 1, is_prefix); let starts = Str::new(get_first(word)).starts_with(); - let mut stream = fst.search(starts.intersection(&dfa)).into_stream(); + let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream(); - while let Some(word) = stream.next() { + while let Some((word, state)) = stream.next() { let word = std::str::from_utf8(word)?; - derived_words.push((word.to_string(), 1)); + let d = dfa.distance(state.1); + derived_words.push((word.to_string(), d.to_u8())); } } else { let starts = Str::new(get_first(word)).starts_with(); let first = build_dfa(word, 1, is_prefix).intersection((&starts).complement()); - let second = build_dfa(word, 2, is_prefix).intersection(&starts); - let automaton = first.union(second); + let second_dfa = build_dfa(word, 2, is_prefix); + let second = (&second_dfa).intersection(&starts); + 
let automaton = first.union(&second); - let mut stream = fst.search(automaton).into_stream(); + let mut stream = fst.search_with_state(automaton).into_stream(); - while let Some(word) = stream.next() { - let word = std::str::from_utf8(word)?; - derived_words.push((word.to_string(), 2)); + while let Some((found_word, state)) = stream.next() { + let found_word = std::str::from_utf8(found_word)?; + // in the case the typo is on the first letter, we know the number of typo + // is two + if get_first(found_word) != get_first(word) { + derived_words.push((word.to_string(), 2)); + } else { + // Else, we know that it is the second dfa that matched and compute the + // correct distance + let d = second_dfa.distance((state.1).0); + derived_words.push((word.to_string(), d.to_u8())); + } } } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 355e42663..a7285ccaa 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -365,7 +365,10 @@ fn create_query_tree( .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); - let query = Query { prefix: is_prefix, kind: typos(concat, true, 1) }; + let query = Query { + prefix: is_prefix, + kind: typos(concat, authorize_typos, 1), + }; operations.push(Operation::Query(query)); and_op_children.push(Operation::or(false, operations)); } @@ -657,7 +660,7 @@ mod test { ]), Operation::Query(Query { prefix: true, - kind: QueryKind::tolerant(2, "heyfriends".to_string()), + kind: QueryKind::tolerant(1, "heyfriends".to_string()), }), ], ); @@ -690,7 +693,7 @@ mod test { ]), Operation::Query(Query { prefix: false, - kind: QueryKind::tolerant(2, "heyfriends".to_string()), + kind: QueryKind::tolerant(1, "heyfriends".to_string()), }), ], ); @@ -755,7 +758,7 @@ mod test { ]), Operation::Query(Query { prefix: false, - kind: QueryKind::tolerant(2, "helloworld".to_string()), + kind: QueryKind::tolerant(1, "helloworld".to_string()), }), ], ); @@ -853,7 +856,7 @@ mod test { ]), Operation::Query(Query { prefix: false, - kind: QueryKind::tolerant(2, "newyorkcity".to_string()), + kind: QueryKind::tolerant(1, "newyorkcity".to_string()), }), ], ), @@ -927,7 +930,7 @@ mod test { ]), Operation::Query(Query { prefix: false, - kind: QueryKind::tolerant(2, "wordsplitfish".to_string()), + kind: QueryKind::tolerant(1, "wordsplitfish".to_string()), }), ], ); @@ -1047,7 +1050,7 @@ mod test { ]), Operation::Query(Query { prefix: false, - kind: QueryKind::tolerant(2, "heymyfriend".to_string()), + kind: QueryKind::tolerant(1, "heymyfriend".to_string()), }), ], ), diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 9a0fe5b0a..6383d274e 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -8,7 +8,7 @@ {"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""} {"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady 
antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""} {"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"ello creation system","description":"in few word ello was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""} {"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""} {"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""} {"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""} diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 0dcbf660e..ef080db9f 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -61,6 +61,7 @@ test_criterion!( vec![Attribute], vec![] ); +test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Typo], vec![]); test_criterion!( attribute_disallow_typo, DISALLOW_OPTIONAL_WORDS, From 3f24555c3d16b3078ef0182980341e2fbdc3ea43 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 15 Mar 2022 17:28:57 +0100 Subject: [PATCH 1268/1889] custom fst automatons --- Cargo.toml | 3 - milli/src/search/fst_utils.rs | 187 ++++++++++++++++++++++++++++++++++ milli/src/search/mod.rs | 16 +-- 3 files changed, 196 insertions(+), 10 deletions(-) create mode 100644 milli/src/search/fst_utils.rs diff --git a/Cargo.toml b/Cargo.toml index 52599b1bd..6b3e12f07 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,3 @@ opt-level = 3 opt-level = 3 [profile.test.build-override] opt-level = 3 - -[patch.crates-io] -fst = { git = "https://github.com/MarinPostma/fst.git", rev = "e6c606b7507e8cb5e502d1609f9b909b8690bac5" } diff --git a/milli/src/search/fst_utils.rs b/milli/src/search/fst_utils.rs new file mode 100644 index 000000000..b488e6c19 
--- /dev/null
+++ b/milli/src/search/fst_utils.rs
@@ -0,0 +1,187 @@
+/// This mod is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged.
+/// All credits for this code go to BurntSushi.
+use fst::Automaton;
+
+pub struct StartsWith<A>(pub A);
+
+/// The `Automaton` state for `StartsWith`.
+pub struct StartsWithState<A: Automaton>(pub StartsWithStateKind<A>);
+
+impl<A: Automaton> Clone for StartsWithState<A>
+where
+    A::State: Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone())
+    }
+}
+
+/// The inner state of a `StartsWithState`.
+pub enum StartsWithStateKind<A: Automaton> {
+    /// Sink state that is reached when the automaton has matched the prefix.
+    Done,
+    /// State in which the automaton is while it hasn't matched the prefix.
+    Running(A::State),
+}
+
+impl<A: Automaton> Clone for StartsWithStateKind<A>
+where
+    A::State: Clone,
+{
+    fn clone(&self) -> Self {
+        match self {
+            StartsWithStateKind::Done => StartsWithStateKind::Done,
+            StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()),
+        }
+    }
+}
+
+impl<A: Automaton> Automaton for StartsWith<A> {
+    type State = StartsWithState<A>;
+
+    fn start(&self) -> StartsWithState<A> {
+        StartsWithState({
+            let inner = self.0.start();
+            if self.0.is_match(&inner) {
+                StartsWithStateKind::Done
+            } else {
+                StartsWithStateKind::Running(inner)
+            }
+        })
+    }
+    fn is_match(&self, state: &StartsWithState<A>) -> bool {
+        match state.0 {
+            StartsWithStateKind::Done => true,
+            StartsWithStateKind::Running(_) => false,
+        }
+    }
+    fn can_match(&self, state: &StartsWithState<A>) -> bool {
+        match state.0 {
+            StartsWithStateKind::Done => true,
+            StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
+        }
+    }
+    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
+        match state.0 {
+            StartsWithStateKind::Done => true,
+            StartsWithStateKind::Running(_) => false,
+        }
+    }
+    fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
+        StartsWithState(match state.0 {
+            StartsWithStateKind::Done => StartsWithStateKind::Done,
+            StartsWithStateKind::Running(ref inner) => {
+                let next_inner = self.0.accept(inner, byte);
+                if self.0.is_match(&next_inner) {
+                    StartsWithStateKind::Done
+                } else {
+                    StartsWithStateKind::Running(next_inner)
+                }
+            }
+        })
+    }
+}
+/// An automaton that matches when one of its component automata match.
+#[derive(Clone, Debug)]
+pub struct Union<A, B>(pub A, pub B);
+
+/// The `Automaton` state for `Union`.
+pub struct UnionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);
+
+impl<A: Automaton, B: Automaton> Clone for UnionState<A, B>
+where
+    A::State: Clone,
+    B::State: Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone(), self.1.clone())
+    }
+}
+
+impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
+    type State = UnionState<A, B>;
+    fn start(&self) -> UnionState<A, B> {
+        UnionState(self.0.start(), self.1.start())
+    }
+    fn is_match(&self, state: &UnionState<A, B>) -> bool {
+        self.0.is_match(&state.0) || self.1.is_match(&state.1)
+    }
+    fn can_match(&self, state: &UnionState<A, B>) -> bool {
+        self.0.can_match(&state.0) || self.1.can_match(&state.1)
+    }
+    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
+        self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
+    }
+    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
+        UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
+    }
+}
+/// An automaton that matches when both of its component automata match.
+#[derive(Clone, Debug)]
+pub struct Intersection<A, B>(pub A, pub B);
+
+/// The `Automaton` state for `Intersection`.
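+/// For example, `Intersection(StartsWith(Str::new("he")), &dfa)` accepts only
+/// the keys that both start with `he` and are accepted by the `dfa`.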
+pub struct IntersectionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);
+
+impl<A: Automaton, B: Automaton> Clone for IntersectionState<A, B>
+where
+    A::State: Clone,
+    B::State: Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone(), self.1.clone())
+    }
+}
+
+impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
+    type State = IntersectionState<A, B>;
+    fn start(&self) -> IntersectionState<A, B> {
+        IntersectionState(self.0.start(), self.1.start())
+    }
+    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
+        self.0.is_match(&state.0) && self.1.is_match(&state.1)
+    }
+    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
+        self.0.can_match(&state.0) && self.1.can_match(&state.1)
+    }
+    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
+        self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
+    }
+    fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
+        IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
+    }
+}
+/// An automaton that matches exactly when the automaton it wraps does not.
+#[derive(Clone, Debug)]
+pub struct Complement<A>(pub A);
+
+/// The `Automaton` state for `Complement`.
+pub struct ComplementState<A: Automaton>(pub A::State);
+
+impl<A: Automaton> Clone for ComplementState<A>
+where
+    A::State: Clone,
+{
+    fn clone(&self) -> Self {
+        Self(self.0.clone())
+    }
+}
+
+impl<A: Automaton> Automaton for Complement<A> {
+    type State = ComplementState<A>;
+    fn start(&self) -> ComplementState<A> {
+        ComplementState(self.0.start())
+    }
+    fn is_match(&self, state: &ComplementState<A>) -> bool {
+        !self.0.is_match(&state.0)
+    }
+    fn can_match(&self, state: &ComplementState<A>) -> bool {
+        !self.0.will_always_match(&state.0)
+    }
+    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
+        !self.0.can_match(&state.0)
+    }
+    fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
+        ComplementState(self.0.accept(&state.0, byte))
+    }
+}
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index bfe5e023c..40e4bca24 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -16,6 +16,7 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;

 pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
+use self::fst_utils::{Complement, Intersection, StartsWith, Union};
 pub use self::matching_words::MatchingWords;
 use self::query_tree::QueryTreeBuilder;
 use crate::error::UserError;
@@ -30,6 +31,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));

 mod criteria;
 mod distinct;
 mod facet;
+mod fst_utils;
 mod matching_words;
 mod query_tree;
@@ -70,7 +72,6 @@ impl<'a> Search<'a> {

     pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
         self.offset = offset;
-
         self
     }

@@ -301,8 +302,9 @@ pub fn word_derivations<'c>(
     } else {
         if max_typo == 1 {
             let dfa = build_dfa(word, 1, is_prefix);
-            let starts = Str::new(get_first(word)).starts_with();
-            let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream();
+            let starts = StartsWith(Str::new(get_first(word)));
+            let mut stream =
+                fst.search_with_state(Intersection(starts, &dfa)).into_stream();

             while let Some((word, state)) = stream.next() {
                 let word = std::str::from_utf8(word)?;
@@ -310,11 +312,11 @@ pub fn word_derivations<'c>(
                 derived_words.push((word.to_string(), d.to_u8()));
             }
         } else {
-            let starts = Str::new(get_first(word)).starts_with();
-            let first = build_dfa(word, 1, is_prefix).intersection((&starts).complement());
+            let starts = StartsWith(Str::new(get_first(word)));
+            let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
             let second_dfa = build_dfa(word, 2,
is_prefix); - let second = (&second_dfa).intersection(&starts); - let automaton = first.union(&second); + let second = Intersection(&second_dfa, &starts); + let automaton = Union(first, &second); let mut stream = fst.search_with_state(automaton).into_stream(); From 4822fe1beb4825abe416a04148993b9fdcfec21b Mon Sep 17 00:00:00 2001 From: Bruno Casali Date: Tue, 15 Mar 2022 18:12:51 -0300 Subject: [PATCH 1269/1889] Add a better error message when the filterable attrs are empty Fixes https://github.com/meilisearch/meilisearch/issues/2140 --- milli/src/search/facet/filter.rs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index edc86d0ca..932fd21d9 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -39,12 +39,22 @@ impl<'a> std::error::Error for FilterError<'a> {} impl<'a> Display for FilterError<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::AttributeNotFilterable { attribute, filterable } => write!( - f, - "Attribute `{}` is not filterable. Available filterable attributes are: `{}`.", - attribute, - filterable, - ), + Self::AttributeNotFilterable { attribute, filterable } => { + if filterable.is_empty() { + write!( + f, + "Attribute `{}` is not filterable. This index does not have configured filterable attributes.", + attribute, + ) + } else { + write!( + f, + "Attribute `{}` is not filterable. Available filterable attributes are: `{}`.", + attribute, + filterable, + ) + } + }, Self::TooDeep => write!(f, "Too many filter conditions, can't process more than {} filters.", MAX_FILTER_DEPTH @@ -554,13 +564,13 @@ mod tests { let filter = Filter::from_str("_geoRadius(42, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!(error.to_string().starts_with( - "Attribute `_geo` is not filterable. Available filterable attributes are: ``." + "Attribute `_geo` is not filterable. This index does not have configured filterable attributes." )); let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); assert!(error.to_string().starts_with( - "Attribute `dog` is not filterable. Available filterable attributes are: ``." + "Attribute `dog` is not filterable. This index does not have configured filterable attributes." 
)); drop(rtxn); From 2a31cd13c912666121ac519005ddd6c592c6a2a6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 16 Mar 2022 11:47:27 +0100 Subject: [PATCH 1270/1889] set resolver to v2 --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 6b3e12f07..3f2732444 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,4 +1,5 @@ [workspace] +resolver = "2" members = ["milli", "filter-parser", "http-ui", "benchmarks", "infos", "helpers", "cli"] default-members = ["milli"] From adc71742c8dca54e731f23e96e594718ee7fe95f Mon Sep 17 00:00:00 2001 From: Bruno Casali Date: Tue, 15 Mar 2022 18:36:10 -0300 Subject: [PATCH 1271/1889] Move string concat to the struct instead of in the calling --- milli/src/search/facet/filter.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 932fd21d9..9388cfa33 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; use std::ops::Deref; @@ -27,7 +28,7 @@ pub struct Filter<'a> { #[derive(Debug)] enum FilterError<'a> { - AttributeNotFilterable { attribute: &'a str, filterable: String }, + AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet }, BadGeo(&'a str), BadGeoLat(f64), BadGeoLng(f64), @@ -39,19 +40,21 @@ impl<'a> std::error::Error for FilterError<'a> {} impl<'a> Display for FilterError<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::AttributeNotFilterable { attribute, filterable } => { - if filterable.is_empty() { + Self::AttributeNotFilterable { attribute, filterable_fields } => { + if filterable_fields.is_empty() { write!( f, "Attribute `{}` is not filterable. This index does not have configured filterable attributes.", attribute, ) } else { + let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::>().join(" "); + write!( f, "Attribute `{}` is not filterable. 
Available filterable attributes are: `{}`.", attribute, - filterable, + filterables_list, ) } }, @@ -372,10 +375,7 @@ impl<'a> Filter<'a> { return Err(fid.as_external_error( FilterError::AttributeNotFilterable { attribute, - filterable: filterable_fields - .into_iter() - .collect::>() - .join(" "), + filterable_fields, }, ))?; } @@ -426,7 +426,7 @@ impl<'a> Filter<'a> { } else { return Err(point[0].as_external_error(FilterError::AttributeNotFilterable { attribute: "_geo", - filterable: filterable_fields.into_iter().collect::>().join(" "), + filterable_fields, }))?; } } From 49d59d88c286feffd07c996ce7af154db656e037 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 16 Mar 2022 16:12:52 +0100 Subject: [PATCH 1272/1889] Remove useless variables in proximity --- milli/src/search/criteria/proximity.rs | 37 +++++--------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index f884de160..2bfa61e85 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -11,7 +11,7 @@ use super::{ }; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; use crate::search::{build_dfa, WordDerivationsCache}; -use crate::{DocumentId, Position, Result}; +use crate::{Position, Result}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; @@ -90,7 +90,6 @@ impl<'t> Criterion for Proximity<'t> { self.ctx, query_tree, allowed_candidates, - params.wdcache, )?; self.plane_sweep_cache = Some(cache.into_iter()); @@ -343,7 +342,6 @@ fn resolve_plane_sweep_candidates( ctx: &dyn Context, query_tree: &Operation, allowed_candidates: &RoaringBitmap, - wdcache: &mut WordDerivationsCache, ) -> Result> { /// FIXME may be buggy with query like "new new york" fn plane_sweep( @@ -467,12 +465,9 @@ fn resolve_plane_sweep_candidates( } fn resolve_operation<'a>( - ctx: &dyn Context, query_tree: &'a Operation, - docid: DocumentId, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, words_positions: &HashMap, - wdcache: &mut WordDerivationsCache, ) -> Result> { use Operation::{And, Or, Phrase}; @@ -484,14 +479,7 @@ fn resolve_plane_sweep_candidates( And(ops) => { let mut groups_positions = Vec::with_capacity(ops.len()); for operation in ops { - let positions = resolve_operation( - ctx, - operation, - docid, - rocache, - words_positions, - wdcache, - )?; + let positions = resolve_operation(operation, rocache, words_positions)?; groups_positions.push(positions); } plane_sweep(groups_positions, false)? @@ -501,7 +489,7 @@ fn resolve_plane_sweep_candidates( for word in words { let positions = match words_positions.get(word) { Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), - None => vec![], + None => return Ok(vec![]), }; groups_positions.push(positions); } @@ -510,14 +498,7 @@ fn resolve_plane_sweep_candidates( Or(_, ops) => { let mut result = Vec::new(); for op in ops { - result.extend(resolve_operation( - ctx, - op, - docid, - rocache, - words_positions, - wdcache, - )?) + result.extend(resolve_operation(op, rocache, words_positions)?) 
} result.sort_unstable(); @@ -572,14 +553,8 @@ fn resolve_plane_sweep_candidates( for docid in allowed_candidates { let words_positions = ctx.docid_words_positions(docid)?; resolve_operation_cache.clear(); - let positions = resolve_operation( - ctx, - query_tree, - docid, - &mut resolve_operation_cache, - &words_positions, - wdcache, - )?; + let positions = + resolve_operation(query_tree, &mut resolve_operation_cache, &words_positions)?; let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7); candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid); From 5dc464b9a73f39102711504410e4a507c0945022 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 21 Mar 2022 17:29:10 +0100 Subject: [PATCH 1273/1889] rollback meilisearch-tokenizer version --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f4fb24feb..9e9869bf4 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,7 +18,7 @@ grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] geoutils = "0.4.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.8" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" } memmap2 = "0.5.3" obkv = "0.2.0" once_cell = "1.10.0" From 86dd88698d6de55d0a75f376895f62559268969a Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 21 Mar 2022 17:33:13 +0100 Subject: [PATCH 1274/1889] bump tokenizer --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 9e9869bf4..b45b58834 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,7 +18,7 @@ grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] geoutils = "0.4.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } memmap2 = "0.5.3" obkv = "0.2.0" once_cell = "1.10.0" From ddf78a735bda93e888d41d0430e98e6c1737336f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 24 Mar 2022 16:39:45 +0100 Subject: [PATCH 1275/1889] Update version (v0.24.1) --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index fc0eab435..0d1a6a6a0 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.23.1" +version = "0.24.1" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 1049de3a9..f654ab29b 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.23.1" +version = "0.24.1" authors = ["Clément Renault "] edition = "2018" diff --git 
a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 434855fd4..653de243a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.23.1" +version = "0.24.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index c6387f431..8fe140582 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.23.1" +version = "0.24.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b45b58834..d7eb7b9bf 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.23.1" +version = "0.24.1" authors = ["Kerollmops "] edition = "2018" From 6a77c81a28f7b430b855f5cdce0634863985c140 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 29 Mar 2022 09:45:29 -0700 Subject: [PATCH 1276/1889] Increase benchmarks (push) CI timeout --- .github/workflows/push_benchmarks_indexing.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml index 0806f5646..f00542001 100644 --- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -13,6 +13,7 @@ jobs: benchmarks: name: Run and upload benchmarks runs-on: benchmarks + timeout-minutes: 4320 # 72h steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 From c4653347fd296cbc4fb289bdb448dee313c35d79 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 16 Mar 2022 10:03:18 +0100 Subject: [PATCH 1277/1889] add authorize typo setting --- milli/src/index.rs | 20 ++++++++++++++++++++ milli/src/search/mod.rs | 6 +++++- milli/src/search/query_tree.rs | 2 -- milli/src/update/settings.rs | 25 +++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 3 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 568d50ad8..4e43e404e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -46,6 +46,7 @@ pub mod main_key { pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; pub const CREATED_AT_KEY: &str = "created-at"; pub const UPDATED_AT_KEY: &str = "updated-at"; + pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; } pub mod db_name { @@ -866,6 +867,25 @@ impl Index { ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>(wtxn, main_key::UPDATED_AT_KEY, &time) } + + pub fn authorize_typos(&self, txn: &RoTxn) -> heed::Result { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + match self.main.get::<_, Str, OwnedType>(txn, main_key::AUTHORIZE_TYPOS)? { + Some(0) => Ok(false), + _ => Ok(true), + } + } + + pub(crate) fn put_authorize_typos(&self, txn: &mut RwTxn, flag: bool) -> heed::Result<()> { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. 
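+        // For instance, `put_authorize_typos(txn, false)` stores `0u8`; a later
+        // `authorize_typos(txn)` then reads `Some(0)` and returns `false`, while an
+        // index where this key was never written keeps returning `true`.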
+ self.main.put::<_, Str, OwnedType>(txn, main_key::AUTHORIZE_TYPOS, &(flag as u8))?; + + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 40e4bca24..c9eef5a0d 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -112,7 +112,11 @@ impl<'a> Search<'a> { Some(query) => { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); - builder.authorize_typos(self.authorize_typos); + + // only authorize typos if both the index and the query allow it. + let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; + builder.authorize_typos(self.authorize_typos && index_authorizes_typos); + builder.words_limit(self.words_limit); // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index f3ee99d9e..5437199e1 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -191,7 +191,6 @@ impl<'a> QueryTreeBuilder<'a> { /// generated forcing all query words to be present in each matching documents /// (the criterion `words` will be ignored). /// default value if not called: `true` - #[allow(unused)] pub fn optional_words(&mut self, optional_words: bool) -> &mut Self { self.optional_words = optional_words; self @@ -201,7 +200,6 @@ impl<'a> QueryTreeBuilder<'a> { /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored). /// default value if not called: `true` - #[allow(unused)] pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self { self.authorize_typos = authorize_typos; self diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c413f81c3..25f3e92a3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -89,6 +89,7 @@ pub struct Settings<'a, 't, 'u, 'i> { distinct_field: Setting, synonyms: Setting>>, primary_key: Setting, + authorize_typos: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -109,6 +110,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { distinct_field: Setting::NotSet, synonyms: Setting::NotSet, primary_key: Setting::NotSet, + authorize_typos: Setting::NotSet, indexer_config, } } @@ -186,6 +188,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.primary_key = Setting::Set(primary_key); } + pub fn set_autorize_typos(&mut self, val: bool) { + self.authorize_typos = Setting::Set(val); + } + + pub fn reset_authorize_typos(&mut self) { + self.authorize_typos = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -450,6 +460,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_authorize_typos(&mut self) -> Result<()> { + match self.authorize_typos { + Setting::Set(flag) => { + self.index.put_authorize_typos(self.wtxn, flag)?; + Ok(()) + } + Setting::Reset => { + self.index.put_authorize_typos(self.wtxn, true)?; + Ok(()) + } + Setting::NotSet => Ok(()), + } + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -465,6 +489,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_distinct_field()?; self.update_criteria()?; self.update_primary_key()?; + self.update_authorize_typos()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. 
It means that the distinct attribute, From f782fe20625755e78d89b630ccf87fbee9bb8b59 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 09:54:49 +0200 Subject: [PATCH 1278/1889] add authorize_typo_test --- milli/src/index.rs | 14 ++++++++++++++ milli/src/search/mod.rs | 42 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 4e43e404e..badcac0e5 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1009,4 +1009,18 @@ pub(crate) mod tests { } ); } + + #[test] + fn put_and_retrieve_disable_typo() { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + // default value is true + assert!(index.authorize_typos(&txn).unwrap()); + // set to false + index.put_authorize_typos(&mut txn, false).unwrap(); + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + assert!(!index.authorize_typos(&txn).unwrap()); + } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index c9eef5a0d..4f753a607 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -105,6 +105,12 @@ impl<'a> Search<'a> { self } + fn is_typo_authorized(&self) -> Result { + let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; + // only authorize typos if both the index and the query allow it. + Ok(self.authorize_typos && index_authorizes_typos) + } + pub fn execute(&self) -> Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); @@ -113,9 +119,7 @@ impl<'a> Search<'a> { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); - // only authorize typos if both the index and the query allow it. - let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; - builder.authorize_typos(self.authorize_typos && index_authorizes_typos); + builder.authorize_typos(self.is_typo_authorized()?); builder.words_limit(self.words_limit); // We make sure that the analyzer is aware of the stop words @@ -364,3 +368,35 @@ pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { lev.build_dfa(word) } } + +#[cfg(test)] +mod test { + use crate::index::tests::TempIndex; + + use super::*; + + #[test] + fn test_is_authorized_typos() { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + + // default is authorized + assert!(search.is_typo_authorized().unwrap()); + + search.authorize_typos(false); + assert!(!search.is_typo_authorized().unwrap()); + + index.put_authorize_typos(&mut txn, false).unwrap(); + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + let mut search = Search::new(&txn, &index); + + assert!(!search.is_typo_authorized().unwrap()); + + search.authorize_typos(true); + assert!(!search.is_typo_authorized().unwrap()); + } +} From 6ef3bb9d83382c2797240a1a16a904e743e30081 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 14:06:23 +0200 Subject: [PATCH 1279/1889] fmt --- milli/src/search/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 4f753a607..614927877 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -371,9 +371,8 @@ pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { #[cfg(test)] mod test { - use crate::index::tests::TempIndex; - use super::*; + use crate::index::tests::TempIndex; #[test] fn test_is_authorized_typos() { From 
3e34981d9b29e11e02f549c582b015ffad097854 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 14:12:00 +0200 Subject: [PATCH 1280/1889] add test for authorize_typos in update --- milli/src/update/settings.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 25f3e92a3..17924da8a 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -518,6 +518,7 @@ mod tests { use super::*; use crate::error::Error; + use crate::index::tests::TempIndex; use crate::update::IndexDocuments; use crate::{Criterion, Filter, SearchResult}; @@ -1218,4 +1219,18 @@ mod tests { let line = std::str::from_utf8(content.get(fid).unwrap()).unwrap(); assert_eq!(line, r#""Star Wars""#); } + + #[test] + fn test_disable_typo() { + let index = TempIndex::new(); + + let mut txn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + assert!(index.authorize_typos(&txn).unwrap()); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_autorize_typos(false); + builder.execute(|_| ()).unwrap(); + assert!(!index.authorize_typos(&txn).unwrap()); + } } From d5ddc6b0805f6df260424274525bdb22b87b6b84 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 1 Apr 2022 10:51:22 +0200 Subject: [PATCH 1281/1889] fix 2 typos word derivation bug --- milli/src/search/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 614927877..95e26b594 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -333,12 +333,12 @@ pub fn word_derivations<'c>( // in the case the typo is on the first letter, we know the number of typo // is two if get_first(found_word) != get_first(word) { - derived_words.push((word.to_string(), 2)); + derived_words.push((found_word.to_string(), 2)); } else { // Else, we know that it is the second dfa that matched and compute the // correct distance let d = second_dfa.distance((state.1).0); - derived_words.push((word.to_string(), d.to_u8())); + derived_words.push((found_word.to_string(), d.to_u8())); } } } From 9fe40df960affbc4c42b4fc7f6fc6fef6e9219f3 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 1 Apr 2022 11:05:18 +0200 Subject: [PATCH 1282/1889] add word derivations tests --- milli/src/search/mod.rs | 63 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 95e26b594..0d33d9042 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -398,4 +398,67 @@ mod test { search.authorize_typos(true); assert!(!search.is_typo_authorized().unwrap()); } + + #[test] + fn test_one_typos_tolerance() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 1)]); + } + + #[test] + fn test_one_typos_first_letter() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[]); + } + + #[test] + fn test_two_typos_tolerance() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap(); + + 
assert_eq!(found, &[("zealand".to_string(), 2)]); + } + + #[test] + fn test_two_typos_first_letter() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 2)]); + } + + #[test] + fn test_prefix() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 0)]); + } + + #[test] + fn test_bad_prefix() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[]); + } + + #[test] + fn test_prefix_with_typo() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 1)]); + } } From 5a24e605728c7c6b2a80b5d90c1dc553ebe3f9ba Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 13:03:06 +0100 Subject: [PATCH 1283/1889] introduce word len for typo setting --- milli/src/error.rs | 2 ++ milli/src/index.rs | 41 ++++++++++++++++++++++++++++ milli/src/update/settings.rs | 53 ++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/milli/src/error.rs b/milli/src/error.rs index e6fbc0605..3ef6aa81d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -72,6 +72,7 @@ pub enum UserError { SerdeJson(serde_json::Error), SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, + InvalidMinTypoWordSetting(u8, u8), } impl From for Error { @@ -291,6 +292,7 @@ ranking rules settings to use the sort parameter at search time.", Self::UnknownInternalDocumentId { document_id } => { write!(f, "An unknown internal document id have been used: `{}`.", document_id) } + Self::InvalidMinTypoWordSetting(one, two) => write!(f, "Invalid settings for MinWordLenForTypo, expected 0 < 1-typo < 2-typos < 255, but found 1-typo: {} and 2-typo: {}", one, two), } } } diff --git a/milli/src/index.rs b/milli/src/index.rs index badcac0e5..3c1ba948f 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,6 +23,9 @@ use crate::{ Search, StrBEU32Codec, StrStrU8Codec, BEU32, }; +pub const DEFAULT_MIN_WORD_LEN_1_TYPO: u8 = 5; +pub const DEFAULT_MIN_WORD_LEN_2_TYPOS: u8 = 9; + pub mod main_key { pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; @@ -47,6 +50,8 @@ pub mod main_key { pub const CREATED_AT_KEY: &str = "created-at"; pub const UPDATED_AT_KEY: &str = "updated-at"; pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; + pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; + pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; } pub mod db_name { @@ -886,6 +891,42 @@ impl Index { Ok(()) } + + pub fn min_word_len_1_typo(&self, txn: &RoTxn) -> heed::Result { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + Ok(self + .main + .get::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN)? 
+ .unwrap_or(DEFAULT_MIN_WORD_LEN_1_TYPO)) + } + + pub(crate) fn put_min_word_len_1_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + self.main.put::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN, &val)?; + Ok(()) + } + + pub fn min_word_len_2_typo(&self, txn: &RoTxn) -> heed::Result { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + Ok(self + .main + .get::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN)? + .unwrap_or(DEFAULT_MIN_WORD_LEN_2_TYPOS)) + } + + pub(crate) fn put_min_word_len_2_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 17924da8a..72b416b02 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -90,6 +90,8 @@ pub struct Settings<'a, 't, 'u, 'i> { synonyms: Setting>>, primary_key: Setting, authorize_typos: Setting, + min_2_typos_word_len: Setting, + min_1_typo_word_len: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -112,6 +114,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, indexer_config, + min_2_typos_word_len: Setting::Reset, + min_1_typo_word_len: Setting::Reset, } } @@ -196,6 +200,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.authorize_typos = Setting::Reset; } + pub fn set_min_2_typos_word_len(&mut self, val: u8) { + self.min_2_typos_word_len = Setting::Set(val); + } + + pub fn reset_min_2_typos_word_len(&mut self) { + self.min_2_typos_word_len = Setting::Reset; + } + + pub fn set_min_1_typo_word_len(&mut self, val: u8) { + self.min_1_typo_word_len = Setting::Set(val); + } + + pub fn reset_min_1_typos_word_len(&mut self) { + self.min_1_typo_word_len = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -474,6 +494,38 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_min_typo_word_len(&mut self) -> Result<()> { + match (&self.min_1_typo_word_len, &self.min_2_typos_word_len) { + (Setting::Set(one), Setting::Set(two)) => { + if one < two { + self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; + self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; + } else { + return Err(UserError::InvalidMinTypoWordSetting(*one, *two).into()); + } + } + (Setting::Set(one), _) => { + let two = self.index.min_word_len_2_typo(&self.wtxn)?; + if *one < two { + self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; + } else { + return Err(UserError::InvalidMinTypoWordSetting(*one, two).into()); + } + } + (_, Setting::Set(two)) => { + let one = self.index.min_word_len_1_typo(&self.wtxn)?; + if one < *two { + self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; + } else { + return 
Err(UserError::InvalidMinTypoWordSetting(one, *two).into()); + } + } + _ => (), + } + + Ok(()) + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -490,6 +542,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_criteria()?; self.update_primary_key()?; self.update_authorize_typos()?; + self.update_min_typo_word_len()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, From a1a3a49bc9493c91c38b86d11370d2c66d8d348f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 13:29:59 +0100 Subject: [PATCH 1284/1889] dynamic minimum word len for typos in query tree builder --- milli/src/search/query_tree.rs | 40 ++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 5437199e1..6db2ce7a7 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -155,6 +155,8 @@ trait Context { None => Ok(None), } } + /// Returns the minimum word len for 1 and 2 typos. + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; } /// The query tree builder is the interface to build a query tree. @@ -178,6 +180,12 @@ impl<'a> Context for QueryTreeBuilder<'a> { fn word_documents_count(&self, word: &str) -> heed::Result> { self.index.word_documents_count(self.rtxn, word) } + + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { + let one = self.index.min_word_len_1_typo(&self.rtxn)?; + let two = self.index.min_word_len_2_typo(&self.rtxn)?; + Ok((one, two)) + } } impl<'a> QueryTreeBuilder<'a> { @@ -256,14 +264,23 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result QueryKind { +fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { if authorize_typos { - match word.chars().count() { - 0..=4 => QueryKind::exact(word), - 5..=8 => QueryKind::tolerant(1.min(max_typos), word), - _ => QueryKind::tolerant(2.min(max_typos), word), + let count = word.chars().count().min(u8::MAX as usize) as u8; + if (0..config.word_len_1_typo).contains(&count) { + QueryKind::exact(word) + } else if (config.word_len_1_typo..config.word_len_2_typo).contains(&count) { + QueryKind::tolerant(1.min(config.max_typos), word) + } else { + QueryKind::tolerant(2.min(config.max_typos), word) } } else { QueryKind::exact(word) @@ -314,9 +331,11 @@ fn create_query_tree( if let Some(child) = split_best_frequency(ctx, &word)? 
{ children.push(child); } + let (word_len_1_typo, word_len_2_typo) = ctx.min_word_len_for_typo()?; + let config = TypoConfig { max_typos: 2, word_len_1_typo, word_len_2_typo }; children.push(Operation::Query(Query { prefix, - kind: typos(word, authorize_typos, 2), + kind: typos(word, authorize_typos, config), })); Ok(Operation::or(false, children)) } @@ -363,9 +382,12 @@ fn create_query_tree( .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); + let (word_len_1_typo, word_len_2_typo) = ctx.min_word_len_for_typo()?; + let config = + TypoConfig { max_typos: 1, word_len_1_typo, word_len_2_typo }; let query = Query { prefix: is_prefix, - kind: typos(concat, authorize_typos, 1), + kind: typos(concat, authorize_typos, config), }; operations.push(Operation::Query(query)); and_op_children.push(Operation::or(false, operations)); @@ -576,6 +598,10 @@ mod test { let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); Ok(self.synonyms.get(&words).cloned()) } + + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { + Ok((5, 9)) + } } impl Default for TestContext { From 9102de55003498020e1adf81ce07ec2cec7377ef Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 10:22:39 +0200 Subject: [PATCH 1285/1889] fix error message --- milli/src/error.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 3ef6aa81d..471952a36 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -292,7 +292,7 @@ ranking rules settings to use the sort parameter at search time.", Self::UnknownInternalDocumentId { document_id } => { write!(f, "An unknown internal document id have been used: `{}`.", document_id) } - Self::InvalidMinTypoWordSetting(one, two) => write!(f, "Invalid settings for MinWordLenForTypo, expected 0 < 1-typo < 2-typos < 255, but found 1-typo: {} and 2-typo: {}", one, two), + Self::InvalidMinTypoWordSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`."", one, two), } } } From 55af85db3c46b3e9abb896631389521b141f0c48 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 13:50:18 +0200 Subject: [PATCH 1286/1889] add tests for min_word_len_for_typo --- milli/src/error.rs | 2 +- milli/src/index.rs | 19 +++++++++++++ milli/src/search/query_tree.rs | 21 ++++++++++++++ milli/src/update/settings.rs | 51 ++++++++++++++++++++++++++++------ 4 files changed, 83 insertions(+), 10 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 471952a36..611160319 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -292,7 +292,7 @@ ranking rules settings to use the sort parameter at search time.", Self::UnknownInternalDocumentId { document_id } => { write!(f, "An unknown internal document id have been used: `{}`.", document_id) } - Self::InvalidMinTypoWordSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`."", one, two), + Self::InvalidMinTypoWordSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. 
`oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`."", one, two), + Self::InvalidMinTypoWordSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`.", one, two), } } } diff --git a/milli/src/index.rs b/milli/src/index.rs index 3c1ba948f..0095352e4 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -937,6 +937,7 @@ pub(crate) mod tests { use maplit::btreemap; use tempfile::TempDir; + use crate::index::{DEFAULT_MIN_WORD_LEN_1_TYPO, DEFAULT_MIN_WORD_LEN_2_TYPOS}; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig}; use crate::Index; @@ -1064,4 +1065,22 @@ pub(crate) mod tests { let txn = index.read_txn().unwrap(); assert!(!index.authorize_typos(&txn).unwrap()); } + + #[test] + fn set_min_word_len_for_typos() { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_1_TYPO); + assert_eq!(index.min_word_len_2_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_2_TYPOS); + + index.put_min_word_len_1_typo(&mut txn, 3).unwrap(); + index.put_min_word_len_2_typo(&mut txn, 15).unwrap(); + + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), 3); + assert_eq!(index.min_word_len_2_typo(&txn).unwrap(), 15); + } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 6db2ce7a7..acaba680f 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -264,6 +264,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result Settings<'a, 't, 'u, 'i> { fn update_min_typo_word_len(&mut self) -> Result<()> { match (&self.min_1_typo_word_len, &self.min_2_typos_word_len) { (Setting::Set(one), Setting::Set(two)) => { - if one < two { + if one > two { + return Err(UserError::InvalidMinTypoWordSetting(*one, *two).into()); + } else { self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; - } else { - return Err(UserError::InvalidMinTypoWordSetting(*one, *two).into()); } } (Setting::Set(one), _) => { let two = self.index.min_word_len_2_typo(&self.wtxn)?; - if *one < two { - self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; - } else { + if *one > two { return Err(UserError::InvalidMinTypoWordSetting(*one, two).into()); + } else { + self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; } } (_, Setting::Set(two)) => { let one = self.index.min_word_len_1_typo(&self.wtxn)?; - if one < *two { - self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; - } else { + if one > *two { return Err(UserError::InvalidMinTypoWordSetting(one, *two).into()); + } else { + self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; } } _ => (), @@ -1286,4 +1286,37 @@ mod tests { builder.execute(|_| ()).unwrap(); assert!(!index.authorize_typos(&txn).unwrap()); } + + #[test] + fn update_min_word_len_for_typo() { + let index = TempIndex::new(); + let config = IndexerConfig::default(); + + // Set the genres setting + let mut txn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_1_typo_word_len(8); + builder.set_min_2_typos_word_len(8); + builder.execute(|_| ()).unwrap(); + + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + + assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), 8); + assert_eq!(index.min_word_len_2_typo(&txn).unwrap(), 8); + } + + #[test] + fn update_invalid_min_word_len_for_typo() { + let index =
TempIndex::new(); + let config = IndexerConfig::default(); + + // Set the genres setting + let mut txn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_1_typo_word_len(10); + builder.set_min_2_typos_word_len(7); + assert!(builder.execute(|_| ()).is_err()); + } } From 286dd7b2e43fdc0ba93931bdf094c1d9573b7954 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 13:51:27 +0200 Subject: [PATCH 1287/1889] rename min_word_len_2_typo --- milli/src/index.rs | 10 +++++----- milli/src/search/query_tree.rs | 2 +- milli/src/update/settings.rs | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 0095352e4..98f0093b8 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -910,7 +910,7 @@ impl Index { Ok(()) } - pub fn min_word_len_2_typo(&self, txn: &RoTxn) -> heed::Result { + pub fn min_word_len_2_typos(&self, txn: &RoTxn) -> heed::Result { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. @@ -920,7 +920,7 @@ impl Index { .unwrap_or(DEFAULT_MIN_WORD_LEN_2_TYPOS)) } - pub(crate) fn put_min_word_len_2_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + pub(crate) fn put_min_word_len_2_typos(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. @@ -1072,15 +1072,15 @@ pub(crate) mod tests { let mut txn = index.write_txn().unwrap(); assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_1_TYPO); - assert_eq!(index.min_word_len_2_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_2_TYPOS); + assert_eq!(index.min_word_len_2_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_2_TYPOS); index.put_min_word_len_1_typo(&mut txn, 3).unwrap(); - index.put_min_word_len_2_typo(&mut txn, 15).unwrap(); + index.put_min_word_len_2_typos(&mut txn, 15).unwrap(); txn.commit().unwrap(); let txn = index.read_txn().unwrap(); assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), 3); - assert_eq!(index.min_word_len_2_typo(&txn).unwrap(), 15); + assert_eq!(index.min_word_len_2_typos(&txn).unwrap(), 15); } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index acaba680f..5d78eb674 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -183,7 +183,7 @@ impl<'a> Context for QueryTreeBuilder<'a> { fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { let one = self.index.min_word_len_1_typo(&self.rtxn)?; - let two = self.index.min_word_len_2_typo(&self.rtxn)?; + let two = self.index.min_word_len_2_typos(&self.rtxn)?; Ok((one, two)) } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5ccaba9ba..8fd9b9a9a 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -501,11 +501,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { return Err(UserError::InvalidMinTypoWordSetting(*one, *two).into()); } else { self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; - self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; + self.index.put_min_word_len_2_typos(&mut self.wtxn, *two)?; } } (Setting::Set(one), _) => { - let two = self.index.min_word_len_2_typo(&self.wtxn)?; + let two = 
self.index.min_word_len_2_typos(&self.wtxn)?; if *one > two { return Err(UserError::InvalidMinTypoWordSetting(*one, two).into()); } else { @@ -517,7 +517,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { if one > *two { return Err(UserError::InvalidMinTypoWordSetting(one, *two).into()); } else { - self.index.put_min_word_len_2_typo(&mut self.wtxn, *two)?; + self.index.put_min_word_len_2_typos(&mut self.wtxn, *two)?; } } _ => (), @@ -1304,7 +1304,7 @@ mod tests { let txn = index.read_txn().unwrap(); assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), 8); - assert_eq!(index.min_word_len_2_typo(&txn).unwrap(), 8); + assert_eq!(index.min_word_len_2_typos(&txn).unwrap(), 8); } #[test] From 4c4b336ecb992c10606f9899535c5c39708bd347 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 14:15:02 +0200 Subject: [PATCH 1288/1889] rename min word len for typo error --- milli/src/error.rs | 4 ++-- milli/src/update/settings.rs | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 611160319..688977741 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -72,7 +72,7 @@ pub enum UserError { SerdeJson(serde_json::Error), SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, - InvalidMinTypoWordSetting(u8, u8), + InvalidMinTypoWordLenSetting(u8, u8), } impl From for Error { @@ -292,7 +292,7 @@ ranking rules settings to use the sort parameter at search time.", Self::UnknownInternalDocumentId { document_id } => { write!(f, "An unknown internal document id have been used: `{}`.", document_id) } - Self::InvalidMinTypoWordSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`.", one, two), + Self::InvalidMinTypoWordLenSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. 
`oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`.", one, two), + Self::InvalidMinTypoWordLenSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {}` and twoTypos: {}`.", one, two), } } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 8fd9b9a9a..26ed5730a 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -498,7 +498,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match (&self.min_1_typo_word_len, &self.min_2_typos_word_len) { (Setting::Set(one), Setting::Set(two)) => { if one > two { - return Err(UserError::InvalidMinTypoWordSetting(*one, *two).into()); + return Err(UserError::InvalidMinTypoWordLenSetting(*one, *two).into()); } else { self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; self.index.put_min_word_len_2_typos(&mut self.wtxn, *two)?; @@ -507,7 +507,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { (Setting::Set(one), _) => { let two = self.index.min_word_len_2_typos(&self.wtxn)?; if *one > two { - return Err(UserError::InvalidMinTypoWordSetting(*one, two).into()); + return Err(UserError::InvalidMinTypoWordLenSetting(*one, two).into()); } else { self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; } @@ -515,7 +515,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { (_, Setting::Set(two)) => { let one = self.index.min_word_len_1_typo(&self.wtxn)?; if one > *two { - return Err(UserError::InvalidMinTypoWordSetting(one, *two).into()); + return Err(UserError::InvalidMinTypoWordLenSetting(one, *two).into()); } else { self.index.put_min_word_len_2_typos(&mut self.wtxn, *two)?; } From 66020cd9239c98658ba7824a632d8c1df944edd5 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 18:23:12 +0200 Subject: [PATCH 1289/1889] rename min_word_len* to use plain letter numbers --- milli/src/index.rs | 30 ++++++++++---------- milli/src/search/query_tree.rs | 23 ++++++++-------- milli/src/update/settings.rs | 50 +++++++++++++++++----------------- 3 files changed, 52 insertions(+), 51 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 98f0093b8..853e7537d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,8 +23,8 @@ use crate::{ Search, StrBEU32Codec, StrStrU8Codec, BEU32, }; -pub const DEFAULT_MIN_WORD_LEN_1_TYPO: u8 = 5; -pub const DEFAULT_MIN_WORD_LEN_2_TYPOS: u8 = 9; +pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; +pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9; pub mod main_key { pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; @@ -892,17 +892,17 @@ impl Index { Ok(()) } - pub fn min_word_len_1_typo(&self, txn: &RoTxn) -> heed::Result { + pub fn min_word_len_one_typo(&self, txn: &RoTxn) -> heed::Result { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. Ok(self .main .get::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN)? - .unwrap_or(DEFAULT_MIN_WORD_LEN_1_TYPO)) + .unwrap_or(DEFAULT_MIN_WORD_LEN_ONE_TYPO)) } - pub(crate) fn put_min_word_len_1_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + pub(crate) fn put_min_word_len_one_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos.
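
(Aside, for readers following the series: the two thresholds stored above are the entire typo policy. A word shorter than the one-typo length must match exactly, a word at least that long may tolerate one typo, and a word at least as long as the two-typos length may tolerate two. Below is a minimal standalone sketch of that rule using the default constants 5 and 9 defined above; it is illustrative only and not part of any patch.

    // Sketch of the threshold rule implemented by `typos` in query_tree.rs.
    fn allowed_typos(word: &str, one_typo: u8, two_typos: u8) -> u8 {
        // The real code clamps the char count into a u8 the same way.
        let len = word.chars().count().min(u8::MAX as usize) as u8;
        if len < one_typo {
            0 // "kitn" (4 chars) must match exactly under the default of 5
        } else if len < two_typos {
            1 // "kitten" (6 chars) may match with one typo
        } else {
            2 // "mischievous" (11 chars) may match with two, given the default of 9
        }
    }

The query-side `max_typos` cap carried by `TypoConfig` is then applied on top of this result, which is why the n-gram concatenation path above builds its config with `max_typos: 1` and can never yield two-typo derivations.)
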
@@ -910,17 +910,17 @@ impl Index { Ok(()) } - pub fn min_word_len_2_typos(&self, txn: &RoTxn) -> heed::Result { + pub fn min_word_len_two_typos(&self, txn: &RoTxn) -> heed::Result { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. Ok(self .main .get::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN)? - .unwrap_or(DEFAULT_MIN_WORD_LEN_2_TYPOS)) + .unwrap_or(DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } - pub(crate) fn put_min_word_len_2_typos(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + pub(crate) fn put_min_word_len_two_typos(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We // identify 0 as being false, and anything else as true. The absence of a value is true, // because by default, we authorize typos. @@ -937,7 +937,7 @@ pub(crate) mod tests { use maplit::btreemap; use tempfile::TempDir; - use crate::index::{DEFAULT_MIN_WORD_LEN_1_TYPO, DEFAULT_MIN_WORD_LEN_2_TYPOS}; + use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig}; use crate::Index; @@ -1071,16 +1071,16 @@ pub(crate) mod tests { let index = TempIndex::new(); let mut txn = index.write_txn().unwrap(); - assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_1_TYPO); - assert_eq!(index.min_word_len_2_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_2_TYPOS); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS); - index.put_min_word_len_1_typo(&mut txn, 3).unwrap(); - index.put_min_word_len_2_typos(&mut txn, 15).unwrap(); + index.put_min_word_len_one_typo(&mut txn, 3).unwrap(); + index.put_min_word_len_two_typos(&mut txn, 15).unwrap(); txn.commit().unwrap(); let txn = index.read_txn().unwrap(); - assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), 3); - assert_eq!(index.min_word_len_2_typos(&txn).unwrap(), 15); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 3); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 15); } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 5d78eb674..c1803f40b 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -182,8 +182,8 @@ impl<'a> Context for QueryTreeBuilder<'a> { } fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { - let one = self.index.min_word_len_1_typo(&self.rtxn)?; - let two = self.index.min_word_len_2_typos(&self.rtxn)?; + let one = self.index.min_word_len_one_typo(&self.rtxn)?; + let two = self.index.min_word_len_two_typos(&self.rtxn)?; Ok((one, two)) } } @@ -267,8 +267,8 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result QueryKind { if authorize_typos { let count = word.chars().count().min(u8::MAX as usize) as u8; - if (0..config.word_len_1_typo).contains(&count) { + if (0..config.word_len_one_typo).contains(&count) { QueryKind::exact(word) - } else if (config.word_len_1_typo..config.word_len_2_typo).contains(&count) { + } else if (config.word_len_one_typo..config.word_len_two_typo).contains(&count) { QueryKind::tolerant(1.min(config.max_typos), word) } else { QueryKind::tolerant(2.min(config.max_typos), word) @@ -332,8 +332,8 @@ fn create_query_tree( if let Some(child) = 
split_best_frequency(ctx, &word)? { children.push(child); } - let (word_len_1_typo, word_len_2_typo) = ctx.min_word_len_for_typo()?; - let config = TypoConfig { max_typos: 2, word_len_1_typo, word_len_2_typo }; + let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; + let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo }; children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos, config), @@ -383,9 +383,10 @@ fn create_query_tree( .collect(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let concat = words.concat(); - let (word_len_1_typo, word_len_2_typo) = ctx.min_word_len_for_typo()?; + let (word_len_one_typo, word_len_two_typo) = + ctx.min_word_len_for_typo()?; let config = - TypoConfig { max_typos: 1, word_len_1_typo, word_len_2_typo }; + TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo }; let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos, config), @@ -1223,7 +1224,7 @@ mod test { #[test] fn test_min_word_len_typo() { - let config = TypoConfig { max_typos: 2, word_len_1_typo: 5, word_len_2_typo: 7 }; + let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 }; assert_eq!( typos("hello".to_string(), true, config.clone()), diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 26ed5730a..94ae29595 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -90,8 +90,8 @@ pub struct Settings<'a, 't, 'u, 'i> { synonyms: Setting>>, primary_key: Setting, authorize_typos: Setting, - min_2_typos_word_len: Setting, - min_1_typo_word_len: Setting, + min_word_len_two_typos: Setting, + min_word_len_one_typo: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -114,8 +114,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, indexer_config, - min_2_typos_word_len: Setting::Reset, - min_1_typo_word_len: Setting::Reset, + min_word_len_two_typos: Setting::Reset, + min_word_len_one_typo: Setting::Reset, } } @@ -200,20 +200,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.authorize_typos = Setting::Reset; } - pub fn set_min_2_typos_word_len(&mut self, val: u8) { - self.min_2_typos_word_len = Setting::Set(val); + pub fn set_min_word_len_two_typos(&mut self, val: u8) { + self.min_word_len_two_typos = Setting::Set(val); } - pub fn reset_min_2_typos_word_len(&mut self) { - self.min_2_typos_word_len = Setting::Reset; + pub fn reset_min_word_len_two_typos(&mut self) { + self.min_word_len_two_typos = Setting::Reset; } - pub fn set_min_1_typo_word_len(&mut self, val: u8) { - self.min_1_typo_word_len = Setting::Set(val); + pub fn set_min_word_len_one_typo(&mut self, val: u8) { + self.min_word_len_one_typo = Setting::Set(val); } - pub fn reset_min_1_typos_word_len(&mut self) { - self.min_1_typo_word_len = Setting::Reset; + pub fn reset_min_word_len_one_typo(&mut self) { + self.min_word_len_one_typo = Setting::Reset; } fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> @@ -495,29 +495,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } fn update_min_typo_word_len(&mut self) -> Result<()> { - match (&self.min_1_typo_word_len, &self.min_2_typos_word_len) { + match (&self.min_word_len_one_typo, &self.min_word_len_two_typos) { (Setting::Set(one), Setting::Set(two)) => { if one > two { return Err(UserError::InvalidMinTypoWordLenSetting(*one, *two).into()); } else { - self.index.put_min_word_len_1_typo(&mut 
self.wtxn, *one)?; - self.index.put_min_word_len_2_typos(&mut self.wtxn, *two)?; + self.index.put_min_word_len_one_typo(&mut self.wtxn, *one)?; + self.index.put_min_word_len_two_typos(&mut self.wtxn, *two)?; } } (Setting::Set(one), _) => { - let two = self.index.min_word_len_2_typos(&self.wtxn)?; + let two = self.index.min_word_len_two_typos(&self.wtxn)?; if *one > two { return Err(UserError::InvalidMinTypoWordLenSetting(*one, two).into()); } else { - self.index.put_min_word_len_1_typo(&mut self.wtxn, *one)?; + self.index.put_min_word_len_one_typo(&mut self.wtxn, *one)?; } } (_, Setting::Set(two)) => { - let one = self.index.min_word_len_1_typo(&self.wtxn)?; + let one = self.index.min_word_len_one_typo(&self.wtxn)?; if one > *two { return Err(UserError::InvalidMinTypoWordLenSetting(one, *two).into()); } else { - self.index.put_min_word_len_2_typos(&mut self.wtxn, *two)?; + self.index.put_min_word_len_two_typos(&mut self.wtxn, *two)?; } } _ => (), @@ -1295,16 +1295,16 @@ mod tests { // Set the genres setting let mut txn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut txn, &index, &config); - builder.set_min_1_typo_word_len(8); - builder.set_min_2_typos_word_len(8); + builder.set_min_word_len_one_typo(8); + builder.set_min_word_len_two_typos(8); builder.execute(|_| ()).unwrap(); txn.commit().unwrap(); let txn = index.read_txn().unwrap(); - assert_eq!(index.min_word_len_1_typo(&txn).unwrap(), 8); - assert_eq!(index.min_word_len_2_typos(&txn).unwrap(), 8); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8); } #[test] @@ -1315,8 +1315,8 @@ mod tests { // Set the genres setting let mut txn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut txn, &index, &config); - builder.set_min_1_typo_word_len(10); - builder.set_min_2_typos_word_len(7); + builder.set_min_word_len_one_typo(10); + builder.set_min_word_len_two_typos(7); assert!(builder.execute(|_| ()).is_err()); } } From 950a740bd46c5e06ff2b4f6d2fefb24fe44b3cd1 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 18:37:43 +0200 Subject: [PATCH 1290/1889] refactor typos for readability --- milli/src/search/query_tree.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index c1803f40b..b1c846324 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -276,9 +276,9 @@ pub struct TypoConfig { fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { if authorize_typos { let count = word.chars().count().min(u8::MAX as usize) as u8; - if (0..config.word_len_one_typo).contains(&count) { + if count < config.word_len_one_typo { QueryKind::exact(word) - } else if (config.word_len_one_typo..config.word_len_two_typo).contains(&count) { + } else if count < config.word_len_two_typo { QueryKind::tolerant(1.min(config.max_typos), word) } else { QueryKind::tolerant(2.min(config.max_typos), word) From fdaf45aab2e898b2f730ec81c2d469f055fe0452 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 18:42:10 +0200 Subject: [PATCH 1291/1889] replace hardcoded value with constant in TestContext --- milli/src/search/query_tree.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index b1c846324..1bb4c9516 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -564,6 +564,8 @@ mod test { use rand::rngs::StdRng; 
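
(For context, this is roughly how the renamed settings are exercised end to end, condensed from the tests added in this series. `TempIndex` is the test helper used above; the snippet is a sketch, not an additional patch.

    let index = TempIndex::new();
    let config = IndexerConfig::default();

    let mut txn = index.write_txn().unwrap();
    let mut builder = Settings::new(&mut txn, &index, &config);
    // Allow one typo on words of 4+ characters and two typos on words of 10+.
    builder.set_min_word_len_one_typo(4);
    builder.set_min_word_len_two_typos(10);
    builder.execute(|_| ()).unwrap();
    txn.commit().unwrap();

    let txn = index.read_txn().unwrap();
    assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 4);
    assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 10);

Passing a one-typo length greater than the two-typos length makes `execute` fail with `InvalidMinTypoWordLenSetting` instead of writing anything, as the `update_invalid_min_word_len_for_typo` test above verifies.)
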
use rand::{Rng, SeedableRng}; + use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; + use super::*; #[derive(Debug)] @@ -602,7 +604,7 @@ mod test { } fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { - Ok((5, 9)) + Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } } From 1941072bb29ba7763519d85ef59797e634d0f82e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 31 Mar 2022 18:44:51 +0200 Subject: [PATCH 1292/1889] implement Copy on Setting --- milli/src/update/settings.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 94ae29595..c03d6e0ae 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -14,7 +14,7 @@ use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, Result}; -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Copy)] pub enum Setting { Set(T), Reset, @@ -495,29 +495,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } fn update_min_typo_word_len(&mut self) -> Result<()> { - match (&self.min_word_len_one_typo, &self.min_word_len_two_typos) { + match (self.min_word_len_one_typo, self.min_word_len_two_typos) { (Setting::Set(one), Setting::Set(two)) => { if one > two { - return Err(UserError::InvalidMinTypoWordLenSetting(*one, *two).into()); + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); } else { - self.index.put_min_word_len_one_typo(&mut self.wtxn, *one)?; - self.index.put_min_word_len_two_typos(&mut self.wtxn, *two)?; + self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?; + self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?; } } (Setting::Set(one), _) => { let two = self.index.min_word_len_two_typos(&self.wtxn)?; - if *one > two { - return Err(UserError::InvalidMinTypoWordLenSetting(*one, two).into()); + if one > two { + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); } else { - self.index.put_min_word_len_one_typo(&mut self.wtxn, *one)?; + self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?; } } (_, Setting::Set(two)) => { let one = self.index.min_word_len_one_typo(&self.wtxn)?; - if one > *two { - return Err(UserError::InvalidMinTypoWordLenSetting(one, *two).into()); + if one > two { + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); } else { - self.index.put_min_word_len_two_typos(&mut self.wtxn, *two)?; + self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?; } } _ => (), From 2cb71dff4afdc7b4f16f9692066dbb3bc4096896 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 1 Apr 2022 10:50:01 +0200 Subject: [PATCH 1293/1889] add typo integration tests --- milli/tests/search/mod.rs | 1 + milli/tests/search/typo_tolerance.rs | 97 ++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 milli/tests/search/typo_tolerance.rs diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 31d53b666..52b4c7114 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -16,6 +16,7 @@ mod distinct; mod filters; mod query_criteria; mod sort; +mod typo_tolerance; pub const TEST_QUERY: &'static str = "hello world america"; diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs new file mode 100644 index 000000000..8898fb353 --- /dev/null +++ b/milli/tests/search/typo_tolerance.rs @@ -0,0 +1,97 @@ 
+use milli::{ + update::{IndexerConfig, Settings}, + Criterion, Search, +}; +use Criterion::*; + +#[test] +fn test_typo_tolerance_one_typo() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zeal"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + + let mut search = Search::new(&txn, &index); + search.query("zean"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_word_len_one_typo(4); + builder.execute(|_| ()).unwrap(); + + // typo is now supported for 4 letters words + let mut search = Search::new(&txn, &index); + search.query("zean"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +} + +#[test] +fn test_typo_tolerance_two_typo() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + + let mut search = Search::new(&txn, &index); + search.query("zealemd"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_word_len_two_typos(7); + builder.execute(|_| ()).unwrap(); + + // typo is now supported for 4 letters words + let mut search = Search::new(&txn, &index); + search.query("zealemd"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +} From 853b4a520fb2a6fd10909b085b16460b23c6e249 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 1 Apr 2022 11:21:51 +0200 Subject: [PATCH 1294/1889] fmt --- milli/src/search/query_tree.rs | 3 +-- milli/tests/search/typo_tolerance.rs | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 1bb4c9516..934d2fd9b 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -564,9 +564,8 @@ mod test { use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; - use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; - use super::*; + use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; #[derive(Debug)] struct TestContext { diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 8898fb353..00e6853cc 100644 --- a/milli/tests/search/typo_tolerance.rs +++ 
b/milli/tests/search/typo_tolerance.rs @@ -1,7 +1,5 @@ -use milli::{ - update::{IndexerConfig, Settings}, - Criterion, Search, -}; +use milli::update::{IndexerConfig, Settings}; +use milli::{Criterion, Search}; use Criterion::*; #[test] From 9bbffb8fee9ab73fb59eab731f1e739c85e536dd Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 14:03:31 +0100 Subject: [PATCH 1295/1889] add exact words setting --- milli/src/index.rs | 22 ++++++++++++++++++++++ milli/src/update/settings.rs | 27 +++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index 853e7537d..c0be985da 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -52,6 +52,7 @@ pub mod main_key { pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; + pub const EXACT_WORDS: &str = "exact-words"; } pub mod db_name { @@ -927,6 +928,27 @@ impl Index { self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; Ok(()) } + + /// List the words on which typo are not allowed + pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? { + Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), + None => Ok(fst::Set::default().map_data(Cow::Owned)?), + } + } + + pub(crate) fn put_exact_words>( + &self, + txn: &mut RwTxn, + words: &fst::Set, + ) -> Result<()> { + self.main.put::<_, Str, ByteSlice>( + txn, + main_key::EXACT_WORDS, + words.as_fst().as_bytes(), + )?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c03d6e0ae..513dee42c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -92,6 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> { authorize_typos: Setting, min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, + exact_words: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -113,6 +114,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { synonyms: Setting::NotSet, primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, + exact_words: Setting::NotSet, indexer_config, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, @@ -216,6 +218,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.min_word_len_one_typo = Setting::Reset; } + pub fn set_exact_words(&mut self, words: Vec) { + self.exact_words = Setting::Set(words); + } + + pub fn reset_exact_words(&mut self) { + self.exact_words = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -526,6 +536,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_exact_words(&mut self) -> Result<()> { + match self.exact_words { + Setting::Set(ref mut words) => { + words.sort_unstable(); + let words = fst::Set::from_iter(words)?; + self.index.put_exact_words(&mut self.wtxn, &words)?; + } + Setting::Reset => { + self.index.put_exact_words(&mut self.wtxn, &fst::Set::default())?; + } + Setting::NotSet => (), + } + + Ok(()) + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -543,6 +569,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_primary_key()?; self.update_authorize_typos()?; self.update_min_typo_word_len()?; + self.update_exact_words()?; // If there is new faceted fields we indicate that we 
must reindex as we must // index new fields as facets. It means that the distinct attribute, From 774fa8f06578d7dd0d660efe2f084429f4fb31c6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 16:25:15 +0100 Subject: [PATCH 1296/1889] disable typos on exact words --- milli/src/search/query_tree.rs | 35 +++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 934d2fd9b..a31a71590 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,4 +1,4 @@ -use std::{cmp, fmt, mem}; +use std::{borrow::Cow, cmp, fmt, mem}; use fst::Set; use meilisearch_tokenizer::token::SeparatorKind; @@ -157,6 +157,7 @@ trait Context { } /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; + fn exact_words(&self) -> crate::Result>>; } /// The query tree builder is the interface to build a query tree. @@ -186,6 +187,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { let two = self.index.min_word_len_two_typos(&self.rtxn)?; Ok((one, two)) } + + fn exact_words(&self) -> crate::Result>> { + self.index.exact_words(self.rtxn) + } } impl<'a> QueryTreeBuilder<'a> { @@ -265,15 +270,16 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result { pub max_typos: u8, pub word_len_one_typo: u8, pub word_len_two_typo: u8, + pub exact_words: fst::Set>, } /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. -fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { +fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { if authorize_typos { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { @@ -333,7 +339,9 @@ fn create_query_tree( children.push(child); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo }; + let exact_words = ctx.exact_words()?; + let config = + TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos, config), @@ -385,8 +393,13 @@ fn create_query_tree( let concat = words.concat(); let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let config = - TypoConfig { max_typos: 1, word_len_one_typo, word_len_two_typo }; + let exact_words = ctx.exact_words()?; + let config = TypoConfig { + max_typos: 1, + word_len_one_typo, + word_len_two_typo, + exact_words, + }; let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos, config), @@ -605,6 +618,12 @@ mod test { fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } + + fn exact_words(&self) -> crate::Result>> { + let builder = fst::SetBuilder::new(Vec::new()).unwrap(); + let data = builder.into_inner().unwrap(); + Ok(fst::Set::new(Cow::Owned(data)).unwrap()) + } } impl Default for TestContext { @@ -1225,7 +1244,9 @@ mod test { #[test] fn test_min_word_len_typo() { - let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7 }; + let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap(); + let config = + TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words }; assert_eq!( typos("hello".to_string(), true, 
config.clone()), From 8b1e5d9c6d654be95159519e8c233e8868694e1b Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 22 Mar 2022 09:55:49 +0100 Subject: [PATCH 1297/1889] add test for exact words --- milli/src/search/query_tree.rs | 25 ++++++++++++++++++++++--- milli/src/update/settings.rs | 9 ++++----- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index a31a71590..0014075d4 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -584,6 +584,8 @@ mod test { struct TestContext { synonyms: HashMap, Vec>>, postings: HashMap, + // Raw bytes for the exact word fst Set + exact_words: Vec, } impl TestContext { @@ -620,9 +622,7 @@ mod test { } fn exact_words(&self) -> crate::Result>> { - let builder = fst::SetBuilder::new(Vec::new()).unwrap(); - let data = builder.into_inner().unwrap(); - Ok(fst::Set::new(Cow::Owned(data)).unwrap()) + Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) } } @@ -640,6 +640,8 @@ mod test { RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() } + let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap(); + TestContext { synonyms: hashmap! { vec![String::from("hello")] => vec![ @@ -679,6 +681,7 @@ mod test { String::from("good") => random_postings(rng, 1250), String::from("morning") => random_postings(rng, 125), }, + exact_words, } } } @@ -1263,4 +1266,20 @@ mod test { QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } ); } + #[test] + fn disable_typo_on_word() { + let query = "goodbye"; + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + let result = analyzer.analyze(query); + + let tokens = result.tokens(); + let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); + let context = TestContext { exact_words, ..Default::default() }; + let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); + + assert!(matches!( + query_tree, + Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. 
} }) + )); + } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 513dee42c..503fbd06e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -92,7 +92,7 @@ pub struct Settings<'a, 't, 'u, 'i> { authorize_typos: Setting, min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, - exact_words: Setting>, + exact_words: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -115,9 +115,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, exact_words: Setting::NotSet, - indexer_config, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, + indexer_config, } } @@ -218,7 +218,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.min_word_len_one_typo = Setting::Reset; } - pub fn set_exact_words(&mut self, words: Vec) { + pub fn set_exact_words(&mut self, words: BTreeSet) { self.exact_words = Setting::Set(words); } @@ -539,8 +539,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_exact_words(&mut self) -> Result<()> { match self.exact_words { Setting::Set(ref mut words) => { - words.sort_unstable(); - let words = fst::Set::from_iter(words)?; + let words = fst::Set::from_iter(words.iter())?; self.index.put_exact_words(&mut self.wtxn, &words)?; } Setting::Reset => { From 559e46be5e23d5fca45856a5201dfa223bfa3d29 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 11:02:43 +0200 Subject: [PATCH 1298/1889] fix bad rebase bug --- milli/src/search/query_tree.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 0014075d4..585f4fbf3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -280,7 +280,7 @@ pub struct TypoConfig<'a> { /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { - if authorize_typos { + if authorize_typos && !config.exact_words.contains(&word) { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { QueryKind::exact(word) @@ -1278,7 +1278,7 @@ mod test { let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); assert!(matches!( - query_tree, + dbg!(query_tree), Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } }) )); } From 0fd55db21c1fbe3471e8210b943dc426f9741b5e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 11:52:35 +0200 Subject: [PATCH 1299/1889] fmt --- milli/src/search/query_tree.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 585f4fbf3..4eccae8ce 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, cmp, fmt, mem}; +use std::borrow::Cow; +use std::{cmp, fmt, mem}; use fst::Set; use meilisearch_tokenizer::token::SeparatorKind; @@ -1278,7 +1279,7 @@ mod test { let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); assert!(matches!( - dbg!(query_tree), + query_tree, Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. 
} }) )); } } From 30a2711bacfdfbe1fee4a9b52a840c1c7b890c8e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 13:51:50 +0200 Subject: [PATCH 1300/1889] rename serde module to serde_impl module needed because of issues with rustfmt --- milli/src/documents/builder.rs | 2 +- milli/src/documents/mod.rs | 4 ++-- milli/src/documents/{serde.rs => serde_impl.rs} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename milli/src/documents/{serde.rs => serde_impl.rs} (100%) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 2860c4b86..2be7c1dd8 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -6,7 +6,7 @@ use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; use serde_json::Value; -use super::serde::DocumentVisitor; +use super::serde_impl::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; use crate::FieldId; diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 14d97ee7d..8fd018328 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -5,15 +5,15 @@ mod builder; /// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can /// later be read by milli using the `DocumentBatchReader` interface. mod reader; -mod serde; +mod serde_impl; use std::fmt::{self, Debug}; use std::io; -use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; pub use builder::DocumentBatchBuilder; pub use reader::DocumentBatchReader; +use serde::{Deserialize, Serialize}; use crate::FieldId; diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde_impl.rs similarity index 100% rename from milli/src/documents/serde.rs rename to milli/src/documents/serde_impl.rs From 284d8a24e0caf2376a68aeb1fd63691e4b2270c9 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 13:59:29 +0200 Subject: [PATCH 1301/1889] add integration test for disabled typo on word --- milli/tests/search/typo_tolerance.rs | 81 +++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 00e6853cc..7d19e4ab0 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -1,5 +1,10 @@ -use milli::update::{IndexerConfig, Settings}; -use milli::{Criterion, Search}; +use std::collections::BTreeSet; + +use heed::EnvOpenOptions; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use milli::{Criterion, Index, Search}; +use serde_json::json; +use tempfile::tempdir; use Criterion::*; #[test] @@ -93,3 +98,75 @@ fn test_typo_tolerance_two_typo() { let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); } + +#[test] +fn test_typo_disabled_on_word() { + let tmp = tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp.path()).unwrap(); + + let documents = json!([ + { + "id": 1usize, + "data": "zealand", + }, + { + "id": 2usize, + "data": "zearand", + }, + ]); + + let mut writer = std::io::Cursor::new(Vec::new()); + let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); + let documents = serde_json::to_vec(&documents).unwrap(); + builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); + builder.finish().unwrap(); + + writer.set_position(0); + + let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
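+    // The reader decodes the batch the builder above serialized into `writer`;
+    // the cursor was rewound with `set_position(0)` so the same bytes can be read back for indexing.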
+ + let mut txn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()); + + builder.add_documents(documents).unwrap(); + + builder.execute().unwrap(); + txn.commit().unwrap(); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + let mut exact_words = BTreeSet::new(); + // sealand doesn't allow typos anymore + exact_words.insert("zealand".to_string()); + builder.set_exact_words(exact_words); + builder.execute(|_| ()).unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +} From 3e67d8818cc84127881883f0dca17e95e365b511 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 20:34:23 +0200 Subject: [PATCH 1302/1889] fix typo in test comment --- milli/tests/search/typo_tolerance.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 7d19e4ab0..df15fb768 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -156,7 +156,7 @@ fn test_typo_disabled_on_word() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); let mut exact_words = BTreeSet::new(); - // sealand doesn't allow typos anymore + // `zealand` doesn't allow typos anymore exact_words.insert("zealand".to_string()); builder.set_exact_words(exact_words); builder.execute(|_| ()).unwrap(); From 7e9d56a9e75391724f2c24a6f892a17dd7c30c5b Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 21 Mar 2022 16:25:15 +0100 Subject: [PATCH 1303/1889] disable typos on exact words --- milli/src/search/query_tree.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4eccae8ce..7d13f27a3 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -623,7 +623,9 @@ mod test { } fn exact_words(&self) -> crate::Result>> { - Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) + let builder = fst::SetBuilder::new(Vec::new()).unwrap(); + let data = builder.into_inner().unwrap(); + Ok(fst::Set::new(Cow::Owned(data)).unwrap()) } } From c882d8daf0dd174c8bb8c51734493e6814780d24 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 22 Mar 2022 09:55:49 +0100 Subject: [PATCH 1304/1889] add test for exact words --- milli/src/search/query_tree.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 7d13f27a3..ff9d3f4e9 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -623,9 +623,7 @@ mod test { } fn exact_words(&self) -> crate::Result>> { - let builder = fst::SetBuilder::new(Vec::new()).unwrap(); - let data = builder.into_inner().unwrap(); - 
Ok(fst::Set::new(Cow::Owned(data)).unwrap()) + Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) } } @@ -1269,6 +1267,7 @@ mod test { QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } ); } + #[test] fn disable_typo_on_word() { let query = "goodbye"; From f82d4b36eb37212df5b3b9f42120fcef50419108 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 22 Mar 2022 19:07:59 +0100 Subject: [PATCH 1305/1889] introduce exact attribute setting --- milli/src/index.rs | 18 ++++++++++++++++++ milli/src/update/settings.rs | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index c0be985da..f4e17d93c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -53,6 +53,7 @@ pub mod main_key { pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; pub const EXACT_WORDS: &str = "exact-words"; + pub const EXACT_ATTRIBUTES: &str = "exact-attributes"; } pub mod db_name { @@ -949,6 +950,23 @@ impl Index { )?; Ok(()) } + + pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result> { + Ok(self + .main + .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? + .unwrap_or_default()) + } + + pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; + Ok(()) + } + + pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> { + self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?; + Ok(()) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 503fbd06e..3ed2a4152 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -93,6 +93,8 @@ pub struct Settings<'a, 't, 'u, 'i> { min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, exact_words: Setting>, + /// attributes on which typo tolerance is not enabled. 
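+    /// For illustration only (the `sku` field name is hypothetical, not part of this patch): typos
+    /// could be disabled on such a field with
+    /// `settings.set_exact_attributes(["sku".to_string()].into_iter().collect())`.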
+ exact_attributes: Setting>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -117,6 +119,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { exact_words: Setting::NotSet, min_word_len_two_typos: Setting::Reset, min_word_len_one_typo: Setting::Reset, + exact_attributes: Setting::Reset, indexer_config, } } @@ -226,6 +229,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.exact_words = Setting::Reset; } + pub fn set_exact_attributes(&mut self, attrs: HashSet) { + self.exact_attributes = Setting::Set(attrs); + } + + pub fn reset_exact_attributes(&mut self) { + self.exact_attributes = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -411,6 +422,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + fn update_exact_attributes(&mut self) -> Result { + match self.exact_attributes { + Setting::Set(ref attrs) => { + let attrs = attrs.iter().map(String::as_str).collect::>(); + self.index.put_exact_attributes(&mut self.wtxn, &attrs)?; + Ok(true) + } + Setting::Reset => { + self.index.delete_exact_attributes(&mut self.wtxn)?; + Ok(true) + } + Setting::NotSet => Ok(false), + } + } + fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { @@ -579,8 +605,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let stop_words_updated = self.update_stop_words()?; let synonyms_updated = self.update_synonyms()?; let searchable_updated = self.update_searchable()?; + let exact_attributes_updated = self.update_exact_attributes()?; - if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated { + if stop_words_updated + || faceted_updated + || synonyms_updated + || searchable_updated + || exact_attributes_updated + { self.reindex(&progress_callback, old_fields_ids_map)?; } From 5f9f82757dbebec7087cd56b2e624e372c3bbb4f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 23 Mar 2022 14:48:15 +0100 Subject: [PATCH 1306/1889] refactor spawn_extraction_task --- .../src/update/index_documents/extract/mod.rs | 26 +++---- .../index_documents/helpers/grenad_helpers.rs | 69 ++++++++++++++----- .../src/update/index_documents/helpers/mod.rs | 4 +- 3 files changed, 69 insertions(+), 30 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4c81b9334..100431237 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -26,7 +26,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, - merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, + merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -66,7 +66,7 @@ pub(crate) fn data_from_obkv_documents( (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), ) = result?; - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -76,7 +76,7 @@ pub(crate) fn data_from_obkv_documents( "word-pair-proximity-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), 
lmdb_writer_sx.clone(), @@ -86,7 +86,7 @@ pub(crate) fn data_from_obkv_documents( "field-id-wordcount-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -96,7 +96,7 @@ pub(crate) fn data_from_obkv_documents( "word-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -106,7 +106,7 @@ pub(crate) fn data_from_obkv_documents( "word-position-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -116,7 +116,7 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-string-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_numbers_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents( /// Generated grenad chunks are merged using the merge_fn. /// The result of merged chunks is serialized as TypedChunk using the serialize_fn /// and sent into lmdb_writer_sx. -fn spawn_extraction_task( +fn spawn_extraction_task( chunks: Vec>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, @@ -142,19 +142,21 @@ fn spawn_extraction_task( serialize_fn: FS, name: &'static str, ) where - FE: Fn(grenad::Reader, GrenadParameters) -> Result> + FE: Fn(grenad::Reader, GrenadParameters) -> Result + Sync + Send + 'static, - FS: Fn(grenad::Reader) -> TypedChunk + Sync + Send + 'static, + FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static, + M: MergeableReader + FromParallelIterator + Send + 'static, + M::Output: Send, { rayon::spawn(move || { - let chunks: Result> = + let chunks: Result = chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); rayon::spawn(move || match chunks { Ok(chunks) => { debug!("merge {} database", name); - let reader = merge_readers(chunks, merge_fn, indexer); + let reader = chunks.merge(merge_fn, &indexer); let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))); } Err(e) => { diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index e0ac3a175..fc28860b2 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -78,25 +78,62 @@ pub unsafe fn as_cloneable_grenad( Ok(reader) } -pub fn merge_readers( - readers: Vec>, - merge_fn: MergeFn, - indexer: GrenadParameters, -) -> Result> { - let mut merger_builder = grenad::MergerBuilder::new(merge_fn); - for reader in readers { - merger_builder.push(reader.into_cursor()?); +pub trait MergeableReader +where + Self: Sized, +{ + type Output; + + fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result; +} + +impl MergeableReader for Vec> { + type Output = grenad::Reader; + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut merger = MergerBuilder::new(merge_fn); + self.into_iter().try_for_each(|r| merger.push(r))?; + merger.finish(params) + } +} + +impl MergeableReader for Vec<(grenad::Reader, grenad::Reader)> { + type Output = (grenad::Reader, grenad::Reader); + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut m1 = MergerBuilder::new(merge_fn); + let mut m2 = MergerBuilder::new(merge_fn); + for (r1, r2) in self.into_iter() { + m1.push(r1)?; + m2.push(r2)?; + } + 
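+        // Finishing each builder merges its pushed readers and writes the result into its own
+        // temporary file, so the two merged readers come back as a pair.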
Ok((m1.finish(params)?, m2.finish(params)?)) + } +} + +struct MergerBuilder(grenad::MergerBuilder); + +impl MergerBuilder { + fn new(merge_fn: MergeFn) -> Self { + Self(grenad::MergerBuilder::new(merge_fn)) } - let merger = merger_builder.build(); - let mut writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); - merger.write_into_stream_writer(&mut writer)?; + fn push(&mut self, reader: grenad::Reader) -> Result<()> { + self.0.push(reader.into_cursor()?); + Ok(()) + } - Ok(writer_into_reader(writer)?) + fn finish(self, params: &GrenadParameters) -> Result> { + let merger = self.0.build(); + let mut writer = create_writer( + params.chunk_compression_type, + params.chunk_compression_level, + tempfile::tempfile()?, + ); + merger.write_into_stream_writer(&mut writer)?; + + Ok(writer_into_reader(writer)?) + } } #[derive(Debug, Clone, Copy)] diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 22c1cfd6c..f4940af1d 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, - GrenadParameters, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, From 0a77be4ec02f29df26242a6ffa7a94ddcb3b0724 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 15:22:57 +0100 Subject: [PATCH 1307/1889] introduce exact_word_docids db --- milli/src/index.rs | 9 ++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 71 +++++++++++++------ .../extract/extract_word_docids.rs | 12 +++- .../src/update/index_documents/extract/mod.rs | 7 +- .../index_documents/helpers/grenad_helpers.rs | 5 ++ .../src/update/index_documents/helpers/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 18 +++-- .../src/update/index_documents/typed_chunk.rs | 49 +++++++++---- milli/src/update/word_prefix_docids.rs | 5 +- 10 files changed, 133 insertions(+), 47 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f4e17d93c..8f9c9beb7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -59,6 +59,7 @@ pub mod main_key { pub mod db_name { pub const MAIN: &str = "main"; pub const WORD_DOCIDS: &str = "word-docids"; + pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; @@ -83,6 +84,10 @@ pub struct Index { /// A word and all the documents ids containing the word. pub word_docids: Database, + + /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. + pub exact_word_docids: Database, + /// A prefix of word and all the documents ids containing this prefix. 
pub word_prefix_docids: Database, @@ -119,12 +124,13 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(14); + options.max_dbs(15); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; let main = env.create_poly_database(Some(MAIN))?; let word_docids = env.create_database(Some(WORD_DOCIDS))?; + let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; @@ -146,6 +152,7 @@ impl Index { env, main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 644547b91..57c0969c7 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, @@ -55,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // Clear the other databases. word_docids.clear(self.wtxn)?; + exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 402cc61dd..46a4721c0 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; use std::collections::HashMap; use fst::IntoStreamer; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str}; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -16,7 +16,10 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; -use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; +use crate::{ + DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32, + BEU32, +}; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -108,6 +111,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, @@ -204,25 +208,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We iterate over the words and delete the documents ids // from the word docids database. for (word, must_remove) in &mut words { - // We create an iterator to be able to get the content and delete the word docids. - // It's faster to acquire a cursor to get and delete or put, as we avoid traversing - // the LMDB B-Tree two times but only once. - let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; - if let Some((key, mut docids)) = iter.next().transpose()? { - if key == word.as_str() { - let previous_len = docids.len(); - docids -= &self.documents_ids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? 
}; - *must_remove = true; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - } + remove_from_word_docids( + self.wtxn, + word_docids, + word.as_str(), + must_remove, + &self.documents_ids, + )?; + + remove_from_word_docids( + self.wtxn, + exact_word_docids, + word.as_str(), + must_remove, + &self.documents_ids, + )?; } // We construct an FST set that contains the words to delete from the words FST. @@ -457,6 +457,35 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } +fn remove_from_word_docids( + txn: &mut heed::RwTxn, + db: &heed::Database, + word: &str, + must_remove: &mut bool, + to_remove: &RoaringBitmap, +) -> Result<()> { + // We create an iterator to be able to get the content and delete the word docids. + // It's faster to acquire a cursor to get and delete or put, as we avoid traversing + // the LMDB B-Tree two times but only once. + let mut iter = db.prefix_iter_mut(txn, &word)?; + if let Some((key, mut docids)) = iter.next().transpose()? { + if key == word { + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + *must_remove = true; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } + } + } + Ok(()) +} + fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 80d68298a..03bfada21 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -10,17 +10,21 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::update::index_documents::MergeFn; use crate::Result; /// Extracts the word and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. 
+/// +/// The first returned reader is the one for normal word_docids, and the second one is for +/// exact_word_docids #[logging_timer::time] pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, -) -> Result> { +) -> Result<(grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut word_docids_sorter = create_sorter( @@ -43,5 +47,9 @@ pub fn extract_word_docids( word_docids_sorter.insert(word_bytes, &value_buffer)?; } - sorter_into_reader(word_docids_sorter, indexer) + let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn); Ok(( sorter_into_reader(word_docids_sorter, indexer)?, sorter_into_reader(empty_sorter, indexer)?, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 100431237..4e7f211ce 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents( "field-id-wordcount-docids", ); - spawn_extraction_task::<_, _, Vec>>( + spawn_extraction_task::<_, _, Vec<(grenad::Reader, grenad::Reader)>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), extract_word_docids, merge_roaring_bitmaps, - TypedChunk::WordDocids, + |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + }, "word-docids", ); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index fc28860b2..fb5242910 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database( debug!("MTBL sorter written in {:.02?}!", before.elapsed()); Ok(()) } + +/// Used when trying to merge readers, but you don't actually care about the values.
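+/// The merged value is simply an empty buffer, so only the keys survive the merge.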
+pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { + Ok(Cow::Owned(Vec::new())) +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index f4940af1d..4642bcf14 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, GrenadParameters, MergeableReader, }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2d3004444..633b72cc9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,7 +20,7 @@ pub use self::helpers::{ fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; -use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; +use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -282,6 +282,7 @@ where let mut word_pair_proximity_docids = None; let mut word_position_docids = None; let mut word_docids = None; + let mut _exact_word_docids = None; let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -291,10 +292,13 @@ where for result in lmdb_writer_rx { let typed_chunk = match result? { - TypedChunk::WordDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; word_docids = Some(cloneable_chunk); - TypedChunk::WordDocids(chunk) + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + _exact_word_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } } TypedChunk::WordPairProximityDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; @@ -425,6 +429,10 @@ where }); if let Some(word_docids) = word_docids { + let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); + word_docids_builder.push(word_docids.into_cursor()?); + // TODO: push exact_word_docids + let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. 
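// It rebuilds the prefix docids lists from the merged stream of newly indexed word docids.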
let mut builder = WordPrefixDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; @@ -432,7 +440,7 @@ where builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; builder.execute( - word_docids, + word_docids_iter, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 77ea31138..be440114f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -3,14 +3,16 @@ use std::convert::TryInto; use std::fs::File; use std::io; +use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, + self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, }; +use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ @@ -25,7 +27,10 @@ pub(crate) enum TypedChunk { Documents(grenad::Reader), FieldIdWordcountDocids(grenad::Reader), NewDocumentsIds(RoaringBitmap), - WordDocids(grenad::Reader), + WordDocids { + word_docids_reader: grenad::Reader, + exact_word_docids_reader: grenad::Reader, + }, WordPositionDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), @@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index( TypedChunk::NewDocumentsIds(documents_ids) => { return Ok((documents_ids, is_merged_database)) } - TypedChunk::WordDocids(word_docids_iter) => { - let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?; + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; append_entries_into_database( word_docids_iter.clone(), &index.word_docids, @@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index( merge_roaring_bitmaps, )?; + let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; + append_entries_into_database( + exact_word_docids_iter.clone(), + &index.exact_word_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_roaring_bitmaps, + )?; + // create fst from word docids - let mut builder = fst::SetBuilder::memory(); - let mut cursor = word_docids_iter.into_cursor()?; - while let Some((word, _value)) = cursor.move_on_next()? { - // This is a lexicographically ordered word position - // we use the key to construct the words fst. 
- builder.insert(word)?; - } - let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?; + let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?; let db_fst = index.words_fst(wtxn)?; // merge new fst with database fst @@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index( Ok((RoaringBitmap::new(), is_merged_database)) } +fn merge_word_docids_reader_into_fst( + word_docids_iter: grenad::Reader>, + exact_word_docids_iter: grenad::Reader>, +) -> Result>> { + let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn); + merger_builder.push(word_docids_iter.into_cursor()?); + merger_builder.push(exact_word_docids_iter.into_cursor()?); + let mut iter = merger_builder.build().into_stream_merger_iter()?; + let mut builder = fst::SetBuilder::memory(); + + while let Some((k, _)) = iter.next()? { + builder.insert(k)?; + } + + Ok(builder.into_set()) +} + fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { let new_value = RoaringBitmap::deserialize_from(new_value)?; let db_value = RoaringBitmap::deserialize_from(db_value)?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 076816f09..4114f8baf 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute( self, - new_word_docids: grenad::Reader, + mut new_word_docids_iter: grenad::MergerIter, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -51,10 +51,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { ); if !common_prefix_fst_words.is_empty() { - let mut new_word_docids_iter = new_word_docids.into_cursor()?; let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { + while let Some((word, data)) = new_word_docids_iter.next()? 
{ current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { From 5451c64d5d84ecbc154dc7708ad1c72c62336f6e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 15:55:29 +0100 Subject: [PATCH 1308/1889] increase criteria asc desc test map size --- milli/tests/search/query_criteria.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index ef080db9f..786fdbcae 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -373,7 +373,7 @@ fn criteria_mixup() { fn criteria_ascdesc() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB + options.map_size(12 * 1024 * 1024); // 12 MB let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); From 8d46a5b0b5d86b85a4c865a72522b915d540ccce Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 17:00:29 +0100 Subject: [PATCH 1309/1889] extract exact word docids --- milli/src/index.rs | 5 +++ milli/src/lib.rs | 4 ++ .../extract/extract_word_docids.rs | 43 ++++++++++++++++--- .../src/update/index_documents/extract/mod.rs | 3 +- milli/src/update/index_documents/mod.rs | 2 + 5 files changed, 50 insertions(+), 7 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 8f9c9beb7..3d6d954f0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -964,6 +964,11 @@ impl Index { .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? .unwrap_or_default()) } + pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { + let attrs = self.exact_attributes(txn)?; + let fid_map = self.fields_ids_map(txn)?; + Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect()) + } pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ba2bd9b0f..b68c76048 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -74,6 +74,10 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi (field_id as u32) << 16 | (relative as u32) } +pub fn field_id_from_position(position: u32) -> FieldId { + (position >> 16 & 0xffff) as u16 +} + /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 03bfada21..5f231e5aa 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -1,3 +1,4 @@ +use std::collections::HashSet; use std::fs::File; use std::io; use std::iter::FromIterator; @@ -10,8 +11,8 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::update::index_documents::MergeFn; -use crate::Result; +use crate::update::index_documents::helpers::read_u32_ne_bytes; +use crate::{field_id_from_position, FieldId, Result}; /// Extracts the word and the documents ids where this word appear.
/// @@ -24,6 +25,7 @@ pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, + exact_attributes: &HashSet, ) -> Result<(grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); @@ -35,21 +37,50 @@ pub fn extract_word_docids( max_memory, ); + let mut exact_word_docids_sorter = create_sorter( + merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + let mut value_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; - while let Some((key, _value)) = cursor.move_on_next()? { + while let Some((key, positions)) = cursor.move_on_next()? { let (document_id_bytes, word_bytes) = try_split_array_at(key) .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let bitmap = RoaringBitmap::from_iter(Some(document_id)); serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; - word_docids_sorter.insert(word_bytes, &value_buffer)?; + + // If there are no exact attributes, we do not need to iterate over positions. + if exact_attributes.is_empty() { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + } else { + let mut added_to_exact = false; + let mut added_to_word_docids = false; + for position in read_u32_ne_bytes(positions) { + // As soon as we know that this word has been added to both sorters, we don't need to + // iterate over the positions. + if added_to_exact && added_to_word_docids { + break; + } + let fid = field_id_from_position(position); + if exact_attributes.contains(&fid) && !added_to_exact { + exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; + added_to_exact = true; + } else if !added_to_word_docids { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + added_to_word_docids = true; + } + } + } } - let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn); Ok(( sorter_into_reader(word_docids_sorter, indexer)?, - sorter_into_reader(empty_sorter, indexer)?, + sorter_into_reader(exact_word_docids_sorter, indexer)?, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4e7f211ce..8f6797a3b 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -43,6 +43,7 @@ pub(crate) fn data_from_obkv_documents( geo_field_id: Option, stop_words: Option>, max_positions_per_attributes: Option, + exact_attributes: HashSet, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -90,7 +91,7 @@ pub(crate) fn data_from_obkv_documents( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), - extract_word_docids, + move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), merge_roaring_bitmaps, |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader, }, "word-docids", ); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 633b72cc9..c490e93da 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -226,6 +226,7 @@ where }; let stop_words = self.index.stop_words(self.wtxn)?; + let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; // Run extraction pipeline in parallel.
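// Each extraction task below is spawned on the rayon pool and streams its grenad chunks back through `lmdb_writer_sx`.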
pool.install(|| { @@ -255,6 +256,7 @@ where geo_field_id, stop_words, self.indexer_config.max_positions_per_attributes, + exact_attributes, ) }); From c4c6e3535290c88016e6a74f0f015563432e7fc9 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 19:25:11 +0100 Subject: [PATCH 1310/1889] query exact_word_docids in resolve_query_tree --- milli/src/search/criteria/mod.rs | 20 ++++++++++++++++++-- milli/src/update/index_documents/mod.rs | 11 ++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1dbfd2524..df9189239 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -68,6 +68,7 @@ impl Default for Candidates { pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; + fn exact_word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids( &self, @@ -118,6 +119,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.word_docids.get(self.rtxn, &word) } + fn exact_word_docids(&self, word: &str) -> heed::Result> { + self.index.exact_word_docids.get(self.rtxn, &word) + } + fn word_prefix_docids(&self, word: &str) -> heed::Result> { self.index.word_prefix_docids.get(self.rtxn, &word) } @@ -400,11 +405,14 @@ fn query_docids( let mut docids = RoaringBitmap::new(); for (word, _typo) in words { let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - docids |= current_docids; + let exact_current_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); + docids |= current_docids | exact_current_docids; } Ok(docids) } else { - Ok(ctx.word_docids(&word)?.unwrap_or_default()) + let word_docids = ctx.word_docids(&word)?.unwrap_or_default(); + let exact_word_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); + Ok(word_docids | exact_word_docids) } } QueryKind::Tolerant { typo, word } => { @@ -512,6 +520,7 @@ pub mod test { pub struct TestContext<'t> { words_fst: fst::Set>, word_docids: HashMap, + exact_word_docids: HashMap, word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, @@ -527,6 +536,10 @@ pub mod test { Ok(self.word_docids.get(&word.to_string()).cloned()) } + fn exact_word_docids(&self, word: &str) -> heed::Result> { + Ok(self.exact_word_docids.get(&word.to_string()).cloned()) + } + fn word_prefix_docids(&self, word: &str) -> heed::Result> { Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) } @@ -643,6 +656,8 @@ pub mod test { s("morning") => random_postings(rng, 125), }; + let exact_word_docids = HashMap::new(); + let mut docid_words = HashMap::new(); for (word, docids) in word_docids.iter() { for docid in docids { @@ -712,6 +727,7 @@ pub mod test { TestContext { words_fst, word_docids, + exact_word_docids, word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c490e93da..54d30f8fb 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -284,7 +284,7 @@ where let mut word_pair_proximity_docids = None; let mut word_position_docids = None; let mut word_docids = None; - let mut _exact_word_docids = None; + let mut exact_word_docids = None; let mut databases_seen = 0; 
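// Report progress before the first merge, then once more each time one of the databases below is fully written.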
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -299,7 +299,7 @@ where word_docids = Some(cloneable_chunk); let cloneable_chunk = unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; - _exact_word_docids = Some(cloneable_chunk); + exact_word_docids = Some(cloneable_chunk); TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } } TypedChunk::WordPairProximityDocids(chunk) => { @@ -352,6 +352,7 @@ where self.execute_prefix_databases( word_docids, + exact_word_docids, word_pair_proximity_docids, word_position_docids, )?; @@ -363,6 +364,7 @@ where pub fn execute_prefix_databases( self, word_docids: Option>, + exact_word_docids: Option>, word_pair_proximity_docids: Option>, word_position_docids: Option>, ) -> Result<()> @@ -433,7 +435,10 @@ where if let Some(word_docids) = word_docids { let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); word_docids_builder.push(word_docids.into_cursor()?); - // TODO: push exact_word_docids + if let Some(exact_word_docids) = exact_word_docids { + word_docids_builder.push(exact_word_docids.into_cursor()?); + } + let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. let mut builder = WordPrefixDocids::new(self.wtxn, self.index); From ba0bb29cd8a1b748b325c2854ce9ea6daaf127a1 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 10:20:39 +0100 Subject: [PATCH 1311/1889] refactor WordPrefixDocids to take dbs instead of indexes --- milli/src/update/index_documents/mod.rs | 6 +++++- milli/src/update/word_prefix_docids.rs | 20 ++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 54d30f8fb..91d108c72 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -441,7 +441,11 @@ where let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. 
- let mut builder = WordPrefixDocids::new(self.wtxn, self.index); + let mut builder = WordPrefixDocids::new( + self.wtxn, + self.index.word_docids.clone(), + self.index.word_prefix_docids.clone(), + ); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; builder.chunk_compression_level = self.indexer_config.chunk_compression_level; builder.max_nb_chunks = self.indexer_config.max_nb_chunks; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 4114f8baf..b166812a5 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -1,16 +1,18 @@ use std::collections::{HashMap, HashSet}; use grenad::CompressionType; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str}; +use heed::Database; use crate::update::index_documents::{ create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn, }; -use crate::{Index, Result}; +use crate::{Result, RoaringBitmapCodec}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, + word_docids: Database, + word_prefix_docids: Database, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, @@ -20,11 +22,13 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, + word_docids: Database, + word_prefixes_docids: Database, ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, - index, + word_docids, + word_prefix_docids: word_prefixes_docids, chunk_compression_type: CompressionType::None, chunk_compression_level: None, max_nb_chunks: None, @@ -83,7 +87,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } // We fetch the docids associated to the newly added word prefix fst only. - let db = self.index.word_docids.remap_data_type::(); + let db = self.word_docids.remap_data_type::(); for prefix in new_prefix_fst_words { let prefix = std::str::from_utf8(prefix.as_bytes())?; for result in db.prefix_iter(self.wtxn, prefix)? { @@ -93,7 +97,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { } // We remove all the entries that are no more required in this word prefix docids database. - let mut iter = self.index.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + let mut iter = self.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); while let Some((prefix, _)) = iter.next().transpose()? { if del_prefix_fst_words.contains(prefix.as_bytes()) { unsafe { iter.del_current()? }; @@ -105,7 +109,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // We finally write the word prefix docids into the LMDB database. 
sorter_into_lmdb_database( self.wtxn, - *self.index.word_prefix_docids.as_polymorph(), + *self.word_prefix_docids.as_polymorph(), prefix_docids_sorter, merge_roaring_bitmaps, )?; From 6dd2e4ffbd97bac64d0d3a7a5c39a51b0a5639a5 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 10:49:34 +0100 Subject: [PATCH 1312/1889] introduce exact_word_prefix database in index --- milli/src/index.rs | 8 +++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 67 ++++++++++++++++++---------- 3 files changed, 53 insertions(+), 24 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 3d6d954f0..80f62f684 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -61,6 +61,7 @@ pub mod db_name { pub const WORD_DOCIDS: &str = "word-docids"; pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; + pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; @@ -91,6 +92,9 @@ pub struct Index { /// A prefix of word and all the documents ids containing this prefix. pub word_prefix_docids: Database, + /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. + pub exact_word_prefix_docids: Database, + /// Maps a word and a document id (u32) to all the positions where the given word appears. pub docid_word_positions: Database, @@ -124,7 +128,7 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(15); + options.max_dbs(16); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -132,6 +136,7 @@ impl Index { let word_docids = env.create_database(Some(WORD_DOCIDS))?; let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; + let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = @@ -154,6 +159,7 @@ impl Index { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 57c0969c7..3665d2313 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -21,6 +21,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, @@ -58,6 +59,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids.clear(self.wtxn)?; exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; + exact_word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 46a4721c0..58c4d4f70 100644 --- 
a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{BytesDecode, BytesEncode}; +use heed::{BytesDecode, BytesEncode, Database}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -113,6 +113,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, field_id_word_count_docids, @@ -254,34 +255,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We write the new words FST into the main database. self.index.put_words_fst(self.wtxn, &new_words_fst)?; - // We iterate over the word prefix docids database and remove the deleted documents ids - // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. - let mut prefixes_to_delete = fst::SetBuilder::memory(); - let mut iter = word_prefix_docids.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (prefix, mut docids) = result?; - let prefix = prefix.to_owned(); - let previous_len = docids.len(); - docids -= &self.documents_ids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - prefixes_to_delete.insert(prefix)?; - } else if docids.len() != previous_len { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&prefix, &docids)? }; - } - } + let prefixes_to_delete = + remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?; - drop(iter); + let exact_prefix_to_delete = remove_from_word_prefix_docids( + self.wtxn, + exact_word_prefix_docids, + &self.documents_ids, + )?; + + let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); // We compute the new prefix FST and write it only if there is a change. - let prefixes_to_delete = prefixes_to_delete.into_set(); - if !prefixes_to_delete.is_empty() { + if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() { let new_words_prefixes_fst = { // We retrieve the current words prefixes FST from the database. let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; - let difference = words_prefixes_fst.op().add(&prefixes_to_delete).difference(); + let difference = + words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference(); // We stream the new external ids that does no more contains the to-delete external ids. let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); @@ -457,6 +448,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } +fn remove_from_word_prefix_docids( + txn: &mut heed::RwTxn, + db: &Database, + to_remove: &RoaringBitmap, +) -> Result>> { + let mut prefixes_to_delete = fst::SetBuilder::memory(); + + // We iterate over the word prefix docids database and remove the deleted documents ids + // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. + let mut iter = db.iter_mut(txn)?; + while let Some(result) = iter.next() { + let (prefix, mut docids) = result?; + let prefix = prefix.to_owned(); + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? 
}; + prefixes_to_delete.insert(prefix)?; + } else if docids.len() != previous_len { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&prefix, &docids)? }; + } + } + + drop(iter); + + Ok(prefixes_to_delete.into_set()) +} + fn remove_from_word_docids( txn: &mut heed::RwTxn, db: &heed::Database, From e8f06f6c0606b130b2e398246bb55ceeb51602b3 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:17:55 +0100 Subject: [PATCH 1313/1889] extract exact_word_prefix_docids --- milli/src/update/index_documents/mod.rs | 66 ++++++++++++++++++------- milli/src/update/word_prefix_docids.rs | 8 +-- 2 files changed, 51 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 91d108c72..0e6e59e10 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -4,11 +4,13 @@ mod transform; mod typed_chunk; use std::collections::HashSet; -use std::io::{Read, Seek}; +use std::io::{Cursor, Read, Seek}; use std::iter::FromIterator; use std::num::{NonZeroU32, NonZeroUsize}; use crossbeam_channel::{Receiver, Sender}; +use heed::types::Str; +use heed::Database; use log::debug; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -20,7 +22,7 @@ pub use self::helpers::{ fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; -use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters}; +use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -28,7 +30,7 @@ use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; -use crate::{Index, Result}; +use crate::{Index, Result, RoaringBitmapCodec}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -433,25 +435,25 @@ where }); if let Some(word_docids) = word_docids { - let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); - word_docids_builder.push(word_docids.into_cursor()?); - if let Some(exact_word_docids) = exact_word_docids { - word_docids_builder.push(exact_word_docids.into_cursor()?); - } - - let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; - // Run the word prefix docids update operation. 
- let mut builder = WordPrefixDocids::new( + execute_word_prefix_docids( self.wtxn, + word_docids, self.index.word_docids.clone(), self.index.word_prefix_docids.clone(), - ); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - builder.execute( - word_docids_iter, + &self.indexer_config, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } + + if let Some(exact_word_docids) = exact_word_docids { + execute_word_prefix_docids( + self.wtxn, + exact_word_docids, + self.index.exact_word_docids.clone(), + self.index.exact_word_prefix_docids.clone(), + &self.indexer_config, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, @@ -516,6 +518,32 @@ where } } +/// Run the word prefix docids update operation. +fn execute_word_prefix_docids( + txn: &mut heed::RwTxn, + reader: grenad::Reader>, + word_docids_db: Database, + word_prefix_docids_db: Database, + indexer_config: &IndexerConfig, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + let cursor = reader.into_cursor()?; + let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db); + builder.chunk_compression_type = indexer_config.chunk_compression_type; + builder.chunk_compression_level = indexer_config.chunk_compression_level; + builder.max_nb_chunks = indexer_config.max_nb_chunks; + builder.max_memory = indexer_config.max_memory; + builder.execute( + cursor, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + Ok(()) +} + #[cfg(test)] mod tests { use std::io::Cursor; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index b166812a5..2887b5583 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -23,12 +23,12 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, word_docids: Database, - word_prefixes_docids: Database, + word_prefix_docids: Database, ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, word_docids, - word_prefix_docids: word_prefixes_docids, + word_prefix_docids, chunk_compression_type: CompressionType::None, chunk_compression_level: None, max_nb_chunks: None, @@ -39,7 +39,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute( self, - mut new_word_docids_iter: grenad::MergerIter, + mut new_word_docids_iter: grenad::ReaderCursor, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -57,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { if !common_prefix_fst_words.is_empty() { let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.next()? { + while let Some((word, data)) = new_word_docids_iter.move_on_next()? 
{ current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { From 21ae4143b177389dde584411107f6559a5fbe4aa Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:27:48 +0100 Subject: [PATCH 1314/1889] add exact_word_prefix to Context --- milli/src/search/criteria/mod.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index df9189239..3daa258bf 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -70,6 +70,7 @@ pub trait Context<'c> { fn word_docids(&self, word: &str) -> heed::Result>; fn exact_word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids( &self, left: &str, @@ -127,6 +128,10 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { self.index.word_prefix_docids.get(self.rtxn, &word) } + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { + self.index.exact_word_prefix_docids.get(self.rtxn, &word) + } + fn word_pair_proximity_docids( &self, left: &str, @@ -522,6 +527,7 @@ pub mod test { word_docids: HashMap, exact_word_docids: HashMap, word_prefix_docids: HashMap, + exact_word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, docid_words: HashMap>, @@ -544,6 +550,10 @@ pub mod test { Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) } + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { + Ok(self.exact_word_prefix_docids.get(&word.to_string()).cloned()) + } + fn word_pair_proximity_docids( &self, left: &str, @@ -672,6 +682,8 @@ pub mod test { s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], }; + let exact_word_prefix_docids = HashMap::new(); + let mut word_pair_proximity_docids = HashMap::new(); let mut word_prefix_pair_proximity_docids = HashMap::new(); for (lword, lcandidates) in &word_docids { @@ -729,6 +741,7 @@ pub mod test { word_docids, exact_word_docids, word_prefix_docids, + exact_word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, docid_words, From 56b4f5dce2a32505e6e25b973880b7d682e4d4be Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 25 Mar 2022 16:30:18 +0100 Subject: [PATCH 1315/1889] add exact prefix to query_docids --- milli/src/search/criteria/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3daa258bf..6ac076ea4 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -404,7 +404,9 @@ fn query_docids( match &query.kind { QueryKind::Exact { word, .. 
} => { if query.prefix && ctx.in_prefix_cache(&word) { - Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default()) + let doc_ids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); + let exact_docids = ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); + Ok(doc_ids | exact_docids) } else if query.prefix { let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); From 6b2c2509b2e5bfcd5f522a3129f2c8c42bed2c07 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 30 Mar 2022 16:07:59 +0200 Subject: [PATCH 1316/1889] fix bug in exact search --- milli/src/index.rs | 1 + milli/src/search/criteria/mod.rs | 35 ++++++++++++------- .../extract/extract_word_docids.rs | 2 ++ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 80f62f684..c7441c590 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -970,6 +970,7 @@ impl Index { .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? .unwrap_or_default()) } + pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { let attrs = self.exact_attributes(txn)?; let fid_map = self.fields_ids_map(txn)?; diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 6ac076ea4..05305d724 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -402,31 +402,42 @@ fn query_docids( wdcache: &mut WordDerivationsCache, ) -> Result { match &query.kind { - QueryKind::Exact { word, .. } => { + QueryKind::Exact { word, original_typo } => { if query.prefix && ctx.in_prefix_cache(&word) { - let doc_ids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); - let exact_docids = ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); - Ok(doc_ids | exact_docids) + let mut docids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); + } + Ok(docids) } else if query.prefix { let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { - let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); - let exact_current_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); - docids |= current_docids | exact_current_docids; + docids |= ctx.word_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); + } } Ok(docids) } else { - let word_docids = ctx.word_docids(&word)?.unwrap_or_default(); - let exact_word_docids = ctx.exact_word_docids(&word)?.unwrap_or_default(); - Ok(word_docids | exact_word_docids) + let mut docids = ctx.word_docids(&word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); + } + Ok(docids) } } QueryKind::Tolerant { typo, word } => { let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + for (word, typo) in words { + let mut current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + if *typo == 0 { + current_docids |= ctx.exact_word_docids(&word)?.unwrap_or_default() + } docids |= current_docids; } Ok(docids) 
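The substance of this fix is easy to lose in the hunks above: docids coming from the exact databases may only be unioned in when the matched word was not derived, i.e. when its typo count is zero. Below is a minimal sketch of that gating rule, detached from milli's `Context` trait; the bitmap arguments stand in for the `word_docids`/`exact_word_docids` lookups and are assumptions of this illustration, not milli's API.

    use roaring::RoaringBitmap;

    /// Docids from the typo-tolerant database always count; docids from the
    /// exact database are added only when the word matched with zero typos.
    fn candidates(typos: u8, word: &RoaringBitmap, exact_word: &RoaringBitmap) -> RoaringBitmap {
        let mut docids = word.clone();
        if typos == 0 {
            docids |= exact_word;
        }
        docids
    }

    fn main() {
        let word: RoaringBitmap = (0..4).collect();
        let exact_word: RoaringBitmap = (10..12).collect();
        // an exact match unions both databases...
        assert_eq!(candidates(0, &word, &exact_word).len(), 6);
        // ...while a derived match must not leak exact-only documents
        assert_eq!(candidates(1, &word, &exact_word).len(), 4);
    }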
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5f231e5aa..fbc9f6919 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -69,9 +69,11 @@ pub fn extract_word_docids( } let fid = field_id_from_position(position); if exact_attributes.contains(&fid) && !added_to_exact { + println!("is exact: {}", std::str::from_utf8(&word_bytes).unwrap()); exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; } else if !added_to_word_docids { + println!("isnt exact: {}", std::str::from_utf8(&word_bytes).unwrap()); word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_word_docids = true; } From bfd81ce050c6f0723f7322300958a0834529bcf6 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 30 Mar 2022 16:08:20 +0200 Subject: [PATCH 1317/1889] add exact atttributes to cli settings --- cli/src/main.rs | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 503b02887..6523cef2e 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashSet}; use std::fs::File; use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; @@ -99,8 +99,10 @@ impl Settings { }) .collect(); + let exact_attributes = index.exact_attributes(&txn)?; + println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\n", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"), @@ -109,6 +111,7 @@ impl Settings { stop_words.join("\n\t"), distinct_field.unwrap_or_default(), synonyms.into_iter().map(|(k, v)| format!("\n\t{}:\n{:?}", k, v)).collect::(), + exact_attributes.join("\n\t"), ); Ok(()) } @@ -463,6 +466,8 @@ struct SettingsUpdate { filterable_attributes: Option>, #[structopt(long)] criteria: Option>, + #[structopt(long)] + exact_attributes: Option>, } impl Performer for SettingsUpdate { @@ -489,6 +494,14 @@ impl Performer for SettingsUpdate { } } + if let Some(exact_attributes) = self.exact_attributes { + if !exact_attributes.is_empty() { + update.set_exact_attributes(exact_attributes.into_iter().collect()); + } else { + update.reset_exact_attributes(); + } + } + let mut bars = Vec::new(); let progesses = MultiProgress::new(); for _ in 0..4 { From c8d3a09af83d7f1cdfab65d45ac6173dfa1b31d3 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 14:47:07 +0200 Subject: [PATCH 1318/1889] add integration test for disabel typo on attributes --- milli/tests/search/typo_tolerance.rs | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index df15fb768..92d57c9b9 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -170,3 +170,40 @@ fn test_typo_disabled_on_word() { let result = search.execute().unwrap(); 
assert_eq!(result.documents_ids.len(), 1); } + +#[test] +fn test_disable_typo_on_attribute() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("antebelum"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); + builder.execute(|_| ()).unwrap(); + + // typo is now supported for 4 letters words + let mut search = Search::new(&txn, &index); + search.query("antebelum"); + search.limit(10); + search.authorize_typos(true); + search.optional_words(true); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); +} From 9963f11172a06fa79ed06c3baf8cb4ae727c743b Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 20:38:45 +0200 Subject: [PATCH 1319/1889] fix infos crate compilation issue --- cli/src/main.rs | 2 +- infos/src/main.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 6523cef2e..cf1e85984 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashSet}; +use std::collections::BTreeMap; use std::fs::File; use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; diff --git a/infos/src/main.rs b/infos/src/main.rs index dc98d410d..6a270833b 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -387,6 +387,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, documents, + .. } = index; let main_name = "main"; @@ -968,6 +969,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, + .. } = index; let names = if names.is_empty() { From 6cabd47c32bcf2ba53a3ebe94f254a7fe63de520 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 20:59:20 +0200 Subject: [PATCH 1320/1889] fix typo in comment --- milli/src/update/index_documents/extract/extract_word_docids.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index fbc9f6919..b577ef567 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -19,7 +19,7 @@ use crate::{field_id_from_position, FieldId, Result}; /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. 
/// -/// The first returned reader in the one for normal word_docids, and the second one is for +/// The first returned reader is the one for normal word_docids, and the second one is for /// exact_word_docids #[logging_timer::time] pub fn extract_word_docids( From b7694c34f53da8f3253236aff1a5b4a24503bf3c Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:00:07 +0200 Subject: [PATCH 1321/1889] remove println --- milli/src/update/index_documents/extract/extract_word_docids.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index b577ef567..5083bbd90 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -69,11 +69,9 @@ pub fn extract_word_docids( } let fid = field_id_from_position(position); if exact_attributes.contains(&fid) && !added_to_exact { - println!("is exact: {}", std::str::from_utf8(&word_bytes).unwrap()); exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; } else if !added_to_word_docids { - println!("isnt exact: {}", std::str::from_utf8(&word_bytes).unwrap()); word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_word_docids = true; } From 1810927dbd5f23b85c7e6d9c01d4e68907e84a3f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:04:49 +0200 Subject: [PATCH 1322/1889] rephrase exact_attributes doc --- milli/src/update/settings.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 3ed2a4152..7a26361d4 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -93,7 +93,7 @@ pub struct Settings<'a, 't, 'u, 'i> { min_word_len_two_typos: Setting, min_word_len_one_typo: Setting, exact_words: Setting>, - /// attributes on which typo tolerance is not enabled. + /// Attributes on which typo tolerance is disabled. 
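+ /// Words from these attributes are stored in the dedicated exact word
+ /// databases and only match queries without typo derivations.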
exact_attributes: Setting>, } From 59e41d98e303205fbb38b467d947c853d15f9ca8 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 4 Apr 2022 21:17:06 +0200 Subject: [PATCH 1323/1889] add comments to integration test --- milli/tests/search/typo_tolerance.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 92d57c9b9..35cc4b4c2 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -181,6 +181,7 @@ fn test_disable_typo_on_attribute() { let txn = index.read_txn().unwrap(); let mut search = Search::new(&txn, &index); + // typo in `antebel(l)um` search.query("antebelum"); search.limit(10); search.authorize_typos(true); @@ -194,10 +195,10 @@ fn test_disable_typo_on_attribute() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); + // disable typos on `description` builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); builder.execute(|_| ()).unwrap(); - // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); search.query("antebelum"); search.limit(10); From ab185a59b5a969f76013670cb61c6892e435f32d Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:46:56 +0200 Subject: [PATCH 1324/1889] fix infos --- infos/src/main.rs | 26 ++++++++++++++++++++++++-- milli/src/update/delete_documents.rs | 2 -- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 6a270833b..05c168233 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -29,6 +29,8 @@ const ALL_DATABASE_NAMES: &[&str] = &[ FACET_ID_STRING_DOCIDS, FIELD_ID_DOCID_FACET_F64S, FIELD_ID_DOCID_FACET_STRINGS, + EXACT_WORD_DOCIDS, + EXACT_WORD_PREFIX_DOCIDS, DOCUMENTS, ]; @@ -384,10 +386,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + exact_word_docids, + exact_word_prefix_docids, field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, documents, - .. } = index; let main_name = "main"; @@ -437,6 +440,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } } + for result in exact_word_docids.remap_data_type::().iter(rtxn)? { + let (word, value) = result?; + heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } + for result in word_prefix_docids.remap_data_type::().iter(rtxn)? { let (word, value) = result?; heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); @@ -445,6 +456,14 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } } + for result in exact_word_prefix_docids.remap_data_type::().iter(rtxn)? { + let (word, value) = result?; + heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } + for result in docid_word_positions.remap_data_type::().iter(rtxn)? { let ((docid, word), value) = result?; let key = format!("{} {}", docid, word); @@ -968,8 +987,9 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, + exact_word_prefix_docids, + exact_word_docids, documents, - .. 
} = index; let names = if names.is_empty() { @@ -993,6 +1013,8 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), + EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(), + EXACT_WORD_PREFIX_DOCIDS => exact_word_prefix_docids.as_polymorph(), DOCUMENTS => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 58c4d4f70..b347aae38 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -473,8 +473,6 @@ fn remove_from_word_prefix_docids( } } - drop(iter); - Ok(prefixes_to_delete.into_set()) } From dac81b2d44e479a838f20cc9bc14e37efa430d7f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:48:56 +0200 Subject: [PATCH 1325/1889] add missing \n in cli settings --- cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index cf1e85984..202c67707 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -102,7 +102,7 @@ impl Settings { let exact_attributes = index.exact_attributes(&txn)?; println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}\n", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"), From b85cd4983ea01b062ce5e3a2c79a8a3a06f7b0ed Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 09:50:34 +0200 Subject: [PATCH 1326/1889] remove field_id_from_position --- milli/src/lib.rs | 4 ---- .../src/update/index_documents/extract/extract_word_docids.rs | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index b68c76048..ba2bd9b0f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -74,10 +74,6 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi (field_id as u32) << 16 | (relative as u32) } -pub fn field_id_from_position(position: u32) -> FieldId { - (position >> 16 & 0xffff) as u16 -} - /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 5083bbd90..0f8b4c039 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -12,7 +12,7 @@ use super::helpers::{ use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::index_documents::helpers::read_u32_ne_bytes; -use crate::{field_id_from_position, FieldId, Result}; +use crate::{relative_from_absolute_position, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. 
/// @@ -67,7 +67,7 @@ pub fn extract_word_docids( if added_to_exact && added_to_word_docids { break; } - let fid = field_id_from_position(position); + let (fid, _) = relative_from_absolute_position(position); if exact_attributes.contains(&fid) && !added_to_exact { exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; added_to_exact = true; From 9eec44dd9897f337be0109d077cd5ff46df917f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 5 Apr 2022 12:06:42 +0200 Subject: [PATCH 1327/1889] Update version (v0.25.0) --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 0d1a6a6a0..f516a60ba 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.24.1" +version = "0.25.0" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index f654ab29b..d1a244bbc 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.24.1" +version = "0.25.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 653de243a..a0590ce8e 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.24.1" +version = "0.25.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 8fe140582..b703ad04d 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.24.1" +version = "0.25.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d7eb7b9bf..07c509438 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.24.1" +version = "0.25.0" authors = ["Kerollmops "] edition = "2018" From 5cfd3d8407bd2bc11f6771385436681726ea8e12 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 14:10:22 +0200 Subject: [PATCH 1328/1889] add exact attributes documentation --- milli/src/index.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index c7441c590..42170bc80 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -964,6 +964,7 @@ impl Index { Ok(()) } + /// Returns the exact attributes: attributes for which typo is disallowed. pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result> { Ok(self .main @@ -971,17 +972,20 @@ impl Index { .unwrap_or_default()) } + /// Returns the list of exact attributes field ids. pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { let attrs = self.exact_attributes(txn)?; let fid_map = self.fields_ids_map(txn)?; Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect()) } + /// Writes the exact attributes to the database. pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; Ok(()) } + /// Clears the exact attributes from the store. 
pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> { self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?; Ok(()) From 201fea0fdaae3a334936a8ad52e2c5de8f178a84 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 14:14:15 +0200 Subject: [PATCH 1329/1889] limit extract_word_docids memory usage --- milli/src/update/delete_documents.rs | 1 + .../src/update/index_documents/extract/extract_word_docids.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b347aae38..77c32f0fb 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -502,6 +502,7 @@ fn remove_from_word_docids( } } } + Ok(()) } diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 0f8b4c039..f3a44162b 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -34,7 +34,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|x| x / 2), ); let mut exact_word_docids_sorter = create_sorter( @@ -42,7 +42,7 @@ pub fn extract_word_docids( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory, + max_memory.map(|x| x / 2), ); let mut value_buffer = Vec::new(); From d96e72e5dc2c01305d41fd3cb927ff77696f698f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 22 Mar 2022 15:22:14 +0100 Subject: [PATCH 1330/1889] Create formater with some tests --- .../search/{ => matches}/matching_words.rs | 48 +- milli/src/search/matches/mod.rs | 434 ++++++++++++++++++ milli/src/search/mod.rs | 4 +- 3 files changed, 469 insertions(+), 17 deletions(-) rename milli/src/search/{ => matches}/matching_words.rs (89%) create mode 100644 milli/src/search/matches/mod.rs diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matches/matching_words.rs similarity index 89% rename from milli/src/search/matching_words.rs rename to milli/src/search/matches/matching_words.rs index 67bdefb37..48f6fe809 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -1,11 +1,11 @@ use std::cmp::{min, Reverse}; -use std::collections::{BTreeMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::ops::{Index, IndexMut}; use levenshtein_automata::{Distance, DFA}; use meilisearch_tokenizer::Token; -use super::build_dfa; +use crate::search::build_dfa; use crate::search::query_tree::{Operation, Query}; type IsPrefix = bool; @@ -14,7 +14,7 @@ type IsPrefix = bool; /// referencing words that match the given query tree. #[derive(Default)] pub struct MatchingWords { - dfas: Vec<(DFA, String, u8, IsPrefix)>, + dfas: Vec<(DFA, String, u8, IsPrefix, usize)>, } impl MatchingWords { @@ -23,11 +23,11 @@ impl MatchingWords { let mut dfas: Vec<_> = fetch_queries(tree) .into_iter() // create DFAs for each word - .map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p)) + .map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id)) .collect(); // Sort word by len in DESC order prioritizing the longuest word, // in order to highlight the longuest part of the matched word. 
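// (keying the sort on `Reverse(query_word.len())` makes it descending, so when
// several DFAs accept the same token the longest query word is tried first)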
- dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| { + dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| { Reverse(query_word.len()) }); Self { dfas } @@ -35,14 +35,21 @@ impl MatchingWords { /// Returns the number of matching bytes if the word matches one of the query words. pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option { - self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| { + self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len) + } + + pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> { + self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| { match dfa.eval(word_to_highlight.text()) { Distance::Exact(t) if t <= *typo => { if *is_prefix { let len = bytes_to_highlight(word_to_highlight.text(), query_word); - Some(word_to_highlight.num_chars_from_bytes(len)) + Some((word_to_highlight.num_chars_from_bytes(len), *id)) } else { - Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len())) + Some(( + word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()), + *id, + )) } } _otherwise => None, @@ -52,26 +59,37 @@ impl MatchingWords { } /// Lists all words which can be considered as a match for the query tree. -fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { - fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { +fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> { + fn resolve_ops<'a>( + tree: &'a Operation, + out: &mut HashMap<(&'a str, u8, IsPrefix), usize>, + id: &mut usize, + ) { match tree { Operation::Or(_, ops) | Operation::And(ops) => { - ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); + ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id)); } Operation::Query(Query { prefix, kind }) => { let typo = if kind.is_exact() { 0 } else { kind.typo() }; - out.insert((kind.word(), typo, *prefix)); + out.entry((kind.word(), typo, *prefix)).or_insert_with(|| { + *id += 1; + *id + }); } Operation::Phrase(words) => { for word in words { - out.insert((word, 0, false)); + out.entry((word, 0, false)).or_insert_with(|| { + *id += 1; + *id + }); } } } } - let mut queries = HashSet::new(); - resolve_ops(tree, &mut queries); + let mut queries = HashMap::new(); + let mut id = 0; + resolve_ops(tree, &mut queries, &mut id); queries } diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs new file mode 100644 index 000000000..0ebf6305f --- /dev/null +++ b/milli/src/search/matches/mod.rs @@ -0,0 +1,434 @@ +use std::borrow::Cow; + +use matching_words::MatchingWords; +use meilisearch_tokenizer::token::SeparatorKind; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; + +use crate::search::query_tree::Operation; + +pub mod matching_words; + +const DEFAULT_CROP_SIZE: usize = 10; +const DEFAULT_CROP_MARKER: &'static str = "…"; +const DEFAULT_HIGHLIGHT_PREFIX: &'static str = ""; +const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = ""; + +pub struct MatcherBuilder { + matching_words: MatchingWords, + crop_size: usize, + crop_marker: Option, + highlight_prefix: Option, + highlight_suffix: Option, +} + +impl MatcherBuilder { + pub fn from_query_tree(query_tree: &Operation) -> Self { + let matching_words = MatchingWords::from_query_tree(query_tree); + + Self { + matching_words, + crop_size: DEFAULT_CROP_SIZE, + crop_marker: None, + highlight_prefix: None, + highlight_suffix: None, + } + } + + pub fn crop_size(&mut self, 
word_count: usize) -> &Self { + self.crop_size = word_count; + self + } + + pub fn crop_marker(&mut self, marker: String) -> &Self { + self.crop_marker = Some(marker); + self + } + + pub fn highlight_prefix(&mut self, prefix: String) -> &Self { + self.highlight_prefix = Some(prefix); + self + } + + pub fn highlight_suffix(&mut self, suffix: String) -> &Self { + self.highlight_suffix = Some(suffix); + self + } + + pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> { + let crop_marker = match &self.crop_marker { + Some(marker) => marker.as_str(), + None => &DEFAULT_CROP_MARKER, + }; + + let highlight_prefix = match &self.highlight_prefix { + Some(marker) => marker.as_str(), + None => &DEFAULT_HIGHLIGHT_PREFIX, + }; + let highlight_suffix = match &self.highlight_suffix { + Some(marker) => marker.as_str(), + None => &DEFAULT_HIGHLIGHT_SUFFIX, + }; + Matcher { + text, + tokens, + matching_words: &self.matching_words, + crop_size: self.crop_size, + crop_marker, + highlight_prefix, + highlight_suffix, + matches: None, + } + } +} + +// impl Default for MatcherBuilder { +// fn default() -> Self { +// Self { +// crop_size: DEFAULT_CROP_SIZE, +// crop_marker: None, +// highlight_prefix: None, +// highlight_suffix: None, +// } +// } +// } + +pub struct Match<'t> { + token: &'t Token<'t>, + match_len: usize, + // id of the query word that matches. + id: usize, + // position of the word in the whole text. + position: usize, +} + +pub struct MatchBounds { + start: usize, + length: usize, +} + +impl<'t> From<&Match<'t>> for MatchBounds { + fn from(m: &Match) -> Self { + MatchBounds { start: m.token.byte_start, length: m.match_len } + } +} + +pub struct Matcher<'t, 'm> { + text: &'t str, + tokens: &'t [Token<'t>], + matching_words: &'m MatchingWords, + crop_size: usize, + crop_marker: &'m str, + highlight_prefix: &'m str, + highlight_suffix: &'m str, + matches: Option>>, +} + +impl<'t> Matcher<'t, '_> { + fn compute_matches(&mut self) -> &mut Self { + let mut matches = Vec::new(); + let mut position = 0; + for token in self.tokens { + match token.is_separator() { + Some(SeparatorKind::Hard) => position += 7, + None => { + if let Some((match_len, id)) = + self.matching_words.matching_bytes_with_id(&token) + { + matches.push(Match { token, match_len, id, position }); + } + position += 1; + } + _otherwise => {} + } + } + + self.matches = Some(matches); + self + } + + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some(matches) => matches.iter().map(MatchBounds::from).collect(), + } + } + + fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) { + let byte_end = self + .tokens + .iter() + .filter(|t| t.is_separator().is_none()) + .enumerate() + .take_while(|(i, _)| *i < self.crop_size) + .last() + .map_or(self.text.len(), |(_, t)| t.byte_end); + + (0, byte_end) + } + + pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { + if !highlight && !crop { + // compute matches is not needed if no highlight or crop is requested. + Cow::Borrowed(self.text) + } else { + match &self.matches { + Some(matches) => { + let (byte_start, byte_end) = + if crop { self.crop_bounds(matches) } else { (0, self.text.len()) }; + + let mut formatted = Vec::new(); + + // push crop marker if it's not the start of the text. + if byte_start > 0 && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + let mut byte_index = byte_start; + + if highlight { + // insert highlight markers around matches. 
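+ // `matches` is in token order: entries before the crop window are
+ // skipped, iteration stops at the first entry past it, and the text
+ // between consecutive matches is copied through verbatim.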
+ for m in matches + .iter() + .skip_while(|m| m.token.byte_start < byte_start) + .take_while(|m| m.token.byte_start < byte_end) + { + if byte_index < m.token.byte_start { + formatted.push(&self.text[byte_index..m.token.byte_start]); + } + + formatted.push(self.highlight_prefix); + formatted.push(&self.text[m.token.byte_start..m.token.byte_end]); + formatted.push(self.highlight_suffix); + + byte_index = m.token.byte_end; + } + } + + // push the rest of the text between last match and the end of crop. + if byte_index < byte_end { + formatted.push(&self.text[byte_index..byte_end]); + } + + // push crop marker if it's not the end of the text. + if byte_end < self.text.len() && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + if formatted.len() == 1 { + // avoid concatenating if there is already 1 slice. + Cow::Borrowed(&self.text[byte_start..byte_end]) + } else { + Cow::Owned(formatted.concat()) + } + } + None => self.compute_matches().format(highlight, crop), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::search::query_tree::{Query, QueryKind}; + + fn query_tree() -> Operation { + Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: true, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("the".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ) + } + + #[test] + fn format_identity() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = false; + let crop = false; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + } + + #[test] + fn format_highlight() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = false; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text, because there is no matches. 
+ assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!( + &matcher.format(highlight, crop), + "Natalie risk her future to build a world with the boy she loves." + ); + } + + #[test] + fn format_crop() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = false; + let crop = true; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(highlight, crop), + "A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…she loves. Emily Henry: The Love That Split The World" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…future to build a world with the boy she loves." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); + } + + #[test] + fn format_highlight_crop() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = true; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? 
Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(highlight, crop), + "A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World"); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!( + &matcher.format(highlight, crop), + "…future to build a world with the boy she loves." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); + } +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 0d33d9042..a80e520a0 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matching_words::MatchingWords; +pub use self::matches::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; @@ -32,7 +32,7 @@ mod criteria; mod distinct; mod facet; mod fst_utils; -mod matching_words; +mod matches; mod query_tree; pub struct Search<'a> { From 3be179080321308baa4c4e82f741fefcb61e5869 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 28 Mar 2022 15:57:05 +0200 Subject: [PATCH 1331/1889] Add crop algorithm with naive match algorithm --- milli/src/search/matches/mod.rs | 199 +++++++++++++++++++++++--------- 1 file changed, 144 insertions(+), 55 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 0ebf6305f..fb3ab9c37 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -92,13 +92,15 @@ impl MatcherBuilder { // } // } -pub struct Match<'t> { - token: &'t Token<'t>, +#[derive(Clone)] +pub struct Match { match_len: usize, // id of the query word that matches. id: usize, // position of the word in the whole text. - position: usize, + word_position: usize, + // position of the token in the whole text. 
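+ // (token positions count every token, separators included, while word
+ // positions only count words; `compute_matches` below advances them
+ // accordingly)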
+ token_position: usize, } pub struct MatchBounds { @@ -106,12 +108,6 @@ pub struct MatchBounds { length: usize, } -impl<'t> From<&Match<'t>> for MatchBounds { - fn from(m: &Match) -> Self { - MatchBounds { start: m.token.byte_start, length: m.match_len } - } -} - pub struct Matcher<'t, 'm> { text: &'t str, tokens: &'t [Token<'t>], @@ -120,26 +116,22 @@ pub struct Matcher<'t, 'm> { crop_marker: &'m str, highlight_prefix: &'m str, highlight_suffix: &'m str, - matches: Option>>, + matches: Option>, } impl<'t> Matcher<'t, '_> { fn compute_matches(&mut self) -> &mut Self { let mut matches = Vec::new(); - let mut position = 0; + let mut word_position = 0; + let mut token_position = 0; for token in self.tokens { - match token.is_separator() { - Some(SeparatorKind::Hard) => position += 7, - None => { - if let Some((match_len, id)) = - self.matching_words.matching_bytes_with_id(&token) - { - matches.push(Match { token, match_len, id, position }); - } - position += 1; + if token.is_separator().is_none() { + if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) { + matches.push(Match { match_len, id, word_position, token_position }); } - _otherwise => {} + word_position += 1; } + token_position += 1; } self.matches = Some(matches); @@ -149,21 +141,104 @@ impl<'t> Matcher<'t, '_> { pub fn matches(&mut self) -> Vec { match &self.matches { None => self.compute_matches().matches(), - Some(matches) => matches.iter().map(MatchBounds::from).collect(), + Some(matches) => matches + .iter() + .map(|m| MatchBounds { + start: self.tokens[m.token_position].byte_start, + length: m.match_len, + }) + .collect(), } } - fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) { - let byte_end = self - .tokens - .iter() - .filter(|t| t.is_separator().is_none()) - .enumerate() - .take_while(|(i, _)| *i < self.crop_size) - .last() - .map_or(self.text.len(), |(_, t)| t.byte_end); + fn crop_around(&self, matches: &[Match]) -> (usize, usize) { + let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); + let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - (0, byte_end) + // TODO: buggy if no match and fisrt token is a sepparator + let mut remaining_words = + self.crop_size + first_match_word_position - last_match_word_position - 1; + let mut first_token_position = first_match_token_position; + let mut last_token_position = last_match_token_position; + + while remaining_words > 0 { + match ( + first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)), + last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)), + ) { + (Some(ft), Some(lt)) => { + match (ft.is_separator(), lt.is_separator()) { + // if they are both separators and are the same kind then advance both + (Some(f_kind), Some(s_kind)) => { + if f_kind == s_kind { + first_token_position -= 1; + last_token_position += 1; + } else if f_kind == SeparatorKind::Hard { + last_token_position += 1; + } else { + first_token_position -= 1; + } + } + // left is a word, advance left + (None, Some(_)) => { + first_token_position -= 1; + remaining_words -= 1; + } + // right is a word, advance right + (Some(_), None) => { + last_token_position += 1; + remaining_words -= 1; + } + // both are words, advance left then right if remaining_word > 0 + (None, None) => { + first_token_position 
-= 1; + remaining_words -= 1; + + if remaining_words > 0 { + last_token_position += 1; + remaining_words -= 1; + } + } + } + } + (Some(ft), None) => { + first_token_position -= 1; + if ft.is_separator().is_none() { + remaining_words -= 1; + } + } + (None, Some(lt)) => { + last_token_position += 1; + if lt.is_separator().is_none() { + remaining_words -= 1; + } + } + (None, None) => break, + } + } + + // if tokens after the end of the window are separators, + // then add them to the window in order to keep context in cropped text. + while let Some(_separator_kind) = last_token_position + .checked_add(1) + .and_then(|i| self.tokens.get(i)) + .and_then(|t| t.is_separator()) + { + last_token_position += 1; + } + + (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end) + } + + fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { + match matches { + // at least 2 matches + [first, last, ..] => self.crop_around(&[first.clone()][..]), + // less than 2 matches + _ => self.crop_around(matches), + } } pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { @@ -187,20 +262,23 @@ impl<'t> Matcher<'t, '_> { if highlight { // insert highlight markers around matches. + let tokens = self.tokens; for m in matches .iter() - .skip_while(|m| m.token.byte_start < byte_start) - .take_while(|m| m.token.byte_start < byte_end) + .skip_while(|m| tokens[m.token_position].byte_start < byte_start) + .take_while(|m| tokens[m.token_position].byte_start < byte_end) { - if byte_index < m.token.byte_start { - formatted.push(&self.text[byte_index..m.token.byte_start]); + let token = &tokens[m.token_position]; + + if byte_index < token.byte_start { + formatted.push(&self.text[byte_index..token.byte_start]); } formatted.push(self.highlight_prefix); - formatted.push(&self.text[m.token.byte_start..m.token.byte_end]); + formatted.push(&self.text[token.byte_start..token.byte_end]); formatted.push(self.highlight_suffix); - byte_index = m.token.byte_end; + byte_index = token.byte_end; } } @@ -271,7 +349,7 @@ mod tests { assert_eq!(&matcher.format(highlight, crop), &text); // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); @@ -306,12 +384,12 @@ mod tests { assert_eq!(&matcher.format(highlight, crop), &text); // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"); + assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. 
let text = "Natalie risk her future to build a world with the boy she loves."; @@ -343,18 +421,18 @@ mod tests { // no highlight should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right…" + "A quick brown fox can not jump 32 feet, right? …" ); - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + // Test phrase propagation + let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - // no highlight should return 10 last words with a marker at the start. + // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(highlight, crop), - "…she loves. Emily Henry: The Love That Split The World" + "…Split The World is a book written by Emily Henry. …" ); // Text containing some matches. @@ -368,6 +446,17 @@ mod tests { "…future to build a world with the boy she loves." ); + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…she loves. Emily Henry: The Love That Split The World." + ); + // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let analyzed = analyzer.analyze(&text); @@ -398,17 +487,9 @@ mod tests { // both should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right…" + "A quick brown fox can not jump 32 feet, right? …" ); - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); - let mut matcher = builder.build(&tokens[..], text); - // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World"); - // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let analyzed = analyzer.analyze(&text); @@ -420,6 +501,14 @@ mod tests { "…future to build a world with the boy she loves." ); + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); + // Text containing a match unordered and a match ordered. 
let text = "The world split void void void void void void void void void split the world void void"; let analyzed = analyzer.analyze(&text); From 844f546a8b3713df444dce8a0c016e0827be0067 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 28 Mar 2022 18:17:50 +0200 Subject: [PATCH 1332/1889] Add matches algorithm V1 --- milli/src/search/matches/mod.rs | 109 +++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 9 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index fb3ab9c37..9266992d0 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -92,7 +92,7 @@ impl MatcherBuilder { // } // } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct Match { match_len: usize, // id of the query word that matches. @@ -103,6 +103,7 @@ pub struct Match { token_position: usize, } +#[derive(Clone, Debug)] pub struct MatchBounds { start: usize, length: usize, @@ -151,7 +152,7 @@ impl<'t> Matcher<'t, '_> { } } - fn crop_around(&self, matches: &[Match]) -> (usize, usize) { + fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) { let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); @@ -229,16 +230,84 @@ impl<'t> Matcher<'t, '_> { last_token_position += 1; } - (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end) + (first_token_position, last_token_position) + } + + fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { + let mut ids = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.id > m.id { + order_score += 1; + } + + // compute distance between matches + distance_score -= (next_match.word_position - m.word_position).min(7) as i16; + } + + ids.push(m.id); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. 
+ (uniq_score, distance_score, order_score) + } + + fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { + if matches.len() > 1 { + let mut best_interval = (0, 1); + let mut best_interval_score = self.match_interval_score(&matches[0..=1]); + let mut interval_first = 0; + let mut interval_last = 1; + for (index, next_match) in matches.iter().enumerate().skip(2) { + // if next match would make interval gross more than crop_size + if next_match.word_position - matches[interval_first].word_position > self.crop_size + { + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + + // keep interval if it's the best + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + best_interval_score = interval_score; + } + + // advance start of the interval while interval is longer than crop_size + while next_match.word_position - matches[interval_first].word_position + > self.crop_size + { + interval_first += 1; + } + } + interval_last = index; + } + + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + } + + &matches[best_interval.0..=best_interval.1] + } else { + matches + } } fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { - match matches { - // at least 2 matches - [first, last, ..] => self.crop_around(&[first.clone()][..]), - // less than 2 matches - _ => self.crop_around(matches), - } + let match_interval = self.find_best_match_interval(matches); + + let (first_token_position, last_token_position) = self.token_crop_bounds(match_interval); + + (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end) } pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { @@ -467,6 +536,28 @@ mod tests { &matcher.format(highlight, crop), "…void void void void void split the world void void" ); + + // Text containing matches with diferent density. + let text = "split void the void void world void void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); + + // Text containing matches with same word. + let text = "split split split split split split void void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. 
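+ // All the leading "split" repetitions share a single query word id, so
+ // any interval over them has a unique-match score of 1, while the
+ // trailing "split the world" covers three distinct ids and wins.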
+ assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); } #[test] From 4428cb5909a8e936ff4d80fa5414fa63c9b8773d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 29 Mar 2022 14:51:02 +0200 Subject: [PATCH 1333/1889] Add some tests and fix some corner cases --- milli/src/search/matches/mod.rs | 118 +++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 9 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 9266992d0..680dbdffc 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -158,9 +158,13 @@ impl<'t> Matcher<'t, '_> { let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - // TODO: buggy if no match and fisrt token is a sepparator + // TODO: buggy if no match and first token is a sepparator let mut remaining_words = - self.crop_size + first_match_word_position - last_match_word_position - 1; + self.crop_size + first_match_word_position - last_match_word_position; + // if first token is a word, then remove 1 to remaining_words. + if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) { + remaining_words -= 1; + } let mut first_token_position = first_match_token_position; let mut last_token_position = last_match_token_position; @@ -204,18 +208,21 @@ impl<'t> Matcher<'t, '_> { } } } + // the end of the text is reached, advance left. (Some(ft), None) => { first_token_position -= 1; if ft.is_separator().is_none() { remaining_words -= 1; } } + // the start of the text is reached, advance right. (None, Some(lt)) => { last_token_position += 1; if lt.is_separator().is_none() { remaining_words -= 1; } } + // no more token to add. 
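+ // In summary: the window grows one token at a time on whichever side
+ // still has tokens, and only word (non-separator) tokens consume the
+ // `remaining_words` budget, so separators never count against the
+ // requested crop size.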
(None, None) => break, } } @@ -263,13 +270,14 @@ impl<'t> Matcher<'t, '_> { fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { if matches.len() > 1 { - let mut best_interval = (0, 1); - let mut best_interval_score = self.match_interval_score(&matches[0..=1]); + let mut best_interval = (0, 0); + let mut best_interval_score = self.match_interval_score(&matches[0..=0]); let mut interval_first = 0; - let mut interval_last = 1; - for (index, next_match) in matches.iter().enumerate().skip(2) { + let mut interval_last = 0; + for (index, next_match) in matches.iter().enumerate().skip(1) { // if next match would make interval gross more than crop_size - if next_match.word_position - matches[interval_first].word_position > self.crop_size + if next_match.word_position - matches[interval_first].word_position + >= self.crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -282,7 +290,7 @@ impl<'t> Matcher<'t, '_> { // advance start of the interval while interval is longer than crop_size while next_match.word_position - matches[interval_first].word_position - > self.crop_size + >= self.crop_size { interval_first += 1; } @@ -307,10 +315,15 @@ impl<'t> Matcher<'t, '_> { let (first_token_position, last_token_position) = self.token_crop_bounds(match_interval); - (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end) + let byte_start = self.tokens.get(first_token_position).map_or(0, |t| t.byte_start); + let byte_end = self.tokens.get(last_token_position).map_or(byte_start, |t| t.byte_end); + (byte_start, byte_end) } pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { + // If 0 it will be considered null and thus not crop the field + // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + let crop = crop && self.crop_size > 0; if !highlight && !crop { // compute matches is not needed if no highlight or crop is requested. Cow::Borrowed(self.text) @@ -444,6 +457,20 @@ mod tests { let highlight = true; let crop = false; + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ":-)"); + // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let analyzed = analyzer.analyze(&text); @@ -482,6 +509,20 @@ mod tests { let highlight = false; let crop = true; + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ":-)"); + // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let analyzed = analyzer.analyze(&text); @@ -493,6 +534,17 @@ mod tests { "A quick brown fox can not jump 32 feet, right? 
…" ); + // Text without any match starting by a separator. + let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(highlight, crop), + "(A quick brown fox can not jump 32 feet, right? …" + ); + // Test phrase propagation let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; let analyzed = analyzer.analyze(&text); @@ -570,6 +622,20 @@ mod tests { let highlight = true; let crop = true; + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ":-)"); + // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let analyzed = analyzer.analyze(&text); @@ -611,4 +677,38 @@ mod tests { "…void void void void void split the world void void" ); } + + #[test] + fn smaller_crop_size() { + //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + let query_tree = query_tree(); + + let mut builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = false; + let crop = true; + + let text = "void void split the world void void."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + + // set a smaller crop size + builder.crop_size(2); + let mut matcher = builder.build(&tokens[..], text); + // because crop size < query size, partially format matches. + assert_eq!(&matcher.format(highlight, crop), "…split the …"); + + // set a smaller crop size + builder.crop_size(1); + let mut matcher = builder.build(&tokens[..], text); + // because crop size < query size, partially format matches. + assert_eq!(&matcher.format(highlight, crop), "…split …"); + + // set a smaller crop size + builder.crop_size(0); + let mut matcher = builder.build(&tokens[..], text); + // because crop size is 0, crop is ignored. 
+ assert_eq!(&matcher.format(highlight, crop), "void void split the world void void."); + } } From 734d0899d341a23f6bb3e49efc2da7828b46fa63 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 29 Mar 2022 14:57:21 +0200 Subject: [PATCH 1334/1889] Publish Matcher --- milli/src/search/matches/mod.rs | 4 ++-- milli/src/search/mod.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 680dbdffc..2169c54ab 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,12 +1,12 @@ use std::borrow::Cow; -use matching_words::MatchingWords; +pub use matching_words::MatchingWords; use meilisearch_tokenizer::token::SeparatorKind; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; use crate::search::query_tree::Operation; -pub mod matching_words; +mod matching_words; const DEFAULT_CROP_SIZE: usize = 10; const DEFAULT_CROP_MARKER: &'static str = "…"; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index a80e520a0..752ae236b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::matching_words::MatchingWords; +pub use self::matches::{Matcher, MatcherBuilder, MatchingWords}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; From 29c5f76d7f389b3bfb02f00b88aa1b98318eecce Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 10:50:23 +0200 Subject: [PATCH 1335/1889] Use new matcher in http-ui --- http-ui/src/main.rs | 41 +++++++++++---------------------- milli/src/lib.rs | 4 +++- milli/src/search/matches/mod.rs | 10 ++++++++ 3 files changed, 26 insertions(+), 29 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 26c1034eb..fdfc04af9 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -25,7 +25,7 @@ use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, }; use milli::{ - obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatchingWords, + obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatcherBuilder, SearchResult, SortError, }; use once_cell::sync::OnceCell; @@ -152,43 +152,25 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { Self { analyzer } } - fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value { + fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), Value::Number(number) => Value::Number(number), Value::String(old_string) => { - let mut string = String::new(); let analyzed = self.analyzer.analyze(&old_string); - for (word, token) in analyzed.reconstruct() { - if token.is_word() { - match matching_words.matching_bytes(&token) { - Some(chars_to_highlight) => { - let mut chars = word.chars(); + let analyzed: Vec<_> = analyzed.tokens().collect(); + let mut matcher = matcher_builder.build(&analyzed[..], &old_string); - string.push_str(""); - // push the part to highlight - string.extend(chars.by_ref().take(chars_to_highlight)); - string.push_str(""); - // push the suffix after highlight - string.extend(chars); - } - // no highlight - None => string.push_str(word), - } - } else 
{
- string.push_str(word);
- }
- }
- Value::String(string)
+ Value::String(matcher.format(true, true).to_string())
 }
 Value::Array(values) => Value::Array(
- values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(),
+ values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(),
 ),
 Value::Object(object) => Value::Object(
 object
 .into_iter()
- .map(|(k, v)| (k, self.highlight_value(v, matching_words)))
+ .map(|(k, v)| (k, self.highlight_value(v, matcher_builder)))
 .collect(),
 ),
 }
 }
@@ -197,14 +179,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
 fn highlight_record(
 &self,
 object: &mut Map<String, Value>,
- matching_words: &MatchingWords,
+ matcher_builder: &MatcherBuilder,
 attributes_to_highlight: &HashSet<String>,
 ) {
 // TODO do we need to create a string for element that are not and needs to be highlight?
 for (key, value) in object.iter_mut() {
 if attributes_to_highlight.contains(key) {
 let old_value = mem::take(value);
- *value = self.highlight_value(old_value, matching_words);
+ *value = self.highlight_value(old_value, matcher_builder);
 }
 }
 }
@@ -819,12 +801,15 @@ async fn main() -> anyhow::Result<()> {
 let stop_words = fst::Set::default();
 let highlighter = Highlighter::new(&stop_words);
+ let mut matcher_builder = MatcherBuilder::from_matching_words(matching_words);
+ matcher_builder.highlight_prefix("<mark>".to_string());
+ matcher_builder.highlight_suffix("</mark>".to_string());
 for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
 let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
 if !disable_highlighting {
 highlighter.highlight_record(
 &mut object,
- &matching_words,
+ &matcher_builder,
 &attributes_to_highlight,
 );
 }
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index ba2bd9b0f..9a9ec428c 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -36,7 +36,9 @@ pub use self::heed_codec::{
 RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec,
 };
 pub use self::index::Index;
-pub use self::search::{FacetDistribution, Filter, MatchingWords, Search, SearchResult};
+pub use self::search::{
+ FacetDistribution, Filter, MatcherBuilder, MatchingWords, Search, SearchResult,
+};

 pub type Result = std::result::Result;

diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index 2169c54ab..aeaa8196e 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -34,6 +34,16 @@ impl MatcherBuilder {
 }
 }

+ pub fn from_matching_words(matching_words: MatchingWords) -> Self {
+ Self {
+ matching_words,
+ crop_size: DEFAULT_CROP_SIZE,
+ crop_marker: None,
+ highlight_prefix: None,
+ highlight_suffix: None,
+ }
+ }
+
 pub fn crop_size(&mut self, word_count: usize) -> &Self {
 self.crop_size = word_count;
 self

From bd30ee97b8c67c32c264bbeaad7114bb5367caf2 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Wed, 30 Mar 2022 14:00:06 +0200
Subject: [PATCH 1336/1889] Keep separators at start of the cropped string

---
 milli/src/search/matches/mod.rs | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index aeaa8196e..9ab1ef50f 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -247,6 +247,15 @@ impl<'t> Matcher<'t, '_> {
 last_token_position += 1;
 }

+ // same for start
+ while let Some(_separator_kind) = first_token_position
+ .checked_sub(1)
+ .and_then(|i| self.tokens.get(i))
+ .and_then(|t| t.is_separator())
+ {
+ first_token_position -= 1;
+ }
+
 (first_token_position,
last_token_position) } @@ -563,7 +572,7 @@ mod tests { // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(highlight, crop), - "…Split The World is a book written by Emily Henry. …" + "…. Split The World is a book written by Emily Henry. …" ); // Text containing some matches. @@ -574,7 +583,7 @@ mod tests { // no highlight should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…future to build a world with the boy she loves." + "… future to build a world with the boy she loves." ); // Text containing all matches. @@ -585,7 +594,7 @@ mod tests { // no highlight should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…she loves. Emily Henry: The Love That Split The World." + "… she loves. Emily Henry: The Love That Split The World." ); // Text containing a match unordered and a match ordered. @@ -596,7 +605,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…void void void void void split the world void void" + "… void void void void void split the world void void" ); // Text containing matches with diferent density. @@ -607,7 +616,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…void void void void void split the world void void" + "… void void void void void split the world void void" ); // Text containing matches with same word. @@ -618,7 +627,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…void void void void void split the world void void" + "… void void void void void split the world void void" ); } @@ -665,7 +674,7 @@ mod tests { // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( &matcher.format(highlight, crop), - "…future to build a world with the boy she loves." + "… future to build a world with the boy she loves." ); // Text containing all matches. @@ -674,7 +683,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(highlight, crop), "… she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; @@ -684,7 +693,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…void void void void void split the world void void" + "… void void void void void split the world void void" ); } @@ -707,13 +716,13 @@ mod tests { builder.crop_size(2); let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split the …"); + assert_eq!(&matcher.format(highlight, crop), "… split the …"); // set a smaller crop size builder.crop_size(1); let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. 
- assert_eq!(&matcher.format(highlight, crop), "…split …"); + assert_eq!(&matcher.format(highlight, crop), "… split …"); // set a smaller crop size builder.crop_size(0); From 6dc345bc531015c565b8640ca1a4576cbf9e4486 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 15:15:14 +0200 Subject: [PATCH 1337/1889] Test and Fix prefix highlight --- milli/src/search/matches/mod.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 9ab1ef50f..816f5e273 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -376,8 +376,10 @@ impl<'t> Matcher<'t, '_> { } formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..token.byte_end]); + formatted.push(&self.text[token.byte_start..][..m.match_len]); formatted.push(self.highlight_suffix); + formatted + .push(&self.text[token.byte_start + m.match_len..token.byte_end]); byte_index = token.byte_end; } @@ -516,6 +518,17 @@ mod tests { &matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves." ); + + // Text containing some matches by prefix. + let text = "Natalie risk her future to build a worldle with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!( + &matcher.format(highlight, crop), + "Natalie risk her future to build a worldle with the boy she loves." + ); } #[test] From b3f0f39106fba1c596aefca61877fd8ca1395d8b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 15:22:18 +0200 Subject: [PATCH 1338/1889] Make some cleaning --- milli/src/lib.rs | 2 +- milli/src/search/matches/mod.rs | 18 +++--------------- milli/src/search/mod.rs | 2 +- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 9a9ec428c..6cbb9f126 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,7 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, MatcherBuilder, MatchingWords, Search, SearchResult, + FacetDistribution, Filter, MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult, }; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 816f5e273..e66ba781c 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,8 +1,7 @@ use std::borrow::Cow; pub use matching_words::MatchingWords; -use meilisearch_tokenizer::token::SeparatorKind; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; +use meilisearch_tokenizer::token::{SeparatorKind, Token}; use crate::search::query_tree::Operation; @@ -91,17 +90,6 @@ impl MatcherBuilder { } } -// impl Default for MatcherBuilder { -// fn default() -> Self { -// Self { -// crop_size: DEFAULT_CROP_SIZE, -// crop_marker: None, -// highlight_prefix: None, -// highlight_suffix: None, -// } -// } -// } - #[derive(Clone, Debug)] pub struct Match { match_len: usize, @@ -115,8 +103,8 @@ pub struct Match { #[derive(Clone, Debug)] pub struct MatchBounds { - start: usize, - length: usize, + pub start: usize, + pub length: usize, } pub struct Matcher<'t, 'm> { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 752ae236b..8804d9151 100644 --- a/milli/src/search/mod.rs +++ 
b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::{Matcher, MatcherBuilder, MatchingWords}; +pub use self::matches::{MatchBounds, Matcher, MatcherBuilder, MatchingWords}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; From a93cd8c61c0b6b4f096e65e06de76aaaf81fde52 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 15:43:49 +0200 Subject: [PATCH 1339/1889] Fix prefix highlight with special chars --- milli/src/search/matches/mod.rs | 60 ++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index e66ba781c..a4c29ce66 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -363,11 +363,15 @@ impl<'t> Matcher<'t, '_> { formatted.push(&self.text[byte_index..token.byte_start]); } + let highlight_byte_index = self.text[token.byte_start..] + .char_indices() + .enumerate() + .find(|(i, _)| *i == m.match_len) + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..][..m.match_len]); + formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); - formatted - .push(&self.text[token.byte_start + m.match_len..token.byte_end]); + formatted.push(&self.text[highlight_byte_index..token.byte_end]); byte_index = token.byte_end; } @@ -398,6 +402,8 @@ impl<'t> Matcher<'t, '_> { #[cfg(test)] mod tests { + use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; + use super::*; use crate::search::query_tree::{Query, QueryKind}; @@ -506,17 +512,53 @@ mod tests { &matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves." ); + } - // Text containing some matches by prefix. - let text = "Natalie risk her future to build a worldle with the boy she loves."; + #[test] + fn highlight_unicode() { + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "wessfalia".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = false; + + // Text containing prefix match. + let text = "Ŵôřlḑôle"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!( - &matcher.format(highlight, crop), - "Natalie risk her future to build a worldle with the boy she loves." - ); + assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑôle"); + + // Text containing unicode match. + let text = "Ŵôřlḑ"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑ"); + + // Text containing unicode match. 
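+ // (`match_len` counts characters, so the formatter converts it to a
+ // byte offset with `char_indices` before slicing; with multi-byte
+ // letters like "á" the two differ, which is what this test exercises.)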
+ let text = "Westfália"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "Westfália"); } #[test] From 56e0edd62119d345e67755a4f8a94ad2f03de5cc Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 17:22:58 +0200 Subject: [PATCH 1340/1889] Put crop markers direclty around words --- milli/src/search/matches/mod.rs | 47 ++++++++++----------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index a4c29ce66..c6b89f9ec 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -225,25 +225,6 @@ impl<'t> Matcher<'t, '_> { } } - // if tokens after the end of the window are separators, - // then add them to the window in order to keep context in cropped text. - while let Some(_separator_kind) = last_token_position - .checked_add(1) - .and_then(|i| self.tokens.get(i)) - .and_then(|t| t.is_separator()) - { - last_token_position += 1; - } - - // same for start - while let Some(_separator_kind) = first_token_position - .checked_sub(1) - .and_then(|i| self.tokens.get(i)) - .and_then(|t| t.is_separator()) - { - first_token_position -= 1; - } - (first_token_position, last_token_position) } @@ -593,7 +574,7 @@ mod tests { // no highlight should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right? …" + "A quick brown fox can not jump 32 feet, right…" ); // Text without any match starting by a separator. @@ -604,7 +585,7 @@ mod tests { // no highlight should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "(A quick brown fox can not jump 32 feet, right? …" + "(A quick brown fox can not jump 32 feet, right…" ); // Test phrase propagation @@ -615,7 +596,7 @@ mod tests { // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(highlight, crop), - "…. Split The World is a book written by Emily Henry. …" + "…Split The World is a book written by Emily Henry…" ); // Text containing some matches. @@ -626,7 +607,7 @@ mod tests { // no highlight should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… future to build a world with the boy she loves." + "…future to build a world with the boy she loves…" ); // Text containing all matches. @@ -637,7 +618,7 @@ mod tests { // no highlight should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… she loves. Emily Henry: The Love That Split The World." + "…she loves. Emily Henry: The Love That Split The World." ); // Text containing a match unordered and a match ordered. @@ -648,7 +629,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… void void void void void split the world void void" + "…void void void void void split the world void void" ); // Text containing matches with diferent density. @@ -659,7 +640,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… void void void void void split the world void void" + "…void void void void void split the world void void" ); // Text containing matches with same word. 
@@ -670,7 +651,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… void void void void void split the world void void" + "…void void void void void split the world void void" ); } @@ -706,7 +687,7 @@ mod tests { // both should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right? …" + "A quick brown fox can not jump 32 feet, right…" ); // Text containing some matches. @@ -717,7 +698,7 @@ mod tests { // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( &matcher.format(highlight, crop), - "… future to build a world with the boy she loves." + "…future to build a world with the boy she loves…" ); // Text containing all matches. @@ -726,7 +707,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "… she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; @@ -736,7 +717,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… void void void void void split the world void void" + "…void void void void void split the world void void" ); } @@ -759,13 +740,13 @@ mod tests { builder.crop_size(2); let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "… split the …"); + assert_eq!(&matcher.format(highlight, crop), "…split the…"); // set a smaller crop size builder.crop_size(1); let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "… split …"); + assert_eq!(&matcher.format(highlight, crop), "…split…"); // set a smaller crop size builder.crop_size(0); From 3bb1e35adac89c9b1e371dcd7b82372063b520ce Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 4 Apr 2022 18:56:59 +0200 Subject: [PATCH 1341/1889] Fix match count --- milli/src/search/matches/matching_words.rs | 339 ++++++++++++--------- milli/src/search/matches/mod.rs | 169 +++++----- milli/src/search/mod.rs | 17 +- milli/src/search/query_tree.rs | 175 ++++++++++- 4 files changed, 469 insertions(+), 231 deletions(-) diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 48f6fe809..274634554 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -1,12 +1,12 @@ use std::cmp::{min, Reverse}; -use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; +use std::fmt; use std::ops::{Index, IndexMut}; use levenshtein_automata::{Distance, DFA}; use meilisearch_tokenizer::Token; use crate::search::build_dfa; -use crate::search::query_tree::{Operation, Query}; type IsPrefix = bool; @@ -14,83 +14,129 @@ type IsPrefix = bool; /// referencing words that match the given query tree. 
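+/// Each entry pairs the `MatchingWord`s of one query word or phrase with the
+/// ids of the primitive query words they come from; a query like
+/// `split this world` can for example be represented as the entries
+/// `([split], [0])`, `([this], [1])` and `([world], [2])`.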
#[derive(Default)]
pub struct MatchingWords {
- dfas: Vec<(DFA, String, u8, IsPrefix, usize)>,
+ inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
}

impl MatchingWords {
- pub fn from_query_tree(tree: &Operation) -> Self {
- // fetch matchable words from the query tree
- let mut dfas: Vec<_> = fetch_queries(tree)
- .into_iter()
- // create DFAs for each word
- .map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id))
- .collect();
- // Sort word by len in DESC order prioritizing the longuest word,
+ pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
+ // Sort word by len in DESC order prioritizing the longuest matches,
 // in order to highlight the longuest part of the matched word.
- dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| {
- Reverse(query_word.len())
- });
- Self { dfas }
+ matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
+
+ Self { inner: matching_words }
 }

- /// Returns the number of matching bytes if the word matches one of the query words.
- pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
- self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len)
- }
-
- pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> {
- self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| {
- match dfa.eval(word_to_highlight.text()) {
- Distance::Exact(t) if t <= *typo => {
- if *is_prefix {
- let len = bytes_to_highlight(word_to_highlight.text(), query_word);
- Some((word_to_highlight.num_chars_from_bytes(len), *id))
- } else {
- Some((
- word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()),
- *id,
- ))
- }
- }
- _otherwise => None,
- }
- })
+ pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
+ MatchesIter { inner: Box::new(self.inner.iter()), token }
 }
}

+pub struct MatchesIter<'a, 'b> {
+ inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
+ token: &'b Token<'b>,
+}
+
+impl<'a> Iterator for MatchesIter<'a, '_> {
+ type Item = MatchType<'a>;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ match self.inner.next() {
+ Some((matching_words, ids)) => match matching_words[0].match_token(&self.token) {
+ Some(char_len) => {
+ if matching_words.len() > 1 {
+ Some(MatchType::Partial(PartialMatch {
+ matching_words: &matching_words[1..],
+ ids,
+ char_len,
+ }))
+ } else {
+ Some(MatchType::Full { char_len, ids })
+ }
+ }
+ None => self.next(),
+ },
+ None => None,
+ }
+ }
+}
+
-/// Lists all words which can be considered as a match for the query tree.
-fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> {
- fn resolve_ops<'a>(
- tree: &'a Operation,
- out: &mut HashMap<(&'a str, u8, IsPrefix), usize>,
- id: &mut usize,
- ) {
- match tree {
- Operation::Or(_, ops) | Operation::And(ops) => {
- ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id));
- }
- Operation::Query(Query { prefix, kind }) => {
- let typo = if kind.is_exact() { 0 } else { kind.typo() };
- out.entry((kind.word(), typo, *prefix)).or_insert_with(|| {
- *id += 1;
- *id
- });
- }
- Operation::Phrase(words) => {
- for word in words {
- out.entry((word, 0, false)).or_insert_with(|| {
- *id += 1;
- *id
- });
- }
- }
- }
- }
-
- let mut queries = HashMap::new();
- let mut id = 0;
- resolve_ops(tree, &mut queries, &mut id);
- queries
-}

+pub type PrimitiveWordId = u8;
+pub struct MatchingWord {
+ pub dfa: DFA,
+ pub word: String,
+ pub typo: u8,
+ pub prefix: IsPrefix,
+}
+
+impl fmt::Debug for MatchingWord {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.debug_struct("MatchingWord")
+ .field("word", &self.word)
+ .field("typo", &self.typo)
+ .field("prefix", &self.prefix)
+ .finish()
+ }
+}
+
+impl PartialEq for MatchingWord {
+ fn eq(&self, other: &Self) -> bool {
+ self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
+ }
+}
+
+impl MatchingWord {
+ pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
+ let dfa = build_dfa(&word, typo, prefix);
+
+ Self { dfa, word, typo, prefix }
+ }
+
+ pub fn match_token(&self, token: &Token) -> Option<usize> {
+ match self.dfa.eval(token.text()) {
+ Distance::Exact(t) if t <= self.typo => {
+ if self.prefix {
+ let len = bytes_to_highlight(token.text(), &self.word);
+ Some(token.num_chars_from_bytes(len))
+ } else {
+ Some(token.num_chars_from_bytes(token.text().len()))
+ }
+ }
+ _otherwise => None,
+ }
+ }
+}
+
+#[derive(Debug, PartialEq)]
+pub enum MatchType<'a> {
+ Full { char_len: usize, ids: &'a [PrimitiveWordId] },
+ Partial(PartialMatch<'a>),
+}
+
+#[derive(Debug, PartialEq)]
+pub struct PartialMatch<'a> {
+ matching_words: &'a [MatchingWord],
+ ids: &'a [PrimitiveWordId],
+ char_len: usize,
+}
+
+impl<'a> PartialMatch<'a> {
+ pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
+ self.matching_words[0].match_token(token).map(|char_len| {
+ if self.matching_words.len() > 1 {
+ MatchType::Partial(PartialMatch {
+ matching_words: &self.matching_words[1..],
+ ids: self.ids,
+ char_len,
+ })
+ } else {
+ MatchType::Full { char_len, ids: self.ids }
+ }
+ })
+ }
+
+ pub fn char_len(&self) -> usize {
+ self.char_len
+ }
+}

// A simple wrapper around vec so we can get contiguous but index it like it's 2D array.
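// (This wrapper backs the dynamic-programming matrix that `bytes_to_highlight`
// fills, one cell per pair of characters, to decide how many bytes of a
// partially matched word deserve highlighting.)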
@@ -203,7 +249,6 @@ mod tests { use meilisearch_tokenizer::TokenKind; use super::*; - use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::MatchingWords; #[test] @@ -271,102 +316,104 @@ mod tests { #[test] fn matching_words() { - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: true, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); + let matching_words = vec![ + (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]), + (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + ]; - let matching_words = MatchingWords::from_query_tree(&query_tree); + let matching_words = MatchingWords::new(matching_words); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("word"), - byte_start: 0, - char_index: 0, - byte_end: "word".len(), - char_map: None, - }), - Some(3) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("word"), + byte_start: 0, + char_index: 0, + byte_end: "word".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 3, ids: &[2] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("nyc"), - byte_start: 0, - char_index: 0, - byte_end: "nyc".len(), - char_map: None, - }), + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("nyc"), + byte_start: 0, + char_index: 0, + byte_end: "nyc".len(), + char_map: None, + }) + .next(), None ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("world"), - byte_start: 0, - char_index: 0, - byte_end: "world".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("world"), + byte_start: 0, + char_index: 0, + byte_end: "world".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[2] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("splitted"), - byte_start: 0, - char_index: 0, - byte_end: "splitted".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("splitted"), + byte_start: 0, + char_index: 0, + byte_end: "splitted".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[0] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("thisnew"), - byte_start: 0, - char_index: 0, - byte_end: "thisnew".len(), - char_map: None, - }), + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("thisnew"), + byte_start: 0, + char_index: 0, + byte_end: "thisnew".len(), + char_map: None, + }) + .next(), None ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("borld"), - byte_start: 0, - char_index: 0, - byte_end: "borld".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("borld"), + byte_start: 0, + char_index: 0, + byte_end: "borld".len(), + char_map: None, + }) + .next(), + 
Some(MatchType::Full { char_len: 5, ids: &[2] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("wordsplit"), - byte_start: 0, - char_index: 0, - byte_end: "wordsplit".len(), - char_map: None, - }), - Some(4) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("wordsplit"), + byte_start: 0, + char_index: 0, + byte_end: "wordsplit".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 4, ids: &[2] }) ); } } diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index c6b89f9ec..a99798a9b 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,11 +1,10 @@ use std::borrow::Cow; pub use matching_words::MatchingWords; +use matching_words::{MatchType, PrimitiveWordId}; use meilisearch_tokenizer::token::{SeparatorKind, Token}; -use crate::search::query_tree::Operation; - -mod matching_words; +pub mod matching_words; const DEFAULT_CROP_SIZE: usize = 10; const DEFAULT_CROP_MARKER: &'static str = "…"; @@ -21,18 +20,6 @@ pub struct MatcherBuilder { } impl MatcherBuilder { - pub fn from_query_tree(query_tree: &Operation) -> Self { - let matching_words = MatchingWords::from_query_tree(query_tree); - - Self { - matching_words, - crop_size: DEFAULT_CROP_SIZE, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } - pub fn from_matching_words(matching_words: MatchingWords) -> Self { Self { matching_words, @@ -93,8 +80,8 @@ impl MatcherBuilder { #[derive(Clone, Debug)] pub struct Match { match_len: usize, - // id of the query word that matches. - id: usize, + // ids of the query words that matches. + ids: Vec, // position of the word in the whole text. word_position: usize, // position of the token in the whole text. @@ -123,10 +110,72 @@ impl<'t> Matcher<'t, '_> { let mut matches = Vec::new(); let mut word_position = 0; let mut token_position = 0; - for token in self.tokens { + while let Some(token) = self.tokens.get(token_position) { if token.is_separator().is_none() { - if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) { - matches.push(Match { match_len, id, word_position, token_position }); + 'matches: for match_type in self.matching_words.match_token(&token) { + match match_type { + MatchType::Full { char_len, ids } => { + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + // stop on the first match + break; + } + MatchType::Partial(mut partial) => { + let mut potential_matches = + vec![(token_position, word_position, partial.char_len())]; + let mut t_position = 1; + let mut w_position = 1; + 'partials: for token in &self.tokens[token_position + 1..] 
{ + if token.is_separator().is_none() { + partial = match partial.match_token(&token) { + Some(MatchType::Partial(partial)) => { + potential_matches.push(( + token_position + t_position, + word_position + w_position, + partial.char_len(), + )); + partial + } + // partial match is now full, we keep this matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| { + Match { + match_len, + ids: ids.to_vec(), + word_position, + token_position, + } + }, + ); + + matches.extend(iter); + + word_position += w_position; + token_position += t_position; + + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + + break 'matches; + } + // no match, continue to next match. + None => break 'partials, + }; + w_position += 1; + } + t_position += 1; + } + } + } } word_position += 1; } @@ -229,7 +278,7 @@ impl<'t> Matcher<'t, '_> { } fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { - let mut ids = Vec::with_capacity(matches.len()); + let mut ids: Vec = Vec::with_capacity(matches.len()); let mut order_score = 0; let mut distance_score = 0; @@ -237,7 +286,7 @@ impl<'t> Matcher<'t, '_> { while let Some(m) = iter.next() { if let Some(next_match) = iter.peek() { // if matches are ordered - if next_match.id > m.id { + if next_match.ids.iter().min() > m.ids.iter().min() { order_score += 1; } @@ -245,7 +294,7 @@ impl<'t> Matcher<'t, '_> { distance_score -= (next_match.word_position - m.word_position).min(7) as i16; } - ids.push(m.id); + ids.extend(m.ids.iter()); } ids.sort_unstable(); @@ -348,7 +397,8 @@ impl<'t> Matcher<'t, '_> { .char_indices() .enumerate() .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start) + .min(token.byte_end); formatted.push(self.highlight_prefix); formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); @@ -386,33 +436,23 @@ mod tests { use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use super::*; - use crate::search::query_tree::{Query, QueryKind}; + use crate::search::matches::matching_words::MatchingWord; - fn query_tree() -> Operation { - Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: true, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("the".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ) + fn matching_words() -> MatchingWords { + let matching_words = vec![ + (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + ]; + + MatchingWords::new(matching_words) } #[test] fn format_identity() { - let query_tree = query_tree(); + let matching_words = matching_words(); - let builder = MatcherBuilder::from_query_tree(&query_tree); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = false; @@ -445,9 +485,9 @@ mod tests { #[test] fn format_highlight() { - let query_tree = query_tree(); + let matching_words = matching_words(); - let builder = MatcherBuilder::from_query_tree(&query_tree); + let builder = 
MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = true; @@ -497,21 +537,14 @@ mod tests { #[test] fn highlight_unicode() { - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "wessfalia".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); + let matching_words = vec![ + (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]), + ]; - let builder = MatcherBuilder::from_query_tree(&query_tree); + let matching_words = MatchingWords::new(matching_words); + + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = true; @@ -539,14 +572,14 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Westfália"); + assert_eq!(&matcher.format(highlight, crop), "Westfália"); } #[test] fn format_crop() { - let query_tree = query_tree(); + let matching_words = matching_words(); - let builder = MatcherBuilder::from_query_tree(&query_tree); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = false; @@ -657,9 +690,9 @@ mod tests { #[test] fn format_highlight_crop() { - let query_tree = query_tree(); + let matching_words = matching_words(); - let builder = MatcherBuilder::from_query_tree(&query_tree); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = true; @@ -724,9 +757,9 @@ mod tests { #[test] fn smaller_crop_size() { //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let query_tree = query_tree(); + let matching_words = matching_words(); - let mut builder = MatcherBuilder::from_query_tree(&query_tree); + let mut builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = false; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 8804d9151..2b025f269 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -114,7 +114,7 @@ impl<'a> Search<'a> { pub fn execute(&self) -> Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); - let (query_tree, primitive_query) = match self.query.as_ref() { + let (query_tree, primitive_query, matching_words) = match self.query.as_ref() { Some(query) => { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); @@ -132,9 +132,11 @@ impl<'a> Search<'a> { let analyzer = Analyzer::new(config); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) + builder + .build(tokens)? 
+ .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw))) } - None => (None, None), + None => (None, None, None), }; debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); @@ -148,11 +150,6 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed()); - let matching_words = match query_tree.as_ref() { - Some(query_tree) => MatchingWords::from_query_tree(&query_tree), - None => MatchingWords::default(), - }; - // We check that we are allowed to use the sort criteria, we check // that they are declared in the sortable fields. if let Some(sort_criteria) = &self.sort_criteria { @@ -193,13 +190,13 @@ impl<'a> Search<'a> { )?; match self.index.distinct_field(self.rtxn)? { - None => self.perform_sort(NoopDistinct, matching_words, criteria), + None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; match field_ids_map.id(name) { Some(fid) => { let distinct = FacetDistinct::new(fid, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) + self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) } None => Ok(SearchResult::default()), } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4eccae8ce..a45034a3b 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -8,7 +8,8 @@ use meilisearch_tokenizer::TokenKind; use roaring::RoaringBitmap; use slice_group_by::GroupBy; -use crate::{Index, Result}; +use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; +use crate::{Index, MatchingWords, Result}; type IsOptionalWord = bool; type IsPrefix = bool; @@ -233,7 +234,10 @@ impl<'a> QueryTreeBuilder<'a> { /// - if `authorize_typos` is set to `false` the query tree will be generated /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) - pub fn build(&self, query: TokenStream) -> Result> { + pub fn build( + &self, + query: TokenStream, + ) -> Result> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { @@ -243,7 +247,9 @@ impl<'a> QueryTreeBuilder<'a> { self.authorize_typos, &primitive_query, )?; - Ok(Some((qt, primitive_query))) + let matching_words = + create_matching_words(self, self.authorize_typos, &primitive_query)?; + Ok(Some((qt, primitive_query, matching_words))) } else { Ok(None) } @@ -251,7 +257,7 @@ impl<'a> QueryTreeBuilder<'a> { } /// Split the word depending on the frequency of subwords in the database documents. -fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result> { +fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result> { let chars = word.char_indices().skip(1); let mut best = None; @@ -267,7 +273,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); - if let Some(child) = split_best_frequency(ctx, &word)? { - children.push(child); + if let Some((left, right)) = split_best_frequency(ctx, &word)? { + children.push(Operation::Phrase(vec![left, right])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let exact_words = ctx.exact_words()?; @@ -464,6 +470,154 @@ fn create_query_tree( } } +/// Main function that matchings words used for crop and highlight. 
+fn create_matching_words(
+    ctx: &impl Context,
+    authorize_typos: bool,
+    query: &[PrimitiveQueryPart],
+) -> Result<MatchingWords> {
+    /// Matches on the `PrimitiveQueryPart` and creates matching words from it.
+    fn resolve_primitive_part(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        part: PrimitiveQueryPart,
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        id: PrimitiveWordId,
+    ) -> Result<()> {
+        match part {
+            // 1. try to split word in 2
+            // 2. try to fetch synonyms
+            PrimitiveQueryPart::Word(word, prefix) => {
+                if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
+                    for synonym in synonyms {
+                        let synonym = synonym
+                            .into_iter()
+                            .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                            .collect();
+                        matching_words.push((synonym, vec![id]));
+                    }
+                }
+
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                    let left = MatchingWord::new(left, 0, false);
+                    let right = MatchingWord::new(right, 0, false);
+                    matching_words.push((vec![left, right], vec![id]));
+                }
+
+                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
+                let exact_words = ctx.exact_words()?;
+                let config =
+                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
+
+                let matching_word = match typos(word, authorize_typos, config) {
+                    QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
+                    QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
+                };
+                matching_words.push((vec![matching_word], vec![id]));
+            }
+            // create CONSECUTIVE matching words wrapping all words in the phrase
+            PrimitiveQueryPart::Phrase(words) => {
+                let ids: Vec<_> =
+                    (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
+                let words =
+                    words.into_iter().map(|w| MatchingWord::new(w.to_string(), 0, false)).collect();
+                matching_words.push((words, ids));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Create all ngrams 1..=3 generating matching words.
+    fn ngrams(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        query: &[PrimitiveQueryPart],
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        mut id: PrimitiveWordId,
+    ) -> Result<()> {
+        const MAX_NGRAM: usize = 3;
+
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
+            for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
+                if let Some(group) = sub_query.get(..ngram) {
+                    let tail = &sub_query[ngram..];
+                    let is_last = tail.is_empty();
+
+                    match group {
+                        [part] => {
+                            resolve_primitive_part(
+                                ctx,
+                                authorize_typos,
+                                part.clone(),
+                                matching_words,
+                                id,
+                            )?;
+                        }
+                        words => {
+                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
+                            let words: Vec<_> = words
+                                .iter()
+                                .filter_map(|part| {
+                                    if let PrimitiveQueryPart::Word(word, _) = part {
+                                        Some(word.as_str())
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .collect();
+                            let ids: Vec<_> = (0..words.len())
+                                .into_iter()
+                                .map(|i| id + i as PrimitiveWordId)
+                                .collect();
+
+                            if let Some(synonyms) = ctx.synonyms(&words)? {
+                                for synonym in synonyms {
+                                    let synonym = synonym
+                                        .into_iter()
+                                        .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                                        .collect();
+                                    matching_words.push((synonym, ids.clone()));
+                                }
+                            }
+                            let word = words.concat();
+                            let (word_len_one_typo, word_len_two_typo) =
+                                ctx.min_word_len_for_typo()?;
+                            let exact_words = ctx.exact_words()?;
+                            let config = TypoConfig {
+                                max_typos: 1,
+                                word_len_one_typo,
+                                word_len_two_typo,
+                                exact_words,
+                            };
+                            let matching_word = match typos(word, authorize_typos, config) {
+                                QueryKind::Exact { word, ..
} => { + MatchingWord::new(word, 0, is_prefix) + } + QueryKind::Tolerant { typo, word } => { + MatchingWord::new(word, typo, is_prefix) + } + }; + matching_words.push((vec![matching_word], ids)); + } + } + + if !is_last { + ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?; + } + } + } + id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::(); + } + + Ok(()) + } + + let mut matching_words = Vec::new(); + ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?; + Ok(MatchingWords::new(matching_words)) +} + pub type PrimitiveQuery = Vec; #[derive(Debug, Clone)] @@ -480,6 +634,13 @@ impl PrimitiveQueryPart { fn is_prefix(&self) -> bool { matches!(self, Self::Word(_, is_prefix) if *is_prefix) } + + fn len(&self) -> usize { + match self { + Self::Phrase(words) => words.len(), + Self::Word(_, _) => 1, + } + } } /// Create primitive query from tokenized query string, From fa7d3a37c0d86d8b3129071889e6bc3e4746a26d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 5 Apr 2022 17:35:52 +0200 Subject: [PATCH 1342/1889] Make some cleaning and add comments --- milli/src/search/matches/mod.rs | 180 +++++++++++++++++++++----------- 1 file changed, 117 insertions(+), 63 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index a99798a9b..993ee1f2b 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -4,6 +4,8 @@ pub use matching_words::MatchingWords; use matching_words::{MatchType, PrimitiveWordId}; use meilisearch_tokenizer::token::{SeparatorKind, Token}; +use crate::search::matches::matching_words::PartialMatch; + pub mod matching_words; const DEFAULT_CROP_SIZE: usize = 10; @@ -106,14 +108,80 @@ pub struct Matcher<'t, 'm> { } impl<'t> Matcher<'t, '_> { + /// Iterates over tokens and save any of them that matches the query. fn compute_matches(&mut self) -> &mut Self { + fn compute_partial_match( + mut partial: PartialMatch, + tokens: &[Token], + token_position: &mut usize, + word_position: &mut usize, + matches: &mut Vec, + ) -> bool { + let mut potential_matches = vec![(*token_position, *word_position, partial.char_len())]; + let mut t_position = 1; + let mut w_position = 1; + for token in &tokens[*token_position + 1..] { + if token.is_separator().is_none() { + partial = match partial.match_token(&token) { + // token matches the partial match, but the match is not full, + // we temporarly save the current token then we try to match the next one. + Some(MatchType::Partial(partial)) => { + potential_matches.push(( + *token_position + t_position, + *word_position + w_position, + partial.char_len(), + )); + partial + } + // partial match is now full, we keep this matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + // save previously matched tokens as matches. + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| Match { + match_len, + ids: ids.to_vec(), + word_position, + token_position, + }, + ); + matches.extend(iter); + + // move word and token positions after the end of the match. + *word_position += w_position; + *token_position += t_position; + + // save the token that closes the partial match as a match. + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position: *word_position, + token_position: *token_position, + }); + + // the match is complete, we return true. + return true; + } + // no match, continue to next match. 
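The buffering above is the heart of multi-word matching: candidate positions go into `potential_matches` and are only committed as real `Match` entries when a `Full` match closes the sequence; a `None` discards them all. A minimal standalone sketch of that commit-or-discard strategy, assuming a plain target phrase instead of `MatchingWords` and ignoring separators:

```rust
// Simplified model of the partial-match buffering: candidate token
// positions are held back and only committed once the whole phrase matches.
fn match_phrase(tokens: &[&str], start: usize, phrase: &[&str]) -> Option<Vec<usize>> {
    let mut buffered = Vec::new(); // committed only if the phrase completes
    for (offset, word) in phrase.iter().enumerate() {
        match tokens.get(start + offset) {
            Some(token) if token == word => buffered.push(start + offset),
            // any mismatch discards everything buffered so far
            _ => return None,
        }
    }
    Some(buffered)
}

fn main() {
    let tokens = ["the", "new", "york", "city"];
    assert_eq!(match_phrase(&tokens, 1, &["new", "york"]), Some(vec![1, 2]));
    assert_eq!(match_phrase(&tokens, 1, &["new", "jersey"]), None);
}
```

In the diff, each buffered entry additionally carries a `char_len` so the committed matches can highlight the right number of characters per token.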
+ None => break, + }; + w_position += 1; + } + t_position += 1; + } + + // the match is not complete, we return false. + false + } + let mut matches = Vec::new(); let mut word_position = 0; let mut token_position = 0; while let Some(token) = self.tokens.get(token_position) { if token.is_separator().is_none() { - 'matches: for match_type in self.matching_words.match_token(&token) { + for match_type in self.matching_words.match_token(&token) { match match_type { + // we match, we save the current token as a match, + // then we continue the rest of the tokens. MatchType::Full { char_len, ids } => { matches.push(Match { match_len: char_len, @@ -121,58 +189,20 @@ impl<'t> Matcher<'t, '_> { word_position, token_position, }); - // stop on the first match break; } - MatchType::Partial(mut partial) => { - let mut potential_matches = - vec![(token_position, word_position, partial.char_len())]; - let mut t_position = 1; - let mut w_position = 1; - 'partials: for token in &self.tokens[token_position + 1..] { - if token.is_separator().is_none() { - partial = match partial.match_token(&token) { - Some(MatchType::Partial(partial)) => { - potential_matches.push(( - token_position + t_position, - word_position + w_position, - partial.char_len(), - )); - partial - } - // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| { - Match { - match_len, - ids: ids.to_vec(), - word_position, - token_position, - } - }, - ); - - matches.extend(iter); - - word_position += w_position; - token_position += t_position; - - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); - - break 'matches; - } - // no match, continue to next match. - None => break 'partials, - }; - w_position += 1; - } - t_position += 1; + // we match partially, iterate over next tokens to check if we can complete the match. + MatchType::Partial(partial) => { + // if match is completed, we break the matching loop over the current token, + // then we continue the rest of the tokens. + if compute_partial_match( + partial, + &self.tokens, + &mut token_position, + &mut word_position, + &mut matches, + ) { + break; } } } @@ -186,6 +216,7 @@ impl<'t> Matcher<'t, '_> { self } + /// Returns boundaries of the words that match the query. pub fn matches(&mut self) -> Vec { match &self.matches { None => self.compute_matches().matches(), @@ -199,30 +230,37 @@ impl<'t> Matcher<'t, '_> { } } + /// Returns token position of the window to crop around. fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) { + // if there is no match, we start from the beginning of the string by default. let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - // TODO: buggy if no match and first token is a sepparator + // matches needs to be counted in the crop len. let mut remaining_words = self.crop_size + first_match_word_position - last_match_word_position; // if first token is a word, then remove 1 to remaining_words. 
        if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
             remaining_words -= 1;
         }
+
+        // we start from the matches' positions, then we expand the window on both sides.
         let mut first_token_position = first_match_token_position;
         let mut last_token_position = last_match_token_position;
-
         while remaining_words > 0 {
             match (
+                // try to expand left
                 first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
+                // try to expand right
                 last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
             ) {
+                // we can expand both sides.
                 (Some(ft), Some(lt)) => {
                     match (ft.is_separator(), lt.is_separator()) {
-                        // if they are both separators and are the same kind then advance both
+                        // if they are both separators and are the same kind then advance both,
+                        // or expand on the soft separator side.
                         (Some(f_kind), Some(s_kind)) => {
                             if f_kind == s_kind {
                                 first_token_position -= 1;
@@ -233,17 +271,18 @@ impl<'t> Matcher<'t, '_> {
                                 first_token_position -= 1;
                             }
                         }
-                        // left is a word, advance left
+                        // if one of the tokens is a word, we expand on the side of the word.
+                        // left is a word, advance left.
                         (None, Some(_)) => {
                             first_token_position -= 1;
                             remaining_words -= 1;
                         }
-                        // right is a word, advance right
+                        // right is a word, advance right.
                        (Some(_), None) => {
                             last_token_position += 1;
                             remaining_words -= 1;
                         }
-                        // both are words, advance left then right if remaining_word > 0
+                        // both are words, advance left then right if remaining_words > 0.
                         (None, None) => {
                             first_token_position -= 1;
                             remaining_words -= 1;
@@ -277,6 +316,10 @@ impl<'t> Matcher<'t, '_> {
         (first_token_position, last_token_position)
     }
 
+    /// Compute the score of a match interval:
+    /// 1) count unique matches
+    /// 2) calculate distance between matches
+    /// 3) count ordered matches
     fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
         let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
         let mut order_score = 0;
@@ -305,14 +348,20 @@ impl<'t> Matcher<'t, '_> {
         (uniq_score, distance_score, order_score)
     }
 
+    /// Returns the matches interval where the score computed by match_interval_score is maximal.
     fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
+        // we compute the matches interval if we have at least 2 matches.
        if matches.len() > 1 {
+            // positions of the first and the last match of the best matches interval in `matches`.
            let mut best_interval = (0, 0);
            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
+            // current interval positions.
             let mut interval_first = 0;
             let mut interval_last = 0;
             for (index, next_match) in matches.iter().enumerate().skip(1) {
-                // if next match would make interval gross more than crop_size
+                // if the next match would make the interval grow larger than crop_size,
+                // we compare the current interval with the best one,
+                // then we increase `interval_first` until the next match can be added.
                 if next_match.word_position - matches[interval_first].word_position
                     >= self.crop_size
                 {
@@ -325,7 +374,7 @@ impl<'t> Matcher<'t, '_> {
                         best_interval_score = interval_score;
                     }
 
-                    // advance start of the interval while interval is longer than crop_size
+                    // advance start of the interval while the interval is longer than crop_size.
                    while next_match.word_position - matches[interval_first].word_position
                        >= self.crop_size
                    {
@@ -335,6 +384,7 @@ impl<'t> Matcher<'t, '_> {
                 interval_last = index;
             }
 
+            // compute the last interval score and compare it to the best one.
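Stripped of its scoring details, the interval search here is a sliding-window maximization over match word positions. A rough sketch of that shape, assuming a simplified score (number of matches in the window) in place of the real `(uniq, distance, order)` triple, and sorted input positions:

```rust
// Sliding-window sketch over sorted match word positions: keep the window
// narrower than `crop_size` words and remember the best-scoring one.
fn best_interval(positions: &[usize], crop_size: usize) -> (usize, usize) {
    let (mut best, mut best_score) = ((0, 0), 0);
    let mut first = 0;
    for (last, &pos) in positions.iter().enumerate() {
        // advance the left edge while the window is wider than the budget.
        while pos - positions[first] >= crop_size {
            first += 1;
        }
        let score = last - first + 1;
        if score > best_score {
            best_score = score;
            best = (first, last);
        }
    }
    best
}

fn main() {
    // matches at word positions 0, 1, 2, 30, 31: with a 10-word budget the
    // densest window is the leading cluster.
    assert_eq!(best_interval(&[0, 1, 2, 30, 31], 10), (0, 2));
}
```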
let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); if interval_score > best_interval_score { @@ -347,6 +397,7 @@ impl<'t> Matcher<'t, '_> { } } + /// Returns the bounds in byte index of the crop window. fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { let match_interval = self.find_best_match_interval(matches); @@ -357,12 +408,13 @@ impl<'t> Matcher<'t, '_> { (byte_start, byte_end) } + // Returns the formatted version of the original text. pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { // If 0 it will be considered null and thus not crop the field // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 let crop = crop && self.crop_size > 0; if !highlight && !crop { - // compute matches is not needed if no highlight or crop is requested. + // compute matches is not needed if no highlight nor crop is requested. Cow::Borrowed(self.text) } else { match &self.matches { @@ -397,12 +449,14 @@ impl<'t> Matcher<'t, '_> { .char_indices() .enumerate() .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start) - .min(token.byte_end); + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); formatted.push(self.highlight_prefix); formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); - formatted.push(&self.text[highlight_byte_index..token.byte_end]); + // if it's a prefix highlight, we put the end of the word after the highlight marker. + if highlight_byte_index < token.byte_end { + formatted.push(&self.text[highlight_byte_index..token.byte_end]); + } byte_index = token.byte_end; } From b799f3326b982e382f8f1b7a809f1abe1521c008 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 18:44:35 +0200 Subject: [PATCH 1343/1889] rename merge_nothing to merge_ignore_values --- milli/src/update/index_documents/helpers/grenad_helpers.rs | 2 +- milli/src/update/index_documents/helpers/mod.rs | 6 +++--- milli/src/update/index_documents/typed_chunk.rs | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index fb5242910..9d5a67d78 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -279,6 +279,6 @@ pub fn sorter_into_lmdb_database( } /// Used when trying to merge readers, but you don't actually care about the values. 
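The rename that follows is cosmetic, but the contract of such a merge function is worth pinning down: when only keys matter, colliding values merge into an empty value. A hedged usage sketch, where the local `Result` alias and the `main` harness are stand-ins rather than milli code:

```rust
use std::borrow::Cow;

// Stand-in error type; the real function returns milli's `Result`.
type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

// Key-only merging: whatever values collide on a key, the merged value is
// empty, so a downstream reader effectively iterates over a set of keys.
fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    Ok(Cow::Owned(Vec::new()))
}

fn main() -> Result<()> {
    let merged = merge_ignore_values(b"word", &[Cow::Borrowed(b"a"), Cow::Borrowed(b"b")])?;
    assert!(merged.is_empty());
    Ok(())
}
```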
-pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { +pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { Ok(Cow::Owned(Vec::new())) } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 4642bcf14..79d0d0466 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing, - sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, - GrenadParameters, MergeableReader, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, + writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index be440114f..26b97c3a0 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -9,8 +9,8 @@ use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, - CursorClonableMmap, + self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, + valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; @@ -226,7 +226,7 @@ fn merge_word_docids_reader_into_fst( word_docids_iter: grenad::Reader>, exact_word_docids_iter: grenad::Reader>, ) -> Result>> { - let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn); + let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn); merger_builder.push(word_docids_iter.into_cursor()?); merger_builder.push(exact_word_docids_iter.into_cursor()?); let mut iter = merger_builder.build().into_stream_merger_iter()?; From 86249e2ae43e5a2e9bbdc747435fc6938ce2abc5 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 5 Apr 2022 21:35:06 +0200 Subject: [PATCH 1344/1889] add missing \t in cli update display MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- cli/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 202c67707..542b9d472 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -102,7 +102,7 @@ impl Settings { let exact_attributes = index.exact_attributes(&txn)?; println!( - "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n{}\n", + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n\t{}\n", displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), 
searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), filterable_attributes.join("\n\t"), From ee1d627803049fdd5ac94d2a76a3dc76d0a2bbb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 7 Apr 2022 15:56:10 +0200 Subject: [PATCH 1345/1889] Update version (v0.26.0) --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index f516a60ba..79ace436a 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.25.0" +version = "0.26.0" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index d1a244bbc..633ac2cc7 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.25.0" +version = "0.26.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index a0590ce8e..2a4fea85f 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.25.0" +version = "0.26.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index b703ad04d..3a61b6165 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.25.0" +version = "0.26.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 07c509438..ebbe54af9 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.25.0" +version = "0.26.0" authors = ["Kerollmops "] edition = "2018" From 4f3ce6d9cd07d3cb874c64c7b6a23219e3f3dcd3 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 23 Mar 2022 17:28:41 +0100 Subject: [PATCH 1346/1889] nested fields --- benchmarks/benches/indexing.rs | 45 +- benchmarks/benches/utils.rs | 3 +- cli/src/main.rs | 3 +- http-ui/src/main.rs | 2 +- milli/Cargo.toml | 1 + milli/src/documents/mod.rs | 18 + milli/src/error.rs | 16 +- milli/src/index.rs | 58 +- milli/src/lib.rs | 37 + milli/src/search/distinct/mod.rs | 3 +- milli/src/search/facet/facet_distribution.rs | 14 +- milli/src/search/facet/filter.rs | 4 +- milli/src/search/mod.rs | 2 +- milli/src/update/clear_documents.rs | 6 +- milli/src/update/delete_documents.rs | 12 +- .../extract/extract_geo_points.rs | 29 +- .../src/update/index_documents/extract/mod.rs | 58 +- milli/src/update/index_documents/mod.rs | 480 +++++++++++-- milli/src/update/index_documents/transform.rs | 672 ++++++++++++------ milli/src/update/settings.rs | 95 ++- milli/tests/search/mod.rs | 3 +- milli/tests/search/query_criteria.rs | 3 +- 22 files changed, 1197 insertions(+), 367 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index ee74f2a80..2d0604750 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -70,7 +70,8 @@ fn indexing_songs_default(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); 
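Every indexing call site in this commit changes the same way because `IndexDocuments::new` now builds the `Transform` eagerly and returns a `Result`, so each construction gains a `.unwrap()` (or `?`). A tiny sketch of that API shift on a hypothetical builder, unrelated to the milli types:

```rust
// Before: construction could not fail, so callers received the builder
// directly. After: construction validates up front and returns Result, so
// every call site must handle (or unwrap) the error, as the diffs here do.
struct Builder {
    capacity: usize,
}

impl Builder {
    fn new(capacity: usize) -> Result<Self, String> {
        if capacity == 0 {
            return Err("capacity must be non-zero".into());
        }
        Ok(Builder { capacity })
    }
}

fn main() {
    let builder = Builder::new(4096).unwrap(); // mirrors `IndexDocuments::new(...).unwrap()`
    assert_eq!(builder.capacity, 4096);
    assert!(Builder::new(0).is_err());
}
```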
builder.add_documents(documents).unwrap(); @@ -120,7 +121,8 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); builder.add_documents(documents).unwrap(); builder.execute().unwrap(); @@ -134,14 +136,16 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); builder.add_documents(documents).unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); builder.add_documents(documents).unwrap(); builder.execute().unwrap(); @@ -190,7 +194,8 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); @@ -236,7 +241,8 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); builder.add_documents(documents).unwrap(); @@ -281,7 +287,8 @@ fn indexing_wiki(c: &mut Criterion) { IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); builder.add_documents(documents).unwrap(); @@ -323,7 +330,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); builder.add_documents(documents).unwrap(); @@ -339,7 +347,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); let mut builder = - 
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); @@ -349,7 +358,8 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); @@ -400,7 +410,8 @@ fn indexing_movies_default(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); builder.add_documents(documents).unwrap(); @@ -447,7 +458,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); builder.add_documents(documents).unwrap(); @@ -462,7 +474,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); builder.add_documents(documents).unwrap(); @@ -470,7 +483,8 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); builder.add_documents(documents).unwrap(); @@ -525,7 +539,8 @@ fn indexing_geo(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); builder.add_documents(documents).unwrap(); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 383587ef8..b769bf2c7 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -96,7 +96,8 @@ pub fn base_setup(conf: &Conf) -> Index { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents_from(conf.dataset, 
conf.dataset_format); builder.add_documents(documents).unwrap(); diff --git a/cli/src/main.rs b/cli/src/main.rs index 542b9d472..3e9e8c75f 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -261,7 +261,8 @@ impl Performer for DocumentAddition { &config, indexing_config, |step| indexing_callback(step, &bars), - ); + ) + .unwrap(); addition.add_documents(reader)?; std::thread::spawn(move || { diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 26c1034eb..7a3ed8ebe 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -410,7 +410,7 @@ async fn main() -> anyhow::Result<()> { GLOBAL_CONFIG.get().unwrap(), indexing_config, indexing_callback, - ); + )?; let reader = match encoding.as_deref() { Some("gzip") => Box::new(GzDecoder::new(content)), diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 07c509438..e8723dc6a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -14,6 +14,7 @@ crossbeam-channel = "0.5.2" either = "1.6.1" fst = "0.4.7" fxhash = "0.2.1" +flatten-serde-json = "0.1.0" grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 8fd018328..09f15901d 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -49,6 +49,24 @@ impl DocumentsBatchIndex { pub fn name(&self, id: FieldId) -> Option<&String> { self.0.get_by_left(&id) } + + pub fn recreate_json( + &self, + document: &obkv::KvReaderU16, + ) -> Result, crate::Error> { + let mut map = serde_json::Map::new(); + + for (k, v) in document.iter() { + // TODO: TAMO: update the error type + let key = + self.0.get_by_left(&k).ok_or(crate::error::InternalError::DatabaseClosing)?.clone(); + let value = serde_json::from_slice::(v) + .map_err(crate::error::InternalError::SerdeJson)?; + map.insert(key, value); + } + + Ok(map) + } } #[derive(Debug, Serialize, Deserialize)] diff --git a/milli/src/error.rs b/milli/src/error.rs index 688977741..a2d5219c1 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -27,6 +27,7 @@ pub enum InternalError { DatabaseClosing, DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, FieldIdMapMissingEntry(FieldIdMapMissingEntry), + FieldIdMappingMissingEntry { key: FieldId }, Fst(fst::Error), GrenadInvalidCompressionType, GrenadInvalidFormatVersion, @@ -59,7 +60,7 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: BTreeSet }, - InvalidGeoField { document_id: Value, object: Value }, + InvalidGeoField { document_id: Value }, InvalidFilter(String), InvalidSortableAttribute { field: String, valid_fields: BTreeSet }, SortRankingRuleMissing, @@ -187,6 +188,9 @@ impl fmt::Display for InternalError { write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) } Self::FieldIdMapMissingEntry(error) => error.fmt(f), + Self::FieldIdMappingMissingEntry { key } => { + write!(f, "Missing {} in the field id mapping.", key) + } Self::Fst(error) => error.fmt(f), Self::GrenadInvalidCompressionType => { f.write_str("Invalid compression type have been specified to grenad.") @@ -226,19 +230,15 @@ impl fmt::Display for UserError { name_list ) } - Self::InvalidGeoField { document_id, object } => { + Self::InvalidGeoField { document_id } => { let document_id = match document_id { Value::String(id) => id.clone(), _ => 
document_id.to_string(), }; - let object = match object { - Value::String(id) => id.clone(), - _ => object.to_string(), - }; write!( f, - "The document with the id: `{}` contains an invalid _geo field: `{}`.", - document_id, object + "The document with the id: `{}` contains an invalid `_geo` field.", + document_id ) }, Self::InvalidDocumentId { document_id } => { diff --git a/milli/src/index.rs b/milli/src/index.rs index 42170bc80..3adfd2629 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -31,6 +31,7 @@ pub mod main_key { pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; + pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; @@ -567,12 +568,46 @@ impl Index { Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) } - /* faceted documents ids */ + /* faceted fields */ + + /// Writes the faceted fields in the database. + pub(crate) fn put_faceted_fields( + &self, + wtxn: &mut RwTxn, + fields: &HashSet, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::HIDDEN_FACETED_FIELDS_KEY, fields) + } /// Returns the faceted fields names. + pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { + Ok(self + .main + .get::<_, Str, SerdeJson<_>>(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? + .unwrap_or_default()) + } + + /// Identical to `faceted_fields`, but returns ids instead. + pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.faceted_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + + let mut fields_ids = HashSet::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.insert(field_id); + } + } + + Ok(fields_ids) + } + + /* faceted documents ids */ + + /// Returns the user defined faceted fields names. /// - /// Faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. - pub fn faceted_fields(&self, rtxn: &RoTxn) -> Result> { + /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. + pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn) -> Result> { let filterable_fields = self.filterable_fields(rtxn)?; let sortable_fields = self.sortable_fields(rtxn)?; let distinct_field = self.distinct_field(rtxn)?; @@ -592,8 +627,8 @@ impl Index { Ok(faceted_fields) } - /// Identical to `faceted_fields`, but returns ids instead. - pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result> { + /// Identical to `user_defined_faceted_fields`, but returns ids instead. 
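The getters above split faceted fields into two views: the user-declared set and a hidden, resolved set persisted under `HIDDEN_FACETED_FIELDS_KEY`, which can also contain nested fields such as `_geo.lat` that only exist after flattening. A rough sketch of resolving declared facets against concrete field names, reusing the dotted-path rule that `is_faceted_by` introduces later in this patch; `resolve_faceted` is a made-up helper:

```rust
use std::collections::HashSet;

// Dotted-path rule: does `field` live under `facet`? (Same idea as the
// `is_faceted_by` helper added to lib.rs in this patch.)
fn is_faceted_by(field: &str, facet: &str) -> bool {
    field.starts_with(facet)
        && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
}

// Hypothetical resolver: expand the user-declared facets into the concrete
// (possibly nested) field names actually present in the documents.
fn resolve_faceted(declared: &[&str], known_fields: &[&str]) -> HashSet<String> {
    known_fields
        .iter()
        .filter(|field| declared.iter().any(|facet| is_faceted_by(field, facet)))
        .map(|field| field.to_string())
        .collect()
}

fn main() {
    let resolved = resolve_faceted(&["_geo"], &["id", "_geo", "_geo.lat", "_geo.lng"]);
    assert!(resolved.contains("_geo.lat") && resolved.contains("_geo.lng"));
    assert!(!resolved.contains("id"));
}
```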
+ pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result> { let fields = self.faceted_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; @@ -1040,13 +1075,14 @@ pub(crate) mod tests { let content = documents!([ { "id": 1, "name": "kevin" }, { "id": 2, "name": "bob", "age": 20 }, - { "id": 2, "name": "bob", "age": 20 } + { "id": 2, "name": "bob", "age": 20 }, ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1067,11 +1103,12 @@ pub(crate) mod tests { // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); let content = documents!([ { "id": 1, "name": "kevin" }, { "id": 2, "name": "bob", "age": 20 }, - { "id": 2, "name": "bob", "age": 20 } + { "id": 2, "name": "bob", "age": 20 }, ]); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -1097,7 +1134,8 @@ pub(crate) mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ba2bd9b0f..ec28dbb1b 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -183,6 +183,43 @@ pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] { [x, y, z] } +/// Returns `true` if the field match one of the faceted fields. +/// See the function [`is_faceted_by`] below to see what “matching” means. +pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator>) -> bool { + faceted_fields.into_iter().find(|facet| is_faceted_by(field, facet.as_ref())).is_some() +} + +/// Returns `true` if the field match the facet. 
+/// ``` +/// use milli::is_faceted_by; +/// // -- the valid basics +/// assert!(is_faceted_by("animaux", "animaux")); +/// assert!(is_faceted_by("animaux.chien", "animaux")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur")); +/// +/// // -- the wrongs +/// assert!(!is_faceted_by("chien", "chat")); +/// assert!(!is_faceted_by("animaux", "animaux.chien")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.chat")); +/// +/// // -- the strange edge cases +/// assert!(!is_faceted_by("animaux.chien", "anima")); +/// assert!(!is_faceted_by("animaux.chien", "animau")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.c")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.ch")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.chi")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.chie")); +/// ``` +pub fn is_faceted_by(field: &str, facet: &str) -> bool { + field.starts_with(facet) + && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true) +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 965423886..237fd718a 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -97,7 +97,8 @@ mod test { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()); + let mut addition = + IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 91bf21cf7..2208ee636 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -220,9 +220,13 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let filterable_fields = self.index.filterable_fields(self.rtxn)?; + let fields = match self.facets { Some(ref facets) => { - let invalid_fields: HashSet<_> = facets.difference(&filterable_fields).collect(); + let invalid_fields: HashSet<_> = facets + .iter() + .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) + .collect(); if !invalid_fields.is_empty() { return Err(UserError::InvalidFacetsDistribution { invalid_facets_name: invalid_fields.into_iter().cloned().collect(), @@ -236,10 +240,12 @@ impl<'a> FacetDistribution<'a> { }; let mut distribution = BTreeMap::new(); - for name in fields { - if let Some(fid) = fields_ids_map.id(&name) { + for (fid, name) in fields_ids_map.iter() { + if crate::is_faceted(name, &fields) { let values = self.facet_values(fid)?; - distribution.insert(name, values); + if !values.is_empty() { + distribution.insert(name.to_string(), values); + } } } diff --git 
a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 9388cfa33..8f1ee749f 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -353,7 +353,8 @@ impl<'a> Filter<'a> { match &self.condition { FilterCondition::Condition { fid, op } => { let filterable_fields = index.filterable_fields(rtxn)?; - if filterable_fields.contains(fid.value()) { + + if crate::is_faceted(fid.value(), &filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) @@ -549,7 +550,6 @@ mod tests { Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)") .unwrap() .unwrap(); - println!("\nExpecting: {:#?}\nGot: {:#?}\n", expected, condition); assert_eq!(condition, expected); } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 0d33d9042..b01bae817 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -159,7 +159,7 @@ impl<'a> Search<'a> { let sortable_fields = self.index.sortable_fields(self.rtxn)?; for asc_desc in sort_criteria { match asc_desc.member() { - Member::Field(ref field) if !sortable_fields.contains(field) => { + Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => { return Err(UserError::InvalidSortableAttribute { field: field.to_string(), valid_fields: sortable_fields.into_iter().collect(), diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 3665d2313..f93ba60fa 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -98,7 +98,8 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); let config = IndexerConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -110,7 +111,8 @@ mod tests { let rtxn = index.read_txn().unwrap(); - assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 5); + // the value is 7 because there is `[id, name, age, country, _geo, _geo.lng, _geo.lat]` + assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7); assert!(index.words_fst(&rtxn).unwrap().is_empty()); assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 77c32f0fb..97250d988 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -647,7 +647,8 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -681,7 +682,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -733,7 +735,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = 
IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -790,7 +793,8 @@ mod tests { let indexing_config = IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index e58d351d6..65cb1c3ce 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -2,7 +2,6 @@ use std::fs::File; use std::io; use concat_arrays::concat_arrays; -use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::{FieldId, InternalError, Result, UserError}; @@ -14,7 +13,7 @@ pub fn extract_geo_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, primary_key_id: FieldId, - geo_field_id: FieldId, + (lat_fid, lng_fid): (FieldId, FieldId), ) -> Result> { let mut writer = create_writer( indexer.chunk_compression_type, @@ -25,22 +24,18 @@ pub fn extract_geo_points( let mut cursor = obkv_documents.into_cursor()?; while let Some((docid_bytes, value)) = cursor.move_on_next()? { let obkv = obkv::KvReader::new(value); - let point: Value = match obkv.get(geo_field_id) { - Some(point) => serde_json::from_slice(point).map_err(InternalError::SerdeJson)?, - None => continue, - }; - - if let Some((lat, lng)) = point["lat"].as_f64().zip(point["lng"].as_f64()) { - // this will create an array of 16 bytes (two 8 bytes floats) - let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; - writer.insert(docid_bytes, bytes)?; - } else { - // All document must have a primary key so we can unwrap safely here + let (lat, lng) = obkv.get(lat_fid).zip(obkv.get(lng_fid)).ok_or_else(|| { let primary_key = obkv.get(primary_key_id).unwrap(); - let primary_key = - serde_json::from_slice(primary_key).map_err(InternalError::SerdeJson)?; - Err(UserError::InvalidGeoField { document_id: primary_key, object: point })? - } + let primary_key = serde_json::from_slice(primary_key).unwrap(); + UserError::InvalidGeoField { document_id: primary_key } + })?; + let (lat, lng): (f64, f64) = ( + serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, + serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, + ); + + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + writer.insert(docid_bytes, bytes)?; } Ok(writer_into_reader(writer)?) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 8f6797a3b..c3c2033a6 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -34,28 +34,36 @@ use crate::{FieldId, Result}; /// Extract data for each databases from obkv documents in parallel. /// Send data in grenad file over provided Sender. 
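The new `extract_geo_points` above reads `_geo.lat` and `_geo.lng` as two flattened fields and packs them into a fixed 16-byte value (two native-endian `f64`, via `concat_arrays!`). A dependency-free sketch of that byte layout and its round trip; the helper names are invented, only the layout mirrors the diff:

```rust
use std::convert::TryInto;

// Two f64 coordinates packed into the fixed 16-byte layout used above:
// native-endian lat followed by native-endian lng.
fn pack_geo(lat: f64, lng: f64) -> [u8; 16] {
    let mut bytes = [0u8; 16];
    bytes[..8].copy_from_slice(&lat.to_ne_bytes());
    bytes[8..].copy_from_slice(&lng.to_ne_bytes());
    bytes
}

fn unpack_geo(bytes: &[u8; 16]) -> (f64, f64) {
    let lat = f64::from_ne_bytes(bytes[..8].try_into().unwrap());
    let lng = f64::from_ne_bytes(bytes[8..].try_into().unwrap());
    (lat, lng)
}

fn main() {
    let packed = pack_geo(48.8566, 2.3522);
    assert_eq!(unpack_geo(&packed), (48.8566, 2.3522));
}
```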
pub(crate) fn data_from_obkv_documents( - obkv_chunks: impl Iterator>> + Send, + original_obkv_chunks: impl Iterator>> + Send, + flattened_obkv_chunks: impl Iterator>> + Send, indexer: GrenadParameters, lmdb_writer_sx: Sender>, searchable_fields: Option>, faceted_fields: HashSet, primary_key_id: FieldId, - geo_field_id: Option, + geo_fields_ids: Option<(FieldId, FieldId)>, stop_words: Option>, max_positions_per_attributes: Option, exact_attributes: HashSet, ) -> Result<()> { - let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks + original_obkv_chunks .par_bridge() - .map(|result| { - extract_documents_data( - result, + .map(|original_documents_chunk| { + send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone()) + }) + .collect::>()?; + + let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks + .par_bridge() + .map(|flattened_obkv_chunks| { + send_and_extract_flattened_documents_data( + flattened_obkv_chunks, indexer, lmdb_writer_sx.clone(), &searchable_fields, &faceted_fields, primary_key_id, - geo_field_id, + geo_fields_ids, &stop_words, max_positions_per_attributes, ) @@ -170,36 +178,48 @@ fn spawn_extraction_task( }); } -/// Extract chuncked data and send it into lmdb_writer_sx sender: +/// Extract chunked data and send it into lmdb_writer_sx sender: /// - documents +fn send_original_documents_data( + original_documents_chunk: Result>, + lmdb_writer_sx: Sender>, +) -> Result<()> { + let original_documents_chunk = + original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; + + // TODO: create a custom internal error + lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); + Ok(()) +} + +/// Extract chunked data and send it into lmdb_writer_sx sender: /// - documents_ids /// - docid_word_positions /// - docid_fid_facet_numbers /// - docid_fid_facet_strings -fn extract_documents_data( - documents_chunk: Result>, +fn send_and_extract_flattened_documents_data( + flattened_documents_chunk: Result>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, searchable_fields: &Option>, faceted_fields: &HashSet, primary_key_id: FieldId, - geo_field_id: Option, + geo_fields_ids: Option<(FieldId, FieldId)>, stop_words: &Option>, max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, (grenad::Reader, grenad::Reader), )> { - let documents_chunk = documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; + let flattened_documents_chunk = + flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; - let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(documents_chunk.clone()))); - - if let Some(geo_field_id) = geo_field_id { - let documents_chunk_cloned = documents_chunk.clone(); + if let Some(geo_fields_ids) = geo_fields_ids { + let documents_chunk_cloned = flattened_documents_chunk.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); rayon::spawn(move || { let result = - extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_field_id); + extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids); let _ = match result { Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), Err(error) => lmdb_writer_sx_cloned.send(Err(error)), @@ -211,7 +231,7 @@ fn extract_documents_data( rayon::join( || { let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( - documents_chunk.clone(), + flattened_documents_chunk.clone(), indexer.clone(), searchable_fields, stop_words.as_ref(), @@ -232,7 
+252,7 @@ fn extract_documents_data( || { let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = extract_fid_docid_facet_values( - documents_chunk.clone(), + flattened_documents_chunk.clone(), indexer.clone(), faceted_fields, )?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 0e6e59e10..eb50a85ed 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -30,7 +30,7 @@ use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; -use crate::{Index, Result, RoaringBitmapCodec}; +use crate::{Index, Result, RoaringBitmapCodec, UserError}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -94,15 +94,16 @@ where indexer_config: &'a IndexerConfig, config: IndexDocumentsConfig, progress: F, - ) -> IndexDocuments<'t, 'u, 'i, 'a, F> { + ) -> Result> { let transform = Some(Transform::new( + wtxn, &index, indexer_config, config.update_method, config.autogenerate_docids, - )); + )?); - IndexDocuments { + Ok(IndexDocuments { transform, config, indexer_config, @@ -110,7 +111,7 @@ where wtxn, index, added_documents: 0, - } + }) } /// Adds a batch of documents to the current builder. @@ -151,6 +152,10 @@ where .take() .expect("Invalid document addition state") .output_from_sorter(self.wtxn, &self.progress)?; + + let new_facets = output.compute_real_facets(self.wtxn, self.index)?; + self.index.put_faceted_fields(self.wtxn, &new_facets)?; + let indexed_documents = output.documents_count as u64; let number_of_documents = self.execute_raw(output)?; @@ -171,7 +176,8 @@ where new_documents_ids, replaced_documents_ids, documents_count, - documents_file, + original_documents, + flattened_documents, } = output; // The fields_ids_map is put back to the store now so the rest of the transaction sees an @@ -197,7 +203,8 @@ where } }; - let documents_file = grenad::Reader::new(documents_file)?; + let original_documents = grenad::Reader::new(original_documents)?; + let flattened_documents = grenad::Reader::new(flattened_documents)?; // create LMDB writer channel let (lmdb_writer_sx, lmdb_writer_rx): ( @@ -213,13 +220,20 @@ where self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); // get filterable fields for facet databases let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; - // get the fid of the `_geo` field. - let geo_field_id = match self.index.fields_ids_map(self.wtxn)?.id("_geo") { + // get the fid of the `_geo.lat` and `_geo.lng` fields. + let geo_fields_ids = match self.index.fields_ids_map(self.wtxn)?.id("_geo") { Some(gfid) => { let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid); let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid); + // if `_geo` is faceted then we get the `lat` and `lng` if is_sortable || is_filterable { - Some(gfid) + let field_ids = self + .index + .fields_ids_map(self.wtxn)? + .insert("_geo.lat") + .zip(self.index.fields_ids_map(self.wtxn)?.insert("_geo.lng")) + .ok_or(UserError::AttributeLimitReached)?; + Some(field_ids) } else { None } @@ -239,28 +253,38 @@ where max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. 
}; - // split obkv file into several chuncks - let chunk_iter = grenad_obkv_into_chunks( - documents_file, + // split obkv file into several chunks + let original_chunk_iter = grenad_obkv_into_chunks( + original_documents, params.clone(), self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB ); - let result = chunk_iter.map(|chunk_iter| { - // extract all databases from the chunked obkv douments - extract::data_from_obkv_documents( - chunk_iter, - params, - lmdb_writer_sx.clone(), - searchable_fields, - faceted_fields, - primary_key_id, - geo_field_id, - stop_words, - self.indexer_config.max_positions_per_attributes, - exact_attributes, - ) - }); + // split obkv file into several chunks + let flattened_chunk_iter = grenad_obkv_into_chunks( + flattened_documents, + params.clone(), + self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB + ); + + let result = original_chunk_iter + .and_then(|original_chunk_iter| Ok((original_chunk_iter, flattened_chunk_iter?))) + .map(|(original_chunk, flattened_chunk)| { + // extract all databases from the chunked obkv documents + extract::data_from_obkv_documents( + original_chunk, + flattened_chunk, + params, + lmdb_writer_sx.clone(), + searchable_fields, + faceted_fields, + primary_key_id, + geo_fields_ids, + stop_words, + self.indexer_config.max_positions_per_attributes, + exact_attributes, + ) + }); if let Err(e) = result { let _ = lmdb_writer_sx.send(Err(e)); @@ -550,6 +574,7 @@ mod tests { use big_s::S; use heed::EnvOpenOptions; + use maplit::hashset; use super::*; use crate::documents::DocumentBatchBuilder; @@ -574,7 +599,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -589,7 +615,8 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "name": "updated kevin" } ]); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -607,7 +634,8 @@ mod tests { { "id": 2, "name": "updated kevina" }, { "id": 3, "name": "updated benoit" } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); wtxn.commit().unwrap(); @@ -639,7 +667,8 @@ mod tests { ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -665,7 +694,8 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one.
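The merge mentioned in the comment above is field-wise: with an update-style addition, the fields of the new version are laid over the stored ones. A minimal sketch of that behaviour, assuming plain JSON objects rather than the internal obkv representation:

    use serde_json::json;

    // hypothetical stored document and its update, sharing the same "id"
    let stored = json!({ "id": 1, "name": "kevin" });
    let update = json!({ "id": 1, "age": 25 });
    let mut merged = stored.as_object().unwrap().clone();
    merged.extend(update.as_object().unwrap().clone()); // fields of the update win
    assert_eq!(
        serde_json::Value::Object(merged),
        json!({ "id": 1, "name": "kevin", "age": 25 })
    );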
let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "age": 25 } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -706,7 +736,8 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); assert!(builder.add_documents(content).is_err()); wtxn.commit().unwrap(); @@ -735,7 +766,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -753,7 +785,8 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -793,7 +826,8 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -809,7 +843,8 @@ mod tests { let content = documents!([ { "name": "new kevin" } ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -833,7 +868,8 @@ mod tests { let content = documents!([]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -859,7 +895,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); assert!(builder.add_documents(content).is_err()); wtxn.commit().unwrap(); @@ -867,7 +904,8 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); 
// There is a space in the document id. let content = documents!([ { "id": 32, "name": "kevin" } ]); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -895,7 +933,8 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -912,7 +951,7 @@ mod tests { assert_eq!(result.documents_ids, vec![1]); // Search for a sub array sub object key - let result = index.search(&rtxn).query(r#""wow""#).execute().unwrap(); + let result = index.search(&rtxn).query(r#""amazing""#).execute().unwrap(); assert_eq!(result.documents_ids, vec![2]); drop(rtxn); @@ -940,7 +979,8 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); builder.add_documents(documents).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -950,7 +990,8 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents!([ { "id": 2, @@ -981,7 +1022,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -1000,7 +1042,8 @@ mod tests { ]); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); @@ -1011,7 +1054,8 @@ mod tests { ]); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -1046,7 +1090,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -1080,7 +1125,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, 
&config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -1137,13 +1183,333 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } + #[test] + fn index_documents_with_nested_fields() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { + "id": 0, + "title": "The zeroth document", + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule", + }, + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field", + }, + { + "prout": "truc", + "machin": "lol", + }, + ], + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied", + }, + ]); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; + builder.set_searchable_fields(searchable_fields); + + let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin"))); + + // testing the simple query search + let mut search = crate::Search::new(&rtxn, &index); + search.query("document"); + search.authorize_typos(true); + search.optional_words(true); + // all documents should be returned + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids.len(), 4); + + search.query("zeroth"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + search.query("first"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + search.query("second"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + search.query("third"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![3]); + + search.query("field"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1, 2]); + + search.query("lol"); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + + search.query("object"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert!(documents_ids.is_empty()); + + search.query("array"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert!(documents_ids.is_empty()); // nested is not searchable + + search.query("lied"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert!(documents_ids.is_empty()); // nested is not searchable + + // testing the filters + let mut search = crate::Search::new(&rtxn, &index); + search.filter(crate::Filter::from_str(r#"title = "The first document""#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + + search.filter(crate::Filter::from_str(r#"nested.object = field"#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1, 2]); + + search.filter(crate::Filter::from_str(r#"nested.machin = bidule"#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + + search.filter(crate::Filter::from_str(r#"nested = array"#).unwrap().unwrap()); + let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable + assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_)))); + + search.filter(crate::Filter::from_str(r#"nested = "I lied""#).unwrap().unwrap()); + let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable + assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_)))); + } + + #[test] + fn index_documents_with_nested_primary_key() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key("nested.id".to_owned()); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { + "nested": { + "id": 0, + }, + "title": "The zeroth document", + }, + { + "nested": { + "id": 1, + }, + "title": "The first document", + }, + { + "nested": { + "id": 2, + }, + "title": "The second document", + }, + { + "nested.id": 3, + "title": "The third document", + }, + ]); + + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // testing the simple query search + let mut search = crate::Search::new(&rtxn, &index); + search.query("document"); + search.authorize_typos(true); + search.optional_words(true); + // all documents should be returned + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids.len(), 4); + + search.query("zeroth"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + search.query("first"); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + search.query("second"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + search.query("third"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![3]); + } + + #[test] + fn test_facets_generation() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { + "id": 0, + "dog": { + "race": { + "bernese mountain": "zeroth", + }, + }, + }, + { + "id": 1, + "dog.race": { + "bernese mountain": "first", + }, + }, + { + "id": 2, + "dog.race.bernese mountain": "second", + }, + { + "id": 3, + "dog": { + "race.bernese mountain": "third" + }, + }, + ]); + + // index the documents + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + // ---- ADD THE SETTING TO TEST THE FILTERABLE + + // add the settings + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + builder.set_filterable_fields(hashset!(String::from("dog"))); + + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let hidden = index.faceted_fields(&rtxn).unwrap(); + + assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain"))); + + for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { + let mut search = crate::Search::new(&rtxn, &index); + let filter = format!(r#""dog.race.bernese mountain" = {s}"#); + search.filter(crate::Filter::from_str(&filter).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![i]); + } + + // ---- RESET THE SETTINGS + + // update the settings + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + builder.reset_filterable_fields(); + + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let facets = index.faceted_fields(&rtxn).unwrap(); + + assert_eq!(facets, hashset!()); + + // ---- UPDATE THE SETTINGS TO TEST THE SORTABLE + + // update the settings + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + builder.set_sortable_fields(hashset!(S("dog.race"))); + + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let facets = index.faceted_fields(&rtxn).unwrap(); + + assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain"))); + + let mut search = crate::Search::new(&rtxn, &index); + search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S( + "dog.race.bernese mountain", + )))]); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1, 2, 3, 0]); + } + #[test] fn index_2_times_documents_split_by_zero_document_indexation() { let path = tempfile::tempdir().unwrap(); @@ -1162,7 +1528,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1178,7 +1545,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1199,7 +1567,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1226,7 +1595,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 4ec34c0c6..4413e00ca 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,24 +1,27 @@ use std::borrow::Cow; -use std::collections::btree_map::Entry; -use std::collections::HashMap; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; -use std::time::Instant; +use byteorder::ReadBytesExt; +use fxhash::FxHashMap; +use heed::RoTxn; use itertools::Itertools; -use log::info; +use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use super::helpers::{ - create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn, -}; +use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32}; +use crate::{ + ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, + Result, BEU32, +}; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; @@ -30,7 +33,8 @@ pub struct TransformOutput { pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, pub documents_count: usize, - pub documents_file: File, + pub 
original_documents: File, + pub flattened_documents: File, } /// Extract the external ids, deduplicate and compute the new internal documents ids /// containing all those documents. pub struct Transform<'a, 'i> { pub index: &'i Index, + fields_ids_map: FieldsIdsMap, + indexer_settings: &'a IndexerConfig, pub autogenerate_docids: bool, pub index_documents_method: IndexDocumentsMethod, - sorter: grenad::Sorter<MergeFn>, + original_sorter: grenad::Sorter<MergeFn>, + flattened_sorter: grenad::Sorter<MergeFn>, + replaced_documents_ids: RoaringBitmap, + new_documents_ids: RoaringBitmap, + new_external_documents_ids_builder: FxHashMap<Vec<u8>, u64>, documents_count: usize, } @@ -72,6 +82,9 @@ fn create_fields_mapping( .collect() } +/// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields. +/// It doesn't look in the subfields because we don't want to enable the +/// primary key inference on nested objects. fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { index .iter() @@ -83,11 +96,12 @@ fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { impl<'a, 'i> Transform<'a, 'i> { pub fn new( + wtxn: &mut heed::RwTxn, index: &'i Index, indexer_settings: &'a IndexerConfig, index_documents_method: IndexDocumentsMethod, autogenerate_docids: bool, - ) -> Self { + ) -> Result<Self> { // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. let merge_function = match index_documents_method { @@ -96,22 +110,36 @@ impl<'a, 'i> Transform<'a, 'i> { }; // We initialize the sorter with the user indexing settings. - let sorter = create_sorter( + let original_sorter = create_sorter( merge_function, indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_level, indexer_settings.max_nb_chunks, - indexer_settings.max_memory, + indexer_settings.max_memory.map(|mem| mem / 2), ); - Transform { + // We initialize the flattened sorter with the same user indexing settings.
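Both document versions (original and flattened) are now accumulated in their own sorter, so the indexing memory budget is split between them; that is why `max_memory` is halved above and again for the flattened sorter below. A minimal sketch of the split, assuming a plain `Option<usize>` budget:

    let max_memory: Option<usize> = Some(512 * 1024 * 1024); // hypothetical 512 MiB budget
    let per_sorter = max_memory.map(|mem| mem / 2);
    assert_eq!(per_sorter, Some(256 * 1024 * 1024)); // each of the two sorters gets half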
+ let flattened_sorter = create_sorter( + merge_function, + indexer_settings.chunk_compression_type, + indexer_settings.chunk_compression_level, + indexer_settings.max_nb_chunks, + indexer_settings.max_memory.map(|mem| mem / 2), + ); + + Ok(Transform { index, + fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, autogenerate_docids, - sorter, - documents_count: 0, + original_sorter, + flattened_sorter, index_documents_method, - } + replaced_documents_ids: RoaringBitmap::new(), + new_documents_ids: RoaringBitmap::new(), + new_external_documents_ids_builder: FxHashMap::default(), + documents_count: 0, + }) } pub fn read_documents( @@ -125,8 +153,11 @@ impl<'a, 'i> Transform<'a, 'i> { F: Fn(UpdateIndexingStep) + Sync, { let fields_index = reader.index(); - let mut fields_ids_map = self.index.fields_ids_map(wtxn)?; - let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?; + let external_documents_ids = self.index.external_documents_ids(wtxn)?; + let documents_ids = self.index.documents_ids(wtxn)?; + let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); + + let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let alternative_name = self .index @@ -136,15 +167,19 @@ impl<'a, 'i> Transform<'a, 'i> { let (primary_key_id, primary_key_name) = compute_primary_key_pair( self.index.primary_key(wtxn)?, - &mut fields_ids_map, + &mut self.fields_ids_map, alternative_name, self.autogenerate_docids, )?; + let primary_key_id_nested = primary_key_name.contains('.'); + + let mut flattened_document = None; let mut obkv_buffer = Vec::new(); + let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; let mut external_id_buffer = Vec::new(); - let mut field_buffer: Vec<(u16, &[u8])> = Vec::new(); + let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some((addition_index, document)) = reader.next_document_with_index()? { let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { @@ -154,8 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> { } for (k, v) in document.iter() { - let mapped_id = *mapping.get(&k).unwrap(); - field_buffer_cache.push((mapped_id, v)); + let mapped_id = + *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; + field_buffer_cache.push((mapped_id, Cow::from(v))); } // We need to make sure that every document has a primary key. After we have remapped @@ -164,87 +200,125 @@ impl<'a, 'i> Transform<'a, 'i> { // document. If none is found, and we were told to generate missing document ids, then // we create the missing field, and update the new document. 
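When the primary key is nested, which `primary_key_id_nested` above detects through a dot in its name, the document must be flattened before the key can be looked up. A sketch of why the two shapes used in the nested-primary-key test further down resolve to the same key, assuming `flatten_serde_json` joins nested keys with a dot:

    use serde_json::json;

    let nested = json!({ "nested": { "id": 0 } }).as_object().unwrap().clone();
    let dotted = json!({ "nested.id": 3 }).as_object().unwrap().clone();
    // both forms expose a top-level "nested.id" key once flattened
    assert!(flatten_serde_json::flatten(&nested).contains_key("nested.id"));
    assert!(flatten_serde_json::flatten(&dotted).contains_key("nested.id"));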
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; - let external_id = - match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { - Some((_, bytes)) => { - let value = match serde_json::from_slice(bytes).unwrap() { - Value::String(string) => match validate_document_id(&string) { - Some(s) if s.len() == string.len() => string, - Some(s) => s.to_string(), - None => { - return Err(UserError::InvalidDocumentId { - document_id: Value::String(string), - } - .into()) - } - }, - Value::Number(number) => number.to_string(), - content => { - return Err(UserError::InvalidDocumentId { - document_id: content.clone(), - } - .into()) - } - }; - serde_json::to_writer(&mut external_id_buffer, &value).unwrap(); - Cow::Owned(value) - } - None => { - if !self.autogenerate_docids { - let mut json = Map::new(); - for (key, value) in document.iter() { - let key = addition_index.name(key).cloned(); - let value = serde_json::from_slice::(&value).ok(); + let external_id = if primary_key_id_nested { + let mut field_buffer_cache = field_buffer_cache.clone(); + self.flatten_from_field_mapping( + &mapping, + &document, + &mut flattened_obkv_buffer, + &mut field_buffer_cache, + )?; + flattened_document = Some(&flattened_obkv_buffer); + let document = KvReader::new(&flattened_obkv_buffer); - if let Some((k, v)) = key.zip(value) { - json.insert(k, v); - } - } - - return Err(UserError::MissingDocumentId { - primary_key: primary_key_name, - document: json, - } - .into()); - } - - let uuid = - uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer); - serde_json::to_writer(&mut external_id_buffer, &uuid).unwrap(); - field_buffer_cache.push((primary_key_id, &external_id_buffer)); - Cow::Borrowed(&*uuid) - } - }; + update_primary_key( + document, + &addition_index, + primary_key_id, + &primary_key_name, + &mut uuid_buffer, + &mut field_buffer_cache, + &mut external_id_buffer, + self.autogenerate_docids, + )? + } else { + update_primary_key( + document, + &addition_index, + primary_key_id, + &primary_key_name, + &mut uuid_buffer, + &mut field_buffer_cache, + &mut external_id_buffer, + self.autogenerate_docids, + )? + }; // Insertion in a obkv need to be done with keys ordered. For now they are ordered // according to the document addition key order, so we sort it according to the // fieldids map keys order. field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2)); - // The last step is to build the new obkv document, and insert it in the sorter. + // Build the new obkv document. let mut writer = obkv::KvWriter::new(&mut obkv_buffer); for (k, v) in field_buffer_cache.iter() { writer.insert(*k, v)?; } + let (docid, should_insert_original_document) = + match external_documents_ids.get(&*external_id) { + // if the document is in the db but has already been inserted + // (ie: already exists in the list of replaced documents ids), + // we should not add the original document a second time. 
+ Some(docid) => (docid, !self.replaced_documents_ids.contains(docid)), + None => { + // if the document has already been inserted in this + // batch we need to get its docid + match self + .new_external_documents_ids_builder + .entry(external_id.as_bytes().to_vec()) + { + Entry::Occupied(entry) => (*entry.get() as u32, false), + // if the document has never been encountered we give it a new docid + // and push this new docid to the external documents ids builder + Entry::Vacant(entry) => { + let new_docid = available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?; + entry.insert(new_docid as u64); + (new_docid, false) + } + } + } + }; + + if should_insert_original_document { + self.replaced_documents_ids.insert(docid); + + let key = BEU32::new(docid); + let base_obkv = self + .index + .documents + .remap_data_type::() + .get(wtxn, &key)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + })?; + + self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; + let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?; + + self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; + } else { + self.new_documents_ids.insert(docid); + } + // We use the extracted/generated user id as the key for this document. - self.sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?; + self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; documents_count += 1; + if let Some(flatten) = flattened_document { + self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; + } else { + let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?; + self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; + } + progress_callback(UpdateIndexingStep::RemapDocumentAddition { documents_seen: documents_count, }); - obkv_buffer.clear(); field_buffer = drop_and_reuse(field_buffer_cache); external_id_buffer.clear(); + obkv_buffer.clear(); } progress_callback(UpdateIndexingStep::RemapDocumentAddition { documents_seen: documents_count, }); - self.index.put_fields_ids_map(wtxn, &fields_ids_map)?; + self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; self.index.put_primary_key(wtxn, &primary_key_name)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we @@ -252,6 +326,87 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(documents_count) } + // Flatten a document from the fields ids map contained in self and insert the new + // created fields. + fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result> { + let mut doc = serde_json::Map::new(); + + for (k, v) in obkv.iter() { + let key = self.fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: k, + process: "Flatten from fields ids map.", + })?; + let value = serde_json::from_slice::(v) + .map_err(crate::error::InternalError::SerdeJson)?; + doc.insert(key.to_string(), value); + } + + let flattened = flatten_serde_json::flatten(&doc); + + // Once we have the flattened version we can convert it back to obkv and + // insert all the new generated fields_ids (if any) in the fields ids map. 
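The sort just below keeps fields that already have an id ahead of newly generated ones by pushing unknown keys to the end with `FieldId::MAX`. A minimal sketch of that ordering, assuming a lookup that returns `Option<u16>`:

    // hypothetical fields ids map in which only "title" is already known
    let id_of = |key: &str| -> Option<u16> { (key == "title").then(|| 0u16) };
    let mut flattened = vec![("nested.object", "field"), ("title", "The first document")];
    flattened.sort_unstable_by_key(|&(key, _)| id_of(key).unwrap_or(u16::MAX));
    assert_eq!(flattened[0].0, "title"); // known fields first, new fields after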
+ let mut buffer: Vec<u8> = Vec::new(); + let mut writer = KvWriter::new(&mut buffer); + let mut flattened: Vec<_> = flattened.into_iter().collect(); + // we reorder the fields to get all the known fields first + flattened + .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX)); + + for (key, value) in flattened { + let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; + let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; + writer.insert(fid, &value)?; + } + + Ok(buffer) + } + + // Flatten a document from a field mapping generated by [create_fields_mapping] + fn flatten_from_field_mapping( + &mut self, + mapping: &HashMap<FieldId, FieldId>, + obkv: &KvReader<FieldId>, + output_buffer: &mut Vec<u8>, + field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>, + ) -> Result<()> { + // if the primary_key is nested we need to flatten the document before being able to do anything + let mut doc = serde_json::Map::new(); + + for (k, v) in obkv.iter() { + let key = + mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; + let key = self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: *key, + process: "Flatten from field mapping.", + })?; + let value = + serde_json::from_slice::<Value>(v).map_err(InternalError::SerdeJson)?; + doc.insert(key.to_string(), value); + } + + let flattened = flatten_serde_json::flatten(&doc); + + // Once we have the flattened version we can convert it back to obkv and + // insert all the new generated fields_ids (if any) in the fields ids map. + output_buffer.clear(); + let mut writer = KvWriter::new(output_buffer); + let mut flattened: Vec<_> = flattened.into_iter().collect(); + // we reorder the fields to get all the known fields first + flattened + .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX)); + + for (key, value) in flattened { + let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; + let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; + writer.insert(fid, &value)?; + if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() { + field_buffer_cache.push((fid, value.into())); + } + } + + Ok(()) + } + /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. @@ -268,110 +423,8 @@ impl<'a, 'i> Transform<'a, 'i> { .primary_key(&wtxn)? .ok_or(Error::UserError(UserError::MissingPrimaryKey))? .to_string(); - let fields_ids_map = self.index.fields_ids_map(wtxn)?; - let approximate_number_of_documents = self.documents_count; - let mut external_documents_ids = self.index.external_documents_ids(wtxn).unwrap(); - let documents_ids = self.index.documents_ids(wtxn)?; - let mut field_distribution = self.index.field_distribution(wtxn)?; - let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); - - // consume sorter, in order to free the internal allocation, before creating a new one. - let mut iter = self.sorter.into_stream_merger_iter()?; - - // Once we have sort and deduplicated the documents we write them into a final file.
- let mut final_sorter = create_sorter( - |_id, obkvs| { - if obkvs.len() == 1 { - Ok(obkvs[0].clone()) - } else { - Err(InternalError::IndexingMergingKeys { process: "documents" }.into()) - } - }, - self.indexer_settings.chunk_compression_type, - self.indexer_settings.chunk_compression_level, - self.indexer_settings.max_nb_chunks, - self.indexer_settings.max_memory, - ); - let mut new_external_documents_ids_builder = fst::MapBuilder::memory(); - let mut replaced_documents_ids = RoaringBitmap::new(); - let mut new_documents_ids = RoaringBitmap::new(); - let mut obkv_buffer = Vec::new(); - - // While we write into final file we get or generate the internal documents ids. - let mut documents_count = 0; - while let Some((external_id, update_obkv)) = iter.next()? { - if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { - progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { - documents_seen: documents_count, - total_documents: approximate_number_of_documents, - }); - } - - let (docid, obkv) = match external_documents_ids.get(external_id) { - Some(docid) => { - // If we find the user id in the current external documents ids map - // we use it and insert it in the list of replaced documents. - replaced_documents_ids.insert(docid); - - let key = BEU32::new(docid); - let base_obkv = self.index.documents.get(wtxn, &key)?.ok_or( - InternalError::DatabaseMissingEntry { - db_name: db_name::DOCUMENTS, - key: None, - }, - )?; - - // we remove all the fields that were already counted - for (field_id, _) in base_obkv.iter() { - let field_name = fields_ids_map.name(field_id).unwrap(); - if let Entry::Occupied(mut entry) = - field_distribution.entry(field_name.to_string()) - { - match entry.get().checked_sub(1) { - Some(0) | None => entry.remove(), - Some(count) => entry.insert(count), - }; - } - } - - // Depending on the update indexing method we will merge - // the document update with the current document or not. - match self.index_documents_method { - IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), - IndexDocumentsMethod::UpdateDocuments => { - let update_obkv = obkv::KvReader::new(update_obkv); - merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); - (docid, obkv_buffer.as_slice()) - } - } - } - None => { - // If this user id is new we add it to the external documents ids map - // for new ids and into the list of new documents. - let new_docid = - available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?; - new_external_documents_ids_builder.insert(external_id, new_docid as u64)?; - new_documents_ids.insert(new_docid); - (new_docid, update_obkv) - } - }; - - // We insert the document under the documents ids map into the final file. - final_sorter.insert(docid.to_be_bytes(), obkv)?; - documents_count += 1; - - let reader = obkv::KvReader::new(obkv); - for (field_id, _) in reader.iter() { - let field_name = fields_ids_map.name(field_id).unwrap(); - *field_distribution.entry(field_name.to_string()).or_default() += 1; - } - } - - progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { - documents_seen: documents_count, - total_documents: documents_count, - }); + let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; // We create a final writer to write the new documents in order from the sorter. 
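The writing loop that follows also maintains the field distribution: every replaced document first has its fields un-counted, dropping entries that reach zero, before the new version is counted again. A sketch of that bookkeeping, assuming the distribution is a plain `BTreeMap<String, u64>`:

    use std::collections::BTreeMap;

    let mut field_distribution: BTreeMap<String, u64> =
        BTreeMap::from([(String::from("title"), 1)]);
    // the replaced version of the document is un-counted first...
    if let Some(count) = field_distribution.get_mut("title") {
        *count -= 1;
        if *count == 0 {
            field_distribution.remove("title"); // no empty entries are kept around
        }
    }
    // ...then every field of the new version is counted again.
    *field_distribution.entry(String::from("title")).or_insert(0) += 1;
    assert_eq!(field_distribution["title"], 1);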
let mut writer = create_writer( @@ -380,28 +433,103 @@ impl<'a, 'i> Transform<'a, 'i> { tempfile::tempfile()?, ); + // Once we have all the documents in the sorter, we write the documents + // in the writer. We also generate the field distribution. + let mut field_distribution = self.index.field_distribution(wtxn)?; + let mut iter = self.original_sorter.into_stream_merger_iter()?; + // used only for the callback + let mut documents_count = 0; + + while let Some((key, val)) = iter.next()? { + // send a callback to show at which step we are + documents_count += 1; + progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { + documents_seen: documents_count, + total_documents: self.documents_count, + }); + + let u32_key = key.clone().read_u32::<byteorder::BigEndian>()?; + // if the document was already in the db we remove all of its fields + // from the field distribution. + if self.replaced_documents_ids.contains(u32_key) { + let obkv = self.index.documents.get(wtxn, &BEU32::new(u32_key))?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; + + for (key, _) in obkv.iter() { + let name = + self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + })?; + // We checked that the document was in the db earlier. If we can't find it, it means + // there is an inconsistency between the field distribution and the field id map. + let field = field_distribution.get_mut(name).ok_or( + FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Accessing field distribution in transform.", + }, + )?; + *field -= 1; + if *field == 0 { + // since we were able to get the field right before, it's safe to unwrap here + field_distribution.remove(name).unwrap(); + } + } + } + + // We increment all the fields of the current document in the field distribution. + let obkv = KvReader::new(val); + + for (key, _) in obkv.iter() { + let name = + self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + })?; + *field_distribution.entry(name.to_string()).or_insert(0) += 1; + } + writer.insert(key, val)?; + } + + let mut original_documents = writer.into_inner()?; + // We then extract the file and reset the seek to be able to read it again. + original_documents.seek(SeekFrom::Start(0))?; + + // We create a final writer to write the new documents in order from the sorter. + let mut writer = create_writer( + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + tempfile::tempfile()?, + ); // Once we have written all the documents into the final sorter, we write the documents // into this writer, extract the file and reset the seek to be able to read it again. - final_sorter.write_into_stream_writer(&mut writer)?; - let mut documents_file = writer.into_inner()?; - documents_file.seek(SeekFrom::Start(0))?; + self.flattened_sorter.write_into_stream_writer(&mut writer)?; + let mut flattened_documents = writer.into_inner()?; + flattened_documents.seek(SeekFrom::Start(0))?; - let before_docids_merging = Instant::now(); - // We merge the new external ids with existing external documents ids.
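An fst map builder only accepts keys in ascending lexicographic order, which is why the replacement code below sorts the (external id, docid) pairs before feeding them to `fst::MapBuilder`. A minimal sketch of that constraint, assuming byte-string keys and `u64` values:

    let mut pairs: Vec<(Vec<u8>, u64)> = vec![(b"doc-b".to_vec(), 1), (b"doc-a".to_vec(), 0)];
    pairs.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));
    let mut builder = fst::MapBuilder::memory();
    for (key, value) in pairs {
        builder.insert(key, value).unwrap(); // errors if keys arrive out of order
    }
    assert_eq!(builder.into_map().get(b"doc-a"), Some(0));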
- let new_external_documents_ids = new_external_documents_ids_builder.into_map(); + let mut new_external_documents_ids_builder: Vec<_> = + self.new_external_documents_ids_builder.into_iter().collect(); + + new_external_documents_ids_builder + .sort_unstable_by(|(left, _), (right, _)| left.cmp(&right)); + let mut fst_new_external_documents_ids_builder = fst::MapBuilder::memory(); + new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { + fst_new_external_documents_ids_builder.insert(key, value) + })?; + let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); external_documents_ids.insert_ids(&new_external_documents_ids)?; - info!("Documents external merging took {:.02?}", before_docids_merging.elapsed()); - Ok(TransformOutput { primary_key, - fields_ids_map, + fields_ids_map: self.fields_ids_map, field_distribution, external_documents_ids: external_documents_ids.into_static(), - new_documents_ids, - replaced_documents_ids, - documents_count, - documents_file, + new_documents_ids: self.new_documents_ids, + replaced_documents_ids: self.replaced_documents_ids, + documents_count: self.documents_count, + original_documents, + flattened_documents, }) } @@ -412,7 +540,7 @@ impl<'a, 'i> Transform<'a, 'i> { self, wtxn: &mut heed::RwTxn, old_fields_ids_map: FieldsIdsMap, - new_fields_ids_map: FieldsIdsMap, + mut new_fields_ids_map: FieldsIdsMap, ) -> Result { // There already has been a document addition, the primary key should be set by now. let primary_key = @@ -423,7 +551,14 @@ impl<'a, 'i> Transform<'a, 'i> { let documents_count = documents_ids.len() as usize; // We create a final writer to write the new documents in order from the sorter. - let mut writer = create_writer( + let mut original_writer = create_writer( + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + tempfile::tempfile()?, + ); + + // We create a final writer to write the new documents in order from the sorter. + let mut flattened_writer = create_writer( self.indexer_settings.chunk_compression_type, self.indexer_settings.chunk_compression_level, tempfile::tempfile()?, @@ -445,13 +580,51 @@ impl<'a, 'i> Transform<'a, 'i> { } let buffer = obkv_writer.into_inner()?; - writer.insert(docid.to_be_bytes(), buffer)?; + original_writer.insert(docid.to_be_bytes(), &buffer)?; + + // Once we have the document. We're going to flatten it + // and insert it in the flattened sorter. + let mut doc = serde_json::Map::new(); + + let reader = obkv::KvReader::new(buffer); + for (k, v) in reader.iter() { + let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: k, + process: "Accessing field distribution in transform.", + })?; + let value = serde_json::from_slice::(v) + .map_err(InternalError::SerdeJson)?; + doc.insert(key.to_string(), value); + } + + let flattened = flatten_serde_json::flatten(&doc); + + // Once we have the flattened version we can convert it back to obkv and + // insert all the new generated fields_ids (if any) in the fields ids map. 
+ let mut buffer: Vec<u8> = Vec::new(); + let mut writer = KvWriter::new(&mut buffer); + let mut flattened: Vec<_> = flattened.into_iter().collect(); + // we reorder the fields to get all the known fields first + flattened.sort_unstable_by_key(|(key, _)| { + new_fields_ids_map.id(&key).unwrap_or(FieldId::MAX) + }); + + for (key, value) in flattened { + let fid = + new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; + let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; + writer.insert(fid, &value)?; + } + flattened_writer.insert(docid.to_be_bytes(), &buffer)?; } // Once we have written all the documents, we extract // the file and reset the seek to be able to read it again. - let mut documents_file = writer.into_inner()?; - documents_file.seek(SeekFrom::Start(0))?; + let mut original_documents = original_writer.into_inner()?; + original_documents.seek(SeekFrom::Start(0))?; + + let mut flattened_documents = flattened_writer.into_inner()?; + flattened_documents.seek(SeekFrom::Start(0))?; Ok(TransformOutput { primary_key, @@ -461,7 +634,8 @@ impl<'a, 'i> Transform<'a, 'i> { new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), documents_count, - documents_file, + original_documents, + flattened_documents, }) } } @@ -521,11 +695,84 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> { vec.into_iter().map(|_| unreachable!()).collect() } +fn update_primary_key<'a>( + document: KvReader<'a, FieldId>, + addition_index: &DocumentsBatchIndex, + primary_key_id: FieldId, + primary_key_name: &str, + uuid_buffer: &'a mut [u8; uuid::adapter::Hyphenated::LENGTH], + field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, + mut external_id_buffer: &'a mut Vec<u8>, + autogenerate_docids: bool, +) -> Result<Cow<'a, str>> { + match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { + Some((_, bytes)) => { + let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { + Value::String(string) => match validate_document_id(&string) { + Some(s) if s.len() == string.len() => string, + Some(s) => s.to_string(), + None => { + return Err(UserError::InvalidDocumentId { + document_id: Value::String(string), + } + .into()) + } + }, + Value::Number(number) => number.to_string(), + content => { + return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into()) + } + }; + serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; + Ok(Cow::Owned(value)) + } + None if autogenerate_docids => { + let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + serde_json::to_writer(&mut external_id_buffer, &uuid) + .map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); + Ok(Cow::Borrowed(&*uuid)) + } + None => { + let mut json = Map::new(); + for (key, value) in document.iter() { + let key = addition_index.name(key).cloned(); + let value = serde_json::from_slice::<Value>(&value).ok(); + + if let Some((k, v)) = key.zip(value) { + json.insert(k, v); + } + } + + Err(UserError::MissingDocumentId { + primary_key: primary_key_name.to_string(), + document: json, + })?
+ } + } +} + +impl TransformOutput { + // find and insert the new field ids + pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result> { + let user_defined_facets = index.user_defined_faceted_fields(rtxn)?; + + Ok(self + .fields_ids_map + .names() + .filter(|&field| crate::is_faceted(field, &user_defined_facets)) + .map(|field| field.to_string()) + .collect()) + } +} + #[cfg(test)] mod test { use super::*; mod compute_primary_key { + use big_s::S; + use super::{compute_primary_key_pair, FieldsIdsMap}; #[test] @@ -540,6 +787,18 @@ mod test { ); assert_eq!(result.unwrap(), (0, "toto".to_string())); assert_eq!(fields_map.len(), 1); + + // and with nested fields + let mut fields_map = FieldsIdsMap::new(); + fields_map.insert("toto.tata").unwrap(); + let result = compute_primary_key_pair( + Some("toto.tata"), + &mut fields_map, + Some(S("titi")), + false, + ); + assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); + assert_eq!(fields_map.len(), 1); } #[test] @@ -547,7 +806,7 @@ mod test { let mut fields_map = FieldsIdsMap::new(); let result = compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); - assert_eq!(result.unwrap(), (0, "tata".to_string())); + assert_eq!(result.unwrap(), (0, S("tata"))); assert_eq!(fields_map.len(), 1); } @@ -555,7 +814,7 @@ mod test { fn should_return_default_if_both_are_none() { let mut fields_map = FieldsIdsMap::new(); let result = compute_primary_key_pair(None, &mut fields_map, None, true); - assert_eq!(result.unwrap(), (0, "id".to_string())); + assert_eq!(result.unwrap(), (0, S("id"))); assert_eq!(fields_map.len(), 1); } @@ -569,6 +828,7 @@ mod test { } mod primary_key_inference { + use big_s::S; use bimap::BiHashMap; use crate::documents::DocumentsBatchIndex; @@ -579,11 +839,11 @@ mod test { // We run the test multiple times to change the order in which the fields are iterated upon. for _ in 1..50 { let mut map = BiHashMap::new(); - map.insert(1, "fakeId".to_string()); - map.insert(2, "fakeId".to_string()); - map.insert(3, "fakeId".to_string()); - map.insert(4, "fakeId".to_string()); - map.insert(0, "realId".to_string()); + map.insert(1, S("fakeId")); + map.insert(2, S("fakeId")); + map.insert(3, S("fakeId")); + map.insert(4, S("fakeId")); + map.insert(0, S("realId")); assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 7a26361d4..7dd37ccc2 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -249,11 +249,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } let transform = Transform::new( + self.wtxn, &self.index, &self.indexer_config, IndexDocumentsMethod::ReplaceDocuments, false, - ); + )?; // We remap the documents fields based on the new `FieldsIdsMap`. let output = transform.remap_index_documents( @@ -262,6 +263,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fields_ids_map.clone(), )?; + let new_facets = output.compute_real_facets(self.wtxn, self.index)?; + self.index.put_faceted_fields(self.wtxn, &new_facets)?; + // We clear the full database (words-fst, documents ids and documents content). 
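`compute_real_facets` above widens the user-defined facets to every known field nested below them; that is how making `dog` filterable in the test earlier also exposes `dog.race` and `dog.race.bernese mountain`. A hypothetical sketch of such a path-prefix check (the real `crate::is_faceted` may be implemented differently):

    use std::collections::HashSet;

    fn is_faceted_sketch(field: &str, user_defined: &HashSet<String>) -> bool {
        user_defined.iter().any(|facet| {
            field == facet.as_str()
                || (field.starts_with(facet.as_str())
                    && field.as_bytes().get(facet.len()) == Some(&b'.'))
        })
    }

    let facets = HashSet::from([String::from("dog")]);
    assert!(is_faceted_sketch("dog.race.bernese mountain", &facets));
    assert!(!is_faceted_sketch("dogma", &facets)); // a raw prefix alone is not enough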
ClearDocuments::new(self.wtxn, self.index).execute()?; @@ -273,7 +277,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { &self.indexer_config, IndexDocumentsConfig::default(), &cb, - ); + )?; indexing_builder.execute_raw(output)?; Ok(()) @@ -583,7 +587,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; - let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?; + let old_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?; let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; self.update_displayed()?; @@ -599,7 +603,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, // an Asc/Desc criterion or a filtered attribute as be added or removed. - let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?; + let new_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?; let faceted_updated = old_faceted_fields != new_faceted_fields; let stop_words_updated = self.update_stop_words()?; @@ -651,7 +655,8 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -713,7 +718,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -764,7 +770,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -793,7 +800,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -846,7 +854,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -858,7 +867,6 @@ mod tests { // Only count the field_id 0 and level 0 facet values. // TODO we must support typed CSVs for numbers to be understood. 
let fidmap = index.fields_ids_map(&rtxn).unwrap();
- println!("fidmap: {:?}", fidmap);
for document in index.all_documents(&rtxn).unwrap() {
let document = document.unwrap();
let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1)
@@ -886,7 +894,8 @@ mod tests {
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut builder =
- IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+ .unwrap();
builder.add_documents(content).unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@@ -927,7 +936,8 @@ mod tests {
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut builder =
- IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+ .unwrap();
builder.add_documents(content).unwrap();
builder.execute().unwrap();
wtxn.commit().unwrap();
@@ -977,7 +987,51 @@ mod tests {
let indexing_config =
IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
let mut builder =
- IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ());
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+ .unwrap();
+ builder.add_documents(content).unwrap();
+ builder.execute().unwrap();
+ wtxn.commit().unwrap();
+
+ // Run an empty query just to ensure that the search results are ordered.
+ let rtxn = index.read_txn().unwrap();
+ let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
+
+ // There must be at least one document with 34 as the age.
+ assert_eq!(documents_ids.len(), 3);
+ }
+
+ #[test]
+ fn set_nested_distinct_field() {
+ let path = tempfile::tempdir().unwrap();
+ let mut options = EnvOpenOptions::new();
+ options.map_size(10 * 1024 * 1024); // 10 MB
+ let index = Index::new(options, &path).unwrap();
+ let config = IndexerConfig::default();
+
+ // Set the distinct field to be the nested person age.
+ let mut wtxn = index.write_txn().unwrap();
+ let mut builder = Settings::new(&mut wtxn, &index, &config);
+ // Don't display the generated `id` field.
+ builder.set_displayed_fields(vec![S("person")]);
+ builder.set_distinct_field(S("person.age"));
+ builder.execute(|_| ()).unwrap();
+
+ // Then index some documents.
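+ // A hedged illustration of why a nested distinct field can work at all:
+ // with the flattening rules of the flatten-serde-json crate introduced later
+ // in this series, a document such as
+ //     { "person": { "name": "kevin", "age": 23 } }
+ // is expected to be stored in its flattened form as
+ //     { "person.name": "kevin", "person.age": 23 },
+ // making "person.age" an ordinary top-level key for the distinct logic.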
+ let content = documents!([ + { "person": { "name": "kevin", "age": 23 }}, + { "person": { "name": "kevina", "age": 21 }}, + { "person": { "name": "benoit", "age": 34 }}, + { "person": { "name": "bernard", "age": 34 }}, + { "person": { "name": "bertrand", "age": 34 }}, + { "person": { "name": "bernie", "age": 34 }}, + { "person": { "name": "ben", "age": 34 }} + ]); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1008,7 +1062,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1037,7 +1092,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -1115,7 +1171,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); @@ -1252,7 +1309,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1314,7 +1372,8 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()); + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 52b4c7114..c72ca8ba3 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -59,7 +59,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); diff --git 
a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs
index 786fdbcae..893d7c30a 100644
--- a/milli/tests/search/query_criteria.rs
+++ b/milli/tests/search/query_criteria.rs
@@ -390,7 +390,8 @@ fn criteria_ascdesc() {
// index documents
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
- let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ());
+ let mut builder =
+ IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
let mut cursor = Cursor::new(Vec::new());
let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
From ab458d88408687cc5510502efecf9a65a2835270 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Thu, 7 Apr 2022 17:00:00 +0200
Subject: [PATCH 1347/1889] fix tests after rebase
---
milli/tests/search/typo_tolerance.rs | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs
index 35cc4b4c2..9a7986c5e 100644
--- a/milli/tests/search/typo_tolerance.rs
+++ b/milli/tests/search/typo_tolerance.rs
@@ -130,7 +130,8 @@ fn test_typo_disabled_on_word() {
let mut txn = index.write_txn().unwrap();
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
- let mut builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ());
+ let mut builder =
+ IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
builder.add_documents(documents).unwrap();
From b1905dfa2409a8c7aa5a08431b51ad6c81cc73bd Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 7 Apr 2022 17:05:44 +0200
Subject: [PATCH 1348/1889] Make split_best_frequency return references instead of owned data
---
milli/src/search/query_tree.rs | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index a45034a3b..f8dd82a57 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -257,7 +257,10 @@ impl<'a> QueryTreeBuilder<'a> {
}
/// Split the word depending on the frequency of subwords in the database documents.
-fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<(String, String)>> {
+fn split_best_frequency<'a>(
+ ctx: &impl Context,
+ word: &'a str,
+) -> heed::Result<Option<(&'a str, &'a str)>> {
let chars = word.char_indices().skip(1);
let mut best = None;
@@ -273,7 +276,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<(String, String)>> {
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
- children.push(Operation::Phrase(vec![left, right]));
+ children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
let exact_words = ctx.exact_words()?;
@@ -499,8 +502,8 @@ fn create_matching_words(
}
if let Some((left, right)) = split_best_frequency(ctx, &word)?
{
- let left = MatchingWord::new(left, 0, false);
- let right = MatchingWord::new(right, 0, false);
+ let left = MatchingWord::new(left.to_string(), 0, false);
+ let right = MatchingWord::new(right.to_string(), 0, false);
matching_words.push((vec![left, right], vec![id]));
}
From c8ed1675a75b40195a1371f1ce3bd8a4c8cdbac2 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 7 Apr 2022 17:32:13 +0200
Subject: [PATCH 1349/1889] Add some documentation
---
milli/src/search/matches/matching_words.rs | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs
index 274634554..84b47bba5 100644
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@@ -26,11 +26,14 @@ impl MatchingWords {
Self { inner: matching_words }
}
+ /// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { inner: Box::new(self.inner.iter()), token }
}
}
+/// Iterator over terms that match the given token;
+/// it allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
token: &'b Token<'b>,
@@ -60,7 +63,10 @@ impl<'a> Iterator for MatchesIter<'a, '_> {
}
}
+/// Id of a matching term corresponding to a word written by the end user.
pub type PrimitiveWordId = u8;
+
+/// Structure used to match a specific term.
pub struct MatchingWord {
pub dfa: DFA,
pub word: String,
@@ -91,6 +97,7 @@ impl MatchingWord {
Self { dfa, word, typo, prefix }
}
+ /// Returns the length in chars of the match if the token matches the term.
pub fn match_token(&self, token: &Token) -> Option<usize> {
match self.dfa.eval(token.text()) {
Distance::Exact(t) if t <= self.typo => {
@@ -106,12 +113,17 @@ impl MatchingWord {
}
}
+/// A given token can partially match a query word for several reasons:
+/// - split words
+/// - multi-word synonyms
+/// In these cases we need to match several consecutive tokens to consider that the match is full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_len: usize, ids: &'a [PrimitiveWordId] },
Partial(PartialMatch<'a>),
}
+/// Structure helper to match several tokens in a row in order to complete a partial match.
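+// A hedged illustration inferred from the docs above: for a multi-word
+// synonym or a split word such as ["new", "york"], matching the token "new"
+// yields a MatchType::Partial; feeding the following token "york" into that
+// PartialMatch is then expected to return MatchType::Full covering both
+// tokens, while any other token aborts the partial match (None).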
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: &'a [MatchingWord],
@@ -120,6 +132,10 @@ pub struct PartialMatch<'a> {
}
impl<'a> PartialMatch<'a> {
+ /// Returns:
+ /// - None if the given token breaks the partial match
+ /// - Partial if the given token matches the partial match but doesn't complete it
+ /// - Full if the given token completes the partial match
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
self.matching_words[0].match_token(token).map(|char_len| {
if self.matching_words.len() > 1 {
From bab898ce8607468dd6ec17f095b8cdbd8f1133a0 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Thu, 7 Apr 2022 18:20:44 +0200
Subject: [PATCH 1350/1889] move the flatten-serde-json crate inside of milli
---
Cargo.toml | 2 +-
flatten-serde-json/Cargo.toml | 15 +
flatten-serde-json/README.md | 153 ++++++++++
flatten-serde-json/fuzz/Cargo.toml | 26 ++
.../fuzz/fuzz_targets/flatten.rs | 8 +
flatten-serde-json/src/lib.rs | 264 ++++++++++++++++++
flatten-serde-json/src/main.rs | 11 +
milli/Cargo.toml | 2 +-
8 files changed, 479 insertions(+), 2 deletions(-)
create mode 100644 flatten-serde-json/Cargo.toml
create mode 100644 flatten-serde-json/README.md
create mode 100644 flatten-serde-json/fuzz/Cargo.toml
create mode 100644 flatten-serde-json/fuzz/fuzz_targets/flatten.rs
create mode 100644 flatten-serde-json/src/lib.rs
create mode 100644 flatten-serde-json/src/main.rs
diff --git a/Cargo.toml b/Cargo.toml
index 3f2732444..a9378adc4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[workspace]
resolver = "2"
-members = ["milli", "filter-parser", "http-ui", "benchmarks", "infos", "helpers", "cli"]
+members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"]
default-members = ["milli"]
[profile.dev]
diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml
new file mode 100644
index 000000000..db92c1ded
--- /dev/null
+++ b/flatten-serde-json/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "flatten-serde-json"
+version = "0.1.0"
+edition = "2021"
+description = "Flatten serde-json objects like elastic search"
+readme = "README.md"
+author = ["Tamo tamo@meilisearch.com"]
+repository = "https://github.com/irevoire/flatten-serde-json"
+keywords = ["json", "flatten"]
+categories = ["command-line-utilities"]
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+serde_json = "1.0"
diff --git a/flatten-serde-json/README.md b/flatten-serde-json/README.md
new file mode 100644
index 000000000..a1dd7d275
--- /dev/null
+++ b/flatten-serde-json/README.md
@@ -0,0 +1,153 @@
+# Flatten serde Json
+
+This crate flattens [`serde_json`](https://docs.rs/serde_json/latest/serde_json/) `Object`s in a format
+similar to [elastic search](https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html).
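+A minimal usage sketch (hedged: this example is not in the original README and
+assumes only the public `flatten` entry point defined in `src/lib.rs` below):
+
+```rust
+use flatten_serde_json::flatten;
+use serde_json::json;
+
+fn main() {
+    let value = json!({ "a": { "b": "c" } });
+    // `flatten` takes a `&Map<String, Value>` and returns the flattened map.
+    let flat = flatten(value.as_object().unwrap());
+    assert_eq!(flat["a.b"], json!("c"));
+}
+```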
+ +## Examples + +### There is nothing to do + +```json +{ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] +} +``` + +Flattens to: +```json +{ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] +} +``` + +------------ + +### Objects + +```json +{ + "a": { + "b": "c", + "d": "e", + "f": "g" + } +} +``` + +Flattens to: +```json +{ + "a.b": "c", + "a.d": "e", + "a.f": "g" +} +``` + +------------ + +### Array of objects + +```json +{ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] +} +``` + +Flattens to: +```json +{ + "a.b": ["c", "d", "e"], +} +``` + +------------ + +### Array of objects with normal value in the array + +```json +{ + "a": [ + 42, + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] +} +``` + +Flattens to: +```json +{ + "a": 42, + "a.b": ["c", "d", "e"], +} +``` + +------------ + +### Array of objects of array of objects of ... + +```json +{ + "a": [ + "b", + ["c", "d"], + { "e": ["f", "g"] }, + [ + { "h": "i" }, + { "e": ["j", { "z": "y" }] }, + ], + ["l"], + "m", + ] +} +``` + +Flattens to: +```json +{ + "a": ["b", "c", "d", "l", "m"], + "a.e": ["f", "g", "j"], + "a.h": "i", + "a.e.z": "y", +} +``` + +------------ + +### Collision between a generated field name and an already existing field + +```json +{ + "a": { + "b": "c", + }, + "a.b": "d", +} +``` + +Flattens to: +```json +{ + "a.b": ["c", "d"], +} +``` + diff --git a/flatten-serde-json/fuzz/Cargo.toml b/flatten-serde-json/fuzz/Cargo.toml new file mode 100644 index 000000000..2e0510d5f --- /dev/null +++ b/flatten-serde-json/fuzz/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "flatten_serde_json-fuzz" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +arbitrary-json = "0.1.1" + +[dependencies.flatten_serde_json] +path = ".." 
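+# Hedged usage note (assuming the standard cargo-fuzz workflow, which this
+# patch does not spell out): with cargo-fuzz installed, the target declared
+# below is typically run from the crate root with `cargo +nightly fuzz run flatten`.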
+
+# Prevent this from interfering with workspaces
+[workspace]
+members = ["."]
+
+[[bin]]
+name = "flatten"
+path = "fuzz_targets/flatten.rs"
+test = false
+doc = false
diff --git a/flatten-serde-json/fuzz/fuzz_targets/flatten.rs b/flatten-serde-json/fuzz/fuzz_targets/flatten.rs
new file mode 100644
index 000000000..399d1c484
--- /dev/null
+++ b/flatten-serde-json/fuzz/fuzz_targets/flatten.rs
@@ -0,0 +1,8 @@
+#![no_main]
+use arbitrary_json::ArbitraryObject;
+use flatten_serde_json::flatten;
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|object: ArbitraryObject| {
+ let _ = flatten(&object);
+});
diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs
new file mode 100644
index 000000000..734ae2a24
--- /dev/null
+++ b/flatten-serde-json/src/lib.rs
@@ -0,0 +1,264 @@
+#![doc = include_str!("../README.md")]
+
+use serde_json::{json, Map, Value};
+
+pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> {
+ let mut obj = Map::new();
+ insert_object(&mut obj, None, json);
+ obj
+}
+
+fn insert_object(
+ base_json: &mut Map<String, Value>,
+ base_key: Option<&str>,
+ object: &Map<String, Value>,
+) {
+ for (key, value) in object {
+ let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}"));
+
+ if let Some(array) = value.as_array() {
+ insert_array(base_json, &new_key, array);
+ } else if let Some(object) = value.as_object() {
+ insert_object(base_json, Some(&new_key), object);
+ } else {
+ insert_value(base_json, &new_key, value.clone());
+ }
+ }
+}
+
+fn insert_array(base_json: &mut Map<String, Value>, base_key: &str, array: &Vec<Value>) {
+ for value in array {
+ if let Some(object) = value.as_object() {
+ insert_object(base_json, Some(base_key), object);
+ } else if let Some(sub_array) = value.as_array() {
+ insert_array(base_json, base_key, sub_array);
+ } else {
+ insert_value(base_json, base_key, value.clone());
+ }
+ }
+}
+
+fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value) {
+ debug_assert!(!to_insert.is_object());
+ debug_assert!(!to_insert.is_array());
+
+ // does the field already exist?
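+ // A hedged illustration of the three branches below (added commentary, not
+ // from the original crate): inserting "x": 1 into an empty object gives
+ // { "x": 1 }; inserting "x": 2 under the same key turns the entry into
+ // { "x": [1, 2] }; a third insert pushes into that array: { "x": [1, 2, 3] }.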
+ if let Some(value) = base_json.get_mut(key) { + // is it already an array + if let Some(array) = value.as_array_mut() { + array.push(to_insert); + // or is there a collision + } else { + let value = std::mem::take(value); + base_json[key] = json!([value, to_insert]); + } + // if it does not exist we can push the value untouched + } else { + base_json.insert(key.to_string(), json!(to_insert)); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn no_flattening() { + let mut base: Value = json!({ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + println!( + "got:\n{}\nexpected:\n{}\n", + serde_json::to_string_pretty(&flat).unwrap(), + serde_json::to_string_pretty(&json).unwrap() + ); + + assert_eq!(flat, json); + } + + #[test] + fn flatten_object() { + let mut base: Value = json!({ + "a": { + "b": "c", + "d": "e", + "f": "g" + } + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": "c", + "a.d": "e", + "a.f": "g" + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_array() { + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); + + // here we must keep 42 in "a" + let mut base: Value = json!({ + "a": [ + 42, + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": 42, + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn collision_with_object() { + let mut base: Value = json!({ + "a": { + "b": "c", + }, + "a.b": "d", + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": ["c", "d"], + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn collision_with_array() { + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d", "c": "e" }, + [35], + ], + "a.b": "f", + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": ["c", "d", "f"], + "a.c": "e", + "a": 35, + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_nested_arrays() { + let mut base: Value = json!({ + "a": [ + ["b", "c"], + { "d": "e" }, + ["f", "g"], + [ + { "h": "i" }, + { "d": "j" }, + ], + ["k", "l"], + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": ["b", "c", "f", "g", "k", "l"], + "a.d": ["e", "j"], + "a.h": "i", + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_nested_arrays_and_objects() { + let mut base: Value = json!({ + "a": [ + "b", + ["c", "d"], + { "e": ["f", "g"] }, + [ + { "h": "i" }, + { "e": ["j", { "z": "y" }] }, + ], + ["l"], + "m", + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + println!("{}", serde_json::to_string_pretty(&flat).unwrap()); + + assert_eq!( + &flat, + json!({ + "a": ["b", "c", "d", "l", "m"], + "a.e": ["f", "g", "j"], + "a.h": "i", + "a.e.z": "y", + }) + .as_object() + .unwrap() + 
);
+ }
+}
diff --git a/flatten-serde-json/src/main.rs b/flatten-serde-json/src/main.rs
new file mode 100644
index 000000000..dabb386f1
--- /dev/null
+++ b/flatten-serde-json/src/main.rs
@@ -0,0 +1,11 @@
+use std::io::stdin;
+
+use flatten_serde_json::flatten;
+use serde_json::{Map, Value};
+
+fn main() {
+ let json: Map<String, Value> = serde_json::from_reader(stdin()).unwrap();
+
+ let result = flatten(&json);
+ println!("{}", serde_json::to_string_pretty(&result).unwrap());
+}
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index e8723dc6a..a83cfd6f2 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -14,7 +14,7 @@
crossbeam-channel = "0.5.2"
either = "1.6.1"
fst = "0.4.7"
fxhash = "0.2.1"
-flatten-serde-json = "0.1.0"
+flatten-serde-json = { path = "../flatten-serde-json" }
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
geoutils = "0.4.1"
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
From a769e09dfa57a5e1fe107f3ab1d6e830a9b939fa Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 7 Apr 2022 20:15:14 +0200
Subject: [PATCH 1351/1889] Make token_crop_bounds more rust idiomatic
---
milli/src/search/matches/mod.rs | 71 +++++++++++++++------------------
1 file changed, 32 insertions(+), 39 deletions(-)
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index 993ee1f2b..d6e7dcc37 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -230,7 +230,7 @@ impl<'t> Matcher<'t, '_> {
}
}
- /// Returns token position of the window to crop around.
+ /// Returns the bounds in byte index of the crop window.
fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
// if there is no match, we start from the beginning of the string by default.
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
@@ -241,70 +241,64 @@ impl<'t> Matcher<'t, '_> {
// matches needs to be counted in the crop len.
let mut remaining_words =
self.crop_size + first_match_word_position - last_match_word_position;
- // if first token is a word, then remove 1 to remaining_words.
- if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
- remaining_words -= 1;
- }
- // we start from matches positions, then we expand the window in both sides.
- let mut first_token_position = first_match_token_position;
- let mut last_token_position = last_match_token_position;
+ let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable();
+ let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable();
+
while remaining_words > 0 {
- match (
- // try to expand left
- first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
- // try to expand right
- last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
- ) {
+ let before_token = before_tokens.peek().map(|t| t.is_separator());
+ let after_token = after_tokens.peek().map(|t| t.is_separator());
+
+ match (before_token, after_token) {
// we can expand both sides.
- (Some(ft), Some(lt)) => {
- match (ft.is_separator(), lt.is_separator()) {
+ (Some(before_token), Some(after_token)) => {
+ match (before_token, after_token) {
// if they are both separators and are the same kind then advance both,
// or expand on the soft separator side.
- (Some(f_kind), Some(s_kind)) => {
- if f_kind == s_kind {
- first_token_position -= 1;
- last_token_position += 1;
- } else if f_kind == SeparatorKind::Hard {
- last_token_position += 1;
+ (Some(before_token_kind), Some(after_token_kind)) => {
+ if before_token_kind == after_token_kind {
+ before_tokens.next();
+ after_tokens.next();
+ } else if before_token_kind == SeparatorKind::Hard {
+ after_tokens.next();
} else {
- first_token_position -= 1;
+ before_tokens.next();
}
}
// if one of the tokens is a word, we expand on the side of the word.
// left is a word, advance left.
(None, Some(_)) => {
- first_token_position -= 1;
+ before_tokens.next();
remaining_words -= 1;
}
// right is a word, advance right.
(Some(_), None) => {
- last_token_position += 1;
+ after_tokens.next();
remaining_words -= 1;
}
// both are words, advance left then right if remaining_words > 0.
(None, None) => {
- first_token_position -= 1;
+ before_tokens.next();
remaining_words -= 1;
if remaining_words > 0 {
- last_token_position += 1;
+ after_tokens.next();
remaining_words -= 1;
}
}
}
}
// the end of the text is reached, advance left.
- (Some(ft), None) => {
- first_token_position -= 1;
- if ft.is_separator().is_none() {
+ (Some(before_token), None) => {
+ before_tokens.next();
+ if before_token.is_none() {
remaining_words -= 1;
}
}
// the start of the text is reached, advance right.
- (None, Some(lt)) => {
- last_token_position += 1;
- if lt.is_separator().is_none() {
+ (None, Some(after_token)) => {
+ after_tokens.next();
+ if after_token.is_none() {
remaining_words -= 1;
}
}
@@ -313,7 +307,10 @@ impl<'t> Matcher<'t, '_> {
}
}
- (first_token_position, last_token_position)
+ let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
+ let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
+
+ (crop_byte_start, crop_byte_end)
}
/// Compute the score of a match interval:
@@ -401,11 +398,7 @@ impl<'t> Matcher<'t, '_> {
fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
let match_interval = self.find_best_match_interval(matches);
- let (first_token_position, last_token_position) = self.token_crop_bounds(match_interval);
-
- let byte_start = self.tokens.get(first_token_position).map_or(0, |t| t.byte_start);
- let byte_end = self.tokens.get(last_token_position).map_or(byte_start, |t| t.byte_end);
- (byte_start, byte_end)
+ self.token_crop_bounds(match_interval)
}
// Returns the formatted version of the original text.
From a16de5de84935e6663767ff9912bcd0ad0d98af2 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Fri, 8 Apr 2022 11:20:41 +0200
Subject: [PATCH 1352/1889] Simplify format and remove intermediate function
---
milli/src/search/matches/mod.rs | 18 +++++------------
1 file changed, 5 insertions(+), 13 deletions(-)
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index d6e7dcc37..71ff2f1b3 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -231,7 +231,7 @@ impl<'t> Matcher<'t, '_> {
}
/// Returns the bounds in byte index of the crop window.
- fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
+ fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
// if there is no match, we start from the beginning of the string by default.
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); @@ -394,13 +394,6 @@ impl<'t> Matcher<'t, '_> { } } - /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { - let match_interval = self.find_best_match_interval(matches); - - self.token_crop_bounds(match_interval) - } - // Returns the formatted version of the original text. pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { // If 0 it will be considered null and thus not crop the field @@ -412,6 +405,9 @@ impl<'t> Matcher<'t, '_> { } else { match &self.matches { Some(matches) => { + let matches = + if crop { self.find_best_match_interval(matches) } else { matches }; + let (byte_start, byte_end) = if crop { self.crop_bounds(matches) } else { (0, self.text.len()) }; @@ -427,11 +423,7 @@ impl<'t> Matcher<'t, '_> { if highlight { // insert highlight markers around matches. let tokens = self.tokens; - for m in matches - .iter() - .skip_while(|m| tokens[m.token_position].byte_start < byte_start) - .take_while(|m| tokens[m.token_position].byte_start < byte_end) - { + for m in matches { let token = &tokens[m.token_position]; if byte_index < token.byte_start { From 9383629d13de520cf946193522bc177fc8df437b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Sat, 9 Apr 2022 23:37:27 +0200 Subject: [PATCH 1353/1889] Enforce labelling for the PRs --- .github/release-draft-template.yml | 15 +++++++++------ .github/workflows/enforce-label.yml | 14 ++++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/enforce-label.yml diff --git a/.github/release-draft-template.yml b/.github/release-draft-template.yml index 08e1f2fc7..ba19b9d6e 100644 --- a/.github/release-draft-template.yml +++ b/.github/release-draft-template.yml @@ -1,18 +1,21 @@ name-template: 'Milli v$RESOLVED_VERSION' tag-template: 'v$RESOLVED_VERSION' exclude-labels: - - 'skip-changelog' + - 'skip changelog' version-resolver: minor: labels: - - 'breaking-change' + - 'DB breaking' + - 'API breaking' default: patch categories: - - title: 'Breaking changes ⚠️' - label: 'breaking-change' + - title: 'API breaking' + label: 'API breaking' + - title: 'DB breaking' + label: 'DB breaking' + - title: 'Changes' + label: 'no breaking' template: | - ## Changes - $CHANGES Thanks again to $CONTRIBUTORS! 
🎉
diff --git a/.github/workflows/enforce-label.yml b/.github/workflows/enforce-label.yml
new file mode 100644
index 000000000..f8d8bc4e6
--- /dev/null
+++ b/.github/workflows/enforce-label.yml
@@ -0,0 +1,14 @@
+name: Enforce PR labels
+
+on:
+ pull_request:
+ types: [labeled, unlabeled, opened, edited, synchronize]
+
+jobs:
+ enforce-label:
+ name: Specify breaking
+ runs-on: ubuntu-latest
+ steps:
+ - uses: yogevbd/enforce-label-action@2.1.0
+ with:
+ REQUIRED_LABELS_ANY: 'no breaking,DB breaking,API breaking,skip changelog'
From e153418b8adfd9a5af904ac414c6f9f8997588f7 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Mon, 11 Apr 2022 14:52:41 +0200
Subject: [PATCH 1354/1889] remove the unused key warning
---
flatten-serde-json/Cargo.toml | 4 ----
1 file changed, 4 deletions(-)
diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml
index db92c1ded..7c18656f1 100644
--- a/flatten-serde-json/Cargo.toml
+++ b/flatten-serde-json/Cargo.toml
@@ -4,10 +4,6 @@
version = "0.1.0"
edition = "2021"
description = "Flatten serde-json objects like elastic search"
readme = "README.md"
-author = ["Tamo tamo@meilisearch.com"]
-repository = "https://github.com/irevoire/flatten-serde-json"
-keywords = ["json", "flatten"]
-categories = ["command-line-utilities"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
serde_json = "1.0"
From 011f8210eddb7cf4bbfedc3aa07e9c5cb73206de Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Mon, 11 Apr 2022 16:46:45 +0200
Subject: [PATCH 1355/1889] Make compute_matches more rust idiomatic
---
milli/src/search/matches/mod.rs | 207 ++++++++++++++++++++------------
1 file changed, 127 insertions(+), 80 deletions(-)
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index 993ee1f2b..04e552c8d 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -110,63 +110,53 @@ pub struct Matcher<'t, 'm> {
impl<'t> Matcher<'t, '_> {
/// Iterates over tokens and saves any of them that matches the query.
fn compute_matches(&mut self) -> &mut Self {
- fn compute_partial_match(
+ fn compute_partial_match<'a>(
mut partial: PartialMatch,
- tokens: &[Token],
- token_position: &mut usize,
- word_position: &mut usize,
+ token_position: usize,
+ word_position: usize,
+ words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
matches: &mut Vec<Match>,
) -> bool {
- let mut potential_matches = vec![(*token_position, *word_position, partial.char_len())];
- let mut t_position = 1;
- let mut w_position = 1;
- for token in &tokens[*token_position + 1..] {
- if token.is_separator().is_none() {
- partial = match partial.match_token(&token) {
- // token matches the partial match, but the match is not full,
- // we temporarly save the current token then we try to match the next one.
- Some(MatchType::Partial(partial)) => {
- potential_matches.push((
- *token_position + t_position,
- *word_position + w_position,
- partial.char_len(),
- ));
- partial
- }
- // partial match is now full, we keep this matches and we advance positions
- Some(MatchType::Full { char_len, ids }) => {
- // save previously matched tokens as matches.
- let iter = potential_matches.into_iter().map(
- |(token_position, word_position, match_len)| Match {
- match_len,
- ids: ids.to_vec(),
- word_position,
- token_position,
- },
- );
- matches.extend(iter);
+ let mut potential_matches = Vec::new();
- // move word and token positions after the end of the match.
- *word_position += w_position;
- *token_position += t_position;
+ // Add first match to potential matches.
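+ // A hedged reading of the accumulation below: each
+ // (token_position, word_position, char_len) triple is only a *candidate*;
+ // candidates are promoted to real `Match`es only when a later token
+ // completes the partial match with MatchType::Full, and they are dropped
+ // wholesale when the chain is broken.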
+ potential_matches.push((token_position, word_position, partial.char_len()));
+
+ for (token_position, word_position, word) in words_positions {
+ partial = match partial.match_token(&word) {
+ // token matches the partial match, but the match is not full,
+ // we temporarily save the current token then we try to match the next one.
+ Some(MatchType::Partial(partial)) => {
+ potential_matches.push((token_position, word_position, partial.char_len()));
+ partial
+ }
+ // partial match is now full, we keep these matches and we advance positions
+ Some(MatchType::Full { char_len, ids }) => {
+ // save previously matched tokens as matches.
+ let iter = potential_matches.into_iter().map(
+ |(token_position, word_position, match_len)| Match {
+ match_len, ids: ids.to_vec(),
+ word_position,
+ token_position,
+ },
+ );
+ matches.extend(iter);
- // the match is complete, we return true.
- return true;
- }
- // no match, continue to next match.
- None => break,
- };
- w_position += 1;
- }
- t_position += 1;
+ // save the token that closes the partial match as a match.
+ matches.push(Match {
+ match_len: char_len,
+ ids: ids.to_vec(),
+ word_position,
+ token_position,
+ });
+
+ // the match is complete, we return true.
+ return true;
+ }
+ // no match, continue to next match.
+ None => break,
+ };
}
// the match is not complete, we return false.
@@ -174,42 +164,54 @@ impl<'t> Matcher<'t, '_> {
}
let mut matches = Vec::new();
- let mut word_position = 0;
- let mut token_position = 0;
- while let Some(token) = self.tokens.get(token_position) {
- if token.is_separator().is_none() {
- for match_type in self.matching_words.match_token(&token) {
- match match_type {
- // we match, we save the current token as a match,
+
+ let mut words_positions = self
+ .tokens
+ .iter()
+ .scan((0, 0), |(token_position, word_position), token| {
+ let current_token_position = *token_position;
+ let current_word_position = *word_position;
+ *token_position += 1;
+ if token.is_separator().is_none() {
+ *word_position += 1;
+ }
+
+ Some((current_token_position, current_word_position, token))
+ })
+ .filter(|(_, _, token)| token.is_separator().is_none());
+
+ while let Some((token_position, word_position, word)) = words_positions.next() {
+ for match_type in self.matching_words.match_token(word) {
+ match match_type {
+ // we match, we save the current token as a match,
+ // then we continue the rest of the tokens.
+ MatchType::Full { char_len, ids } => {
+ matches.push(Match {
+ match_len: char_len,
+ ids: ids.to_vec(),
+ word_position,
+ token_position,
+ });
+ break;
+ }
+ // we match partially, iterate over next tokens to check if we can complete the match.
- MatchType::Partial(partial) => { - // if match is completed, we break the matching loop over the current token, - // then we continue the rest of the tokens. - if compute_partial_match( - partial, - &self.tokens, - &mut token_position, - &mut word_position, - &mut matches, - ) { - break; - } - } } } - word_position += 1; } - token_position += 1; } self.matches = Some(matches); @@ -826,4 +828,49 @@ mod tests { // because crop size is 0, crop is ignored. assert_eq!(&matcher.format(highlight, crop), "void void split the world void void."); } + + #[test] + fn partial_matches() { + let matching_words = vec![ + (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), + ( + vec![ + MatchingWord::new("t".to_string(), 0, false), + MatchingWord::new("he".to_string(), 0, false), + ], + vec![0], + ), + (vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]), + ( + vec![ + MatchingWord::new("do".to_string(), 0, false), + MatchingWord::new("or".to_string(), 0, false), + ], + vec![1], + ), + (vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]), + ]; + + let matching_words = MatchingWords::new(matching_words); + + let mut builder = MatcherBuilder::from_matching_words(matching_words); + builder.highlight_prefix("_".to_string()); + builder.highlight_suffix("_".to_string()); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = false; + + let text = "the do or die can't be he do and or isn't he"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + + let mut matcher = builder.build(&tokens[..], text); + assert_eq!( + &matcher.format(highlight, crop), + "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", + "matches: {:?}", + &matcher.matches + ); + } } From 827cedcd15b9943d52194d8ea17bfc154dfeebf4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 12 Apr 2022 13:42:14 +0200 Subject: [PATCH 1356/1889] Add format option structure --- http-ui/src/main.rs | 8 +- milli/src/lib.rs | 3 +- milli/src/search/matches/mod.rs | 163 +++++++++++++++----------------- milli/src/search/mod.rs | 2 +- 4 files changed, 85 insertions(+), 91 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index fdfc04af9..adf7f1788 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -25,8 +25,8 @@ use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, }; use milli::{ - obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatcherBuilder, - SearchResult, SortError, + obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, + MatcherBuilder, SearchResult, SortError, }; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; @@ -162,7 +162,9 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed: Vec<_> = analyzed.tokens().collect(); let mut matcher = matcher_builder.build(&analyzed[..], &old_string); - Value::String(matcher.format(true, true).to_string()) + let format_options = FormatOptions { highlight: true, crop: Some(10) }; + + Value::String(matcher.format(format_options).to_string()) } Value::Array(values) => Value::Array( values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(), diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6cbb9f126..793079563 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,8 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, 
MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, + SearchResult, }; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 04e552c8d..65ff0a255 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -8,14 +8,12 @@ use crate::search::matches::matching_words::PartialMatch; pub mod matching_words; -const DEFAULT_CROP_SIZE: usize = 10; const DEFAULT_CROP_MARKER: &'static str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &'static str = ""; const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = ""; pub struct MatcherBuilder { matching_words: MatchingWords, - crop_size: usize, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, @@ -23,18 +21,7 @@ pub struct MatcherBuilder { impl MatcherBuilder { pub fn from_matching_words(matching_words: MatchingWords) -> Self { - Self { - matching_words, - crop_size: DEFAULT_CROP_SIZE, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } - - pub fn crop_size(&mut self, word_count: usize) -> &Self { - self.crop_size = word_count; - self + Self { matching_words, crop_marker: None, highlight_prefix: None, highlight_suffix: None } } pub fn crop_marker(&mut self, marker: String) -> &Self { @@ -70,7 +57,6 @@ impl MatcherBuilder { text, tokens, matching_words: &self.matching_words, - crop_size: self.crop_size, crop_marker, highlight_prefix, highlight_suffix, @@ -79,6 +65,18 @@ impl MatcherBuilder { } } +#[derive(Copy, Clone, Default)] +pub struct FormatOptions { + pub highlight: bool, + pub crop: Option, +} + +impl FormatOptions { + pub fn merge(self, other: Self) -> Self { + Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } + } +} + #[derive(Clone, Debug)] pub struct Match { match_len: usize, @@ -100,7 +98,6 @@ pub struct Matcher<'t, 'm> { text: &'t str, tokens: &'t [Token<'t>], matching_words: &'m MatchingWords, - crop_size: usize, crop_marker: &'m str, highlight_prefix: &'m str, highlight_suffix: &'m str, @@ -233,7 +230,7 @@ impl<'t> Matcher<'t, '_> { } /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { + fn crop_bounds(&self, matches: &[Match], crop_size: usize) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); @@ -241,8 +238,7 @@ impl<'t> Matcher<'t, '_> { let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); // matches needs to be counted in the crop len. - let mut remaining_words = - self.crop_size + first_match_word_position - last_match_word_position; + let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable(); let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); @@ -348,7 +344,7 @@ impl<'t> Matcher<'t, '_> { } /// Returns the matches interval where the score computed by match_interval_score is maximal. 
- fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { + fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { // we compute the matches interval if we have at least 2 matches. if matches.len() > 1 { // positions of the first and the last match of the best matches interval in `matches`. @@ -361,9 +357,7 @@ impl<'t> Matcher<'t, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - if next_match.word_position - matches[interval_first].word_position - >= self.crop_size - { + if next_match.word_position - matches[interval_first].word_position >= crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -375,7 +369,7 @@ impl<'t> Matcher<'t, '_> { // advance start of the interval while interval is longer than crop_size. while next_match.word_position - matches[interval_first].word_position - >= self.crop_size + >= crop_size { interval_first += 1; } @@ -397,21 +391,24 @@ impl<'t> Matcher<'t, '_> { } // Returns the formatted version of the original text. - pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { - // If 0 it will be considered null and thus not crop the field - // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let crop = crop && self.crop_size > 0; - if !highlight && !crop { + pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { + if !format_options.highlight && format_options.crop.is_none() { // compute matches is not needed if no highlight nor crop is requested. Cow::Borrowed(self.text) } else { match &self.matches { Some(matches) => { - let matches = - if crop { self.find_best_match_interval(matches) } else { matches }; + let matches = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.find_best_match_interval(matches, crop_size) + } + _ => matches, + }; - let (byte_start, byte_end) = - if crop { self.crop_bounds(matches) } else { (0, self.text.len()) }; + let (byte_start, byte_end) = match format_options.crop { + Some(crop_size) if crop_size > 0 => self.crop_bounds(matches, crop_size), + _ => (0, self.text.len()), + }; let mut formatted = Vec::new(); @@ -422,7 +419,7 @@ impl<'t> Matcher<'t, '_> { let mut byte_index = byte_start; - if highlight { + if format_options.highlight { // insert highlight markers around matches. let tokens = self.tokens; for m in matches { @@ -466,7 +463,7 @@ impl<'t> Matcher<'t, '_> { Cow::Owned(formatted.concat()) } } - None => self.compute_matches().format(highlight, crop), + None => self.compute_matches().format(format_options), } } } @@ -496,8 +493,7 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = false; + let format_options = FormatOptions { highlight: false, crop: None }; // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -505,7 +501,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing all matches. 
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -513,7 +509,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -521,7 +517,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); } #[test] @@ -531,22 +527,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -554,7 +549,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -562,7 +557,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options.clone()), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -571,7 +566,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "Natalie risk her future to build a world with the boy she loves." 
); } @@ -588,8 +583,7 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; // Text containing prefix match. let text = "Ŵôřlḑôle"; @@ -597,7 +591,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑôle"); + assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑôle"); // Text containing unicode match. let text = "Ŵôřlḑ"; @@ -605,7 +599,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑ"); + assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑ"); // Text containing unicode match. let text = "Westfália"; @@ -613,7 +607,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Westfália"); + assert_eq!(&matcher.format(format_options.clone()), "Westfália"); } #[test] @@ -623,22 +617,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = true; + let format_options = FormatOptions { highlight: false, crop: Some(10) }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -647,7 +640,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "A quick brown fox can not jump 32 feet, right…" ); @@ -658,7 +651,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "(A quick brown fox can not jump 32 feet, right…" ); @@ -669,7 +662,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // should crop the phrase instead of croping around the match. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…Split The World is a book written by Emily Henry…" ); @@ -680,7 +673,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. 
assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…future to build a world with the boy she loves…" ); @@ -691,7 +684,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…she loves. Emily Henry: The Love That Split The World." ); @@ -702,7 +695,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); @@ -713,7 +706,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); @@ -724,7 +717,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); } @@ -736,22 +729,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); - let highlight = true; - let crop = true; + let format_options = FormatOptions { highlight: true, crop: Some(10) }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -760,7 +752,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "A quick brown fox can not jump 32 feet, right…" ); @@ -771,7 +763,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…future to build a <em>world</em> with <em>the</em> boy she loves…" ); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -781,7 +773,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); + assert_eq!(&matcher.format(format_options.clone()), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); // Text containing a match unordered and a match ordered.
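// (illustration: "The world split" reverses the query's word order, while the later "split the world" matches it exactly — the asserts below show the crop window centering on the ordered occurrence)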
let text = "The world split void void void void void void void void void split the world void void"; @@ -790,7 +782,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); } @@ -800,33 +792,33 @@ mod tests { //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 let matching_words = matching_words(); - let mut builder = MatcherBuilder::from_matching_words(matching_words); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = true; - let text = "void void split the world void void."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); // set a smaller crop size - builder.crop_size(2); + let format_options = FormatOptions { highlight: false, crop: Some(2) }; + let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split the…"); + assert_eq!(&matcher.format(format_options), "…split the…"); // set a smaller crop size - builder.crop_size(1); + let format_options = FormatOptions { highlight: false, crop: Some(1) }; + let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split…"); + assert_eq!(&matcher.format(format_options), "…split…"); + + // set crop size to 0 + let format_options = FormatOptions { highlight: false, crop: Some(0) }; - // set a smaller crop size - builder.crop_size(0); let mut matcher = builder.build(&tokens[..], text); // because crop size is 0, crop is ignored. 
- assert_eq!(&matcher.format(highlight, crop), "void void split the world void void."); + assert_eq!(&matcher.format(format_options), "void void split the world void void."); } #[test] @@ -858,8 +850,7 @@ mod tests { builder.highlight_suffix("_".to_string()); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; let text = "the do or die can't be he do and or isn't he"; let analyzed = analyzer.analyze(&text); @@ -867,7 +858,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options), "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", "matches: {:?}", &matcher.matches diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2b025f269..a9712d261 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::{MatchBounds, Matcher, MatcherBuilder, MatchingWords}; +pub use self::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; From 5809d3ae0d3c7b86d4c65ccabd689038cc3b0bc7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 12 Apr 2022 16:31:58 +0200 Subject: [PATCH 1357/1889] Add first benchmarks on formatting --- benchmarks/Cargo.toml | 4 ++ benchmarks/benches/formatting.rs | 68 ++++++++++++++++++++++++++++++++ milli/src/lib.rs | 4 +- milli/src/search/matches/mod.rs | 6 +-- milli/src/search/mod.rs | 4 +- 5 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 benchmarks/benches/formatting.rs diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 0cac5e017..0dbbd6d6f 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -39,3 +39,7 @@ harness = false [[bench]] name = "indexing" harness = false + +[[bench]] +name = "formatting" +harness = false diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs new file mode 100644 index 000000000..5045df268 --- /dev/null +++ b/benchmarks/benches/formatting.rs @@ -0,0 +1,68 @@ +use criterion::{criterion_group, criterion_main}; +use milli::tokenizer::{Analyzer, AnalyzerConfig}; +use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +struct Conf<'a> { + name: &'a str, + text: &'a str, + matching_words: MatcherBuilder, +} + +fn bench_formatting(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + Conf { + name: "'the door d'", + text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. 
But Theodor said that the doors don't work."#, matching_words: MatcherBuilder::from_matching_words(MatchingWords::new(vec![ (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]), (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]), (vec![MatchingWord::new("do".to_string(), 0, false), MatchingWord::new("or".to_string(), 0, false)], vec![0]), (vec![MatchingWord::new("thedoor".to_string(), 1, false)], vec![0, 1]), (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]), (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]), (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]), ])), }, ]; + + let format_options = &[ + FormatOptions { highlight: false, crop: None }, + FormatOptions { highlight: true, crop: None }, + FormatOptions { highlight: false, crop: Some(10) }, + FormatOptions { highlight: true, crop: Some(10) }, + FormatOptions { highlight: false, crop: Some(20) }, + FormatOptions { highlight: true, crop: Some(20) }, + ]; + + for option in format_options { + let highlight = if option.highlight { "highlight" } else { "no-highlight" }; + + let name = match option.crop { + Some(size) => format!("{}-crop({})", highlight, size), + None => format!("{}-no-crop", highlight), + }; + + let mut group = c.benchmark_group(&name); + for conf in confs { + group.bench_function(conf.name, |b| { + b.iter(|| { + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + let analyzed = analyzer.analyze(&conf.text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = conf.matching_words.build(&tokens[..], conf.text); + matcher.format(option.clone()); + }) + }); + } + group.finish(); + } +} + +criterion_group!(benches, bench_formatting); +criterion_main!(benches); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 793079563..6f5d4abe8 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,8 +37,8 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, - SearchResult, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, + MatchingWords, Search, SearchResult, }; pub type Result<T> = std::result::Result<T, Error>; diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 65ff0a255..ad4f6cd69 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,11 +1,9 @@ use std::borrow::Cow; -pub use matching_words::MatchingWords; -use matching_words::{MatchType, PrimitiveWordId}; +use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; +pub use matching_words::{MatchingWord, MatchingWords}; use meilisearch_tokenizer::token::{SeparatorKind, Token}; -use crate::search::matches::matching_words::PartialMatch; - pub mod matching_words; const DEFAULT_CROP_MARKER: &'static str = "…"; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index a9712d261..979b2fd7a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,9 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; +pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder,
MatchingWord, MatchingWords, +}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; From 0261a0e3cf0d9df6e529a280795a3ce1f7a1ab41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Mon, 11 Apr 2022 15:15:13 +0200 Subject: [PATCH 1358/1889] Add the new `Specify breaking` check to bors.toml --- bors.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/bors.toml b/bors.toml index 717fd69d1..8ce9ec3da 100644 --- a/bors.toml +++ b/bors.toml @@ -4,5 +4,6 @@ status = [ 'Tests on windows-latest with stable', 'Run Rustfmt', ] +pr_status = ['Specify breaking'] # 3 hours timeout timeout-sec = 10800 From aa896f0e7a64ad9a96c27f094e1c50126f948552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Mon, 11 Apr 2022 15:47:50 +0200 Subject: [PATCH 1359/1889] Update bors.toml --- bors.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bors.toml b/bors.toml index 8ce9ec3da..1e90bbc8a 100644 --- a/bors.toml +++ b/bors.toml @@ -4,6 +4,6 @@ status = [ 'Tests on windows-latest with stable', 'Run Rustfmt', ] -pr_status = ['Specify breaking'] +pr_status = ['Enforce PR labels / Specify breaking'] # 3 hours timeout timeout-sec = 10800 From 7ad582f39ff16786ae1734e28c342f7822e43030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Mon, 11 Apr 2022 19:15:23 +0200 Subject: [PATCH 1360/1889] Update bors.toml --- bors.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bors.toml b/bors.toml index 1e90bbc8a..8ce9ec3da 100644 --- a/bors.toml +++ b/bors.toml @@ -4,6 +4,6 @@ status = [ 'Tests on windows-latest with stable', 'Run Rustfmt', ] -pr_status = ['Enforce PR labels / Specify breaking'] +pr_status = ['Specify breaking'] # 3 hours timeout timeout-sec = 10800 From bbb6728d2f017f2a6eea34c15fd6a7c8155d77c1 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 8 Apr 2022 17:32:22 +0200 Subject: [PATCH 1361/1889] add distinct attributes to cli --- cli/src/main.rs | 10 ++++++++++ milli/src/search/mod.rs | 2 ++ 2 files changed, 12 insertions(+) diff --git a/cli/src/main.rs b/cli/src/main.rs index 3e9e8c75f..97580142b 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -469,6 +469,8 @@ struct SettingsUpdate { criteria: Option<Vec<String>>, #[structopt(long)] exact_attributes: Option<Vec<String>>, + #[structopt(long)] + distinct_attribute: Option<String>, } impl Performer for SettingsUpdate { @@ -503,6 +505,14 @@ impl Performer for SettingsUpdate { } } + if let Some(distinct_attr) = self.distinct_attribute { + if !distinct_attr.is_empty() { + update.set_distinct_field(distinct_attr); + } else { + update.reset_distinct_field(); + } + } + let mut bars = Vec::new(); let progesses = MultiProgress::new(); for _ in 0..4 { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index b01bae817..d53bcafb7 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -243,6 +243,8 @@ impl<'a> Search<'a> { excluded_candidates = candidates.into_excluded(); } + dbg!(excluded_candidates.len()); + Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) } } From cd83014fff5b8cf7300be62e1fd470d6f4af87be Mon Sep 17 00:00:00 2001 From: ad hoc Date: Sat, 9 Apr 2022 14:50:43 +0200 Subject: [PATCH 1362/1889] add test for distinct nb hits --- milli/tests/search/distinct.rs | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/milli/tests/search/distinct.rs
b/milli/tests/search/distinct.rs index 631618f73..022724fde 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -8,7 +8,7 @@ use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; macro_rules! test_distinct { - ($func:ident, $distinct:ident, $criteria:expr) => { + ($func:ident, $distinct:ident, $criteria:expr, $n_res:expr) => { #[test] fn $func() { let criteria = $criteria; @@ -30,7 +30,9 @@ macro_rules! test_distinct { search.authorize_typos(true); search.optional_words(true); - let SearchResult { documents_ids, .. } = search.execute().unwrap(); + let SearchResult { documents_ids, candidates, .. } = search.execute().unwrap(); + + assert_eq!(candidates.len(), $n_res); let mut distinct_values = HashSet::new(); let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) @@ -54,20 +56,22 @@ macro_rules! test_distinct { test_distinct!( distinct_string_default_criteria, tag, - vec![Words, Typo, Proximity, Attribute, Exactness] + vec![Words, Typo, Proximity, Attribute, Exactness], + 3 ); test_distinct!( distinct_number_default_criteria, asc_desc_rank, - vec![Words, Typo, Proximity, Attribute, Exactness] + vec![Words, Typo, Proximity, Attribute, Exactness], + 7 ); -test_distinct!(distinct_string_criterion_words, tag, vec![Words]); -test_distinct!(distinct_number_criterion_words, asc_desc_rank, vec![Words]); -test_distinct!(distinct_string_criterion_words_typo, tag, vec![Words, Typo]); -test_distinct!(distinct_number_criterion_words_typo, asc_desc_rank, vec![Words, Typo]); -test_distinct!(distinct_string_criterion_words_proximity, tag, vec![Words, Proximity]); -test_distinct!(distinct_number_criterion_words_proximity, asc_desc_rank, vec![Words, Proximity]); -test_distinct!(distinct_string_criterion_words_attribute, tag, vec![Words, Attribute]); -test_distinct!(distinct_number_criterion_words_attribute, asc_desc_rank, vec![Words, Attribute]); -test_distinct!(distinct_string_criterion_words_exactness, tag, vec![Words, Exactness]); -test_distinct!(distinct_number_criterion_words_exactness, asc_desc_rank, vec![Words, Exactness]); +test_distinct!(distinct_string_criterion_words, tag, vec![Words], 3); +test_distinct!(distinct_number_criterion_words, asc_desc_rank, vec![Words], 7); +test_distinct!(distinct_string_criterion_words_typo, tag, vec![Words, Typo], 3); +test_distinct!(distinct_number_criterion_words_typo, asc_desc_rank, vec![Words, Typo], 7); +test_distinct!(distinct_string_criterion_words_proximity, tag, vec![Words, Proximity], 3); +test_distinct!(distinct_number_criterion_words_proximity, asc_desc_rank, vec![Words, Proximity], 7); +test_distinct!(distinct_string_criterion_words_attribute, tag, vec![Words, Attribute], 3); +test_distinct!(distinct_number_criterion_words_attribute, asc_desc_rank, vec![Words, Attribute], 7); +test_distinct!(distinct_string_criterion_words_exactness, tag, vec![Words, Exactness], 3); +test_distinct!(distinct_number_criterion_words_exactness, asc_desc_rank, vec![Words, Exactness], 7); From dda28d74156bc45e2895d9226ea00c64210edb10 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Sat, 9 Apr 2022 14:30:00 +0200 Subject: [PATCH 1363/1889] exclude excluded candidates from search result candidates --- milli/src/search/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d53bcafb7..dcb2e0803 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -243,9 +243,11 @@ impl<'a> Search<'a> { excluded_candidates =
candidates.into_excluded(); } - dbg!(excluded_candidates.len()); - - Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids }) + Ok(SearchResult { + matching_words, + candidates: initial_candidates - excluded_candidates, + documents_ids, + }) } } From 436d2032c4aeac23434732ad374c3a46883aa5cd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Apr 2022 15:40:07 -0700 Subject: [PATCH 1364/1889] Add benchmarks to the flatten-serde-json subcrate --- flatten-serde-json/Cargo.toml | 9 +++-- flatten-serde-json/benches/benchmarks.rs | 42 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 flatten-serde-json/benches/benchmarks.rs diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 7c18656f1..0220e8ceb 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -5,7 +5,12 @@ edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] serde_json = "1.0" + +[dev-dependencies] +criterion = { version = "0.3", features = ["html_reports"] } + +[[bench]] +name = "benchmarks" +harness = false diff --git a/flatten-serde-json/benches/benchmarks.rs b/flatten-serde-json/benches/benchmarks.rs new file mode 100644 index 000000000..6536bb513 --- /dev/null +++ b/flatten-serde-json/benches/benchmarks.rs @@ -0,0 +1,42 @@ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use flatten_serde_json::flatten; +use serde_json::json; + +pub fn flatten_simple(c: &mut Criterion) { + let mut input = json!({ + "a": { + "b": "c", + "d": "e", + "f": "g" + } + }); + let object = input.as_object_mut().unwrap(); + + c.bench_with_input(BenchmarkId::new("flatten", "simple"), &object, |b, input| { + b.iter(|| flatten(input)) + }); +} + +pub fn flatten_complex(c: &mut Criterion) { + let mut input = json!({ + "a": [ + "b", + ["c", "d"], + { "e": ["f", "g"] }, + [ + { "h": "i" }, + { "e": ["j", { "z": "y" }] }, + ], + ["l"], + "m", + ] + }); + let object = input.as_object_mut().unwrap(); + + c.bench_with_input(BenchmarkId::new("flatten", "complex"), &object, |b, input| { + b.iter(|| flatten(input)) + }); +} + +criterion_group!(benches, flatten_simple, flatten_complex); +criterion_main!(benches); From b3cec1a3832a4663635c873a700fbb4cc1089108 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Apr 2022 16:12:56 -0700 Subject: [PATCH 1365/1889] Prefer using direct method calls instead of using the json macros --- flatten-serde-json/src/lib.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs index 734ae2a24..8312f5bd6 100644 --- a/flatten-serde-json/src/lib.rs +++ b/flatten-serde-json/src/lib.rs @@ -1,6 +1,6 @@ #![doc = include_str!("../README.md")] -use serde_json::{json, Map, Value}; +use serde_json::{Map, Value}; pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> { let mut obj = Map::new(); @@ -42,7 +42,7 @@ fn insert_value(base_json: &mut Map<String, Value>, key: &str, to_insert: Value) debug_assert!(!to_insert.is_object()); debug_assert!(!to_insert.is_array()); - // does the field aleardy exists? + // does the field already exists?
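// (illustration: inserting `1` under key "a" when base_json already holds `"a": 0` turns the entry into `"a": [0, 1]` — the collision branch below builds exactly that array)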
if let Some(value) = base_json.get_mut(key) { // is it already an array if let Some(array) = value.as_array_mut() { array.push(to_insert); // or is there a collision } else { let value = std::mem::take(value); - base_json[key] = json!([value, to_insert]); + base_json[key] = Value::Array(vec![value, to_insert]); } // if it does not exist we can push the value untouched } else { - base_json.insert(key.to_string(), json!(to_insert)); + base_json.insert(key.to_string(), to_insert); } } #[cfg(test)] mod tests { + use serde_json::json; + use super::*; #[test] From ee64f4a9367785dcafb0ac2a9738bc20d146c862 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 11 Apr 2022 15:43:18 +0200 Subject: [PATCH 1366/1889] Use smartstring to store the external id in our hashmap We need to store all the external ids (primary keys) in a hashmap associated to their internal ids during indexing. The smartstring removes the heap allocation / reduces memory usage and should improve the cache locality. --- milli/Cargo.toml | 1 + milli/src/update/index_documents/mod.rs | 7 +++++-- milli/src/update/index_documents/transform.rs | 9 ++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 641fb71e8..1295c4384 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -32,6 +32,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] } slice-group-by = "0.3.0" smallstr = { version = "0.3.0", features = ["serde"] } smallvec = "1.8.0" +smartstring = "1.0.1" tempfile = "3.3.0" time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } uuid = { version = "0.8.2", features = ["v4"] } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index eb50a85ed..ae353b0df 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1109,8 +1109,11 @@ mod tests { let mut big_object = HashMap::new(); big_object.insert(S("id"), "wow"); - let content: String = - (0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap(); + let content: String = (0..=u16::MAX) + .into_iter() + .map(|p| p.to_string()) + .reduce(|a, b| a + " " + b.as_ref()) + .unwrap(); big_object.insert("content".to_string(), &content); let mut cursor = Cursor::new(Vec::new()); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 4413e00ca..cbb6ed428 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -11,6 +11,7 @@ use itertools::Itertools; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; use serde_json::{Map, Value}; +use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; @@ -55,7 +56,8 @@ pub struct Transform<'a, 'i> { flattened_sorter: grenad::Sorter<MergeFn>, replaced_documents_ids: RoaringBitmap, new_documents_ids: RoaringBitmap, - new_external_documents_ids_builder: FxHashMap<Vec<u8>, u64>, + // To increase the cache locality and decrease the heap usage we use smartstring.
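+    // (note: smartstring's Compact mode keeps strings of up to `size_of::<String>() - 1` bytes — 23 bytes on 64-bit targets — inline on the stack, so most external ids avoid a heap allocation entirely)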
+ new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>, documents_count: usize, } @@ -254,10 +256,7 @@ impl<'a, 'i> Transform<'a, 'i> { None => { // if the document has already been inserted in this // batch we need to get its docid - match self - .new_external_documents_ids_builder - .entry(external_id.as_bytes().to_vec()) - { + match self.new_external_documents_ids_builder.entry(external_id.into()) { Entry::Occupied(entry) => (*entry.get() as u32, false), // if the document has never been encountered we give it a new docid // and push this new docid to the external documents ids builder From c2469b67659715fb56a1da115d9bc64df833f791 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 11 Apr 2022 18:43:44 +0200 Subject: [PATCH 1367/1889] create the json-depth-checker crate --- Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 16 +++ json-depth-checker/benches/depth.rs | 59 +++++++++ json-depth-checker/fuzz/Cargo.toml | 27 +++++ json-depth-checker/fuzz/fuzz_targets/depth.rs | 13 ++ json-depth-checker/src/lib.rs | 114 ++++++++++++++++++ 6 files changed, 230 insertions(+), 1 deletion(-) create mode 100644 json-depth-checker/Cargo.toml create mode 100644 json-depth-checker/benches/depth.rs create mode 100644 json-depth-checker/fuzz/Cargo.toml create mode 100644 json-depth-checker/fuzz/fuzz_targets/depth.rs create mode 100644 json-depth-checker/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index a9378adc4..506fd3dc3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["milli", "filter-parser", "flatten-serde-json", "http-ui", "benchmarks", "infos", "helpers", "cli"] +members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"] default-members = ["milli"] [profile.dev] diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml new file mode 100644 index 000000000..9c386a383 --- /dev/null +++ b/json-depth-checker/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "json-depth-checker" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +serde_json = "1.0" + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "depth" +harness = false \ No newline at end of file diff --git a/json-depth-checker/benches/depth.rs b/json-depth-checker/benches/depth.rs new file mode 100644 index 000000000..e11bc1a68 --- /dev/null +++ b/json-depth-checker/benches/depth.rs @@ -0,0 +1,59 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use json_depth_checker::should_flatten_from_unchecked_slice; +use serde_json::json; + +fn criterion_benchmark(c: &mut Criterion) { + let null = serde_json::to_vec(&json!(null)).unwrap(); + let bool_true = serde_json::to_vec(&json!(true)).unwrap(); + let bool_false = serde_json::to_vec(&json!(false)).unwrap(); + let integer = serde_json::to_vec(&json!(42)).unwrap(); + let float = serde_json::to_vec(&json!(1456.258)).unwrap(); + let string = serde_json::to_vec(&json!("hello world")).unwrap(); + let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap(); + let complex_object = serde_json::to_vec(&json!({ + "doggos": [ + { "bernard": true }, + { "michel": 42 }, + false, + ], + "bouvier": true, + "caniche": null, + })) + .unwrap(); + let simple_array = serde_json::to_vec(&json!([ + 1, + 2, + 3, + "viva", + "l\"algeria", + true, + "[array]", + "escaped string \"" + ])) + .unwrap(); + let array_of_array = serde_json::to_vec(&json!([1, [2,
[3]]])).unwrap(); + let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap(); + + c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null))); + c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true))); + c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false))); + c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer))); + c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float))); + c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string))); + c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object))); + c.bench_function("complex object", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&complex_object)) + }); + c.bench_function("simple array", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&simple_array)) + }); + c.bench_function("array of array", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&array_of_array)) + }); + c.bench_function("array of object", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&array_of_object)) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/json-depth-checker/fuzz/Cargo.toml b/json-depth-checker/fuzz/Cargo.toml new file mode 100644 index 000000000..e36657ec2 --- /dev/null +++ b/json-depth-checker/fuzz/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "json-depth-checker" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +arbitrary-json = "0.1.1" +serde_json = "1.0.79" + +[dependencies.json-depth-checker] +path = ".." + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "depth" +path = "fuzz_targets/depth.rs" +test = false +doc = false diff --git a/json-depth-checker/fuzz/fuzz_targets/depth.rs b/json-depth-checker/fuzz/fuzz_targets/depth.rs new file mode 100644 index 000000000..6c3a6efe7 --- /dev/null +++ b/json-depth-checker/fuzz/fuzz_targets/depth.rs @@ -0,0 +1,13 @@ +#![no_main] +use arbitrary_json::ArbitraryValue; +use json_depth_checker::*; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|value: ArbitraryValue| { + let value = serde_json::Value::from(value); + let left = should_flatten_from_value(&value); + let value = serde_json::to_vec(&value).unwrap(); + let right = should_flatten_from_unchecked_slice(&value); + + assert_eq!(left, right); +}); diff --git a/json-depth-checker/src/lib.rs b/json-depth-checker/src/lib.rs new file mode 100644 index 000000000..0d423aadb --- /dev/null +++ b/json-depth-checker/src/lib.rs @@ -0,0 +1,114 @@ +use serde_json::Value; + +/// Your json MUST BE valid and generated by `serde_json::to_vec` before being +/// sent in this function. This function is DUMB and FAST but makes a lot of +/// asumption about the way `serde_json` will generate its input. +/// Will returns `true` if the json contains an object, an array of array +/// or an array containing an object. +/// Returns `false` for everything else. +pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool { + if json.is_empty() { + return false; + } + + // since the json we receive has been generated by serde_json we know + // it doesn't contains any whitespace at the beginning thus we can check + // directly if we're looking at an object. 
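+    // (e.g. serde_json serializes `{"a":1}` starting with b'{', so it must be flattened, while scalars such as `42` or `true` can be ruled out right away)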
+ if json[0] == b'{' { + return true; + } else if json[0] != b'[' { + // if the json isn't an object or an array it means it's a simple value. + return false; + } + + // The array case is a little bit more complex. We are looking for a second + // `[` but we need to ensure that it doesn't appear inside of a string. Thus + // we need to keep track of if we're in a string or not. + + // will be used when we met a `\` to skip the next character. + let mut skip_next = false; + let mut in_string = false; + + for byte in json.iter().skip(1) { + match byte { + // handle the backlash. + _ if skip_next => skip_next = false, + b'\\' => skip_next = true, + + // handle the strings. + byte if in_string => { + if *byte == b'"' { + in_string = false; + } + } + b'"' => in_string = true, + + // handle the arrays. + b'[' => return true, + // since we know the json is valid we don't need to ensure the + // array is correctly closed + + // handle the objects. + b'{' => return true, + + // ignore everything else + _ => (), + } + } + + false +} + +/// Consider using [`should_flatten_from_unchecked_slice`] when you can. +/// Will returns `true` if the json contains an object, an array of array +/// or an array containing an object. +/// Returns `false` for everything else. +/// This function has been written to test the [`should_flatten_from_unchecked_slice`]. +pub fn should_flatten_from_value(json: &Value) -> bool { + match json { + Value::Object(..) => true, + Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()), + _ => false, + } +} + +#[cfg(test)] +mod tests { + use serde_json::*; + + use super::*; + + #[test] + fn test_shouldnt_flatten() { + let shouldnt_flatten = vec![ + json!(null), + json!(true), + json!(false), + json!("a superb string"), + json!("a string escaping other \"string\""), + json!([null, true, false]), + json!(["hello", "world", "!"]), + json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]), + ]; + for value in shouldnt_flatten { + assert!(!should_flatten_from_value(&value)); + let value = serde_json::to_vec(&value).unwrap(); + assert!(!should_flatten_from_unchecked_slice(&value)); + } + } + + #[test] + fn test_should_flatten() { + let should_flatten = vec![ + json!({}), + json!({ "hello": "world" }), + json!(["hello", ["world"]]), + json!([true, true, true, true, true, true, true, true, true, {}]), + ]; + for value in should_flatten { + assert!(should_flatten_from_value(&value)); + let value = serde_json::to_vec(&value).unwrap(); + assert!(should_flatten_from_unchecked_slice(&value)); + } + } +} From 399fba16bb33ae3f8b66c6a568a9717a2e805e5e Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 12 Apr 2022 11:22:36 +0200 Subject: [PATCH 1368/1889] only flatten an object if it's nested --- milli/Cargo.toml | 1 + milli/src/update/index_documents/transform.rs | 30 ++++++++++++++----- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1295c4384..64497bc13 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,6 +18,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } meilisearch-tokenizer = { git = 
"https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } memmap2 = "0.5.3" diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index cbb6ed428..e94eb170b 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -286,9 +286,11 @@ impl<'a, 'i> Transform<'a, 'i> { })?; self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; - let buffer = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))?; - - self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; + if let Some(buffer) = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { + self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; + } else { + self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?; + } } else { self.new_documents_ids.insert(docid); } @@ -300,8 +302,13 @@ impl<'a, 'i> Transform<'a, 'i> { if let Some(flatten) = flattened_document { self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; } else { - let buffer = self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))?; - self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; + if let Some(buffer) = + self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? + { + self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; + } else { + self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?; + } } progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -326,8 +333,15 @@ impl<'a, 'i> Transform<'a, 'i> { } // Flatten a document from the fields ids map contained in self and insert the new - // created fields. - fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result> { + // created fields. Returns `None` if the document doesn't need to be flattened. + fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result>> { + if obkv + .iter() + .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) + { + return Ok(None); + } + let mut doc = serde_json::Map::new(); for (k, v) in obkv.iter() { @@ -357,7 +371,7 @@ impl<'a, 'i> Transform<'a, 'i> { writer.insert(fid, &value)?; } - Ok(buffer) + Ok(Some(buffer)) } // Flatten a document from a field mapping generated by [create_fields_mapping] From 00f78d6b5a0642db042a33d3e660b4a4177f5d33 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 13 Apr 2022 11:47:20 +0200 Subject: [PATCH 1369/1889] Apply code suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- json-depth-checker/Cargo.toml | 2 +- json-depth-checker/src/lib.rs | 6 +++--- milli/src/update/index_documents/transform.rs | 18 ++++++++---------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 9c386a383..d608a49dc 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -13,4 +13,4 @@ criterion = "0.3" [[bench]] name = "depth" -harness = false \ No newline at end of file +harness = false diff --git a/json-depth-checker/src/lib.rs b/json-depth-checker/src/lib.rs index 0d423aadb..3d0f28af8 100644 --- a/json-depth-checker/src/lib.rs +++ b/json-depth-checker/src/lib.rs @@ -3,9 +3,9 @@ use serde_json::Value; /// Your json MUST BE valid and generated by `serde_json::to_vec` before being /// sent in this function. This function is DUMB and FAST but makes a lot of /// asumption about the way `serde_json` will generate its input. 
-/// Will returns `true` if the json contains an object, an array of array -/// or an array containing an object. -/// Returns `false` for everything else. +/// +/// Will return `true` if the JSON contains an object, an array of array +/// or an array containing an object. Returns `false` for everything else. pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool { if json.is_empty() { return false; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e94eb170b..c215872ca 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -286,10 +286,9 @@ impl<'a, 'i> Transform<'a, 'i> { })?; self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; - if let Some(buffer) = self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { - self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; - } else { - self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?; + match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, } } else { self.new_documents_ids.insert(docid); @@ -302,12 +301,11 @@ impl<'a, 'i> Transform<'a, 'i> { if let Some(flatten) = flattened_document { self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; } else { - if let Some(buffer) = - self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? - { - self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?; - } else { - self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?; + match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => { + self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? 
+ } } } From 8d630a6f6269b9dda4a9899896479b9e6cfcc5a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 13 Apr 2022 13:17:33 +0200 Subject: [PATCH 1370/1889] Update version for the next release (v0.26.1) --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 79ace436a..06ac4ddb5 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.26.0" +version = "0.26.1" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 633ac2cc7..ea58d874d 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.26.0" +version = "0.26.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 2a4fea85f..933129c70 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.26.0" +version = "0.26.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 3a61b6165..0401db86d 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.26.0" +version = "0.26.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 64497bc13..ef9e33c95 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.26.0" +version = "0.26.1" authors = ["Kerollmops "] edition = "2018" From f1115e274ff4fc055d15c10f2cb8517d6b34e84b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Apr 2022 10:35:50 +0200 Subject: [PATCH 1371/1889] Use Copy impl of FormatOptions instead of cloning --- milli/src/search/matches/mod.rs | 54 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index ad4f6cd69..c7812aa77 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -499,7 +499,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options.clone()), &text); + assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -507,7 +507,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text.
- assert_eq!(&matcher.format(format_options.clone()), &text); + assert_eq!(&matcher.format(format_options), &text); } #[test] @@ -532,14 +532,14 @@ mod tests { let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ""); + assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ":-)"); + assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -547,7 +547,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(format_options.clone()), &text); + assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -555,7 +555,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -564,7 +564,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves." ); } @@ -589,7 +589,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑôle"); + assert_eq!(&matcher.format(format_options), "Ŵôřlḑôle"); // Text containing unicode match. let text = "Ŵôřlḑ"; @@ -597,7 +597,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑ"); + assert_eq!(&matcher.format(format_options), "Ŵôřlḑ"); // Text containing unicode match. let text = "Westfália"; @@ -605,7 +605,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. 
- assert_eq!(&matcher.format(format_options.clone()), "Westfália"); + assert_eq!(&matcher.format(format_options), "Westfália"); } #[test] @@ -622,14 +622,14 @@ mod tests { let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ""); + assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ":-)"); + assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -638,7 +638,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "A quick brown fox can not jump 32 feet, right…" ); @@ -649,7 +649,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "(A quick brown fox can not jump 32 feet, right…" ); @@ -660,7 +660,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // should crop the phrase instead of croping around the match. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…Split The World is a book written by Emily Henry…" ); @@ -671,7 +671,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…future to build a world with the boy she loves…" ); @@ -682,7 +682,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…she loves. Emily Henry: The Love That Split The World." ); @@ -693,7 +693,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…void void void void void split the world void void" ); @@ -704,7 +704,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…void void void void void split the world void void" ); @@ -715,7 +715,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…void void void void void split the world void void" ); } @@ -734,14 +734,14 @@ mod tests { let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ""); + assert_eq!(&matcher.format(format_options), ""); // text containing only separators. 
let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ":-)"); + assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -750,7 +750,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "A quick brown fox can not jump 32 feet, right…" ); @@ -761,7 +761,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…future to build a world with the boy she loves…" ); @@ -771,7 +771,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "…she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; @@ -780,7 +780,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…void void void void void split the world void void" ); } From 8b14090927d2ee6c0958f29f98cd52b996599c83 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Mon, 18 Apr 2022 22:48:22 +0200 Subject: [PATCH 1372/1889] fix min-word-len-for-typo not reset properly --- milli/src/update/settings.rs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 7dd37ccc2..ff59249b7 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -10,6 +10,7 @@ use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; +use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, Result}; @@ -46,6 +47,14 @@ impl Setting { pub const fn is_not_set(&self) -> bool { matches!(self, Self::NotSet) } + + /// If `Self` is `Reset`, then map self to `Set` with the provided `val`. 
+ pub fn or_reset(self, val: T) -> Self { + match self { + Self::Reset => Self::Set(val), + otherwise => otherwise, + } + } } impl<T: Serialize> Serialize for Setting<T> { @@ -535,7 +544,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } fn update_min_typo_word_len(&mut self) -> Result<()> { - match (self.min_word_len_one_typo, self.min_word_len_two_typos) { + let one = self.min_word_len_one_typo.or_reset(DEFAULT_MIN_WORD_LEN_ONE_TYPO); + let two = self.min_word_len_two_typos.or_reset(DEFAULT_MIN_WORD_LEN_TWO_TYPOS); + match (one, two) { (Setting::Set(one), Setting::Set(two)) => { if one > two { return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); @@ -1422,6 +1433,20 @@ mod tests { assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8); assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8); + + let mut txn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut txn, &index, &config); + + builder.reset_min_word_len_one_typo(); + builder.reset_min_word_len_two_typos(); + builder.execute(|_| ()).unwrap(); + + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS); } #[test] From 152a10344c098c1f227fd339c75dea941aa728b0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 13 Apr 2022 15:24:54 +0200 Subject: [PATCH 1373/1889] Get rid of the threshold when comparing benchmarks It just hides things --- benchmarks/scripts/compare.sh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh index 84d1dc0e6..ff49144f3 100755 --- a/benchmarks/scripts/compare.sh +++ b/benchmarks/scripts/compare.sh @@ -35,10 +35,4 @@ done path_list=$(echo " $@" | sed 's/ / \/tmp\//g') -if [[ ${#@} -gt 1 ]]; then - # Print the diff changes between the old and new benchmarks - # by only displaying the lines that have a diff of more than 5%. - critcmp --threshold 5 $path_list -else - critcmp $path_list -fi +critcmp $path_list From d81a3f4a74b4a33bbe1c39890cf74e9ef30060fb Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 20 Apr 2022 16:11:23 +0200 Subject: [PATCH 1374/1889] improve the fuzzer of the flatten crate --- flatten-serde-json/fuzz/Cargo.toml | 5 +++-- flatten-serde-json/fuzz/fuzz_targets/flatten.rs | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/flatten-serde-json/fuzz/Cargo.toml b/flatten-serde-json/fuzz/Cargo.toml index 2e0510d5f..52b514785 100644 --- a/flatten-serde-json/fuzz/Cargo.toml +++ b/flatten-serde-json/fuzz/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "flatten_serde_json-fuzz" +name = "flatten-serde-json-fuzz" version = "0.0.0" authors = ["Automatically generated"] publish = false @@ -11,8 +11,9 @@ cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4" arbitrary-json = "0.1.1" +json-depth-checker = { path = "../../json-depth-checker" } -[dependencies.flatten_serde_json] +[dependencies.flatten-serde-json] path = ".."
# Prevent this from interfering with workspaces diff --git a/flatten-serde-json/fuzz/fuzz_targets/flatten.rs b/flatten-serde-json/fuzz/fuzz_targets/flatten.rs index 399d1c484..97969dfab 100644 --- a/flatten-serde-json/fuzz/fuzz_targets/flatten.rs +++ b/flatten-serde-json/fuzz/fuzz_targets/flatten.rs @@ -1,8 +1,12 @@ #![no_main] use arbitrary_json::ArbitraryObject; use flatten_serde_json::flatten; +use json_depth_checker::should_flatten_from_value; use libfuzzer_sys::fuzz_target; fuzz_target!(|object: ArbitraryObject| { - let _ = flatten(&object); + let object = flatten(&object); + if !object.is_empty() { + assert!(object.values().any(|value| !should_flatten_from_value(value))); + } }); From eb5830aa407660b3dc718dc4947c1f6a1aed9ee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 21 Apr 2022 13:45:28 +0200 Subject: [PATCH 1375/1889] Add a test to make sure that long words are handled --- milli/src/update/index_documents/mod.rs | 26 +++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ae353b0df..6a671129f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1623,4 +1623,30 @@ mod tests { let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); assert_eq!(documents_ids.len(), 1); } + + /// We try to index documents with words that are too long here, + /// it should not return any error. + #[test] + fn text_with_too_long_words() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let content = documents!([ + {"id": 1, "title": "a".repeat(256) }, + {"id": 2, "title": "b".repeat(512) }, + {"id": 3, "title": format!("{} {}", "c".repeat(250), "d".repeat(250)) }, + ]); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + } } From 3a2451fcbad9ab29dae49e281d56210d06bf9536 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 21 Apr 2022 13:52:09 +0200 Subject: [PATCH 1376/1889] add test normalize exact words --- milli/src/update/settings.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ff59249b7..3c0c0fbee 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1461,4 +1461,22 @@ mod tests { builder.set_min_word_len_two_typos(7); assert!(builder.execute(|_| ()).is_err()); } + + #[test] + fn update_exact_words_normalization() { + let index = TempIndex::new(); + let config = IndexerConfig::default(); + + // Set the genres setting + let mut txn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut txn, &index, &config); + + let words = btreeset! 
{ S("Ab"), S("ac") }; + builder.set_exact_words(words); + assert!(builder.execute(|_| ()).is_ok()); + let exact_words = index.exact_words(&txn).unwrap(); + for word in exact_words.into_fst().stream().into_str_vec().unwrap() { + assert!(word.0 == "ac" || word.0 == "ab"); + } + } } From 2e0089d5ff65adad7351b821b0bcb7eb7004479e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 21 Apr 2022 14:09:33 +0200 Subject: [PATCH 1377/1889] normalize exact words --- milli/src/update/settings.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 3c0c0fbee..d49915787 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -580,6 +580,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_exact_words(&mut self) -> Result<()> { match self.exact_words { Setting::Set(ref mut words) => { + fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String { + analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect() + } + + let mut config = AnalyzerConfig::default(); + let stop_words = self.index.stop_words(self.wtxn)?; + if let Some(stop_words) = &stop_words { + config.stop_words(stop_words); + } + let analyzer = Analyzer::new(config); + + let mut words: Vec<_> = + words.iter().map(|word| normalize(&analyzer, word)).collect(); + + // normalization could reorder words + words.sort_unstable(); + let words = fst::Set::from_iter(words.iter())?; self.index.put_exact_words(&mut self.wtxn, &words)?; } From dc0d4addd93eae1bfdb67a5fe0638bdc0ef95a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 21 Apr 2022 19:02:22 +0200 Subject: [PATCH 1378/1889] First version of new CONTRIBUTING.md --- CONTRIBUTING.md | 130 ++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 43 ++-------------- 2 files changed, 134 insertions(+), 39 deletions(-) create mode 100644 CONTRIBUTING.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..91fd034e3 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,130 @@ +# Contributing + +First, thank you for contributing to Meilisearch! The goal of this document is to provide everything you need to start contributing to Milli, the search engine of Meilisearch. + +Remember that there are many ways to contribute other than writing code: writing [tutorials or blog posts](https://github.com/meilisearch/awesome-meilisearch), improving [the documentation](https://github.com/meilisearch/documentation), submitting [bug reports](https://github.com/meilisearch/milli/issues/new) and [feature requests](https://github.com/meilisearch/product/discussions/categories/feedback-feature-proposal)... + +## Table of Contents +- [Assumptions](#assumptions) +- [How to Contribute](#how-to-contribute) +- [Development Workflow](#development-workflow) +- [Git Guidelines](#git-guidelines) +- [Release Process (for internal team only)](#release-process-for-internal-team-only) + +## Assumptions + +1. **You're familiar with [GitHub](https://github.com) and the [Pull Requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)(PR) workflow.** +2. **You've read the Meilisearch [documentation](https://docs.meilisearch.com).** +3. **You know about the [Meilisearch community](https://docs.meilisearch.com/learn/what_is_meilisearch/contact.html). + Please use this for help.** + +## How to Contribute + +1. Ensure your change has an issue! 
Find an
+   [existing issue](https://github.com/meilisearch/milli/issues/) or [open a new issue](https://github.com/meilisearch/milli/issues/new).
+   * This is where you can get a feel for whether the change will be accepted or not.
+2. Once approved, [fork the Milli repository](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) in your own GitHub account.
+3. [Create a new Git branch](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-and-deleting-branches-within-your-repository).
+4. Review the [Development Workflow](#development-workflow) section that describes the steps to maintain the repository.
+5. Make your changes on your branch.
+6. [Submit the branch as a Pull Request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request-from-a-fork) pointing to the `main` branch of the Meilisearch repository. A maintainer should comment and/or review your Pull Request within a few days, although it may take longer depending on the circumstances.
+
+## Development Workflow
+
+_WIP section_
+
+### Setup and run
+
+```bash
+cargo run --release
+```
+
+We recommend using the `--release` flag to test the full performance.
+
+### Test
+
+```bash
+cargo test
+```
+
+### Querying the engine via the web interface
+
+To help you develop your feature, you might need to use a web interface! You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700).
+
+### Compile and run the HTTP debug server
+
+You can specify the number of threads to use to index documents and many other settings too.
+
+```bash
+cd http-ui
+cargo run --release -- --db my-database.mdb -vvv --indexing-jobs 8
+```
+
+### Index your documents
+
+It can index a massive number of documents in very little time; I have already managed to index:
+ - 115m songs (song and artist name) in \~48min, taking 81GiB on disk.
+ - 12m cities (name, timezone and country ID) in \~4min, taking 6GiB on disk.
+
+These measurements were made on a MacBook Pro with the M1 processor.
+
+You can feed the engine with your CSV (comma-separated, yes) data like this:
+
+```bash
+printf "id,name,age\n1,hello,32\n2,kiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv
+```
+
+Don't forget to specify the `id` of the documents. Also, note that it supports JSON and JSON
+streaming: you can send them to the engine by using the `content-type:application/json` and
+`content-type:application/x-ndjson` headers respectively.
+
+## Git Guidelines
+
+### Git Branches
+
+All changes must be made in a branch and submitted as a PR.
+
+We do not enforce any branch naming style, but please use something descriptive of your changes.
+
+### Git Commits
+
+As minimal requirements, your commit message should:
+- be capitalized
+- not end with a dot or any other punctuation character (!, ?)
+- start with a verb so that we can read your commit message this way: "This commit will ...", where "..." is the commit message.
+  e.g.: "Fix the home page button" or "Add more tests for create_index method"
+
+We don't follow any other convention, but if you want to use one, we recommend [the Chris Beams one](https://chris.beams.io/posts/git-commit/).
+
+### GitHub Pull Requests
+
+Some notes on GitHub PRs:
+
+- All PRs must be reviewed and approved by at least one maintainer.
+- The PR title should be accurate and descriptive of the changes. The title of the PR will indeed be automatically added to the next [release changelogs](https://github.com/meilisearch/milli/releases/).
+- [Convert your PR to a draft](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/changing-the-stage-of-a-pull-request) if your changes are a work in progress: no one will review it until you mark your PR as ready for review.
+  Draft PRs are recommended when you want to show that you are working on something and to make your work visible.
+- The branch related to the PR must be **up-to-date with `main`** before merging. Fortunately, this project uses [Bors](https://github.com/bors-ng/bors-ng) to automatically enforce this requirement without the PR author having to rebase manually.
+
+## Release Process (for internal team only)
+
+Meilisearch tools follow the [Semantic Versioning Convention](https://semver.org/).
+
+### Automation to Rebase and Merge the PRs
+
+This project integrates a bot that helps us manage pull request merging.
+_[Read more about this](https://github.com/meilisearch/integration-guides/blob/main/resources/bors.md)._ + +### Automated changelogs + +This project integrates a tool to create automated changelogs: the [release-drafter](https://github.com/release-drafter/release-drafter/). + +### How to Publish the Release + +Make a PR modifying all the `Cargo.toml` files with the right version. + +Once the changes are merged on `main`, you can publish the current draft release via the [GitHub interface](https://github.com/meilisearch/milli/releases): on this page, click on `Edit` (related to the draft release) > update the description if needed > when you are ready, click on `Publish release`. + +
+
+Thank you again for reading this all the way through; we cannot wait to start working with you now that you have made your way through this contributing guide ❤️
diff --git a/README.md b/README.md
index aa8770159..4bf6a0d70 100644
--- a/README.md
+++ b/README.md
@@ -20,50 +20,15 @@ This repository contains crates to quickly debug the engine:
 - The `search` crate is a simple command-line that helps run [flamegraph] on top of it.
 - The `helpers` crate is only used to modify the database inplace, sometimes.

-### Compile and run the HTTP debug server
+## How to use it?

-You can specify the number of threads to use to index documents and many other settings too.
-
-```bash
-cd http-ui
-cargo run --release -- --db my-database.mdb -vvv --indexing-jobs 8
-```
-
-### Index your documents
-
-It can index a massive amount of documents in not much time, I already achieved to index:
- - 115m songs (song and artist name) in \~48min and take 81GiB on disk.
- - 12m cities (name, timezone and country ID) in \~4min and take 6GiB on disk.
-
-These metrics are done on a MacBook Pro with the M1 processor.
-
-You can feed the engine with your CSV (comma-separated, yes) data like this:
-
-```bash
-printf "id,name,age\n1,hello,32\n2,kiki,24\n" | http POST 127.0.0.1:9700/documents content-type:text/csv
-```
-
-Don't forget to specify the `id` of the documents. Also, note that it supports JSON and JSON
-streaming: you can send them to the engine by using the `content-type:application/json` and
-`content-type:application/x-ndjson` headers respectively.
-
-### Querying the engine via the website
-
-You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700).
+_Section in WIP_

 ## Contributing

-You can setup a `git-hook` to stop you from making a commit too fast. It'll stop you if:
-- Any of the workspaces does not build
-- Your code is not well-formatted
+We're glad you're thinking about contributing to this repository! Feel free to pick an issue, and to ask any questions you need. Some points might not be clear, and we are available to help you!

-These two things are also checked in the CI, so ignoring the hook won't help you merge your code.
-But if you need to, you can still add `--no-verify` when creating your commit to ignore the hook.
-
-To enable the hook, run the following command from the root of the project:
-```
-cp script/pre-commit .git/hooks/pre-commit
-```
+Also, we recommend following the [CONTRIBUTING.md](/CONTRIBUTING.md) to create your PR.
[Meilisearch]: https://github.com/meilisearch/meilisearch [flamegraph]: https://github.com/flamegraph-rs/flamegraph From 08753d002a56e5a8d5d6495ad300574bf98b6ddd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Apr 2022 13:39:45 +0200 Subject: [PATCH 1379/1889] Remove pr_status from bors settings --- bors.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/bors.toml b/bors.toml index 8ce9ec3da..717fd69d1 100644 --- a/bors.toml +++ b/bors.toml @@ -4,6 +4,5 @@ status = [ 'Tests on windows-latest with stable', 'Run Rustfmt', ] -pr_status = ['Specify breaking'] # 3 hours timeout timeout-sec = 10800 From fb192aaa9f7772a0ad4d72b32c9245ce5134c4b6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 25 Apr 2022 15:55:38 +0200 Subject: [PATCH 1380/1889] Update the list of milli's subcrates --- README.md | 7 +++++-- cli/Cargo.toml | 2 -- filter-parser/Cargo.toml | 3 +-- helpers/Cargo.toml | 1 + json-depth-checker/Cargo.toml | 3 +-- json-depth-checker/src/lib.rs | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 4bf6a0d70..31e71b603 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,13 @@ to process one update at a time. This repository contains crates to quickly debug the engine: - There are benchmarks located in the `benchmarks` crate. + - The `cli` crate is a simple command-line interface that helps run [flamegraph] on top of it. + - The `filter-parser` crate contains the parser for the Meilisearch filter syntax. + - The `flatten-serde-json` crate contains the library that flattens serde-json `Value` objects like elastic search does. + - The `helpers` crate is only used to do operations on the database. - The `http-ui` crate is a simple HTTP dashboard to tests the features like for real! - The `infos` crate is used to dump the internal data-structure and ensure correctness. - - The `search` crate is a simple command-line that helps run [flamegraph] on top of it. - - The `helpers` crate is only used to modify the database inplace, sometimes. + - The `json-depth-checker` crate is used to indicate if a JSON must be flattened. ## How to use it? 
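As a reference for the crate list above, here is a minimal sketch of what "flattening like Elasticsearch" means for a nested document, using only the public `flatten` function that the fuzz target earlier in this series also calls (the exact handling of arrays and empty objects is an implementation detail of the crate; see its own README for the full rules):

```rust
use flatten_serde_json::flatten;
use serde_json::{json, Map, Value};

fn main() {
    // A nested document, as milli could receive it.
    let object: Map<String, Value> = json!({
        "a": { "b": "c", "d": ["e", "f"] }
    })
    .as_object()
    .unwrap()
    .clone();

    // Nested keys are collapsed into dot-separated keys,
    // e.g. the inner "b" becomes reachable as "a.b".
    let flattened = flatten(&object);
    assert_eq!(flattened.get("a.b"), Some(&json!("c")));
}
```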
diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 06ac4ddb5..de302b895 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -4,8 +4,6 @@ version = "0.26.1" edition = "2018" description = "A CLI to interact with a milli index" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - [dependencies] indicatif = "0.16.2" serde = "1.0.136" diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index ea29404ed..fe069ccb2 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -2,8 +2,7 @@ name = "filter-parser" version = "0.1.0" edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +description = "The parser for the Meilisearch filter syntax" [dependencies] nom = "7.1.0" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index ea58d874d..d71009c91 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -3,6 +3,7 @@ name = "helpers" version = "0.26.1" authors = ["Clément Renault "] edition = "2018" +description = "A small tool to do operations on the database" [dependencies] anyhow = "1.0.56" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index d608a49dc..9d99a47d8 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -2,8 +2,7 @@ name = "json-depth-checker" version = "0.1.0" edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +description = "A library that indicates if a JSON must be flattened" [dependencies] serde_json = "1.0" diff --git a/json-depth-checker/src/lib.rs b/json-depth-checker/src/lib.rs index 3d0f28af8..d571a0ca5 100644 --- a/json-depth-checker/src/lib.rs +++ b/json-depth-checker/src/lib.rs @@ -1,6 +1,6 @@ use serde_json::Value; -/// Your json MUST BE valid and generated by `serde_json::to_vec` before being +/// Your json MUST BE valid and generated by `serde_json::to_vec` before being /// sent in this function. This function is DUMB and FAST but makes a lot of /// asumption about the way `serde_json` will generate its input. /// From 7e19bf1c0eadc9f2da84fef0395ef508aebd17a9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 25 Apr 2022 17:25:46 +0200 Subject: [PATCH 1381/1889] Add an example usage of the library in the README --- README.md | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 31e71b603..20b12329b 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,68 @@ This repository contains crates to quickly debug the engine: ## How to use it? -_Section in WIP_ +Milli is a library that does search things, it must be embedded in a program. +You can compute the documentation of it by using `cargo doc --open`. + +Here is an example usage of the library where we insert documents into the engine +and search for one of them just after. 
+ +```rust +let path = tempfile::tempdir().unwrap(); +let mut options = EnvOpenOptions::new(); +options.map_size(10 * 1024 * 1024); // 10 MB +let index = Index::new(options, &path).unwrap(); + +let mut wtxn = index.write_txn().unwrap(); +let content = documents!([ + { + "id": 2, + "title": "Prideand Prejudice", + "au{hor": "Jane Austin", + "genre": "romance", + "price$": "3.5$", + }, + { + "id": 456, + "title": "Le Petit Prince", + "au{hor": "Antoine de Saint-Exupéry", + "genre": "adventure", + "price$": "10.0$", + }, + { + "id": 1, + "title": "Wonderland", + "au{hor": "Lewis Carroll", + "genre": "fantasy", + "price$": "25.99$", + }, + { + "id": 4, + "title": "Harry Potter ing fantasy\0lood Prince", + "au{hor": "J. K. Rowling", + "genre": "fantasy\0", + }, +]); + +let config = IndexerConfig::default(); +let indexing_config = IndexDocumentsConfig::default(); +let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); +builder.add_documents(content).unwrap(); +builder.execute().unwrap(); +wtxn.commit().unwrap(); + + +// You can search in the index now! +let mut rtxn = index.read_txn().unwrap(); +let mut search = Search::new(&rtxn, &index); +search.query("horry"); +search.limit(10); + +let result = search.execute().unwrap(); +assert_eq!(result.documents_ids.len(), 1); +``` ## Contributing From 2db3d602593f4f25eea346122b2b3ff00e15ba09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Mon, 25 Apr 2022 18:14:35 +0200 Subject: [PATCH 1382/1889] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 20b12329b..fbc2587d9 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ This repository contains crates to quickly debug the engine: - There are benchmarks located in the `benchmarks` crate. - The `cli` crate is a simple command-line interface that helps run [flamegraph] on top of it. - The `filter-parser` crate contains the parser for the Meilisearch filter syntax. - - The `flatten-serde-json` crate contains the library that flattens serde-json `Value` objects like elastic search does. + - The `flatten-serde-json` crate contains the library that flattens serde-json `Value` objects like Elasticsearch does. - The `helpers` crate is only used to do operations on the database. - The `http-ui` crate is a simple HTTP dashboard to tests the features like for real! - The `infos` crate is used to dump the internal data-structure and ensure correctness. From 2277172f9c76c4bba7a8dedea37968bef830940e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Mon, 25 Apr 2022 18:14:39 +0200 Subject: [PATCH 1383/1889] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fbc2587d9..08c78bd10 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ This repository contains crates to quickly debug the engine: - The `filter-parser` crate contains the parser for the Meilisearch filter syntax. - The `flatten-serde-json` crate contains the library that flattens serde-json `Value` objects like Elasticsearch does. - The `helpers` crate is only used to do operations on the database. - - The `http-ui` crate is a simple HTTP dashboard to tests the features like for real! + - The `http-ui` crate is a simple HTTP dashboard to test the features like for real! - The `infos` crate is used to dump the internal data-structure and ensure correctness. 
- The `json-depth-checker` crate is used to indicate if a JSON must be flattened. From 5e562ffecfa60495a1d7d23602abce86f364bd8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Mon, 25 Apr 2022 18:14:43 +0200 Subject: [PATCH 1384/1889] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 08c78bd10..5e916905d 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ Milli is a library that does search things, it must be embedded in a program. You can compute the documentation of it by using `cargo doc --open`. Here is an example usage of the library where we insert documents into the engine -and search for one of them just after. +and search for one of them right after. ```rust let path = tempfile::tempdir().unwrap(); From fa6f495662aaa3dbe37c63e2f3d943b1d07e7d81 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 25 Apr 2022 18:32:06 +0200 Subject: [PATCH 1385/1889] fix the indexing fuzzer --- milli/fuzz/fuzz_targets/indexing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index fc51f969a..b618aabad 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -39,7 +39,7 @@ fn index_documents( let mut wtxn = index.write_txn()?; let indexing_config = IndexDocumentsConfig::default(); - let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()); + let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())?; builder.add_documents(documents)?; builder.execute().unwrap(); From d138b3c70400c293047d038d049440fa5a120100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Apr 2022 14:03:21 +0200 Subject: [PATCH 1386/1889] Update version --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 3 ++- filter-parser/Cargo.toml | 3 ++- flatten-serde-json/Cargo.toml | 3 ++- helpers/Cargo.toml | 3 ++- http-ui/Cargo.toml | 3 ++- infos/Cargo.toml | 3 ++- json-depth-checker/Cargo.toml | 3 ++- milli/Cargo.toml | 2 +- 9 files changed, 16 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 0dbbd6d6f..440fe4ce0 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.1.0" +version = "0.27.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index de302b895..83e3053a9 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,8 +1,9 @@ [package] name = "cli" -version = "0.26.1" +version = "0.27.0" edition = "2018" description = "A CLI to interact with a milli index" +publish = false [dependencies] indicatif = "0.16.2" diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index fe069ccb2..a302bc758 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,8 +1,9 @@ [package] name = "filter-parser" -version = "0.1.0" +version = "0.27.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" +publish = false [dependencies] nom = "7.1.0" diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 0220e8ceb..ceb24336e 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "flatten-serde-json" -version = "0.1.0" +version = "0.27.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" +publish = 
false [dependencies] serde_json = "1.0" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index d71009c91..c7cfdc8ea 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "helpers" -version = "0.26.1" +version = "0.27.0" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" +publish = false [dependencies] anyhow = "1.0.56" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 933129c70..888847c26 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,9 +1,10 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.26.1" +version = "0.27.0" authors = ["Clément Renault "] edition = "2018" +publish = false [dependencies] anyhow = "1.0.56" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 0401db86d..fb618121d 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,8 +1,9 @@ [package] name = "infos" -version = "0.26.1" +version = "0.27.0" authors = ["Clément Renault "] edition = "2018" +publish = false [dependencies] anyhow = "1.0.56" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 9d99a47d8..087e7e947 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,8 +1,9 @@ [package] name = "json-depth-checker" -version = "0.1.0" +version = "0.27.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" +publish = false [dependencies] serde_json = "1.0" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ef9e33c95..a104145d1 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.26.1" +version = "0.27.0" authors = ["Kerollmops "] edition = "2018" From 7cb764356509dda95db64c72068bb91186290110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Apr 2022 18:40:57 +0200 Subject: [PATCH 1387/1889] Make nightly CI run every week Update CI Fix CI --- .github/workflows/rust.yml | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7338d134b..978f96f5c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -1,10 +1,11 @@ name: Rust on: + schedule: + - cron: '0 5 * * MON' # Every Monday at 5:00am push: - branches: [ staging, trying ] + branches: [ staging, trying ] # For Bors pull_request: - branches: [ main ] env: CARGO_TERM_COLOR: always @@ -17,15 +18,21 @@ jobs: fail-fast: false matrix: os: [ubuntu-18.04, macos-latest, windows-latest] - rust: - - stable - - nightly steps: - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 + - name: Run test with Rust nightly + if: github.event_name == 'schedule' + uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: ${{ matrix.rust }} + toolchain: nightly + override: true + - name: Run test with Rust stable + if: github.event_name != 'schedule' + uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable override: true - name: Cache dependencies uses: Swatinem/rust-cache@v1.3.0 From f19d2dc548f584841d2592f54574100fb8694b4a Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 25 Apr 2022 14:09:52 +0200 Subject: [PATCH 1388/1889] Only flatten the required fields apply review comments Co-authored-by: Kerollmops --- milli/src/update/index_documents/mod.rs | 16 +- milli/src/update/index_documents/transform.rs | 166 +++++++++++++----- 2 files changed, 135 insertions(+), 47 
deletions(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 6a671129f..35e99a199 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1337,32 +1337,34 @@ mod tests {
         let mut wtxn = index.write_txn().unwrap();
         let mut builder = update::Settings::new(&mut wtxn, &index, &config);
-        builder.set_primary_key("nested.id".to_owned());
+        builder.set_primary_key("complex.nested.id".to_owned());
         builder.execute(|_| ()).unwrap();
         wtxn.commit().unwrap();

         let mut wtxn = index.write_txn().unwrap();
         let content = documents!([
             {
-                "nested": {
-                    "id": 0,
+                "complex": {
+                    "nested": {
+                        "id": 0,
+                    },
                 },
                 "title": "The zeroth document",
             },
             {
-                "nested": {
+                "complex.nested": {
                     "id": 1,
                 },
                 "title": "The first document",
             },
             {
-                "nested": {
-                    "id": 2,
+                "complex": {
+                    "nested.id": 2,
                 },
                 "title": "The second document",
             },
             {
-                "nested.id": 3,
+                "complex.nested.id": 3,
                 "title": "The third document",
             },
         ]);
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index c215872ca..9238212fd 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -340,35 +340,48 @@ impl<'a, 'i> Transform<'a, 'i> {
             return Ok(None);
         }

+        // store the keys and values of the original obkv + the flattened json
+        // We first extract all the key+value out of the obkv. If a value is not nested
+        // we keep a reference on its value. If the value is nested we'll get its value
+        // as an owned `Vec<u8>` after flattening it.
+        let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new();
+
+        // the object we're going to use to store the fields that need to be flattened.
         let mut doc = serde_json::Map::new();

-        for (k, v) in obkv.iter() {
-            let key = self.fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId {
-                field_id: k,
-                process: "Flatten from fields ids map.",
-            })?;
-            let value = serde_json::from_slice::<serde_json::Value>(v)
-                .map_err(crate::error::InternalError::SerdeJson)?;
-            doc.insert(key.to_string(), value);
+        // we recreate a json containing only the fields that need to be flattened.
+        // all the raw values get inserted directly in the `key_value` vec.
+        for (key, value) in obkv.iter() {
+            if json_depth_checker::should_flatten_from_unchecked_slice(value) {
+                let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
+                    field_id: key,
+                    process: "Flatten from fields ids map.",
+                })?;
+
+                let value = serde_json::from_slice::<serde_json::Value>(value)
+                    .map_err(crate::error::InternalError::SerdeJson)?;
+                doc.insert(key.to_string(), value);
+            } else {
+                key_value.push((key, value.into()));
+            }
         }

         let flattened = flatten_serde_json::flatten(&doc);

-        // Once we have the flattened version we can convert it back to obkv and
-        // insert all the new generated fields_ids (if any) in the fields ids map.
-        let mut buffer: Vec<u8> = Vec::new();
-        let mut writer = KvWriter::new(&mut buffer);
-        let mut flattened: Vec<_> = flattened.into_iter().collect();
-        // we reorder the field to get all the known field first
-        flattened
-            .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX));
-
-        for (key, value) in flattened {
+        // Once we have the flattened version we insert all the new generated fields_ids
+        // (if any) in the fields ids map and serialize the value.
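+        // For example, flattening `{ "a": { "b": 1 } }` yields a single `a.b` entry,
+        // whose field id is created on the fly below if it is not yet known.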
+        for (key, value) in flattened.into_iter() {
             let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
-            writer.insert(fid, &value)?;
+            key_value.push((fid, value.into()));
         }

+        // we sort the keys. If there was a conflict between the obkv and the newly
+        // generated value, the keys will be consecutive.
+        key_value.sort_unstable_by_key(|(key, _)| *key);
+
+        let mut buffer = Vec::new();
+        Self::create_obkv_from_key_value(&mut key_value, &mut buffer)?;

         Ok(Some(buffer))
     }
@@ -380,41 +393,114 @@
         output_buffer: &mut Vec<u8>,
         field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>,
     ) -> Result<()> {
+        // store the keys and values of the json + the original obkv
+        let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new();
+
         // if the primary_key is nested we need to flatten the document before being able to do anything
         let mut doc = serde_json::Map::new();

-        for (k, v) in obkv.iter() {
-            let key =
-                mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?;
-            let key = self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId {
-                field_id: *key,
-                process: "Flatten from field mapping.",
-            })?;
-            let value =
-                serde_json::from_slice::<serde_json::Value>(v).map_err(InternalError::SerdeJson)?;
-            doc.insert(key.to_string(), value);
+        // we recreate a json containing only the fields that need to be flattened.
+        // all the raw values get inserted directly in the `key_value` vec.
+        for (key, value) in obkv.iter() {
+            if json_depth_checker::should_flatten_from_unchecked_slice(value) {
+                let key =
+                    mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?;
+                let key =
+                    self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId {
+                        field_id: *key,
+                        process: "Flatten from field mapping.",
+                    })?;
+                let value = serde_json::from_slice::<serde_json::Value>(value)
+                    .map_err(InternalError::SerdeJson)?;
+                doc.insert(key.to_string(), value);
+            } else {
+                key_value.push((key, value.into()));
+            }
         }

         let flattened = flatten_serde_json::flatten(&doc);

-        // Once we have the flattened version we can convert it back to obkv and
-        // insert all the new generated fields_ids (if any) in the fields ids map.
-        output_buffer.clear();
-        let mut writer = KvWriter::new(output_buffer);
-        let mut flattened: Vec<_> = flattened.into_iter().collect();
-        // we reorder the field to get all the known field first
-        flattened
-            .sort_unstable_by_key(|(key, _)| self.fields_ids_map.id(&key).unwrap_or(FieldId::MAX));
-
-        for (key, value) in flattened {
+        // Once we have the flattened version we insert all the new generated fields_ids
+        // (if any) in the fields ids map and serialize the value.
+        for (key, value) in flattened.into_iter() {
             let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
-            writer.insert(fid, &value)?;
+            key_value.push((fid, value.clone().into()));
+
+            if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() {
                 field_buffer_cache.push((fid, value.into()));
             }
         }

+        // we sort the keys. If there was a conflict between the obkv and the newly
+        // generated value, the keys will be consecutive.
+        key_value.sort_unstable_by_key(|(key, _)| *key);
+
+        Self::create_obkv_from_key_value(&mut key_value, output_buffer)?;
         Ok(())
     }
+
+    /// Generate an obkv from a slice of key / value sorted by key.
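+    /// When the same field id appears twice (once from the original obkv and once from
+    /// the flattening step), the two values are merged into a single JSON array, e.g.
+    /// two entries `"jean"` and `"paul"` for one field become `["jean", "paul"]`.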
+    fn create_obkv_from_key_value(
+        key_value: &mut [(FieldId, Cow<[u8]>)],
+        output_buffer: &mut Vec<u8>,
+    ) -> Result<()> {
+        debug_assert!(
+            key_value.windows(2).all(|vec| vec[0].0 <= vec[1].0),
+            "The slice of key / value pair must be sorted."
+        );
+
+        output_buffer.clear();
+        let mut writer = KvWriter::new(output_buffer);
+
+        let mut skip_next_value = false;
+        for things in key_value.windows(2) {
+            if skip_next_value {
+                skip_next_value = false;
+                continue;
+            }
+            let (key1, value1) = &things[0];
+            let (key2, value2) = &things[1];
+
+            // now we're going to look for conflicts between the keys. For example the following documents would cause a conflict:
+            // { "doggo.name": "jean", "doggo": { "name": "paul" } }
+            // we should find a first "doggo.name" from the obkv and a second one from the flattening.
+            // but we must generate the following document:
+            // { "doggo.name": ["jean", "paul"] }
+            // thus we're going to merge the value from the obkv and the flattened document in a single array and skip the next
+            // iteration.
+            if key1 == key2 {
+                skip_next_value = true;
+
+                let value1 = serde_json::from_slice(value1)
+                    .map_err(crate::error::InternalError::SerdeJson)?;
+                let value2 = serde_json::from_slice(value2)
+                    .map_err(crate::error::InternalError::SerdeJson)?;
+                let value = match (value1, value2) {
+                    (Value::Array(mut left), Value::Array(mut right)) => {
+                        left.append(&mut right);
+                        Value::Array(left)
+                    }
+                    (Value::Array(mut array), value) | (value, Value::Array(mut array)) => {
+                        array.push(value);
+                        Value::Array(array)
+                    }
+                    (left, right) => Value::Array(vec![left, right]),
+                };
+
+                let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
+                writer.insert(*key1, value)?;
+            } else {
+                writer.insert(*key1, value1)?;
+            }
+        }
+
+        if !skip_next_value {
+            // the unwrap is safe here, we know there was at least one value in the document
+            let (key, value) = key_value.last().unwrap();
+            writer.insert(*key, value)?;
+        }
+
+        Ok(())
+    }

From 5c29258e8e5dabd81a4f789f6b3049fe910fa950 Mon Sep 17 00:00:00 2001
From: ad hoc
Date: Thu, 21 Apr 2022 14:18:08 +0200
Subject: [PATCH 1389/1889] fix cargo warnings

---
 milli/src/search/query_tree.rs | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 2db4e06d5..02fc0747a 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -100,6 +100,7 @@ impl QueryKind {
         QueryKind::Exact { original_typo: 0, word }
     }

+    #[cfg(test)]
     pub fn exact_with_typo(original_typo: u8, word: String) -> Self {
         QueryKind::Exact { original_typo, word }
     }
@@ -108,14 +109,6 @@ impl QueryKind {
         QueryKind::Tolerant { typo, word }
     }

-    pub fn is_tolerant(&self) -> bool {
-        matches!(self, QueryKind::Tolerant { .. })
-    }
-
-    pub fn is_exact(&self) -> bool {
-        matches!(self, QueryKind::Exact { .. })
-    }
-
     pub fn typo(&self) -> u8 {
         match self {
             QueryKind::Tolerant { typo, ..
} => *typo, From ec8903048339db5db50770637c4738ac8472dea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 26 Apr 2022 17:36:04 +0200 Subject: [PATCH 1390/1889] Update bors toml --- .github/workflows/rust.yml | 2 +- bors.toml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 978f96f5c..f59a1e9a8 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -12,7 +12,7 @@ env: jobs: tests: - name: Tests on ${{ matrix.os }} with ${{ matrix.rust }} + name: Tests on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false diff --git a/bors.toml b/bors.toml index 717fd69d1..9b75c79d5 100644 --- a/bors.toml +++ b/bors.toml @@ -1,7 +1,7 @@ status = [ - 'Tests on ubuntu-18.04 with stable', - 'Tests on macos-latest with stable', - 'Tests on windows-latest with stable', + 'Tests on ubuntu-18.04', + 'Tests on macos-latest', + 'Tests on windows-latest', 'Run Rustfmt', ] # 3 hours timeout From 7d1c2d97bf29efe7503287cb18173fd8e6d349eb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 26 Apr 2022 17:59:53 +0200 Subject: [PATCH 1391/1889] Return facets even when there is no values associated to it --- milli/src/search/facet/facet_distribution.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 2208ee636..ddbcb2b68 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -243,9 +243,7 @@ impl<'a> FacetDistribution<'a> { for (fid, name) in fields_ids_map.iter() { if crate::is_faceted(name, &fields) { let values = self.facet_values(fid)?; - if !values.is_empty() { - distribution.insert(name.to_string(), values); - } + distribution.insert(name.to_string(), values); } } From a4d343aade38b97a53baedcba7cad37c5665a712 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 26 Apr 2022 18:12:58 +0200 Subject: [PATCH 1392/1889] Add a test to check for the returned facet distribution --- milli/tests/search/facet_distribution.rs | 77 ++++++++++++++++++++++++ milli/tests/search/mod.rs | 1 + 2 files changed, 78 insertions(+) create mode 100644 milli/tests/search/facet_distribution.rs diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs new file mode 100644 index 000000000..d3aece2ab --- /dev/null +++ b/milli/tests/search/facet_distribution.rs @@ -0,0 +1,77 @@ +use std::io::Cursor; + +use big_s::S; +use heed::EnvOpenOptions; +use maplit::hashset; +use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use milli::{FacetDistribution, Index}; + +#[test] +fn test_facet_distribution_with_no_facet_values() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_filterable_fields(hashset! 
{ + S("genres"), + S("tags"), + }); + builder.execute(|_| ()).unwrap(); + + // index documents + let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; + let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let mut cursor = Cursor::new(Vec::new()); + let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let reader = Cursor::new( + r#"[ + { + "id": 123, + "title": "What a week, hu...", + "genres": [], + "tags": ["blue"] + }, + { + "id": 345, + "title": "I am the pig!", + "tags": ["red"] + } + ]"#, + ); + + for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); + documents_builder.extend_from_json(doc).unwrap(); + } + + documents_builder.finish().unwrap(); + + cursor.set_position(0); + + // index documents + let content = DocumentBatchReader::from_reader(cursor).unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + let mut distrib = FacetDistribution::new(&txn, &index); + distrib.facets(vec!["genres"]); + let result = distrib.execute().unwrap(); + assert_eq!(result["genres"].len(), 0); + + let mut distrib = FacetDistribution::new(&txn, &index); + distrib.facets(vec!["tags"]); + let result = distrib.execute().unwrap(); + assert_eq!(result["tags"].len(), 2); +} diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c72ca8ba3..12e9861fa 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -13,6 +13,7 @@ use serde::Deserialize; use slice_group_by::GroupBy; mod distinct; +mod facet_distribution; mod filters; mod query_criteria; mod sort; From 3eb3f0269e484ffb0fa2d0b2a68c5a160abf8730 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 21 Apr 2022 14:19:39 +0200 Subject: [PATCH 1393/1889] deny warnings in CI --- .github/workflows/rust.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f59a1e9a8..09cd99b80 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -9,6 +9,7 @@ on: env: CARGO_TERM_COLOR: always + RUSTFLAGS: "-D warnings" jobs: tests: From 1ee3d6ae33a6d707f474585bfdc81a41655723c8 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Fri, 29 Apr 2022 16:13:18 +0200 Subject: [PATCH 1394/1889] fix mistake in Settings initialization --- milli/src/update/settings.rs | 50 +++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index d49915787..ab42d750c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -126,9 +126,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { primary_key: Setting::NotSet, authorize_typos: Setting::NotSet, exact_words: Setting::NotSet, - min_word_len_two_typos: Setting::Reset, - min_word_len_one_typo: Setting::Reset, - exact_attributes: Setting::Reset, + min_word_len_two_typos: Setting::NotSet, + min_word_len_one_typo: Setting::NotSet, + exact_attributes: Setting::NotSet, indexer_config, } } @@ -1496,4 +1496,48 @@ mod tests { assert!(word.0 == "ac" || word.0 == "ab"); } } + + #[test] + fn test_correct_settings_init() { + let index = TempIndex::new(); + let config = IndexerConfig::default(); + + // Set the genres setting + let mut txn = 
index.write_txn().unwrap(); + let builder = Settings::new(&mut txn, &index, &config); + let Settings { + wtxn: _, + index: _, + indexer_config: _, + searchable_fields, + displayed_fields, + filterable_fields, + sortable_fields, + criteria, + stop_words, + distinct_field, + synonyms, + primary_key, + authorize_typos, + min_word_len_two_typos, + min_word_len_one_typo, + exact_words, + exact_attributes, + } = builder; + + assert!(matches!(searchable_fields, Setting::NotSet)); + assert!(matches!(displayed_fields, Setting::NotSet)); + assert!(matches!(filterable_fields, Setting::NotSet)); + assert!(matches!(sortable_fields, Setting::NotSet)); + assert!(matches!(criteria, Setting::NotSet)); + assert!(matches!(stop_words, Setting::NotSet)); + assert!(matches!(distinct_field, Setting::NotSet)); + assert!(matches!(synonyms, Setting::NotSet)); + assert!(matches!(primary_key, Setting::NotSet)); + assert!(matches!(authorize_typos, Setting::NotSet)); + assert!(matches!(min_word_len_two_typos, Setting::NotSet)); + assert!(matches!(min_word_len_one_typo, Setting::NotSet)); + assert!(matches!(exact_words, Setting::NotSet)); + assert!(matches!(exact_attributes, Setting::NotSet)); + } } From 3cb1f6d0a16f61ee6bd570f03135393f13b1705f Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 2 May 2022 19:19:50 +0200 Subject: [PATCH 1395/1889] improve geosearch error messages --- milli/src/error.rs | 49 +++++-- .../extract/extract_geo_points.rs | 41 +++++- milli/src/update/index_documents/mod.rs | 129 ++++++++++++++++++ 3 files changed, 200 insertions(+), 19 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index a2d5219c1..9e464a557 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -60,7 +60,7 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: BTreeSet }, - InvalidGeoField { document_id: Value }, + InvalidGeoField(GeoError), InvalidFilter(String), InvalidSortableAttribute { field: String, valid_fields: BTreeSet }, SortRankingRuleMissing, @@ -76,6 +76,14 @@ pub enum UserError { InvalidMinTypoWordLenSetting(u8, u8), } +#[derive(Debug)] +pub enum GeoError { + MissingLatitude { document_id: Value }, + MissingLongitude { document_id: Value }, + BadLatitude { document_id: Value, value: Value }, + BadLongitude { document_id: Value, value: Value }, +} + impl From for Error { fn from(error: io::Error) -> Error { // TODO must be improved and more precise @@ -230,17 +238,7 @@ impl fmt::Display for UserError { name_list ) } - Self::InvalidGeoField { document_id } => { - let document_id = match document_id { - Value::String(id) => id.clone(), - _ => document_id.to_string(), - }; - write!( - f, - "The document with the id: `{}` contains an invalid `_geo` field.", - document_id - ) - }, + Self::InvalidGeoField(error) => write!(f, "{error}"), Self::InvalidDocumentId { document_id } => { let document_id = match document_id { Value::String(id) => id.clone(), @@ -314,6 +312,33 @@ impl fmt::Display for FieldIdMapMissingEntry { impl StdError for FieldIdMapMissingEntry {} +impl From for UserError { + fn from(geo_error: GeoError) -> Self { + UserError::InvalidGeoField(geo_error) + } +} + +impl fmt::Display for GeoError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + GeoError::MissingLatitude { document_id } => { + write!(f, "Could not find latitude in the document with the id: `{document_id}`. 
Was expecting a `_geo.lat` field.")
            }
            GeoError::MissingLongitude { document_id } => {
                write!(f, "Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")
            }
            GeoError::BadLatitude { document_id, value } => {
                write!(f, "Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")
            }
            GeoError::BadLongitude { document_id, value } => {
                write!(f, "Could not parse longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")
            }
        }
    }
}

impl StdError for GeoError {}

impl fmt::Display for SerializationError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index 65cb1c3ce..53f94f84a 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -1,9 +1,12 @@
 use std::fs::File;
 use std::io;
+use std::result::Result as StdResult;

 use concat_arrays::concat_arrays;
+use serde_json::Value;

 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
+use crate::error::GeoError;
 use crate::{FieldId, InternalError, Result, UserError};

 /// Extracts the geographical coordinates contained in each document under the `_geo` field.
@@ -24,15 +27,31 @@ pub fn extract_geo_points(
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((docid_bytes, value)) = cursor.move_on_next()? {
         let obkv = obkv::KvReader::new(value);
-        let (lat, lng) = obkv.get(lat_fid).zip(obkv.get(lng_fid)).ok_or_else(|| {
+        // since we only need the primary key when we throw an error, we create this getter to
+        // lazily fetch it when needed
+        let primary_key = || -> Value {
             let primary_key = obkv.get(primary_key_id).unwrap();
-            let primary_key = serde_json::from_slice(primary_key).unwrap();
-            UserError::InvalidGeoField { document_id: primary_key }
+            serde_json::from_slice(primary_key).unwrap()
+        };
+
+        // first we get the two fields
+        let lat = obkv.get(lat_fid).ok_or_else(|| -> UserError {
+            GeoError::MissingLatitude { document_id: primary_key() }.into()
         })?;
+        let lng = obkv.get(lng_fid).ok_or_else(|| -> UserError {
+            GeoError::MissingLongitude { document_id: primary_key() }.into()
+        })?;
+
+        // then we extract the values
+        let lat = extract_value(serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?)
+            .map_err(|lat| -> UserError {
+                GeoError::BadLatitude { document_id: primary_key(), value: lat }.into()
+            })?;
+
+        let lng = extract_value(serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?)
+            .map_err(|lng| -> UserError {
+                GeoError::BadLongitude { document_id: primary_key(), value: lng }.into()
+            })?;
-        let (lat, lng): (f64, f64) = (
-            serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
-            serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
-        );

         let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
         writer.insert(docid_bytes, bytes)?;
     }

     Ok(writer_into_reader(writer)?)
}
+
+fn extract_value(value: Value) -> StdResult<f64, Value> {
+    match value {
+        Value::Number(ref n) => n.as_f64().ok_or(value),
+        Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
+        value => Err(value),
+    }
+}
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 35e99a199..5ad8782c0 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1005,6 +1005,135 @@ mod tests {
         wtxn.commit().unwrap();
     }

+    #[test]
+    fn index_all_flavour_of_geo() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        let config = IndexerConfig::default();
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = update::Settings::new(&mut wtxn, &index, &config);
+
+        builder.set_filterable_fields(hashset!(S("_geo")));
+        builder.execute(|_| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let indexing_config = IndexDocumentsConfig {
+            update_method: IndexDocumentsMethod::ReplaceDocuments,
+            ..Default::default()
+        };
+        let mut wtxn = index.write_txn().unwrap();
+
+        let documents = documents!([
+            { "id": 0, "_geo": { "lat": 31, "lng": [42] } },
+            { "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 },
+            { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" },
+            { "id": 3, "_geo.lat": 31, "_geo.lng": "42" },
+        ]);
+        let mut builder =
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
+        builder.add_documents(documents).unwrap();
+        builder.execute().unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let mut search = crate::Search::new(&rtxn, &index);
+        search.filter(crate::Filter::from_str("_geoRadius(31, 42, 0.000001)").unwrap().unwrap());
+        let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
+        assert_eq!(documents_ids, vec![0, 1, 2, 3]);
+    }
+
+    #[test]
+    fn geo_error() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        let config = IndexerConfig::default();
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = update::Settings::new(&mut wtxn, &index, &config);
+
+        builder.set_filterable_fields(hashset!(S("_geo")));
+        builder.execute(|_| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let indexing_config = IndexDocumentsConfig {
+            update_method: IndexDocumentsMethod::ReplaceDocuments,
+            ..Default::default()
+        };
+        let mut wtxn = index.write_txn().unwrap();
+
+        let documents = documents!([
+            { "id": 0, "_geo": { "lng": 42 } }
+        ]);
+        let mut builder =
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
+        builder.add_documents(documents).unwrap();
+        let error = builder.execute().unwrap_err();
+        assert_eq!(
+            &error.to_string(),
+            r#"Could not find latitude in the document with the id: `0`. Was expecting a `_geo.lat` field."#
+        );
+
+        let documents = documents!([
+            { "id": 0, "_geo": { "lat": 42 } }
+        ]);
+        let mut builder =
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
+        builder.add_documents(documents).unwrap();
+        let error = builder.execute().unwrap_err();
+        assert_eq!(
+            &error.to_string(),
+            r#"Could not find longitude in the document with the id: `0`.
Was expecting a `_geo.lng` field."# + ); + + let documents = documents!([ + { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } + ]); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(documents).unwrap(); + let error = builder.execute().unwrap_err(); + assert_eq!( + &error.to_string(), + r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `"lol"`."# + ); + + let documents = documents!([ + { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } + ]); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(documents).unwrap(); + let error = builder.execute().unwrap_err(); + assert_eq!( + &error.to_string(), + r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `[12,13]`."# + ); + + let documents = documents!([ + { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } + ]); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(documents).unwrap(); + let error = builder.execute().unwrap_err(); + assert_eq!( + &error.to_string(), + r#"Could not parse longitude in the document with the id: `0`. Was expecting a number but instead got `"hello"`."# + ); + } + #[test] fn delete_documents_then_insert() { let path = tempfile::tempdir().unwrap(); From f820c9804dc7df0f835644540b6cf905239a1dd9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 2 May 2022 17:00:03 +0200 Subject: [PATCH 1396/1889] add one nested benchmark --- benchmarks/benches/indexing.rs | 128 +++++++++++++++++++++++++++++++++ benchmarks/build.rs | 2 + 2 files changed, 130 insertions(+) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 2d0604750..091c081b2 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -498,6 +498,132 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { }); } +fn indexing_nested_movies_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing nested movies with default settings", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key("id".to_owned()); + let searchable_fields = [ + "title", + "overview", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_searchable_fields(searchable_fields); + + let filterable_fields = [ + "popularity", + "release_date", + "runtime", + "vote_average", + "external_ids", + "keywords", + "providers.buy.name", + "providers.rent.name", + "providers.flatrate.name", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_filterable_fields(filterable_fields); + + let sortable_fields = ["popularity", "runtime", "vote_average", "release_date"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_sortable_fields(sortable_fields); + + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let 
mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(10); + group.bench_function("Indexing nested movies without any facets", |b| { + b.iter_with_setup( + move || { + let index = setup_index(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_primary_key("id".to_owned()); + let searchable_fields = [ + "title", + "overview", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_searchable_fields(searchable_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_geo(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(10); @@ -564,6 +690,8 @@ criterion_group!( indexing_wiki_in_three_batches, indexing_movies_default, indexing_movies_in_three_batches, + indexing_nested_movies_default, + indexing_nested_movies_without_faceted_fields, indexing_geo ); criterion_main!(benches); diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 906230fd4..c15123b37 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -22,6 +22,7 @@ const DATASET_MOVIES: (&str, &str) = ("movies", "json"); const DATASET_MOVIES_1_2: (&str, &str) = ("movies-1_2", "json"); const DATASET_MOVIES_3_4: (&str, &str) = ("movies-3_4", "json"); const DATASET_MOVIES_4_4: (&str, &str) = ("movies-4_4", "json"); +const DATASET_NESTED_MOVIES: (&str, &str) = ("nested_movies", "json"); const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); const ALL_DATASETS: &[(&str, &str)] = &[ @@ -37,6 +38,7 @@ const ALL_DATASETS: &[(&str, &str)] = &[ DATASET_MOVIES_1_2, DATASET_MOVIES_3_4, DATASET_MOVIES_4_4, + DATASET_NESTED_MOVIES, DATASET_GEO, ]; From 7e47031bdc96b3ea8eff2c1b2a056e1a44915c23 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 3 May 2022 10:03:13 +0200 Subject: [PATCH 1397/1889] Add a test for long keys in LMDB --- milli/src/update/index_documents/mod.rs | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 35e99a199..58e964986 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1651,4 +1651,55 @@ mod tests { builder.execute().unwrap(); wtxn.commit().unwrap(); } + + #[test] + fn text_with_too_long_keys() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + 
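// LMDB refuses keys longer than 511 bytes by default, so a single
+        // over-long key used to abort the whole indexing batch; the huge
+        // `script` value below reproduces that case, and the next patch
+        // guards the sorters with a `valid_lmdb_key` check for it.
+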
options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let script = "https://bug.example.com/meilisearch/milli.saml2?ROLE=Programmer-1337&SAMLRequest=Cy1ytcZT1Po%2L2IY2y9Unru8rgnW4qWfPiI0EpT7P8xjJV8PeQikRL%2E8D9A4pj9tmbymbQCQwGmGjPMK7qwXFPX4DH52JO2b7n6TXjuR7zkIFuYdzdY2rwRNBPgCL7ihclEm9zyIjKZQ%2JTqiwfXxWjnI0KEYQYHdwd6Q%2Fx%28BDLNsvmL54CCY2F4RWeRs4eqWfn%2EHqxlhreFzax4AiQ2tgOtV5thOaaWqrhZD%2Py70nuyZWNTKwciGI43AoHg6PThANsQ5rAY5amzN%2ufbs1swETUXlLZuOut5YGpYPZfY6STJWNp4QYSUOUXBZpdElYsH7UHZ7VhJycgyt%28aTK0GW6GbKne2tJM0hgSczOqndg6RFa9WsnSBi4zMcaEfYur4WlSsHDYInF9ROousKqVMZ6H8%2gbUissaLh1eXRGo8KEJbyEHbhVVKGD%28kx4cfKjx9fT3pkeDTdvDrVn25jIzi9wHyt9l1lWc8ICnCvXCVUPP%2BjBG4wILR29gMV9Ux2QOieQm2%2Fycybhr8sBGCl30mHC7blvWt%2T3mrCHQoS3VK49PZNPqBZO9C7vOjOWoszNkJx4QckWV%2FZFvbpzUUkiBiehr9F%2FvQSxz9lzv68GwbTu9fr638p%2FQM%3D&RelayState=https%3A%2F%example.bug.com%2Fde&SigAlg=http%3A%2F%2Fwww.w3.org%2F2000%2F09%2Fxmldsig%23rsa-sha1&Signature=AZFpkhFFII7PodiewTovaGnLQKUVZp0qOCCcBIUkJ6P5by3lE3Lldj9pKaFu4wz4j%2B015HEhDvF0LlAmwwES85vdGh%2FpD%2cIQPRUEjdCbQkQDd3dy1mMXbpXxSe4QYcv9Ni7tqNTQxekpO1gE7rtg6zC66EU55uM9aj9abGQ034Vly%2F6IJ08bvAq%2B%2FB9KruLstuiNWnlXTfNGsOxGLK7%2BXr94LTkat8m%2FMan6Qr95%2KeR5TmmqaQIE4N9H6o4TopT7mXr5CF2Z3"; + + // Create 200 documents with a long text + let content = { + let documents: Vec<_> = (0..200i32) + .into_iter() + .map(|i| serde_json::json!({ "id": i, "script": script })) + .collect(); + + let mut writer = std::io::Cursor::new(Vec::new()); + let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); + let documents = serde_json::to_vec(&documents).unwrap(); + builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); + builder.finish().unwrap(); + writer.set_position(0); + crate::documents::DocumentBatchReader::from_reader(writer).unwrap() + }; + + // Index those 200 long documents + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + // Create one long document + let content = documents!([ + {"id": 400, "script": script }, + ]); + + // Index this one long document + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + } } From 211c8763b935b51f9beaab807195dfe7ed4cd968 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 3 May 2022 09:57:03 +0200 Subject: [PATCH 1398/1889] Make sure that we do not generate too long keys --- milli/src/update/index_documents/mod.rs | 3 ++- milli/src/update/word_prefix_docids.rs | 7 +++++-- milli/src/update/word_prefix_pair_proximity_docids.rs | 8 +++++--- milli/src/update/words_prefix_position_docids.rs | 8 +++++--- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 58e964986..ed2347b25 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,7 +20,8 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, 
fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, + sorter_into_lmdb_database, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, + ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 2887b5583..1002c13cf 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -5,7 +5,8 @@ use heed::types::{ByteSlice, Str}; use heed::Database; use crate::update::index_documents::{ - create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, MergeFn, + create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + CursorClonableMmap, MergeFn, }; use crate::{Result, RoaringBitmapCodec}; @@ -124,7 +125,9 @@ fn write_prefixes_in_sorter( ) -> Result<()> { for (key, data_slices) in prefixes.drain() { for data in data_slices { - sorter.insert(&key, data)?; + if valid_lmdb_key(&key) { + sorter.insert(&key, data)?; + } } } diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index be0ddf005..72b41c472 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -7,8 +7,8 @@ use log::debug; use slice_group_by::GroupBy; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, - MergeFn, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + CursorClonableMmap, MergeFn, }; use crate::{Index, Result, StrStrU8Codec}; @@ -188,7 +188,9 @@ fn write_prefixes_in_sorter( ) -> Result<()> { for (key, data_slices) in prefixes.drain() { for data in data_slices { - sorter.insert(&key, data)?; + if valid_lmdb_key(&key) { + sorter.insert(&key, data)?; + } } } diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index 77e9e7c29..b2b24084d 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -11,8 +11,8 @@ use crate::error::SerializationError; use crate::heed_codec::StrBEU32Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, CursorClonableMmap, - MergeFn, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + CursorClonableMmap, MergeFn, }; use crate::{Index, Result}; @@ -167,7 +167,9 @@ fn write_prefixes_in_sorter( ) -> Result<()> { for (key, data_slices) in prefixes.drain() { for data in data_slices { - sorter.insert(&key, data)?; + if valid_lmdb_key(&key) { + sorter.insert(&key, data)?; + } } } From 5ad5d56f7e029d6c345356cb356bdc2bd16f925f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 4 May 2022 10:43:54 +0200 Subject: [PATCH 1399/1889] remove useless comment --- milli/src/update/settings.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ab42d750c..7ee6bf014 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1502,7 +1502,6 @@ mod tests { let index = TempIndex::new(); let config = 
IndexerConfig::default(); - // Set the genres setting let mut txn = index.write_txn().unwrap(); let builder = Settings::new(&mut txn, &index, &config); let Settings { From c55368ddd49b821da662da51cb68e5b03b153439 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 4 May 2022 14:11:03 +0200 Subject: [PATCH 1400/1889] apply code suggestion Co-authored-by: Kerollmops --- milli/src/error.rs | 6 ++++ .../extract/extract_geo_points.rs | 32 +++++++++---------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 9e464a557..47b159223 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -97,6 +97,12 @@ impl From<fst::Error> for Error { } } +impl From<GeoError> for Error { + fn from(error: GeoError) -> Error { + Error::UserError(UserError::InvalidGeoField(error)) + } +} + impl<E> From<grenad::Error<E>> for Error where Error: From<E>, diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 53f94f84a..0ecb113b3 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -7,7 +7,7 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::GeoError; -use crate::{FieldId, InternalError, Result, UserError}; +use crate::{FieldId, InternalError, Result}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. /// @@ -35,23 +35,23 @@ pub fn extract_geo_points( }; // first we get the two fields - let lat = obkv.get(lat_fid).ok_or_else(|| -> UserError { - GeoError::MissingLatitude { document_id: primary_key() }.into() - })?; - let lng = obkv.get(lng_fid).ok_or_else(|| -> UserError { - GeoError::MissingLongitude { document_id: primary_key() }.into() - })?; + let lat = obkv + .get(lat_fid) + .ok_or_else(|| GeoError::MissingLatitude { document_id: primary_key() })?; + let lng = obkv + .get(lng_fid) + .ok_or_else(|| GeoError::MissingLongitude { document_id: primary_key() })?; // then we extract the values - let lat = extract_value(serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?) - .map_err(|lat| -> UserError { - GeoError::BadLatitude { document_id: primary_key(), value: lat }.into() - })?; + let lat = extract_float_from_value( + serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lat| GeoError::BadLatitude { document_id: primary_key(), value: lat })?; - let lng = extract_value(serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?) - .map_err(|lng| -> UserError { - GeoError::BadLongitude { document_id: primary_key(), value: lng }.into() - })?; + let lng = extract_float_from_value( + serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?; let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; @@ -60,7 +60,7 @@ pub fn extract_geo_points( Ok(writer_into_reader(writer)?)
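    // the `?` conversions above now flow through the `From<GeoError> for Error`
    // impl added in this patch, rather than the removed `-> UserError` closures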
} -fn extract_value(value: Value) -> StdResult<f64, Value> { +fn extract_float_from_value(value: Value) -> StdResult<f64, Value> { match value { Value::Number(ref n) => n.as_f64().ok_or(value), Value::String(ref s) => s.parse::<f64>().map_err(|_| value), From 48cdfddebfdd1768e064c74f94320630f063e441 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 4 May 2022 14:44:51 +0200 Subject: [PATCH 1401/1889] Remove the wip section part of the contributing file --- CONTRIBUTING.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 91fd034e3..9e7ff8c90 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -31,8 +31,6 @@ Remember that there are many ways to contribute other than writing code: writing ## Development Workflow -_WIP section_ - ### Setup and run ```bash From 484a9ddb278cced75f4985f7b30ba69d7c10ee44 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 2 May 2022 20:10:25 +0200 Subject: [PATCH 1402/1889] Simplify the error creation with thiserror and a smol friendly macro --- milli/Cargo.toml | 1 + milli/src/asc_desc.rs | 48 ++---- milli/src/criterion.rs | 53 ++---- milli/src/error.rs | 368 +++++++++++++---------------------------- 4 files changed, 142 insertions(+), 328 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index a104145d1..3628a1538 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -35,6 +35,7 @@ smallstr = { version = "0.3.0", features = ["serde"] } smallvec = "1.8.0" smartstring = "1.0.1" tempfile = "3.3.0" +thiserror = "1.0.31" time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } uuid = { version = "0.8.2", features = ["v4"] } diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index f07e1ded8..88023b3cf 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -4,6 +4,7 @@ use std::fmt; use std::str::FromStr; use serde::{Deserialize, Serialize}; +use thiserror::Error; use crate::error::is_reserved_keyword; use crate::{CriterionError, Error, UserError}; @@ -153,14 +154,24 @@ impl FromStr for AscDesc { } } -#[derive(Debug)] +#[derive(Error, Debug)] pub enum SortError { + #[error("{}", AscDescError::InvalidLatitude)] InvalidLatitude, + #[error("{}", AscDescError::InvalidLongitude)] InvalidLongitude, + #[error("Invalid syntax for the geo parameter: expected expression formatted like \ + `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{name}`.")] BadGeoPointUsage { name: String }, + #[error("Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{name}`.")] InvalidName { name: String }, + #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression.")] ReservedName { name: String }, + #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression. \ + Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.")] ReservedNameForSettings { name: String }, + #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression. 
\ + Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.")] ReservedNameForFilter { name: String }, } @@ -184,41 +195,6 @@ impl From<AscDescError> for SortError { } } -impl fmt::Display for SortError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::InvalidLatitude => write!(f, "{}", AscDescError::InvalidLatitude), - Self::InvalidLongitude => write!(f, "{}", AscDescError::InvalidLongitude), - Self::BadGeoPointUsage { name } => { - write!( - f, - "Invalid syntax for the geo parameter: expected expression formatted like \ - `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.", - name - ) - } - Self::InvalidName { name } => { - write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name) - } - Self::ReservedName { name } => { - write!( - f, - "`{}` is a reserved keyword and thus can't be used as a sort expression.", - name - ) - } - Self::ReservedNameForSettings { name } | Self::ReservedNameForFilter { name } => { - write!( - f, - "`{}` is a reserved keyword and thus can't be used as a sort expression. \ - Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.", - name, - ) - } - } - } -} - impl From<SortError> for Error { fn from(error: SortError) -> Self { Self::UserError(UserError::SortError(error)) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index aca2f95b5..a46a137ad 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -2,55 +2,28 @@ use std::fmt; use std::str::FromStr; use serde::{Deserialize, Serialize}; +use thiserror::Error; -use crate::error::Error; -use crate::{AscDesc, Member, UserError}; +use crate::{AscDesc, Member}; -#[derive(Debug)] +#[derive(Error, Debug)] pub enum CriterionError { + #[error("`{name}` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.")] InvalidName { name: String }, + #[error("`{name}` is a reserved keyword and thus can't be used as a ranking rule")] ReservedName { name: String }, + #[error( + "`{name}` is a reserved keyword and thus can't be used as a ranking rule. \ +`{name}` can only be used for sorting at search time" + )] ReservedNameForSort { name: String }, + #[error( + "`{name}` is a reserved keyword and thus can't be used as a ranking rule. \ +`{name}` can only be used for filtering at search time" + )] ReservedNameForFilter { name: String }, } -impl fmt::Display for CriterionError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name), - Self::ReservedName { name } => { - write!( - f, - "`{}` is a reserved keyword and thus can't be used as a ranking rule", - name - ) - } - Self::ReservedNameForSort { name } => { - write!( - f, - "`{}` is a reserved keyword and thus can't be used as a ranking rule. \ -`{}` can only be used for sorting at search time", - name, name - ) - } - Self::ReservedNameForFilter { name } => { - write!( - f, - "`{}` is a reserved keyword and thus can't be used as a ranking rule. 
\ -`{}` can only be used for filtering at search time", - name, name - ) - } - } - } -} - -impl From<CriterionError> for Error { - fn from(error: CriterionError) -> Self { - Self::UserError(UserError::CriterionError(error)) - } -} - #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub enum Criterion { /// Sorted by decreasing number of matched query terms. diff --git a/milli/src/error.rs b/milli/src/error.rs index 47b159223..caabb96fc 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -1,11 +1,11 @@ use std::collections::BTreeSet; use std::convert::Infallible; -use std::error::Error as StdError; -use std::{fmt, io, str}; +use std::{io, str}; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; +use thiserror::Error; use crate::{CriterionError, DocumentId, FieldId, SortError}; @@ -15,92 +15,172 @@ pub fn is_reserved_keyword(keyword: &str) -> bool { ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) } -#[derive(Debug)] +#[derive(Error, Debug)] pub enum Error { - InternalError(InternalError), - IoError(io::Error), - UserError(UserError), + #[error("internal: {0}.")] + InternalError(#[from] InternalError), + #[error(transparent)] + IoError(#[from] io::Error), + #[error(transparent)] + UserError(#[from] UserError), } -#[derive(Debug)] +#[derive(Error, Debug)] pub enum InternalError { + #[error("{}", HeedError::DatabaseClosing)] DatabaseClosing, + #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, - FieldIdMapMissingEntry(FieldIdMapMissingEntry), + #[error(transparent)] + FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry), + #[error("Missing {key} in the field id mapping.")] FieldIdMappingMissingEntry { key: FieldId }, - Fst(fst::Error), + #[error(transparent)] + Fst(#[from] fst::Error), + #[error("Invalid compression type has been specified to grenad.")] GrenadInvalidCompressionType, + #[error("Invalid grenad file with an invalid version format.")] GrenadInvalidFormatVersion, + #[error("Invalid merge while processing {process}.")] IndexingMergingKeys { process: &'static str }, + #[error("{}", HeedError::InvalidDatabaseTyping)] InvalidDatabaseTyping, - RayonThreadPool(ThreadPoolBuildError), - SerdeJson(serde_json::Error), - Serialization(SerializationError), - Store(MdbError), - Utf8(str::Utf8Error), + #[error(transparent)] + RayonThreadPool(#[from] ThreadPoolBuildError), + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + #[error(transparent)] + Serialization(#[from] SerializationError), + #[error(transparent)] + Store(#[from] MdbError), + #[error(transparent)] + Utf8(#[from] str::Utf8Error), } -#[derive(Debug)] +#[derive(Error, Debug)] pub enum SerializationError { + #[error("{}", match .db_name { + Some(name) => format!("decoding from the {name} database failed"), + None => "decoding failed".to_string(), + })] Decoding { db_name: Option<&'static str> }, + #[error("{}", match .db_name { + Some(name) => format!("encoding into the {name} database failed"), + None => "encoding failed".to_string(), + })] Encoding { db_name: Option<&'static str> }, + #[error("number is not a valid finite number")] InvalidNumberSerialization, } -#[derive(Debug)] +#[derive(Error, Debug)] pub enum FieldIdMapMissingEntry { + #[error("unknown field id {field_id} coming from the {process} process")] FieldId { field_id: FieldId, process: &'static str }, + #[error("unknown field name {field_name} coming from the {process} process")] 
FieldName { field_name: String, process: &'static str }, } -#[derive(Debug)] +#[derive(Error, Debug)] pub enum UserError { + #[error("A document cannot contain more than 65,535 fields.")] AttributeLimitReached, - CriterionError(CriterionError), + #[error(transparent)] + CriterionError(#[from] CriterionError), + #[error("Maximum number of documents reached.")] DocumentLimitReached, + #[error( + "Document identifier `{}` is invalid. \ +A document identifier can be of type integer or string, \ +only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", .document_id.to_string() + )] InvalidDocumentId { document_id: Value }, + #[error("Invalid facet distribution, the fields `{}` are not set as filterable.", + .invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ") + )] InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> }, - InvalidGeoField(GeoError), + #[error(transparent)] + InvalidGeoField(#[from] GeoError), + #[error("{0}")] InvalidFilter(String), + #[error("Attribute `{}` is not sortable. {}", + .field, + match .valid_fields.is_empty() { + true => "This index does not have configured sortable attributes.".to_string(), + false => format!("Available sortable attributes are: `{}`.", + valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ") + ), + } + )] InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> }, + #[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")] SortRankingRuleMissing, + #[error("The database file is in an invalid state.")] InvalidStoreFile, + #[error("Maximum database size has been reached.")] MaxDatabaseSizeReached, + #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] MissingDocumentId { primary_key: String, document: Object }, + #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] MissingPrimaryKey, + #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] NoSpaceLeftOnDevice, + #[error("Index already has a primary key: `{0}`.")] PrimaryKeyCannotBeChanged(String), + #[error(transparent)] SerdeJson(serde_json::Error), - SortError(SortError), + #[error(transparent)] + SortError(#[from] SortError), + #[error("An unknown internal document id has been used: `{document_id}`.")] UnknownInternalDocumentId { document_id: DocumentId }, + #[error("`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater than or equal to `oneTypo` but found `oneTypo: {0}` and `twoTypos: {1}`.")] InvalidMinTypoWordLenSetting(u8, u8), } -#[derive(Debug)] +#[derive(Error, Debug)] pub enum GeoError { + #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] MissingLatitude { document_id: Value }, + #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, + #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, + #[error("Could not parse longitude in the document with the id: `{document_id}`. 
Was expecting a number but instead got `{value}`.")] BadLongitude { document_id: Value, value: Value }, } -impl From<io::Error> for Error { - fn from(error: io::Error) -> Error { - // TODO must be improved and more precise - Error::IoError(error) - } +/// A little macro helper to autogenerate From implementations that need two `Into`. +/// Given the following parameters: `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)` +/// the macro will create the following code: +/// ```ignore +/// impl From<FieldIdMapMissingEntry> for Error { +/// fn from(error: FieldIdMapMissingEntry) -> Error { +/// Error::from(InternalError::from(error)) +/// } +/// } +/// ``` +macro_rules! error_from_sub_error { + () => {}; + ($sub:ty => $intermediate:ty) => { + impl From<$sub> for Error { + fn from(error: $sub) -> Error { + Error::from(<$intermediate>::from(error)) + } + } + }; + ($($sub:ty => $intermediate:ty $(,)?),+) => { + $(error_from_sub_error!($sub => $intermediate);)+ + }; } -impl From<fst::Error> for Error { - fn from(error: fst::Error) -> Error { - Error::InternalError(InternalError::Fst(error)) - } -} - -impl From<GeoError> for Error { - fn from(error: GeoError) -> Error { - Error::UserError(UserError::InvalidGeoField(error)) - } +error_from_sub_error! { + FieldIdMapMissingEntry => InternalError, + fst::Error => InternalError, + str::Utf8Error => InternalError, + ThreadPoolBuildError => InternalError, + SerializationError => InternalError, + GeoError => UserError, + CriterionError => UserError, } impl<E> From<grenad::Error<E>> for Error @@ -121,12 +201,6 @@ where } } -impl From<str::Utf8Error> for Error { - fn from(error: str::Utf8Error) -> Error { - Error::InternalError(InternalError::Utf8(error)) - } -} - impl From<Infallible> for Error { fn from(_error: Infallible) -> Error { unreachable!() @@ -153,216 +227,6 @@ impl From<HeedError> for Error { } } -impl From<ThreadPoolBuildError> for Error { - fn from(error: ThreadPoolBuildError) -> Error { - Error::InternalError(InternalError::RayonThreadPool(error)) - } -} - -impl From<FieldIdMapMissingEntry> for Error { - fn from(error: FieldIdMapMissingEntry) -> Error { - Error::InternalError(InternalError::FieldIdMapMissingEntry(error)) - } -} - -impl From<InternalError> for Error { - fn from(error: InternalError) -> Error { - Error::InternalError(error) - } -} - -impl From<UserError> for Error { - fn from(error: UserError) -> Error { - Error::UserError(error) - } -} - -impl From<SerializationError> for Error { - fn from(error: SerializationError) -> Error { - Error::InternalError(InternalError::Serialization(error)) - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::InternalError(error) => write!(f, "internal: {}.", error), - Self::IoError(error) => error.fmt(f), - Self::UserError(error) => error.fmt(f), - } - } -} - -impl StdError for Error {} - -impl fmt::Display for InternalError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::DatabaseMissingEntry { db_name, key } => { - write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) - } - Self::FieldIdMapMissingEntry(error) => error.fmt(f), - Self::FieldIdMappingMissingEntry { key } => { - write!(f, "Missing {} in the field id mapping.", key) - } - Self::Fst(error) => error.fmt(f), - Self::GrenadInvalidCompressionType => { - f.write_str("Invalid compression type has been specified to grenad.") - } - Self::GrenadInvalidFormatVersion => { - f.write_str("Invalid grenad file with an invalid version format.") - } - Self::IndexingMergingKeys { process } => { - write!(f, "Invalid merge while processing {}.", process) - } - Self::Serialization(error) => error.fmt(f), - 
Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), - Self::RayonThreadPool(error) => error.fmt(f), - Self::SerdeJson(error) => error.fmt(f), - Self::DatabaseClosing => HeedError::DatabaseClosing.fmt(f), - Self::Store(error) => error.fmt(f), - Self::Utf8(error) => error.fmt(f), - } - } -} - -impl StdError for InternalError {} - -impl fmt::Display for UserError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::InvalidFilter(error) => f.write_str(error), - Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."), - Self::CriterionError(error) => write!(f, "{}", error), - Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), - Self::InvalidFacetsDistribution { invalid_facets_name } => { - let name_list = - invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", "); - write!( - f, - "Invalid facet distribution, the fields `{}` are not set as filterable.", - name_list - ) - } - Self::InvalidGeoField(error) => write!(f, "{error}"), - Self::InvalidDocumentId { document_id } => { - let document_id = match document_id { - Value::String(id) => id.clone(), - _ => document_id.to_string(), - }; - write!( - f, - "Document identifier `{}` is invalid. \ -A document identifier can be of type integer or string, \ -only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", - document_id - ) - } - Self::InvalidSortableAttribute { field, valid_fields } => { - let valid_names = - valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", "); - - if valid_names.is_empty() { - write!( - f, - "Attribute `{}` is not sortable. This index does not have configured sortable attributes.", - field - ) - } else { - write!( - f, - "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.", - field, valid_names - ) - } - } - Self::SortRankingRuleMissing => f.write_str( - "The sort ranking rule must be specified in the \ -ranking rules settings to use the sort parameter at search time.", - ), - Self::MissingDocumentId { primary_key, document } => { - let json = serde_json::to_string(document).unwrap(); - write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) - } - Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index."), - Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."), - Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."), - Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."), - Self::PrimaryKeyCannotBeChanged(primary_key) => { - write!(f, "Index already has a primary key: `{}`.", primary_key) - } - Self::SerdeJson(error) => error.fmt(f), - Self::SortError(error) => write!(f, "{}", error), - Self::UnknownInternalDocumentId { document_id } => { - write!(f, "An unknown internal document id has been used: `{}`.", document_id) - } - Self::InvalidMinTypoWordLenSetting(one, two) => write!(f, "`minWordSizeForTypos` setting is invalid. 
`oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater than or equal to `oneTypo` but found `oneTypo: {}` and `twoTypos: {}`.", one, two), - } - } -} - -impl StdError for UserError {} - -impl fmt::Display for FieldIdMapMissingEntry { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::FieldId { field_id, process } => { - write!(f, "unknown field id {} coming from the {} process", field_id, process) - } - Self::FieldName { field_name, process } => { - write!(f, "unknown field name {} coming from the {} process", field_name, process) - } - } - } -} - -impl StdError for FieldIdMapMissingEntry {} - -impl From<GeoError> for UserError { - fn from(geo_error: GeoError) -> Self { - UserError::InvalidGeoField(geo_error) - } -} - -impl fmt::Display for GeoError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - GeoError::MissingLatitude { document_id } => { - write!(f, "Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.") - } - GeoError::MissingLongitude { document_id } => { - write!(f, "Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.") - } - GeoError::BadLatitude { document_id, value } => { - write!(f, "Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.") - } - GeoError::BadLongitude { document_id, value } => { - write!(f, "Could not parse longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.") - } - } - } -} - -impl StdError for GeoError {} - -impl fmt::Display for SerializationError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Self::Decoding { db_name: Some(name) } => { - write!(f, "decoding from the {} database failed", name) - } - Self::Decoding { db_name: None } => f.write_str("decoding failed"), - Self::Encoding { db_name: Some(name) } => { - write!(f, "encoding into the {} database failed", name) - } - Self::Encoding { db_name: None } => f.write_str("encoding failed"), - Self::InvalidNumberSerialization => f.write_str("number is not a valid finite number"), - } - } -} - -impl StdError for SerializationError {} - #[test] fn conditionally_lookup_for_error_message() { let prefix = "Attribute `name` is not sortable."; From f586028f9a8250e462c43484c6ce4867dc2d5aff Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 16 May 2022 15:22:52 +0200 Subject: [PATCH 1403/1889] fix the searchable fields bug when a field is nested MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update milli/src/index.rs Co-authored-by: Clément Renault --- milli/src/index.rs | 163 +++++++++++++++++++++++- milli/src/update/index_documents/mod.rs | 12 ++ milli/src/update/settings.rs | 8 +- 3 files changed, 176 insertions(+), 7 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 3adfd2629..81648fe1c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -42,6 +42,7 @@ pub mod main_key { pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; + pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; pub const STOP_WORDS_KEY: &str = "stop-words"; pub const 
STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; @@ -457,12 +458,43 @@ impl Index { /* searchable fields */ - /// Writes the searchable fields, when this list is specified, only these are indexed. - pub(crate) fn put_searchable_fields( + /// Write the user defined searchable fields and generate the real searchable fields from the specified fields ids map. + pub(crate) fn put_all_searchable_fields_from_fields_ids_map( &self, wtxn: &mut RwTxn, - fields: &[&str], + user_fields: &[&str], + fields_ids_map: &FieldsIdsMap, ) -> heed::Result<()> { + // We can write the user defined searchable fields as-is. + self.put_user_defined_searchable_fields(wtxn, user_fields)?; + + // Now we generate the real searchable fields: + // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. + // 2. Iterate over the fields of the fields_ids_map. + // 3. If a field from the fields_ids_map is a subset of a user defined field + // (i.e. doggo.name is a subset of doggo) then we push it at the end of the fields. + let mut real_fields = user_fields.to_vec(); + + for field_from_map in fields_ids_map.names() { + for user_field in user_fields { + if crate::is_faceted_by(field_from_map, user_field) + && !user_fields.contains(&field_from_map) + { + real_fields.push(field_from_map); + } + } + } + + self.put_searchable_fields(wtxn, &real_fields) + } + + pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { + self.delete_searchable_fields(wtxn)?; + self.delete_user_defined_searchable_fields(wtxn) + } + + /// Writes the searchable fields, when this list is specified, only these are indexed. + fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>( wtxn, main_key::SEARCHABLE_FIELDS_KEY, @@ -471,7 +503,7 @@ impl Index { } /// Deletes the searchable fields, when no fields are specified, all fields are indexed. - pub(crate) fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { + fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { self.main.delete::<_, Str>(wtxn, main_key::SEARCHABLE_FIELDS_KEY) } @@ -498,6 +530,36 @@ impl Index { } } + /// Writes the searchable fields, when this list is specified, only these are indexed. + pub(crate) fn put_user_defined_searchable_fields( + &self, + wtxn: &mut RwTxn, + fields: &[&str], + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<_>>( + wtxn, + main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY, + &fields, + ) + } + + /// Deletes the searchable fields, when no fields are specified, all fields are indexed. + pub(crate) fn delete_user_defined_searchable_fields( + &self, + wtxn: &mut RwTxn, + ) -> heed::Result<bool> { + self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) + } + + /// Returns the user defined searchable fields. + pub fn user_defined_searchable_fields<'t>( + &self, + rtxn: &'t RoTxn, + ) -> heed::Result<Option<Vec<&'t str>>> { + self.main + .get::<_, Str, SerdeBincode<Vec<&'t str>>>(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) + } + /* filterable fields */ /// Writes the filterable fields names in the database. 
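The subset check above leans on `crate::is_faceted_by`, which this patch does not show; a minimal sketch of the dotted-path test it performs (an assumption about the helper, not part of the diff):

    // true when `field` equals `facet` or lives in its dotted subtree,
    // e.g. is_faceted_by("doggo.name", "doggo") == true
    pub fn is_faceted_by(field: &str, facet: &str) -> bool {
        field.starts_with(facet)
            && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true)
    }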
@@ -1031,12 +1093,13 @@ impl Index { pub(crate) mod tests { use std::ops::Deref; + use big_s::S; use heed::EnvOpenOptions; use maplit::btreemap; use tempfile::TempDir; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; - use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig}; + use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig}; use crate::Index; pub(crate) struct TempIndex { @@ -1184,4 +1247,94 @@ pub(crate) mod tests { assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 3); assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 15); } + + #[test] + fn add_documents_and_set_searchable_fields() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { "id": 1, "doggo": "kevin" }, + { "id": 2, "doggo": { "name": "bob", "age": 20 } }, + { "id": 3, "name": "jean", "age": 25 }, + ]); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + // set searchable fields + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_searchable_fields(vec![S("doggo"), S("name")]); + + builder.execute(drop).unwrap(); + wtxn.commit().unwrap(); + + // ensure we get the right real searchable fields + user defined searchable fields + let rtxn = index.read_txn().unwrap(); + + let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); + + let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(user_defined, &["doggo", "name"]); + } + + #[test] + fn set_searchable_fields_and_add_documents() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + + // set searchable fields + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_searchable_fields(vec![S("doggo"), S("name")]); + + builder.execute(drop).unwrap(); + wtxn.commit().unwrap(); + + // ensure we get the right real searchable fields + user defined searchable fields + let rtxn = index.read_txn().unwrap(); + + let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(real, &["doggo", "name"]); + let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(user_defined, &["doggo", "name"]); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { "id": 1, "doggo": "kevin" }, + { "id": 2, "doggo": { "name": "bob", "age": 20 } }, + { "id": 3, "name": "jean", "age": 25 }, + ]); + + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + // ensure we get the right real searchable fields + user defined searchable fields + 
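// (the nested expansions are appended after the user-defined fields,
+        // so the attributes ranking rule keeps favoring `doggo` and `name`)
+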
let rtxn = index.read_txn().unwrap(); + + let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); + + let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(user_defined, &["doggo", "name"]); + } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 794e64abc..bf7d06a23 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -157,6 +157,18 @@ where let new_facets = output.compute_real_facets(self.wtxn, self.index)?; self.index.put_faceted_fields(self.wtxn, &new_facets)?; + // in case new fields were introduced we're going to recreate the searchable fields. + if let Some(searchable_fields) = self.index.user_defined_searchable_fields(self.wtxn)? { + // we can't keep references on the searchable fields while we update the index, thus we need to own them. + let searchable_fields: Vec<String> = + searchable_fields.into_iter().map(str::to_string).collect(); + self.index.put_all_searchable_fields_from_fields_ids_map( + self.wtxn, + &searchable_fields.iter().map(String::as_ref).collect::<Vec<_>>(), + &output.fields_ids_map, + )?; + } + let indexed_documents = output.documents_count as u64; let number_of_documents = self.execute_raw(output)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ab42d750c..28b10ec1d 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -343,11 +343,15 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; } - self.index.put_searchable_fields(self.wtxn, &names)?; + self.index.put_all_searchable_fields_from_fields_ids_map( + self.wtxn, + &names, + &new_fields_ids_map, + )?; self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; } Setting::Reset => { - self.index.delete_searchable_fields(self.wtxn)?; + self.index.delete_all_searchable_fields(self.wtxn)?; } Setting::NotSet => return Ok(false), } From 0af399a6d7f397474c07d2df689f6d35729f9af8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 16 May 2022 15:55:18 +0200 Subject: [PATCH 1404/1889] fix the mixed dataset geosearch indexing bug --- .../extract/extract_geo_points.rs | 36 +++++++++--------- milli/src/update/index_documents/mod.rs | 37 +++++++++++++++++++ 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 0ecb113b3..fffae5e77 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -35,26 +35,28 @@ pub fn extract_geo_points( }; // first we get the two fields - let lat = obkv - .get(lat_fid) - .ok_or_else(|| GeoError::MissingLatitude { document_id: primary_key() })?; - let lng = obkv - .get(lng_fid) - .ok_or_else(|| GeoError::MissingLongitude { document_id: primary_key() })?; + let lat = obkv.get(lat_fid); + let lng = obkv.get(lng_fid); - // then we extract the values - let lat = extract_float_from_value( - serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, - ) - .map_err(|lat| GeoError::BadLatitude { document_id: primary_key(), value: lat })?; + if let Some((lat, lng)) = lat.zip(lng) { + // then we extract the values + let lat = extract_float_from_value( + serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lat| GeoError::BadLatitude { 
document_id: primary_key(), value: lat })?; - let lng = extract_float_from_value( - serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, - ) - .map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?; + let lng = extract_float_from_value( + serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?; - let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; - writer.insert(docid_bytes, bytes)?; + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + writer.insert(docid_bytes, bytes)?; + } else if lat.is_none() && lng.is_some() { + return Err(GeoError::MissingLatitude { document_id: primary_key() })?; + } else if lat.is_some() && lng.is_none() { + return Err(GeoError::MissingLongitude { document_id: primary_key() })?; + } } Ok(writer_into_reader(writer)?) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 794e64abc..4c659aed2 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1006,6 +1006,43 @@ mod tests { wtxn.commit().unwrap(); } + #[test] + fn mixed_geo_documents() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // We send 6 documents and mix the ones that have _geo and those that don't have it. + let mut wtxn = index.write_txn().unwrap(); + let documents = documents!([ + { "id": 2, "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, + { "id": 456 }, + { "id": 1 }, + { "id": 1344 }, + { "id": 4 }, + { "id": 42, "_geo": { "lat": 35, "lng": 23 } } + ]); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("_geo")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + } + #[test] fn index_all_flavour_of_geo() { let path = tempfile::tempdir().unwrap(); From 137434a1c83744a5f4ce19c933388d43fa5d1371 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 16 May 2022 17:05:20 +0200 Subject: [PATCH 1405/1889] Add some implementation on MatchBounds --- milli/src/search/matches/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index c7812aa77..d89e7dcb6 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -3,6 +3,7 @@ use std::borrow::Cow; use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; pub use matching_words::{MatchingWord, MatchingWords}; use meilisearch_tokenizer::token::{SeparatorKind, Token}; +use serde::Serialize; pub mod matching_words; @@ -86,7 +87,7 @@ pub struct Match { token_position: usize, } -#[derive(Clone, Debug)] +#[derive(Serialize, Debug, Clone, PartialEq)] pub struct MatchBounds { pub start: usize, pub length: usize, From 895f5d8a26b1152ea9f8a2f79c31f257d743d786 Mon Sep 17 00:00:00 2001 From: ManyTheFish 
Date: Wed, 18 May 2022 10:37:12 +0200 Subject: [PATCH 1406/1889] Bump milli version --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 440fe4ce0..219b47ca3 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.27.0" +version = "0.28.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 83e3053a9..3440c3189 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.27.0" +version = "0.28.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index a302bc758..4d87335b7 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.27.0" +version = "0.28.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index ceb24336e..dd0ed19e4 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.27.0" +version = "0.28.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index c7cfdc8ea..a52c7dc4d 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.27.0" +version = "0.28.0" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 888847c26..addfcf8f9 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.27.0" +version = "0.28.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index fb618121d..1df5bc0b0 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.27.0" +version = "0.28.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 087e7e947..5ef0e3191 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.27.0" +version = "0.28.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3628a1538..ac5a43899 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.27.0" +version = "0.28.0" authors = ["Kerollmops "] edition = "2018" From cd7c6e19ed64c37f173ae4cf5daa0b110628f866 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 18 May 2022 14:51:00 +0200 Subject: [PATCH 1407/1889] Reintroduce the max values by facet limit --- milli/src/search/facet/facet_distribution.rs | 48 ++++++++++++++++++-- 1 file changed, 45 
insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index ddbcb2b68..23b0b1df9 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; -use std::{fmt, mem}; +use std::{cmp, fmt, mem}; use heed::types::ByteSlice; use roaring::RoaringBitmap; @@ -13,6 +13,14 @@ use crate::heed_codec::facet::{ use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; use crate::{FieldId, Index, Result}; +/// The default number of values by facets that will +/// be fetched from the key-value store. +const DEFAULT_VALUES_BY_FACET: usize = 1000; + +/// The hard limit in the number of values by facets that will be fetched from +/// the key-value store. Searching for more values could slow down the engine. +const MAX_VALUES_BY_FACET: usize = 10000; + /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. const CANDIDATES_THRESHOLD: u64 = 3000; @@ -20,13 +28,20 @@ const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option<HashSet<String>>, candidates: Option<RoaringBitmap>, + max_values_by_facet: usize, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } impl<'a> FacetDistribution<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { - FacetDistribution { facets: None, candidates: None, rtxn, index } + FacetDistribution { + facets: None, + candidates: None, + max_values_by_facet: DEFAULT_VALUES_BY_FACET, + rtxn, + index, + } } pub fn facets<I: IntoIterator<Item = A>, A: AsRef<str>>(&mut self, names: I) -> &mut Self { @@ -34,6 +49,11 @@ impl<'a> FacetDistribution<'a> { self } + pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self { + self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET); + self + } + pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self { self.candidates = Some(candidates); self @@ -52,6 +72,7 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::<FieldId>()); @@ -64,6 +85,10 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, _, value), ()) = result?; *distribution.entry(value.to_string()).or_insert(0) += 1; + + if distribution.len() - distribution_prelength == self.max_values_by_facet { + break; + } } } } @@ -86,6 +111,10 @@ impl<'a> FacetDistribution<'a> { .entry(normalized_value) .or_insert_with(|| (original_value, 0)); *count += 1; + + if normalized_distribution.len() == self.max_values_by_facet { + break; + } } } @@ -116,6 +145,9 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(value.to_string(), docids.len()); } + if distribution.len() == self.max_values_by_facet { + break; + } } Ok(()) @@ -136,6 +168,9 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(original.to_string(), docids.len()); } + if distribution.len() == self.max_values_by_facet { + break; + } } Ok(()) @@ -155,6 +190,9 @@ impl<'a> FacetDistribution<'a> { for result in range { let ((_, _, value, _), docids) = result?; distribution.insert(value.to_string(), docids.len()); + if distribution.len() == self.max_values_by_facet { + break; + } } let iter = self @@ -168,6 +206,9 @@ impl<'a> 
FacetDistribution<'a> { for result in iter { let ((_, normalized_value), (original_value, docids)) = result?; normalized_distribution.insert(normalized_value, (original_value, docids.len())); + if normalized_distribution.len() == self.max_values_by_facet { + break; + } } let iter = normalized_distribution @@ -253,11 +294,12 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { facets, candidates, rtxn: _, index: _ } = self; + let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; f.debug_struct("FacetDistribution") .field("facets", facets) .field("candidates", candidates) + .field("max_values_by_facet", max_values_by_facet) .finish() } } From 754f48a4fba0d34afaf30b2e2e04f71a289c83a2 Mon Sep 17 00:00:00 2001 From: Matthias Wright Date: Fri, 20 May 2022 21:25:43 +0200 Subject: [PATCH 1408/1889] Improves ranking rules error message --- milli/src/criterion.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index a46a137ad..c02cd2525 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -8,7 +8,7 @@ use crate::{AscDesc, Member}; #[derive(Error, Debug)] pub enum CriterionError { - #[error("`{name}` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.")] + #[error("`{name}` ranking rule is invalid. Valid ranking rules are words, typo, sort, proximity, attribute, exactness and custom ranking rules.")] InvalidName { name: String }, #[error("`{name}` is a reserved keyword and thus can't be used as a ranking rule")] ReservedName { name: String }, From 8993fec8a3dd3d2dbf7f681c2d7455aed75cc444 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 24 May 2022 09:15:49 +0200 Subject: [PATCH 1409/1889] return optional exact words --- milli/src/index.rs | 6 +++--- milli/src/search/query_tree.rs | 14 +++++++------- milli/src/update/settings.rs | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 81648fe1c..41bd85b93 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1041,10 +1041,10 @@ impl Index { } /// List the words on which typo are not allowed - pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result>> { + pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result>>> { match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? { - Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), - None => Ok(fst::Set::default().map_data(Cow::Owned)?), + Some(bytes) => Ok(Some(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?)), + None => Ok(None), } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 02fc0747a..2e53971d2 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -152,7 +152,7 @@ trait Context { } /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; - fn exact_words(&self) -> crate::Result>>; + fn exact_words(&self) -> crate::Result>>>; } /// The query tree builder is the interface to build a query tree. 
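A note on patch 1409's signature change before the diff continues: `Index::exact_words` now returns `Result<Option<fst::Set<Cow<[u8]>>>>` instead of substituting an empty set, so callers can distinguish "the setting was never written" from "an explicitly empty list" and skip FST work entirely in the former case. The following is a minimal standalone sketch of the consuming pattern, not code from the patch — the `is_exact` helper is invented for illustration, and `fst = "0.4.7"` is the version pinned in milli's Cargo.toml further below:

use std::borrow::Cow;

use fst::Set;

// Hedged sketch: `None` now means "no exact words configured", which is
// distinguishable from a configured-but-empty set.
fn is_exact(exact_words: Option<&Set<Cow<'_, [u8]>>>, word: &str) -> bool {
    // Short-circuits without touching any FST when nothing was stored.
    exact_words.map_or(false, |set| set.contains(word))
}

fn main() -> Result<(), fst::Error> {
    // `from_iter` requires keys in lexicographic byte order.
    let set = Set::from_iter(["goodbye"])?.map_data(Cow::Owned)?;
    assert!(is_exact(Some(&set), "goodbye"));
    assert!(!is_exact(None, "goodbye")); // setting absent: no lookup at all
    Ok(())
}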
@@ -183,7 +183,7 @@ impl<'a> Context for QueryTreeBuilder<'a> { Ok((one, two)) } - fn exact_words(&self) -> crate::Result>> { + fn exact_words(&self) -> crate::Result>>> { self.index.exact_words(self.rtxn) } } @@ -277,13 +277,13 @@ pub struct TypoConfig<'a> { pub max_typos: u8, pub word_len_one_typo: u8, pub word_len_two_typo: u8, - pub exact_words: fst::Set>, + pub exact_words: Option>>, } /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { - if authorize_typos && !config.exact_words.contains(&word) { + if authorize_typos && !config.exact_words.map(|s| s.contains(&word)).unwrap_or(false) { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { QueryKind::exact(word) @@ -779,8 +779,8 @@ mod test { Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } - fn exact_words(&self) -> crate::Result>> { - Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap()) + fn exact_words(&self) -> crate::Result>>> { + Ok(Some(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())) } } @@ -1405,7 +1405,7 @@ mod test { #[test] fn test_min_word_len_typo() { - let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap(); + let exact_words = Some(fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap()); let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words }; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index bd1495b1c..829932d5c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1495,7 +1495,7 @@ mod tests { let words = btreeset! { S("Ab"), S("ac") }; builder.set_exact_words(words); assert!(builder.execute(|_| ()).is_ok()); - let exact_words = index.exact_words(&txn).unwrap(); + let exact_words = index.exact_words(&txn).unwrap().unwrap(); for word in exact_words.into_fst().stream().into_str_vec().unwrap() { assert!(word.0 == "ac" || word.0 == "ab"); } From ac975cc747e8b9edc0ddb251efc142607a1b969e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 24 May 2022 09:43:17 +0200 Subject: [PATCH 1410/1889] cache context's exact words --- milli/src/search/mod.rs | 2 +- milli/src/search/query_tree.rs | 49 ++++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 979ee1e6e..f3f852a48 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -118,7 +118,7 @@ impl<'a> Search<'a> { let before = Instant::now(); let (query_tree, primitive_query, matching_words) = match self.query.as_ref() { Some(query) => { - let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); + let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?; builder.optional_words(self.optional_words); builder.authorize_typos(self.is_typo_authorized()?); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 2e53971d2..4c4127dd4 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -152,7 +152,7 @@ trait Context { } /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; - fn exact_words(&self) -> crate::Result>>>; + fn exact_words(&self) -> &Option>>; } /// The query tree builder is the interface to build a query tree. 
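Patch 1410 addresses a repeated cost: `ctx.exact_words()` is called from several places in `create_query_tree` and `create_matching_words` (the call sites follow in the hunks below), and each call previously deserialized the FST from LMDB. The builder now loads the value once at construction time — which is why `QueryTreeBuilder::new` becomes fallible and `search/mod.rs` above gains a `?` — and the accessor becomes a cheap borrow. A hedged sketch of the caching pattern with invented stand-in types, `Store` playing the role of the heed-backed index:

// `Store::exact_words` stands in for the LMDB read in `Index::exact_words`.
struct Store;

impl Store {
    fn exact_words(&self) -> Option<Vec<String>> {
        Some(vec!["goodbye".to_string()])
    }
}

struct QueryTreeBuilder {
    exact_words: Option<Vec<String>>,
}

impl QueryTreeBuilder {
    fn new(store: &Store) -> Self {
        // The single store access, performed at construction time…
        Self { exact_words: store.exact_words() }
    }

    // …after which every lookup during tree construction is a borrow.
    fn exact_words(&self) -> &Option<Vec<String>> {
        &self.exact_words
    }
}

fn main() {
    let builder = QueryTreeBuilder::new(&Store);
    assert!(builder.exact_words().is_some());
}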
@@ -162,6 +162,7 @@ pub struct QueryTreeBuilder<'a> { optional_words: bool, authorize_typos: bool, words_limit: Option, + exact_words: Option>>, } impl<'a> Context for QueryTreeBuilder<'a> { @@ -183,16 +184,24 @@ impl<'a> Context for QueryTreeBuilder<'a> { Ok((one, two)) } - fn exact_words(&self) -> crate::Result>>> { - self.index.exact_words(self.rtxn) + fn exact_words(&self) -> &Option>> { + &self.exact_words } } impl<'a> QueryTreeBuilder<'a> { /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` /// and an Index `index`. - pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self { - Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None } + pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result { + let exact_words = index.exact_words(rtxn)?; + Ok(Self { + rtxn, + index, + optional_words: true, + authorize_typos: true, + words_limit: None, + exact_words, + }) } /// if `optional_words` is set to `false` the query tree will be @@ -277,13 +286,13 @@ pub struct TypoConfig<'a> { pub max_typos: u8, pub word_len_one_typo: u8, pub word_len_two_typo: u8, - pub exact_words: Option>>, + pub exact_words: &'a Option>>, } /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { - if authorize_typos && !config.exact_words.map(|s| s.contains(&word)).unwrap_or(false) { + if authorize_typos && !config.exact_words.as_ref().map(|s| s.contains(&word)).unwrap_or(false) { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { QueryKind::exact(word) @@ -342,7 +351,7 @@ fn create_query_tree( children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words()?; + let exact_words = ctx.exact_words(); let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; children.push(Operation::Query(Query { @@ -396,7 +405,7 @@ fn create_query_tree( let concat = words.concat(); let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words()?; + let exact_words = ctx.exact_words(); let config = TypoConfig { max_typos: 1, word_len_one_typo, @@ -501,7 +510,7 @@ fn create_matching_words( } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words()?; + let exact_words = ctx.exact_words(); let config = TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; @@ -579,7 +588,7 @@ fn create_matching_words( let word = words.concat(); let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words()?; + let exact_words = ctx.exact_words(); let config = TypoConfig { max_typos: 1, word_len_one_typo, @@ -742,8 +751,7 @@ mod test { struct TestContext { synonyms: HashMap, Vec>>, postings: HashMap, - // Raw bytes for the exact word fst Set - exact_words: Vec, + exact_words: Option>>, } impl TestContext { @@ -779,8 +787,8 @@ mod test { Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } - fn exact_words(&self) -> crate::Result>>> { - Ok(Some(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())) + fn exact_words(&self) -> &Option>> { + &self.exact_words } } @@ -799,6 +807,8 @@ mod test { } let exact_words = 
fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap(); + let exact_words = + Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); TestContext { synonyms: hashmap! { @@ -1406,8 +1416,12 @@ mod test { #[test] fn test_min_word_len_typo() { let exact_words = Some(fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap()); - let config = - TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words }; + let config = TypoConfig { + max_typos: 2, + word_len_one_typo: 5, + word_len_two_typo: 7, + exact_words: &exact_words, + }; assert_eq!( typos("hello".to_string(), true, config.clone()), @@ -1433,6 +1447,7 @@ mod test { let tokens = result.tokens(); let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); + let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); let context = TestContext { exact_words, ..Default::default() }; let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); From 69dc4de80fb3f089bbb04352c3df21762e2b350f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 24 May 2022 12:14:55 +0200 Subject: [PATCH 1411/1889] change &Option to Option<&Set> --- milli/src/search/query_tree.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4c4127dd4..7d2390b39 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -152,7 +152,7 @@ trait Context { } /// Returns the minimum word len for 1 and 2 typos. fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; - fn exact_words(&self) -> &Option>>; + fn exact_words(&self) -> Option<&fst::Set>>; } /// The query tree builder is the interface to build a query tree. @@ -184,8 +184,8 @@ impl<'a> Context for QueryTreeBuilder<'a> { Ok((one, two)) } - fn exact_words(&self) -> &Option>> { - &self.exact_words + fn exact_words(&self) -> Option<&fst::Set>> { + self.exact_words.as_ref() } } @@ -286,7 +286,7 @@ pub struct TypoConfig<'a> { pub max_typos: u8, pub word_len_one_typo: u8, pub word_len_two_typo: u8, - pub exact_words: &'a Option>>, + pub exact_words: Option<&'a fst::Set>>, } /// Return the `QueryKind` of a word depending on `authorize_typos` @@ -787,8 +787,8 @@ mod test { Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) } - fn exact_words(&self) -> &Option>> { - &self.exact_words + fn exact_words(&self) -> Option<&fst::Set>> { + self.exact_words.as_ref() } } @@ -1415,12 +1415,12 @@ mod test { #[test] fn test_min_word_len_typo() { - let exact_words = Some(fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap()); + let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap(); let config = TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, - exact_words: &exact_words, + exact_words: Some(&exact_words), }; assert_eq!( From 25fc576696446a57847d2eb5b263bc319223cddb Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 24 May 2022 14:15:33 +0200 Subject: [PATCH 1412/1889] review changes --- milli/src/search/query_tree.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 7d2390b39..76748179b 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -193,14 +193,13 @@ impl<'a> QueryTreeBuilder<'a> { /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` /// and an Index `index`. 
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result { - let exact_words = index.exact_words(rtxn)?; Ok(Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None, - exact_words, + exact_words: index.exact_words(rtxn)?, }) } @@ -292,7 +291,7 @@ pub struct TypoConfig<'a> { /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { - if authorize_typos && !config.exact_words.as_ref().map(|s| s.contains(&word)).unwrap_or(false) { + if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { QueryKind::exact(word) From c19c17eddb370ffcde3ad66ce6f5034fe27850a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 1 Jun 2022 18:31:02 +0200 Subject: [PATCH 1413/1889] Update version to v0.28.1 --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 219b47ca3..38c812ef8 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.28.0" +version = "0.28.1" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 3440c3189..af5f90979 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.28.0" +version = "0.28.1" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 4d87335b7..396248d28 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.28.0" +version = "0.28.1" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index dd0ed19e4..75015f0fd 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.28.0" +version = "0.28.1" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index a52c7dc4d..fc17d5a80 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.28.0" +version = "0.28.1" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index addfcf8f9..653cd69d4 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.28.0" +version = "0.28.1" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 1df5bc0b0..1da6bcf4c 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.28.0" +version = "0.28.1" authors = ["Clément Renault "] edition = "2018" publish = false 
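Before the remaining version bumps, an aside on patches 1411 and 1412 above: `Option<&fst::Set<A>>` rather than `&'a Option<fst::Set<A>>` is the idiomatic shape for an optional borrow — `Option<&T>` is `Copy`, a caller with no set can pass `None` directly instead of borrowing somebody's `Option`, and `.as_ref()` bridges from the owned field, while `map_or(false, …)` condenses the earlier `map(…).unwrap_or(false)`. A small self-contained sketch, with a plain `String` standing in for the FST and `Config`/`is_exact` invented for illustration:

struct Config<'a> {
    exact_words: Option<&'a String>,
}

fn is_exact(word: &str, config: Config) -> bool {
    // map_or(false, …) is the patch-1412 shape of map(…).unwrap_or(false).
    config.exact_words.map_or(false, |s| s.as_str() == word)
}

fn main() {
    let owned: Option<String> = Some("goodbye".to_string());
    // `.as_ref()` turns &Option<String> into Option<&String> at the call site.
    assert!(is_exact("goodbye", Config { exact_words: owned.as_ref() }));
    // No set configured? Just pass None — no owned Option required.
    assert!(!is_exact("goodbye", Config { exact_words: None }));
}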
diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 5ef0e3191..838b88915 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.28.0" +version = "0.28.1" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ac5a43899..696384a01 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.28.0" +version = "0.28.1" authors = ["Kerollmops "] edition = "2018" From 192e024ada4bbf94c7d9bf92cab743e5d71da2b5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Jun 2022 15:46:44 +0200 Subject: [PATCH 1414/1889] Add Charabia in Cargo.toml --- milli/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 696384a01..d19ff03a9 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,18 +9,18 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "0.2.17" byteorder = "1.4.3" +charabia = "0.5.0" concat-arrays = "0.1.2" crossbeam-channel = "0.5.2" either = "1.6.1" +flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" -flatten-serde-json = { path = "../flatten-serde-json" } -grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } geoutils = "0.4.1" +grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } memmap2 = "0.5.3" obkv = "0.2.0" once_cell = "1.10.0" From 86ac8568e6c98860ba0346a87ae296638e25da74 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Jun 2022 15:47:28 +0200 Subject: [PATCH 1415/1889] Use Charabia in milli --- milli/src/lib.rs | 2 +- milli/src/search/matches/matching_words.rs | 61 +++++----- milli/src/search/matches/mod.rs | 108 ++++++------------ milli/src/search/mod.rs | 12 +- milli/src/search/query_tree.rs | 93 ++++++--------- .../extract/extract_docid_word_positions.rs | 16 ++- milli/src/update/settings.rs | 37 +++--- 7 files changed, 127 insertions(+), 202 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index e718dccae..f28677ed8 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -21,7 +21,7 @@ pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; use serde_json::{Map, Value}; -pub use {heed, meilisearch_tokenizer as tokenizer}; +pub use {charabia as tokenizer, heed}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; pub use self::criterion::{default_criteria, Criterion, CriterionError}; diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 84b47bba5..71fbfd794 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -3,8 +3,8 @@ use std::collections::BTreeMap; use std::fmt; use std::ops::{Index, IndexMut}; +use charabia::Token; use levenshtein_automata::{Distance, DFA}; -use meilisearch_tokenizer::Token; use crate::search::build_dfa; @@ -99,13 +99,13 @@ impl 
MatchingWord { /// Returns the lenght in chars of the match in case of the token matches the term. pub fn match_token(&self, token: &Token) -> Option { - match self.dfa.eval(token.text()) { + match self.dfa.eval(token.lemma()) { Distance::Exact(t) if t <= self.typo => { if self.prefix { - let len = bytes_to_highlight(token.text(), &self.word); - Some(token.num_chars_from_bytes(len)) + let len = bytes_to_highlight(token.lemma(), &self.word); + Some(token.original_lengths(len).0) } else { - Some(token.num_chars_from_bytes(token.text().len())) + Some(token.original_lengths(token.lemma().len()).0) } } _otherwise => None, @@ -262,7 +262,7 @@ mod tests { use std::borrow::Cow; use std::str::from_utf8; - use meilisearch_tokenizer::TokenKind; + use charabia::TokenKind; use super::*; use crate::MatchingWords; @@ -344,11 +344,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("word"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("word"), + char_end: "word".chars().count(), byte_end: "word".len(), - char_map: None, + ..Default::default() }) .next(), Some(MatchType::Full { char_len: 3, ids: &[2] }) @@ -357,11 +356,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("nyc"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("nyc"), + char_end: "nyc".chars().count(), byte_end: "nyc".len(), - char_map: None, + ..Default::default() }) .next(), None @@ -370,11 +368,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("world"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("world"), + char_end: "world".chars().count(), byte_end: "world".len(), - char_map: None, + ..Default::default() }) .next(), Some(MatchType::Full { char_len: 5, ids: &[2] }) @@ -383,11 +380,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("splitted"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("splitted"), + char_end: "splitted".chars().count(), byte_end: "splitted".len(), - char_map: None, + ..Default::default() }) .next(), Some(MatchType::Full { char_len: 5, ids: &[0] }) @@ -396,11 +392,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("thisnew"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("thisnew"), + char_end: "thisnew".chars().count(), byte_end: "thisnew".len(), - char_map: None, + ..Default::default() }) .next(), None @@ -409,11 +404,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("borld"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("borld"), + char_end: "borld".chars().count(), byte_end: "borld".len(), - char_map: None, + ..Default::default() }) .next(), Some(MatchType::Full { char_len: 5, ids: &[2] }) @@ -422,11 +416,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("wordsplit"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("wordsplit"), + char_end: "wordsplit".chars().count(), byte_end: "wordsplit".len(), - char_map: None, + ..Default::default() }) .next(), Some(MatchType::Full { char_len: 4, ids: &[2] }) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index d89e7dcb6..85e77e15b 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,8 +1,8 @@ use std::borrow::Cow; +use charabia::{SeparatorKind, Token}; use matching_words::{MatchType, 
PartialMatch, PrimitiveWordId}; pub use matching_words::{MatchingWord, MatchingWords}; -use meilisearch_tokenizer::token::{SeparatorKind, Token}; use serde::Serialize; pub mod matching_words; @@ -168,13 +168,13 @@ impl<'t> Matcher<'t, '_> { let current_token_position = *token_position; let current_word_position = *word_position; *token_position += 1; - if token.is_separator().is_none() { + if !token.is_separator() { *word_position += 1; } Some((current_token_position, current_word_position, token)) }) - .filter(|(_, _, token)| token.is_separator().is_none()); + .filter(|(_, _, token)| !token.is_separator()); while let Some((token_position, word_position, word)) = words_positions.next() { for match_type in self.matching_words.match_token(word) { @@ -243,8 +243,8 @@ impl<'t> Matcher<'t, '_> { let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); while remaining_words > 0 { - let before_token = before_tokens.peek().map(|t| t.is_separator()); - let after_token = after_tokens.peek().map(|t| t.is_separator()); + let before_token = before_tokens.peek().map(|t| t.separator_kind()); + let after_token = after_tokens.peek().map(|t| t.separator_kind()); match (before_token, after_token) { // we can expand both sides. @@ -470,7 +470,7 @@ impl<'t> Matcher<'t, '_> { #[cfg(test)] mod tests { - use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; + use charabia::Tokenize; use super::*; use crate::search::matches::matching_words::MatchingWord; @@ -490,30 +490,26 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: false, crop: None }; // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. assert_eq!(&matcher.format(format_options), &text); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. assert_eq!(&matcher.format(format_options), &text); @@ -524,44 +520,38 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: true, crop: None }; // empty text. 
let text = ""; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text, because there is no matches. assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!( @@ -580,30 +570,26 @@ mod tests { let matching_words = MatchingWords::new(matching_words); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: true, crop: None }; // Text containing prefix match. let text = "Ŵôřlḑôle"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!(&matcher.format(format_options), "Ŵôřlḑôle"); // Text containing unicode match. let text = "Ŵôřlḑ"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!(&matcher.format(format_options), "Ŵôřlḑ"); // Text containing unicode match. let text = "Westfália"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. 
assert_eq!(&matcher.format(format_options), "Westfália"); @@ -614,28 +600,24 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: false, crop: Some(10) }; // empty text. let text = ""; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( @@ -645,8 +627,7 @@ mod tests { // Text without any match starting by a separator. let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( @@ -656,19 +637,17 @@ mod tests { // Test phrase propagation let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(format_options), - "…Split The World is a book written by Emily Henry…" + "… Split The World is a book written by Emily Henry…", ); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( @@ -678,8 +657,7 @@ mod tests { // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( @@ -689,8 +667,7 @@ mod tests { // Text containing a match unordered and a match ordered. 
let text = "The world split void void void void void void void void void split the world void void"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -700,8 +677,7 @@ mod tests { // Text containing matches with diferent density. let text = "split void the void void world void void void void void void void void void void split the world void void"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -711,8 +687,7 @@ mod tests { // Text containing matches with same word. let text = "split split split split split split void void void void void void void void void void split the world void void"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -726,28 +701,24 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: true, crop: Some(10) }; // empty text. let text = ""; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 first words with a marker at the end. assert_eq!( @@ -757,8 +728,7 @@ mod tests { // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( @@ -768,16 +738,14 @@ mod tests { // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!(&matcher.format(format_options), "…she loves. 
Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -792,11 +760,9 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let text = "void void split the world void void."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(2) }; @@ -847,13 +813,11 @@ mod tests { let mut builder = MatcherBuilder::from_matching_words(matching_words); builder.highlight_prefix("_".to_string()); builder.highlight_suffix("_".to_string()); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: true, crop: None }; let text = "the do or die can't be he do and or isn't he"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!( diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f3f852a48..62a7815b0 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -6,12 +6,12 @@ use std::result::Result as StdResult; use std::str::Utf8Error; use std::time::Instant; +use charabia::TokenizerBuilder; use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use log::debug; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; @@ -126,14 +126,14 @@ impl<'a> Search<'a> { builder.words_limit(self.words_limit); // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. - let mut config = AnalyzerConfig::default(); + let mut tokbuilder = TokenizerBuilder::new(); let stop_words = self.index.stop_words(self.rtxn)?; if let Some(ref stop_words) = stop_words { - config.stop_words(stop_words); + tokbuilder.stop_words(stop_words); } - let analyzer = Analyzer::new(config); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + + let tokenizer = tokbuilder.build(); + let tokens = tokenizer.tokenize(query); builder .build(tokens)? 
.map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw))) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 76748179b..e0fac0f43 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,10 +1,9 @@ use std::borrow::Cow; use std::{cmp, fmt, mem}; +use charabia::classifier::ClassifiedTokenIter; +use charabia::{SeparatorKind, TokenKind}; use fst::Set; -use meilisearch_tokenizer::token::SeparatorKind; -use meilisearch_tokenizer::tokenizer::TokenStream; -use meilisearch_tokenizer::TokenKind; use roaring::RoaringBitmap; use slice_group_by::GroupBy; @@ -235,9 +234,9 @@ impl<'a> QueryTreeBuilder<'a> { /// - if `authorize_typos` is set to `false` the query tree will be generated /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) - pub fn build( + pub fn build>( &self, - query: TokenStream, + query: ClassifiedTokenIter
, ) -> Result> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); @@ -649,11 +648,14 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. -fn create_primitive_query( - query: TokenStream, +fn create_primitive_query( + query: ClassifiedTokenIter, stop_words: Option>, words_limit: Option, -) -> PrimitiveQuery { +) -> PrimitiveQuery +where + A: AsRef<[u8]>, +{ let mut primitive_query = Vec::new(); let mut phrase = Vec::new(); let mut quoted = false; @@ -673,21 +675,18 @@ fn create_primitive_query( // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { - phrase.push(token.word.to_string()); + phrase.push(token.lemma().to_string()); } else if peekable.peek().is_some() { - if !stop_words - .as_ref() - .map_or(false, |swords| swords.contains(token.word.as_ref())) - { + if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { primitive_query - .push(PrimitiveQueryPart::Word(token.word.to_string(), false)); + .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false)); } } else { - primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); + primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true)); } } TokenKind::Separator(separator_kind) => { - let quote_count = token.word.chars().filter(|&s| s == '"').count(); + let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); // swap quoted state if we encounter a double quote if quote_count % 2 != 0 { quoted = !quoted; @@ -738,8 +737,8 @@ pub fn maximum_proximity(operation: &Operation) -> usize { mod test { use std::collections::HashMap; + use charabia::Tokenize; use maplit::hashmap; - use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; @@ -754,12 +753,12 @@ mod test { } impl TestContext { - fn build( + fn build>( &self, optional_words: bool, authorize_typos: bool, words_limit: Option, - query: TokenStream, + query: ClassifiedTokenIter, ) -> Result> { let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { @@ -856,9 +855,7 @@ mod test { #[test] fn prefix() { let query = "hey friends"; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( false, @@ -889,9 +886,7 @@ mod test { #[test] fn no_prefix() { let query = "hey friends "; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( false, @@ -922,9 +917,7 @@ mod test { #[test] fn synonyms() { let query = "hello world "; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( false, @@ -987,9 +980,7 @@ mod test { #[test] fn complex_synonyms() { let query = "new york city "; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( 
false, @@ -1087,9 +1078,7 @@ mod test { #[test] fn ngrams() { let query = "n grams "; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( false, @@ -1120,9 +1109,7 @@ mod test { #[test] fn word_split() { let query = "wordsplit fish "; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( false, @@ -1159,9 +1146,7 @@ mod test { #[test] fn phrase() { let query = "\"hey friends\" \" \" \"wooop"; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::And(vec![ Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), @@ -1177,9 +1162,7 @@ mod test { #[test] fn phrase_with_hard_separator() { let query = "\"hey friends. wooop wooop\""; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::And(vec![ Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), @@ -1195,9 +1178,7 @@ mod test { #[test] fn optional_word() { let query = "hey my friend "; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( true, @@ -1280,9 +1261,7 @@ mod test { #[test] fn optional_word_phrase() { let query = "\"hey my\""; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]); let (query_tree, _) = @@ -1294,9 +1273,7 @@ mod test { #[test] fn optional_word_multiple_phrases() { let query = r#""hey" my good "friend""#; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( true, @@ -1365,9 +1342,7 @@ mod test { #[test] fn no_typo() { let query = "hey friends "; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::Or( false, @@ -1397,9 +1372,7 @@ mod test { #[test] fn words_limit() { let query = "\"hey my\" good friend"; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); - let tokens = result.tokens(); + let tokens = query.tokenize(); let expected = Operation::And(vec![ Operation::Phrase(vec!["hey".to_string(), "my".to_string()]), @@ -1441,10 +1414,8 @@ mod test { #[test] fn disable_typo_on_word() { let query = "goodbye"; - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let result = analyzer.analyze(query); + let tokens = query.tokenize(); - let tokens = result.tokens(); let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); let context = TestContext { exact_words, ..Default::default() }; diff --git 
a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 44bf9dbf7..9a6060805 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -3,8 +3,7 @@ use std::convert::TryInto; use std::fs::File; use std::{io, mem, str}; -use meilisearch_tokenizer::token::SeparatorKind; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind}; +use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; @@ -40,11 +39,11 @@ pub fn extract_docid_word_positions( let mut key_buffer = Vec::new(); let mut field_buffer = String::new(); - let mut config = AnalyzerConfig::default(); + let mut builder = TokenizerBuilder::new(); if let Some(stop_words) = stop_words { - config.stop_words(stop_words); + builder.stop_words(stop_words); } - let analyzer = Analyzer::>::new(AnalyzerConfig::default()); + let tokenizer = builder.build(); let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { @@ -64,12 +63,11 @@ pub fn extract_docid_word_positions( serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; field_buffer.clear(); if let Some(field) = json_to_string(&value, &mut field_buffer) { - let analyzed = analyzer.analyze(field); - let tokens = process_tokens(analyzed.tokens()) + let tokens = process_tokens(tokenizer.tokenize(field)) .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); for (index, token) in tokens { - let token = token.text().trim(); + let token = token.lemma().trim(); if !token.is_empty() { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(token.as_bytes()); @@ -146,7 +144,7 @@ fn process_tokens<'a>( tokens: impl Iterator>, ) -> impl Iterator)> { tokens - .skip_while(|token| token.is_separator().is_some()) + .skip_while(|token| token.is_separator()) .scan((0, None), |(offset, prev_kind), token| { match token.kind { TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 829932d5c..9363d8eb6 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,8 +1,8 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::result::Result as StdResult; +use charabia::{Tokenizer, TokenizerBuilder}; use itertools::Itertools; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -385,13 +385,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_synonyms(&mut self) -> Result { match self.synonyms { Setting::Set(ref synonyms) => { - fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec { - analyzer - .analyze(text) - .tokens() + fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec { + tokenizer + .tokenize(text) .filter_map(|token| { if token.is_word() { - Some(token.text().to_string()) + Some(token.lemma().to_string()) } else { None } @@ -399,19 +398,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { .collect::>() } - let mut config = AnalyzerConfig::default(); + let mut builder = TokenizerBuilder::new(); let stop_words = self.index.stop_words(self.wtxn)?; - if let Some(stop_words) = &stop_words { - config.stop_words(stop_words); + if let Some(ref stop_words) = stop_words { + builder.stop_words(stop_words); } - let 
analyzer = Analyzer::new(config); + let tokenizer = builder.build(); let mut new_synonyms = HashMap::new(); for (word, synonyms) in synonyms { // Normalize both the word and associated synonyms. - let normalized_word = normalize(&analyzer, word); + let normalized_word = normalize(&tokenizer, word); let normalized_synonyms = - synonyms.iter().map(|synonym| normalize(&analyzer, synonym)); + synonyms.iter().map(|synonym| normalize(&tokenizer, synonym)); // Store the normalized synonyms under the normalized word, // merging the possible duplicate words. @@ -584,19 +583,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_exact_words(&mut self) -> Result<()> { match self.exact_words { Setting::Set(ref mut words) => { - fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String { - analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect() + fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String { + tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect() } - let mut config = AnalyzerConfig::default(); + let mut builder = TokenizerBuilder::new(); let stop_words = self.index.stop_words(self.wtxn)?; - if let Some(stop_words) = &stop_words { - config.stop_words(stop_words); + if let Some(ref stop_words) = stop_words { + builder.stop_words(stop_words); } - let analyzer = Analyzer::new(config); + let tokenizer = builder.build(); let mut words: Vec<_> = - words.iter().map(|word| normalize(&analyzer, word)).collect(); + words.iter().map(|word| normalize(&tokenizer, word)).collect(); // normalization could reorder words words.sort_unstable(); From 4dd3675d2bab65eef95df22af668c875dd9d4418 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Jun 2022 15:55:26 +0200 Subject: [PATCH 1416/1889] Update http-ui --- http-ui/src/main.rs | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 641f82046..57a78b41e 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -19,7 +19,7 @@ use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; use milli::documents::DocumentBatchReader; -use milli::tokenizer::{Analyzer, AnalyzerConfig}; +use milli::tokenizer::{Tokenizer, TokenizerBuilder}; use milli::update::UpdateIndexingStep::*; use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, @@ -139,17 +139,16 @@ pub struct IndexerOpt { pub max_positions_per_attributes: Option, } -struct Highlighter<'a, A> { - analyzer: Analyzer<'a, A>, +struct Highlighter<'s, A> { + tokenizer: Tokenizer<'s, A>, } -impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { - fn new(stop_words: &'a fst::Set) -> Self { - let mut config = AnalyzerConfig::default(); - config.stop_words(stop_words); - let analyzer = Analyzer::new(config); +impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { + fn new(stop_words: &'s fst::Set) -> Self { + let mut builder = TokenizerBuilder::new(); + builder.stop_words(stop_words); - Self { analyzer } + Self { tokenizer: builder.build() } } fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value { @@ -158,9 +157,8 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { Value::Bool(boolean) => Value::Bool(boolean), Value::Number(number) => Value::Number(number), Value::String(old_string) => { - let analyzed = self.analyzer.analyze(&old_string); - let analyzed: Vec<_> = analyzed.tokens().collect(); - let mut matcher = matcher_builder.build(&analyzed[..], &old_string); + let 
tokens: Vec<_> = self.tokenizer.tokenize(&old_string).collect(); + let mut matcher = matcher_builder.build(&tokens[..], &old_string); let format_options = FormatOptions { highlight: true, crop: Some(10) }; From 4dd7b20c327e5fcbe08d032da0cb1f3e1ddf8a9a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Jun 2022 17:33:25 +0200 Subject: [PATCH 1417/1889] Update benchmarks --- benchmarks/benches/formatting.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs index 5045df268..25e88ffeb 100644 --- a/benchmarks/benches/formatting.rs +++ b/benchmarks/benches/formatting.rs @@ -1,5 +1,5 @@ use criterion::{criterion_group, criterion_main}; -use milli::tokenizer::{Analyzer, AnalyzerConfig}; +use milli::tokenizer::Tokenize; use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; #[cfg(target_os = "linux")] @@ -52,9 +52,7 @@ fn bench_formatting(c: &mut criterion::Criterion) { for conf in confs { group.bench_function(conf.name, |b| { b.iter(|| { - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let analyzed = analyzer.analyze(&conf.text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = conf.text.tokenize().collect(); let mut matcher = conf.matching_words.build(&tokens[..], conf.text); matcher.format(option.clone()); }) From 7aabe42ae045ed1d3af8742b514933ac9ab1f90d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Jun 2022 17:59:04 +0200 Subject: [PATCH 1418/1889] Refactor matching words --- milli/src/search/matches/mod.rs | 178 +++++++++++++++++--------------- 1 file changed, 95 insertions(+), 83 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 85e77e15b..1a6d8958a 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use charabia::{SeparatorKind, Token}; +use charabia::{SeparatorKind, Token, Tokenizer}; use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; pub use matching_words::{MatchingWord, MatchingWords}; use serde::Serialize; @@ -11,16 +11,23 @@ const DEFAULT_CROP_MARKER: &'static str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &'static str = ""; const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = ""; -pub struct MatcherBuilder { +pub struct MatcherBuilder<'a, A> { matching_words: MatchingWords, + tokenizer: Tokenizer<'a, A>, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, } -impl MatcherBuilder { - pub fn from_matching_words(matching_words: MatchingWords) -> Self { - Self { matching_words, crop_marker: None, highlight_prefix: None, highlight_suffix: None } +impl<'a, A> MatcherBuilder<'a, A> { + pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self { + Self { + matching_words, + tokenizer, + crop_marker: None, + highlight_prefix: None, + highlight_suffix: None, + } } pub fn crop_marker(&mut self, marker: String) -> &Self { @@ -38,7 +45,7 @@ impl MatcherBuilder { self } - pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> { + pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { let crop_marker = match &self.crop_marker { Some(marker) => marker.as_str(), None => &DEFAULT_CROP_MARKER, @@ -54,8 +61,8 @@ impl MatcherBuilder { }; Matcher { text, - tokens, matching_words: &self.matching_words, + tokenizer: &self.tokenizer, crop_marker, highlight_prefix, highlight_suffix, @@ -93,17 +100,17 @@ pub struct MatchBounds { pub length: 
usize,
}

-pub struct Matcher<'t, 'm> {
+pub struct Matcher<'t, 'm, A> {
     text: &'t str,
-    tokens: &'t [Token<'t>],
     matching_words: &'m MatchingWords,
+    tokenizer: &'m Tokenizer<'m, A>,
     crop_marker: &'m str,
     highlight_prefix: &'m str,
     highlight_suffix: &'m str,
-    matches: Option<Vec<Match>>,
+    matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
 }

-impl<'t> Matcher<'t, '_> {
+impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
     /// Iterates over tokens and saves any of them that match the query.
     fn compute_matches(&mut self) -> &mut Self {
         fn compute_partial_match<'a>(
@@ -159,10 +166,10 @@ impl<'t> Matcher<'t, '_> {
             false
         }

+        let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect();
         let mut matches = Vec::new();

-        let mut words_positions = self
-            .tokens
+        let mut words_positions = tokens
             .iter()
             .scan((0, 0), |(token_position, word_position), token| {
                 let current_token_position = *token_position;
@@ -210,7 +217,7 @@ impl<'t> Matcher<'t, '_> {
             }
         }

-        self.matches = Some(matches);
+        self.matches = Some((tokens, matches));
         self
     }

@@ -218,10 +225,10 @@ impl<'t> Matcher<'t, '_> {
     pub fn matches(&mut self) -> Vec<MatchBounds> {
         match &self.matches {
             None => self.compute_matches().matches(),
-            Some(matches) => matches
+            Some((tokens, matches)) => matches
                 .iter()
                 .map(|m| MatchBounds {
-                    start: self.tokens[m.token_position].byte_start,
+                    start: tokens[m.token_position].byte_start,
                     length: m.match_len,
                 })
                 .collect(),
@@ -229,7 +236,7 @@ impl<'t> Matcher<'t, '_> {
     }

     /// Returns the bounds in byte index of the crop window.
-    fn crop_bounds(&self, matches: &[Match], crop_size: usize) -> (usize, usize) {
+    fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
         let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
         let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
@@ -239,8 +246,8 @@ impl<'t> Matcher<'t, '_> {
         // matches need to be counted in the crop len.
         let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;

-        let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable();
-        let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable();
+        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
+        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();

         while remaining_words > 0 {
             let before_token = before_tokens.peek().map(|t| t.separator_kind());
@@ -396,7 +403,7 @@ impl<'t> Matcher<'t, '_> {
             Cow::Borrowed(self.text)
         } else {
             match &self.matches {
-                Some(matches) => {
+                Some((tokens, matches)) => {
                     let matches = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
                             self.find_best_match_interval(matches, crop_size)
@@ -405,7 +412,9 @@ impl<'t> Matcher<'t, '_> {
                     };

                     let (byte_start, byte_end) = match format_options.crop {
-                        Some(crop_size) if crop_size > 0 => self.crop_bounds(matches, crop_size),
+                        Some(crop_size) if crop_size > 0 => {
+                            self.crop_bounds(tokens, matches, crop_size)
+                        }
                         _ => (0, self.text.len()),
                     };

@@ -420,7 +429,6 @@ impl<'t> Matcher<'t, '_> {
                     if format_options.highlight {
                         // insert highlight markers around matches.
-                        let tokens = self.tokens;
                         for m in matches {
                             let token = &tokens[m.token_position];

@@ -470,7 +478,7 @@ impl<'t> Matcher<'t, '_> {

 #[cfg(test)]
 mod tests {
-    use charabia::Tokenize;
+    use charabia::TokenizerBuilder;

     use super::*;
     use crate::search::matches::matching_words::MatchingWord;
@@ -485,6 +493,12 @@ mod tests {
         MatchingWords::new(matching_words)
     }

+    impl MatcherBuilder<'_, Vec<u8>> {
+        pub fn from_matching_words(matching_words: MatchingWords) -> Self {
+            Self::new(matching_words, TokenizerBuilder::default().build())
+        }
+    }
+
     #[test]
     fn format_identity() {
         let matching_words = matching_words();
@@ -495,22 +509,22 @@ mod tests {

         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);

         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);

         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
     }
@@ -525,34 +539,34 @@ mod tests {

         // empty text.
         let text = "";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), "");

         // text containing only separators.
         let text = ":-)";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), ":-)");

         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop should return complete text, because there are no matches.
         assert_eq!(&matcher.format(format_options), &text);

         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.");

         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(
             &matcher.format(format_options),
@@ -575,22 +589,22 @@ mod tests {

         // Text containing prefix match.
         let text = "Ŵôřlḑôle";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Ŵôřlḑôle");

         // Text containing unicode match.
         let text = "Ŵôřlḑ";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Ŵôřlḑ");

         // Text containing unicode match.
         let text = "Westfália";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Westfália");
     }
@@ -605,20 +619,20 @@ mod tests {

         // empty text.
         let text = "";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), "");

         // text containing only separators.
         let text = ":-)";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), ":-)");

         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no highlight should return 10 first words with a marker at the end.
         assert_eq!(
             &matcher.format(format_options),
@@ -627,8 +641,8 @@

         // Text without any match starting by a separator.
         let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no highlight should return 10 first words with a marker at the end.
         assert_eq!(
             &matcher.format(format_options),
@@ -637,8 +651,8 @@

         // Test phrase propagation
         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // should crop the phrase instead of cropping around the match.
         assert_eq!(
             &matcher.format(format_options),
@@ -647,8 +661,8 @@

         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no highlight should return 10 last words with a marker at the start.
         assert_eq!(
             &matcher.format(format_options),
@@ -657,8 +671,8 @@

         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // no highlight should return 10 last words with a marker at the start.
         assert_eq!(
             &matcher.format(format_options),
@@ -667,8 +681,8 @@

         // Text containing a match unordered and a match ordered.
         let text = "The world split void void void void void void void void void split the world void void";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
             &matcher.format(format_options),
@@ -677,8 +691,8 @@

         // Text containing matches with different density.
         let text = "split void the void void world void void void void void void void void void void split the world void void";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
             &matcher.format(format_options),
@@ -687,8 +701,8 @@

         // Text containing matches with same word.
         let text = "split split split split split split void void void void void void void void void void split the world void void";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
             &matcher.format(format_options),
@@ -706,20 +720,20 @@

         // empty text.
         let text = "";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), "");

         // text containing only separators.
         let text = ":-)";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), ":-)");

         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // both should return 10 first words with a marker at the end.
         assert_eq!(
             &matcher.format(format_options),
@@ -728,8 +742,8 @@

         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let tokens: Vec<_> = text.tokenize().collect();
-        let mut matcher = builder.build(&tokens[..], text);
+
+        let mut matcher = builder.build(text);
         // both should return 10 last words with a marker at the start and highlighted matches.
         assert_eq!(
             &matcher.format(format_options),
@@ -738,15 +752,15 @@

         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. 
Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; - let tokens: Vec<_> = text.tokenize().collect(); - let mut matcher = builder.build(&tokens[..], text); + + let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(format_options), @@ -762,26 +776,25 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let text = "void void split the world void void."; - let tokens: Vec<_> = text.tokenize().collect(); // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(2) }; - let mut matcher = builder.build(&tokens[..], text); + let mut matcher = builder.build(text); // because crop size < query size, partially format matches. assert_eq!(&matcher.format(format_options), "…split the…"); // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(1) }; - let mut matcher = builder.build(&tokens[..], text); + let mut matcher = builder.build(text); // because crop size < query size, partially format matches. assert_eq!(&matcher.format(format_options), "…split…"); // set crop size to 0 let format_options = FormatOptions { highlight: false, crop: Some(0) }; - let mut matcher = builder.build(&tokens[..], text); + let mut matcher = builder.build(text); // because crop size is 0, crop is ignored. assert_eq!(&matcher.format(format_options), "void void split the world void void."); } @@ -817,9 +830,8 @@ mod tests { let format_options = FormatOptions { highlight: true, crop: None }; let text = "the do or die can't be he do and or isn't he"; - let tokens: Vec<_> = text.tokenize().collect(); - let mut matcher = builder.build(&tokens[..], text); + let mut matcher = builder.build(text); assert_eq!( &matcher.format(format_options), "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", From 727d663f28680c2c8c6c2f868bf9dac87ca6de8e Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Jun 2022 18:07:10 +0200 Subject: [PATCH 1419/1889] Update benchmarks --- benchmarks/benches/formatting.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs index 25e88ffeb..f0ef8ea15 100644 --- a/benchmarks/benches/formatting.rs +++ b/benchmarks/benches/formatting.rs @@ -1,5 +1,5 @@ use criterion::{criterion_group, criterion_main}; -use milli::tokenizer::Tokenize; +use milli::tokenizer::TokenizerBuilder; use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; #[cfg(target_os = "linux")] @@ -9,7 +9,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; struct Conf<'a> { name: &'a str, text: &'a str, - matching_words: MatcherBuilder, + matching_words: MatcherBuilder<'a, Vec>, } fn bench_formatting(c: &mut criterion::Criterion) { @@ -18,7 +18,7 @@ fn bench_formatting(c: &mut criterion::Criterion) { Conf { name: "'the door d'", text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. 
But Theodor said that the doors don't work."#, - matching_words: MatcherBuilder::from_matching_words(MatchingWords::new(vec![ + matching_words: MatcherBuilder::new(MatchingWords::new(vec![ (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]), (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]), @@ -27,7 +27,8 @@ fn bench_formatting(c: &mut criterion::Criterion) { (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]), (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]), (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]), - ])), + ] + ), TokenizerBuilder::default().build()), }, ]; @@ -52,8 +53,7 @@ fn bench_formatting(c: &mut criterion::Criterion) { for conf in confs { group.bench_function(conf.name, |b| { b.iter(|| { - let tokens: Vec<_> = conf.text.tokenize().collect(); - let mut matcher = conf.matching_words.build(&tokens[..], conf.text); + let mut matcher = conf.matching_words.build(conf.text); matcher.format(option.clone()); }) }); From 6ce1c6487adb82f13eb0c2d4560ceda0a324f848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 2 Jun 2022 18:07:55 +0200 Subject: [PATCH 1420/1889] Update version for next release (v0.29.0) --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 38c812ef8..6e3282581 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.28.1" +version = "0.29.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index af5f90979..cf3fadd2c 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.28.1" +version = "0.29.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 396248d28..560d22119 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.28.1" +version = "0.29.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 75015f0fd..b38f4abd6 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.28.1" +version = "0.29.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index fc17d5a80..5090f6265 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.28.1" +version = "0.29.0" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 653cd69d4..5a857f7f6 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.28.1" +version = "0.29.0" 
authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 1da6bcf4c..4c9024dbe 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.28.1" +version = "0.29.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 838b88915..954853a38 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.28.1" +version = "0.29.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d19ff03a9..f88961e9e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.28.1" +version = "0.29.0" authors = ["Kerollmops "] edition = "2018" From a5c790bf4b54eeb763d9742fc1c6bc373370fb4a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 2 Jun 2022 18:15:36 +0200 Subject: [PATCH 1421/1889] Update http-ui --- http-ui/src/main.rs | 43 +++++++++++++++---------------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 57a78b41e..ce4fa7ba5 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -19,7 +19,7 @@ use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; use milli::documents::DocumentBatchReader; -use milli::tokenizer::{Tokenizer, TokenizerBuilder}; +use milli::tokenizer::TokenizerBuilder; use milli::update::UpdateIndexingStep::*; use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, @@ -140,38 +140,31 @@ pub struct IndexerOpt { } struct Highlighter<'s, A> { - tokenizer: Tokenizer<'s, A>, + matcher_builder: MatcherBuilder<'s, A>, } impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { - fn new(stop_words: &'s fst::Set) -> Self { - let mut builder = TokenizerBuilder::new(); - builder.stop_words(stop_words); - - Self { tokenizer: builder.build() } + fn new(matcher_builder: MatcherBuilder<'s, A>) -> Self { + Self { matcher_builder } } - fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value { + fn highlight_value(&self, value: Value) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), Value::Number(number) => Value::Number(number), Value::String(old_string) => { - let tokens: Vec<_> = self.tokenizer.tokenize(&old_string).collect(); - let mut matcher = matcher_builder.build(&tokens[..], &old_string); + let mut matcher = self.matcher_builder.build(&old_string); let format_options = FormatOptions { highlight: true, crop: Some(10) }; Value::String(matcher.format(format_options).to_string()) } - Value::Array(values) => Value::Array( - values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(), - ), + Value::Array(values) => { + Value::Array(values.into_iter().map(|v| self.highlight_value(v)).collect()) + } Value::Object(object) => Value::Object( - object - .into_iter() - .map(|(k, v)| (k, self.highlight_value(v, matcher_builder))) - .collect(), + object.into_iter().map(|(k, v)| (k, self.highlight_value(v))).collect(), ), } } @@ -179,14 +172,13 @@ impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { fn highlight_record( &self, object: &mut Map, - matcher_builder: &MatcherBuilder, attributes_to_highlight: &HashSet, ) { // TODO do we 
need to be highlighted but need to be?
        for (key, value) in object.iter_mut() {
            if attributes_to_highlight.contains(key) {
                let old_value = mem::take(value);
-                *value = self.highlight_value(old_value, matcher_builder);
+                *value = self.highlight_value(old_value);
            }
        }
    }
@@ -798,20 +790,15 @@ async fn main() -> anyhow::Result<()> {
                        None => fields_ids_map.iter().map(|(_, name)| name).map(String::from).collect(),
                    };

-                    let stop_words = fst::Set::default();
-                    let highlighter = Highlighter::new(&stop_words);
-
-                    let mut matcher_builder = MatcherBuilder::from_matching_words(matching_words);
+                    let mut matcher_builder =
+                        MatcherBuilder::new(matching_words, TokenizerBuilder::default().build());
                    matcher_builder.highlight_prefix("<mark>".to_string());
                    matcher_builder.highlight_suffix("</mark>".to_string());
+                    let highlighter = Highlighter::new(matcher_builder);

                    for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
                        let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
                        if !disable_highlighting {
-                            highlighter.highlight_record(
-                                &mut object,
-                                &matcher_builder,
-                                &attributes_to_highlight,
-                            );
+                            highlighter.highlight_record(&mut object, &attributes_to_highlight);
                        }

                        documents.push(object);

From d212dc6b8b797a85a045ead07717a57648cc79f0 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 2 Jun 2022 18:22:56 +0200
Subject: [PATCH 1422/1889] Remove useless newline

---
 milli/src/search/matches/mod.rs | 31 -------------------------------
 1 file changed, 31 deletions(-)

diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index 1a6d8958a..ba2e8728e 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -509,21 +509,18 @@ mod tests {

         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-
         let mut matcher = builder.build(text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);

         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-
         let mut matcher = builder.build(text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);

         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-
         let mut matcher = builder.build(text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
@@ -539,33 +536,28 @@ mod tests {

         // empty text.
         let text = "";
-
         let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), "");

         // text containing only separators.
         let text = ":-)";
-
         let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), ":-)");

         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-
         let mut matcher = builder.build(text);
         // no crop should return complete text, because there are no matches.
         assert_eq!(&matcher.format(format_options), &text);

         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-
         let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.");

         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-
         let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(
@@ -589,21 +581,18 @@ mod tests {

         // Text containing prefix match.
         let text = "Ŵôřlḑôle";
-
         let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Ŵôřlḑôle");

         // Text containing unicode match.
         let text = "Ŵôřlḑ";
-
         let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Ŵôřlḑ");

         // Text containing unicode match.
         let text = "Westfália";
-
         let mut matcher = builder.build(text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Westfália");
@@ -619,19 +608,16 @@ mod tests {

         // empty text.
         let text = "";
-
         let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), "");

         // text containing only separators.
         let text = ":-)";
-
         let mut matcher = builder.build(text);
         assert_eq!(&matcher.format(format_options), ":-)");

         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-
         let mut matcher = builder.build(text);
         // no highlight should return 10 first words with a marker at the end.
         assert_eq!(
@@ -641,7 +627,6 @@ mod tests {

         // Text without any match starting by a separator.
         let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
-
         let mut matcher = builder.build(text);
         // no highlight should return 10 first words with a marker at the end.
         assert_eq!(
@@ -651,7 +636,6 @@ mod tests {

         // Test phrase propagation
         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
-
         let mut matcher = builder.build(text);
         // should crop the phrase instead of cropping around the match.
         assert_eq!(
@@ -661,7 +645,6 @@ mod tests {

         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-
         let mut matcher = builder.build(text);
         // no highlight should return 10 last words with a marker at the start.
         assert_eq!(
@@ -671,7 +654,6 @@ mod tests {

         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-
         let mut matcher = builder.build(text);
         // no highlight should return 10 last words with a marker at the start.
         assert_eq!(
@@ -681,7 +663,6 @@ mod tests {

         // Text containing a match unordered and a match ordered.
         let text = "The world split void void void void void void void void void split the world void void";
-
         let mut matcher = builder.build(text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -691,7 +672,6 @@ mod tests {

         // Text containing matches with different density.
         let text = "split void the void void world void void void void void void void void void void split the world void void";
-
         let mut matcher = builder.build(text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -701,7 +681,6 @@ mod tests {

         // Text containing matches with same word.
let text = "split split split split split split void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -720,19 +699,16 @@ mod tests { // empty text. let text = ""; - let mut matcher = builder.build(text); assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; - let mut matcher = builder.build(text); assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); // both should return 10 first words with a marker at the end. assert_eq!( @@ -742,7 +718,6 @@ mod tests { // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( @@ -752,14 +727,12 @@ mod tests { // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -779,21 +752,18 @@ mod tests { // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(2) }; - let mut matcher = builder.build(text); // because crop size < query size, partially format matches. assert_eq!(&matcher.format(format_options), "…split the…"); // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(1) }; - let mut matcher = builder.build(text); // because crop size < query size, partially format matches. assert_eq!(&matcher.format(format_options), "…split…"); // set crop size to 0 let format_options = FormatOptions { highlight: false, crop: Some(0) }; - let mut matcher = builder.build(text); // because crop size is 0, crop is ignored. 
assert_eq!(&matcher.format(format_options), "void void split the world void void."); @@ -830,7 +800,6 @@ mod tests { let format_options = FormatOptions { highlight: true, crop: None }; let text = "the do or die can't be he do and or isn't he"; - let mut matcher = builder.build(text); assert_eq!( &matcher.format(format_options), From 31776fdc3ffcec5448024bf725c2e9f108b5a76e Mon Sep 17 00:00:00 2001 From: ad hoc Date: Tue, 7 Jun 2022 12:24:06 +0200 Subject: [PATCH 1423/1889] add failing test --- milli/src/update/index_documents/mod.rs | 48 +++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 117233611..5b6af12ae 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1881,4 +1881,52 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn index_documents_in_multiple_transforms() { + let tmp = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp).unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let indexer_config = IndexerConfig::default(); + let mut builder = IndexDocuments::new( + &mut wtxn, + &index, + &indexer_config, + IndexDocumentsConfig::default(), + |_| (), + ) + .unwrap(); + + let doc1 = documents! {[{ + "id": 228142, + "title": "asdsad", + "state": "automated", + "priority": "normal", + "public_uid": "37ccf021", + "project_id": 78207, + "branch_id_number": 0 + }]}; + + let doc2 = documents! {[{ + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid": "39c6499b", + "project_id": 78207, + "branch_id_number": 0 + }]}; + + builder.add_documents(doc1).unwrap(); + builder.add_documents(doc2).unwrap(); + + builder.execute().unwrap(); + + let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map(); + let ids = map.values().collect::>(); + + assert_eq!(ids.len(), map.len()); + } } From d0aaa7ff0057204762006ee3263019d94e21abe1 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 7 Jun 2022 15:44:55 +0200 Subject: [PATCH 1424/1889] Fix wrong internal ids assignments --- milli/src/update/index_documents/transform.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 9238212fd..08d450578 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -51,6 +51,7 @@ pub struct Transform<'a, 'i> { indexer_settings: &'a IndexerConfig, pub autogenerate_docids: bool, pub index_documents_method: IndexDocumentsMethod, + available_documents_ids: AvailableDocumentsIds, original_sorter: grenad::Sorter, flattened_sorter: grenad::Sorter, @@ -128,12 +129,14 @@ impl<'a, 'i> Transform<'a, 'i> { indexer_settings.max_nb_chunks, indexer_settings.max_memory.map(|mem| mem / 2), ); + let documents_ids = index.documents_ids(wtxn)?; Ok(Transform { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, autogenerate_docids, + available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), original_sorter, flattened_sorter, index_documents_method, @@ -156,8 +159,6 @@ impl<'a, 'i> Transform<'a, 'i> { { let fields_index = reader.index(); let external_documents_ids = self.index.external_documents_ids(wtxn)?; - let documents_ids = self.index.documents_ids(wtxn)?; - let mut available_documents_ids = 
AvailableDocumentsIds::from_documents_ids(&documents_ids); let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; @@ -261,7 +262,8 @@ impl<'a, 'i> Transform<'a, 'i> { // if the document has never been encountered we give it a new docid // and push this new docid to the external documents ids builder Entry::Vacant(entry) => { - let new_docid = available_documents_ids + let new_docid = self + .available_documents_ids .next() .ok_or(UserError::DocumentLimitReached)?; entry.insert(new_docid as u64); From 478dbfa45a7ea902904a4d2317448b0b408716c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 7 Jun 2022 18:59:33 +0200 Subject: [PATCH 1425/1889] Update version for next release (v0.29.1) --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 6e3282581..5905404f7 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.29.0" +version = "0.29.1" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index cf3fadd2c..0e3c43b53 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.29.0" +version = "0.29.1" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 560d22119..63bbc387e 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.29.0" +version = "0.29.1" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index b38f4abd6..0f6d63c94 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.29.0" +version = "0.29.1" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 5090f6265..830139dc8 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.29.0" +version = "0.29.1" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 5a857f7f6..9572b2d9b 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.29.0" +version = "0.29.1" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 4c9024dbe..30c043c0d 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.29.0" +version = "0.29.1" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 954853a38..7f60a7fc3 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.29.0" +version = 
"0.29.1" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f88961e9e..6606fd889 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.29.0" +version = "0.29.1" authors = ["Kerollmops "] edition = "2018" From bae400744738d5cfc253a46ce058fbc9d90fd71d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Jun 2022 15:42:29 +0200 Subject: [PATCH 1426/1889] Remove the hard limit on the number of facet values returned --- milli/src/search/facet/facet_distribution.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 23b0b1df9..7340538ea 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; -use std::{cmp, fmt, mem}; +use std::{fmt, mem}; use heed::types::ByteSlice; use roaring::RoaringBitmap; @@ -17,10 +17,6 @@ use crate::{FieldId, Index, Result}; /// be fetched from the key-value store. const DEFAULT_VALUES_BY_FACET: usize = 1000; -/// The hard limit in the number of values by facets that will be fetched from -/// the key-value store. Searching for more values could slow down the engine. -const MAX_VALUES_BY_FACET: usize = 10000; - /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. const CANDIDATES_THRESHOLD: u64 = 3000; @@ -50,7 +46,7 @@ impl<'a> FacetDistribution<'a> { } pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self { - self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET); + self.max_values_by_facet = max; self } From 2a505503b3fbeef66dab0735a062e33567eac64d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Jun 2022 15:43:01 +0200 Subject: [PATCH 1427/1889] Change the number of facet values returned by default to 100 --- milli/src/search/facet/facet_distribution.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7340538ea..8069abede 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -15,7 +15,7 @@ use crate::{FieldId, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. -const DEFAULT_VALUES_BY_FACET: usize = 1000; +const DEFAULT_VALUES_BY_FACET: usize = 100; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. 
From 56ee9cc21fa47cabaa27117f922512773fd9e70f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Jun 2022 16:00:06 +0200 Subject: [PATCH 1428/1889] Bump the version to 0.29.2 --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 5905404f7..44507d527 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.29.1" +version = "0.29.2" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 0e3c43b53..01dc243b7 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.29.1" +version = "0.29.2" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 63bbc387e..d20407de2 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.29.1" +version = "0.29.2" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 0f6d63c94..014d10ad6 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.29.1" +version = "0.29.2" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 830139dc8..41573268f 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.29.1" +version = "0.29.2" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 9572b2d9b..493d54e12 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.29.1" +version = "0.29.2" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 30c043c0d..69ae2e9de 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.29.1" +version = "0.29.2" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 7f60a7fc3..8383aad0a 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.29.1" +version = "0.29.2" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 6606fd889..96f48707d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.29.1" +version = "0.29.2" authors = ["Kerollmops "] edition = "2018" From 52a494bd3b75cda4bb045650ae73bd8dce51b335 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Jun 2022 16:54:16 +0200 Subject: [PATCH 1429/1889] Add the new pagination.limited_to and 
faceting.max_values_per_facet settings --- milli/src/update/settings.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 9363d8eb6..86c168be3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -104,6 +104,8 @@ pub struct Settings<'a, 't, 'u, 'i> { exact_words: Setting>, /// Attributes on which typo tolerance is disabled. exact_attributes: Setting>, + max_values_per_facet: Setting, + limit_pagination_to: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -129,6 +131,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { min_word_len_two_typos: Setting::NotSet, min_word_len_one_typo: Setting::NotSet, exact_attributes: Setting::NotSet, + max_values_per_facet: Setting::NotSet, + limit_pagination_to: Setting::NotSet, indexer_config, } } @@ -246,6 +250,22 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.exact_attributes = Setting::Reset; } + pub fn set_max_values_per_facet(&mut self, value: usize) { + self.max_values_per_facet = Setting::Set(value); + } + + pub fn reset_max_values_per_facet(&mut self) { + self.max_values_per_facet = Setting::Reset; + } + + pub fn set_limit_pagination_to(&mut self, value: usize) { + self.limit_pagination_to = Setting::Set(value); + } + + pub fn reset_limit_pagination_to(&mut self) { + self.limit_pagination_to = Setting::Reset; + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -1525,6 +1545,8 @@ mod tests { min_word_len_one_typo, exact_words, exact_attributes, + max_values_per_facet, + limit_pagination_to, } = builder; assert!(matches!(searchable_fields, Setting::NotSet)); @@ -1541,5 +1563,7 @@ mod tests { assert!(matches!(min_word_len_one_typo, Setting::NotSet)); assert!(matches!(exact_words, Setting::NotSet)); assert!(matches!(exact_attributes, Setting::NotSet)); + assert!(matches!(max_values_per_facet, Setting::NotSet)); + assert!(matches!(limit_pagination_to, Setting::NotSet)); } } From 69931e50d20ed2ad3ec56f2323b57ec71131befc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Jun 2022 17:28:23 +0200 Subject: [PATCH 1430/1889] Add the max_values_by_facet setting to the database --- milli/src/index.rs | 14 ++++++++++ milli/src/lib.rs | 2 +- milli/src/search/facet/facet_distribution.rs | 28 +++++++++++--------- milli/src/search/facet/mod.rs | 2 +- milli/src/search/mod.rs | 2 +- milli/src/update/settings.rs | 24 ++++++++++++++--- 6 files changed, 52 insertions(+), 20 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 41bd85b93..f7e3aa14a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -56,6 +56,8 @@ pub mod main_key { pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; pub const EXACT_WORDS: &str = "exact-words"; pub const EXACT_ATTRIBUTES: &str = "exact-attributes"; + pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet"; + pub const PAGINATION_LIMITED_TO: &str = "pagination-limited-to"; } pub mod db_name { @@ -1087,6 +1089,18 @@ impl Index { self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?; Ok(()) } + + pub fn max_values_per_facet(&self, txn: &RoTxn) -> heed::Result> { + self.main.get::<_, Str, OwnedType>(txn, main_key::MAX_VALUES_PER_FACET) + } + + pub(crate) fn put_max_values_per_facet(&self, txn: &mut RwTxn, val: usize) -> heed::Result<()> { + self.main.put::<_, Str, OwnedType>(txn, main_key::MAX_VALUES_PER_FACET, &val) + } + + pub(crate) fn delete_max_values_per_facet(&self, txn: 
&mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(txn, main_key::MAX_VALUES_PER_FACET) + } } #[cfg(test)] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index f28677ed8..81cd057d5 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -38,7 +38,7 @@ pub use self::heed_codec::{ pub use self::index::Index; pub use self::search::{ FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, - MatchingWords, Search, SearchResult, + MatchingWords, Search, SearchResult, DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 8069abede..b2718a490 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -15,7 +15,7 @@ use crate::{FieldId, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. -const DEFAULT_VALUES_BY_FACET: usize = 100; +pub const DEFAULT_VALUES_PER_FACET: usize = 100; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. @@ -24,7 +24,7 @@ const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option>, candidates: Option, - max_values_by_facet: usize, + max_values_per_facet: usize, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -34,7 +34,7 @@ impl<'a> FacetDistribution<'a> { FacetDistribution { facets: None, candidates: None, - max_values_by_facet: DEFAULT_VALUES_BY_FACET, + max_values_per_facet: DEFAULT_VALUES_PER_FACET, rtxn, index, } @@ -45,8 +45,8 @@ impl<'a> FacetDistribution<'a> { self } - pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self { - self.max_values_by_facet = max; + pub fn max_values_per_facet(&mut self, max: usize) -> &mut Self { + self.max_values_per_facet = max; self } @@ -82,7 +82,8 @@ impl<'a> FacetDistribution<'a> { let ((_, _, value), ()) = result?; *distribution.entry(value.to_string()).or_insert(0) += 1; - if distribution.len() - distribution_prelength == self.max_values_by_facet { + if distribution.len() - distribution_prelength == self.max_values_per_facet + { break; } } @@ -108,7 +109,7 @@ impl<'a> FacetDistribution<'a> { .or_insert_with(|| (original_value, 0)); *count += 1; - if normalized_distribution.len() == self.max_values_by_facet { + if normalized_distribution.len() == self.max_values_per_facet { break; } } @@ -141,7 +142,7 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(value.to_string(), docids.len()); } - if distribution.len() == self.max_values_by_facet { + if distribution.len() == self.max_values_per_facet { break; } } @@ -164,7 +165,7 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(original.to_string(), docids.len()); } - if distribution.len() == self.max_values_by_facet { + if distribution.len() == self.max_values_per_facet { break; } } @@ -186,7 +187,7 @@ impl<'a> FacetDistribution<'a> { for result in range { let ((_, _, value, _), docids) = result?; distribution.insert(value.to_string(), docids.len()); - if distribution.len() == self.max_values_by_facet { + if distribution.len() == self.max_values_per_facet { break; } } @@ -202,7 +203,7 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, normalized_value), (original_value, docids)) = result?; normalized_distribution.insert(normalized_value, (original_value, docids.len())); - if normalized_distribution.len() == self.max_values_by_facet { + if 
normalized_distribution.len() == self.max_values_per_facet { break; } } @@ -290,12 +291,13 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; + let FacetDistribution { facets, candidates, max_values_per_facet, rtxn: _, index: _ } = + self; f.debug_struct("FacetDistribution") .field("facets", facets) .field("candidates", candidates) - .field("max_values_by_facet", max_values_by_facet) + .field("max_values_per_facet", max_values_per_facet) .finish() } } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index c8f91352b..e3ac95882 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,4 +1,4 @@ -pub use self::facet_distribution::FacetDistribution; +pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 62a7815b0..1c363e142 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; +pub use self::facet::{FacetDistribution, FacetNumberIter, Filter, DEFAULT_VALUES_PER_FACET}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 86c168be3..ce4bfbc70 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -105,7 +105,7 @@ pub struct Settings<'a, 't, 'u, 'i> { /// Attributes on which typo tolerance is disabled. exact_attributes: Setting>, max_values_per_facet: Setting, - limit_pagination_to: Setting, + pagination_limited_to: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -132,7 +132,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { min_word_len_one_typo: Setting::NotSet, exact_attributes: Setting::NotSet, max_values_per_facet: Setting::NotSet, - limit_pagination_to: Setting::NotSet, + pagination_limited_to: Setting::NotSet, indexer_config, } } @@ -632,6 +632,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_max_values_per_facet(&mut self) -> Result<()> { + match self.max_values_per_facet { + Setting::Set(max) => { + self.index.put_max_values_per_facet(&mut self.wtxn, max)?; + } + Setting::Reset => { + self.index.delete_max_values_per_facet(&mut self.wtxn)?; + } + Setting::NotSet => (), + } + + Ok(()) + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, @@ -650,6 +664,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_authorize_typos()?; self.update_min_typo_word_len()?; self.update_exact_words()?; + self.update_max_values_per_facet()?; + self.update_pagination_limited_to()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. 
@@ -650,6 +664,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_authorize_typos()?; self.update_min_typo_word_len()?; self.update_exact_words()?; + self.update_max_values_per_facet()?; + self.update_pagination_limited_to()?; // If there are new faceted fields, we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, @@ -1546,7 +1562,7 @@ mod tests { exact_words, exact_attributes, max_values_per_facet, - limit_pagination_to, + pagination_limited_to, } = builder; assert!(matches!(searchable_fields, Setting::NotSet)); @@ -1564,6 +1580,6 @@ mod tests { assert!(matches!(exact_words, Setting::NotSet)); assert!(matches!(exact_attributes, Setting::NotSet)); assert!(matches!(max_values_per_facet, Setting::NotSet)); - assert!(matches!(limit_pagination_to, Setting::NotSet)); + assert!(matches!(pagination_limited_to, Setting::NotSet)); } } From 445d5474cca00142f8b509f9e058760f0f3ad9c9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 8 Jun 2022 17:31:21 +0200 Subject: [PATCH 1431/1889] Add the pagination_limited_to setting to the database --- milli/src/index.rs | 16 ++++++++++++++++ milli/src/update/settings.rs | 22 ++++++++++++++++++---- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f7e3aa14a..28c870592 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1101,6 +1101,22 @@ impl Index { pub(crate) fn delete_max_values_per_facet(&self, txn: &mut RwTxn) -> heed::Result<bool> { self.main.delete::<_, Str>(txn, main_key::MAX_VALUES_PER_FACET) } + + pub fn pagination_limited_to(&self, txn: &RoTxn) -> heed::Result<Option<usize>> { + self.main.get::<_, Str, OwnedType<usize>>(txn, main_key::PAGINATION_LIMITED_TO) + } + + pub(crate) fn put_pagination_limited_to( + &self, + txn: &mut RwTxn, + val: usize, + ) -> heed::Result<()> { + self.main.put::<_, Str, OwnedType<usize>>(txn, main_key::PAGINATION_LIMITED_TO, &val) + } + + pub(crate) fn delete_pagination_limited_to(&self, txn: &mut RwTxn) -> heed::Result<bool> { + self.main.delete::<_, Str>(txn, main_key::PAGINATION_LIMITED_TO) + } } #[cfg(test)] diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ce4bfbc70..174d7073d 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -258,12 +258,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.max_values_per_facet = Setting::Reset; } - pub fn set_limit_pagination_to(&mut self, value: usize) { - self.limit_pagination_to = Setting::Set(value); + pub fn set_pagination_limited_to(&mut self, value: usize) { + self.pagination_limited_to = Setting::Set(value); } - pub fn reset_limit_pagination_to(&mut self) { - self.limit_pagination_to = Setting::Reset; + pub fn reset_pagination_limited_to(&mut self) { + self.pagination_limited_to = Setting::Reset; } fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> @@ -646,6 +646,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_pagination_limited_to(&mut self) -> Result<()> { + match self.pagination_limited_to { + Setting::Set(max) => { + self.index.put_pagination_limited_to(&mut self.wtxn, max)?; + } + Setting::Reset => { + self.index.delete_pagination_limited_to(&mut self.wtxn)?; + } + Setting::NotSet => (), + } + + Ok(()) + } + pub fn execute<F>(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep) + Sync, From 90afde435bb3fb5b391dcd4d95bb6dcc3d0bc14d Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 9 Jun 2022 16:03:49 +0200 Subject: [PATCH 1432/1889] fix escaped quotes in filter --- filter-parser/src/lib.rs | 13 ++---- milli/src/search/facet/filter.rs | 79 ++++++++++++++++++++++++++++++-- 2 files changed, 78 insertions(+), 14 deletions(-)
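A sketch of the syntax under test (illustrative, not part of the patch): per issue #2380, a double-quoted filter value may contain escaped double quotes and a single-quoted value may contain escaped single quotes, so both of the following should select the same value, 27" to 30'; the filter strings are taken from the test added below:

    // both parse to the value `27" to 30'`
    let a = crate::Filter::from_str(r#"monitor_diagonal = "27\" to 30'" "#).unwrap().unwrap();
    let b = crate::Filter::from_str("monitor_diagonal = '27\" to 30\\''").unwrap().unwrap();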
diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index bad7dbc64..243d1a3f4 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -40,7 +40,6 @@ mod error; mod value; use std::fmt::Debug; -use std::ops::Deref; use std::str::FromStr; pub use condition::{parse_condition, parse_to, Condition}; @@ -70,14 +69,6 @@ pub struct Token<'a> { value: Option<String>, } -impl<'a> Deref for Token<'a> { - type Target = &'a str; - - fn deref(&self) -> &Self::Target { - &self.span - } -} - impl<'a> PartialEq for Token<'a> { fn eq(&self, other: &Self) -> bool { self.span.fragment() == other.span.fragment() @@ -89,6 +80,10 @@ impl<'a> Token<'a> { Self { span, value } } + pub fn lexeme(&self) -> &str { + &self.span + } + pub fn value(&self) -> &str { self.value.as_ref().map_or(&self.span, |value| value) } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 8f1ee749f..a809aa5fb 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,7 +1,6 @@ use std::collections::HashSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; -use std::ops::Deref; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; @@ -283,8 +282,9 @@ impl<'a> Filter<'a> { Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), Condition::Equal(val) => { - let (_original_value, string_docids) = - strings_db.get(rtxn, &(field_id, &val.to_lowercase()))?.unwrap_or_default(); + let (_original_value, string_docids) = strings_db + .get(rtxn, &(field_id, &val.value().to_lowercase()))? + .unwrap_or_default(); let number = val.parse::<f64>().ok(); let number_docids = match number { Some(n) => { @@ -362,7 +362,7 @@ impl<'a> Filter<'a> { return Ok(RoaringBitmap::new()); } } else { - match *fid.deref() { + match fid.lexeme() { attribute @ "_geo" => { return Err(fid.as_external_error(FilterError::BadGeo(attribute)))?; } @@ -461,7 +461,7 @@ mod tests { use maplit::hashset; use super::*; - use crate::update::{IndexerConfig, Settings}; + use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::Index; #[test] @@ -598,6 +598,75 @@ mod tests { )); } + #[test] + fn escaped_quote_in_filter_value_2380() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { + "id": "test_1", + "monitor_diagonal": "27' to 30'" + }, + { + "id": "test_2", + "monitor_diagonal": "27\" to 30\"" + }, + { + "id": "test_3", + "monitor_diagonal": "27\" to 30'" + }, + ]); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + builder.set_filterable_fields(hashset!(S("monitor_diagonal"))); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = crate::Search::new(&rtxn, &index); + // this filter is copy-pasted from #2380 with the exact same escape sequence + search.filter( + crate::Filter::from_str("monitor_diagonal = '27\" to 30\\''").unwrap().unwrap(), + ); + let
crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + + search.filter( + crate::Filter::from_str(r#"monitor_diagonal = "27' to 30'" "#).unwrap().unwrap(), + ); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + + search.filter( + crate::Filter::from_str(r#"monitor_diagonal = "27\" to 30\"" "#).unwrap().unwrap(), + ); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + + search.filter( + crate::Filter::from_str(r#"monitor_diagonal = "27\" to 30'" "#).unwrap().unwrap(), + ); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + } + #[test] fn geo_radius_error() { let path = tempfile::tempdir().unwrap(); From 676187ba430019c2f86c7836332b592a100bcd5e Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 9 Jun 2022 16:53:32 +0200 Subject: [PATCH 1433/1889] bump milli version --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 44507d527..2c6c93bd8 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.29.2" +version = "0.29.3" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 01dc243b7..20dc25c28 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.29.2" +version = "0.29.3" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index d20407de2..e8d54ab4f 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.29.2" +version = "0.29.3" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 014d10ad6..ca6332a50 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.29.2" +version = "0.29.3" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 41573268f..cdf7c3b2e 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.29.2" +version = "0.29.3" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 493d54e12..34e9df773 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.29.2" +version = "0.29.3" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 69ae2e9de..169cc7e08 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.29.2" +version = "0.29.3" authors = ["Clément Renault "] edition = "2018" 
publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 8383aad0a..8bc092ad0 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.29.2" +version = "0.29.3" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 96f48707d..70bb8f585 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.29.2" +version = "0.29.3" authors = ["Kerollmops "] edition = "2018" From 0d1d35405275f80f3b13a19baad06ae4db738d2a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 13 Jun 2022 16:39:17 +0200 Subject: [PATCH 1434/1889] Ensure that Index methods are not bypassed by Meilisearch --- cli/src/main.rs | 6 +++--- infos/src/main.rs | 35 +++++++++++++++-------------------- milli/src/index.rs | 6 +++--- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 97580142b..14bc797af 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -229,7 +229,7 @@ impl Performer for DocumentAddition { println!("Adding {} documents to the index.", reader.len()); - let mut txn = index.env.write_txn()?; + let mut txn = index.write_txn()?; let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; let update_method = if self.update_documents { IndexDocumentsMethod::UpdateDocuments @@ -424,7 +424,7 @@ impl Search { offset: &Option<usize>, limit: &Option<usize>, ) -> Result<Vec<Map<String, Value>>> { - let txn = index.env.read_txn()?; + let txn = index.read_txn()?; let mut search = index.search(&txn); if let Some(ref query) = query { @@ -475,7 +475,7 @@ struct SettingsUpdate { impl Performer for SettingsUpdate { fn perform(self, index: milli::Index) -> Result<()> { - let mut txn = index.env.write_txn()?; + let mut txn = index.write_txn()?; let config = IndexerConfig { log_every_n: Some(100), ..Default::default() }; diff --git a/infos/src/main.rs b/infos/src/main.rs index 05c168233..49fa685aa 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -371,11 +371,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho use std::cmp::Reverse; use std::collections::BinaryHeap; - use heed::types::{ByteSlice, Str}; + use heed::types::ByteSlice; let Index { - env: _env, - main, word_docids, word_prefix_docids, docid_word_positions, @@ -390,7 +388,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho exact_word_prefix_docids, field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, - documents, + .. } = index; let main_name = "main"; @@ -425,11 +423,10 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho heap.pop(); } - if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { - heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); - if heap.len() > limit { - heap.pop(); - } + let documents_ids = index.documents_ids(rtxn)?; + heap.push(Reverse((documents_ids.len() as usize, format!("documents-ids"), main_name))); + if heap.len() > limit { + heap.pop(); } for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { @@ -549,9 +546,10 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } } - for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)?
{ let (id, value) = result?; - heap.push(Reverse((value.len(), id.to_string(), documents_name))); + let size = value.iter().map(|(k, v)| k.to_ne_bytes().len() + v.len()).sum(); + heap.push(Reverse((size, id.to_string(), documents_name))); if heap.len() > limit { heap.pop(); } @@ -877,7 +875,7 @@ fn export_documents( ) -> anyhow::Result<()> { use std::io::{BufWriter, Write as _}; - use milli::{obkv_to_json, BEU32}; + use milli::obkv_to_json; let stdout = io::stdout(); let mut out = BufWriter::new(stdout); @@ -886,12 +884,13 @@ fn export_documents( let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); let iter: Box> = if internal_ids.is_empty() { - Box::new(index.documents.iter(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv))) + Box::new(index.all_documents(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv))) } else { Box::new( - internal_ids + index + .documents(rtxn, internal_ids.into_iter())? .into_iter() - .flat_map(|id| index.documents.get(rtxn, &BEU32::new(id)).transpose()), + .map(|(_id, obkv)| Ok(obkv)), ) }; @@ -973,8 +972,6 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a use heed::types::ByteSlice; let Index { - env: _env, - main, word_docids, word_prefix_docids, docid_word_positions, @@ -989,7 +986,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a field_id_docid_facet_strings, exact_word_prefix_docids, exact_word_docids, - documents, + .. } = index; let names = if names.is_empty() { @@ -1000,7 +997,6 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a for name in names { let database = match name.as_str() { - MAIN => &main, WORD_PREFIX_DOCIDS => word_prefix_docids.as_polymorph(), WORD_DOCIDS => word_docids.as_polymorph(), DOCID_WORD_POSITIONS => docid_word_positions.as_polymorph(), @@ -1016,7 +1012,6 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(), EXACT_WORD_PREFIX_DOCIDS => exact_word_prefix_docids.as_polymorph(), - DOCUMENTS => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), };
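A sketch of the intended replacement pattern (illustrative only): with `env`, `main` and `documents` made `pub(crate)` in the hunk below, external tools such as `cli` and `infos` read documents through the public accessors used above, for example:

    let rtxn = index.read_txn()?;
    for result in index.all_documents(&rtxn)? {
        let (docid, document) = result?;
        // `document` is an obkv key-value reader, one entry per field id
    }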
diff --git a/milli/src/index.rs b/milli/src/index.rs index 28c870592..2cb90284b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -82,10 +82,10 @@ pub mod db_name { #[derive(Clone)] pub struct Index { /// The LMDB environment which this index is associated with. - pub env: heed::Env, + pub(crate) env: heed::Env, /// Contains many different types (e.g. the fields ids map). - pub main: PolyDatabase, + pub(crate) main: PolyDatabase, /// A word and all the documents ids containing the word. pub word_docids: Database<Str, RoaringBitmapCodec>, @@ -125,7 +125,7 @@ pub struct Index { pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>, /// Maps the document id to the document as an obkv store. - pub documents: Database<OwnedType<BEU32>, ObkvCodec>, + pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>, } impl Index { From 177154828cbcf60f5cc47beace0ab10ae7bc768f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 13 Jun 2022 16:39:33 +0200 Subject: [PATCH 1435/1889] Extends deletion tests --- milli/src/update/delete_documents.rs | 386 +++++++++++++++++++++------ 1 file changed, 299 insertions(+), 87 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 97250d988..ef4f849cc 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -622,16 +622,46 @@ where #[cfg(test)] mod tests { - use std::collections::HashSet; - use big_s::S; - use heed::EnvOpenOptions; + use heed::{EnvOpenOptions, RwTxn}; use maplit::hashset; use super::*; use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::Filter; + fn insert_documents<'t, R: std::io::Read + std::io::Seek>( + wtxn: &mut RwTxn<'t, '_>, + index: &'t Index, + documents: crate::documents::DocumentBatchReader<R>, + ) { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + } + + fn delete_documents<'t>( + wtxn: &mut RwTxn<'t, '_>, + index: &'t Index, + external_ids: &[&str], + ) -> Vec<u32> { + let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); + let ids_to_delete: Vec<u32> = external_ids + .iter() + .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) + .collect(); + + // Delete some documents. + let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); + external_ids.iter().for_each(|id| drop(builder.delete_external_id(id))); + builder.execute().unwrap(); + + ids_to_delete + } + #[test] fn delete_documents_with_numbers_as_primary_key() { let path = tempfile::tempdir().unwrap(); @@ -697,7 +727,7 @@ mod tests { } #[test] - fn delete_documents_with_filterable_attributes() { + fn filtered_placeholder_search_should_not_return_deleted_documents() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB let index = Index::new(options, &path).unwrap(); let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_primary_key(S("docid")); builder.set_filterable_fields(hashset!(S("label"))); builder.execute(|_| ()).unwrap(); let content = documents!([ {"docid":"1_4","label":"sign"}, {"docid":"1_5","label":"letter"}, {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, {"docid":"1_36","label":"drawing,painting,pattern"}, {"docid":"1_37","label":"art,drawing,outdoor"}, {"docid":"1_38","label":"aquarium,art,drawing"}, {"docid":"1_39","label":"abstract"}, {"docid":"1_40","label":"cartoon"}, {"docid":"1_41","label":"art,drawing"}, {"docid":"1_42","label":"art,pattern"}, {"docid":"1_43","label":"abstract,art,drawing,pattern"}, {"docid":"1_44","label":"drawing"}, {"docid":"1_45","label":"art"}, {"docid":"1_46","label":"abstract,colorfulness,pattern"}, {"docid":"1_47","label":"abstract,pattern"}, {"docid":"1_52","label":"abstract,cartoon"}, {"docid":"1_57","label":"abstract,drawing,pattern"}, {"docid":"1_58","label":"abstract,art,cartoon"}, {"docid":"1_68","label":"design"}, {"docid":"1_69","label":"geometry"} ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); - - // Delete not all of the documents but some of them.
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.delete_external_id("1_4"); - builder.execute().unwrap(); + insert_documents(&mut wtxn, &index, content); + delete_documents(&mut wtxn, &index, &["1_4"]); + // Placeholder search with filter let filter = Filter::from_str("label = sign").unwrap().unwrap(); let results = index.search(&wtxn).filter(filter).execute().unwrap(); assert!(results.documents_ids.is_empty()); @@ -753,7 +775,113 @@ mod tests { } #[test] - fn delete_documents_with_geo_points() { + fn placeholder_search_should_not_return_deleted_documents() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key(S("docid")); + builder.execute(|_| ()).unwrap(); + + let content = documents!([ + {"docid":"1_4","label":"sign"}, + {"docid":"1_5","label":"letter"}, + {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, + {"docid":"1_36","label":"drawing,painting,pattern"}, + {"docid":"1_37","label":"art,drawing,outdoor"}, + {"docid":"1_38","label":"aquarium,art,drawing"}, + {"docid":"1_39","label":"abstract"}, + {"docid":"1_40","label":"cartoon"}, + {"docid":"1_41","label":"art,drawing"}, + {"docid":"1_42","label":"art,pattern"}, + {"docid":"1_43","label":"abstract,art,drawing,pattern"}, + {"docid":"1_44","label":"drawing"}, + {"docid":"1_45","label":"art"}, + {"docid":"1_46","label":"abstract,colorfulness,pattern"}, + {"docid":"1_47","label":"abstract,pattern"}, + {"docid":"1_52","label":"abstract,cartoon"}, + {"docid":"1_57","label":"abstract,drawing,pattern"}, + {"docid":"1_58","label":"abstract,art,cartoon"}, + {"docid":"1_68","label":"design"}, + {"docid":"1_69","label":"geometry"} + ]); + + insert_documents(&mut wtxn, &index, content); + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); + + // Placeholder search + let results = index.search(&wtxn).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn search_should_not_return_deleted_documents() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key(S("docid")); + builder.execute(|_| ()).unwrap(); + + let content = documents!([ + {"docid":"1_4","label":"sign"}, + {"docid":"1_5","label":"letter"}, + {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, + {"docid":"1_36","label":"drawing,painting,pattern"}, + {"docid":"1_37","label":"art,drawing,outdoor"}, + {"docid":"1_38","label":"aquarium,art,drawing"}, + {"docid":"1_39","label":"abstract"}, + {"docid":"1_40","label":"cartoon"}, + {"docid":"1_41","label":"art,drawing"}, + {"docid":"1_42","label":"art,pattern"}, + {"docid":"1_43","label":"abstract,art,drawing,pattern"}, + {"docid":"1_44","label":"drawing"}, + {"docid":"1_45","label":"art"}, + {"docid":"1_46","label":"abstract,colorfulness,pattern"}, + 
{"docid":"1_47","label":"abstract,pattern"}, + {"docid":"1_52","label":"abstract,cartoon"}, + {"docid":"1_57","label":"abstract,drawing,pattern"}, + {"docid":"1_58","label":"abstract,art,cartoon"}, + {"docid":"1_68","label":"design"}, + {"docid":"1_69","label":"geometry"} + ]); + + insert_documents(&mut wtxn, &index, content); + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + + // search for abstract + let results = index.search(&wtxn).query("abstract").execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -768,82 +896,166 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"id":"1","city":"Lille", "_geo": { "lat": 50.629973371633746, "lng": 3.0569447399419570 } }, - {"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.641586120121050, "lng": 3.1106593480348670 } }, - {"id":"3","city":"Hellemmes", "_geo": { "lat": 50.631220965518080, "lng": 3.1106399673339933 } }, - {"id":"4","city":"Villeneuve-d'Ascq", "_geo": { "lat": 50.622468098014565, "lng": 3.1476425513437140 } }, - {"id":"5","city":"Hem", "_geo": { "lat": 50.655250871381355, "lng": 3.1897297266244130 } }, - {"id":"6","city":"Roubaix", "_geo": { "lat": 50.692473451896710, "lng": 3.1763326737747650 } }, - {"id":"7","city":"Tourcoing", "_geo": { "lat": 50.726397466736480, "lng": 3.1541653659578670 } }, - {"id":"8","city":"Mouscron", "_geo": { "lat": 50.745325554908610, "lng": 3.2206407854429853 } }, - {"id":"9","city":"Tournai", "_geo": { "lat": 50.605342528602630, "lng": 3.3758586941351414 } }, - {"id":"10","city":"Ghent", "_geo": { "lat": 51.053777403679035, "lng": 3.6957733119926930 } }, - {"id":"11","city":"Brussels", "_geo": { "lat": 50.846640974544690, "lng": 4.3370663564281840 } }, - {"id":"12","city":"Charleroi", "_geo": { "lat": 50.409570138889480, "lng": 4.4347354315085520 } }, - {"id":"13","city":"Mons", "_geo": { "lat": 50.450294178855420, "lng": 3.9623722870904690 } }, - {"id":"14","city":"Valenciennes", "_geo": { "lat": 50.351817774473545, "lng": 3.5326283646928800 } }, - {"id":"15","city":"Arras", "_geo": { "lat": 50.284487528579950, "lng": 2.7637515844478160 } }, - {"id":"16","city":"Cambrai", "_geo": { "lat": 50.179340577906700, "lng": 3.2189409952502930 } }, - {"id":"17","city":"Bapaume", "_geo": { "lat": 50.111276127236400, "lng": 2.8547894666083120 } }, - {"id":"18","city":"Amiens", "_geo": { "lat": 49.931472529669996, "lng": 2.2710499758317080 } }, - {"id":"19","city":"Compiègne", "_geo": { "lat": 49.444980887725656, "lng": 2.7913841281529015 } }, - {"id":"20","city":"Paris", "_geo": { "lat": 48.902100060895480, "lng": 2.3708400867406930 } } + {"id":"1","city":"Lille", "_geo": { "lat": 50.6299 as f32, "lng": 3.0569 as f32 } }, + {"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.6415 as f32, "lng": 3.1106 as f32 } }, + {"id":"3","city":"Hellemmes", "_geo": { "lat": 50.6312 as f32, "lng": 3.1106 as f32 } }, + {"id":"4","city":"Villeneuve-d'Ascq", "_geo": { "lat": 50.6224 as f32, "lng": 3.1476 as f32 } }, + {"id":"5","city":"Hem", "_geo": { "lat": 50.6552 as f32, "lng": 3.1897 as f32 } }, + {"id":"6","city":"Roubaix", "_geo": { "lat": 
50.6924 as f32, "lng": 3.1763 as f32 } }, + {"id":"7","city":"Tourcoing", "_geo": { "lat": 50.7263 as f32, "lng": 3.1541 as f32 } }, + {"id":"8","city":"Mouscron", "_geo": { "lat": 50.7453 as f32, "lng": 3.2206 as f32 } }, + {"id":"9","city":"Tournai", "_geo": { "lat": 50.6053 as f32, "lng": 3.3758 as f32 } }, + {"id":"10","city":"Ghent", "_geo": { "lat": 51.0537 as f32, "lng": 3.6957 as f32 } }, + {"id":"11","city":"Brussels", "_geo": { "lat": 50.8466 as f32, "lng": 4.3370 as f32 } }, + {"id":"12","city":"Charleroi", "_geo": { "lat": 50.4095 as f32, "lng": 4.4347 as f32 } }, + {"id":"13","city":"Mons", "_geo": { "lat": 50.4502 as f32, "lng": 3.9623 as f32 } }, + {"id":"14","city":"Valenciennes", "_geo": { "lat": 50.3518 as f32, "lng": 3.5326 as f32 } }, + {"id":"15","city":"Arras", "_geo": { "lat": 50.2844 as f32, "lng": 2.7637 as f32 } }, + {"id":"16","city":"Cambrai", "_geo": { "lat": 50.1793 as f32, "lng": 3.2189 as f32 } }, + {"id":"17","city":"Bapaume", "_geo": { "lat": 50.1112 as f32, "lng": 2.8547 as f32 } }, + {"id":"18","city":"Amiens", "_geo": { "lat": 49.9314 as f32, "lng": 2.2710 as f32 } }, + {"id":"19","city":"Compiègne", "_geo": { "lat": 49.4449 as f32, "lng": 2.7913 as f32 } }, + {"id":"20","city":"Paris", "_geo": { "lat": 48.9021 as f32, "lng": 2.3708 as f32 } } ]); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - let indexing_config = IndexDocumentsConfig::default(); + insert_documents(&mut wtxn, &index, content); + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); - - let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); - let ids_to_delete: Vec = external_ids_to_delete - .iter() - .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) - .collect(); - - // Delete some documents. 
- let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - external_ids_to_delete.iter().for_each(|id| drop(builder.delete_external_id(id))); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - let rtree = index.geo_rtree(&rtxn).unwrap().unwrap(); - let geo_faceted_doc_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); - - let all_geo_ids = rtree.iter().map(|point| point.data).collect::<Vec<_>>(); - let all_geo_documents = index - .documents(&rtxn, all_geo_ids.iter().map(|(id, _)| id).copied()) - .unwrap() - .iter() - .map(|(id, _)| *id) - .collect::<HashSet<_>>(); - - let all_geo_faceted_ids = geo_faceted_doc_ids.iter().collect::<Vec<_>>(); - let all_geo_faceted_documents = index - .documents(&rtxn, all_geo_faceted_ids.iter().copied()) - .unwrap() - .iter() - .map(|(id, _)| *id) - .collect::<HashSet<_>>(); - - assert_eq!( - all_geo_documents, all_geo_faceted_documents, - "There is an inconsistency between the geo_faceted database and the rtree" - ); - - for id in all_geo_documents.iter() { - assert!(!ids_to_delete.contains(&id), "The document {} was supposed to be deleted", id); + // Placeholder search with geo filter + let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); } - assert_eq!( - all_geo_ids.len(), - all_geo_documents.len(), - "We deleted documents that were not supposed to be deleted" - ); + wtxn.commit().unwrap(); + } + + #[test] + fn get_documents_should_not_return_deleted_documents() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key(S("docid")); + builder.execute(|_| ()).unwrap(); + + let content = documents!([ + {"docid":"1_4","label":"sign"}, + {"docid":"1_5","label":"letter"}, + {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, + {"docid":"1_36","label":"drawing,painting,pattern"}, + {"docid":"1_37","label":"art,drawing,outdoor"}, + {"docid":"1_38","label":"aquarium,art,drawing"}, + {"docid":"1_39","label":"abstract"}, + {"docid":"1_40","label":"cartoon"}, + {"docid":"1_41","label":"art,drawing"}, + {"docid":"1_42","label":"art,pattern"}, + {"docid":"1_43","label":"abstract,art,drawing,pattern"}, + {"docid":"1_44","label":"drawing"}, + {"docid":"1_45","label":"art"}, + {"docid":"1_46","label":"abstract,colorfulness,pattern"}, + {"docid":"1_47","label":"abstract,pattern"}, + {"docid":"1_52","label":"abstract,cartoon"}, + {"docid":"1_57","label":"abstract,drawing,pattern"}, + {"docid":"1_58","label":"abstract,art,cartoon"}, + {"docid":"1_68","label":"design"}, + {"docid":"1_69","label":"geometry"} + ]); + + insert_documents(&mut wtxn, &index, content); + let deleted_external_ids = ["1_7", "1_52"]; + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); + + // list all documents + let results = index.all_documents(&wtxn).unwrap(); + for result in results { + let (id, _) = result.unwrap(); + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + }
+ // list internal document ids + let results = index.documents_ids(&wtxn).unwrap(); + for id in results { + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + + // get internal docids from deleted external document ids + let results = index.external_documents_ids(&wtxn).unwrap(); + for id in deleted_external_ids { + assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn stats_should_not_return_deleted_documents() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key(S("docid")); + builder.execute(|_| ()).unwrap(); + + let content = documents!([ + {"docid":"1_4","label":"sign"}, + {"docid":"1_5","label":"letter"}, + {"docid":"1_7","label":"abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, + {"docid":"1_36","label":"drawing,painting,pattern"}, + {"docid":"1_37","label":"art,drawing,outdoor"}, + {"docid":"1_38","label":"aquarium,art,drawing", "title": "Nemo"}, + {"docid":"1_39","label":"abstract"}, + {"docid":"1_40","label":"cartoon"}, + {"docid":"1_41","label":"art,drawing"}, + {"docid":"1_42","label":"art,pattern"}, + {"docid":"1_43","label":"abstract,art,drawing,pattern", "number": 32i32}, + {"docid":"1_44","label":"drawing", "number": 44i32}, + {"docid":"1_45","label":"art"}, + {"docid":"1_46","label":"abstract,colorfulness,pattern"}, + {"docid":"1_47","label":"abstract,pattern"}, + {"docid":"1_52","label":"abstract,cartoon"}, + {"docid":"1_57","label":"abstract,drawing,pattern"}, + {"docid":"1_58","label":"abstract,art,cartoon"}, + {"docid":"1_68","label":"design"}, + {"docid":"1_69","label":"geometry"} + ]); + + insert_documents(&mut wtxn, &index, content); + delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + + // count internal documents + let results = index.number_of_documents(&wtxn).unwrap(); + assert_eq!(18, results); + + // count field distribution + let results = index.field_distribution(&wtxn).unwrap(); + assert_eq!(Some(&18), results.get("label")); + assert_eq!(Some(&1), results.get("title")); + assert_eq!(Some(&2), results.get("number")); + + wtxn.commit().unwrap(); } } From 447195a27a8c1b1536ffc76d169a5a376bcdfe92 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 14 Jun 2022 10:32:44 +0200 Subject: [PATCH 1436/1889] Replace format by to_string --- infos/src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 49fa685aa..29a87cdcf 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -410,7 +410,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho // Fetch the words FST let words_fst = index.words_fst(rtxn)?; let length = words_fst.as_fst().as_bytes().len(); - heap.push(Reverse((length, format!("words-fst"), main_name))); + heap.push(Reverse((length, "words-fst".to_string(), main_name))); if heap.len() > limit { heap.pop(); } @@ -418,13 +418,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho // Fetch the word prefix FST let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; let length = words_prefixes_fst.as_fst().as_bytes().len(); - heap.push(Reverse((length, 
format!("words-prefixes-fst"), main_name))); + heap.push(Reverse((length, "words-prefixes-fst".to_string(), main_name))); if heap.len() > limit { heap.pop(); } let documents_ids = index.documents_ids(rtxn)?; - heap.push(Reverse((documents_ids.len() as usize, format!("documents-ids"), main_name))); + heap.push(Reverse((documents_ids.len() as usize, "documents-ids".to_string(), main_name))); if heap.len() > limit { heap.pop(); } From adbb0ff3186ccbd598f92b02b2c76235319c0b36 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 16 Jun 2022 10:17:58 +0200 Subject: [PATCH 1437/1889] Add deletion benchmarks --- benchmarks/Cargo.toml | 3 + benchmarks/benches/indexing.rs | 771 ++++++++++++++++++++++----------- 2 files changed, 519 insertions(+), 255 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 2c6c93bd8..c64a83c51 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -16,6 +16,9 @@ jemallocator = "0.3.2" [dev-dependencies] heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } criterion = { version = "0.3.5", features = ["html_reports"] } +rand = "0.8.5" +rand_chacha = "0.3.1" +roaring = "0.9.0" [build-dependencies] anyhow = "1.0.56" diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 091c081b2..b773eca65 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -5,14 +5,21 @@ use std::fs::{create_dir_all, remove_dir_all}; use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; -use heed::EnvOpenOptions; -use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use heed::{EnvOpenOptions, RwTxn}; +use milli::update::{ + DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, +}; use milli::Index; +use rand::seq::SliceRandom; +use rand_chacha::rand_core::SeedableRng; +use roaring::RoaringBitmap; #[cfg(target_os = "linux")] #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +const BENCHMARK_ITERATION: usize = 10; + fn setup_dir(path: impl AsRef) { match remove_dir_all(path.as_ref()) { Ok(_) => (), @@ -31,39 +38,95 @@ fn setup_index() -> Index { Index::new(options, path).unwrap() } +fn setup_settings<'t>( + wtxn: &mut RwTxn<'t, '_>, + index: &'t Index, + primary_key: &str, + searchable_fields: &[&str], + filterable_fields: &[&str], + sortable_fields: &[&str], +) { + let config = IndexerConfig::default(); + let mut builder = Settings::new(wtxn, index, &config); + + builder.set_primary_key(primary_key.to_owned()); + + let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(filterable_fields); + + let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_sortable_fields(sortable_fields); + + builder.execute(|_| ()).unwrap(); +} + +fn setup_index_with_settings<'t>( + primary_key: &str, + searchable_fields: &[&str], + filterable_fields: &[&str], + sortable_fields: &[&str], +) -> milli::Index { + let index = setup_index(); + let mut wtxn = index.write_txn().unwrap(); + setup_settings( + &mut wtxn, + &index, + primary_key, + searchable_fields, + filterable_fields, + sortable_fields, + ); + wtxn.commit().unwrap(); + + index +} + +fn choose_document_ids_from_index_batched( + index: &Index, + count: usize, + batch_size: usize, +) -> Vec { + let rtxn = 
index.read_txn().unwrap(); + // create batch of document ids to delete + let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7700); + let document_ids: Vec<_> = index.documents_ids(&rtxn).unwrap().into_iter().collect(); + let document_ids_to_delete: Vec<_> = + document_ids.choose_multiple(&mut rng, count).map(Clone::clone).collect(); + + document_ids_to_delete + .chunks(batch_size) + .map(|c| { + let mut batch = RoaringBitmap::new(); + for id in c { + batch.insert(*id); + } + + batch + }) + .collect() +} + fn indexing_songs_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing songs with default settings", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"]; + let sortable_fields = []; - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = - ["title", "album", "artist", "genre", "country", "released", "duration"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = - ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = - ["released-timestamp", "duration-float", "genre", "country", "artist"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - index + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) }, move |index| { let config = IndexerConfig::default(); @@ -84,41 +147,85 @@ fn indexing_songs_default(c: &mut Criterion) { }); } -fn indexing_songs_in_three_batches_default(c: &mut Criterion) { +fn deleting_songs_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); - group.bench_function("Indexing songs in three batches with default settings", |b| { + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Deleting songs in batches with default settings", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"]; + let sortable_fields = []; - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = - ["title", "album", "artist", "genre", "country", "released", "duration"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = - ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = - ["released-timestamp", "duration-float", "genre", "country", "artist"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); + let index = 
setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); // We index only one half of the dataset in the setup part // as we don't care about the time it takes. let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_songs_in_three_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing songs in three batches with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. 
+ let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) @@ -160,34 +267,21 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing songs without faceted numbers", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = ["genre", "country", "artist"]; + let sortable_fields = []; - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = - ["title", "album", "artist", "genre", "country", "released", "duration"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = - ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = - ["genre", "country", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - index + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) }, move |index| { let config = IndexerConfig::default(); @@ -211,30 +305,21 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing songs without any facets", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = []; + let sortable_fields = []; - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = - ["title", "album", "artist", "genre", "country", "released", "duration"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = - ["title", "album", "artist"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - index + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) }, move |index| { let config = IndexerConfig::default(); @@ -257,29 +342,21 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { fn indexing_wiki(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing wiki", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "id"; + let searchable_fields = ["title", "body"]; + let filterable_fields = []; + let sortable_fields = []; - let config = IndexerConfig::default(); - let mut 
wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = - ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - // there is NO faceted fields at all - - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - index + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) }, move |index| { let config = IndexerConfig::default(); @@ -301,28 +378,81 @@ fn indexing_wiki(c: &mut Criterion) { }); } +fn deleting_wiki_in_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Deleting wiki in batches with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "body"]; + let filterable_fields = []; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_wiki_in_three_batches(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing wiki in three batches", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "id"; + let searchable_fields = ["title", "body"]; + let filterable_fields = []; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); - let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = - ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - // there is NO faceted fields at all - builder.execute(|_| ()).unwrap(); // We index only one half of the 
dataset in the setup part // as we don't care about the time it takes. @@ -376,34 +506,21 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { fn indexing_movies_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing movies with default settings", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "id"; + let searchable_fields = ["title", "overview"]; + let filterable_fields = ["released_date", "genres"]; + let sortable_fields = []; - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = ["title", "poster", "overview", "release_date", "genres"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = - ["title", "overview"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = - ["released_date", "genres"].iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(faceted_fields); - - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - index + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) }, move |index| { let config = IndexerConfig::default(); @@ -424,35 +541,80 @@ fn indexing_movies_default(c: &mut Criterion) { }); } +fn deleting_movies_in_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Deleting movies in batches with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "overview"]; + let filterable_fields = ["released_date", "genres"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. 
+ let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn indexing_movies_in_three_batches(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing movies in three batches", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "id"; + let searchable_fields = ["title", "overview"]; + let filterable_fields = ["released_date", "genres"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); - let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); - let displayed_fields = ["title", "poster", "overview", "release_date", "genres"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = - ["title", "overview"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = - ["released_date", "genres"].iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(faceted_fields); - - builder.execute(|_| ()).unwrap(); - // We index only one half of the dataset in the setup part // as we don't care about the time it takes. 
let config = IndexerConfig::default(); @@ -500,17 +662,11 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { fn indexing_nested_movies_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing nested movies with default settings", |b| { b.iter_with_setup( move || { - let index = setup_index(); - - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); + let primary_key = "id"; let searchable_fields = [ "title", "overview", @@ -519,12 +675,7 @@ fn indexing_nested_movies_default(c: &mut Criterion) { "crew.name", "cast.character", "cast.name", - ] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_searchable_fields(searchable_fields); - + ]; let filterable_fields = [ "popularity", "release_date", @@ -540,21 +691,15 @@ fn indexing_nested_movies_default(c: &mut Criterion) { "crew.name", "cast.character", "cast.name", - ] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_filterable_fields(filterable_fields); + ]; + let sortable_fields = ["popularity", "runtime", "vote_average", "release_date"]; - let sortable_fields = ["popularity", "runtime", "vote_average", "release_date"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_sortable_fields(sortable_fields); - - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - index + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) }, move |index| { let config = IndexerConfig::default(); @@ -575,19 +720,13 @@ fn indexing_nested_movies_default(c: &mut Criterion) { }); } -fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { +fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); - group.bench_function("Indexing nested movies without any facets", |b| { + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Deleting nested movies in batches with default settings", |b| { b.iter_with_setup( move || { - let index = setup_index(); - - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("id".to_owned()); + let primary_key = "id"; let searchable_fields = [ "title", "overview", @@ -596,14 +735,94 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { "crew.name", "cast.character", "cast.name", - ] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_searchable_fields(searchable_fields); - builder.execute(|_| ()).unwrap(); + ]; + let filterable_fields = [ + "popularity", + "release_date", + "runtime", + "vote_average", + "external_ids", + "keywords", + "providers.buy.name", + "providers.rent.name", + "providers.flatrate.name", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ]; + let sortable_fields = ["popularity", "runtime", "vote_average", "release_date"]; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. 
+ let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); - index + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing nested movies without any facets", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = [ + "title", + "overview", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ]; + let filterable_fields = []; + let sortable_fields = []; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) }, move |index| { let config = IndexerConfig::default(); @@ -626,39 +845,21 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { fn indexing_geo(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); - group.sample_size(10); + group.sample_size(BENCHMARK_ITERATION); group.bench_function("Indexing geo_point", |b| { b.iter_with_setup( move || { - let index = setup_index(); + let primary_key = "geonameid"; + let searchable_fields = ["name", "alternatenames", "elevation"]; + let filterable_fields = ["_geo", "population", "elevation"]; + let sortable_fields = ["_geo", "population", "elevation"]; - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_primary_key("geonameid".to_owned()); - let displayed_fields = - ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"] - .iter() - .map(|s| s.to_string()) - .collect(); - builder.set_displayed_fields(displayed_fields); - - let searchable_fields = - ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - - let filterable_fields = - ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(filterable_fields); - - let sortable_fields = - ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect(); - builder.set_sortable_fields(sortable_fields); - - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - index + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) }, move |index| { let config = IndexerConfig::default(); @@ -680,18 +881,78 @@ fn indexing_geo(c: &mut Criterion) { }); } +fn deleting_geo_in_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + 
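Note also how every measured routine in this file ends: `index.prepare_for_closing().wait()`. Closing the LMDB environment is not instantaneous, and `prepare_for_closing` consumes the `Index` and returns heed's `EnvClosingEvent` (the same type mentioned in the `copy_to_path` patch further down), so waiting on it guarantees each iteration fully tears its environment down before the next setup starts. In isolation:

```rust
// Sketch of the shared teardown step: the returned event resolves
// once the environment is actually closed, and `wait` blocks on it.
fn close_index(index: milli::Index) {
    index.prepare_for_closing().wait();
}
```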
group.bench_function("Deleting geo_point in batches with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "geonameid"; + let searchable_fields = ["name", "alternatenames", "elevation"]; + let filterable_fields = ["_geo", "population", "elevation"]; + let sortable_fields = ["_geo", "population", "elevation"]; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + criterion_group!( benches, indexing_songs_default, + deleting_songs_in_batches_default, indexing_songs_without_faceted_numbers, indexing_songs_without_faceted_fields, indexing_songs_in_three_batches_default, indexing_wiki, + deleting_wiki_in_batches_default, indexing_wiki_in_three_batches, indexing_movies_default, + deleting_movies_in_batches_default, indexing_movies_in_three_batches, indexing_nested_movies_default, + deleting_nested_movies_in_batches_default, indexing_nested_movies_without_faceted_fields, - indexing_geo + indexing_geo, + deleting_geo_in_batches_default ); criterion_main!(benches); From 2652310f2abd9ac850ece4869a30ffe7ad757fd5 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 16 Jun 2022 10:32:58 +0200 Subject: [PATCH 1438/1889] Change delete benchmark names --- benchmarks/benches/indexing.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index b773eca65..9af7f6429 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -150,7 +150,7 @@ fn indexing_songs_default(c: &mut Criterion) { fn deleting_songs_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); - group.bench_function("Deleting songs in batches with default settings", |b| { + group.bench_function("-songs-delete-facetedString-facetedNumber-searchable-", |b| { b.iter_with_setup( move || { let primary_key = "id"; @@ -381,7 +381,7 @@ fn indexing_wiki(c: &mut Criterion) { fn deleting_wiki_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); - group.bench_function("Deleting wiki in batches with default settings", |b| { + group.bench_function("-wiki-delete-searchable-", |b| { b.iter_with_setup( move || { let primary_key = "id"; @@ -544,7 +544,7 @@ fn indexing_movies_default(c: &mut Criterion) { fn 
deleting_movies_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); - group.bench_function("Deleting movies in batches with default settings", |b| { + group.bench_function("-movies-delete-facetedString-facetedNumber-searchable-", |b| { b.iter_with_setup( move || { let primary_key = "id"; @@ -723,7 +723,7 @@ fn indexing_nested_movies_default(c: &mut Criterion) { fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); - group.bench_function("Deleting nested movies in batches with default settings", |b| { + group.bench_function("-movies-delete-facetedString-facetedNumber-searchable-nested-", |b| { b.iter_with_setup( move || { let primary_key = "id"; @@ -884,7 +884,7 @@ fn indexing_geo(c: &mut Criterion) { fn deleting_geo_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); - group.bench_function("Deleting geo_point in batches with default settings", |b| { + group.bench_function("-geo-delete-facetedNumber-facetedGeo-searchable-", |b| { b.iter_with_setup( move || { let primary_key = "geonameid"; From a0ab90a4d7475045d4f8ceb557e1647d0f8a244c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 16 Jun 2022 18:23:57 +0200 Subject: [PATCH 1439/1889] Avoid having an ending separator before crop marker --- milli/src/search/matches/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index ba2e8728e..e2bde3daf 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -262,7 +262,11 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { (Some(before_token_kind), Some(after_token_kind)) => { if before_token_kind == after_token_kind { before_tokens.next(); - after_tokens.next(); + + // this avoids having an ending separator before the crop marker.
+ if remaining_words > 1 { + after_tokens.next(); + } } else if before_token_kind == SeparatorKind::Hard { after_tokens.next(); } else { From 31f749b5d8e645260b4986dbc86139daa3c6dcee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 20 Jun 2022 12:09:57 +0200 Subject: [PATCH 1440/1889] Update version for next release (v0.30.0) --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index c64a83c51..ab8b89d6a 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.29.3" +version = "0.30.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 20dc25c28..2f38f8add 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.29.3" +version = "0.30.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index e8d54ab4f..77d0f58da 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.29.3" +version = "0.30.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index ca6332a50..ac8b5e950 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.29.3" +version = "0.30.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index cdf7c3b2e..5dc7ec65a 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.29.3" +version = "0.30.0" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 34e9df773..2f95cd7e4 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.29.3" +version = "0.30.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 169cc7e08..be96ab57f 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.29.3" +version = "0.30.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 8bc092ad0..92199cd67 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.29.3" +version = "0.30.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 70bb8f585..3a67c7f8b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.29.3" +version = "0.30.0" authors = ["Kerollmops "] edition = "2018" From 
d2f84a9d9e60bb027c543718426f27fe288b6bbf Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 11:37:04 +0200 Subject: [PATCH 1441/1889] Improve the estimatedNbHits when distinct is enabled --- milli/src/search/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 1c363e142..447ba4984 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -223,7 +223,6 @@ impl<'a> Search<'a> { debug!("Number of candidates found {}", candidates.len()); let excluded = take(&mut excluded_candidates); - let mut candidates = distinct.distinct(candidates, excluded); initial_candidates |= bucket_candidates; @@ -236,10 +235,12 @@ impl<'a> Search<'a> { for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { documents_ids.push(candidate?); } + + excluded_candidates |= candidates.into_excluded(); + if documents_ids.len() == self.limit { break; } - excluded_candidates = candidates.into_excluded(); } Ok(SearchResult { From d7c248042bad28bf16e34adda506e37fe8d92797 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 12:00:45 +0200 Subject: [PATCH 1442/1889] Rename the limitedTo parameter into maxTotalHits --- milli/src/index.rs | 14 +++++++------- milli/src/update/settings.rs | 26 +++++++++++++------------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 2cb90284b..bb351d58f 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -57,7 +57,7 @@ pub mod main_key { pub const EXACT_WORDS: &str = "exact-words"; pub const EXACT_ATTRIBUTES: &str = "exact-attributes"; pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet"; - pub const PAGINATION_LIMITED_TO: &str = "pagination-limited-to"; + pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; } pub mod db_name { @@ -1102,20 +1102,20 @@ impl Index { self.main.delete::<_, Str>(txn, main_key::MAX_VALUES_PER_FACET) } - pub fn pagination_limited_to(&self, txn: &RoTxn) -> heed::Result> { - self.main.get::<_, Str, OwnedType>(txn, main_key::PAGINATION_LIMITED_TO) + pub fn pagination_max_total_hits(&self, txn: &RoTxn) -> heed::Result> { + self.main.get::<_, Str, OwnedType>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) } - pub(crate) fn put_pagination_limited_to( + pub(crate) fn put_pagination_max_total_hits( &self, txn: &mut RwTxn, val: usize, ) -> heed::Result<()> { - self.main.put::<_, Str, OwnedType>(txn, main_key::PAGINATION_LIMITED_TO, &val) + self.main.put::<_, Str, OwnedType>(txn, main_key::PAGINATION_MAX_TOTAL_HITS, &val) } - pub(crate) fn delete_pagination_limited_to(&self, txn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(txn, main_key::PAGINATION_LIMITED_TO) + pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 174d7073d..ccf29eb49 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -105,7 +105,7 @@ pub struct Settings<'a, 't, 'u, 'i> { /// Attributes on which typo tolerance is disabled. 
exact_attributes: Setting>, max_values_per_facet: Setting, - pagination_limited_to: Setting, + pagination_max_total_hits: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -132,7 +132,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { min_word_len_one_typo: Setting::NotSet, exact_attributes: Setting::NotSet, max_values_per_facet: Setting::NotSet, - pagination_limited_to: Setting::NotSet, + pagination_max_total_hits: Setting::NotSet, indexer_config, } } @@ -258,12 +258,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.max_values_per_facet = Setting::Reset; } - pub fn set_pagination_limited_to(&mut self, value: usize) { - self.pagination_limited_to = Setting::Set(value); + pub fn set_pagination_max_total_hits(&mut self, value: usize) { + self.pagination_max_total_hits = Setting::Set(value); } - pub fn reset_pagination_limited_to(&mut self) { - self.pagination_limited_to = Setting::Reset; + pub fn reset_pagination_max_total_hits(&mut self) { + self.pagination_max_total_hits = Setting::Reset; } fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> @@ -646,13 +646,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - fn update_pagination_limited_to(&mut self) -> Result<()> { - match self.pagination_limited_to { + fn update_pagination_max_total_hits(&mut self) -> Result<()> { + match self.pagination_max_total_hits { Setting::Set(max) => { - self.index.put_pagination_limited_to(&mut self.wtxn, max)?; + self.index.put_pagination_max_total_hits(&mut self.wtxn, max)?; } Setting::Reset => { - self.index.delete_pagination_limited_to(&mut self.wtxn)?; + self.index.delete_pagination_max_total_hits(&mut self.wtxn)?; } Setting::NotSet => (), } @@ -679,7 +679,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_min_typo_word_len()?; self.update_exact_words()?; self.update_max_values_per_facet()?; - self.update_pagination_limited_to()?; + self.update_pagination_max_total_hits()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. 
It means that the distinct attribute, @@ -1576,7 +1576,7 @@ mod tests { exact_words, exact_attributes, max_values_per_facet, - pagination_limited_to, + pagination_max_total_hits, } = builder; assert!(matches!(searchable_fields, Setting::NotSet)); @@ -1594,6 +1594,6 @@ mod tests { assert!(matches!(exact_words, Setting::NotSet)); assert!(matches!(exact_attributes, Setting::NotSet)); assert!(matches!(max_values_per_facet, Setting::NotSet)); - assert!(matches!(pagination_limited_to, Setting::NotSet)); + assert!(matches!(pagination_max_total_hits, Setting::NotSet)); } } From f5c3b951bc92d220b3ee4f82af97daad1918b1b3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 12:04:17 +0200 Subject: [PATCH 1443/1889] Bump the milli version to 0.31.0 --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index ab8b89d6a..71874b852 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.30.0" +version = "0.31.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 2f38f8add..9d037b7a9 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.30.0" +version = "0.31.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 77d0f58da..43a4dec4b 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.30.0" +version = "0.31.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index ac8b5e950..0fd035546 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.30.0" +version = "0.31.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 5dc7ec65a..7072ecb9d 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.30.0" +version = "0.31.0" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 2f95cd7e4..c87ff4d96 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.30.0" +version = "0.31.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index be96ab57f..0d0c50110 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.30.0" +version = "0.31.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 92199cd67..7785e8f09 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.30.0" 
+version = "0.31.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 3a67c7f8b..2b3300e55 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.30.0" +version = "0.31.0" authors = ["Kerollmops "] edition = "2018" From 238692a8e7299b787c0b80c250d0cf8e96bf0649 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 16:23:11 +0200 Subject: [PATCH 1444/1889] Introduce the copy_to_path method on the Index --- milli/src/index.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index bb351d58f..d89246dd8 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,11 +1,12 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; +use std::fs::File; use std::mem::size_of; use std::path::Path; use heed::flags::Flags; use heed::types::*; -use heed::{Database, PolyDatabase, RoTxn, RwTxn}; +use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn}; use roaring::RoaringBitmap; use rstar::RTree; use time::OffsetDateTime; @@ -214,6 +215,10 @@ impl Index { self.env.path() } + pub fn copy_to_path>(&self, path: P, option: CompactionOption) -> Result { + self.env.copy_to_path(path, option).map_err(Into::into) + } + /// Returns an `EnvClosingEvent` that can be used to wait for the closing event, /// multiple threads can wait on this event. /// From cc48992e79e1958fe810756e4b178fc137dbc0d5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 17:05:51 +0200 Subject: [PATCH 1445/1889] Bump the milli version to 0.31.1 --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 71874b852..b40519d99 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.31.0" +version = "0.31.1" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 9d037b7a9..e59710b72 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.31.0" +version = "0.31.1" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 43a4dec4b..1a2e46929 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.31.0" +version = "0.31.1" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 0fd035546..d22f7d86d 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.31.0" +version = "0.31.1" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 7072ecb9d..7e9dd207a 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.31.0" +version = "0.31.1" authors = ["Clément Renault "] edition = "2018" description = "A 
small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index c87ff4d96..9e8781e55 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.31.0" +version = "0.31.1" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 0d0c50110..49be47c1e 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.31.0" +version = "0.31.1" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 7785e8f09..d4ea547fb 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.31.0" +version = "0.31.1" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2b3300e55..06d05598f 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.31.0" +version = "0.31.1" authors = ["Kerollmops "] edition = "2018" From 3ff03a3f5f1edc24af451be2302dc922e92f636d Mon Sep 17 00:00:00 2001 From: Dmytro Gordon Date: Mon, 27 Jun 2022 15:55:17 +0300 Subject: [PATCH 1446/1889] Fix not equal filter when field contains both number and strings --- milli/src/search/facet/filter.rs | 7 +------ milli/tests/search/filters.rs | 3 +++ milli/tests/search/mod.rs | 10 +++++++++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index a809aa5fb..91caa0171 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -306,12 +306,7 @@ impl<'a> Filter<'a> { return Ok(string_docids | number_docids); } Condition::NotEqual(val) => { - let number = val.parse::().ok(); - let all_numbers_ids = if number.is_some() { - index.number_faceted_documents_ids(rtxn, field_id)? 
- } else { - RoaringBitmap::new() - }; + let all_numbers_ids = index.number_faceted_documents_ids(rtxn, field_id)?; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; let operator = Condition::Equal(val.clone()); let docids = Self::evaluate_operator( diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 99063f9f6..fe926d17a 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -43,6 +43,9 @@ test_filter!(eq_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank=1")] test_filter!(eq_string_or_filter, vec![Left(vec!["tag=red", "tag=green"])]); test_filter!(eq_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank=1"])]); test_filter!(eq_number_or_filter, vec![Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); +test_filter!(neq_simple_string_filter, vec![Right("tag!=red")]); +test_filter!(neq_simple_number_filter, vec![Right("asc_desc_rank!=1")]); +test_filter!(neq_simple_string_in_number_column_filter, vec![Right("asc_desc_rank!=red")]); test_filter!(geo_radius, vec![Right("_geoRadius(50.630010347667806, 3.086251829166809, 100000)")]); test_filter!( not_geo_radius, diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 12e9861fa..472fbafe0 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -168,7 +168,15 @@ pub fn expected_order( fn execute_filter(filter: &str, document: &TestDocument) -> Option { let mut id = None; - if let Some((field, filter)) = filter.split_once("=") { + if let Some((field, filter)) = filter.split_once("!=") { + if field == "tag" && document.tag != filter { + id = Some(document.id.clone()) + } else if field == "asc_desc_rank" + && Ok(&document.asc_desc_rank) != filter.parse::().as_ref() + { + id = Some(document.id.clone()) + } + } else if let Some((field, filter)) = filter.split_once("=") { if field == "tag" && document.tag == filter { id = Some(document.id.clone()) } else if field == "asc_desc_rank" From 446439e8bed575eb99b008934028b797b27d1879 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Jul 2022 12:19:30 +0200 Subject: [PATCH 1447/1889] bump charabia --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 06d05598f..ed779527c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "0.2.17" byteorder = "1.4.3" -charabia = "0.5.0" +charabia = "0.5.1" concat-arrays = "0.1.2" crossbeam-channel = "0.5.2" either = "1.6.1" From 2700d8dc67961c94d8887c997c453a9891dac342 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Jul 2022 15:10:12 +0200 Subject: [PATCH 1448/1889] Add reindexing benchmarks --- benchmarks/benches/indexing.rs | 213 +++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 9af7f6429..3ae0a1a84 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -147,6 +147,58 @@ fn indexing_songs_default(c: &mut Criterion) { }); } +fn reindexing_songs_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Reindexing songs with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"]; + let 
sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn deleting_songs_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); @@ -378,6 +430,59 @@ fn indexing_wiki(c: &mut Criterion) { }); } +fn reindexing_wiki(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Reindexing wiki", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "body"]; + let filterable_fields = []; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn deleting_wiki_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); @@ -541,6 +646,57 @@ fn indexing_movies_default(c: &mut Criterion) { }); } +fn reindexing_movies_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Reindexing movies with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "overview"]; + let filterable_fields = ["released_date", "genres"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + 
&filterable_fields, + &sortable_fields, + ); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn deleting_movies_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); @@ -881,6 +1037,59 @@ fn indexing_geo(c: &mut Criterion) { }); } +fn reindexing_geo(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Reindexing geo_point", |b| { + b.iter_with_setup( + move || { + let primary_key = "geonameid"; + let searchable_fields = ["name", "alternatenames", "elevation"]; + let filterable_fields = ["_geo", "population", "elevation"]; + let sortable_fields = ["_geo", "population", "elevation"]; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); + builder.add_documents(documents).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + fn deleting_geo_in_batches_default(c: &mut Criterion) { let mut group = c.benchmark_group("indexing"); group.sample_size(BENCHMARK_ITERATION); @@ -939,20 +1148,24 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { criterion_group!( benches, indexing_songs_default, + reindexing_songs_default, deleting_songs_in_batches_default, indexing_songs_without_faceted_numbers, indexing_songs_without_faceted_fields, indexing_songs_in_three_batches_default, indexing_wiki, + reindexing_wiki, deleting_wiki_in_batches_default, indexing_wiki_in_three_batches, indexing_movies_default, + reindexing_movies_default, deleting_movies_in_batches_default, indexing_movies_in_three_batches, indexing_nested_movies_default, deleting_nested_movies_in_batches_default, indexing_nested_movies_without_faceted_fields, 
indexing_geo, + reindexing_geo, deleting_geo_in_batches_default ); criterion_main!(benches); From 3b309f654ad1d60a7ffd6faafa846640a84b31d7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 13 Jun 2022 17:59:34 +0200 Subject: [PATCH 1449/1889] Speed up the document deletion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a document deletion occurs, instead of deleting the document we mark it as deleted in the new “soft deleted” bitmap. It is then excluded from search results and from all the other endpoints. --- milli/src/error.rs | 2 + milli/src/external_documents_ids.rs | 11 +- milli/src/index.rs | 40 +- milli/src/search/facet/filter.rs | 74 +++- milli/src/search/mod.rs | 2 +- milli/src/update/available_documents_ids.rs | 35 +- milli/src/update/clear_documents.rs | 10 +- milli/src/update/delete_documents.rs | 376 +++++++++--------- milli/src/update/index_documents/mod.rs | 16 +- milli/src/update/index_documents/transform.rs | 131 +++--- 10 files changed, 413 insertions(+), 284 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index caabb96fc..b151e5545 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -27,6 +27,8 @@ pub enum Error { #[derive(Error, Debug)] pub enum InternalError { + #[error("Tried to access a soft deleted document.")] + AccessingSoftDeletedDocument { document_id: DocumentId }, #[error("{}", HeedError::DatabaseClosing)] DatabaseClosing, #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 3dce18b00..6029722af 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -5,26 +5,30 @@ use std::{fmt, str}; use fst::map::IndexedValue; use fst::{IntoStreamer, Streamer}; +use roaring::RoaringBitmap; const DELETED_ID: u64 = u64::MAX; pub struct ExternalDocumentsIds<'a> { pub(crate) hard: fst::Map>, pub(crate) soft: fst::Map>, + soft_deleted_docids: RoaringBitmap, } impl<'a> ExternalDocumentsIds<'a> { pub fn new( hard: fst::Map>, soft: fst::Map>, + soft_deleted_docids: RoaringBitmap, ) -> ExternalDocumentsIds<'a> { - ExternalDocumentsIds { hard, soft } + ExternalDocumentsIds { hard, soft, soft_deleted_docids } } pub fn into_static(self) -> ExternalDocumentsIds<'static> { ExternalDocumentsIds { hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), + soft_deleted_docids: self.soft_deleted_docids, } } @@ -36,7 +40,9 @@ impl<'a> ExternalDocumentsIds<'a> { pub fn get>(&self, external_id: A) -> Option { let external_id = external_id.as_ref(); match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { - Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()), + Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => { + Some(id.try_into().unwrap()) + } _otherwise => None, } } @@ -134,6 +140,7 @@ impl Default for ExternalDocumentsIds<'static> { ExternalDocumentsIds { hard: fst::Map::default().map_data(Cow::Owned).unwrap(), soft: fst::Map::default().map_data(Cow::Owned).unwrap(), + soft_deleted_docids: RoaringBitmap::new(), } } } diff --git a/milli/src/index.rs b/milli/src/index.rs index d89246dd8..9ada51170 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -32,6 +32,7 @@ pub mod main_key { pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str =
"documents-ids"; + pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids"; pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; @@ -254,6 +255,29 @@ impl Index { Ok(count.unwrap_or_default()) } + /* deleted documents ids */ + + /// Writes the soft deleted documents ids. + pub(crate) fn put_soft_deleted_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY, + docids, + ) + } + + /// Returns the soft deleted documents ids. + pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)? + .unwrap_or_default()) + } + /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. @@ -280,7 +304,7 @@ impl Index { wtxn: &mut RwTxn, external_documents_ids: &ExternalDocumentsIds<'a>, ) -> heed::Result<()> { - let ExternalDocumentsIds { hard, soft } = external_documents_ids; + let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids; let hard = hard.as_fst().as_bytes(); let soft = soft.as_fst().as_bytes(); self.main.put::<_, Str, ByteSlice>( @@ -311,7 +335,8 @@ impl Index { Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, None => fst::Map::default().map_data(Cow::Owned)?, }; - Ok(ExternalDocumentsIds::new(hard, soft)) + let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; + Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids)) } /* fields ids map */ @@ -929,9 +954,13 @@ impl Index { rtxn: &'t RoTxn, ids: impl IntoIterator, ) -> Result)>> { + let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; let mut documents = Vec::new(); for id in ids { + if soft_deleted_documents.contains(id) { + return Err(InternalError::AccessingSoftDeletedDocument { document_id: id })?; + } let kv = self .documents .get(rtxn, &BEU32::new(id))? @@ -947,11 +976,16 @@ impl Index { &self, rtxn: &'t RoTxn, ) -> Result)>>> { + let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; + Ok(self .documents .iter(rtxn)? 
// we cast the BEU32 to a DocumentId - .map(|document| document.map(|(id, obkv)| (id.get(), obkv)))) + .map(|document| document.map(|(id, obkv)| (id.get(), obkv))) + .filter(move |document| { + document.as_ref().map_or(true, |(id, _)| !soft_deleted_docids.contains(*id)) + })) } pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 91caa0171..d89413f62 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -10,9 +10,7 @@ use roaring::RoaringBitmap; use super::FacetNumberRange; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, -}; +use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{ distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, }; @@ -266,11 +264,12 @@ impl<'a> Filter<'a> { fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, - numbers_db: heed::Database, - strings_db: heed::Database, field_id: FieldId, operator: &Condition<'a>, ) -> Result { + let numbers_db = index.facet_id_f64_docids; + let strings_db = index.facet_id_string_docids; + // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. @@ -309,9 +308,7 @@ impl<'a> Filter<'a> { let all_numbers_ids = index.number_faceted_documents_ids(rtxn, field_id)?; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator( - rtxn, index, numbers_db, strings_db, field_id, &operator, - )?; + let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; return Ok((all_numbers_ids | all_strings_ids) - docids); } }; @@ -342,17 +339,27 @@ impl<'a> Filter<'a> { } pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { - let numbers_db = index.facet_id_f64_docids; - let strings_db = index.facet_id_string_docids; + // to avoid doing this for each recursive call we're going to do it ONCE ahead of time + let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; + // and finally we delete all the soft_deleted_documents, again, only once at the very end + self.inner_evaluate(rtxn, index, &filterable_fields) + .map(|result| result - soft_deleted_documents) + } + + fn inner_evaluate( + &self, + rtxn: &heed::RoTxn, + index: &Index, + filterable_fields: &HashSet, + ) -> Result { match &self.condition { FilterCondition::Condition { fid, op } => { - let filterable_fields = index.filterable_fields(rtxn)?; - - if crate::is_faceted(fid.value(), &filterable_fields) { + if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) + Self::evaluate_operator(rtxn, index, fid, &op) } else { return Ok(RoaringBitmap::new()); } @@ -371,7 +378,7 @@ impl<'a> Filter<'a> { return Err(fid.as_external_error( FilterError::AttributeNotFilterable { attribute, - filterable_fields, + filterable_fields: filterable_fields.clone(), }, ))?; } @@ -379,17 +386,39 @@ impl<'a> Filter<'a> { } } FilterCondition::Or(lhs, rhs) => { - let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?; - let 
rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; + let lhs = Self::inner_evaluate( + &(lhs.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; + let rhs = Self::inner_evaluate( + &(rhs.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; Ok(lhs | rhs) } FilterCondition::And(lhs, rhs) => { - let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?; - let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; + let lhs = Self::inner_evaluate( + &(lhs.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; + if lhs.is_empty() { + return Ok(lhs); + } + let rhs = Self::inner_evaluate( + &(rhs.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; Ok(lhs & rhs) } FilterCondition::GeoLowerThan { point, radius } => { - let filterable_fields = index.filterable_fields(rtxn)?; if filterable_fields.contains("_geo") { let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?]; if !(-90.0..=90.0).contains(&base_point[0]) { @@ -422,16 +451,17 @@ impl<'a> Filter<'a> { } else { return Err(point[0].as_external_error(FilterError::AttributeNotFilterable { attribute: "_geo", - filterable_fields, + filterable_fields: filterable_fields.clone(), }))?; } } FilterCondition::GeoGreaterThan { point, radius } => { - let result = Self::evaluate( + let result = Self::inner_evaluate( &FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() } .into(), rtxn, index, + filterable_fields, )?; let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; Ok(geo_faceted_doc_ids - result) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 447ba4984..1930091ef 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -214,7 +214,7 @@ impl<'a> Search<'a> { ) -> Result { let mut offset = self.offset; let mut initial_candidates = RoaringBitmap::new(); - let mut excluded_candidates = RoaringBitmap::new(); + let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?; let mut documents_ids = Vec::new(); while let Some(FinalResult { candidates, bucket_candidates, .. 
}) = diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 653bc7dd2..3e4ec5600 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -8,11 +8,16 @@ pub struct AvailableDocumentsIds { } impl AvailableDocumentsIds { - pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { - match docids.max() { + pub fn from_documents_ids( + docids: &RoaringBitmap, + soft_deleted_docids: &RoaringBitmap, + ) -> AvailableDocumentsIds { + let used_docids = docids | soft_deleted_docids; + + match used_docids.max() { Some(last_id) => { let mut available = RoaringBitmap::from_iter(0..last_id); - available -= docids; + available -= used_docids; let iter = match last_id.checked_add(1) { Some(id) => id..=u32::max_value(), @@ -44,7 +49,7 @@ mod tests { #[test] fn empty() { let base = RoaringBitmap::new(); - let left = AvailableDocumentsIds::from_documents_ids(&base); + let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); let right = 0..=u32::max_value(); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } @@ -57,8 +62,28 @@ mod tests { base.insert(100); base.insert(405); - let left = AvailableDocumentsIds::from_documents_ids(&base); + let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); } + + #[test] + fn soft_deleted() { + let mut base = RoaringBitmap::new(); + base.insert(0); + base.insert(10); + base.insert(100); + base.insert(405); + + let mut soft_deleted = RoaringBitmap::new(); + soft_deleted.insert(1); + soft_deleted.insert(11); + soft_deleted.insert(101); + soft_deleted.insert(406); + + let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted); + let right = + (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n)); + left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); + } } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index f93ba60fa..d1939df7b 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -35,6 +35,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { documents, } = self.index; + let empty_roaring = RoaringBitmap::default(); + // We retrieve the number of documents ids that we are deleting. let number_of_documents = self.index.number_of_documents(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; @@ -43,16 +45,16 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; - self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; + self.index.put_documents_ids(self.wtxn, &empty_roaring)?; + self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; // We clean all the faceted documents ids. 
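The new `soft_deleted` test above pins down the invariant behind `AvailableDocumentsIds::from_documents_ids`: an id is unavailable if it belongs to a live document or sits in the soft-deleted bitmap, which is why the two sets are unioned before computing the free ranges. The same check as a tiny standalone program (only the roaring crate is assumed):

```rust
use roaring::RoaringBitmap;

fn main() {
    // ids held by live documents and by soft-deleted documents
    let live: RoaringBitmap = [0u32, 10, 100, 405].into_iter().collect();
    let soft_deleted: RoaringBitmap = [1u32, 11, 101, 406].into_iter().collect();

    // an id counts as used if it appears in either set
    let used = &live | &soft_deleted;

    // the first available ids must skip every used one
    let available: Vec<u32> = (0u32..).filter(|n| !used.contains(*n)).take(4).collect();
    assert_eq!(available, vec![2, 3, 4, 5]);
}
```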
- let empty = RoaringBitmap::default(); for field_id in faceted_fields { - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?; + self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; + self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; } // Clear the other databases. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index ef4f849cc..564b729ea 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,5 +1,4 @@ use std::collections::btree_map::Entry; -use std::collections::HashMap; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; @@ -17,15 +16,17 @@ use crate::heed_codec::facet::{ use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32, - BEU32, + DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, + RoaringBitmapCodec, SmallString32, BEU32, }; +const DELETE_DOCUMENTS_THRESHOLD: u64 = 100_000; + pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, - documents_ids: RoaringBitmap, + to_delete_docids: RoaringBitmap, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -45,16 +46,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { wtxn, index, external_documents_ids, - documents_ids: RoaringBitmap::new(), + to_delete_docids: RoaringBitmap::new(), }) } pub fn delete_document(&mut self, docid: u32) { - self.documents_ids.insert(docid); + self.to_delete_docids.insert(docid); } pub fn delete_documents(&mut self, docids: &RoaringBitmap) { - self.documents_ids |= docids; + self.to_delete_docids |= docids; } pub fn delete_external_id(&mut self, external_id: &str) -> Option { @@ -63,28 +64,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { Some(docid) } - pub fn execute(self) -> Result { + pub fn execute(mut self) -> Result { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; + let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; let current_documents_ids_len = documents_ids.len(); // We can and must stop removing documents in a database that is empty. if documents_ids.is_empty() { - return Ok(DocumentDeletionResult { - deleted_documents: 0, - remaining_documents: current_documents_ids_len, - }); + // but if there were still documents to delete we clear the database entirely + if !soft_deleted_docids.is_empty() { + ClearDocuments::new(self.wtxn, self.index).execute()?; + } + return Ok(DocumentDeletionResult { deleted_documents: 0, remaining_documents: 0 }); } // We remove the documents ids that we want to delete // from the documents in the database and write them back. documents_ids -= &self.to_delete_docids; self.index.put_documents_ids(self.wtxn, &documents_ids)?; // We can execute a ClearDocuments operation when the number of documents // to delete is exactly the number of documents in the database.
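The hunk below contains the core trade-off of this patch: as long as the accumulated soft-deleted set stays under `DELETE_DOCUMENTS_THRESHOLD` (100_000 ids), a deletion only records the ids and returns; once the threshold is crossed, a single real purge processes everything that has piled up. The decision, isolated from the LMDB plumbing (a simplified sketch, not the patch's exact code):

```rust
use roaring::RoaringBitmap;

const DELETE_DOCUMENTS_THRESHOLD: u64 = 100_000;

/// Returns `None` when persisting the soft-deleted bitmap is enough,
/// or `Some(ids)` when the whole accumulated set must be purged now.
fn plan_deletion(
    mut soft_deleted: RoaringBitmap,
    to_delete: &RoaringBitmap,
) -> Option<RoaringBitmap> {
    soft_deleted |= to_delete;
    if soft_deleted.len() < DELETE_DOCUMENTS_THRESHOLD {
        None // cheap path: just save the bitmap and return early
    } else {
        Some(soft_deleted) // expensive path: purge all recorded ids
    }
}

fn main() {
    let to_delete: RoaringBitmap = (0..10u32).collect();
    // far below the threshold: the deletion is merely recorded
    assert!(plan_deletion(RoaringBitmap::new(), &to_delete).is_none());
}
```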
- if current_documents_ids_len == self.documents_ids.len() { + if current_documents_ids_len == self.to_delete_docids.len() { let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; return Ok(DocumentDeletionResult { deleted_documents: current_documents_ids_len, @@ -93,6 +96,50 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let mut field_distribution = self.index.field_distribution(self.wtxn)?; + + // we update the field distribution + for docid in self.to_delete_docids.iter() { + let key = BEU32::new(docid); + let document = + self.index.documents.get(self.wtxn, &key)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: "documents", key: None }, + )?; + for (fid, _value) in document.iter() { + let field_name = + fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "delete documents", + })?; + if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) + { + match entry.get().checked_sub(1) { + Some(0) | None => entry.remove(), + Some(count) => entry.insert(count), + }; + } + } + } + + self.index.put_field_distribution(self.wtxn, &field_distribution)?; + + soft_deleted_docids |= &self.to_delete_docids; + + // if we have fewer documents to delete than the threshold we simply save them in + // the `soft_deleted_documents_ids` bitmap and exit early. + if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD { + self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; + return Ok(DocumentDeletionResult { + deleted_documents: self.to_delete_docids.len(), + remaining_documents: documents_ids.len(), + }); + } + + // There are more documents to delete than the threshold, we need to delete them all + self.to_delete_docids = soft_deleted_docids; + // and we can reset the soft deleted bitmap + self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; + let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| { InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, @@ -127,23 +174,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { documents, } = self.index; - // Number of fields for each document that has been deleted. - let mut fields_ids_distribution_diff = HashMap::new(); - // Retrieve the words and the external documents ids contained in the documents. let mut words = Vec::new(); let mut external_ids = Vec::new(); - for docid in &self.documents_ids { + for docid in &self.to_delete_docids { // We create an iterator to be able to get the content and delete the document // content itself. It's faster to acquire a cursor to get and delete, // as we avoid traversing the LMDB B-Tree two times but only once. let key = BEU32::new(docid); let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; if let Some((_key, obkv)) = iter.next().transpose()? { - for (field_id, _) in obkv.iter() { - *fields_ids_distribution_diff.entry(field_id).or_default() += 1; - } - if let Some(content) = obkv.get(id_field) { let external_id = match serde_json::from_slice(content).unwrap() { Value::String(string) => SmallString32::from(string.as_str()), @@ -171,24 +211,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } - let mut field_distribution = self.index.field_distribution(self.wtxn)?; - - // We use pre-calculated number of fields occurrences that needs to be deleted - // to reflect deleted documents. - // If all field occurrences are removed, delete the entry from distribution.
- // Otherwise, insert new number of occurrences (current_count - count_diff). - for (field_id, count_diff) in fields_ids_distribution_diff { - let field_name = fields_ids_map.name(field_id).unwrap(); - if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) { - match entry.get().checked_sub(count_diff) { - Some(0) | None => entry.remove(), - Some(count) => entry.insert(count), - }; - } - } - - self.index.put_field_distribution(self.wtxn, &field_distribution)?; - // We create the FST map of the external ids that we must delete. external_ids.sort_unstable(); let external_ids_to_delete = fst::Set::from_iter(external_ids)?; @@ -214,7 +236,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_docids, word.as_str(), must_remove, - &self.documents_ids, + &self.to_delete_docids, )?; remove_from_word_docids( @@ -222,7 +244,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { exact_word_docids, word.as_str(), must_remove, - &self.documents_ids, + &self.to_delete_docids, )?; } @@ -256,12 +278,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_words_fst(self.wtxn, &new_words_fst)?; let prefixes_to_delete = - remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?; + remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?; let exact_prefix_to_delete = remove_from_word_prefix_docids( self.wtxn, exact_word_prefix_docids, - &self.documents_ids, + &self.to_delete_docids, )?; let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); @@ -293,7 +315,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (key, mut docids) = result?; let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -314,7 +336,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -332,7 +354,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -351,7 +373,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { while let Some(result) = iter.next() { let (bytes, mut docids) = result?; let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; @@ -368,7 +390,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; while let Some((key, mut docids)) = iter.next().transpose()? { let previous_len = docids.len(); - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; if docids.is_empty() { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? 
}; @@ -386,7 +408,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree .iter() - .filter(|&point| self.documents_ids.contains(point.data.0)) + .filter(|&point| self.to_delete_docids.contains(point.data.0)) .cloned() .map(|point| (point, point.data.0)) .unzip(); @@ -403,46 +425,46 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { remove_docids_from_facet_field_id_number_docids( self.wtxn, facet_id_f64_docids, - &self.documents_ids, + &self.to_delete_docids, )?; remove_docids_from_facet_field_id_string_docids( self.wtxn, facet_id_string_docids, - &self.documents_ids, + &self.to_delete_docids, )?; // Remove the documents ids from the faceted documents ids. for field_id in self.index.faceted_fields_ids(self.wtxn)? { // Remove docids from the number faceted documents ids let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; remove_docids_from_field_id_docid_facet_value( self.wtxn, field_id_docid_facet_f64s, field_id, - &self.documents_ids, + &self.to_delete_docids, |(_fid, docid, _value)| docid, )?; // Remove docids from the string faceted documents ids let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; - docids -= &self.documents_ids; + docids -= &self.to_delete_docids; self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; remove_docids_from_field_id_docid_facet_value( self.wtxn, field_id_docid_facet_strings, field_id, - &self.documents_ids, + &self.to_delete_docids, |(_fid, docid, _value)| docid, )?; } Ok(DocumentDeletionResult { - deleted_documents: self.documents_ids.len(), + deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), }) } @@ -741,26 +763,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern"}, - {"docid":"1_44","label":"drawing"}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + { "docid": "1_4", "label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { 
"docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } ]); insert_documents(&mut wtxn, &index, content); @@ -788,26 +810,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern"}, - {"docid":"1_44","label":"drawing"}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + { "docid": "1_4", "label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } ]); insert_documents(&mut wtxn, &index, content); @@ -841,26 +863,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern"}, - {"docid":"1_44","label":"drawing"}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + {"docid": "1_4", "label": "sign"}, + {"docid": "1_5", 
"label": "letter"}, + {"docid": "1_7", "label": "abstract,cartoon,design,pattern"}, + {"docid": "1_36","label": "drawing,painting,pattern"}, + {"docid": "1_37","label": "art,drawing,outdoor"}, + {"docid": "1_38","label": "aquarium,art,drawing"}, + {"docid": "1_39","label": "abstract"}, + {"docid": "1_40","label": "cartoon"}, + {"docid": "1_41","label": "art,drawing"}, + {"docid": "1_42","label": "art,pattern"}, + {"docid": "1_43","label": "abstract,art,drawing,pattern"}, + {"docid": "1_44","label": "drawing"}, + {"docid": "1_45","label": "art"}, + {"docid": "1_46","label": "abstract,colorfulness,pattern"}, + {"docid": "1_47","label": "abstract,pattern"}, + {"docid": "1_52","label": "abstract,cartoon"}, + {"docid": "1_57","label": "abstract,drawing,pattern"}, + {"docid": "1_58","label": "abstract,art,cartoon"}, + {"docid": "1_68","label": "design"}, + {"docid": "1_69","label": "geometry"} ]); insert_documents(&mut wtxn, &index, content); @@ -896,26 +918,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"id":"1","city":"Lille", "_geo": { "lat": 50.6299 as f32, "lng": 3.0569 as f32 } }, - {"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.6415 as f32, "lng": 3.1106 as f32 } }, - {"id":"3","city":"Hellemmes", "_geo": { "lat": 50.6312 as f32, "lng": 3.1106 as f32 } }, - {"id":"4","city":"Villeneuve-d'Ascq", "_geo": { "lat": 50.6224 as f32, "lng": 3.1476 as f32 } }, - {"id":"5","city":"Hem", "_geo": { "lat": 50.6552 as f32, "lng": 3.1897 as f32 } }, - {"id":"6","city":"Roubaix", "_geo": { "lat": 50.6924 as f32, "lng": 3.1763 as f32 } }, - {"id":"7","city":"Tourcoing", "_geo": { "lat": 50.7263 as f32, "lng": 3.1541 as f32 } }, - {"id":"8","city":"Mouscron", "_geo": { "lat": 50.7453 as f32, "lng": 3.2206 as f32 } }, - {"id":"9","city":"Tournai", "_geo": { "lat": 50.6053 as f32, "lng": 3.3758 as f32 } }, - {"id":"10","city":"Ghent", "_geo": { "lat": 51.0537 as f32, "lng": 3.6957 as f32 } }, - {"id":"11","city":"Brussels", "_geo": { "lat": 50.8466 as f32, "lng": 4.3370 as f32 } }, - {"id":"12","city":"Charleroi", "_geo": { "lat": 50.4095 as f32, "lng": 4.4347 as f32 } }, - {"id":"13","city":"Mons", "_geo": { "lat": 50.4502 as f32, "lng": 3.9623 as f32 } }, - {"id":"14","city":"Valenciennes", "_geo": { "lat": 50.3518 as f32, "lng": 3.5326 as f32 } }, - {"id":"15","city":"Arras", "_geo": { "lat": 50.2844 as f32, "lng": 2.7637 as f32 } }, - {"id":"16","city":"Cambrai", "_geo": { "lat": 50.1793 as f32, "lng": 3.2189 as f32 } }, - {"id":"17","city":"Bapaume", "_geo": { "lat": 50.1112 as f32, "lng": 2.8547 as f32 } }, - {"id":"18","city":"Amiens", "_geo": { "lat": 49.9314 as f32, "lng": 2.2710 as f32 } }, - {"id":"19","city":"Compiègne", "_geo": { "lat": 49.4449 as f32, "lng": 2.7913 as f32 } }, - {"id":"20","city":"Paris", "_geo": { "lat": 48.9021 as f32, "lng": 2.3708 as f32 } } + { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, + { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, + { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, + { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, + { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, + { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, + { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, + { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, + { "id": "9", "city": "Tournai", "_geo": { "lat": 
50.6053, "lng": 3.3758 } }, + { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, + { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, + { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, + { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, + { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, + { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, + { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, + { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } }, + { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, + { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, + { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } ]); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; @@ -951,26 +973,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - {"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern"}, - {"docid":"1_44","label":"drawing"}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + { "docid": "1_4", "label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } ]); insert_documents(&mut wtxn, &index, content); @@ -1021,26 +1043,26 @@ mod tests { builder.execute(|_| ()).unwrap(); let content = documents!([ - {"docid":"1_4","label":"sign"}, - {"docid":"1_5","label":"letter"}, - {"docid":"1_7","label":"abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, - {"docid":"1_36","label":"drawing,painting,pattern"}, - {"docid":"1_37","label":"art,drawing,outdoor"}, - {"docid":"1_38","label":"aquarium,art,drawing", "title": "Nemo"}, - {"docid":"1_39","label":"abstract"}, - {"docid":"1_40","label":"cartoon"}, - 
{"docid":"1_41","label":"art,drawing"}, - {"docid":"1_42","label":"art,pattern"}, - {"docid":"1_43","label":"abstract,art,drawing,pattern", "number": 32i32}, - {"docid":"1_44","label":"drawing", "number": 44i32}, - {"docid":"1_45","label":"art"}, - {"docid":"1_46","label":"abstract,colorfulness,pattern"}, - {"docid":"1_47","label":"abstract,pattern"}, - {"docid":"1_52","label":"abstract,cartoon"}, - {"docid":"1_57","label":"abstract,drawing,pattern"}, - {"docid":"1_58","label":"abstract,art,cartoon"}, - {"docid":"1_68","label":"design"}, - {"docid":"1_69","label":"geometry"} + { "docid": "1_4", "label": "sign"}, + { "docid": "1_5", "label": "letter"}, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, + { "docid": "1_36", "label": "drawing,painting,pattern"}, + { "docid": "1_37", "label": "art,drawing,outdoor"}, + { "docid": "1_38", "label": "aquarium,art,drawing", "title": "Nemo"}, + { "docid": "1_39", "label": "abstract"}, + { "docid": "1_40", "label": "cartoon"}, + { "docid": "1_41", "label": "art,drawing"}, + { "docid": "1_42", "label": "art,pattern"}, + { "docid": "1_43", "label": "abstract,art,drawing,pattern", "number": 32i32}, + { "docid": "1_44", "label": "drawing", "number": 44i32}, + { "docid": "1_45", "label": "art"}, + { "docid": "1_46", "label": "abstract,colorfulness,pattern"}, + { "docid": "1_47", "label": "abstract,pattern"}, + { "docid": "1_52", "label": "abstract,cartoon"}, + { "docid": "1_57", "label": "abstract,drawing,pattern"}, + { "docid": "1_58", "label": "abstract,art,cartoon"}, + { "docid": "1_68", "label": "design"}, + { "docid": "1_69", "label": "geometry"} ]); insert_documents(&mut wtxn, &index, content); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5b6af12ae..33d6396c5 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -719,10 +719,11 @@ mod tests { assert_eq!(count, 1); // Check that we get only one document from the database. - let docs = index.documents(&rtxn, Some(0)).unwrap(); + // Since the document has been deleted and re-inserted, its internal docid has has been incremented to 1 + let docs = index.documents(&rtxn, Some(1)).unwrap(); assert_eq!(docs.len(), 1); let (id, doc) = docs[0]; - assert_eq!(id, 0); + assert_eq!(id, 1); // Check that this document is equal to the last one sent. let mut doc_iter = doc.iter(); @@ -809,11 +810,12 @@ mod tests { let count = index.number_of_documents(&rtxn).unwrap(); assert_eq!(count, 3); - let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); - let (kevin_id, _) = - docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); - let (id, doc) = docs[*kevin_id as usize]; - assert_eq!(id, *kevin_id); + // the document 0 has been deleted and reinserted with the id 3 + let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap(); + let kevin_position = + docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); + assert_eq!(kevin_position, 2); + let (_, doc) = docs[kevin_position]; // Check that this document is equal to the last // one sent and that an UUID has been generated. 
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 08d450578..7ddf8765a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -4,7 +4,6 @@ use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; -use byteorder::ReadBytesExt; use fxhash::FxHashMap; use heed::RoTxn; use itertools::Itertools; @@ -57,7 +56,7 @@ pub struct Transform<'a, 'i> { flattened_sorter: grenad::Sorter, replaced_documents_ids: RoaringBitmap, new_documents_ids: RoaringBitmap, - // To increase the cache locality and the heap usage we use smartstring. + // To increase the cache locality and decrease the heap usage we use compact smartstring. new_external_documents_ids_builder: FxHashMap, u64>, documents_count: usize, } @@ -130,13 +129,17 @@ impl<'a, 'i> Transform<'a, 'i> { indexer_settings.max_memory.map(|mem| mem / 2), ); let documents_ids = index.documents_ids(wtxn)?; + let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?; Ok(Transform { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, autogenerate_docids, - available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), + available_documents_ids: AvailableDocumentsIds::from_documents_ids( + &documents_ids, + &soft_deleted_documents_ids, + ), original_sorter, flattened_sorter, index_documents_method, @@ -248,45 +251,39 @@ impl<'a, 'i> Transform<'a, 'i> { writer.insert(*k, v)?; } - let (docid, should_insert_original_document) = - match external_documents_ids.get(&*external_id) { - // if the document is in the db but has already been inserted - // (ie: already exists in the list of replaced documents ids), - // we should not add the original document a second time. - Some(docid) => (docid, !self.replaced_documents_ids.contains(docid)), - None => { - // if the document has already been inserted in this - // batch we need to get its docid - match self.new_external_documents_ids_builder.entry(external_id.into()) { - Entry::Occupied(entry) => (*entry.get() as u32, false), - // if the document has never been encountered we give it a new docid - // and push this new docid to the external documents ids builder - Entry::Vacant(entry) => { - let new_docid = self - .available_documents_ids - .next() - .ok_or(UserError::DocumentLimitReached)?; - entry.insert(new_docid as u64); - (new_docid, false) - } - } + let mut original_docid = None; + + let docid = match self.new_external_documents_ids_builder.entry(external_id.into()) { + Entry::Occupied(entry) => *entry.get() as u32, + Entry::Vacant(entry) => { + // If the document was already in the db we mark it as a replaced document. + // It'll be deleted later. We keep its original docid to insert it in the grenad. + if let Some(docid) = external_documents_ids.get(entry.key()) { + self.replaced_documents_ids.insert(docid); + original_docid = Some(docid); } - }; + let docid = self + .available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?; + entry.insert(docid as u64); + docid + } + }; - if should_insert_original_document { - self.replaced_documents_ids.insert(docid); - - let key = BEU32::new(docid); + if let Some(original_docid) = original_docid { + let original_key = BEU32::new(original_docid); let base_obkv = self .index .documents .remap_data_type::() - .get(wtxn, &key)? + .get(wtxn, &original_key)? 
.ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None, })?; + // We associate the base document with the new key; everything will get merged later. self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, @@ -506,6 +503,39 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(()) } + fn remove_deleted_documents_from_field_distribution( + &self, + rtxn: &RoTxn, + field_distribution: &mut FieldDistribution, + ) -> Result<()> { + for deleted_docid in self.replaced_documents_ids.iter() { + let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; + + for (key, _) in obkv.iter() { + let name = + self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + })?; + // We checked that the document was in the db earlier. If we can't find it, it means + // there is an inconsistency between the field distribution and the field id map. + let field = + field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Accessing field distribution in transform.", + })?; + *field -= 1; + if *field == 0 { + // since we were able to get the field right before, it's safe to unwrap here + field_distribution.remove(name).unwrap(); + } + } + } + Ok(()) + } + /// Generate the `TransformOutput` based on the given sorter that can be generated from any /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// id for the user side and the value must be an obkv where keys are valid fields ids. @@ -532,9 +562,14 @@ impl<'a, 'i> Transform<'a, 'i> { tempfile::tempfile()?, ); - // Once we have all the documents in the sorter, we write the documents - // in the writer. We also generate the field distribution. + // To compute the field distribution we need to: + // 1. Remove all the deleted documents from the field distribution + // 2. Add all the new documents to the field distribution let mut field_distribution = self.index.field_distribution(wtxn)?; + + self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?; + + // Here we are going to do the document count + field distribution + `write_into_stream_writer` let mut iter = self.original_sorter.into_stream_merger_iter()?; // used only for the callback let mut documents_count = 0; @@ -547,36 +582,6 @@ impl<'a, 'i> Transform<'a, 'i> { total_documents: self.documents_count, }); - let u32_key = key.clone().read_u32::<BigEndian>()?; - // if the document was already in the db we remove all of its field - // from the field distribution. - if self.replaced_documents_ids.contains(u32_key) { - let obkv = self.index.documents.get(wtxn, &BEU32::new(u32_key))?.ok_or( - InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, - )?; - - for (key, _) in obkv.iter() { - let name = - self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Computing field distribution in transform.", - })?; - // We checked that the document was in the db earlier. If we can't find it it means - // there is an inconsistency between the field distribution and the field id map.
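Put together, the two numbered steps above mean: first decrement the fields of every replaced document, then count the fields of the documents coming out of the sorter. A tiny worked example with invented values:

```rust
use std::collections::BTreeMap;

// Start: 10 documents have `id`, 4 have `title`.
let mut distribution = BTreeMap::from([("id".to_string(), 10_u64), ("title".to_string(), 4)]);

// 1. One replaced document had `id` and `title`: decrement both.
*distribution.get_mut("id").unwrap() -= 1;
*distribution.get_mut("title").unwrap() -= 1;

// 2. Its new version has `id` and `label`: count them like any new document.
*distribution.entry("id".to_string()).or_insert(0) += 1;
*distribution.entry("label".to_string()).or_insert(0) += 1;

assert_eq!(distribution.get("title"), Some(&3));
assert_eq!(distribution.get("label"), Some(&1));
```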
- let field = field_distribution.get_mut(name).ok_or( - FieldIdMapMissingEntry::FieldId { - field_id: key, - process: "Accessing field distribution in transform.", - }, - )?; - *field -= 1; - if *field == 0 { - // since we were able to get the field right before it's safe to unwrap here - field_distribution.remove(name).unwrap(); - } - } - } - - // We increment all the field of the current document in the field distribution. let obkv = KvReader::new(val); From eaf28b0628b202d02eb0334781dcb04dbd2b2937 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 Jun 2022 06:44:16 +0200 Subject: [PATCH 1450/1889] Apply review suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/update/delete_documents.rs | 2 ++ milli/src/update/index_documents/mod.rs | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 564b729ea..0221ebc6f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -20,6 +20,8 @@ use crate::{ RoaringBitmapCodec, SmallString32, BEU32, }; +/// The threshold we use to determine after which number of documents we want to clear the +/// soft-deleted database and delete documents for real. const DELETE_DOCUMENTS_THRESHOLD: u64 = 100_000; pub struct DeleteDocuments<'t, 'u, 'i> { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 33d6396c5..ba428f078 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -719,7 +719,7 @@ mod tests { assert_eq!(count, 1); // Check that we get only one document from the database. - // Since the document has been deleted and re-inserted, its internal docid has has been incremented to 1 + // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1 let docs = index.documents(&rtxn, Some(1)).unwrap(); assert_eq!(docs.len(), 1); let (id, doc) = docs[0]; From b61efd09fc8bc62de1ee7facb44e6603ad34c747 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Jul 2022 14:59:11 +0200 Subject: [PATCH 1451/1889] Makes the internal soft deleted error a UserError --- milli/src/error.rs | 4 ++-- milli/src/index.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index b151e5545..57ae1c85a 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -27,8 +27,6 @@ pub enum Error { #[derive(Error, Debug)] pub enum InternalError { - #[error("Tried to access a soft deleted documents.")] - AccessingSoftDeletedDocument { document_id: DocumentId }, #[error("{}", HeedError::DatabaseClosing)] DatabaseClosing, #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] @@ -85,6 +83,8 @@ pub enum FieldIdMapMissingEntry { #[derive(Error, Debug)] pub enum UserError { + #[error("A soft deleted internal document id has been used: `{document_id}`.")] + AccessingSoftDeletedDocument { document_id: DocumentId }, #[error("A document cannot contain more than 65,535 fields.")] AttributeLimitReached, #[error(transparent)] diff --git a/milli/src/index.rs b/milli/src/index.rs index 9ada51170..9637b4103 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -959,7 +959,7 @@ impl Index { for id in ids { if soft_deleted_documents.contains(id) { - return Err(InternalError::AccessingSoftDeletedDocument { document_id: id })?; + return Err(UserError::AccessingSoftDeletedDocument {
document_id: id })?; } let kv = self .documents From 9bc7627e274202cbce55ad2e8fac569be9c99260 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 5 Jul 2022 15:51:06 +0200 Subject: [PATCH 1452/1889] Fix deserialisation of NDJson documents in benchmarks --- benchmarks/benches/utils.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index b769bf2c7..f79d925e7 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -161,6 +161,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { while reader.read_line(&mut buf)? > 0 { documents.extend_from_json(&mut buf.as_bytes())?; + buf.clear(); } documents.finish()?; From 250be9fe6cb7f02207324b66b8beff98a0af9d41 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Jul 2022 15:57:44 +0200 Subject: [PATCH 1453/1889] put the threshold back to 10k --- milli/src/update/delete_documents.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 0221ebc6f..3b519c101 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -22,7 +22,7 @@ use crate::{ /// The threshold we use to determine after which number of documents we want to clear the /// soft-deleted database and delete documents for real. -const DELETE_DOCUMENTS_THRESHOLD: u64 = 100_000; +const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, From 1bfdcfc84f7901972355375ffd82ca66d4f6e225 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 5 Jul 2022 16:23:36 +0200 Subject: [PATCH 1454/1889] Bump uuid to 1.1.2 --- milli/Cargo.toml | 2 +- milli/src/update/index_documents/transform.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index ed779527c..feddf91c5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -37,7 +37,7 @@ smartstring = "1.0.1" tempfile = "3.3.0" thiserror = "1.0.31" time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } -uuid = { version = "0.8.2", features = ["v4"] } +uuid = { version = "1.1.2", features = ["v4"] } filter-parser = { path = "../filter-parser" } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 08d450578..99dcc5062 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -202,7 +202,7 @@ impl<'a, 'i> Transform<'a, 'i> { // it, transform it into a string and validate it, and then update it in the // document. If none is found, and we were told to generate missing document ids, then // we create the missing field, and update the new document. 
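The single `buf.clear()` added in the NDJSON benchmark fix above matters because `BufRead::read_line` appends to its `String` argument instead of overwriting it; without the clear, every line is re-parsed together with all the previous ones. A minimal reproduction:

```rust
use std::io::{BufRead, Cursor};

let mut reader = Cursor::new("first\nsecond\n");
let mut buf = String::new();
while reader.read_line(&mut buf).unwrap() > 0 {
    // With the clear: "first\n", then "second\n".
    // Without it: "first\n", then "first\nsecond\n".
    print!("{:?} ", buf);
    buf.clear();
}
```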
- let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; + let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; let external_id = if primary_key_id_nested { let mut field_buffer_cache = field_buffer_cache.clone(); self.flatten_from_field_mapping( @@ -799,7 +799,7 @@ fn update_primary_key<'a>( addition_index: &DocumentsBatchIndex, primary_key_id: FieldId, primary_key_name: &str, - uuid_buffer: &'a mut [u8; uuid::adapter::Hyphenated::LENGTH], + uuid_buffer: &'a mut [u8; uuid::fmt::Hyphenated::LENGTH], field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, mut external_id_buffer: &'a mut Vec, autogenerate_docids: bool, @@ -826,7 +826,7 @@ fn update_primary_key<'a>( Ok(Cow::Owned(value)) } None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); serde_json::to_writer(&mut external_id_buffer, &uuid) .map_err(InternalError::SerdeJson)?; field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); From aae03356cb3985fd10e440c0737172232e05829d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 6 Jul 2022 18:20:15 +0200 Subject: [PATCH 1455/1889] Use BufReader to read datasets in benchmarks --- benchmarks/benches/utils.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index f79d925e7..b2a9966a2 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -140,9 +140,10 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { } } -pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader { +pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader { let reader = File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); + let reader = BufReader::new(reader); let documents = match filetype { "csv" => documents_from_csv(reader).unwrap(), "json" => documents_from_json(reader).unwrap(), @@ -152,12 +153,11 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader anyhow::Result> { +fn documents_from_jsonl(mut reader: impl BufRead) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; let mut buf = String::new(); - let mut reader = BufReader::new(reader); while reader.read_line(&mut buf)? 
> 0 { documents.extend_from_json(&mut buf.as_bytes())?; @@ -168,7 +168,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> { Ok(writer.into_inner()) } -fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> { +fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; @@ -178,7 +178,7 @@ fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> { Ok(writer.into_inner()) } -fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> { +fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { let mut writer = Cursor::new(Vec::new()); milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; From 5d79617a56e8d00d295628dd20227b43c9eb31f2 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 7 Jul 2022 16:28:09 +0200 Subject: [PATCH 1456/1889] Chores: Enhance smart-crop code comments --- milli/src/search/matches/mod.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index e2bde3daf..46f87654f 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -11,6 +11,7 @@ const DEFAULT_CROP_MARKER: &'static str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>"; const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>"; +/// Structure used to build a Matcher allowing to customize formatting tags. pub struct MatcherBuilder<'a, A> { matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>, @@ -100,6 +101,8 @@ pub struct MatchBounds { pub length: usize, } +/// Structure used to analyze a string, compute words that match, +/// and format the source string returning a highlighted and cropped sub-string. pub struct Matcher<'t, 'm, A> { text: &'t str, matching_words: &'m MatchingWords, @@ -113,6 +116,8 @@ pub struct Matcher<'t, 'm, A> { impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { /// Iterates over tokens and save any of them that matches the query. fn compute_matches(&mut self) -> &mut Self { + /// Some words are counted as matches only if they are close together and in the right order; + /// compute_partial_match peeks into the next words to validate that the match is complete. fn compute_partial_match<'a>( mut partial: PartialMatch, token_position: usize, @@ -246,9 +251,14 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { // matches needs to be counted in the crop len. let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; + // create the initial state of the crop window: 2 iterators starting from the match positions, + // a reverse iterator starting from the first match token position and going toward the beginning of the text, let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); + // an iterator starting from the last match token position and going toward the end of the text. let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); + // grow the crop window, peeking in both directions + // until the window contains the right number of words: while remaining_words > 0 { let before_token = before_tokens.peek().map(|t| t.separator_kind()); let after_token = after_tokens.peek().map(|t| t.separator_kind()); @@ -315,6 +325,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { } } + // finally, keep the byte index of each bound of the crop window.
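The window-growing loop these comments describe can be sketched independently of tokens and separators. A simplified model (the real code also weighs the separator kinds on each side before choosing a direction):

```rust
/// Grow a [start, end) token window around the matches until it spans
/// `crop_size` tokens, preferring whichever side keeps the matches centered.
fn crop_window(
    token_count: usize,
    first_match: usize,
    last_match: usize,
    crop_size: usize,
) -> (usize, usize) {
    let (mut start, mut end) = (first_match, last_match);
    let mut remaining = crop_size.saturating_sub(last_match - first_match);
    while remaining > 0 {
        match (start > 0, end < token_count) {
            (true, true) if first_match - start <= end - last_match => start -= 1,
            (true, false) => start -= 1,
            (false, true) | (true, true) => end += 1,
            (false, false) => break,
        }
        remaining -= 1;
    }
    (start, end)
}
```

For instance, `crop_window(10, 4, 6, 6)` yields `(2, 8)`: the two-token match ends up centered in a six-token window.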
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); @@ -353,7 +364,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { (uniq_score, distance_score, order_score) } - /// Returns the matches interval where the score computed by match_interval_score is maximal. + /// Returns the matches interval where the score computed by match_interval_score is the best. fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { // we compute the matches interval if we have at least 2 matches. if matches.len() > 1 { @@ -408,6 +419,8 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { } else { match &self.matches { Some((tokens, matches)) => { + // If the text has to be cropped, + // compute the best interval to crop around. let matches = match format_options.crop { Some(crop_size) if crop_size > 0 => { self.find_best_match_interval(matches, crop_size) @@ -415,6 +428,8 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { _ => matches, }; + // If the text has to be cropped, + // crop around the best interval. let (byte_start, byte_end) = match format_options.crop { Some(crop_size) if crop_size > 0 => { self.crop_bounds(tokens, matches, crop_size) From 048e174efb80daa7ca0ac302f53d36fe514d172e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 10:13:48 +0200 Subject: [PATCH 1457/1889] Do not allocate when parsing CSV headers --- milli/src/documents/builder.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 2be7c1dd8..391175f31 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -108,7 +108,7 @@ impl DocumentBatchBuilder { .headers()? .into_iter() .map(parse_csv_header) - .map(|(k, t)| (this.index.insert(&k), t)) + .map(|(k, t)| (this.index.insert(k), t)) .collect::<Vec<_>>(); for (i, record) in records.into_records().enumerate() { @@ -161,16 +161,16 @@ enum AllowedType { String, Number } -fn parse_csv_header(header: &str) -> (String, AllowedType) { +fn parse_csv_header(header: &str) -> (&str, AllowedType) { // if there are several separators we only split on the last one. match header.rsplit_once(':') { Some((field_name, field_type)) => match field_type { -"string" => (field_name.to_string(), AllowedType::String), -"number" => (field_name.to_string(), AllowedType::Number), +"string" => (field_name, AllowedType::String), +"number" => (field_name, AllowedType::Number), // if the pattern isn't reconized, we keep the whole field.
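Concretely, the typed-header convention handled by the now-borrowing `parse_csv_header` behaves like this (`AllowedType` derives no `PartialEq`, hence `matches!` instead of `assert_eq!`):

```rust
// Behavior sketch of `parse_csv_header` as defined in the hunk above.
assert!(matches!(parse_csv_header("price:number"), ("price", AllowedType::Number)));
assert!(matches!(parse_csv_header("title:string"), ("title", AllowedType::String)));
// `rsplit_once` splits on the *last* colon, so field names may contain colons:
assert!(matches!(parse_csv_header("nested:a:number"), ("nested:a", AllowedType::Number)));
// An unrecognized suffix keeps the whole header as the field name:
assert!(matches!(parse_csv_header("size:integer"), ("size:integer", AllowedType::String)));
```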
- _otherwise => (header.to_string(), AllowedType::String), + _otherwise => (header, AllowedType::String), }, - None => (header.to_string(), AllowedType::String), + None => (header, AllowedType::String), } } From eb63af1f1024063afc14eca909e99b4b370b2ff1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 13:20:51 +0200 Subject: [PATCH 1458/1889] Update grenad to 0.4.2 --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index feddf91c5..d980c6041 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" -grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } +grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } From 419ce3966c42fcb66fd1dbdb239b4287bf55d74b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:03:48 +0200 Subject: [PATCH 1459/1889] Rework the DocumentsBatchBuilder/Reader to use grenad --- milli/src/documents/builder.rs | 216 +++++++++++++++--------------- milli/src/documents/mod.rs | 107 ++++++--------- milli/src/documents/reader.rs | 117 +++++++++------- milli/src/documents/serde_impl.rs | 134 ------------------ 4 files changed, 218 insertions(+), 356 deletions(-) delete mode 100644 milli/src/documents/serde_impl.rs diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 391175f31..159afb8d9 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,157 +1,159 @@ -use std::collections::BTreeMap; -use std::io; -use std::io::{Cursor, Write}; +use std::io::{self, Write}; -use byteorder::{BigEndian, WriteBytesExt}; -use serde::Deserializer; -use serde_json::Value; +use grenad::{CompressionType, WriterBuilder}; +use serde_json::{to_writer, Map, Value}; -use super::serde_impl::DocumentVisitor; -use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; -use crate::FieldId; +use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary /// format used by milli. /// -/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to -/// iterate over the documents. +/// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader` +/// to iterate over the documents. 
/// /// ## example: /// ``` -/// use milli::documents::DocumentBatchBuilder; /// use serde_json::json; -/// use std::io::Cursor; +/// use milli::documents::DocumentsBatchBuilder; /// -/// let json = r##"{"id": 1, "name": "foo"}"##; -/// let mut writer = Cursor::new(Vec::new()); -/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); -/// builder.extend_from_json(&mut json.as_bytes()).unwrap(); -/// builder.finish().unwrap(); +/// let json = json!({ "id": 1, "name": "foo" }); +/// +/// let mut builder = DocumentsBatchBuilder::new(Vec::new()); +/// builder.append_json_object(json.as_object().unwrap()).unwrap(); +/// let _vector = builder.into_inner().unwrap(); /// ``` -pub struct DocumentBatchBuilder { - inner: ByteCounter, - index: DocumentsBatchIndex, +pub struct DocumentsBatchBuilder { + /// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`. + writer: grenad::Writer, + /// A map that creates the relation between field ids and field names. + fields_index: DocumentsBatchIndex, + /// The number of documents that were added to this builder, + /// it doesn't take the primary key of the documents into account at this point. + documents_count: u32, + + /// A buffer to store a temporary obkv buffer and avoid reallocating. obkv_buffer: Vec, + /// A buffer to serialize the values and avoid reallocating, + /// serialized values are stored in an obkv. value_buffer: Vec, - values: BTreeMap, - count: usize, } -impl DocumentBatchBuilder { - pub fn new(writer: W) -> Result { - let index = DocumentsBatchIndex::default(); - let mut writer = ByteCounter::new(writer); - // add space to write the offset of the metadata at the end of the writer - writer.write_u64::(0)?; - - Ok(Self { - inner: writer, - index, +impl DocumentsBatchBuilder { + pub fn new(writer: W) -> DocumentsBatchBuilder { + DocumentsBatchBuilder { + writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer), + fields_index: DocumentsBatchIndex::default(), + documents_count: 0, obkv_buffer: Vec::new(), value_buffer: Vec::new(), - values: BTreeMap::new(), - count: 0, - }) + } } - /// Returns the number of documents that have been written to the builder. - pub fn len(&self) -> usize { - self.count + /// Returns the number of documents inserted into this builder. + pub fn documents_count(&self) -> u32 { + self.documents_count } - /// This method must be called after the document addition is terminated. It will put the - /// metadata at the end of the file, and write the metadata offset at the beginning on the - /// file. - pub fn finish(self) -> Result { - let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; + /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly. + pub fn append_json_object(&mut self, object: &Map) -> io::Result<()> { + // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
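A hedged usage sketch for the `append_csv` path being defined here — the in-memory CSV and its type suffixes are invented for illustration:

```rust
// Build a batch from an in-memory CSV with typed headers.
let csv_content = "id:number,title:string\n1,foo\n2,bar";
let csv = csv::Reader::from_reader(csv_content.as_bytes());

let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_csv(csv).unwrap();
assert_eq!(builder.documents_count(), 2);
let _vector = builder.into_inner().unwrap();
```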
+ let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); + fields_ids.sort_unstable(); - let meta = DocumentsMetadata { count, index }; + self.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer); + for field_id in fields_ids { + let key = self.fields_index.name(field_id).unwrap(); + self.value_buffer.clear(); + to_writer(&mut self.value_buffer, &object[key])?; + writer.insert(field_id, &self.value_buffer)?; + } - bincode::serialize_into(&mut writer, &meta)?; + let internal_id = self.documents_count.to_be_bytes(); + let document_bytes = writer.into_inner()?; + self.writer.insert(internal_id, &document_bytes)?; + self.documents_count += 1; - writer.seek(io::SeekFrom::Start(0))?; - writer.write_u64::(offset as u64)?; - - writer.flush()?; - - Ok(count) + Ok(()) } - /// Extends the builder with json documents from a reader. - pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { - let mut de = serde_json::Deserializer::from_reader(reader); - - let mut visitor = DocumentVisitor { - inner: &mut self.inner, - index: &mut self.index, - obkv_buffer: &mut self.obkv_buffer, - value_buffer: &mut self.value_buffer, - values: &mut self.values, - count: &mut self.count, - }; - - de.deserialize_any(&mut visitor).map_err(Error::JsonError)? - } - - /// Creates a builder from a reader of CSV documents. - /// - /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform - /// optimisations, and extending from another CSV is not allowed. - pub fn from_csv(reader: R, writer: W) -> Result { - let mut this = Self::new(writer)?; - // Ensure that this is the first and only addition made with this builder - debug_assert!(this.index.is_empty()); - - let mut records = csv::Reader::from_reader(reader); - - let headers = records + /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly. + pub fn append_csv(&mut self, mut reader: csv::Reader) -> Result<(), Error> { + // Make sure that we insert the fields ids in order as the obkv writer has this requirement. + let mut typed_fields_ids: Vec<_> = reader .headers()? .into_iter() .map(parse_csv_header) - .map(|(k, t)| (this.index.insert(k), t)) - .collect::>(); + .map(|(k, t)| (self.fields_index.insert(k), t)) + .enumerate() + .collect(); + typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid); - for (i, record) in records.into_records().enumerate() { - let record = record?; - this.obkv_buffer.clear(); - let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer); - for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { - let value = match ty { + let mut record = csv::StringRecord::new(); + let mut line = 0; + while reader.read_record(&mut record)? { + // We increment here and not at the end of the while loop to take + // the header offset into account. + line += 1; + + self.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer); + + for (i, (field_id, type_)) in typed_fields_ids.iter() { + self.value_buffer.clear(); + + let value = &record[*i]; + match type_ { AllowedType::Number => { if value.trim().is_empty() { - Value::Null + to_writer(&mut self.value_buffer, &Value::Null)?; } else { - value.trim().parse::().map(Value::from).map_err(|error| { - Error::ParseFloat { - error, - // +1 for the header offset. - line: i + 1, - value: value.to_string(), + match value.trim().parse::() { + Ok(float) => { + to_writer(&mut self.value_buffer, &float)?; } - })? 
+ Err(error) => { + return Err(Error::ParseFloat { + error, + line, + value: value.to_string(), + }); + } + } } } AllowedType::String => { if value.is_empty() { - Value::Null + to_writer(&mut self.value_buffer, &Value::Null)?; } else { - Value::String(value.to_string()) + to_writer(&mut self.value_buffer, value)?; } } - }; + } - this.value_buffer.clear(); - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; - writer.insert(*fid, &this.value_buffer)?; + // We insert into the obkv writer the value buffer that has been filled just above. + writer.insert(*field_id, &self.value_buffer)?; } - this.inner.write_u32::(this.obkv_buffer.len() as u32)?; - this.inner.write_all(&this.obkv_buffer)?; - - this.count += 1; + let internal_id = self.documents_count.to_be_bytes(); + let document_bytes = writer.into_inner()?; + self.writer.insert(internal_id, &document_bytes)?; + self.documents_count += 1; } - Ok(this) + Ok(()) + } + + /// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`. + pub fn into_inner(mut self) -> io::Result { + let DocumentsBatchBuilder { mut writer, fields_index, .. } = self; + + // We serialize and insert the `DocumentsBatchIndex` as the last key of the grenad writer. + self.value_buffer.clear(); + to_writer(&mut self.value_buffer, &fields_index)?; + writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &self.value_buffer)?; + + writer.into_inner() } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 09f15901d..bd0afc6e4 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,24 +1,22 @@ mod builder; -/// The documents module defines an intermediary document format that milli uses for indexation, and -/// provides an API to easily build and read such documents. -/// -/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can -/// later be read by milli using the `DocumentBatchReader` interface. mod reader; -mod serde_impl; use std::fmt::{self, Debug}; use std::io; use bimap::BiHashMap; -pub use builder::DocumentBatchBuilder; -pub use reader::DocumentBatchReader; +pub use builder::DocumentsBatchBuilder; +pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; use crate::FieldId; +/// The key that is used to store the `DocumentsBatchIndex` datastructure, +/// it is the absolute last key of the list. +const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); + /// A bidirectional map that links field ids to their name in a document batch. 
-#[derive(Default, Debug, Serialize, Deserialize)] +#[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct DocumentsBatchIndex(pub BiHashMap); impl DocumentsBatchIndex { @@ -46,8 +44,8 @@ impl DocumentsBatchIndex { self.0.iter() } - pub fn name(&self, id: FieldId) -> Option<&String> { - self.0.get_by_left(&id) + pub fn name(&self, id: FieldId) -> Option<&str> { + self.0.get_by_left(&id).map(AsRef::as_ref) } pub fn recreate_json( @@ -69,50 +67,20 @@ impl DocumentsBatchIndex { } } -#[derive(Debug, Serialize, Deserialize)] -struct DocumentsMetadata { - count: usize, - index: DocumentsBatchIndex, -} - -pub struct ByteCounter { - count: usize, - writer: W, -} - -impl ByteCounter { - fn new(writer: W) -> Self { - Self { count: 0, writer } - } -} - -impl io::Write for ByteCounter { - fn write(&mut self, buf: &[u8]) -> io::Result { - let count = self.writer.write(buf)?; - self.count += count; - Ok(count) - } - - fn flush(&mut self) -> io::Result<()> { - self.writer.flush() - } -} - #[derive(Debug)] pub enum Error { ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, - Custom(String), - JsonError(serde_json::Error), - CsvError(csv::Error), - Serialize(bincode::Error), + Csv(csv::Error), + Json(serde_json::Error), + Serialize(serde_json::Error), + Grenad(grenad::Error), Io(io::Error), - DocumentTooLarge, } impl From for Error { fn from(e: csv::Error) -> Self { - Self::CsvError(e) + Self::Csv(e) } } @@ -122,15 +90,15 @@ impl From for Error { } } -impl From for Error { - fn from(other: bincode::Error) -> Self { - Self::Serialize(other) +impl From for Error { + fn from(other: serde_json::Error) -> Self { + Self::Json(other) } } -impl From for Error { - fn from(other: serde_json::Error) -> Self { - Self::JsonError(other) +impl From for Error { + fn from(other: grenad::Error) -> Self { + Self::Grenad(other) } } @@ -140,13 +108,14 @@ impl fmt::Display for Error { Error::ParseFloat { error, line, value } => { write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) } - Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), - Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), - Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), + Error::InvalidDocumentFormat => { + f.write_str("Invalid document addition format, missing the documents batch index.") + } Error::Io(e) => write!(f, "{}", e), - Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), Error::Serialize(e) => write!(f, "{}", e), - Error::CsvError(e) => write!(f, "{}", e), + Error::Grenad(e) => write!(f, "{}", e), + Error::Csv(e) => write!(f, "{}", e), + Error::Json(e) => write!(f, "{}", e), } } } @@ -158,15 +127,25 @@ impl std::error::Error for Error {} macro_rules! 
documents { ($data:tt) => {{ let documents = serde_json::json!($data); - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); + let documents = match documents { + object @ serde_json::Value::Object(_) => vec![object], + serde_json::Value::Array(objects) => objects, + invalid => { + panic!("an array of objects must be specified, {:#?} is not an array", invalid) + } + }; - writer.set_position(0); + let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new()); + for document in documents { + let object = match document { + serde_json::Value::Object(object) => object, + invalid => panic!("an object must be specified, {:#?} is not an object", invalid), + }; + builder.append_json_object(&object).unwrap(); + } - crate::documents::DocumentBatchReader::from_reader(writer).unwrap() + let vector = builder.into_inner().unwrap(); + crate::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap() }}; } diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 14d7c8ceb..3dff999f5 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,11 +1,9 @@ +use std::convert::TryInto; use std::io; -use std::io::{BufReader, Read}; -use std::mem::size_of; -use byteorder::{BigEndian, ReadBytesExt}; use obkv::KvReader; -use super::{DocumentsBatchIndex, DocumentsMetadata, Error}; +use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; use crate::FieldId; /// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with @@ -13,63 +11,80 @@ use crate::FieldId; /// /// The documents are returned in the form of `obkv::Reader` where each field is identified with a /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. -pub struct DocumentBatchReader<R> { - reader: BufReader<R>, - metadata: DocumentsMetadata, - buffer: Vec<u8>, - seen_documents: usize, +pub struct DocumentsBatchReader<R> { + cursor: grenad::ReaderCursor<R>, + fields_index: DocumentsBatchIndex, } -impl<R: io::Read + io::Seek> DocumentBatchReader<R> { +impl<R: io::Read + io::Seek> DocumentsBatchReader<R> { /// Construct a `DocumentsReader` from a reader. /// - /// It first retrieves the index, then moves to the first document. Subsequent calls to - /// `next_document` advance the document reader until all the documents have been read. - pub fn from_reader(mut reader: R) -> Result<Self, Error> { - let mut buffer = Vec::new(); + /// It first retrieves the index, then moves to the first document. Use the `into_cursor` + /// method to iterate over the documents, from the first to the last. + pub fn from_reader(reader: R) -> Result<Self, Error> { + let reader = grenad::Reader::new(reader)?; + let mut cursor = reader.into_cursor()?; - let meta_offset = reader.read_u64::<BigEndian>()?; - reader.seek(io::SeekFrom::Start(meta_offset))?; - reader.read_to_end(&mut buffer)?; - let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?; + let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)?
{ + Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?, + None => return Err(Error::InvalidDocumentFormat), + }; - reader.seek(io::SeekFrom::Start(size_of::() as u64))?; - buffer.clear(); - - let reader = BufReader::new(reader); - - Ok(Self { reader, metadata, buffer, seen_documents: 0 }) + Ok(DocumentsBatchReader { cursor, fields_index }) } - /// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a - /// reference to the addition index. - pub fn next_document_with_index<'a>( - &'a mut self, - ) -> io::Result)>> { - if self.seen_documents < self.metadata.count { - let doc_len = self.reader.read_u32::()?; - self.buffer.resize(doc_len as usize, 0); - self.reader.read_exact(&mut self.buffer)?; - self.seen_documents += 1; - - let reader = KvReader::new(&self.buffer); - Ok(Some((&self.metadata.index, reader))) - } else { - Ok(None) - } - } - - /// Return the fields index for the documents batch. - pub fn index(&self) -> &DocumentsBatchIndex { - &self.metadata.index - } - - /// Returns the number of documents in the reader. - pub fn len(&self) -> usize { - self.metadata.count + pub fn documents_count(&self) -> u32 { + self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents") } pub fn is_empty(&self) -> bool { - self.len() == 0 + self.cursor.len().saturating_sub(1) == 0 + } + + pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { + &self.fields_index + } + + /// This method returns a forward cursor over the documents. + pub fn into_cursor(self) -> DocumentsBatchCursor { + let DocumentsBatchReader { cursor, fields_index } = self; + let mut cursor = DocumentsBatchCursor { cursor, fields_index }; + cursor.reset(); + cursor + } +} + +/// A forward cursor over the documents in a `DocumentsBatchReader`. +pub struct DocumentsBatchCursor { + cursor: grenad::ReaderCursor, + fields_index: DocumentsBatchIndex, +} + +impl DocumentsBatchCursor { + pub fn into_reader(self) -> DocumentsBatchReader { + let DocumentsBatchCursor { cursor, fields_index, .. } = self; + DocumentsBatchReader { cursor, fields_index } + } + + pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { + &self.fields_index + } + + /// Resets the cursor to be able to read from the start again. + pub fn reset(&mut self) { + self.cursor.reset(); + } +} + +impl DocumentsBatchCursor { + /// Returns the next document, starting from the first one. Subsequent calls to + /// `next_document` advance the document reader until all the documents have been read. + pub fn next_document(&mut self) -> Result>, grenad::Error> { + match self.cursor.move_on_next()? { + Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { + Ok(Some(KvReader::new(value))) + } + _otherwise => Ok(None), + } } } diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs deleted file mode 100644 index d57bf1ffb..000000000 --- a/milli/src/documents/serde_impl.rs +++ /dev/null @@ -1,134 +0,0 @@ -use std::collections::BTreeMap; -use std::fmt; -use std::io::{Cursor, Write}; - -use byteorder::WriteBytesExt; -use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; -use serde::Deserialize; -use serde_json::Value; - -use super::{ByteCounter, DocumentsBatchIndex, Error}; -use crate::FieldId; - -macro_rules! 
tri { - ($e:expr) => { - match $e { - Ok(r) => r, - Err(e) => return Ok(Err(e.into())), - } - }; -} - -struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); - -impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { - type Value = FieldId; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_str(self) - } -} - -impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { - type Value = FieldId; - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - Ok(self.0.insert(v)) - } - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "a string") - } -} - -struct ValueDeserializer; - -impl<'de> DeserializeSeed<'de> for ValueDeserializer { - type Value = serde_json::Value; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - serde_json::Value::deserialize(deserializer) - } -} - -pub struct DocumentVisitor<'a, W> { - pub inner: &'a mut ByteCounter, - pub index: &'a mut DocumentsBatchIndex, - pub obkv_buffer: &'a mut Vec, - pub value_buffer: &'a mut Vec, - pub values: &'a mut BTreeMap, - pub count: &'a mut usize, -} - -impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { - /// This Visitor value is nothing, since it write the value to a file. - type Value = Result<(), Error>; - - fn visit_seq(self, mut seq: A) -> Result - where - A: SeqAccess<'de>, - { - while let Some(v) = seq.next_element_seed(&mut *self)? { - tri!(v) - } - - Ok(Ok(())) - } - - fn visit_map(self, mut map: A) -> Result - where - A: MapAccess<'de>, - { - while let Some((key, value)) = - map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? - { - self.values.insert(key, value); - } - - self.obkv_buffer.clear(); - let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); - for (key, value) in self.values.iter() { - self.value_buffer.clear(); - // This is guaranteed to work - tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); - tri!(obkv.insert(*key, &self.value_buffer)); - } - - let reader = tri!(obkv.into_inner()).into_inner(); - - tri!(self.inner.write_u32::(reader.len() as u32)); - tri!(self.inner.write_all(reader)); - - *self.count += 1; - self.values.clear(); - - Ok(Ok(())) - } - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "a documents, or a sequence of documents.") - } -} - -impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> -where - W: Write, -{ - type Value = Result<(), Error>; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_map(self) - } -} From e8297ad27e4977f4f43c43f181dc9d7c9ea041dd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:04:27 +0200 Subject: [PATCH 1460/1889] Fix the tests for the new DocumentsBatchBuilder/Reader --- milli/src/documents/builder.rs | 332 +++++++++--------- milli/src/documents/mod.rs | 133 ++----- milli/src/search/distinct/mod.rs | 31 +- milli/src/update/index_documents/mod.rs | 63 ++-- milli/src/update/index_documents/transform.rs | 12 +- milli/tests/search/facet_distribution.rs | 26 +- milli/tests/search/mod.rs | 18 +- milli/tests/search/query_criteria.rs | 20 +- milli/tests/search/typo_tolerance.rs | 31 +- 9 files changed, 292 insertions(+), 374 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 159afb8d9..19cc1ce53 100644 --- a/milli/src/documents/builder.rs +++ 
b/milli/src/documents/builder.rs @@ -183,7 +183,8 @@ mod test { use serde_json::{json, Map}; use super::*; - use crate::documents::DocumentBatchReader; + use crate::documents::DocumentsBatchReader; + use crate::FieldId; fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { let mut map = Map::new(); @@ -192,7 +193,7 @@ mod test { let field_name = index.name(fid).unwrap().clone(); let value: Value = serde_json::from_slice(value).unwrap(); - map.insert(field_name, value); + map.insert(field_name.to_string(), value); } Value::Object(map) @@ -200,15 +201,13 @@ mod test { #[test] fn add_single_documents_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let json = serde_json::json!({ "id": 1, "field": "hello!", }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(json.as_object().unwrap()).unwrap(); let json = serde_json::json!({ "blabla": false, @@ -216,100 +215,64 @@ mod test { "id": 1, }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + builder.append_json_object(json.as_object().unwrap()).unwrap(); - assert_eq!(builder.len(), 2); + assert_eq!(builder.documents_count(), 2); + let vector = builder.into_inner().unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 3); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 3); - assert!(reader.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_seq_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let json = serde_json::json!([{ - "id": 1, - "field": "hello!", - },{ - "blabla": false, - "field": "hello!", - "id": 1, - } - ]); - - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); - - assert_eq!(builder.len(), 2); - - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 2); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 3); - - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn add_documents_csv() { - let mut cursor = Cursor::new(Vec::new()); + let csv_content = "id:number,field:string\n1,hello!\n2,blabla"; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let csv = "id:number,field:string\n1,hello!\n2,blabla"; + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + assert_eq!(builder.documents_count(), 2); + let 
vector = builder.into_inner().unwrap(); - let builder = - DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 2); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn simple_csv_document() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -320,22 +283,25 @@ mod test { }) ); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn coma_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United, States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -349,17 +315,20 @@ mod test { #[test] fn quote_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United"" States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = 
reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -373,17 +342,20 @@ mod test { #[test] fn integer_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -397,17 +369,20 @@ mod test { #[test] fn float_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910.01""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -421,17 +396,20 @@ mod test { #[test] fn several_colon_in_header() { - let documents = r#"city:love:string,country:state,pop + let csv_content = r#"city:love:string,country:state,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( 
val, @@ -445,17 +423,20 @@ mod test { #[test] fn ending_by_colon_in_header() { - let documents = r#"city:,country,pop + let csv_content = r#"city:,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -469,17 +450,20 @@ mod test { #[test] fn starting_by_colon_in_header() { - let documents = r#":city,country,pop + let csv_content = r#":city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -494,32 +478,36 @@ mod test { #[ignore] #[test] fn starting_by_colon_in_header2() { - let documents = r#":string,country,pop + let csv_content = r#":string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); - assert!(reader.next_document_with_index().is_err()); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + + assert!(cursor.next_document().is_err()); } #[test] fn double_colon_in_header() { - let documents = r#"city::string,country,pop + let csv_content = r#"city::string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = 
builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -533,34 +521,32 @@ mod test { #[test] fn bad_type_in_header() { - let documents = r#"city,country:number,pop + let csv_content = r#"city,country:number,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - assert!( - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() - ); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); } #[test] fn bad_column_count1() { - let documents = r#"city,country,pop -"Boston","United States","4628910", "too much""#; + let csv_content = r#"city,country,pop +"Boston","United States","4628910", "too much""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - assert!( - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() - ); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); } #[test] fn bad_column_count2() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United States""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - assert!( - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() - ); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index bd0afc6e4..7a34ae13b 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -159,7 +159,7 @@ mod test { #[test] fn create_documents_no_errors() { - let json = json!({ + let value = json!({ "number": 1, "string": "this is a field", "array": ["an", "array"], @@ -169,26 +169,17 @@ mod test { "bool": true }); - let json = serde_json::to_vec(&json).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(json)).unwrap(); - - builder.finish().unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(value.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - assert_eq!(documents.index().iter().count(), 5); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 5); - assert!(documents.next_document_with_index().unwrap().is_none()); + assert_eq!(documents.documents_batch_index().iter().count(), 5); + let reader = documents.next_document().unwrap().unwrap(); + assert_eq!(reader.iter().count(), 5); + assert!(documents.next_document().unwrap().is_none()); } #[test] @@ -200,101 +191,55 @@ mod test { "toto": false, }); - let doc1 = serde_json::to_vec(&doc1).unwrap(); - let doc2 = serde_json::to_vec(&doc2).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); -
- let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(doc1)).unwrap(); - builder.extend_from_json(Cursor::new(doc2)).unwrap(); - - builder.finish().unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_array() { - let docs = json!([ - { "toto": false }, - { "tata": "hello" }, - ]); - - let docs = serde_json::to_vec(&docs).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(docs)).unwrap(); - - builder.finish().unwrap(); - - let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_invalid_document_format() { - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let docs = json!([[ - { "toto": false }, - { "tata": "hello" }, - ]]); - - let docs = serde_json::to_vec(&docs).unwrap(); - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); - - let docs = json!("hello"); - let docs = serde_json::to_vec(&docs).unwrap(); - - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); + DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); + assert_eq!(documents.documents_batch_index().iter().count(), 2); + let reader = documents.next_document().unwrap().unwrap(); + assert_eq!(reader.iter().count(), 1); + assert!(documents.next_document().unwrap().is_some()); + assert!(documents.next_document().unwrap().is_none()); } #[test] fn test_nested() { - let mut docs = documents!([{ + let docs_reader = documents!([{ "hello": { "toto": ["hello"] } }]); - let (_index, doc) = docs.next_document_with_index().unwrap().unwrap(); - + let mut cursor = docs_reader.into_cursor(); + let doc = cursor.next_document().unwrap().unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); assert_eq!(nested, json!({ "toto": ["hello"] })); } #[test] - fn out_of_order_fields() { + fn out_of_order_json_fields() { let _documents = documents!([ {"id": 1,"b": 0}, {"id": 2,"a": 0,"b": 0}, ]); } + + #[test] + fn out_of_order_csv_fields() { + let csv1_content = "id:number,b\n1,0"; + let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content)); + + let csv2_content = "id:number,a,b\n2,0,0"; + let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv1).unwrap(); + builder.append_csv(csv2).unwrap(); + 
let vector = builder.into_inner().unwrap(); + + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + } } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 237fd718a..670fa01ac 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -35,7 +35,7 @@ mod test { use roaring::RoaringBitmap; use serde_json::{json, Value}; - use crate::documents::{DocumentBatchBuilder, DocumentBatchReader}; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::index::Index; use crate::update::{ @@ -43,14 +43,11 @@ mod test { }; use crate::{DocumentId, FieldId, BEU32}; - static JSON: Lazy> = Lazy::new(generate_documents); - - fn generate_documents() -> Vec { + static JSON: Lazy> = Lazy::new(|| { let mut rng = rand::thread_rng(); let num_docs = rng.gen_range(10..30); - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -63,7 +60,7 @@ mod test { let mut sample_ints = cat_ints.clone(); sample_ints.shuffle(&mut rng); - let doc = json!({ + let json = json!({ "id": i, "txt": txt, "cat-int": rng.gen_range(0..3), @@ -71,13 +68,16 @@ mod test { "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); - builder.extend_from_json(doc).unwrap(); + let object = match json { + Value::Object(object) => object, + _ => panic!(), + }; + + builder.append_json_object(&object).unwrap(); } - builder.finish().unwrap(); - cursor.into_inner() - } + builder.into_inner().unwrap() + }); /// Returns a temporary index populated with random test documents, the FieldId for the /// distinct attribute, and the RoaringBitmap with the document ids. @@ -101,7 +101,8 @@ mod test { IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = - crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) + .unwrap(); addition.add_documents(reader).unwrap(); addition.execute().unwrap(); @@ -109,8 +110,8 @@ mod test { let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); - let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); - let map = (0..documents.len() as u32).collect(); + let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); + let map = (0..documents.documents_count() as u32).collect(); txn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba428f078..7f6e00b11 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -25,7 +25,7 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use crate::documents::DocumentBatchReader; +use crate::documents::DocumentsBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, @@ -121,7 +121,7 @@ where /// builder, and the builder must be discarded. /// /// Returns the number of documents added to the builder. 
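// A hedged sketch of the full round trip these new signatures imply, composed only from calls
// that appear in this patch series; the `indexer` handle and the `doc` value are illustrative
// assumptions, not part of the patch:
//
//     let mut builder = DocumentsBatchBuilder::new(Vec::new());
//     builder.append_json_object(doc.as_object().unwrap())?;
//     let vector = builder.into_inner()?;
//     let reader = DocumentsBatchReader::from_reader(Cursor::new(vector))?;
//     indexer.add_documents(reader)?;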
- pub fn add_documents(&mut self, reader: DocumentBatchReader) -> Result + pub fn add_documents(&mut self, reader: DocumentsBatchReader) -> Result where R: Read + Seek, { @@ -590,9 +590,8 @@ mod tests { use maplit::hashset; use super::*; - use crate::documents::DocumentBatchBuilder; + use crate::documents::DocumentsBatchBuilder; use crate::update::DeleteDocuments; - use crate::HashMap; #[test] fn simple_document_replacement() { @@ -1252,21 +1251,17 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); - let mut big_object = HashMap::new(); - big_object.insert(S("id"), "wow"); + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); for i in 0..1000 { let key = i.to_string(); - big_object.insert(key, "I am a text!"); + big_object.insert(key, serde_json::Value::from("I am a text!")); } - let mut cursor = Cursor::new(Vec::new()); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap()); - builder.extend_from_json(big_object).unwrap(); - builder.finish().unwrap(); - cursor.set_position(0); - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(&big_object).unwrap(); + let vector = builder.into_inner().unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); @@ -1288,23 +1283,19 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); - let mut big_object = HashMap::new(); - big_object.insert(S("id"), "wow"); + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); let content: String = (0..=u16::MAX) .into_iter() .map(|p| p.to_string()) .reduce(|a, b| a + " " + b.as_ref()) .unwrap(); - big_object.insert("content".to_string(), &content); + big_object.insert("content".to_string(), serde_json::Value::from(content)); - let mut cursor = Cursor::new(Vec::new()); - - let big_object = serde_json::to_string(&big_object).unwrap(); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); - builder.finish().unwrap(); - cursor.set_position(0); - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(&big_object).unwrap(); + let vector = builder.into_inner().unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); @@ -1843,18 +1834,20 @@ mod tests { // Create 200 documents with a long text let content = { - let documents: Vec<_> = (0..200i32) + let documents_iter = (0..200i32) .into_iter() .map(|i| serde_json::json!({ "id": i, "script": script })) - .collect(); + .filter_map(|json| match json { + serde_json::Value::Object(object) => Some(object), + _ => None, + }); - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); - writer.set_position(0); - crate::documents::DocumentBatchReader::from_reader(writer).unwrap() + let mut builder = 
crate::documents::DocumentsBatchBuilder::new(Vec::new()); + for object in documents_iter { + builder.append_json_object(&object).unwrap(); + } + let vector = builder.into_inner().unwrap(); + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap() }; // Index those 200 long documents diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 12a858024..129357075 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,7 +14,7 @@ use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; -use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; @@ -152,7 +152,7 @@ impl<'a, 'i> Transform<'a, 'i> { pub fn read_documents( &mut self, - mut reader: DocumentBatchReader, + reader: DocumentsBatchReader, wtxn: &mut heed::RwTxn, progress_callback: F, ) -> Result @@ -160,7 +160,8 @@ impl<'a, 'i> Transform<'a, 'i> { R: Read + Seek, F: Fn(UpdateIndexingStep) + Sync, { - let fields_index = reader.index(); + let mut cursor = reader.into_cursor(); + let fields_index = cursor.documents_batch_index(); let external_documents_ids = self.index.external_documents_ids(wtxn)?; let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; @@ -186,7 +187,8 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); - while let Some((addition_index, document)) = reader.next_document_with_index()? { + let addition_index = cursor.documents_batch_index().clone(); + while let Some(document) = cursor.next_document()? 
{ let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -840,7 +842,7 @@ fn update_primary_key<'a>( None => { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.name(key).cloned(); + let key = addition_index.name(key).map(ToString::to_string); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index d3aece2ab..66713de1e 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -3,9 +3,10 @@ use std::io::Cursor; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{FacetDistribution, Index}; +use serde_json::{Deserializer, Map, Value}; #[test] fn test_facet_distribution_with_no_facet_values() { @@ -30,35 +31,30 @@ fn test_facet_distribution_with_no_facet_values() { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( - r#"[ - { + r#"{ "id": 123, "title": "What a week, hu...", "genres": [], "tags": ["blue"] - }, + } { "id": 345, "title": "I am the pig!", "tags": ["red"] - } - ]"#, + }"#, ); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); - documents_builder.extend_from_json(doc).unwrap(); + for result in Deserializer::from_reader(reader).into_iter::>() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); } - documents_builder.finish().unwrap(); - - cursor.set_position(0); + let vector = documents_builder.into_inner().unwrap(); // index documents - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 472fbafe0..4cf117dc7 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -6,10 +6,11 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use serde::Deserialize; +use serde_json::{Deserializer, Map, Value}; use slice_group_by::GroupBy; mod distinct; @@ -62,21 +63,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut documents_builder = 
DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); - documents_builder.extend_from_json(doc).unwrap(); + for result in Deserializer::from_reader(reader).into_iter::>() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); } - documents_builder.finish().unwrap(); - - cursor.set_position(0); + let vector = documents_builder.into_inner().unwrap(); // index documents - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 893d7c30a..89a6a6eec 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -5,7 +5,7 @@ use big_s::S; use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use rand::Rng; @@ -393,8 +393,7 @@ fn criteria_ascdesc() { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { let mut rng = rand::thread_rng(); @@ -412,16 +411,17 @@ fn criteria_ascdesc() { "age": age, }); - let json = Cursor::new(serde_json::to_vec(&json).unwrap()); - batch_builder.extend_from_json(json).unwrap(); + let object = match json { + serde_json::Value::Object(object) => object, + _ => panic!(), + }; + + batch_builder.append_json_object(&object).unwrap(); }); - batch_builder.finish().unwrap(); - - cursor.set_position(0); - - let reader = DocumentBatchReader::from_reader(cursor).unwrap(); + let vector = batch_builder.into_inner().unwrap(); + let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(reader).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 9a7986c5e..63bf22579 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -106,26 +106,23 @@ fn test_typo_disabled_on_word() { options.map_size(4096 * 100); let index = Index::new(options, tmp.path()).unwrap(); - let documents = json!([ - { - "id": 1usize, - "data": "zealand", - }, - { - "id": 2usize, - "data": "zearand", - }, - ]); + let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new()); + let doc1 = json!({ + "id": 1usize, + "data": "zealand", + }); - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); + let doc2 = json!({ + "id": 2usize, + "data": "zearand", + }); - writer.set_position(0); + 
builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); - let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap(); + let documents = + milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap(); let mut txn = index.write_txn().unwrap(); let config = IndexerConfig::default(); From 6d0498df2445975d8cfff3cb8c4b1453bd5e00e6 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:22:59 +0200 Subject: [PATCH 1461/1889] Fix the fuzz tests --- milli/fuzz/fuzz_targets/indexing.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index b618aabad..5c3b79ed7 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -7,10 +7,10 @@ use anyhow::{bail, Result}; use arbitrary_json::ArbitraryValue; use heed::EnvOpenOptions; use libfuzzer_sys::fuzz_target; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::Index; -use serde_json::Value; +use serde_json::{Map, Value}; #[cfg(target_os = "linux")] #[global_allocator] @@ -19,21 +19,26 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; /// reads json from input and write an obkv batch to writer. pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { let writer = BufWriter::new(writer); - let mut builder = DocumentBatchBuilder::new(writer)?; - builder.extend_from_json(input)?; + let mut builder = DocumentsBatchBuilder::new(writer); - if builder.len() == 0 { + let values: Vec> = serde_json::from_reader(input)?; + if builder.documents_count() == 0 { bail!("Empty payload"); } - let count = builder.finish()?; + for object in values { + builder.append_json_object(&object)?; + } - Ok(count) + let count = builder.documents_count(); + let vector = builder.into_inner()?; + + Ok(count as usize) } fn index_documents( index: &mut milli::Index, - documents: DocumentBatchReader>>, + documents: DocumentsBatchReader>>, ) -> Result<()> { let config = IndexerConfig::default(); let mut wtxn = index.write_txn()?; @@ -98,7 +103,7 @@ fuzz_target!(|batches: Vec>| { // We ignore all malformed documents if let Ok(_) = read_json(json.as_bytes(), &mut documents) { documents.rewind().unwrap(); - let documents = DocumentBatchReader::from_reader(documents).unwrap(); + let documents = DocumentsBatchReader::from_reader(documents).unwrap(); // A lot of errors can come out of milli and we don't know which ones are normal or not // so we are only going to look for the unexpected panics. 
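// Note that in `read_json` above the `builder.documents_count() == 0` guard runs before any
// object has been appended, so it holds for every input; a sketch of the presumably intended
// ordering (an assumption, not what the patch does):
//
//     for object in values {
//         builder.append_json_object(&object)?;
//     }
//     if builder.documents_count() == 0 {
//         bail!("Empty payload");
//     }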
let _ = index_documents(&mut index, documents); From a4ceef96246e8e121007c16a0d159cd4c99ede11 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:35:59 +0200 Subject: [PATCH 1462/1889] Fix the cli for the new DocumentsBatchBuilder/Reader structs --- cli/src/main.rs | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 14bc797af..dcd0f407a 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -8,6 +8,7 @@ use std::time::Instant; use byte_unit::Byte; use eyre::Result; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; @@ -225,9 +226,9 @@ impl Performer for DocumentAddition { DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, }; - let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?; + let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?; - println!("Adding {} documents to the index.", reader.len()); + println!("Adding {} documents to the index.", reader.documents_count()); let mut txn = index.write_txn()?; let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; @@ -321,35 +322,35 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa } fn documents_from_jsonl(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); - let mut buf = String::new(); - let mut reader = BufReader::new(reader); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_json(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_csv(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } #[derive(Debug, StructOpt)] From f29114f94aff42592d8f0a932784f578433e0744 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 17:21:54 +0200 Subject: [PATCH 1463/1889] Fix http-ui to fit with the new DocumentsBatchBuilder/Reader structs --- http-ui/src/main.rs | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ce4fa7ba5..63b9ee5e0 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; -use std::io::{BufRead, BufReader, Cursor, Read}; +use std::io::{BufReader, Cursor, Read}; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -18,7 +18,7 @@ use either::Either; use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; -use milli::documents::DocumentBatchReader; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::tokenizer::TokenizerBuilder; use milli::update::UpdateIndexingStep::*; use milli::update::{ @@ -399,7 +399,7 @@ async fn main() -> anyhow::Result<()> { otherwise => panic!("invalid update format {:?}", otherwise), }; - let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; + let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; builder.add_documents(documents)?; @@ -1032,35 +1032,36 @@ async fn main() -> anyhow::Result<()> { Ok(()) } -fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); - for result in BufReader::new(reader).lines() { - let line = result?; - documents.extend_from_json(Cursor::new(line))?; + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } -fn documents_from_json(reader: 
impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_json(reader: impl Read) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } -fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; +fn documents_from_csv(reader: impl Read) -> anyhow::Result> { + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } #[cfg(test)] From a97d4d63b9f1f29edac7f78aa33710682996e63c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:17:48 +0200 Subject: [PATCH 1464/1889] Fix the benchmarks --- benchmarks/benches/utils.rs | 41 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index b2a9966a2..091b9b0f5 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,13 +1,13 @@ #![allow(dead_code)] use std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; +use std::io::{self, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; -use milli::documents::DocumentBatchReader; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; @@ -150,39 +150,38 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader documents_from_jsonl(reader).unwrap(), otherwise => panic!("invalid update format {:?}", otherwise), }; - DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap() + DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() } -fn documents_from_jsonl(mut reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let mut buf = String::new(); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; - buf.clear(); + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_csv(reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } enum AllowedType { From bdc426388379e80e03d8291d3a12b2a6361f0571 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:12:15 +0200 Subject: [PATCH 1465/1889] Introduce the validate_documents_batch function --- milli/src/update/index_documents/transform.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 129357075..42187fc1e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -13,7 +13,7 @@ use serde_json::{Map, Value}; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; -use super::{IndexDocumentsMethod, IndexerConfig}; +use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; From cefffde9af704bc9b4146206a852972745941eda Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:07:28 +0200 Subject: [PATCH 1466/1889] Improve the .gitignore of the fuzz crate --- milli/fuzz/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore index cb73742e4..ebf2c9395 100644 --- a/milli/fuzz/.gitignore +++ b/milli/fuzz/.gitignore @@ -1,2 +1,5 @@ +Cargo.lock +target/ + /corpus/ /artifacts/ From 0146175fe60e104519ebd23fa2e14bd1ff3e8bfc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:12:15 +0200 Subject: [PATCH 1467/1889] Introduce the validate_documents_batch function --- milli/src/documents/builder.rs | 36 ++--- milli/src/documents/mod.rs | 28 +++- milli/src/error.rs | 6 + .../extract/extract_geo_points.rs | 9 +- milli/src/update/index_documents/mod.rs | 36 +++-- milli/src/update/index_documents/transform.rs | 26 +--- milli/src/update/index_documents/validate.rs | 140 ++++++++++++++++++ 7 files changed, 208 insertions(+), 73 deletions(-) create mode 100644 milli/src/update/index_documents/validate.rs diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 19cc1ce53..15a22090a 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -180,24 
+180,10 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) { mod test { use std::io::Cursor; - use serde_json::{json, Map}; + use serde_json::json; use super::*; - use crate::documents::DocumentsBatchReader; - use crate::FieldId; - - fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { - let mut map = Map::new(); - - for (fid, value) in obkv.iter() { - let field_name = index.name(fid).unwrap().clone(); - let value: Value = serde_json::from_slice(value).unwrap(); - - map.insert(field_name.to_string(), value); - } - - Value::Object(map) - } + use crate::documents::{obkv_to_object, DocumentsBatchReader}; #[test] fn add_single_documents_json() { @@ -272,7 +258,7 @@ mod test { DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -301,7 +287,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -328,7 +314,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -355,7 +341,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -382,7 +368,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -409,7 +395,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -436,7 +422,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -463,7 +449,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -507,7 +493,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 7a34ae13b..ee3593bf8 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -6,15 +6,30 @@ use std::io; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; +use obkv::KvReader; pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; -use crate::FieldId; +use 
crate::error::{FieldIdMapMissingEntry, InternalError}; +use crate::{FieldId, Object, Result}; /// The key that is used to store the `DocumentsBatchIndex` datastructure, /// it is the absolute last key of the list. const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); +/// Helper function to convert an obkv reader into a JSON object. +pub fn obkv_to_object(obkv: &KvReader, index: &DocumentsBatchIndex) -> Result { + obkv.iter() + .map(|(field_id, value)| { + let field_name = index.name(field_id).ok_or_else(|| { + FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" } + })?; + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; + Ok((field_name.to_string(), value)) + }) + .collect() +} + /// A bidirectional map that links field ids to their name in a document batch. #[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct DocumentsBatchIndex(pub BiHashMap); @@ -48,11 +63,12 @@ impl DocumentsBatchIndex { self.0.get_by_left(&id).map(AsRef::as_ref) } - pub fn recreate_json( - &self, - document: &obkv::KvReaderU16, - ) -> Result, crate::Error> { - let mut map = serde_json::Map::new(); + pub fn id(&self, name: &str) -> Option { + self.0.get_by_right(name).cloned() + } + + pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result { + let mut map = Object::new(); for (k, v) in document.iter() { // TODO: TAMO: update the error type diff --git a/milli/src/error.rs b/milli/src/error.rs index 57ae1c85a..d34130210 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -141,10 +141,16 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco #[derive(Error, Debug)] pub enum GeoError { + #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] + NotAnObject { document_id: Value, value: Value }, + #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")] + MissingLatitudeAndLongitude { document_id: Value }, #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] MissingLatitude { document_id: Value }, #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{lat}` and `{lng}`.")] + BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, #[error("Could not parse longitude in the document with the id: `{document_id}`. 
Was expecting a number but instead got `{value}`.")]
diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index fffae5e77..0f804b93b 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -7,6 +7,7 @@ use serde_json::Value;
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::GeoError;
+use crate::update::index_documents::extract_float_from_value;
 use crate::{FieldId, InternalError, Result};
 
 /// Extracts the geographical coordinates contained in each document under the `_geo` field.
@@ -61,11 +62,3 @@ pub fn extract_geo_points(
 
     Ok(writer_into_reader(writer)?)
 }
-
-fn extract_float_from_value(value: Value) -> StdResult<f64, Value> {
-    match value {
-        Value::Number(ref n) => n.as_f64().ok_or(value),
-        Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
-        value => Err(value),
-    }
-}
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 7f6e00b11..2fb7cbcd9 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2,11 +2,13 @@ mod extract;
 mod helpers;
 mod transform;
 mod typed_chunk;
+mod validate;
 
 use std::collections::HashSet;
 use std::io::{Cursor, Read, Seek};
 use std::iter::FromIterator;
 use std::num::{NonZeroU32, NonZeroUsize};
+use std::result::Result as StdResult;
 
 use crossbeam_channel::{Receiver, Sender};
 use heed::types::Str;
@@ -25,13 +27,19 @@ pub use self::helpers::{
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
-use crate::documents::DocumentsBatchReader;
+use self::validate::validate_documents_batch;
+pub use self::validate::{
+    extract_float_from_value, validate_document_id, validate_document_id_from_json,
+    validate_geo_from_json,
+};
+use crate::documents::{obkv_to_object, DocumentsBatchReader};
+use crate::error::UserError;
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
 };
-use crate::{Index, Result, RoaringBitmapCodec, UserError};
+use crate::{Index, Result, RoaringBitmapCodec};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 5;
@@ -117,19 +125,27 @@ where
     /// Adds a batch of documents to the current builder.
     ///
-    /// Since the documents are progressively added to the writer, a failure will cause a stale
-    /// builder, and the builder must be discarded.
+    /// Since the documents are progressively added to the writer, a failure will only
+    /// return an error and not the `IndexDocuments` struct, as it is invalid to use it afterward.
     ///
     /// Returns the number of documents added to the builder.
-    pub fn add_documents<R>(&mut self, reader: DocumentsBatchReader<R>) -> Result<u64>
-    where
-        R: Read + Seek,
-    {
+    pub fn add_documents<R: Read + Seek>(
+        mut self,
+        reader: DocumentsBatchReader<R>,
+    ) -> Result<(Self, StdResult<u64, UserError>)> {
         // Early return when there is no document to add
         if reader.is_empty() {
-            return Ok(0);
+            return Ok((self, Ok(0)));
         }
 
+        // We check for user errors in this validator and, if there is one, we can return
+        // the `IndexDocuments` struct as it is still valid to send more documents into it.
+        // However, if there is an internal error we throw it away!
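For reference, the `extract_float_from_value` helper that this patch moves out of the geo extractor (and re-exports from the new `validate.rs` below) is small enough to exercise on its own. A minimal sketch depending only on `serde_json`, with the generic parameters spelled out; the `main` driver is illustrative only:

```rust
use serde_json::{json, Value};

/// Tries to read an `f64` out of a JSON value, handing the original value back
/// in the `Err` variant so the caller can build a precise `GeoError`.
/// Both JSON numbers and numeric strings are accepted.
fn extract_float_from_value(value: Value) -> Result<f64, Value> {
    match value {
        Value::Number(ref n) => n.as_f64().ok_or(value),
        Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
        value => Err(value),
    }
}

fn main() {
    assert_eq!(extract_float_from_value(json!(12.5)), Ok(12.5));
    assert_eq!(extract_float_from_value(json!("45.3")), Ok(45.3));
    assert!(extract_float_from_value(json!({"lat": 45.3})).is_err());
}
```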
+        let reader = match validate_documents_batch(self.wtxn, self.index, reader)? {
+            Ok(reader) => reader,
+            Err(user_error) => return Ok((self, Err(user_error))),
+        };
+
         let indexed_documents = self
             .transform
             .as_mut()
@@ -139,7 +155,7 @@ where
 
         self.added_documents += indexed_documents;
 
-        Ok(indexed_documents)
+        Ok((self, Ok(indexed_documents)))
     }
 
     #[logging_timer::time("IndexDocuments::{}")]
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 42187fc1e..bc7eefd33 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -17,6 +17,7 @@ use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
+use crate::update::index_documents::validate_document_id_from_json;
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use crate::{
     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
@@ -782,14 +783,6 @@ fn compute_primary_key_pair(
     }
 }
 
-fn validate_document_id(document_id: &str) -> Option<&str> {
-    let document_id = document_id.trim();
-    Some(document_id).filter(|id| {
-        !id.is_empty()
-            && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
-    })
-}
-
 /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`.
 ///
 /// The size and alignment of T and U must match.
@@ -813,22 +806,7 @@ fn update_primary_key<'a>(
 ) -> Result<Cow<'a, str>> {
     match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
         Some((_, bytes)) => {
-            let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
-                Value::String(string) => match validate_document_id(&string) {
-                    Some(s) if s.len() == string.len() => string,
-                    Some(s) => s.to_string(),
-                    None => {
-                        return Err(UserError::InvalidDocumentId {
-                            document_id: Value::String(string),
-                        }
-                        .into())
-                    }
-                },
-                Value::Number(number) => number.to_string(),
-                content => {
-                    return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into())
-                }
-            };
+            let value = validate_document_id_from_json(bytes)??;
             serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?;
             Ok(Cow::Owned(value))
         }
diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
new file mode 100644
index 000000000..b4c0cb68f
--- /dev/null
+++ b/milli/src/update/index_documents/validate.rs
@@ -0,0 +1,140 @@
+use std::io::{Read, Seek};
+use std::result::Result as StdResult;
+
+use serde_json::Value;
+
+use crate::error::{GeoError, InternalError, UserError};
+use crate::update::index_documents::{obkv_to_object, DocumentsBatchReader};
+use crate::{Index, Result};
+
+/// This function validates a batch of documents by checking that:
+/// - a primary key can be inferred,
+/// - every document id exists and is valid,
+/// - the `_geo` field is valid depending on the settings.
+pub fn validate_documents_batch<R: Read + Seek>(
+    rtxn: &heed::RoTxn,
+    index: &Index,
+    reader: DocumentsBatchReader<R>,
+) -> Result<StdResult<DocumentsBatchReader<R>, UserError>> {
+    let mut cursor = reader.into_cursor();
+    let documents_batch_index = cursor.documents_batch_index().clone();
+
+    // The primary key *field id* that has already been set for this index or the one
+    // we will guess by searching for the first key that contains "id" as a substring.
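The guessing rule described by this comment can be sketched in isolation: among all field names known to the batch, keep those containing "id" and pick the one with the smallest field id, i.e. the first candidate that entered the index. A minimal sketch over a plain slice instead of the real `DocumentsBatchIndex`; `guess_primary_key` is an illustrative name:

```rust
/// Picks the primary key among known fields: any name containing "id"
/// qualifies, and ties are broken by the smallest field id.
fn guess_primary_key<'a>(fields: &[(u16, &'a str)]) -> Option<(u16, &'a str)> {
    fields
        .iter()
        .filter(|(_, name)| name.contains("id"))
        .min_by_key(|(field_id, _)| *field_id)
        .copied()
}

fn main() {
    let fields = [(0u16, "title"), (1, "product_id"), (2, "uid")];
    // "product_id" wins: it has the lowest field id among the candidates.
    assert_eq!(guess_primary_key(&fields), Some((1, "product_id")));
}
```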
+    let (primary_key, primary_key_id) = match index.primary_key(rtxn)? {
+        Some(primary_key) => match documents_batch_index.id(primary_key) {
+            Some(id) => (primary_key, id),
+            None => {
+                return match cursor.next_document()? {
+                    Some(first_document) => Ok(Err(UserError::MissingDocumentId {
+                        primary_key: primary_key.to_string(),
+                        document: obkv_to_object(&first_document, &documents_batch_index)?,
+                    })),
+                    // If there is no document in this batch the best we can do is to return this error.
+                    None => Ok(Err(UserError::MissingPrimaryKey)),
+                };
+            }
+        },
+        None => {
+            let guessed = documents_batch_index
+                .iter()
+                .filter(|(_, name)| name.contains("id"))
+                .min_by_key(|(fid, _)| *fid);
+            match guessed {
+                Some((id, name)) => (name.as_str(), *id),
+                None => return Ok(Err(UserError::MissingPrimaryKey)),
+            }
+        }
+    };
+
+    // If the settings specify that a `_geo` field must be used, we must check its validity
+    // in all the documents of this batch, and this is when we return `Some`.
+    let geo_field_id = match documents_batch_index.id("_geo") {
+        Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id),
+        _otherwise => None,
+    };
+
+    while let Some(document) = cursor.next_document()? {
+        let document_id = match document.get(primary_key_id) {
+            Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? {
+                Ok(document_id) => document_id,
+                Err(user_error) => return Ok(Err(user_error)),
+            },
+            None => {
+                return Ok(Err(UserError::MissingDocumentId {
+                    primary_key: primary_key.to_string(),
+                    document: obkv_to_object(&document, &documents_batch_index)?,
+                }))
+            }
+        };
+
+        if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
+            if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? {
+                return Ok(Err(UserError::from(user_error)));
+            }
+        }
+    }
+
+    Ok(Ok(cursor.into_reader()))
+}
+
+/// Returns a trimmed version of the document id or `None` if it is invalid.
+pub fn validate_document_id(document_id: &str) -> Option<&str> {
+    let id = document_id.trim();
+    if !id.is_empty()
+        && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
+    {
+        Some(id)
+    } else {
+        None
+    }
+}
+
+/// Parses a JSON encoded document id and validates it, returning a user error when there is one.
+pub fn validate_document_id_from_json(bytes: &[u8]) -> Result<StdResult<String, UserError>> {
+    match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
+        Value::String(string) => match validate_document_id(&string) {
+            Some(s) if s.len() == string.len() => Ok(Ok(string)),
+            Some(s) => Ok(Ok(s.to_string())),
+            None => {
+                return Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) }))
+            }
+        },
+        Value::Number(number) => Ok(Ok(number.to_string())),
+        content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })),
+    }
+}
+
+/// Try to extract an `f64` from a JSON `Value` and return the `Value`
+/// in the `Err` variant if it failed.
+pub fn extract_float_from_value(value: Value) -> StdResult<f64, Value> {
+    match value {
+        Value::Number(ref n) => n.as_f64().ok_or(value),
+        Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
+        value => Err(value),
+    }
+}
+
+pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
+    let result = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?
{ + Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { + (Some(lat), Some(lng)) => { + match (extract_float_from_value(lat), extract_float_from_value(lng)) { + (Ok(_), Ok(_)) => Ok(()), + (Err(value), Ok(_)) => Err(GeoError::BadLatitude { document_id, value }), + (Ok(_), Err(value)) => Err(GeoError::BadLongitude { document_id, value }), + (Err(lat), Err(lng)) => { + Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng }) + } + } + } + (None, Some(_)) => Err(GeoError::MissingLatitude { document_id }), + (Some(_), None) => Err(GeoError::MissingLongitude { document_id }), + (None, None) => Err(GeoError::MissingLatitudeAndLongitude { document_id }), + }, + value => Err(GeoError::NotAnObject { document_id, value }), + }; + + Ok(result) +} From fcfc4caf8c4512f9dce768017d95ac91804a90d2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 15:36:27 +0200 Subject: [PATCH 1468/1889] Move the Object type in the lib.rs file and use it everywhere --- benchmarks/benches/utils.rs | 20 ++++++++---------- cli/src/main.rs | 9 ++++---- http-ui/src/main.rs | 16 ++++++-------- milli/fuzz/fuzz_targets/indexing.rs | 2 +- milli/src/documents/builder.rs | 5 +++-- milli/src/error.rs | 6 ++---- milli/src/lib.rs | 21 ++++++++++--------- .../extract/extract_geo_points.rs | 1 - milli/src/update/index_documents/transform.rs | 2 +- milli/tests/search/facet_distribution.rs | 6 +++--- milli/tests/search/mod.rs | 6 +++--- 11 files changed, 43 insertions(+), 51 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 091b9b0f5..630e17943 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -11,8 +11,8 @@ use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; -use milli::{Filter, Index}; -use serde_json::{Map, Value}; +use milli::{Filter, Index, Object}; +use serde_json::Value; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory @@ -96,12 +96,10 @@ pub fn base_setup(conf: &Conf) -> Index { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents_from(conf.dataset, conf.dataset_format); - - builder.add_documents(documents).unwrap(); - + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -156,7 +154,7 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -166,7 +164,7 @@ fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; @@ -221,14 +219,14 @@ impl CSVDocumentDeserializer { } impl Iterator for 
CSVDocumentDeserializer { - type Item = anyhow::Result>; + type Item = anyhow::Result; fn next(&mut self) -> Option { let csv_document = self.documents.next()?; match csv_document { Ok(csv_document) => { - let mut document = Map::new(); + let mut document = Object::new(); for ((field_name, field_type), value) in self.headers.iter().zip(csv_document.into_iter()) diff --git a/cli/src/main.rs b/cli/src/main.rs index dcd0f407a..db4ca91ab 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -13,8 +13,7 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; -use milli::Index; -use serde_json::{Map, Value}; +use milli::{Index, Object}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -325,7 +324,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); let reader = BufReader::new(reader); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -335,7 +334,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { fn documents_from_json(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; @@ -424,7 +423,7 @@ impl Search { filter: &Option, offset: &Option, limit: &Option, - ) -> Result>> { + ) -> Result> { let txn = index.read_txn()?; let mut search = index.search(&txn); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 63b9ee5e0..8167076c6 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -26,11 +26,11 @@ use milli::update::{ }; use milli::{ obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, - MatcherBuilder, SearchResult, SortError, + MatcherBuilder, Object, SearchResult, SortError, }; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; +use serde_json::Value; use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; @@ -169,11 +169,7 @@ impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { } } - fn highlight_record( - &self, - object: &mut Map, - attributes_to_highlight: &HashSet, - ) { + fn highlight_record(&self, object: &mut Object, attributes_to_highlight: &HashSet) { // TODO do we need to create a string for element that are not and needs to be highlight? 
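The `Object` alias that this patch threads through the codebase is just `serde_json::Map<String, Value>` under a shorter name, and deserializing straight into it is what the updated `documents_from_jsonl` helpers do. A self-contained sketch, with the alias spelled out locally instead of imported from milli:

```rust
use serde_json::Value;

// Locally restated version of the alias this patch adds to milli's lib.rs.
type Object = serde_json::Map<String, Value>;

fn main() -> serde_json::Result<()> {
    let ndjson = "{\"id\": 1, \"name\": \"kevin\"}\n{\"id\": 2, \"name\": \"kevina\"}";

    // Streaming whole JSON objects replaces the old line-by-line reading:
    // the deserializer stops after each complete object, newline or not.
    for result in serde_json::Deserializer::from_str(ndjson).into_iter::<Object>() {
        let object = result?;
        println!("document with {} fields", object.len());
    }
    Ok(())
}
```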
for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { @@ -708,7 +704,7 @@ async fn main() -> anyhow::Result<()> { #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] struct Answer { - documents: Vec>, + documents: Vec, number_of_candidates: u64, facets: BTreeMap>, } @@ -1036,7 +1032,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); let reader = BufReader::new(reader); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -1046,7 +1042,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 5c3b79ed7..e4f42655e 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -21,7 +21,7 @@ pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { let writer = BufWriter::new(writer); let mut builder = DocumentsBatchBuilder::new(writer); - let values: Vec> = serde_json::from_reader(input)?; + let values: Vec = serde_json::from_reader(input)?; if builder.documents_count() == 0 { bail!("Empty payload"); } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 15a22090a..589e52269 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,10 @@ use std::io::{self, Write}; use grenad::{CompressionType, WriterBuilder}; -use serde_json::{to_writer, Map, Value}; +use serde_json::{to_writer, Value}; use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; +use crate::Object; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary /// format used by milli. @@ -55,7 +56,7 @@ impl DocumentsBatchBuilder { } /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly. - pub fn append_json_object(&mut self, object: &Map) -> io::Result<()> { + pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> { // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
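The ordering requirement mentioned in the comment above is easy to miss: an obkv writer only accepts keys in ascending order, which is why the builder collects and sorts the field ids before writing. A sketch of that step in isolation, assuming the `obkv` crate's `KvWriter` API as milli uses it elsewhere; `write_object` is an illustrative name:

```rust
use obkv::KvWriter;

/// Serializes (field id, raw JSON bytes) pairs into a single obkv buffer,
/// sorting the ids first because the writer rejects out-of-order keys.
fn write_object(mut fields: Vec<(u16, Vec<u8>)>) -> std::io::Result<Vec<u8>> {
    fields.sort_unstable_by_key(|(field_id, _)| *field_id);

    let mut writer = KvWriter::memory();
    for (field_id, value) in fields {
        writer.insert(field_id, &value)?;
    }
    writer.into_inner()
}
```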
let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); fields_ids.sort_unstable(); diff --git a/milli/src/error.rs b/milli/src/error.rs index d34130210..a23472951 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -4,12 +4,10 @@ use std::{io, str}; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; -use serde_json::{Map, Value}; +use serde_json::Value; use thiserror::Error; -use crate::{CriterionError, DocumentId, FieldId, SortError}; - -pub type Object = Map; +use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 81cd057d5..a7be87183 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -20,7 +20,7 @@ use std::hash::BuildHasherDefault; pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; -use serde_json::{Map, Value}; +use serde_json::Value; pub use {charabia as tokenizer, heed}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; @@ -43,20 +43,21 @@ pub use self::search::{ pub type Result = std::result::Result; +pub type Attribute = u32; +pub type BEU32 = heed::zerocopy::U32; +pub type BEU64 = heed::zerocopy::U64; +pub type DocumentId = u32; pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; +pub type FieldDistribution = BTreeMap; +pub type FieldId = u16; +pub type Object = serde_json::Map; +pub type Position = u32; +pub type RelativePosition = u16; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; -pub type BEU32 = heed::zerocopy::U32; -pub type BEU64 = heed::zerocopy::U64; -pub type Attribute = u32; -pub type DocumentId = u32; -pub type FieldId = u16; -pub type Position = u32; -pub type RelativePosition = u16; -pub type FieldDistribution = BTreeMap; /// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata /// is a tuple composed of 1. the DocumentId of the associated document and 2. 
the original point @@ -82,7 +83,7 @@ pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReaderU16, -) -> Result> { +) -> Result { displayed_fields .iter() .copied() diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 0f804b93b..46ef9ba9b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -1,6 +1,5 @@ use std::fs::File; use std::io; -use std::result::Result as StdResult; use concat_arrays::concat_arrays; use serde_json::Value; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index bc7eefd33..4ece58509 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -13,7 +13,7 @@ use serde_json::{Map, Value}; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; -use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; +use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 66713de1e..8890285e7 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -5,8 +5,8 @@ use heed::EnvOpenOptions; use maplit::hashset; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{FacetDistribution, Index}; -use serde_json::{Deserializer, Map, Value}; +use milli::{FacetDistribution, Index, Object}; +use serde_json::Deserializer; #[test] fn test_facet_distribution_with_no_facet_values() { @@ -46,7 +46,7 @@ fn test_facet_distribution_with_no_facet_values() { }"#, ); - for result in Deserializer::from_reader(reader).into_iter::>() { + for result in Deserializer::from_reader(reader).into_iter::() { let object = result.unwrap(); documents_builder.append_json_object(&object).unwrap(); } diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 4cf117dc7..0b6ce80cc 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -8,9 +8,9 @@ use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{AscDesc, Criterion, DocumentId, Index, Member}; +use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object}; use serde::Deserialize; -use serde_json::{Deserializer, Map, Value}; +use serde_json::Deserializer; use slice_group_by::GroupBy; mod distinct; @@ -66,7 +66,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); - for result in Deserializer::from_reader(reader).into_iter::>() { + for result in Deserializer::from_reader(reader).into_iter::() { let object = result.unwrap(); documents_builder.append_json_object(&object).unwrap(); } From 399eec5c0101780f74b9794957865c3fe8b3d519 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 14:35:19 +0200 Subject: [PATCH 
1469/1889] Fix the indexation tests --- benchmarks/benches/indexing.rs | 110 ++++++---- cli/src/main.rs | 7 +- http-ui/src/main.rs | 6 +- milli/src/index.rs | 25 ++- milli/src/search/distinct/mod.rs | 5 +- milli/src/search/facet/filter.rs | 5 +- milli/src/update/clear_documents.rs | 5 +- milli/src/update/delete_documents.rs | 18 +- milli/src/update/index_documents/mod.rs | 205 +++++++++++-------- milli/src/update/index_documents/validate.rs | 1 + milli/src/update/settings.rs | 70 ++++--- milli/tests/search/facet_distribution.rs | 6 +- milli/tests/search/mod.rs | 6 +- milli/tests/search/query_criteria.rs | 6 +- milli/tests/search/typo_tolerance.rs | 7 +- 15 files changed, 288 insertions(+), 194 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 3ae0a1a84..1b501b21a 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -132,12 +132,13 @@ fn indexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -223,11 +224,12 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -279,11 +281,12 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -294,19 +297,21 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); 
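All of the benchmark updates in this patch apply the same mechanical change, so it is worth seeing the new calling convention once in full: `add_documents` now consumes the builder and hands it back next to a `Result` carrying any user error, so an invalid batch no longer poisons the builder. A condensed sketch using the APIs shown in this series; `index_batch` and the `anyhow` error plumbing are illustrative:

```rust
use std::io::Cursor;

use milli::documents::DocumentsBatchReader;
use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
use milli::Index;

fn index_batch(
    index: &Index,
    documents: DocumentsBatchReader<Cursor<Vec<u8>>>,
) -> anyhow::Result<()> {
    let config = IndexerConfig::default();
    let indexing_config = IndexDocumentsConfig::default();

    let mut wtxn = index.write_txn()?;
    let builder = IndexDocuments::new(&mut wtxn, index, &config, indexing_config, |_| ())?;

    // User errors come back beside the builder instead of consuming it.
    let (builder, user_error) = builder.add_documents(documents)?;
    user_error?;

    builder.execute()?;
    wtxn.commit()?;
    Ok(())
}
```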
- builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -339,13 +344,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -377,12 +383,13 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -415,12 +422,13 @@ fn indexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -507,11 +515,12 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -564,12 +573,13 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -581,24 +591,26 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) 
.unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -631,12 +643,13 @@ fn indexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -720,11 +733,12 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -775,12 +789,13 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { // as we don't care about the time it takes. 
let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -791,21 +806,23 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -861,12 +878,13 @@ fn indexing_nested_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -922,11 +940,12 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -984,12 +1003,13 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1021,12 +1041,13 @@ fn indexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut 
builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1113,11 +1134,12 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/cli/src/main.rs b/cli/src/main.rs index db4ca91ab..0d197af17 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -255,7 +255,7 @@ impl Performer for DocumentAddition { let bar = progesses.add(bar); bars.push(bar); } - let mut addition = milli::update::IndexDocuments::new( + let addition = milli::update::IndexDocuments::new( &mut txn, &index, &config, @@ -263,7 +263,10 @@ impl Performer for DocumentAddition { |step| indexing_callback(step, &bars), ) .unwrap(); - addition.add_documents(reader)?; + let (addition, user_error) = addition.add_documents(reader)?; + if let Err(error) = user_error { + return Err(error.into()); + } std::thread::spawn(move || { progesses.join().unwrap(); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 8167076c6..117aa31e8 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -374,7 +374,7 @@ async fn main() -> anyhow::Result<()> { }); }; - let mut builder = milli::update::IndexDocuments::new( + let builder = milli::update::IndexDocuments::new( &mut wtxn, &index_cloned, GLOBAL_CONFIG.get().unwrap(), @@ -397,8 +397,8 @@ async fn main() -> anyhow::Result<()> { let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; - builder.add_documents(documents)?; - + let (builder, user_error) = builder.add_documents(documents)?; + let _count = user_error?; let result = builder.execute(); match result { diff --git a/milli/src/index.rs b/milli/src/index.rs index 9637b4103..272877912 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1212,10 +1212,11 @@ pub(crate) mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1234,7 +1235,7 @@ pub(crate) mod tests { // we add all the documents a second time. 
we are supposed to get the same // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); let content = documents!([ @@ -1242,7 +1243,8 @@ pub(crate) mod tests { { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 }, ]); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1265,10 +1267,11 @@ pub(crate) mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1333,10 +1336,11 @@ pub(crate) mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1390,10 +1394,11 @@ pub(crate) mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 670fa01ac..1a9c56cf3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -97,14 +97,15 @@ mod test { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut addition = + let addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) .unwrap(); - addition.add_documents(reader).unwrap(); + let (addition, user_error) = addition.add_documents(reader).unwrap(); + user_error.unwrap(); addition.execute().unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d89413f62..41e2f0657 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -648,10 +648,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d1939df7b..3fe57eeae 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -100,9 +100,10 @@ mod tests { ]); let indexing_config = 
IndexDocumentsConfig::default(); let config = IndexerConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Clear all documents from the database. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 3b519c101..49e7de8ae 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -657,13 +657,13 @@ mod tests { fn insert_documents<'t, R: std::io::Read + std::io::Seek>( wtxn: &mut RwTxn<'t, '_>, index: &'t Index, - documents: crate::documents::DocumentBatchReader, + documents: crate::documents::DocumentsBatchReader, ) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let builder = IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); } @@ -701,9 +701,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. @@ -736,9 +737,10 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Delete not all of the documents but some of them. diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2fb7cbcd9..ae42483f0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -141,7 +141,12 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let reader = match validate_documents_batch(self.wtxn, self.index, reader)? { + let reader = match validate_documents_batch( + self.wtxn, + self.index, + self.config.autogenerate_docids, + reader, + )? 
{ Ok(reader) => reader, Err(user_error) => return Ok((self, Err(user_error))), }; @@ -626,10 +631,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -642,10 +648,11 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "name": "updated kevin" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -662,9 +669,11 @@ mod tests { { "id": 2, "name": "updated kevina" }, { "id": 3, "name": "updated benoit" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -694,10 +703,11 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -722,9 +732,10 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "age": 25 } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -765,7 +776,7 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); assert!(builder.add_documents(content).is_err()); wtxn.commit().unwrap(); @@ -794,10 +805,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -814,9 +826,10 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. 
let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -856,9 +869,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -873,9 +887,10 @@ mod tests { let content = documents!([ { "name": "new kevin" } ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -898,9 +913,10 @@ mod tests { let content = documents!([]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -924,7 +940,7 @@ mod tests { let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); assert!(builder.add_documents(content).is_err()); @@ -934,9 +950,10 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. 
let content = documents!([ { "id": 32, "name": "kevin" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -963,9 +980,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1009,9 +1027,10 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1020,7 +1039,7 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents!([ { @@ -1030,7 +1049,8 @@ mod tests { } ]); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -1057,9 +1077,10 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1099,10 +1120,11 @@ mod tests { { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1138,10 +1160,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1151,10 +1174,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1164,10 +1188,11 @@ mod tests { 
let documents = documents!([ { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1177,10 +1202,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1190,10 +1216,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1217,10 +1244,11 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); @@ -1237,10 +1265,11 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); @@ -1249,10 +1278,11 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1281,10 +1311,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1315,10 +1346,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - 
builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1373,10 +1405,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1426,10 +1459,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1558,10 +1592,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1628,10 +1663,11 @@ mod tests { // index the documents let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1720,10 +1756,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1737,10 +1774,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1759,10 +1797,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1787,10 +1826,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let 
indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1832,10 +1872,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -1870,10 +1911,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Create one long document @@ -1884,10 +1926,11 @@ mod tests { // Index this one long document let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1901,7 +1944,7 @@ mod tests { let index = Index::new(options, tmp).unwrap(); let mut wtxn = index.write_txn().unwrap(); let indexer_config = IndexerConfig::default(); - let mut builder = IndexDocuments::new( + let builder = IndexDocuments::new( &mut wtxn, &index, &indexer_config, @@ -1930,8 +1973,10 @@ mod tests { "branch_id_number": 0 }]}; - builder.add_documents(doc1).unwrap(); - builder.add_documents(doc2).unwrap(); + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + user_error.unwrap(); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index b4c0cb68f..0ed1f1cc0 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -15,6 +15,7 @@ use crate::{Index, Result}; pub fn validate_documents_batch( rtxn: &heed::RoTxn, index: &Index, + autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { let mut cursor = reader.into_cursor(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ccf29eb49..5f39579b7 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -735,10 +735,11 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); 
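// A minimal sketch of the calling convention used throughout these tests,
// assuming the two-level `Result` returned by the new `add_documents`: the
// outer `Result` carries internal errors and consumes the builder, while the
// inner one carries user errors and hands the builder back, so more batches
// can be sent even after a document was rejected.
//
//     let (builder, user_error) = builder.add_documents(first_batch)?;
//     user_error?; // a malformed document is a user error, not a fatal one
//     let (builder, user_error) = builder.add_documents(second_batch)?;
//     user_error?;
//     builder.execute()?;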
wtxn.commit().unwrap(); @@ -798,10 +799,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -850,10 +852,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -880,10 +883,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction we change the displayed fields to be only the age. @@ -934,10 +938,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -974,10 +979,11 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1016,10 +1022,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1067,10 +1074,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1110,10 +1118,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, 
&config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1142,10 +1151,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1172,10 +1182,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction we provide some stop_words @@ -1251,10 +1262,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction provide some synonyms @@ -1389,10 +1401,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1452,10 +1465,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 8890285e7..83d692d7f 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -29,8 +29,7 @@ fn test_facet_distribution_with_no_facet_values() { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( r#"{ @@ -55,7 +54,8 @@ fn test_facet_distribution_with_no_facet_values() { // index documents let content = 
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 0b6ce80cc..3b8960fcc 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,8 +61,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); @@ -75,7 +74,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { // index documents let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 89a6a6eec..a96366f5e 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -390,8 +390,7 @@ fn criteria_ascdesc() { // index documents let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); @@ -422,7 +421,8 @@ fn criteria_ascdesc() { let vector = batch_builder.into_inner().unwrap(); let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(reader).unwrap(); + let (builder, user_error) = builder.add_documents(reader).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 63bf22579..7c4cf8971 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -127,11 +127,10 @@ fn test_typo_disabled_on_word() { let mut txn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); - - builder.add_documents(documents).unwrap(); + let builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); txn.commit().unwrap(); From 2ceeb51c37a98e8233edab175467e11101ef5dd2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 15:14:20 +0200 Subject: [PATCH 1470/1889] Support the auto-generated ids when validating documents --- milli/src/update/index_documents/mod.rs | 6 ++++-- 
milli/src/update/index_documents/validate.rs | 12 +++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ae42483f0..1a1fc9a0e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -778,7 +778,8 @@ mod tests { let indexing_config = IndexDocumentsConfig::default(); let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - assert!(builder.add_documents(content).is_err()); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + assert!(user_error.is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -943,7 +944,8 @@ mod tests { let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - assert!(builder.add_documents(content).is_err()); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + assert!(user_error.is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 0ed1f1cc0..0cb0b4aff 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -19,20 +19,20 @@ pub fn validate_documents_batch( reader: DocumentsBatchReader, ) -> Result, UserError>> { let mut cursor = reader.into_cursor(); - let documents_batch_index = cursor.documents_batch_index().clone(); + let mut documents_batch_index = cursor.documents_batch_index().clone(); // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { Some(primary_key) => match documents_batch_index.id(primary_key) { Some(id) => (primary_key, id), + None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)), None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), document: obkv_to_object(&first_document, &documents_batch_index)?, })), - // If there is no document in this batch the best we can do is to return this error. None => Ok(Err(UserError::MissingPrimaryKey)), }; } @@ -40,10 +40,11 @@ pub fn validate_documents_batch( None => { let guessed = documents_batch_index .iter() - .filter(|(_, name)| name.contains("id")) + .filter(|(_, name)| name.to_lowercase().contains("id")) .min_by_key(|(fid, _)| *fid); match guessed { Some((id, name)) => (name.as_str(), *id), + None if autogenerate_docids => ("id", documents_batch_index.insert("id")), None => return Ok(Err(UserError::MissingPrimaryKey)), } } @@ -56,12 +57,16 @@ pub fn validate_documents_batch( _otherwise => None, }; + let mut count = 0; while let Some(document) = cursor.next_document()? { let document_id = match document.get(primary_key_id) { Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? 
{ Ok(document_id) => document_id,
Err(user_error) => return Ok(Err(user_error)),
},
+ None if autogenerate_docids => {
+ format!("{{auto-generated id of document #{}}}", count)
+ }
None => {
return Ok(Err(UserError::MissingDocumentId {
primary_key: primary_key.to_string(),
@@ -75,6 +80,7 @@ pub fn validate_documents_batch(
return Ok(Err(UserError::from(user_error)));
}
}
+ count += 1;
}
Ok(Ok(cursor.into_reader()))
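// A short sketch of the placeholder behaviour added above: with
// `autogenerate_docids` enabled, a document that lacks the primary key
// still passes validation and is reported under a synthetic id derived
// from its position in the batch, so later errors (a bad `_geo` field,
// for example) can still point at a concrete document.
//
//     // batch: [{ "name": "kevin" }, { "name": "kevina" }]
//     // document 0 -> "{auto-generated id of document #0}"
//     // document 1 -> "{auto-generated id of document #1}"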
{[{ + "id": 2.0, + "title": "something", + }]}; + + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + user_error.unwrap(); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + assert!(user_error.is_err()); + let (builder, user_error) = builder.add_documents(doc3).unwrap(); + assert!(user_error.is_err()); + let (_builder, user_error) = builder.add_documents(doc4).unwrap(); + assert!(user_error.is_err()); + } } diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 0cb0b4aff..c69c754ac 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -108,7 +108,7 @@ pub fn validate_document_id_from_json(bytes: &[u8]) -> Result Ok(Ok(number.to_string())), + Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), } } From 8ebf5eed0d80beb623a056d4e70a5d0535cd7181 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 17:58:52 +0200 Subject: [PATCH 1472/1889] Make the nested primary key work --- milli/src/error.rs | 2 + milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 5 +- milli/src/update/index_documents/validate.rs | 228 ++++++++++++++---- 4 files changed, 191 insertions(+), 46 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index a23472951..d05acbe1c 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -121,6 +121,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MaxDatabaseSizeReached, #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] MissingDocumentId { primary_key: String, document: Object }, + #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] + TooManyDocumentIds { primary_key: String, document: Object }, #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] MissingPrimaryKey, #[error("There is no more space left on the device. 
From 8ebf5eed0d80beb623a056d4e70a5d0535cd7181 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 15 Jun 2022 17:58:52 +0200
Subject: [PATCH 1472/1889] Make the nested primary key work

---
 milli/src/error.rs | 2 +
 milli/src/update/index_documents/mod.rs | 2 +-
 milli/src/update/index_documents/transform.rs | 5 +-
 milli/src/update/index_documents/validate.rs | 228 ++++++++++++++----
 4 files changed, 191 insertions(+), 46 deletions(-)

diff --git a/milli/src/error.rs b/milli/src/error.rs
index a23472951..d05acbe1c 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -121,6 +121,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
MaxDatabaseSizeReached,
#[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
MissingDocumentId { primary_key: String, document: Object },
+ #[error("Document has too many matching `{}` attributes: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
+ TooManyDocumentIds { primary_key: String, document: Object },
#[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")]
MissingPrimaryKey,
#[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 5bce3b851..ba1064684 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -29,7 +29,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use self::validate::validate_documents_batch;
pub use self::validate::{
- extract_float_from_value, validate_document_id, validate_document_id_from_json,
+ extract_float_from_value, validate_document_id, validate_document_id_value,
validate_geo_from_json,
};
use crate::documents::{obkv_to_object, DocumentsBatchReader};

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 4ece58509..38f6dc8ff 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -17,7 +17,7 @@ use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
-use crate::update::index_documents::validate_document_id_from_json;
+use crate::update::index_documents::validate_document_id_value;
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
@@ -806,7 +806,8 @@ fn update_primary_key<'a>(
) -> Result<Cow<'a, str>> {
match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
Some((_, bytes)) => {
- let value = validate_document_id_from_json(bytes)??;
+ let document_id = serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?;
+ let value = validate_document_id_value(document_id)??;
serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?;
Ok(Cow::Owned(value))
}

diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
index c69c754ac..32e8de03f 100644
--- a/milli/src/update/index_documents/validate.rs
+++ b/milli/src/update/index_documents/validate.rs
@@ -1,11 +1,16 @@
use std::io::{Read, Seek};
+use std::iter;
use std::result::Result as StdResult;
use serde_json::Value;
+use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
use crate::error::{GeoError, InternalError, UserError};
-use crate::update::index_documents::{obkv_to_object, DocumentsBatchReader};
+use crate::update::index_documents::obkv_to_object;
-use crate::{Index, Result};
+use crate::{FieldId, Index, Object, Result};
+
+/// The symbol used to define levels in a nested primary key.
+const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';

/// This function validates the documents by checking that:
/// - we can infer a primary key,
@@ -23,10 +28,15 @@ pub fn validate_documents_batch(
// The primary key *field id* that has already been set for this index or the one
// we will guess by searching for the first key that contains "id" as a substring.
- let (primary_key, primary_key_id) = match index.primary_key(rtxn)? {
+ let primary_key = match index.primary_key(rtxn)? {
+ Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => {
+ PrimaryKey::nested(primary_key)
+ }
Some(primary_key) => match documents_batch_index.id(primary_key) {
- Some(id) => (primary_key, id),
- None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)),
+ Some(id) => PrimaryKey::flat(primary_key, id),
+ None if autogenerate_docids => {
+ PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key))
+ }
None => {
return match cursor.next_document()? {
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
primary_key: primary_key.to_string(),
document: obkv_to_object(&first_document, &documents_batch_index)?,
})),
None => Ok(Err(UserError::MissingPrimaryKey)),
};
}
@@ -43,8 +53,10 @@ pub fn validate_documents_batch(
.filter(|(_, name)| name.to_lowercase().contains("id"))
.min_by_key(|(fid, _)| *fid);
match guessed {
- Some((id, name)) => (name.as_str(), *id),
- None if autogenerate_docids => ("id", documents_batch_index.insert("id")),
+ Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
+ None if autogenerate_docids => {
+ PrimaryKey::flat("id", documents_batch_index.insert("id"))
+ }
None => return Ok(Err(UserError::MissingPrimaryKey)),
}
}
@@ -59,20 +71,15 @@ pub fn validate_documents_batch(
let mut count = 0;
while let Some(document) = cursor.next_document()? {
- let document_id = match document.get(primary_key_id) {
- Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? {
- Ok(document_id) => document_id,
- Err(user_error) => return Ok(Err(user_error)),
- },
- None if autogenerate_docids => {
- format!("{{auto-generated id of document #{}}}", count)
- }
- None => {
- return Ok(Err(UserError::MissingDocumentId {
- primary_key: primary_key.to_string(),
- document: obkv_to_object(&document, &documents_batch_index)?,
- }))
- }
+ let document_id = match fetch_document_id(
+ &document,
+ &documents_batch_index,
+ primary_key,
+ autogenerate_docids,
+ count,
+ )? {
+ Ok(document_id) => document_id,
+ Err(user_error) => return Ok(Err(user_error)),
};

if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
@@ -86,30 +93,167 @@
Ok(Ok(cursor.into_reader()))
}

+/// Retrieve the document id after validating it, returning a `UserError`
+/// if the id is invalid or can't be guessed.
+fn fetch_document_id(
+ document: &obkv::KvReader<FieldId>,
+ documents_batch_index: &DocumentsBatchIndex,
+ primary_key: PrimaryKey,
+ autogenerate_docids: bool,
+ count: usize,
+) -> Result<StdResult<String, UserError>> {
+ match primary_key {
+ PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
+ match document.get(primary_key_id) {
+ Some(document_id_bytes) => {
+ let document_id = serde_json::from_slice(document_id_bytes)
+ .map_err(InternalError::SerdeJson)?;
+ match validate_document_id_value(document_id)? {
+ Ok(document_id) => Ok(Ok(document_id)),
+ Err(user_error) => Ok(Err(user_error)),
+ }
+ }
+ None if autogenerate_docids => {
+ Ok(Ok(format!("{{auto-generated id of document #{}}}", count)))
+ }
+ None => Ok(Err(UserError::MissingDocumentId {
+ primary_key: primary_key.to_string(),
+ document: obkv_to_object(&document, &documents_batch_index)?,
+ })),
+ }
+ }
+ nested @ PrimaryKey::Nested { .. } => {
+ let mut matching_documents_ids = Vec::new();
+ for (first_level_name, right) in nested.possible_level_names() {
+ if let Some(field_id) = documents_batch_index.id(first_level_name) {
+ if let Some(value_bytes) = document.get(field_id) {
+ let object = serde_json::from_slice(value_bytes)
+ .map_err(InternalError::SerdeJson)?;
+ fetch_matching_values(object, right, &mut matching_documents_ids);
+
+ if matching_documents_ids.len() >= 2 {
+ return Ok(Err(UserError::TooManyDocumentIds {
+ primary_key: nested.primary_key().to_string(),
+ document: obkv_to_object(&document, &documents_batch_index)?,
+ }));
+ }
+ }
+ }
+ }
+
+ match matching_documents_ids.pop() {
+ Some(document_id) => match validate_document_id_value(document_id)? {
+ Ok(document_id) => Ok(Ok(document_id)),
+ Err(user_error) => Ok(Err(user_error)),
+ },
+ None => Ok(Err(UserError::MissingDocumentId {
+ primary_key: nested.primary_key().to_string(),
+ document: obkv_to_object(&document, &documents_batch_index)?,
+ })),
+ }
+ }
+ }
+}
+
+/// A type that represents the type of primary key that has been set
+/// for this index, a classic flat one or a nested one.
+#[derive(Debug, Clone, Copy)]
+enum PrimaryKey<'a> {
+ Flat { name: &'a str, field_id: FieldId },
+ Nested { name: &'a str },
+}
+
+impl PrimaryKey<'_> {
+ fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
+ PrimaryKey::Flat { name, field_id }
+ }
+
+ fn nested(name: &str) -> PrimaryKey {
+ PrimaryKey::Nested { name }
+ }
+
+ fn primary_key(&self) -> &str {
+ match self {
+ PrimaryKey::Flat { name, .. } => name,
+ PrimaryKey::Nested { name } => name,
+ }
+ }
+
+ /// Returns an `Iterator` that gives all the possible field names the primary key
+ /// can have, depending on the first-level name and the depth of the objects.
+ fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
+ let name = self.primary_key();
+ iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL))
+ }
+}
+
+fn contained_in(selector: &str, key: &str) -> bool {
+ selector.starts_with(key)
+ && selector[key.len()..]
+ .chars()
+ .next()
+ .map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL)
+ .unwrap_or(true)
+}
+
+pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
+ match value {
+ Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
+ otherwise => output.push(otherwise),
+ }
+}
+
+pub fn fetch_matching_values_in_object(
+ object: Object,
+ selector: &str,
+ base_key: &str,
+ output: &mut Vec<Value>,
+) {
+ for (key, value) in object {
+ let base_key = if base_key.is_empty() {
+ key.to_string()
+ } else {
+ format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
+ };
+
+ // here, if the user only specified `doggo`, we need to iterate over all the fields
+ // of `doggo`, so we check the contained_in on both sides.
+ let should_continue =
+ contained_in(selector, &base_key) || contained_in(&base_key, selector);
+
+ if should_continue {
+ match value {
+ Value::Object(object) => {
+ fetch_matching_values_in_object(object, selector, &base_key, output)
+ }
+ value => output.push(value),
+ }
+ }
+ }
+}
+
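// A small sketch of how `possible_level_names` expands a nested key,
// assuming a primary key of `person.id`: the iterator walks the selector
// from the most specific level down to the first one.
//
//     let key = PrimaryKey::nested("person.id");
//     let levels: Vec<_> = key.possible_level_names().collect();
//     // levels == [("person.id", ""), ("person", "id")], so either a flat
//     // field literally named "person.id" or an object `person` containing
//     // an `id` field can provide the document id.

/// Returns a trimmed version of the document id or `None` if it is invalid.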
pub fn validate_document_id(document_id: &str) -> Option<&str> { - let id = document_id.trim(); - if !id.is_empty() - && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) + let document_id = document_id.trim(); + if !document_id.is_empty() + && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) { - Some(id) + Some(document_id) } else { None } } /// Parses a Json encoded document id and validate it, returning a user error when it is one. -pub fn validate_document_id_from_json(bytes: &[u8]) -> Result> { - match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { +pub fn validate_document_id_value(document_id: Value) -> Result> { + match document_id { Value::String(string) => match validate_document_id(&string) { Some(s) if s.len() == string.len() => Ok(Ok(string)), Some(s) => Ok(Ok(s.to_string())), - None => { - return Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })) - } + None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), }, Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), - content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), + content => Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), } } @@ -124,24 +268,22 @@ pub fn extract_float_from_value(value: Value) -> StdResult { } pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> { - let result = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { + match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { match (extract_float_from_value(lat), extract_float_from_value(lng)) { - (Ok(_), Ok(_)) => Ok(()), - (Err(value), Ok(_)) => Err(GeoError::BadLatitude { document_id, value }), - (Ok(_), Err(value)) => Err(GeoError::BadLongitude { document_id, value }), + (Ok(_), Ok(_)) => Ok(Ok(())), + (Err(value), Ok(_)) => Ok(Err(GeoError::BadLatitude { document_id, value })), + (Ok(_), Err(value)) => Ok(Err(GeoError::BadLongitude { document_id, value })), (Err(lat), Err(lng)) => { - Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng }) + Ok(Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng })) } } } - (None, Some(_)) => Err(GeoError::MissingLatitude { document_id }), - (Some(_), None) => Err(GeoError::MissingLongitude { document_id }), - (None, None) => Err(GeoError::MissingLatitudeAndLongitude { document_id }), + (None, Some(_)) => Ok(Err(GeoError::MissingLatitude { document_id })), + (Some(_), None) => Ok(Err(GeoError::MissingLongitude { document_id })), + (None, None) => Ok(Err(GeoError::MissingLatitudeAndLongitude { document_id })), }, - value => Err(GeoError::NotAnObject { document_id, value }), - }; - - Ok(result) + value => Ok(Err(GeoError::NotAnObject { document_id, value })), + } } From dc3f092d0782fd7b39bb29aa424caa04ada8de3c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 16 Jun 2022 12:03:43 +0200 Subject: [PATCH 1473/1889] Do not leak an internal grenad Error --- milli/src/documents/mod.rs | 2 +- milli/src/documents/reader.rs | 32 ++++++++++++++++++++++++++++++-- milli/src/error.rs | 7 +++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index ee3593bf8..66a05b7b6 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -7,7 +7,7 @@ use 
std::io; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; use obkv::KvReader; -pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; +pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; use crate::error::{FieldIdMapMissingEntry, InternalError}; diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 3dff999f5..720b403b9 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::io; +use std::{error, fmt, io}; use obkv::KvReader; @@ -79,7 +79,9 @@ impl DocumentsBatchCursor { impl DocumentsBatchCursor { /// Returns the next document, starting from the first one. Subsequent calls to /// `next_document` advance the document reader until all the documents have been read. - pub fn next_document(&mut self) -> Result>, grenad::Error> { + pub fn next_document( + &mut self, + ) -> Result>, DocumentsBatchCursorError> { match self.cursor.move_on_next()? { Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { Ok(Some(KvReader::new(value))) @@ -88,3 +90,29 @@ impl DocumentsBatchCursor { } } } + +/// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. +#[derive(Debug)] +pub struct DocumentsBatchCursorError { + inner: grenad::Error, +} + +impl From for DocumentsBatchCursorError { + fn from(error: grenad::Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError { inner: error } + } +} + +impl Into for DocumentsBatchCursorError { + fn into(self) -> grenad::Error { + self.inner + } +} + +impl error::Error for DocumentsBatchCursorError {} + +impl fmt::Display for DocumentsBatchCursorError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.inner.fmt(f) + } +} diff --git a/milli/src/error.rs b/milli/src/error.rs index d05acbe1c..d9dca287d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,6 +7,7 @@ use rayon::ThreadPoolBuildError; use serde_json::Value; use thiserror::Error; +use crate::documents::DocumentsBatchCursorError; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { @@ -209,6 +210,12 @@ where } } +impl From for Error { + fn from(error: DocumentsBatchCursorError) -> Error { + Error::from(Into::::into(error)) + } +} + impl From for Error { fn from(_error: Infallible) -> Error { unreachable!() From ea852200bbb9d9520ab7e13e5f6019c3b7c7fed3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 10:28:49 +0200 Subject: [PATCH 1474/1889] Fix the format used for a geo deleting benchmark --- benchmarks/benches/indexing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 1b501b21a..80c7ba0ed 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -1137,7 +1137,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); - let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); builder.execute().unwrap(); From 6a0a0ae94f38ee04ffed9bd252444128163b1550 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 20 Jun 2022 13:48:02 +0200 Subject: [PATCH 1475/1889] 
Make the Transform read from an EnrichedDocumentsBatchReader

---
 milli/src/documents/enriched.rs | 103 ++++++++++++++++++
 milli/src/documents/mod.rs | 13 +++
 milli/src/documents/reader.rs | 20 ++--
 milli/src/error.rs | 10 +-
 milli/src/update/index_documents/mod.rs | 6 +-
 milli/src/update/index_documents/transform.rs | 8 +-
 milli/src/update/index_documents/validate.rs | 22 ++--
 7 files changed, 158 insertions(+), 24 deletions(-)
 create mode 100644 milli/src/documents/enriched.rs

diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs
new file mode 100644
index 000000000..8645e06c4
--- /dev/null
+++ b/milli/src/documents/enriched.rs
@@ -0,0 +1,103 @@
+use std::fs::File;
+use std::{io, str};
+
+use obkv::KvReader;
+
+use super::{
+ DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader,
+ Error,
+};
+use crate::FieldId;
+
+/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have
+/// been created with a `DocumentsBatchWriter` and, for the enriched data,
+/// a simple `grenad::Reader`.
+///
+/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
+/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
+pub struct EnrichedDocumentsBatchReader<R> {
+ documents: DocumentsBatchReader<R>,
+ external_ids: grenad::ReaderCursor<File>,
+}
+
+impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
+ pub fn new(
+ documents: DocumentsBatchReader<R>,
+ external_ids: grenad::Reader<File>,
+ ) -> Result<Self, Error> {
+ if documents.documents_count() as u64 == external_ids.len() {
+ Ok(EnrichedDocumentsBatchReader {
+ documents,
+ external_ids: external_ids.into_cursor()?,
+ })
+ } else {
+ Err(Error::InvalidEnrichedData)
+ }
+ }
+
+ pub fn documents_count(&self) -> u32 {
+ self.documents.documents_count()
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.documents.is_empty()
+ }
+
+ pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+ self.documents.documents_batch_index()
+ }
+
+ /// This method returns a forward cursor over the enriched documents.
+ pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor<R> {
+ let EnrichedDocumentsBatchReader { documents, mut external_ids } = self;
+ external_ids.reset();
+ EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids }
+ }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct EnrichedDocument<'a> {
+ pub document: KvReader<'a, FieldId>,
+ pub external_id: &'a str,
+}
+
+pub struct EnrichedDocumentsBatchCursor<R> {
+ documents: DocumentsBatchCursor<R>,
+ external_ids: grenad::ReaderCursor<File>,
+}
+
+impl<R> EnrichedDocumentsBatchCursor<R> {
+ pub fn into_reader(self) -> EnrichedDocumentsBatchReader<R> {
+ let EnrichedDocumentsBatchCursor { documents, external_ids } = self;
+ EnrichedDocumentsBatchReader { documents: documents.into_reader(), external_ids }
+ }
+
+ pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+ self.documents.documents_batch_index()
+ }
+
+ /// Resets the cursor to be able to read from the start again.
+ pub fn reset(&mut self) {
+ self.documents.reset();
+ self.external_ids.reset();
+ }
+}
+
+impl<R: io::Read + io::Seek> EnrichedDocumentsBatchCursor<R> {
+ /// Returns the next document, starting from the first one. Subsequent calls to
+ /// `next_enriched_document` advance the document reader until all the documents have been read.
+ pub fn next_enriched_document(
+ &mut self,
+ ) -> Result<Option<EnrichedDocument>, DocumentsBatchCursorError> {
+ let document = self.documents.next_document()?;
+ let external_id = match self.external_ids.move_on_next()?
{ + Some((_, bytes)) => Some(str::from_utf8(bytes)?), + None => None, + }; + + match document.zip(external_id) { + Some((document, external_id)) => Ok(Some(EnrichedDocument { document, external_id })), + None => Ok(None), + } + } +} diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 66a05b7b6..43bfc1c20 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,11 +1,14 @@ mod builder; +mod enriched; mod reader; use std::fmt::{self, Debug}; use std::io; +use std::str::Utf8Error; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; +pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; use obkv::KvReader; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ -87,6 +90,8 @@ impl DocumentsBatchIndex { pub enum Error { ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, + InvalidEnrichedData, + InvalidUtf8(Utf8Error), Csv(csv::Error), Json(serde_json::Error), Serialize(serde_json::Error), @@ -118,6 +123,12 @@ impl From for Error { } } +impl From for Error { + fn from(other: Utf8Error) -> Self { + Self::InvalidUtf8(other) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -127,6 +138,8 @@ impl fmt::Display for Error { Error::InvalidDocumentFormat => { f.write_str("Invalid document addition format, missing the documents batch index.") } + Error::InvalidEnrichedData => f.write_str("Invalid enriched data."), + Error::InvalidUtf8(e) => write!(f, "{}", e), Error::Io(e) => write!(f, "{}", e), Error::Serialize(e) => write!(f, "{}", e), Error::Grenad(e) => write!(f, "{}", e), diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 720b403b9..7bd6dbd51 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::{error, fmt, io}; +use std::{error, fmt, io, str}; use obkv::KvReader; @@ -93,19 +93,20 @@ impl DocumentsBatchCursor { /// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. 
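A side note on the conversion pair reworked in the next hunk: the hand-written `Into<grenad::Error>` impl is replaced by a second `From` impl. Below is a minimal, self-contained sketch (illustrative names, not milli code) of why `From` is the idiomatic direction: the standard library's blanket impl derives the matching `Into` automatically, and the `?` operator only inserts `From` conversions.

use std::{fmt, io};

#[derive(Debug)]
struct CursorError {
    inner: io::Error,
}

impl From<io::Error> for CursorError {
    fn from(error: io::Error) -> CursorError {
        CursorError { inner: error }
    }
}

impl fmt::Display for CursorError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.inner.fmt(f)
    }
}

fn read_config(path: &str) -> Result<String, CursorError> {
    // `?` converts the `io::Error` through the `From` impl above; a lone
    // `Into` impl would not be picked up here.
    Ok(std::fs::read_to_string(path)?)
}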
 #[derive(Debug)]
-pub struct DocumentsBatchCursorError {
-    inner: grenad::Error,
+pub enum DocumentsBatchCursorError {
+    Grenad(grenad::Error),
+    Utf8(str::Utf8Error),
 }

 impl From<grenad::Error> for DocumentsBatchCursorError {
     fn from(error: grenad::Error) -> DocumentsBatchCursorError {
-        DocumentsBatchCursorError { inner: error }
+        DocumentsBatchCursorError::Grenad(error)
     }
 }

-impl Into<grenad::Error> for DocumentsBatchCursorError {
-    fn into(self) -> grenad::Error {
-        self.inner
+impl From<str::Utf8Error> for DocumentsBatchCursorError {
+    fn from(error: str::Utf8Error) -> DocumentsBatchCursorError {
+        DocumentsBatchCursorError::Utf8(error)
     }
 }

@@ -113,6 +114,9 @@ impl error::Error for DocumentsBatchCursorError {}

 impl fmt::Display for DocumentsBatchCursorError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        self.inner.fmt(f)
+        match self {
+            DocumentsBatchCursorError::Grenad(e) => e.fmt(f),
+            DocumentsBatchCursorError::Utf8(e) => e.fmt(f),
+        }
     }
 }
diff --git a/milli/src/error.rs b/milli/src/error.rs
index d9dca287d..0419ceeda 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -7,7 +7,7 @@ use rayon::ThreadPoolBuildError;
 use serde_json::Value;
 use thiserror::Error;

-use crate::documents::DocumentsBatchCursorError;
+use crate::documents::{self, DocumentsBatchCursorError};
 use crate::{CriterionError, DocumentId, FieldId, Object, SortError};

 pub fn is_reserved_keyword(keyword: &str) -> bool {
@@ -36,6 +36,8 @@ pub enum InternalError {
     FieldIdMappingMissingEntry { key: FieldId },
     #[error(transparent)]
     Fst(#[from] fst::Error),
+    #[error(transparent)]
+    DocumentsError(#[from] documents::Error),
     #[error("Invalid compression type have been specified to grenad.")]
     GrenadInvalidCompressionType,
     #[error("Invalid grenad file with an invalid version format.")]
@@ -185,6 +187,7 @@ macro_rules! error_from_sub_error {
 error_from_sub_error! {
     FieldIdMapMissingEntry => InternalError,
     fst::Error => InternalError,
+    documents::Error => InternalError,
     str::Utf8Error => InternalError,
     ThreadPoolBuildError => InternalError,
     SerializationError => InternalError,
@@ -212,7 +215,10 @@ where
 impl From<DocumentsBatchCursorError> for Error {
     fn from(error: DocumentsBatchCursorError) -> Error {
-        Error::from(Into::<grenad::Error>::into(error))
+        match error {
+            DocumentsBatchCursorError::Grenad(e) => Error::from(e),
+            DocumentsBatchCursorError::Utf8(e) => Error::from(e),
+        }
     }
 }

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index ba1064684..fe3bd1f8f 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -27,7 +27,7 @@ pub use self::helpers::{
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
-use self::validate::validate_documents_batch;
+use self::validate::validate_and_enrich_documents_batch;
 pub use self::validate::{
     extract_float_from_value, validate_document_id, validate_document_id_value,
     validate_geo_from_json,
@@ -141,7 +141,7 @@ where
         // We check for user errors in this validator and if there is one, we can return
         // the `IndexDocument` struct as it is valid to send more documents into it.
         // However, if there is an internal error we throw it away!
-        let reader = match validate_documents_batch(
+        let enriched_documents_reader = match validate_and_enrich_documents_batch(
             self.wtxn,
             self.index,
             self.config.autogenerate_docids,
@@ -155,7 +155,7 @@
             .transform
             .as_mut()
             .expect("Invalid document addition state")
-            .read_documents(reader, self.wtxn, &self.progress)?
+ .read_documents(enriched_documents_reader, self.wtxn, &self.progress)? as u64; self.added_documents += indexed_documents; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 38f6dc8ff..4d0a4c311 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,7 +14,7 @@ use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; -use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; +use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::index_documents::validate_document_id_value; @@ -153,7 +153,7 @@ impl<'a, 'i> Transform<'a, 'i> { pub fn read_documents( &mut self, - reader: DocumentsBatchReader, + reader: EnrichedDocumentsBatchReader, wtxn: &mut heed::RwTxn, progress_callback: F, ) -> Result @@ -189,7 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> { let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); let addition_index = cursor.documents_batch_index().clone(); - while let Some(document) = cursor.next_document()? { + while let Some(enriched_document) = cursor.next_enriched_document()? { + let EnrichedDocument { document, external_id } = enriched_document; + let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 32e8de03f..8b68532cb 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -4,27 +4,28 @@ use std::result::Result as StdResult; use serde_json::Value; -use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; use crate::error::{GeoError, InternalError, UserError}; -use crate::update::index_documents::obkv_to_object; +use crate::update::index_documents::{obkv_to_object, writer_into_reader}; use crate::{FieldId, Index, Object, Result}; /// The symbol used to define levels in a nested primary key. const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; -/// This function validates a documents by checking that: +/// This function validates and enrich the documents by checking that: /// - we can infer a primary key, -/// - all the documents id exist and, +/// - all the documents id exist and are extracted, /// - the validity of them but also, /// - the validity of the `_geo` field depending on the settings. -pub fn validate_documents_batch( +pub fn validate_and_enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, autogenerate_docids: bool, reader: DocumentsBatchReader, -) -> Result, UserError>> { +) -> Result, UserError>> { let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); + let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. 
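A rough sketch of the id side file built above. It relies only on the grenad calls already visible in these patches (`Writer::new`, `insert`, `into_cursor`) plus `Writer::into_inner` and `Reader::new`, which are assumed here to exist with these shapes; milli itself writes into a tempfile and converts it with its `writer_into_reader` helper. As the next hunk shows, each id is inserted under the document's index in big-endian, which keeps the keys sorted for grenad and pairs the nth entry with the nth document:

use std::io::Cursor;

fn write_external_ids(ids: &[&str]) -> Result<grenad::Reader<Cursor<Vec<u8>>>, grenad::Error> {
    // In-memory stand-in for milli's tempfile-backed writer.
    let mut writer = grenad::Writer::new(Cursor::new(Vec::new()));
    for (count, id) in ids.iter().enumerate() {
        // Big-endian keys iterate back in insertion order, so the nth entry
        // lines up with the nth document when both cursors advance together.
        writer.insert((count as u32).to_be_bytes(), id.as_bytes())?;
    }
    let bytes = writer.into_inner()?.into_inner();
    grenad::Reader::new(Cursor::new(bytes))
}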
@@ -82,6 +83,8 @@ pub fn validate_documents_batch( Err(user_error) => return Ok(Err(user_error)), }; + external_ids.insert(count.to_be_bytes(), &document_id)?; + if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? { return Ok(Err(UserError::from(user_error))); @@ -90,7 +93,10 @@ pub fn validate_documents_batch( count += 1; } - Ok(Ok(cursor.into_reader())) + let external_ids = writer_into_reader(external_ids)?; + let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?; + + Ok(Ok(reader)) } /// Retrieve the document id after validating it, returning a `UserError` @@ -100,7 +106,7 @@ fn fetch_document_id( documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, - count: usize, + count: u32, ) -> Result> { match primary_key { PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { From 5f1bfb73eeeb1da17a1d2598d01916b08022aa8c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 10:45:25 +0200 Subject: [PATCH 1476/1889] Extract the primary key name and make it accessible --- milli/src/documents/enriched.rs | 28 +++++++++++++++++--- milli/src/update/index_documents/validate.rs | 6 ++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 8645e06c4..918b47c95 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -17,17 +17,20 @@ use crate::FieldId; /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. pub struct EnrichedDocumentsBatchReader { documents: DocumentsBatchReader, + primary_key: String, external_ids: grenad::ReaderCursor, } impl EnrichedDocumentsBatchReader { pub fn new( documents: DocumentsBatchReader, + primary_key: String, external_ids: grenad::Reader, ) -> Result { if documents.documents_count() as u64 == external_ids.len() { Ok(EnrichedDocumentsBatchReader { documents, + primary_key, external_ids: external_ids.into_cursor()?, }) } else { @@ -39,6 +42,10 @@ impl EnrichedDocumentsBatchReader { self.documents.documents_count() } + pub fn primary_key(&self) -> &str { + &self.primary_key + } + pub fn is_empty(&self) -> bool { self.documents.is_empty() } @@ -49,9 +56,13 @@ impl EnrichedDocumentsBatchReader { /// This method returns a forward cursor over the enriched documents. 
pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { - let EnrichedDocumentsBatchReader { documents, mut external_ids } = self; + let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self; external_ids.reset(); - EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids } + EnrichedDocumentsBatchCursor { + documents: documents.into_cursor(), + primary_key, + external_ids, + } } } @@ -63,13 +74,22 @@ pub struct EnrichedDocument<'a> { pub struct EnrichedDocumentsBatchCursor { documents: DocumentsBatchCursor, + primary_key: String, external_ids: grenad::ReaderCursor, } impl EnrichedDocumentsBatchCursor { pub fn into_reader(self) -> EnrichedDocumentsBatchReader { - let EnrichedDocumentsBatchCursor { documents, external_ids } = self; - EnrichedDocumentsBatchReader { documents: documents.into_reader(), external_ids } + let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self; + EnrichedDocumentsBatchReader { + documents: documents.into_reader(), + primary_key, + external_ids, + } + } + + pub fn primary_key(&self) -> &str { + &self.primary_key } pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 8b68532cb..83d7ef38e 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -94,7 +94,11 @@ pub fn validate_and_enrich_documents_batch( } let external_ids = writer_into_reader(external_ids)?; - let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?; + let reader = EnrichedDocumentsBatchReader::new( + cursor.into_reader(), + primary_key.primary_key().to_string(), + external_ids, + )?; Ok(Ok(reader)) } From 742543091e9b33084952ad27e00eff60655d7bf0 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 10:48:07 +0200 Subject: [PATCH 1477/1889] Constify the default primary key name --- milli/src/update/index_documents/validate.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 83d7ef38e..4e52f4cb9 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -12,6 +12,9 @@ use crate::{FieldId, Index, Object, Result}; /// The symbol used to define levels in a nested primary key. const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; +/// The default primary that is used when not specified. 
+const DEFAULT_PRIMARY_KEY: &str = "id"; + /// This function validates and enrich the documents by checking that: /// - we can infer a primary key, /// - all the documents id exist and are extracted, @@ -51,13 +54,14 @@ pub fn validate_and_enrich_documents_batch( None => { let guessed = documents_batch_index .iter() - .filter(|(_, name)| name.to_lowercase().contains("id")) + .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY)) .min_by_key(|(fid, _)| *fid); match guessed { Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), - None if autogenerate_docids => { - PrimaryKey::flat("id", documents_batch_index.insert("id")) - } + None if autogenerate_docids => PrimaryKey::flat( + DEFAULT_PRIMARY_KEY, + documents_batch_index.insert(DEFAULT_PRIMARY_KEY), + ), None => return Ok(Err(UserError::MissingPrimaryKey)), } } From 905af2a2e9f69b2341d532b105bbf1b3baff819e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:12:51 +0200 Subject: [PATCH 1478/1889] Use the primary key and external id in the transform --- milli/src/update/index_documents/transform.rs | 290 +++++------------- 1 file changed, 79 insertions(+), 211 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 4d0a4c311..e82556ec7 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -9,7 +9,7 @@ use heed::RoTxn; use itertools::Itertools; use obkv::{KvReader, KvWriter}; use roaring::RoaringBitmap; -use serde_json::{Map, Value}; +use serde_json::Value; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; @@ -17,15 +17,12 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; -use crate::update::index_documents::validate_document_id_value; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, }; -const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; - pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, @@ -85,18 +82,6 @@ fn create_fields_mapping( .collect() } -/// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields. -/// It doesn't look in the subfield because we don't want to enable the -/// primary key inference on nested objects. -fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { - index - .iter() - .sorted_by_key(|(k, _)| *k) - .map(|(_, v)| v) - .find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME)) - .map(String::as_str) -} - impl<'a, 'i> Transform<'a, 'i> { pub fn new( wtxn: &mut heed::RwTxn, @@ -167,28 +152,15 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; - let alternative_name = self - .index - .primary_key(wtxn)? 
- .or_else(|| find_primary_key(fields_index)) - .map(String::from); - - let (primary_key_id, primary_key_name) = compute_primary_key_pair( - self.index.primary_key(wtxn)?, - &mut self.fields_ids_map, - alternative_name, - self.autogenerate_docids, - )?; - - let primary_key_id_nested = primary_key_name.contains('.'); + let primary_key = cursor.primary_key().to_string(); + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; + let primary_key_id_nested = primary_key.contains('.'); let mut flattened_document = None; let mut obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; - let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); - let addition_index = cursor.documents_batch_index().clone(); while let Some(enriched_document) = cursor.next_enriched_document()? { let EnrichedDocument { document, external_id } = enriched_document; @@ -210,8 +182,7 @@ impl<'a, 'i> Transform<'a, 'i> { // it, transform it into a string and validate it, and then update it in the // document. If none is found, and we were told to generate missing document ids, then // we create the missing field, and update the new document. - let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; - let external_id = if primary_key_id_nested { + if primary_key_id_nested { let mut field_buffer_cache = field_buffer_cache.clone(); self.flatten_from_field_mapping( &mapping, @@ -220,29 +191,6 @@ impl<'a, 'i> Transform<'a, 'i> { &mut field_buffer_cache, )?; flattened_document = Some(&flattened_obkv_buffer); - let document = KvReader::new(&flattened_obkv_buffer); - - update_primary_key( - document, - &addition_index, - primary_key_id, - &primary_key_name, - &mut uuid_buffer, - &mut field_buffer_cache, - &mut external_id_buffer, - self.autogenerate_docids, - )? - } else { - update_primary_key( - document, - &addition_index, - primary_key_id, - &primary_key_name, - &mut uuid_buffer, - &mut field_buffer_cache, - &mut external_id_buffer, - self.autogenerate_docids, - )? }; // Insertion in a obkv need to be done with keys ordered. For now they are ordered @@ -318,7 +266,6 @@ impl<'a, 'i> Transform<'a, 'i> { }); field_buffer = drop_and_reuse(field_buffer_cache); - external_id_buffer.clear(); obkv_buffer.clear(); } @@ -327,7 +274,7 @@ impl<'a, 'i> Transform<'a, 'i> { }); self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; - self.index.put_primary_key(wtxn, &primary_key_name)?; + self.index.put_primary_key(wtxn, &primary_key)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. @@ -749,42 +696,6 @@ impl<'a, 'i> Transform<'a, 'i> { } } -/// Given an optional primary key and an optional alternative name, returns the (field_id, attr_name) -/// for the primary key according to the following rules: -/// - if primary_key is `Some`, returns the id and the name, else -/// - if alternative_name is Some, adds alternative to the fields_ids_map, and returns the pair, else -/// - if autogenerate_docids is true, insert the default id value in the field ids map ("id") and -/// returns the pair, else -/// - returns an error. 
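For context, the helper removed below duplicated an inference rule that the enrich side now owns (the `guessed` block constified in patch 1477): among the fields of the batch whose lowercased name contains "id", the one with the smallest field id wins. A condensed, std-only sketch of that rule, with hypothetical types standing in for `DocumentsBatchIndex`:

fn guess_primary_key(fields: &[(u16, &str)]) -> Option<(u16, String)> {
    fields
        .iter()
        .filter(|(_, name)| name.to_lowercase().contains("id"))
        // The smallest field id wins, i.e. the first matching field
        // that was inserted into the batch index.
        .min_by_key(|(fid, _)| *fid)
        .map(|(fid, name)| (*fid, name.to_string()))
}

// guess_primary_key(&[(1, "title"), (3, "ID"), (2, "uid")]) == Some((2, "uid"))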
-fn compute_primary_key_pair( - primary_key: Option<&str>, - fields_ids_map: &mut FieldsIdsMap, - alternative_name: Option, - autogenerate_docids: bool, -) -> Result<(FieldId, String)> { - match primary_key { - Some(primary_key) => { - let id = fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; - Ok((id, primary_key.to_string())) - } - None => { - let name = match alternative_name { - Some(key) => key, - None => { - if !autogenerate_docids { - // If there is no primary key in the current document batch, we must - // return an error and not automatically generate any document id. - return Err(UserError::MissingPrimaryKey.into()); - } - DEFAULT_PRIMARY_KEY_NAME.to_string() - } - }; - let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; - Ok((id, name)) - } - } -} - /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec`. /// /// The size and alignment of T and U must match. @@ -796,49 +707,6 @@ fn drop_and_reuse(mut vec: Vec) -> Vec { vec.into_iter().map(|_| unreachable!()).collect() } -fn update_primary_key<'a>( - document: KvReader<'a, FieldId>, - addition_index: &DocumentsBatchIndex, - primary_key_id: FieldId, - primary_key_name: &str, - uuid_buffer: &'a mut [u8; uuid::fmt::Hyphenated::LENGTH], - field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, - mut external_id_buffer: &'a mut Vec, - autogenerate_docids: bool, -) -> Result> { - match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { - Some((_, bytes)) => { - let document_id = serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?; - let value = validate_document_id_value(document_id)??; - serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; - Ok(Cow::Owned(value)) - } - None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); - serde_json::to_writer(&mut external_id_buffer, &uuid) - .map_err(InternalError::SerdeJson)?; - field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); - Ok(Cow::Borrowed(&*uuid)) - } - None => { - let mut json = Map::new(); - for (key, value) in document.iter() { - let key = addition_index.name(key).map(ToString::to_string); - let value = serde_json::from_slice::(&value).ok(); - - if let Some((k, v)) = key.zip(value) { - json.insert(k, v); - } - } - - Err(UserError::MissingDocumentId { - primary_key: primary_key_name.to_string(), - document: json, - })? 
- } - } -} - impl TransformOutput { // find and insert the new field ids pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result> { @@ -853,87 +721,87 @@ impl TransformOutput { } } -#[cfg(test)] -mod test { - use super::*; +// #[cfg(test)] +// mod test { +// use super::*; - mod compute_primary_key { - use big_s::S; +// mod compute_primary_key { +// use big_s::S; - use super::{compute_primary_key_pair, FieldsIdsMap}; +// use super::{compute_primary_key_pair, FieldsIdsMap}; - #[test] - fn should_return_primary_key_if_is_some() { - let mut fields_map = FieldsIdsMap::new(); - fields_map.insert("toto").unwrap(); - let result = compute_primary_key_pair( - Some("toto"), - &mut fields_map, - Some("tata".to_string()), - false, - ); - assert_eq!(result.unwrap(), (0, "toto".to_string())); - assert_eq!(fields_map.len(), 1); +// #[test] +// fn should_return_primary_key_if_is_some() { +// let mut fields_map = FieldsIdsMap::new(); +// fields_map.insert("toto").unwrap(); +// let result = compute_primary_key_pair( +// Some("toto"), +// &mut fields_map, +// Some("tata".to_string()), +// false, +// ); +// assert_eq!(result.unwrap(), (0, "toto".to_string())); +// assert_eq!(fields_map.len(), 1); - // and with nested fields - let mut fields_map = FieldsIdsMap::new(); - fields_map.insert("toto.tata").unwrap(); - let result = compute_primary_key_pair( - Some("toto.tata"), - &mut fields_map, - Some(S("titi")), - false, - ); - assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); - assert_eq!(fields_map.len(), 1); - } +// // and with nested fields +// let mut fields_map = FieldsIdsMap::new(); +// fields_map.insert("toto.tata").unwrap(); +// let result = compute_primary_key_pair( +// Some("toto.tata"), +// &mut fields_map, +// Some(S("titi")), +// false, +// ); +// assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_alternative_if_primary_is_none() { - let mut fields_map = FieldsIdsMap::new(); - let result = - compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); - assert_eq!(result.unwrap(), (0, S("tata"))); - assert_eq!(fields_map.len(), 1); - } +// #[test] +// fn should_return_alternative_if_primary_is_none() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = +// compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); +// assert_eq!(result.unwrap(), (0, S("tata"))); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_default_if_both_are_none() { - let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair(None, &mut fields_map, None, true); - assert_eq!(result.unwrap(), (0, S("id"))); - assert_eq!(fields_map.len(), 1); - } +// #[test] +// fn should_return_default_if_both_are_none() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = compute_primary_key_pair(None, &mut fields_map, None, true); +// assert_eq!(result.unwrap(), (0, S("id"))); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_err_if_both_are_none_and_recompute_is_false() { - let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair(None, &mut fields_map, None, false); - assert!(result.is_err()); - assert_eq!(fields_map.len(), 0); - } - } +// #[test] +// fn should_return_err_if_both_are_none_and_recompute_is_false() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = compute_primary_key_pair(None, &mut fields_map, None, false); +// assert!(result.is_err()); +// 
assert_eq!(fields_map.len(), 0); +// } +// } - mod primary_key_inference { - use big_s::S; - use bimap::BiHashMap; +// mod primary_key_inference { +// use big_s::S; +// use bimap::BiHashMap; - use crate::documents::DocumentsBatchIndex; - use crate::update::index_documents::transform::find_primary_key; +// use crate::documents::DocumentsBatchIndex; +// use crate::update::index_documents::transform::find_primary_key; - #[test] - fn primary_key_infered_on_first_field() { - // We run the test multiple times to change the order in which the fields are iterated upon. - for _ in 1..50 { - let mut map = BiHashMap::new(); - map.insert(1, S("fakeId")); - map.insert(2, S("fakeId")); - map.insert(3, S("fakeId")); - map.insert(4, S("fakeId")); - map.insert(0, S("realId")); +// #[test] +// fn primary_key_infered_on_first_field() { +// // We run the test multiple times to change the order in which the fields are iterated upon. +// for _ in 1..50 { +// let mut map = BiHashMap::new(); +// map.insert(1, S("fakeId")); +// map.insert(2, S("fakeId")); +// map.insert(3, S("fakeId")); +// map.insert(4, S("fakeId")); +// map.insert(0, S("realId")); - assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); - } - } - } -} +// assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); +// } +// } +// } +// } From c8ebf0de47e09964a9b4060b1da9982a593040e1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:14:14 +0200 Subject: [PATCH 1479/1889] Rename the validate function as an enriching function --- .../index_documents/{validate.rs => enrich.rs} | 2 +- milli/src/update/index_documents/mod.rs | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) rename milli/src/update/index_documents/{validate.rs => enrich.rs} (99%) diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/enrich.rs similarity index 99% rename from milli/src/update/index_documents/validate.rs rename to milli/src/update/index_documents/enrich.rs index 4e52f4cb9..e3c3bd6f6 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -20,7 +20,7 @@ const DEFAULT_PRIMARY_KEY: &str = "id"; /// - all the documents id exist and are extracted, /// - the validity of them but also, /// - the validity of the `_geo` field depending on the settings. 
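One design note before the rename below: the function keeps its two-layered return type, a `Result` nested inside another `Result`. The outer layer carries internal errors that abort indexing, while the inner one carries user errors that leave the `IndexDocuments` builder reusable, as the comment in `mod.rs` explains. A condensed sketch of the pattern with placeholder error types:

type InternalError = std::io::Error;

#[derive(Debug)]
struct UserError(String);

fn enrich(batch: &[&str]) -> Result<Result<usize, UserError>, InternalError> {
    for document in batch {
        if document.is_empty() {
            // Recoverable: surfaced to the user, more documents can follow.
            return Ok(Err(UserError("empty document".into())));
        }
    }
    Ok(Ok(batch.len()))
}

fn index(batch: &[&str]) -> Result<usize, InternalError> {
    // `?` propagates only the internal layer; the user layer is matched on.
    match enrich(batch)? {
        Ok(count) => Ok(count),
        Err(user_error) => {
            eprintln!("user error: {user_error:?}");
            Ok(0)
        }
    }
}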
-pub fn validate_and_enrich_documents_batch( +pub fn enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, autogenerate_docids: bool, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index fe3bd1f8f..db1a768e6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,8 +1,8 @@ +mod enrich; mod extract; mod helpers; mod transform; mod typed_chunk; -mod validate; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; @@ -19,6 +19,11 @@ use serde::{Deserialize, Serialize}; use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; +use self::enrich::enrich_documents_batch; +pub use self::enrich::{ + extract_float_from_value, validate_document_id, validate_document_id_value, + validate_geo_from_json, +}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, @@ -27,11 +32,6 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use self::validate::validate_and_enrich_documents_batch; -pub use self::validate::{ - extract_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, -}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -141,7 +141,7 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let enriched_documents_reader = match validate_and_enrich_documents_batch( + let enriched_documents_reader = match enrich_documents_batch( self.wtxn, self.index, self.config.autogenerate_docids, From d1a4da98127207b7ce560280911480a4e0a2ced4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:16:59 +0200 Subject: [PATCH 1480/1889] Generate a real UUIDv4 when ids are auto-generated --- milli/src/update/index_documents/enrich.rs | 93 ++++++++++++++----- .../extract/extract_geo_points.rs | 14 +-- 2 files changed, 78 insertions(+), 29 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index e3c3bd6f6..5d00565a8 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -1,6 +1,6 @@ use std::io::{Read, Seek}; -use std::iter; use std::result::Result as StdResult; +use std::{fmt, iter}; use serde_json::Value; @@ -29,6 +29,7 @@ pub fn enrich_documents_batch( let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; + let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. @@ -76,31 +77,33 @@ pub fn enrich_documents_batch( let mut count = 0; while let Some(document) = cursor.next_document()? { - let document_id = match fetch_document_id( + let document_id = match fetch_or_generate_document_id( &document, &documents_batch_index, primary_key, autogenerate_docids, + &mut uuid_buffer, count, )? 
{ Ok(document_id) => document_id, Err(user_error) => return Ok(Err(user_error)), }; - external_ids.insert(count.to_be_bytes(), &document_id)?; + external_ids.insert(count.to_be_bytes(), document_id.value())?; if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { - if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? { + if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { return Ok(Err(UserError::from(user_error))); } } + count += 1; } let external_ids = writer_into_reader(external_ids)?; let reader = EnrichedDocumentsBatchReader::new( cursor.into_reader(), - primary_key.primary_key().to_string(), + primary_key.name().to_string(), external_ids, )?; @@ -109,13 +112,14 @@ pub fn enrich_documents_batch( /// Retrieve the document id after validating it, returning a `UserError` /// if the id is invalid or can't be guessed. -fn fetch_document_id( +fn fetch_or_generate_document_id( document: &obkv::KvReader, documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, + uuid_buffer: &mut [u8; uuid::adapter::Hyphenated::LENGTH], count: u32, -) -> Result> { +) -> Result> { match primary_key { PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { match document.get(primary_key_id) { @@ -123,12 +127,13 @@ fn fetch_document_id( let document_id = serde_json::from_slice(document_id_bytes) .map_err(InternalError::SerdeJson)?; match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(document_id)), + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), Err(user_error) => Ok(Err(user_error)), } } None if autogenerate_docids => { - Ok(Ok(format!("{{auto-generated id of the {}nth document}}", count))) + let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + Ok(Ok(DocumentId::generated(uuid.to_string(), count))) } None => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), @@ -147,7 +152,7 @@ fn fetch_document_id( if matching_documents_ids.len() >= 2 { return Ok(Err(UserError::TooManyDocumentIds { - primary_key: nested.primary_key().to_string(), + primary_key: nested.name().to_string(), document: obkv_to_object(&document, &documents_batch_index)?, })); } @@ -157,11 +162,11 @@ fn fetch_document_id( match matching_documents_ids.pop() { Some(document_id) => match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(document_id)), + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), Err(user_error) => Ok(Err(user_error)), }, None => Ok(Err(UserError::MissingDocumentId { - primary_key: nested.primary_key().to_string(), + primary_key: nested.name().to_string(), document: obkv_to_object(&document, &documents_batch_index)?, })), } @@ -186,7 +191,7 @@ impl PrimaryKey<'_> { PrimaryKey::Nested { name } } - fn primary_key(&self) -> &str { + fn name(&self) -> &str { match self { PrimaryKey::Flat { name, .. } => name, PrimaryKey::Nested { name } => name, @@ -196,11 +201,53 @@ impl PrimaryKey<'_> { /// Returns an `Iterator` that gives all the possible fields names the primary key /// can have depending of the first level name and deepnes of the objects. fn possible_level_names(&self) -> impl Iterator + '_ { - let name = self.primary_key(); + let name = self.name(); iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL)) } } +/// A type that represents a document id that has been retrieved from a document or auto-generated. 
+/// +/// In case the document id has been auto-generated, the document nth is kept to help +/// users debug if there is an issue with the document itself. +#[derive(Clone)] +pub enum DocumentId { + Retrieved { value: String }, + Generated { value: String, document_nth: u32 }, +} + +impl DocumentId { + fn retrieved(value: String) -> DocumentId { + DocumentId::Retrieved { value } + } + + fn generated(value: String, document_nth: u32) -> DocumentId { + DocumentId::Generated { value, document_nth } + } + + fn value(&self) -> &str { + match self { + DocumentId::Retrieved { value } => value, + DocumentId::Generated { value, .. } => value, + } + } + + fn debug(&self) -> String { + format!("{:?}", self) + } +} + +impl fmt::Debug for DocumentId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DocumentId::Retrieved { value } => write!(f, "{:?}", value), + DocumentId::Generated { value, document_nth } => { + write!(f, "{{{:?}}} of the {}nth document", value, document_nth) + } + } + } +} + fn contained_in(selector: &str, key: &str) -> bool { selector.starts_with(key) && selector[key.len()..] @@ -281,23 +328,25 @@ pub fn extract_float_from_value(value: Value) -> StdResult { } } -pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> { +pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result> { + use GeoError::*; + let debug_id = || Value::from(id.debug()); match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { match (extract_float_from_value(lat), extract_float_from_value(lng)) { (Ok(_), Ok(_)) => Ok(Ok(())), - (Err(value), Ok(_)) => Ok(Err(GeoError::BadLatitude { document_id, value })), - (Ok(_), Err(value)) => Ok(Err(GeoError::BadLongitude { document_id, value })), + (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), + (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), (Err(lat), Err(lng)) => { - Ok(Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng })) + Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng })) } } } - (None, Some(_)) => Ok(Err(GeoError::MissingLatitude { document_id })), - (Some(_), None) => Ok(Err(GeoError::MissingLongitude { document_id })), - (None, None) => Ok(Err(GeoError::MissingLatitudeAndLongitude { document_id })), + (None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })), + (Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })), + (None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })), }, - value => Ok(Err(GeoError::NotAnObject { document_id, value })), + value => Ok(Err(NotAnObject { document_id: debug_id(), value })), } } diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 46ef9ba9b..5a6de236b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -29,9 +29,9 @@ pub fn extract_geo_points( let obkv = obkv::KvReader::new(value); // since we only needs the primary key when we throw an error we create this getter to // lazily get it when needed - let primary_key = || -> Value { - let primary_key = obkv.get(primary_key_id).unwrap(); - serde_json::from_slice(primary_key).unwrap() + let document_id = || -> Value { + let document_id = 
obkv.get(primary_key_id).unwrap(); + serde_json::from_slice(document_id).unwrap() }; // first we get the two fields @@ -43,19 +43,19 @@ pub fn extract_geo_points( let lat = extract_float_from_value( serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, ) - .map_err(|lat| GeoError::BadLatitude { document_id: primary_key(), value: lat })?; + .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; let lng = extract_float_from_value( serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, ) - .map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?; + .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else if lat.is_none() && lng.is_some() { - return Err(GeoError::MissingLatitude { document_id: primary_key() })?; + return Err(GeoError::MissingLatitude { document_id: document_id() })?; } else if lat.is_some() && lng.is_none() { - return Err(GeoError::MissingLongitude { document_id: primary_key() })?; + return Err(GeoError::MissingLongitude { document_id: document_id() })?; } } From 0bbcc7b1808061355202efe040c38f64de64ba50 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 14:41:19 +0200 Subject: [PATCH 1481/1889] Expose the `DocumentId` struct to be sure to inject the generated ids --- milli/src/documents/enriched.rs | 13 ++++++----- milli/src/documents/reader.rs | 12 +++++----- milli/src/error.rs | 2 +- milli/src/update/index_documents/enrich.rs | 22 ++++++++++++------- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 13 +++++++++-- milli/src/update/mod.rs | 2 +- 7 files changed, 41 insertions(+), 25 deletions(-) diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 918b47c95..4f45a891a 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -7,6 +7,7 @@ use super::{ DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, Error, }; +use crate::update::DocumentId; use crate::FieldId; /// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have @@ -66,10 +67,10 @@ impl EnrichedDocumentsBatchReader { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] pub struct EnrichedDocument<'a> { pub document: KvReader<'a, FieldId>, - pub external_id: &'a str, + pub document_id: DocumentId, } pub struct EnrichedDocumentsBatchCursor { @@ -110,13 +111,13 @@ impl EnrichedDocumentsBatchCursor { &mut self, ) -> Result, DocumentsBatchCursorError> { let document = self.documents.next_document()?; - let external_id = match self.external_ids.move_on_next()? { - Some((_, bytes)) => Some(str::from_utf8(bytes)?), + let document_id = match self.external_ids.move_on_next()? 
{ + Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?, None => None, }; - match document.zip(external_id) { - Some((document, external_id)) => Ok(Some(EnrichedDocument { document, external_id })), + match document.zip(document_id) { + Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })), None => Ok(None), } } diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 7bd6dbd51..70b8b0131 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::{error, fmt, io, str}; +use std::{error, fmt, io}; use obkv::KvReader; @@ -95,7 +95,7 @@ impl DocumentsBatchCursor { #[derive(Debug)] pub enum DocumentsBatchCursorError { Grenad(grenad::Error), - Utf8(str::Utf8Error), + SerdeJson(serde_json::Error), } impl From for DocumentsBatchCursorError { @@ -104,9 +104,9 @@ impl From for DocumentsBatchCursorError { } } -impl From for DocumentsBatchCursorError { - fn from(error: str::Utf8Error) -> DocumentsBatchCursorError { - DocumentsBatchCursorError::Utf8(error) +impl From for DocumentsBatchCursorError { + fn from(error: serde_json::Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError::SerdeJson(error) } } @@ -116,7 +116,7 @@ impl fmt::Display for DocumentsBatchCursorError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DocumentsBatchCursorError::Grenad(e) => e.fmt(f), - DocumentsBatchCursorError::Utf8(e) => e.fmt(f), + DocumentsBatchCursorError::SerdeJson(e) => e.fmt(f), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 0419ceeda..0abb41eec 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -217,7 +217,7 @@ impl From for Error { fn from(error: DocumentsBatchCursorError) -> Error { match error { DocumentsBatchCursorError::Grenad(e) => Error::from(e), - DocumentsBatchCursorError::Utf8(e) => Error::from(e), + DocumentsBatchCursorError::SerdeJson(e) => Error::from(InternalError::from(e)), } } } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 5d00565a8..1a0c31c24 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -2,6 +2,7 @@ use std::io::{Read, Seek}; use std::result::Result as StdResult; use std::{fmt, iter}; +use serde::{Deserialize, Serialize}; use serde_json::Value; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; @@ -89,14 +90,15 @@ pub fn enrich_documents_batch( Err(user_error) => return Ok(Err(user_error)), }; - external_ids.insert(count.to_be_bytes(), document_id.value())?; - if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { return Ok(Err(UserError::from(user_error))); } } + let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?; + external_ids.insert(count.to_be_bytes(), document_id)?; + count += 1; } @@ -210,7 +212,7 @@ impl PrimaryKey<'_> { /// /// In case the document id has been auto-generated, the document nth is kept to help /// users debug if there is an issue with the document itself. 
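The `Serialize`/`Deserialize` derive added just below is what lets the enrich step store this enum, rather than a bare string, in the grenad side file: the cursor's `serde_json::from_slice` then recovers whether an id was retrieved or generated. A small round-trip sketch mirroring the enum's shape (standalone, not milli code):

use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
enum DocId {
    Retrieved { value: String },
    Generated { value: String, document_nth: u32 },
}

fn main() -> Result<(), serde_json::Error> {
    let id = DocId::Generated { value: "9f8a".to_string(), document_nth: 3 };
    // Externally tagged JSON keeps the variant name:
    // {"Generated":{"value":"9f8a","document_nth":3}}
    let bytes = serde_json::to_vec(&id)?;
    let back: DocId = serde_json::from_slice(&bytes)?;
    assert_eq!(id, back);
    Ok(())
}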
-#[derive(Clone)] +#[derive(Serialize, Deserialize, Clone)] pub enum DocumentId { Retrieved { value: String }, Generated { value: String, document_nth: u32 }, @@ -225,16 +227,20 @@ impl DocumentId { DocumentId::Generated { value, document_nth } } - fn value(&self) -> &str { + fn debug(&self) -> String { + format!("{:?}", self) + } + + pub fn is_generated(&self) -> bool { + matches!(self, DocumentId::Generated { .. }) + } + + pub fn value(&self) -> &str { match self { DocumentId::Retrieved { value } => value, DocumentId::Generated { value, .. } => value, } } - - fn debug(&self) -> String { - format!("{:?}", self) - } } impl fmt::Debug for DocumentId { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index db1a768e6..615e1dfc7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -22,7 +22,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{ extract_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, + validate_geo_from_json, DocumentId, }; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e82556ec7..a34295a50 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -153,8 +153,9 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let primary_key = cursor.primary_key().to_string(); - self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let primary_key_id_nested = primary_key.contains('.'); + let primary_key_id = + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let mut flattened_document = None; let mut obkv_buffer = Vec::new(); @@ -162,7 +163,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some(enriched_document) = cursor.next_enriched_document()? { - let EnrichedDocument { document, external_id } = enriched_document; + let EnrichedDocument { document, document_id } = enriched_document; let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { @@ -171,6 +172,14 @@ impl<'a, 'i> Transform<'a, 'i> { }); } + // When the document id has been auto-generated by the `enrich_documents_batch` + // we must insert this document id into the remaped document. 
+ let external_id = document_id.value(); + if document_id.is_generated() { + let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(docid))); + } + for (k, v) in document.iter() { let mapped_id = *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 965ed4fd2..1bf27a5f0 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -3,7 +3,7 @@ pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facets::Facets; pub use self::index_documents::{ - DocumentAdditionResult, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, + DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; pub use self::settings::{Setting, Settings}; From 5d149d631f7736443d5c871a3442bfd99dbaeb48 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 30 Jun 2022 15:13:50 +0200 Subject: [PATCH 1482/1889] Remove tests for a function that no more exists --- milli/src/update/index_documents/transform.rs | 85 ------------------- 1 file changed, 85 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index a34295a50..6bf3dde43 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -729,88 +729,3 @@ impl TransformOutput { .collect()) } } - -// #[cfg(test)] -// mod test { -// use super::*; - -// mod compute_primary_key { -// use big_s::S; - -// use super::{compute_primary_key_pair, FieldsIdsMap}; - -// #[test] -// fn should_return_primary_key_if_is_some() { -// let mut fields_map = FieldsIdsMap::new(); -// fields_map.insert("toto").unwrap(); -// let result = compute_primary_key_pair( -// Some("toto"), -// &mut fields_map, -// Some("tata".to_string()), -// false, -// ); -// assert_eq!(result.unwrap(), (0, "toto".to_string())); -// assert_eq!(fields_map.len(), 1); - -// // and with nested fields -// let mut fields_map = FieldsIdsMap::new(); -// fields_map.insert("toto.tata").unwrap(); -// let result = compute_primary_key_pair( -// Some("toto.tata"), -// &mut fields_map, -// Some(S("titi")), -// false, -// ); -// assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_alternative_if_primary_is_none() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = -// compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); -// assert_eq!(result.unwrap(), (0, S("tata"))); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_default_if_both_are_none() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = compute_primary_key_pair(None, &mut fields_map, None, true); -// assert_eq!(result.unwrap(), (0, S("id"))); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_err_if_both_are_none_and_recompute_is_false() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = compute_primary_key_pair(None, &mut fields_map, None, false); -// assert!(result.is_err()); -// assert_eq!(fields_map.len(), 0); -// } -// } - -// mod primary_key_inference { -// use big_s::S; -// use bimap::BiHashMap; - -// use crate::documents::DocumentsBatchIndex; -// use 
crate::update::index_documents::transform::find_primary_key; - -// #[test] -// fn primary_key_infered_on_first_field() { -// // We run the test multiple times to change the order in which the fields are iterated upon. -// for _ in 1..50 { -// let mut map = BiHashMap::new(); -// map.insert(1, S("fakeId")); -// map.insert(2, S("fakeId")); -// map.insert(3, S("fakeId")); -// map.insert(4, S("fakeId")); -// map.insert(0, S("realId")); - -// assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); -// } -// } -// } -// } From 2eec290424b7508c3953be301e070028fdf900f7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 16:36:23 +0200 Subject: [PATCH 1483/1889] Check the validity of the latitute and longitude numbers --- milli/src/error.rs | 6 ++--- milli/src/update/index_documents/enrich.rs | 24 ++++++++++++++----- .../extract/extract_geo_points.rs | 6 ++--- milli/src/update/index_documents/mod.rs | 2 +- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 0abb41eec..80c923bd9 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -152,11 +152,11 @@ pub enum GeoError { MissingLatitude { document_id: Value }, #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, - #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{lat}` and `{lng}`.")] + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting finite numbers but instead got `{lat}` and `{lng}`.")] BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, - #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] + #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, - #[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] + #[error("Could not parse longitude in the document with the id: `{document_id}`. 
Was expecting a finite number but instead got `{value}`.")] BadLongitude { document_id: Value, value: Value }, } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 1a0c31c24..28318881b 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -326,11 +326,23 @@ pub fn validate_document_id_value(document_id: Value) -> Result StdResult { - match value { - Value::Number(ref n) => n.as_f64().ok_or(value), - Value::String(ref s) => s.parse::().map_err(|_| value), - value => Err(value), +pub fn extract_finite_float_from_value(value: Value) -> StdResult { + let number = match value { + Value::Number(ref n) => match n.as_f64() { + Some(number) => number, + None => return Err(value), + }, + Value::String(ref s) => match s.parse::() { + Ok(number) => number, + Err(_) => return Err(value), + }, + value => return Err(value), + }; + + if number.is_finite() { + Ok(number) + } else { + Err(value) } } @@ -340,7 +352,7 @@ pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { - match (extract_float_from_value(lat), extract_float_from_value(lng)) { + match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { (Ok(_), Ok(_)) => Ok(Ok(())), (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 5a6de236b..47085144a 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -6,7 +6,7 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::GeoError; -use crate::update::index_documents::extract_float_from_value; +use crate::update::index_documents::extract_finite_float_from_value; use crate::{FieldId, InternalError, Result}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. 
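The hunks that follow switch the extractor over to `extract_finite_float_from_value`. A quick illustration of why the `is_finite` guard matters: Rust's `f64` parser accepts strings like "NaN" and "inf", which would otherwise pass validation as coordinates:

fn parse_coordinate(value: &str) -> Result<f64, String> {
    let number: f64 = value.parse().map_err(|_| value.to_string())?;
    if number.is_finite() {
        Ok(number)
    } else {
        Err(value.to_string())
    }
}

fn main() {
    assert_eq!(parse_coordinate("12.5"), Ok(12.5));
    assert!(parse_coordinate("NaN").is_err()); // parses as NaN, rejected
    assert!(parse_coordinate("inf").is_err()); // parses as infinity, rejected
}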
@@ -40,12 +40,12 @@ pub fn extract_geo_points(
         if let Some((lat, lng)) = lat.zip(lng) {
             // then we extract the values
-            let lat = extract_float_from_value(
+            let lat = extract_finite_float_from_value(
                 serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
             )
             .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
 
-            let lng = extract_float_from_value(
+            let lng = extract_finite_float_from_value(
                 serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
             )
             .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 615e1dfc7..652c1e72b 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -21,7 +21,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
 use self::enrich::enrich_documents_batch;
 pub use self::enrich::{
-    extract_float_from_value, validate_document_id, validate_document_id_value,
+    extract_finite_float_from_value, validate_document_id, validate_document_id_value,
     validate_geo_from_json, DocumentId,
 };
 pub use self::helpers::{

From dc61105554954801cce37cc4cca32b1ef4814347 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 11 Jul 2022 17:44:08 +0200
Subject: [PATCH 1484/1889] Fix the nested document id fetching function

---
 milli/src/update/index_documents/enrich.rs | 4 +++-
 milli/src/update/index_documents/mod.rs    | 6 +++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs
index 28318881b..0298f0532 100644
--- a/milli/src/update/index_documents/enrich.rs
+++ b/milli/src/update/index_documents/enrich.rs
@@ -204,7 +204,9 @@ impl PrimaryKey<'_> {
     /// can have depending of the first level name and deepnes of the objects.
     fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
         let name = self.name();
-        iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL))
+        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
+            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
+            .chain(iter::once((name, "")))
     }
 }
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 652c1e72b..54599acce 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1198,7 +1198,7 @@ mod tests {
         let error = builder.execute().unwrap_err();
         assert_eq!(
             &error.to_string(),
-            r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `"lol"`."#
+            r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."#
         );
 
         let documents = documents!([
@@ -1212,7 +1212,7 @@ mod tests {
         let error = builder.execute().unwrap_err();
         assert_eq!(
             &error.to_string(),
-            r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `[12,13]`."#
+            r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."#
         );
 
         let documents = documents!([
@@ -1226,7 +1226,7 @@ mod tests {
         let error = builder.execute().unwrap_err();
         assert_eq!(
             &error.to_string(),
-            r#"Could not parse longitude in the document with the id: `0`. Was expecting a number but instead got `"hello"`."#
+            r#"Could not parse longitude in the document with the id: `0`. Was expecting a finite number but instead got `"hello"`."#
         );
     }
 
From a892a4a79c5933a2eedb04d6b46bd19d66229729 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 11 Jul 2022 18:38:50 +0200
Subject: [PATCH 1485/1889] Introduce a function to extend from a JSON array
 of objects

---
 benchmarks/benches/utils.rs       |  5 +-
 cli/src/main.rs                   |  5 +-
 http-ui/src/main.rs               |  5 +-
 milli/src/documents/builder.rs    |  9 ++++
 milli/src/documents/mod.rs        |  1 +
 milli/src/documents/serde_impl.rs | 76 +++++++++++++++++++++++++++++++
 6 files changed, 89 insertions(+), 12 deletions(-)
 create mode 100644 milli/src/documents/serde_impl.rs

diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs
index 630e17943..51178b43b 100644
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -164,11 +164,8 @@ fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
 
 fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> {
     let mut documents = DocumentsBatchBuilder::new(Vec::new());
-    let list: Vec<Object> = serde_json::from_reader(reader)?;
 
-    for object in list {
-        documents.append_json_object(&object)?;
-    }
+    documents.append_json_array(reader)?;
 
     documents.into_inner().map_err(Into::into)
 }
diff --git a/cli/src/main.rs b/cli/src/main.rs
index 0d197af17..35fef95c6 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -337,11 +337,8 @@ fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
 
 fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
     let mut documents = DocumentsBatchBuilder::new(Vec::new());
-    let list: Vec<Object> = serde_json::from_reader(reader)?;
 
-    for object in list {
-        documents.append_json_object(&object)?;
-    }
+    documents.append_json_array(reader)?;
 
     documents.into_inner().map_err(Into::into)
 }
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 117aa31e8..83fce9a9c 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -1042,11 +1042,8 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> {
 
 fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
     let mut documents = DocumentsBatchBuilder::new(Vec::new());
-    let list: Vec<Object> = serde_json::from_reader(reader)?;
 
-    for object in list {
-        documents.append_json_object(&object)?;
-    }
+    documents.append_json_array(reader)?;
 
     documents.into_inner().map_err(Into::into)
 }
diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs
index 589e52269..bb9d6aa68 100644
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@@ -1,9 +1,11 @@
 use std::io::{self, Write};
 
 use grenad::{CompressionType, WriterBuilder};
+use serde::de::Deserializer;
 use serde_json::{to_writer, Value};
 
 use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
+use crate::documents::serde_impl::DocumentVisitor;
 use crate::Object;
 
 /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
@@ -78,6 +80,13 @@ impl<W: Write> DocumentsBatchBuilder<W> {
         Ok(())
     }
 
+    /// Appends a new JSON array of objects into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_json_array<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
+        let mut de = serde_json::Deserializer::from_reader(reader);
+        let mut visitor = DocumentVisitor::new(self);
+        de.deserialize_any(&mut visitor)?
+    }
+
     /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly.
     pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> {
         // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
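As a usage sketch (the input bytes, the `Cursor` wrapper, and the `documents_count` accessor are illustrative assumptions here; the builder and reader are the types this patch touches), the new entry point replaces the buffered `Vec<Object>` round-trip with a single streaming call:

```rust
use std::io::Cursor;

use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let json = br#"[{ "id": 1, "title": "foo" }, { "id": 2, "title": "bar" }]"#;

    // The array is deserialized object by object straight into the batch,
    // instead of being collected into an intermediate `Vec<Object>` first.
    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    builder.append_json_array(Cursor::new(&json[..]))?;
    let vector = builder.into_inner()?;

    let reader = DocumentsBatchReader::from_reader(Cursor::new(vector))?;
    assert_eq!(reader.documents_count(), 2);
    Ok(())
}
```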
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index 43bfc1c20..c5ff7a120 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -1,6 +1,7 @@
 mod builder;
 mod enriched;
 mod reader;
+mod serde_impl;
 
 use std::fmt::{self, Debug};
 use std::io;
diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs
new file mode 100644
index 000000000..d4abdc844
--- /dev/null
+++ b/milli/src/documents/serde_impl.rs
@@ -0,0 +1,76 @@
+use std::fmt;
+use std::io::Write;
+
+use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
+
+use super::Error;
+use crate::documents::DocumentsBatchBuilder;
+use crate::Object;
+
+macro_rules! tri {
+    ($e:expr) => {
+        match $e {
+            Ok(r) => r,
+            Err(e) => return Ok(Err(e.into())),
+        }
+    };
+}
+
+pub struct DocumentVisitor<'a, W> {
+    inner: &'a mut DocumentsBatchBuilder<W>,
+    object: Object,
+}
+
+impl<'a, W> DocumentVisitor<'a, W> {
+    pub fn new(inner: &'a mut DocumentsBatchBuilder<W>) -> Self {
+        DocumentVisitor { inner, object: Object::new() }
+    }
+}
+
+impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
+    /// This Visitor value is nothing, since it writes the values directly into the underlying writer.
+    type Value = Result<(), Error>;
+
+    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+    where
+        A: SeqAccess<'de>,
+    {
+        while let Some(v) = seq.next_element_seed(&mut *self)? {
+            tri!(v)
+        }
+
+        Ok(Ok(()))
+    }
+
+    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
+    where
+        A: MapAccess<'de>,
+    {
+        self.object.clear();
+        while let Some((key, value)) = map.next_entry()? {
+            self.object.insert(key, value);
+        }
+
+        tri!(self.inner.append_json_object(&self.object));
+
+        Ok(Ok(()))
+    }
+
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "a document, or a sequence of documents.")
+    }
+}
+
+impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
+where
+    W: Write,
+{
+    type Value = Result<(), Error>;
+
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        deserializer.deserialize_map(self)
+    }
+}
From 192793ee38c7d819b5b542c8c7d0f1ae473d9619 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 12 Jul 2022 12:42:06 +0200
Subject: [PATCH 1486/1889] Add some tests to check for the nested documents
 ids

---
 milli/src/update/index_documents/enrich.rs |  9 ++--
 milli/src/update/index_documents/mod.rs    | 52 ++++++++++++++++++++++
 2 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs
index 0298f0532..d7ab89faa 100644
--- a/milli/src/update/index_documents/enrich.rs
+++ b/milli/src/update/index_documents/enrich.rs
@@ -257,12 +257,9 @@ impl fmt::Debug for DocumentId {
 }
 
 fn contained_in(selector: &str, key: &str) -> bool {
-    selector.starts_with(key)
-        && selector[key.len()..]
-            .chars()
-            .next()
-            .map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL)
-            .unwrap_or(true)
+    selector.strip_prefix(key).map_or(false, |tail| {
+        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
+    })
 }
 
 pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 54599acce..c9890f93f 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1627,6 +1627,58 @@ mod tests {
         assert_eq!(documents_ids, vec![3]);
     }
 
+    #[test]
+    fn retrieve_a_b_nested_document_id() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+        let config = IndexerConfig::default();
+
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = update::Settings::new(&mut wtxn, &index, &config);
+        builder.set_primary_key("a.b".to_owned());
+        builder.execute(|_| ()).unwrap();
+
+        let content = documents!({ "a" : { "b" : { "c" : 1 }}});
+        let indexing_config = IndexDocumentsConfig::default();
+        let builder =
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
+        let (_builder, user_error) = builder.add_documents(content).unwrap();
+
+        // There must be an issue with the primary key not being present in the given document
+        user_error.unwrap_err();
+    }
+
+    #[test]
+    fn retrieve_a_b_c_nested_document_id() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+        let config = IndexerConfig::default();
+
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = update::Settings::new(&mut wtxn, &index, &config);
+        builder.set_primary_key("a.b.c".to_owned());
+        builder.execute(|_| ()).unwrap();
+
+        let content = documents!({ "a" : { "b" : { "c" : 1 }}});
+        let indexing_config = IndexDocumentsConfig::default();
+        let builder =
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ())
+                .unwrap();
+        let (builder, user_error) = builder.add_documents(content).unwrap();
+        user_error.unwrap();
+        builder.execute().unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
+        assert!(external_documents_ids.get("1").is_some());
+    }
+
     #[test]
     fn test_facets_generation() {
         let path = tempfile::tempdir().unwrap();
From 25e768f31c32d8718132be124ab2e32c94c15f8f Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 12 Jul 2022 14:41:33 +0200
Subject: [PATCH 1487/1889] Fix another issue with the nested primary key
 selector

---
 milli/src/update/index_documents/enrich.rs | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs
index d7ab89faa..56f8fa4c0 100644
--- a/milli/src/update/index_documents/enrich.rs
+++ b/milli/src/update/index_documents/enrich.rs
@@ -256,7 +256,7 @@ impl fmt::Debug for DocumentId {
     }
 }
 
-fn contained_in(selector: &str, key: &str) -> bool {
+fn starts_with(selector: &str, key: &str) -> bool {
     selector.strip_prefix(key).map_or(false, |tail| {
         tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
     })
@@ -282,12 +282,7 @@ pub fn fetch_matching_values_in_object(
             format!("{}{}{}", base_key,
PRIMARY_KEY_SPLIT_SYMBOL, key) }; - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side. - let should_continue = - contained_in(selector, &base_key) || contained_in(&base_key, selector); - - if should_continue { + if starts_with(selector, &base_key) { match value { Value::Object(object) => { fetch_matching_values_in_object(object, selector, &base_key, output) From 448114cc1c1edd1781a830590f2c888dcdab775d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 15:22:09 +0200 Subject: [PATCH 1488/1889] Fix the benchmarks with the new indexation API --- benchmarks/benches/indexing.rs | 40 +++++++++++++--------- benchmarks/benches/utils.rs | 4 +-- milli/src/update/index_documents/enrich.rs | 6 ++-- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 80c7ba0ed..81b21b5ea 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -170,12 +170,13 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -185,12 +186,13 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -460,12 +462,13 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -476,12 +479,13 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -680,12 +684,13 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = 
IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -695,12 +700,13 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1079,12 +1085,13 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1095,12 +1102,13 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 51178b43b..fba05edbe 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,7 +1,7 @@ #![allow(dead_code)] use std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, BufReader, Cursor, Read, Seek}; +use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; @@ -138,7 +138,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { } } -pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader { +pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { let reader = File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); let reader = BufReader::new(reader); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 56f8fa4c0..51495c598 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -30,7 +30,7 @@ pub fn enrich_documents_batch( let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; - let mut uuid_buffer = 
[0; uuid::adapter::Hyphenated::LENGTH]; + let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. @@ -119,7 +119,7 @@ fn fetch_or_generate_document_id( documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, - uuid_buffer: &mut [u8; uuid::adapter::Hyphenated::LENGTH], + uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], count: u32, ) -> Result> { match primary_key { @@ -134,7 +134,7 @@ fn fetch_or_generate_document_id( } } None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); Ok(Ok(DocumentId::generated(uuid.to_string(), count))) } None => Ok(Err(UserError::MissingDocumentId { From 1da4ab5918ff7edd9a6531dd34a7985323d1b9e8 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 18 Jul 2022 10:18:03 +0200 Subject: [PATCH 1489/1889] Update milli/src/search/matches/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 46f87654f..f63852b52 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -325,7 +325,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { } } - // finally, keep the byte index of each bounds of the crop window. + // finally, keep the byte index of each bound of the crop window. let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); From e261ef64d7d4db66a4087fb3021cfa7bd60bdd0b Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 18 Jul 2022 10:18:51 +0200 Subject: [PATCH 1490/1889] Update milli/src/search/matches/mod.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index f63852b52..b08268657 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -102,7 +102,7 @@ pub struct MatchBounds { } /// Structure used to analize a string, compute words that match, -/// and format the source string returning an highlighted and croped sub-string. +/// and format the source string, returning a highlighted and cropped sub-string. 
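The uuid changes above track the crate's 0.8 → 1.x migration, which renamed the `adapter` module to `fmt` and `to_hyphenated()` to `as_hyphenated()`. A minimal standalone sketch of the buffer-based encoding pattern the indexer relies on (assuming the `uuid` crate with the `v4` feature enabled):

```rust
use uuid::Uuid;

fn main() {
    // Encode a fresh v4 UUID into a fixed stack buffer, the same pattern
    // used above for autogenerated document ids: no heap allocation, the
    // hyphenated form is written in place and borrowed as a &str.
    let mut buffer = [0; uuid::fmt::Hyphenated::LENGTH];
    let uuid: &str = Uuid::new_v4().as_hyphenated().encode_lower(&mut buffer);
    assert_eq!(uuid.len(), 36); // e.g. 67e55044-10b1-426f-9247-bb680e5fe0c8
}
```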
pub struct Matcher<'t, 'm, A> { text: &'t str, matching_words: &'m MatchingWords, From 8270e2b7681d07d15650a9eaa221fb627fd9f6ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 10:34:12 +0200 Subject: [PATCH 1491/1889] Fix name of "release_date" facet in movies benchmarks --- benchmarks/benches/indexing.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 3ae0a1a84..8c6deca50 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -617,7 +617,7 @@ fn indexing_movies_default(c: &mut Criterion) { move || { let primary_key = "id"; let searchable_fields = ["title", "overview"]; - let filterable_fields = ["released_date", "genres"]; + let filterable_fields = ["release_date", "genres"]; let sortable_fields = []; setup_index_with_settings( @@ -654,7 +654,7 @@ fn reindexing_movies_default(c: &mut Criterion) { move || { let primary_key = "id"; let searchable_fields = ["title", "overview"]; - let filterable_fields = ["released_date", "genres"]; + let filterable_fields = ["release_date", "genres"]; let sortable_fields = []; let index = setup_index_with_settings( @@ -705,7 +705,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { move || { let primary_key = "id"; let searchable_fields = ["title", "overview"]; - let filterable_fields = ["released_date", "genres"]; + let filterable_fields = ["release_date", "genres"]; let sortable_fields = []; let index = setup_index_with_settings( @@ -760,7 +760,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { move || { let primary_key = "id"; let searchable_fields = ["title", "overview"]; - let filterable_fields = ["released_date", "genres"]; + let filterable_fields = ["release_date", "genres"]; let sortable_fields = []; let index = setup_index_with_settings( From ab1571cdec615d025eeab8c924df74a5c485dceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 12:41:58 +0200 Subject: [PATCH 1492/1889] Simplify Transform::read_documents, enabled by enriched documents reader --- milli/src/update/index_documents/transform.rs | 97 +++---------------- 1 file changed, 11 insertions(+), 86 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 6bf3dde43..d03a803fd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -153,18 +153,18 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let primary_key = cursor.primary_key().to_string(); - let primary_key_id_nested = primary_key.contains('.'); let primary_key_id = self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; - let mut flattened_document = None; let mut obkv_buffer = Vec::new(); - let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; + let mut docid_buffer: Vec = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some(enriched_document) = cursor.next_enriched_document()? 
{ let EnrichedDocument { document, document_id } = enriched_document; + // drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer + // does not keep references from the cursor between loop iterations let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -176,8 +176,9 @@ impl<'a, 'i> Transform<'a, 'i> { // we must insert this document id into the remaped document. let external_id = document_id.value(); if document_id.is_generated() { - let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?; - field_buffer_cache.push((primary_key_id, Cow::from(docid))); + serde_json::to_writer(&mut docid_buffer, external_id) + .map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(&docid_buffer))); } for (k, v) in document.iter() { @@ -186,22 +187,6 @@ impl<'a, 'i> Transform<'a, 'i> { field_buffer_cache.push((mapped_id, Cow::from(v))); } - // We need to make sure that every document has a primary key. After we have remapped - // all the fields in the document, we try to find the primary key value. If we can find - // it, transform it into a string and validate it, and then update it in the - // document. If none is found, and we were told to generate missing document ids, then - // we create the missing field, and update the new document. - if primary_key_id_nested { - let mut field_buffer_cache = field_buffer_cache.clone(); - self.flatten_from_field_mapping( - &mapping, - &document, - &mut flattened_obkv_buffer, - &mut field_buffer_cache, - )?; - flattened_document = Some(&flattened_obkv_buffer); - }; - // Insertion in a obkv need to be done with keys ordered. For now they are ordered // according to the document addition key order, so we sort it according to the // fieldids map keys order. @@ -256,18 +241,12 @@ impl<'a, 'i> Transform<'a, 'i> { } // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; + self.original_sorter.insert(&docid.to_be_bytes(), &obkv_buffer)?; documents_count += 1; - if let Some(flatten) = flattened_document { - self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; - } else { - match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { - Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, - None => { - self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? - } - } + match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? 
{ + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => self.flattened_sorter.insert(docid.to_be_bytes(), &obkv_buffer)?, } progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -275,6 +254,7 @@ impl<'a, 'i> Transform<'a, 'i> { }); field_buffer = drop_and_reuse(field_buffer_cache); + docid_buffer.clear(); obkv_buffer.clear(); } @@ -345,61 +325,6 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(Some(buffer)) } - // Flatten a document from a field mapping generated by [create_fields_mapping] - fn flatten_from_field_mapping( - &mut self, - mapping: &HashMap, - obkv: &KvReader, - output_buffer: &mut Vec, - field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>, - ) -> Result<()> { - // store the keys and values of the json + the original obkv - let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new(); - - // if the primary_key is nested we need to flatten the document before being able to do anything - let mut doc = serde_json::Map::new(); - - // we recreate a json containing only the fields that needs to be flattened. - // all the raw values get inserted directly in the `key_value` vec. - for (key, value) in obkv.iter() { - if json_depth_checker::should_flatten_from_unchecked_slice(value) { - let key = - mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?; - let key = - self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: *key, - process: "Flatten from field mapping.", - })?; - let value = serde_json::from_slice::(value) - .map_err(InternalError::SerdeJson)?; - doc.insert(key.to_string(), value); - } else { - key_value.push((key, value.into())); - } - } - - let flattened = flatten_serde_json::flatten(&doc); - - // Once we have the flattened version we insert all the new generated fields_ids - // (if any) in the fields ids map and serialize the value. - for (key, value) in flattened.into_iter() { - let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; - let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; - key_value.push((fid, value.clone().into())); - - if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() { - field_buffer_cache.push((fid, value.into())); - } - } - - // we sort the key. If there was a conflict between the obkv and the new generated value the - // keys will be consecutive. - key_value.sort_unstable_by_key(|(key, _)| *key); - - Self::create_obkv_from_key_value(&mut key_value, output_buffer)?; - Ok(()) - } - /// Generate an obkv from a slice of key / value sorted by key. fn create_obkv_from_key_value( key_value: &mut [(FieldId, Cow<[u8]>)], From fc9f3f31e74edda92ad1beb31226e9cb100781e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 16:08:01 +0200 Subject: [PATCH 1493/1889] Change DocumentsBatchReader to access cursor and index at same time Otherwise it is not possible to iterate over all documents while using the fields index at the same time. 
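To see why returning the cursor and the fields index as a tuple helps, here is a hedged sketch of the call pattern it enables (the `name` accessor on `DocumentsBatchIndex` is assumed here for illustration). Because the index is owned separately from the cursor, the caller can borrow it immutably while advancing the cursor mutably, which the old `into_cursor()` forbade since the index lived inside the cursor:

```rust
use std::io::Cursor;

use milli::documents::DocumentsBatchReader;

// Iterate every document while resolving field names from the index
// that came out of the same reader.
fn print_documents(vector: Vec<u8>) -> Result<(), Box<dyn std::error::Error>> {
    let (mut cursor, fields_index) =
        DocumentsBatchReader::from_reader(Cursor::new(vector))?.into_cursor_and_fields_index();

    while let Some(document) = cursor.next_document()? {
        for (field_id, value) in document.iter() {
            // `fields_index` is a separate owned value, so this shared
            // borrow cannot conflict with `next_document`'s `&mut self`.
            let name = fields_index.name(field_id).expect("field id is in the index");
            println!("{}: {:?}", name, value);
        }
    }
    Ok(())
}
```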
--- milli/src/documents/builder.rs | 71 ++++++++++--------- milli/src/documents/enriched.rs | 25 ++----- milli/src/documents/mod.rs | 16 +++-- milli/src/documents/reader.rs | 20 ++---- milli/src/update/index_documents/enrich.rs | 9 +-- milli/src/update/index_documents/transform.rs | 6 +- 6 files changed, 65 insertions(+), 82 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index bb9d6aa68..dc027e1b7 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -216,9 +216,9 @@ mod test { assert_eq!(builder.documents_count(), 2); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert_eq!(index.len(), 3); let document = cursor.next_document().unwrap().unwrap(); @@ -240,9 +240,9 @@ mod test { assert_eq!(builder.documents_count(), 2); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert_eq!(index.len(), 2); let document = cursor.next_document().unwrap().unwrap(); @@ -264,9 +264,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -292,9 +292,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -319,9 +319,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -346,9 +346,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -373,9 
+373,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -400,9 +400,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -427,9 +427,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -454,9 +454,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -482,8 +482,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert!(cursor.next_document().is_err()); } @@ -498,9 +499,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 4f45a891a..fa21c0f87 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -56,14 +56,13 @@ impl EnrichedDocumentsBatchReader { } /// This method returns a forward cursor over the enriched documents. 
- pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { + pub fn into_cursor_and_fields_index( + self, + ) -> (EnrichedDocumentsBatchCursor, DocumentsBatchIndex) { let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self; + let (documents, fields_index) = documents.into_cursor_and_fields_index(); external_ids.reset(); - EnrichedDocumentsBatchCursor { - documents: documents.into_cursor(), - primary_key, - external_ids, - } + (EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index) } } @@ -80,23 +79,9 @@ pub struct EnrichedDocumentsBatchCursor { } impl EnrichedDocumentsBatchCursor { - pub fn into_reader(self) -> EnrichedDocumentsBatchReader { - let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self; - EnrichedDocumentsBatchReader { - documents: documents.into_reader(), - primary_key, - external_ids, - } - } - pub fn primary_key(&self) -> &str { &self.primary_key } - - pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { - self.documents.documents_batch_index() - } - /// Resets the cursor to be able to read from the start again. pub fn reset(&mut self) { self.documents.reset(); diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index c5ff7a120..e766e29cf 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -203,10 +203,11 @@ mod test { builder.append_json_object(value.as_object().unwrap()).unwrap(); let vector = builder.into_inner().unwrap(); - let mut documents = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); - assert_eq!(documents.documents_batch_index().iter().count(), 5); + assert_eq!(index.iter().count(), 5); let reader = documents.next_document().unwrap().unwrap(); assert_eq!(reader.iter().count(), 5); assert!(documents.next_document().unwrap().is_none()); @@ -226,9 +227,10 @@ mod test { builder.append_json_object(doc2.as_object().unwrap()).unwrap(); let vector = builder.into_inner().unwrap(); - let mut documents = - DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); - assert_eq!(documents.documents_batch_index().iter().count(), 2); + let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + assert_eq!(index.iter().count(), 2); let reader = documents.next_document().unwrap().unwrap(); assert_eq!(reader.iter().count(), 1); assert!(documents.next_document().unwrap().is_some()); @@ -243,7 +245,7 @@ mod test { } }]); - let mut cursor = docs_reader.into_cursor(); + let (mut cursor, _) = docs_reader.into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); assert_eq!(nested, json!({ "toto": ["hello"] })); diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 70b8b0131..a8a4c662d 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -17,6 +17,10 @@ pub struct DocumentsBatchReader { } impl DocumentsBatchReader { + pub fn new(cursor: DocumentsBatchCursor, fields_index: DocumentsBatchIndex) -> Self { + Self { cursor: cursor.cursor, fields_index } + } + /// Construct a `DocumentsReader` from a reader. /// /// It first retrieves the index, then moves to the first document. 
Use the `into_cursor`
@@ -46,30 +50,20 @@ impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
     }
 
     /// This method returns a forward cursor over the documents.
-    pub fn into_cursor(self) -> DocumentsBatchCursor<R> {
+    pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor<R>, DocumentsBatchIndex) {
         let DocumentsBatchReader { cursor, fields_index } = self;
-        let mut cursor = DocumentsBatchCursor { cursor, fields_index };
+        let mut cursor = DocumentsBatchCursor { cursor };
         cursor.reset();
-        cursor
+        (cursor, fields_index)
     }
 }
 
 /// A forward cursor over the documents in a `DocumentsBatchReader`.
 pub struct DocumentsBatchCursor<R> {
     cursor: grenad::ReaderCursor<R>,
-    fields_index: DocumentsBatchIndex,
 }
 
 impl<R> DocumentsBatchCursor<R> {
-    pub fn into_reader(self) -> DocumentsBatchReader<R> {
-        let DocumentsBatchCursor { cursor, fields_index, .. } = self;
-        DocumentsBatchReader { cursor, fields_index }
-    }
-
-    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
-        &self.fields_index
-    }
-
     /// Resets the cursor to be able to read from the start again.
     pub fn reset(&mut self) {
         self.cursor.reset();
diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs
index 51495c598..7c9a016d8 100644
--- a/milli/src/update/index_documents/enrich.rs
+++ b/milli/src/update/index_documents/enrich.rs
@@ -27,8 +27,8 @@ pub fn enrich_documents_batch(
     autogenerate_docids: bool,
     reader: DocumentsBatchReader<R>,
 ) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
-    let mut cursor = reader.into_cursor();
-    let mut documents_batch_index = cursor.documents_batch_index().clone();
+    let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
+
     let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
     let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
 
@@ -103,9 +103,10 @@ pub fn enrich_documents_batch(
     }
 
     let external_ids = writer_into_reader(external_ids)?;
+    let primary_key_name = primary_key.name().to_string();
     let reader = EnrichedDocumentsBatchReader::new(
-        cursor.into_reader(),
-        primary_key.name().to_string(),
+        DocumentsBatchReader::new(cursor, documents_batch_index),
+        primary_key_name,
         external_ids,
     )?;
 
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index d03a803fd..0de90924a 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -146,11 +146,11 @@ impl<'a, 'i> Transform<'a, 'i> {
         R: Read + Seek,
         F: Fn(UpdateIndexingStep) + Sync,
     {
-        let mut cursor = reader.into_cursor();
-        let fields_index = cursor.documents_batch_index();
+        let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
+
         let external_documents_ids = self.index.external_documents_ids(wtxn)?;
-        let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
+        let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
 
         let primary_key = cursor.primary_key().to_string();
         let primary_key_id =
             self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
From d7fd5c58cdc7daf3893cce4591869cc0a2042dde Mon Sep 17 00:00:00 2001
From: Many the fish
Date: Mon, 18 Jul 2022 17:45:06 +0200
Subject: [PATCH 1494/1889] Update milli/src/search/matches/mod.rs

---
 milli/src/search/matches/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index b08268657..2b47c2c32 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -257,7 +257,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
         // an iterator starting from the last
match token position and going trew the end of the text. let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); - // gross the crop window peeking in both drections + // grows the crop window peeking in both directions // until the window contains the good number of words: while remaining_words > 0 { let before_token = before_tokens.peek().map(|t| t.separator_kind()); From 1237cfc249a0c007f3a3280a765d8fdf366597dc Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 18 Jul 2022 17:45:37 +0200 Subject: [PATCH 1495/1889] Update milli/src/search/matches/mod.rs --- milli/src/search/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 2b47c2c32..89ab0064f 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -116,7 +116,7 @@ pub struct Matcher<'t, 'm, A> { impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { /// Iterates over tokens and save any of them that matches the query. fn compute_matches(&mut self) -> &mut Self { - /// some word are counted as matches only if there are close together and in the good order, + /// some words are counted as matches only if they are close together and in the good order, /// compute_partial_match peek into next words to validate if the match is complete. fn compute_partial_match<'a>( mut partial: PartialMatch, From fb794c6b5ec84e3aaeb4fba0c382b5b21c4e201c Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 18 Jul 2022 17:46:00 +0200 Subject: [PATCH 1496/1889] Update milli/src/search/matches/mod.rs --- milli/src/search/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 89ab0064f..8df93daa1 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -419,7 +419,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { } else { match &self.matches { Some((tokens, matches)) => { - // If the text have to be croped, + // If the text has to be cropped, // compute the best interval to crop around. let matches = match format_options.crop { Some(crop_size) if crop_size > 0 => { From a277daa1f2a6cc8f7a12f9bc9c7dfcabc79a16a8 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 18 Jul 2022 17:47:13 +0200 Subject: [PATCH 1497/1889] Update milli/src/search/matches/mod.rs --- milli/src/search/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 8df93daa1..649078f7c 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -252,7 +252,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going trew the beginning of the text, + // a reverse iterator starting from the first match token position and going towards the beginning of the text, let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); // an iterator starting from the last match token position and going trew the end of the text. 
let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); From 8ddb4e750b99f65610c4d49a7c7ef03b298a67e8 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 18 Jul 2022 17:47:39 +0200 Subject: [PATCH 1498/1889] Update milli/src/search/matches/mod.rs --- milli/src/search/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 649078f7c..fe35b2a0a 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -254,7 +254,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { // create the initial state of the crop window: 2 iterators starting from the matches positions, // a reverse iterator starting from the first match token position and going towards the beginning of the text, let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); - // an iterator starting from the last match token position and going trew the end of the text. + // an iterator starting from the last match token position and going towards the end of the text. let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); // grows the crop window peeking in both directions From 2d79720f5da563ecf3ff3f585576209132f93b9a Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 18 Jul 2022 17:48:04 +0200 Subject: [PATCH 1499/1889] Update milli/src/search/matches/mod.rs --- milli/src/search/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index fe35b2a0a..72592c4cb 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -428,7 +428,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { _ => matches, }; - // If the text have to be croped, + // If the text has to be cropped, // crop around the best interval. 
                    let (byte_start, byte_end) = match format_options.crop {
                        Some(crop_size) if crop_size > 0 => {
From 453d593ce804eac35af4f7abd1b3a88c218941d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 19 Jul 2022 09:30:19 +0200
Subject: [PATCH 1500/1889] Add a database containing the docids where each
 field exists

---
 infos/src/main.rs                             |  15 ++
 milli/src/heed_codec/facet/mod.rs             |  40 +++++
 milli/src/index.rs                            |  24 ++-
 milli/src/update/clear_documents.rs           |   2 +
 milli/src/update/delete_documents.rs          |  11 +-
 .../extract/extract_facet_exists_docids.rs    |  42 +++++
 .../extract/extract_fid_docid_facet_values.rs |  29 +++-
 .../src/update/index_documents/extract/mod.rs |  47 ++++--
 milli/src/update/index_documents/mod.rs       | 149 ++++++++++++++++++
 .../src/update/index_documents/typed_chunk.rs |  13 ++
 10 files changed, 350 insertions(+), 22 deletions(-)
 create mode 100644 milli/src/update/index_documents/extract/extract_facet_exists_docids.rs

diff --git a/infos/src/main.rs b/infos/src/main.rs
index 29a87cdcf..89aec6182 100644
--- a/infos/src/main.rs
+++ b/infos/src/main.rs
@@ -384,6 +384,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
         field_id_word_count_docids,
         facet_id_f64_docids,
         facet_id_string_docids,
+        facet_id_exists_docids,
         exact_word_docids,
         exact_word_prefix_docids,
         field_id_docid_facet_f64s: _,
@@ -402,6 +403,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
     let field_id_word_count_docids_name = "field_id_word_count_docids";
     let facet_id_f64_docids_name = "facet_id_f64_docids";
     let facet_id_string_docids_name = "facet_id_string_docids";
+    let facet_id_exists_docids_name = "facet_id_exists_docids";
     let documents_name = "documents";
 
     let mut heap = BinaryHeap::with_capacity(limit + 1);
@@ -544,6 +546,17 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> {
                 heap.pop();
             }
         }
+
+        // List the docids where the facet exists
+        let db = facet_id_exists_docids.remap_data_type::<ByteSlice>();
+        for result in facet_values_iter(rtxn, db, facet_id)? {
+            let (_fid, value) = result?;
+            let key = format!("{}", facet_name);
+            heap.push(Reverse((value.len(), key, facet_id_exists_docids_name)));
+            if heap.len() > limit {
+                heap.pop();
+            }
+        }
     }
 
     for result in index.all_documents(rtxn)? {
@@ -984,6 +997,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> {
         facet_id_string_docids,
         field_id_docid_facet_f64s,
         field_id_docid_facet_strings,
+        facet_id_exists_docids,
         exact_word_prefix_docids,
         exact_word_docids,
         ..
@@ -1007,6 +1021,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> {
             FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(),
             FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(),
             FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(),
+            FACET_ID_EXISTS_DOCIDS => facet_id_exists_docids.as_polymorph(),
             FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(),
             FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(),
             EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(),
diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs
index e93fb57b9..79dbffa1d 100644
--- a/milli/src/heed_codec/facet/mod.rs
+++ b/milli/src/heed_codec/facet/mod.rs
@@ -25,3 +25,43 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
         None
     }
 }
+
+use crate::{try_split_array_at, DocumentId, FieldId};
+use std::borrow::Cow;
+use std::convert::TryInto;
+
+pub struct FieldIdCodec;
+
+impl<'a> heed::BytesDecode<'a> for FieldIdCodec {
+    type DItem = FieldId;
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (field_id_bytes, _) = try_split_array_at(bytes)?;
+        let field_id = u16::from_be_bytes(field_id_bytes);
+        Some(field_id)
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for FieldIdCodec {
+    type EItem = FieldId;
+
+    fn bytes_encode(field_id: &Self::EItem) -> Option<Cow<[u8]>> {
+        Some(Cow::Owned(field_id.to_be_bytes().to_vec()))
+    }
+}
+
+pub struct FieldIdDocIdCodec;
+
+impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec {
+    type DItem = (FieldId, DocumentId);
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
+        let field_id = u16::from_be_bytes(field_id_bytes);
+
+        let document_id_bytes = bytes[..4].try_into().ok()?;
+        let document_id = u32::from_be_bytes(document_id_bytes);
+
+        Some((field_id, document_id))
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 9637b4103..816112178 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -15,7 +15,7 @@ use crate::error::{InternalError, UserError};
 use crate::fields_ids_map::FieldsIdsMap;
 use crate::heed_codec::facet::{
     FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
-    FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
+    FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec,
 };
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
@@ -75,6 +75,7 @@ pub mod db_name {
     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
     pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
     pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
+    pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
     pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
@@ -116,6 +117,9 @@ pub struct Index {
     /// Maps the position of a word prefix with all the docids where this prefix appears.
     pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
 
+    /// Maps the facet field id and the docids for which this field exists
+    pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
+
     /// Maps the facet field id, level and the number with the docids that corresponds to it.
     pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
     /// Maps the facet field id and the string with the original string and docids that corresponds to it.
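A quick aside on the codec design above: the key is just the field id as two big-endian bytes, so LMDB's lexicographic byte order coincides with numeric order on the field ids. A minimal sketch of that round-trip, with plain functions standing in for the heed traits:

```rust
use std::convert::TryInto;

// Encode a field id the way `FieldIdCodec` does: two big-endian bytes.
fn encode(field_id: u16) -> [u8; 2] {
    field_id.to_be_bytes()
}

fn decode(bytes: &[u8]) -> Option<u16> {
    let array: [u8; 2] = bytes.get(..2)?.try_into().ok()?;
    Some(u16::from_be_bytes(array))
}

fn main() {
    assert_eq!(decode(&encode(42)), Some(42));
    // Big-endian keeps byte-wise ordering consistent with numeric ordering,
    // which is what keeps the LMDB keys sorted by field id.
    assert!(encode(1) < encode(256));
    assert!(decode(&[0x01]).is_none()); // too short to hold a field id
}
```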
@@ -134,7 +138,7 @@ impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(16);
+        options.max_dbs(17);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
         let env = options.open(path)?;
@@ -152,6 +156,9 @@ impl Index {
         let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
         let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
         let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
+        let facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec> =
+            env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
+
         let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?;
         let field_id_docid_facet_strings =
             env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?;
@@ -174,6 +181,7 @@ impl Index {
             field_id_word_count_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
+            facet_id_exists_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
             documents,
@@ -806,6 +814,18 @@ impl Index {
         }
     }
 
+    /// Retrieve all the documents which contain this field id
+    pub fn exists_faceted_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        field_id: FieldId,
+    ) -> heed::Result<RoaringBitmap> {
+        match self.facet_id_exists_docids.get(rtxn, &field_id)? {
+            Some(docids) => Ok(docids),
+            None => Ok(RoaringBitmap::new()),
+        }
+    }
+
     /* distinct field */
 
     pub(crate) fn put_distinct_field(
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index d1939df7b..db438d019 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_prefix_position_docids,
         facet_id_f64_docids,
         facet_id_string_docids,
+        facet_id_exists_docids,
         field_id_docid_facet_f64s,
         field_id_docid_facet_strings,
         documents,
@@ -69,6 +70,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         field_id_word_count_docids.clear(self.wtxn)?;
         word_prefix_position_docids.clear(self.wtxn)?;
         facet_id_f64_docids.clear(self.wtxn)?;
+        facet_id_exists_docids.clear(self.wtxn)?;
         facet_id_string_docids.clear(self.wtxn)?;
         field_id_docid_facet_f64s.clear(self.wtxn)?;
         field_id_docid_facet_strings.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 3b519c101..6dfdb9a7c 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -170,6 +170,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             word_position_docids,
             word_prefix_position_docids,
             facet_id_f64_docids,
+            facet_id_exists_docids,
             facet_id_string_docids,
             field_id_docid_facet_f64s,
             field_id_docid_facet_strings,
@@ -424,11 +425,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         }
 
         // We delete the documents ids that are under the facet field id values.
-        remove_docids_from_facet_field_id_number_docids(
+        remove_docids_from_facet_field_id_docids(
             self.wtxn,
             facet_id_f64_docids,
             &self.to_delete_docids,
         )?;
+        // We delete the documents ids that are under the facet field id values.
+        remove_docids_from_facet_field_id_docids(
+            self.wtxn,
+            facet_id_exists_docids,
+            &self.to_delete_docids,
+        )?;
 
         remove_docids_from_facet_field_id_string_docids(
             self.wtxn,
@@ -618,7 +625,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>(
     Ok(())
 }
 
-fn remove_docids_from_facet_field_id_number_docids<'a, C>(
+fn remove_docids_from_facet_field_id_docids<'a, C>(
     wtxn: &'a mut heed::RwTxn,
     db: &heed::Database<C, CboRoaringBitmapCodec>,
     to_remove: &RoaringBitmap,
diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs
new file mode 100644
index 000000000..e7a001c08
--- /dev/null
+++ b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs
@@ -0,0 +1,42 @@
+use std::fs::File;
+use std::io;
+
+use heed::{BytesDecode, BytesEncode};
+
+use super::helpers::{
+    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
+};
+use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec};
+use crate::Result;
+
+/// Extracts the documents ids where this field appears.
+///
+/// Returns a grenad reader whose key is the field id encoded
+/// with `FieldIdCodec` and the value is a document_id (u32)
+/// encoded as native-endian bytes.
+#[logging_timer::time]
+pub fn extract_facet_exists_docids<R: io::Read + io::Seek>(
+    docid_fid_facet_number: grenad::Reader<R>,
+    indexer: GrenadParameters,
+) -> Result<grenad::Reader<File>> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut facet_exists_docids_sorter = create_sorter(
+        merge_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory,
+    );
+
+    let mut cursor = docid_fid_facet_number.into_cursor()?;
+    while let Some((key_bytes, _)) = cursor.move_on_next()?
{ + let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); + + let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); + + facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + } + + sorter_into_reader(facet_exists_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 628636f78..d93bde500 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -20,7 +20,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -28,7 +28,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 2), + max_memory.map(|m| m / 3), ); let mut fid_docid_facet_strings_sorter = create_sorter( @@ -36,7 +36,15 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 2), + max_memory.map(|m| m / 3), + ); + + let mut fid_docid_facet_exists_sorter = create_sorter( + keep_first, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 3), ); let mut key_buffer = Vec::new(); @@ -46,15 +54,19 @@ pub fn extract_fid_docid_facet_values( for (field_id, field_bytes) in obkv.iter() { if faceted_fields.contains(&field_id) { - let value = - serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - let (numbers, strings) = extract_facet_values(&value); - key_buffer.clear(); + // here, we know already that the document must be added to the “field id exists” database // prefix key with the field_id and the document_id + key_buffer.extend_from_slice(&field_id.to_be_bytes()); key_buffer.extend_from_slice(&docid_bytes); + fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?; + + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + + let (numbers, strings) = extract_facet_values(&value); // insert facet numbers in sorter for number in numbers { @@ -79,7 +91,8 @@ pub fn extract_fid_docid_facet_values( Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, - sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, + sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c3c2033a6..7d26e0984 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -1,4 +1,5 @@ mod extract_docid_word_positions; +mod extract_facet_exists_docids; mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; @@ -16,6 +17,7 @@ use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; +use self::extract_facet_exists_docids::extract_facet_exists_docids; use 
self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; @@ -53,7 +55,7 @@ pub(crate) fn data_from_obkv_documents( }) .collect::>()?; - let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks .par_bridge() .map(|flattened_obkv_chunks| { send_and_extract_flattened_documents_data( @@ -72,7 +74,10 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), + ( + docid_fid_facet_numbers_chunks, + (docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks), + ), ) = result?; spawn_extraction_task::<_, _, Vec>>( @@ -137,6 +142,15 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_fid_facet_exists_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_facet_exists_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdFacetExistsDocids, + "field-id-facet-exists-docids", + ); Ok(()) } @@ -197,6 +211,7 @@ fn send_original_documents_data( /// - docid_word_positions /// - docid_fid_facet_numbers /// - docid_fid_facet_strings +/// - docid_fid_facet_exists fn send_and_extract_flattened_documents_data( flattened_documents_chunk: Result>, indexer: GrenadParameters, @@ -209,7 +224,10 @@ fn send_and_extract_flattened_documents_data( max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, - (grenad::Reader, grenad::Reader), + ( + grenad::Reader, + (grenad::Reader, grenad::Reader), + ), )> { let flattened_documents_chunk = flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -250,12 +268,15 @@ fn send_and_extract_flattened_documents_data( Ok(docid_word_positions_chunk) }, || { - let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = - extract_fid_docid_facet_values( - flattened_documents_chunk.clone(), - indexer.clone(), - faceted_fields, - )?; + let ( + docid_fid_facet_numbers_chunk, + docid_fid_facet_strings_chunk, + docid_fid_facet_exists_chunk, + ) = extract_fid_docid_facet_values( + flattened_documents_chunk.clone(), + indexer.clone(), + faceted_fields, + )?; // send docid_fid_facet_numbers_chunk to DB writer let docid_fid_facet_numbers_chunk = @@ -273,7 +294,13 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_strings_chunk.clone(), ))); - Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) + let docid_fid_facet_exists_chunk = + unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? 
}; + + Ok(( + docid_fid_facet_numbers_chunk, + (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), + )) }, ); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba428f078..82457762e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1931,4 +1931,153 @@ mod tests { assert_eq!(ids.len(), map.len()); } + + #[test] + fn index_documents_check_exists_database_reindex() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + } + ]); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("colour")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + } + + #[test] + fn index_documents_check_exists_database() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("colour")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let content = documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + } + ]); + + let indexing_config = IndexDocumentsConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + 
assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 26b97c3a0..e501e5efd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), + FieldIdFacetExistsDocids(grenad::Reader), GeoPoints(grenad::Reader), } @@ -146,6 +147,18 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => { + append_entries_into_database( + facet_id_exists_docids_iter, + &index.facet_id_exists_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + ) + .unwrap(); + is_merged_database = true; + } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( word_pair_proximity_docids_iter, From a8641b42a7963666eedb9846b6aa69481a3f92ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 15:05:49 +0200 Subject: [PATCH 1501/1889] Modify flatten_serde_json to keep dummy value for all object keys Example: ```json { "id": 0, "colour" : { "green": 1 } } ``` becomes: ```json { "id": 0, "colour" : [], "colour.green": 1 } ``` to retain the information the key "colour" exists in the original json value. 
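For reference, a minimal sketch of the new behaviour as seen from outside the
crate (not part of this patch; it assumes the public `flatten` function is
reachable as `flatten_serde_json::flatten`):

```rust
use serde_json::json;

fn main() {
    let value = json!({ "id": 0, "colour": { "green": 1 } });
    let flat = flatten_serde_json::flatten(value.as_object().unwrap());

    // The parent key "colour" is kept with a dummy `[]` value,
    // alongside the flattened "colour.green" key.
    assert_eq!(
        &flat,
        json!({ "id": 0, "colour": [], "colour.green": 1 }).as_object().unwrap()
    );
}
```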
--- flatten-serde-json/src/lib.rs | 53 ++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs index 8312f5bd6..e1b2b20c7 100644 --- a/flatten-serde-json/src/lib.rs +++ b/flatten-serde-json/src/lib.rs @@ -4,7 +4,11 @@ use serde_json::{Map, Value}; pub fn flatten(json: &Map) -> Map { let mut obj = Map::new(); - insert_object(&mut obj, None, json); + let mut all_keys = vec![]; + insert_object(&mut obj, None, json, &mut all_keys); + for key in all_keys { + obj.entry(key).or_insert(Value::Array(vec![])); + } obj } @@ -12,26 +16,32 @@ fn insert_object( base_json: &mut Map, base_key: Option<&str>, object: &Map, + all_keys: &mut Vec, ) { for (key, value) in object { let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); - + all_keys.push(new_key.clone()); if let Some(array) = value.as_array() { - insert_array(base_json, &new_key, array); + insert_array(base_json, &new_key, array, all_keys); } else if let Some(object) = value.as_object() { - insert_object(base_json, Some(&new_key), object); + insert_object(base_json, Some(&new_key), object, all_keys); } else { insert_value(base_json, &new_key, value.clone()); } } } -fn insert_array(base_json: &mut Map, base_key: &str, array: &Vec) { +fn insert_array( + base_json: &mut Map, + base_key: &str, + array: &Vec, + all_keys: &mut Vec, +) { for value in array { if let Some(object) = value.as_object() { - insert_object(base_json, Some(base_key), object); + insert_object(base_json, Some(base_key), object, all_keys); } else if let Some(sub_array) = value.as_array() { - insert_array(base_json, base_key, sub_array); + insert_array(base_json, base_key, sub_array, all_keys); } else { insert_value(base_json, base_key, value.clone()); } @@ -103,6 +113,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [], "a.b": "c", "a.d": "e", "a.f": "g" @@ -116,6 +127,10 @@ mod tests { fn flatten_array() { let mut base: Value = json!({ "a": [ + 1, + "b", + [], + [{}], { "b": "c" }, { "b": "d" }, { "b": "e" }, @@ -127,6 +142,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [1, "b"], "a.b": ["c", "d", "e"], }) .as_object() @@ -154,6 +170,28 @@ mod tests { .as_object() .unwrap() ); + + // here we must keep 42 in "a" + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + null, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": null, + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); } #[test] @@ -170,6 +208,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [], "a.b": ["c", "d"], }) .as_object() From 72452f0cb270d5225e83b31ec4d707845afd1eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 25 May 2022 11:55:16 +0200 Subject: [PATCH 1502/1889] Implements the EXIST filter operator --- filter-parser/src/condition.rs | 13 ++++++++++++- filter-parser/src/error.rs | 4 ++-- filter-parser/src/lib.rs | 27 ++++++++++++++++++++++----- milli/src/search/facet/filter.rs | 19 +++++++++++++++++++ 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 264787055..0ece99a0d 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -8,7 +8,7 @@ use nom::branch::alt; use nom::bytes::complete::tag; use nom::combinator::cut; -use nom::sequence::tuple; +use nom::sequence::{terminated, tuple}; use 
Condition::*; use crate::{parse_value, FilterCondition, IResult, Span, Token}; @@ -19,6 +19,8 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), + Exist, + NotExist, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, @@ -33,6 +35,8 @@ impl<'a> Condition<'a> { GreaterThanOrEqual(n) => (LowerThan(n), None), Equal(s) => (NotEqual(s), None), NotEqual(s) => (Equal(s), None), + Exist => (NotExist, None), + NotExist => (Exist, None), LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), @@ -58,6 +62,13 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } +/// exist = value EXIST +pub fn parse_exist(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXIST"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key.into(), op: Exist })) +} + /// to = value value TO value pub fn parse_to(input: Span) -> IResult { let (input, (key, from, _, to)) = diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index ddf7bea47..8136732c8 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -128,10 +128,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 243d1a3f4..ee4edc122 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -6,8 +6,9 @@ //! or = and (~ "OR" ~ and) //! and = not (~ "AND" not)* //! not = ("NOT" ~ not) | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to +//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | exist | to //! condition = value ("==" | ">" ...) value +//! exist = value EXIST //! to = value value TO value //! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* //! 
singleQuoted = "'" .* all but quotes "'" @@ -42,6 +43,7 @@ mod value; use std::fmt::Debug; use std::str::FromStr; +use condition::parse_exist; pub use condition::{parse_condition, parse_to, Condition}; use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; @@ -248,6 +250,7 @@ fn parse_primary(input: Span) -> IResult { ), parse_geo_radius, parse_condition, + parse_exist, parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, @@ -420,6 +423,20 @@ pub mod tests { op: Condition::LowerThan(rtok("NOT subscribers >= ", "1000")), }, ), + ( + "subscribers EXIST", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::Exist, + }, + ), + ( + "NOT subscribers EXIST", + Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::NotExist, + }, + ), ( "subscribers 100 TO 1000", Fc::Condition { @@ -577,10 +594,10 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `AND`."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d89413f62..a5c13ec2a 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -280,6 +280,25 @@ impl<'a> Filter<'a> { Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), + Condition::Exist => { + let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; + return Ok(exist); + } + Condition::NotExist => { + let all_ids = index.documents_ids(rtxn)?; + + let exist = Self::evaluate_operator( + rtxn, + index, + numbers_db, + strings_db, + field_id, + &Condition::Exist, + )?; + + let notexist = all_ids - exist; + return Ok(notexist); + } Condition::Equal(val) => { let (_original_value, string_docids) = strings_db .get(rtxn, &(field_id, &val.value().to_lowercase()))? From dc64170a69e0be95b2c9e5bb96e1a5be0b58ec06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 16:42:09 +0200 Subject: [PATCH 1503/1889] =?UTF-8?q?Improve=20syntax=20of=20EXISTS=20filt?= =?UTF-8?q?er,=20allow=20=E2=80=9Cvalue=20NOT=20EXISTS=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- filter-parser/src/condition.rs | 22 ++++++++++------ filter-parser/src/error.rs | 4 +-- filter-parser/src/lib.rs | 45 +++++++++++++++++++------------- milli/src/search/facet/filter.rs | 6 ++--- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 0ece99a0d..b57d68b75 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -19,8 +19,8 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), - Exist, - NotExist, + Exists, + NotExists, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, @@ -35,8 +35,8 @@ impl<'a> Condition<'a> { GreaterThanOrEqual(n) => (LowerThan(n), None), Equal(s) => (NotEqual(s), None), NotEqual(s) => (Equal(s), None), - Exist => (NotExist, None), - NotExist => (Exist, None), + Exists => (NotExists, None), + NotExists => (Exists, None), LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), @@ -62,11 +62,17 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// exist = value EXIST -pub fn parse_exist(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("EXIST"))(input)?; +/// exist = value NOT EXISTS +pub fn parse_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; - Ok((input, FilterCondition::Condition { fid: key.into(), op: Exist })) + Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) +} +/// exist = value NOT EXISTS +pub fn parse_not_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("NOT EXISTS"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) } /// to = value value TO value diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 8136732c8..a3720f7bf 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ 
-128,10 +128,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index ee4edc122..69215798c 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -3,23 +3,24 @@ //! ```text //! filter = expression ~ EOF //! expression = or -//! or = and (~ "OR" ~ and) -//! and = not (~ "AND" not)* -//! not = ("NOT" ~ not) | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | exist | to +//! or = and ("OR" and) +//! and = not ("AND" not)* +//! not = ("NOT" not) | primary +//! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to //! condition = value ("==" | ">" ...) value -//! exist = value EXIST +//! exists = value EXISTS +//! not_exists = value NOT EXISTS //! to = value value TO value -//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +//! value = WS* ( word | singleQuoted | doubleQuoted) ~ WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" ~ WS* ~ float ~ WS* ~ "," ~ WS* ~ float ~ WS* ~ "," float ~ WS* ~ ")" +//! geoRadius = WS* ~ "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: //! ```text -//! geoPoint = WS* ~ "_geoPoint(" ~ (float ~ ",")* ~ ")" +//! geoPoint = WS* "_geoPoint(" (float ",")* ")" //! ``` //! //! 
Specific errors: @@ -43,8 +44,8 @@ mod value; use std::fmt::Debug; use std::str::FromStr; -use condition::parse_exist; pub use condition::{parse_condition, parse_to, Condition}; +use condition::{parse_exists, parse_not_exists}; use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; @@ -250,7 +251,8 @@ fn parse_primary(input: Span) -> IResult { ), parse_geo_radius, parse_condition, - parse_exist, + parse_exists, + parse_not_exists, parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, @@ -424,17 +426,24 @@ pub mod tests { }, ), ( - "subscribers EXIST", + "subscribers EXISTS", Fc::Condition { fid: rtok("", "subscribers"), - op: Condition::Exist, + op: Condition::Exists, }, ), ( - "NOT subscribers EXIST", + "NOT subscribers EXISTS", Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::NotExist, + op: Condition::NotExists, + }, + ), + ( + "subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::NotExists, }, ), ( @@ -594,10 +603,10 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `AND`."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index a5c13ec2a..7f3b928dd 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -280,11 +280,11 @@ impl<'a> Filter<'a> { Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), - Condition::Exist => { + Condition::Exists => { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); } - Condition::NotExist => { + Condition::NotExists => { let all_ids = index.documents_ids(rtxn)?; let exist = Self::evaluate_operator( @@ -293,7 +293,7 @@ impl<'a> Filter<'a> { numbers_db, strings_db, field_id, - &Condition::Exist, + &Condition::Exists, )?; let notexist = all_ids - exist; From 0388b2d46382e5578824dd5b2b522cead764f35b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 08:58:41 +0200 Subject: [PATCH 1504/1889] Run cargo fmt --- milli/src/heed_codec/facet/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 79dbffa1d..8c5a4c118 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -26,10 +26,11 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { } } -use crate::{try_split_array_at, DocumentId, FieldId}; use std::borrow::Cow; use std::convert::TryInto; +use crate::{try_split_array_at, DocumentId, FieldId}; + pub struct FieldIdCodec; impl<'a> heed::BytesDecode<'a> for FieldIdCodec { From a5c916225077b0dfe8ba35cea702af730f692b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:14:19 +0200 Subject: [PATCH 1505/1889] Improve parser for NOT EXISTS filter Allow multiple spaces between NOT and EXISTS --- filter-parser/src/condition.rs | 10 ++++---- filter-parser/src/lib.rs | 43 +++++++++++++++++++++++----------- filter-parser/src/value.rs | 2 +- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index b57d68b75..6a5ecbe0a 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -7,11 +7,12 @@ use nom::branch::alt; use nom::bytes::complete::tag; +use nom::character::complete::multispace1; use nom::combinator::cut; use nom::sequence::{terminated, tuple}; use Condition::*; -use crate::{parse_value, FilterCondition, IResult, Span, Token}; +use crate::{parse_value, ws, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { @@ -62,16 +63,17 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// exist = value NOT EXISTS +/// exist = value "EXISTS" pub fn parse_exists(input: Span) -> IResult { let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) } -/// exist = value NOT EXISTS +/// exist = value "NOT" WS* "EXISTS" pub fn parse_not_exists(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("NOT EXISTS"))(input)?; + let (input, key) = parse_value(input)?; + let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; Ok((input, FilterCondition::Condition { fid: 
key.into(), op: NotExists })) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 69215798c..e40519e87 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -1,21 +1,21 @@ //! BNF grammar: //! //! ```text -//! filter = expression ~ EOF +//! filter = expression EOF //! expression = or //! or = and ("OR" and) //! and = not ("AND" not)* //! not = ("NOT" not) | primary //! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to //! condition = value ("==" | ">" ...) value -//! exists = value EXISTS -//! not_exists = value NOT EXISTS -//! to = value value TO value -//! value = WS* ( word | singleQuoted | doubleQuoted) ~ WS* +//! exists = value "EXISTS" +//! not_exists = value "NOT" WS* "EXISTS" +//! to = value value "TO" value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! geoRadius = WS* "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: @@ -31,7 +31,7 @@ //! field < 12 AND _geoPoint(1, 2) //! ``` //! -//! - If a user try to use a geoRadius as a value we must throw an error. +//! - If a user try to use a geoRadius as a value we must throw an error. //! ```text //! field = _geoRadius(12, 13, 14) //! ``` @@ -170,7 +170,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and (~ "OR" ~ and) +/// or = and ("OR" and) fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -182,7 +182,7 @@ fn parse_or(input: Span) -> IResult { Ok((input, expr)) } -/// and = not (~ "AND" not)* +/// and = not ("AND" not)* fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; // if we found a `AND` then we MUST find something next @@ -193,14 +193,14 @@ fn parse_and(input: Span) -> IResult { Ok((input, expr)) } -/// not = ("NOT" ~ not) | primary +/// not = ("NOT" not) | primary /// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } -/// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +/// geoRadius = WS* "_geoRadius(float "," float "," float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { // we want to forbid space BEFORE the _geoRadius but not after @@ -224,7 +224,7 @@ fn parse_geo_radius(input: Span) -> IResult { Ok((input, res)) } -/// geoPoint = WS* ~ "_geoPoint(float ~ "," ~ float ~ "," float) +/// geoPoint = WS* "_geoPoint(float "," float "," float) fn parse_geo_point(input: Span) -> IResult { // we want to forbid space BEFORE the _geoPoint but not after tuple(( @@ -238,7 +238,7 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } -/// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to +/// primary = (WS* "(" expression ")" WS*) | geoRadius | condition | to fn parse_primary(input: Span) -> IResult { alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis @@ -266,7 +266,7 @@ pub fn parse_expression(input: Span) -> IResult { parse_or(input) } -/// filter = expression ~ EOF +/// filter = expression EOF pub fn parse_filter(input: Span) -> IResult { terminated(parse_expression, eof)(input) } @@ -446,6 +446,20 @@ pub mod tests { op: Condition::NotExists, }, ), + ( + "NOT subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::Exists, + }, + ), + ( + "subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::NotExists, + }, + ), ( "subscribers 100 TO 1000", Fc::Condition { @@ -616,6 +630,7 @@ pub mod tests { ("channel = \"ponce", "Expression `\\\"ponce` is missing the following closing delimiter: `\"`."), ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. 
You probably forgot an `OR` or an `AND` rule."), + ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 84dd21902..18ae58ae5 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } -/// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +/// value = WS* ( word | singleQuoted | doubleQuoted) WS* pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; From 722db7b088a86f2f4cabef93927e6815a791136e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:28:21 +0200 Subject: [PATCH 1506/1889] Ignore target directory of filter-parser/fuzz crate --- filter-parser/fuzz/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/filter-parser/fuzz/.gitignore b/filter-parser/fuzz/.gitignore index cb73742e4..9a2e1d58c 100644 --- a/filter-parser/fuzz/.gitignore +++ b/filter-parser/fuzz/.gitignore @@ -1,2 +1,3 @@ /corpus/ /artifacts/ +/target/ \ No newline at end of file From bd15f5625af5d3032f386d8c67cfb9a4d5f8330d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 13:31:51 +0200 Subject: [PATCH 1507/1889] Fix compiler warning --- filter-parser/src/condition.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 6a5ecbe0a..c63f1d926 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -12,7 +12,7 @@ use nom::combinator::cut; use nom::sequence::{terminated, tuple}; use Condition::*; -use crate::{parse_value, ws, FilterCondition, IResult, Span, Token}; +use crate::{parse_value, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { From 392472f4bb9f1f16aa27e6b9feef249f2d1f31f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 06:19:33 +0200 Subject: [PATCH 1508/1889] Apply suggestions from code review Co-authored-by: Tamo --- filter-parser/src/lib.rs | 6 +++--- infos/src/main.rs | 2 +- milli/src/index.rs | 3 +-- .../index_documents/extract/extract_facet_exists_docids.rs | 2 -- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index e40519e87..5cce5f4c3 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -170,7 +170,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" and) +/// or = and ("OR" and)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -200,7 +200,7 @@ fn parse_not(input: Span) -> IResult { alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } -/// geoRadius = WS* "_geoRadius(float "," float "," float) +/// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
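+/// e.g. `a = 1 OR b = 2 OR c = 3` is folded left into `Or(Or(a = 1, b = 2), c = 3)`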
fn parse_geo_radius(input: Span) -> IResult { // we want to forbid space BEFORE the _geoRadius but not after @@ -224,7 +224,7 @@ fn parse_geo_radius(input: Span) -> IResult { Ok((input, res)) } -/// geoPoint = WS* "_geoPoint(float "," float "," float) +/// geoPoint = WS* "_geoPoint(float WS* "," WS* float WS* "," WS* float) fn parse_geo_point(input: Span) -> IResult { // we want to forbid space BEFORE the _geoPoint but not after tuple(( diff --git a/infos/src/main.rs b/infos/src/main.rs index 89aec6182..feec17557 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -551,7 +551,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let db = facet_id_exists_docids.remap_data_type::(); for result in facet_values_iter(rtxn, db, facet_id)? { let (_fid, value) = result?; - let key = format!("{}", facet_name); + let key = facet_name.to_string(); heap.push(Reverse((value.len(), key, facet_id_exists_docids_name))); if heap.len() > limit { heap.pop(); diff --git a/milli/src/index.rs b/milli/src/index.rs index 816112178..b0897271e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -156,8 +156,7 @@ impl Index { let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; - let facet_id_exists_docids: Database = - env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs index e7a001c08..d25c57aea 100644 --- a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs @@ -32,9 +32,7 @@ pub fn extract_facet_exists_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); - let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); - facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } From 30bd4db0fcbcf31d80baec5892d28076c16b8577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 08:24:16 +0200 Subject: [PATCH 1509/1889] Simplify indexing task for facet_exists_docids database --- milli/src/heed_codec/facet/field_id_codec.rs | 25 +++++++++++ milli/src/heed_codec/facet/mod.rs | 43 +------------------ milli/src/lib.rs | 1 + .../extract/extract_facet_exists_docids.rs | 40 ----------------- .../extract/extract_fid_docid_facet_values.rs | 24 +++++++---- .../src/update/index_documents/extract/mod.rs | 22 +++------- 6 files changed, 50 insertions(+), 105 deletions(-) create mode 100644 milli/src/heed_codec/facet/field_id_codec.rs delete mode 100644 milli/src/update/index_documents/extract/extract_facet_exists_docids.rs diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs new file mode 100644 index 000000000..d147423f2 --- /dev/null +++ b/milli/src/heed_codec/facet/field_id_codec.rs @@ -0,0 +1,25 @@ +use crate::{FieldId, BEU16}; +use heed::zerocopy::AsBytes; +use std::{borrow::Cow, convert::TryInto}; + +pub struct FieldIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdCodec { + type DItem = FieldId; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let bytes: [u8; 2] = bytes[..2].try_into().ok()?; + let field_id = BEU16::from(bytes).get(); + Some(field_id) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldIdCodec { + type EItem = FieldId; + + fn bytes_encode(field_id: &Self::EItem) -> Option> { + let field_id = BEU16::new(*field_id); + let bytes = field_id.as_bytes(); + Some(Cow::Owned(bytes.to_vec())) + } +} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 8c5a4c118..384991fd7 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -5,6 +5,7 @@ mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +mod field_id_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; @@ -15,6 +16,7 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +pub use self::field_id_codec::FieldIdCodec; /// Tries to split a slice in half at the given middle point, /// `None` if the slice is too short. 
@@ -25,44 +27,3 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } - -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdCodec { - type DItem = FieldId; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, _) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - Some(field_id) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldIdCodec { - type EItem = FieldId; - - fn bytes_encode(field_id: &Self::EItem) -> Option> { - Some(Cow::Owned(field_id.to_be_bytes().to_vec())) - } -} - -pub struct FieldIdDocIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec { - type DItem = (FieldId, DocumentId); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let document_id_bytes = bytes[..4].try_into().ok()?; - let document_id = u32::from_be_bytes(document_id_bytes); - - Some((field_id, document_id)) - } -} diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 81cd057d5..20fdceaec 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -49,6 +49,7 @@ pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; +pub type BEU16 = heed::zerocopy::U16; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type Attribute = u32; diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs deleted file mode 100644 index d25c57aea..000000000 --- a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs +++ /dev/null @@ -1,40 +0,0 @@ -use std::fs::File; -use std::io; - -use heed::{BytesDecode, BytesEncode}; - -use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, -}; -use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec}; -use crate::Result; - -/// Extracts the documents ids where this field appears. -/// -/// Returns a grenad reader whose key is the field id encoded -/// with `FieldIdCodec` and the value is a document_id (u32) -/// encoded as native-endian bytes. -#[logging_timer::time] -pub fn extract_facet_exists_docids( - docid_fid_facet_number: grenad::Reader, - indexer: GrenadParameters, -) -> Result> { - let max_memory = indexer.max_memory_by_thread(); - - let mut facet_exists_docids_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ); - - let mut cursor = docid_fid_facet_number.into_cursor()?; - while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ - let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); - let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); - facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; - } - - sorter_into_reader(facet_exists_docids_sorter, indexer) -} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index d93bde500..c83ac49e0 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,15 +1,16 @@ +use heed::zerocopy::AsBytes; +use serde_json::Value; use std::collections::HashSet; +use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; -use heed::zerocopy::AsBytes; -use serde_json::Value; - use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId, Result}; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; +use crate::{DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. /// @@ -40,7 +41,7 @@ pub fn extract_fid_docid_facet_values( ); let mut fid_docid_facet_exists_sorter = create_sorter( - keep_first, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -56,12 +57,17 @@ pub fn extract_fid_docid_facet_values( if faceted_fields.contains(&field_id) { key_buffer.clear(); - // here, we know already that the document must be added to the “field id exists” database - // prefix key with the field_id and the document_id - + // Set key to the field_id + // Note: this encoding is consistent with FieldIdCodec key_buffer.extend_from_slice(&field_id.to_be_bytes()); + + // Here, we know already that the document must be added to the “field id exists” database + let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); + let document = BEU32::from(document).get(); + fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?; + + // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(&docid_bytes); - fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?; let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 7d26e0984..bb695a99f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -1,5 +1,4 @@ mod extract_docid_word_positions; -mod extract_facet_exists_docids; mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; @@ -17,7 +16,6 @@ use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; -use self::extract_facet_exists_docids::extract_facet_exists_docids; use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; @@ -142,15 +140,12 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); - spawn_extraction_task::<_, _, Vec>>( - 
docid_fid_facet_exists_chunks.clone(), - indexer.clone(), - lmdb_writer_sx.clone(), - extract_facet_exists_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::FieldIdFacetExistsDocids, - "field-id-facet-exists-docids", - ); + + // spawn extraction task for field-id-facet-exists-docids + rayon::spawn(move || { + let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer); + let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids)); + }); Ok(()) } @@ -226,7 +221,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, grenad::Reader), ), )> { let flattened_documents_chunk = @@ -294,9 +289,6 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_strings_chunk.clone(), ))); - let docid_fid_facet_exists_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? }; - Ok(( docid_fid_facet_numbers_chunk, (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), From c17d616250cee38d34dbad51173188ae8cc54ead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 08:41:33 +0200 Subject: [PATCH 1510/1889] Refactor index_documents_check_exists_database tests --- milli/src/update/index_documents/mod.rs | 222 +++++++++--------------- 1 file changed, 86 insertions(+), 136 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 82457762e..99f474eb6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1932,152 +1932,102 @@ mod tests { assert_eq!(ids.len(), map.len()); } - #[test] - fn index_documents_check_exists_database_reindex() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "id": 0, - "colour": 0, - }, - { - "id": 1, - "colour": [] - }, - { - "id": 2, - "colour": {} - }, - { - "id": 3, - "colour": null - }, - { - "id": 4, - "colour": [1] - }, - { - "id": 5 - }, - { - "id": 6, - "colour": { - "green": 1 - } - } - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let faceted_fields = hashset!(S("colour")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); - - let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); - let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); - - let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); - assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); - - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); - } - #[test] fn 
index_documents_check_exists_database() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let faceted_fields = hashset!(S("colour")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let content = documents!([ - { - "id": 0, - "colour": 0, - }, - { - "id": 1, - "colour": [] - }, - { - "id": 2, - "colour": {} - }, - { - "id": 3, - "colour": null - }, - { - "id": 4, - "colour": [1] - }, - { - "id": 5 - }, - { - "id": 6, - "colour": { - "green": 1 - } - } - ]); - let indexing_config = IndexDocumentsConfig::default(); - let mut wtxn = index.write_txn().unwrap(); + let faceted_fields = hashset!(S("colour")); + let content = || { + documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + }, + { + "id": 7, + "colour": { + "green": { + "blue": [] + } + } + } + ]) + }; + let make_index = || { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + Index::new(options, &path).unwrap() + }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); + let set_filterable_fields = |index: &Index| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_filterable_fields(faceted_fields.clone()); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + }; + let add_documents = |index: &Index| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content()).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + }; - wtxn.commit().unwrap(); + let check_ok = |index: &Index| { + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); - let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); - let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); - let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + let bitmap_colour = + index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); - assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + 
assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); + }; - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + let index = make_index(); + add_documents(&index); + set_filterable_fields(&index); + check_ok(&index); + + let index = make_index(); + set_filterable_fields(&index); + add_documents(&index); + check_ok(&index); } } From ea0642c32d799b201c61cc81d8f979e33cc74380 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 09:12:37 +0200 Subject: [PATCH 1511/1889] Make filter parser more strict regarding spacing around operators OR, AND, NOT, TO must now be followed by spaces --- filter-parser/src/condition.rs | 11 ++++----- filter-parser/src/lib.rs | 45 ++++++++++++++++++++-------------- filter-parser/src/value.rs | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index c63f1d926..cbf73b96a 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -44,8 +44,7 @@ impl<'a> Condition<'a> { } } } - -/// condition = value ("==" | ">" ...) value +/// condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value pub fn parse_condition(input: Span) -> IResult { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?; @@ -69,7 +68,7 @@ pub fn parse_exists(input: Span) -> IResult { Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) } -/// exist = value "NOT" WS* "EXISTS" +/// exist = value "NOT" WS+ "EXISTS" pub fn parse_not_exists(input: Span) -> IResult { let (input, key) = parse_value(input)?; @@ -77,10 +76,10 @@ pub fn parse_not_exists(input: Span) -> IResult { Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) } -/// to = value value TO value +/// to = value value "TO" WS+ value pub fn parse_to(input: Span) -> IResult { - let (input, (key, from, _, to)) = - tuple((parse_value, parse_value, tag("TO"), cut(parse_value)))(input)?; + let (input, (key, from, _, _, to)) = + tuple((parse_value, parse_value, tag("TO"), multispace1, cut(parse_value)))(input)?; Ok((input, FilterCondition::Condition { fid: key, op: Between { from, to } })) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 5cce5f4c3..01be432d7 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -3,19 +3,19 @@ //! ```text //! filter = expression EOF //! expression = or -//! or = and ("OR" and) -//! and = not ("AND" not)* -//! not = ("NOT" not) | primary -//! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to -//! condition = value ("==" | ">" ...) value +//! or = and ("OR" WS+ and)* +//! and = not ("AND" WS+ not)* +//! not = ("NOT" WS+ not) | primary +//! primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to +//! condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value //! exists = value "EXISTS" -//! not_exists = value "NOT" WS* "EXISTS" -//! to = value value "TO" value -//! value = WS* ( word | singleQuoted | doubleQuoted) WS* +//! not_exists = value "NOT" WS+ "EXISTS" +//! to = value value "TO" WS+ value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS+ //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! 
word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! geoRadius = "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: @@ -50,7 +50,7 @@ use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; -use nom::character::complete::{char, multispace0}; +use nom::character::complete::{char, multispace0, multispace1}; use nom::combinator::{cut, eof, map}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; @@ -170,11 +170,11 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" and)* +/// or = and ("OR" WS+ and)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next - let (input, ors) = many0(preceded(ws(tag("OR")), cut(parse_and)))(input)?; + let (input, ors) = many0(preceded(ws(tuple((tag("OR"), multispace1))), cut(parse_and)))(input)?; let expr = ors .into_iter() @@ -186,24 +186,28 @@ fn parse_or(input: Span) -> IResult { fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; // if we found a `AND` then we MUST find something next - let (input, ors) = many0(preceded(ws(tag("AND")), cut(parse_not)))(input)?; + let (input, ors) = + many0(preceded(ws(tuple((tag("AND"), multispace1))), cut(parse_not)))(input)?; let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); Ok((input, expr)) } -/// not = ("NOT" not) | primary -/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. +/// not = ("NOT" WS+ not) | primary +/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { - alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) + alt(( + map(preceded(ws(tuple((tag("NOT"), multispace1))), cut(parse_not)), |e| e.negate()), + parse_primary, + ))(input) } /// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
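The practical effect of the grammar change above is that a keyword glued to the following token no longer parses as an operator. A minimal sketch of the `tag` + `multispace1` pattern with nom, using plain `&str` input rather than the parser's located `Span` type (names here are illustrative):

```rust
use nom::bytes::complete::tag;
use nom::character::complete::multispace1;
use nom::sequence::tuple;
use nom::IResult;

// Matches the keyword only when at least one whitespace character follows,
// so `ANDdog` is no longer read as the AND operator followed by `dog`.
fn and_keyword(input: &str) -> IResult<&str, &str> {
    let (rest, (kw, _ws)) = tuple((tag("AND"), multispace1))(input)?;
    Ok((rest, kw))
}

fn main() {
    assert!(and_keyword("AND dog = 'bernese'").is_ok());
    assert!(and_keyword("ANDdog != 'bernese'").is_err());
}
```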
fn parse_geo_radius(input: Span) -> IResult { - // we want to forbid space BEFORE the _geoRadius but not after + // we want to allow space BEFORE the _geoRadius but not after let parsed = preceded( tuple((multispace0, tag("_geoRadius"))), // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure @@ -238,7 +242,7 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } -/// primary = (WS* "(" expression ")" WS*) | geoRadius | condition | to +/// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to fn parse_primary(input: Span) -> IResult { alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis @@ -620,7 +624,7 @@ pub mod tests { ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), + ("channel = Ponce OR", "Found unexpected characters at the end of the filter: `OR`. You probably forgot an `OR` or an `AND` rule."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), @@ -631,6 +635,9 @@ pub mod tests { ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule."), ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), + ("subscribers 100 TO1000", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`."), + ("channel = ponce ORdog != 'bernese mountain'", "Found unexpected characters at the end of the filter: `ORdog != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."), + ("channel = ponce AND'dog' != 'bernese mountain'", "Found unexpected characters at the end of the filter: `AND\\'dog\\' != \\'bernese mountain\\'`. 
You probably forgot an `OR` or an `AND` rule."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 18ae58ae5..22da6a0df 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } -/// value = WS* ( word | singleQuoted | doubleQuoted) WS* +/// value = WS* ( word | singleQuoted | doubleQuoted) WS+ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; From 80b962b4f4aeacaec68e22279e3349adf3138746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 09:16:53 +0200 Subject: [PATCH 1512/1889] Run cargo fmt --- milli/src/heed_codec/facet/field_id_codec.rs | 7 +++++-- .../extract/extract_fid_docid_facet_values.rs | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs index d147423f2..871b05a09 100644 --- a/milli/src/heed_codec/facet/field_id_codec.rs +++ b/milli/src/heed_codec/facet/field_id_codec.rs @@ -1,6 +1,9 @@ -use crate::{FieldId, BEU16}; +use std::borrow::Cow; +use std::convert::TryInto; + use heed::zerocopy::AsBytes; -use std::{borrow::Cow, convert::TryInto}; + +use crate::{FieldId, BEU16}; pub struct FieldIdCodec; diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index c83ac49e0..6d66a7a64 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,11 +1,12 @@ -use heed::zerocopy::AsBytes; -use serde_json::Value; use std::collections::HashSet; use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; +use heed::zerocopy::AsBytes; +use serde_json::Value; + use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; From 4f0bd317dff811a74a86649d4c6e064f31949c15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 08:41:54 +0200 Subject: [PATCH 1513/1889] Remove custom implementation of BytesEncode/Decode for the FieldId --- milli/src/heed_codec/facet/field_id_codec.rs | 28 -------------------- milli/src/heed_codec/facet/mod.rs | 7 +++-- milli/src/index.rs | 4 +-- 3 files changed, 7 insertions(+), 32 deletions(-) delete mode 100644 milli/src/heed_codec/facet/field_id_codec.rs diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs deleted file mode 100644 index 871b05a09..000000000 --- a/milli/src/heed_codec/facet/field_id_codec.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use heed::zerocopy::AsBytes; - -use crate::{FieldId, BEU16}; - -pub struct FieldIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdCodec { - type DItem = FieldId; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let bytes: [u8; 2] = bytes[..2].try_into().ok()?; - let field_id = BEU16::from(bytes).get(); - Some(field_id) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldIdCodec { - type EItem = FieldId; - - fn bytes_encode(field_id: &Self::EItem) -> Option> { - let field_id = 
BEU16::new(*field_id);
-        let bytes = field_id.as_bytes();
-        Some(Cow::Owned(bytes.to_vec()))
-    }
-}
diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs
index 384991fd7..51812d97a 100644
--- a/milli/src/heed_codec/facet/mod.rs
+++ b/milli/src/heed_codec/facet/mod.rs
@@ -5,7 +5,9 @@ mod facet_string_level_zero_value_codec;
 mod facet_string_zero_bounds_value_codec;
 mod field_doc_id_facet_f64_codec;
 mod field_doc_id_facet_string_codec;
-mod field_id_codec;
+
+use crate::BEU16;
+use heed::types::OwnedType;
 
 pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
 pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
@@ -16,7 +18,8 @@ pub use self::facet_string_level_zero_value_codec::{
 pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
 pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
 pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
-pub use self::field_id_codec::FieldIdCodec;
+
+pub type FieldIdCodec = OwnedType<BEU16>;
 
 /// Tries to split a slice in half at the given middle point,
 /// `None` if the slice is too short.
diff --git a/milli/src/index.rs b/milli/src/index.rs
index b0897271e..aec7aa396 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -21,7 +21,7 @@ use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
     FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
-    Search, StrBEU32Codec, StrStrU8Codec, BEU32,
+    Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32,
 };
 
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -819,7 +819,7 @@ impl Index {
         rtxn: &RoTxn,
         field_id: FieldId,
     ) -> heed::Result<RoaringBitmap> {
-        match self.facet_id_exists_docids.get(rtxn, &field_id)? {
+        match self.facet_id_exists_docids.get(rtxn, &BEU16::new(field_id))?
{ Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), } From 1eb1e73bb3d909d2be54ff835126f008b812aa9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 09:28:23 +0200 Subject: [PATCH 1514/1889] Add integration tests for the EXISTS filter --- milli/src/heed_codec/facet/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 11 ++++-- milli/tests/assets/test_set.ndjson | 30 +++++++------- milli/tests/search/filters.rs | 6 +++ milli/tests/search/mod.rs | 52 +++++++++++++++++++++++-- 5 files changed, 78 insertions(+), 23 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 51812d97a..0b2d9186f 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -6,7 +6,6 @@ mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; -use crate::BEU16; use heed::types::OwnedType; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; @@ -18,6 +17,7 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +use crate::BEU16; pub type FieldIdCodec = OwnedType; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 99f474eb6..950b3a417 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -592,7 +592,7 @@ mod tests { use super::*; use crate::documents::DocumentBatchBuilder; use crate::update::DeleteDocuments; - use crate::HashMap; + use crate::{HashMap, BEU16}; #[test] fn simple_document_replacement() { @@ -2012,11 +2012,14 @@ mod tests { let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); let bitmap_colour = - index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + index.facet_id_exists_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + let bitmap_colour_green = index + .facet_id_exists_docids + .get(&rtxn, &BEU16::new(colour_green_id)) + .unwrap() + .unwrap(); assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); }; diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 6383d274e..427daca8c 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":""} 
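Since `FieldIdCodec` is now a plain `OwnedType<BEU16>`, callers wrap the `u16` field id in a big-endian integer themselves, as `BEU16::new(colour_id)` does in the test above. A dependency-free sketch of why big-endian is the right byte order for LMDB keys:

```rust
// LMDB orders raw keys lexicographically by bytes; big-endian encoding
// makes that byte order agree with the numeric order of field ids.
fn main() {
    let a: u16 = 1;
    let b: u16 = 256;
    assert!(a.to_be_bytes() < b.to_be_bytes()); // [0,1] < [1,0]: order preserved
    assert!(a.to_le_bytes() > b.to_le_bytes()); // [1,0] > [0,1]: order broken
}
```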
-{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":""} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null]} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": []} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the 
construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E"} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"]} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8]} {"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using 
the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] } +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}} } +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]] } 
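The updated test set gives `opt1` every JSON shape: `null`, `[]`, `{}`, nested arrays and nested objects. That is deliberate: `EXISTS` is expected to match on field presence alone, never on the value. A reduced sketch of that assumed semantic, mirroring the test expectations:

```rust
use serde_json::json;

// EXISTS only checks that the field is present in the document, so
// `null`, `[]` and `{}` all count as existing. Assumed semantics,
// mirroring the integration tests above.
fn exists(doc: &serde_json::Value, field: &str) -> bool {
    doc.get(field).is_some()
}

fn main() {
    assert!(exists(&json!({ "opt1": null }), "opt1"));
    assert!(exists(&json!({ "opt1": [] }), "opt1"));
    assert!(exists(&json!({ "opt1": {} }), "opt1"));
    assert!(!exists(&json!({ "id": 5 }), "opt1"));
}
```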
+{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16} {"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index fe926d17a..1700a1478 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -80,3 +80,9 @@ test_filter!( lower_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] ); +test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]); +test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); +test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); +test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); + +test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 472fbafe0..ec784bfc0 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -9,7 +9,7 @@ use maplit::{hashmap, hashset}; use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; -use serde::Deserialize; +use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; mod distinct; @@ -43,6 +43,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), S("_geo"), + S("opt1"), + S("opt1.opt2") }); builder.set_sortable_fields(hashset! 
{ S("tag"), @@ -196,12 +198,44 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { id = (document.geo_rank < 100000).then(|| document.id.clone()); } else if filter.starts_with("NOT _geoRadius") { id = (document.geo_rank > 1000000).then(|| document.id.clone()); + } else if matches!(filter, "opt1 EXISTS" | "NOT opt1 NOT EXISTS") { + id = document.opt1.is_some().then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 EXISTS" | "opt1 NOT EXISTS") { + id = document.opt1.is_none().then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 EXISTS") { + if document.opt1opt2.is_some() { + id = Some(document.id.clone()); + } else if let Some(opt1) = &document.opt1 { + id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); + } } id } +pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { + match v { + serde_json::Value::Array(v) => { + for v in v.iter() { + if contains_key_rec(v, key) { + return true; + } + } + false + } + serde_json::Value::Object(v) => { + for (k, v) in v.iter() { + if k == key || contains_key_rec(v, key) { + return true; + } + } + false + } + _ => false, + } +} + pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { - let dataset: HashSet = + let dataset: Vec = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); let mut filtered_ids: HashSet<_> = dataset.iter().map(|d| d.id.clone()).collect(); @@ -229,7 +263,7 @@ pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet, + #[serde(default, deserialize_with = "some_option", rename = "opt1.opt2")] + pub opt1opt2: Option, +} + +fn some_option<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let result = serde_json::Value::deserialize(deserializer)?; + Ok(Some(result)) } From aed8c69bcb94b83ccb2957b741584f908c22d094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 09:57:28 +0200 Subject: [PATCH 1515/1889] Refactor indexation of the "facet-id-exists-docids" database The idea is to directly create a sorted and merged list of bitmaps in the form of a BTreeMap instead of creating a grenad::Reader where the keys are field_id and the values are docids. Then we send that BTreeMap to the thing that handles TypedChunks, which inserts its content into the database. --- .../extract/extract_fid_docid_facet_values.rs | 23 +++--- .../src/update/index_documents/extract/mod.rs | 35 +++++---- .../src/update/index_documents/typed_chunk.rs | 73 ++++++++++++++++--- 3 files changed, 92 insertions(+), 39 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 6d66a7a64..368378792 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,16 +1,16 @@ -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; use heed::zerocopy::AsBytes; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. 
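The refactor described in this patch swaps a grenad sorter for an in-memory map from field id to a bitmap of document ids. A reduced sketch of the accumulation step (the real extractor pulls these pairs out of obkv documents):

```rust
use std::collections::BTreeMap;

use roaring::RoaringBitmap;

type FieldId = u16;

fn main() {
    // One RoaringBitmap of document ids per field id; the BTreeMap keeps
    // the entries sorted by field id so they can later be written to LMDB
    // in key order.
    let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();

    // (field_id, document_id) pairs as the extractor would encounter them.
    for (field_id, docid) in [(1u16, 0u32), (1, 2), (3, 0)] {
        facet_exists_docids.entry(field_id).or_default().insert(docid);
    }

    assert_eq!(facet_exists_docids[&1].iter().collect::<Vec<_>>(), vec![0, 2]);
}
```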
@@ -22,7 +22,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     faceted_fields: &HashSet<FieldId>,
-) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
+) -> Result<(grenad::Reader<File>, grenad::Reader<File>, BTreeMap<FieldId, RoaringBitmap>)> {
     let max_memory = indexer.max_memory_by_thread();
 
     let mut fid_docid_facet_numbers_sorter = create_sorter(
@@ -30,7 +30,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|m| m / 3),
+        max_memory.map(|m| m / 2),
     );
 
     let mut fid_docid_facet_strings_sorter = create_sorter(
@@ -38,16 +38,10 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|m| m / 3),
+        max_memory.map(|m| m / 2),
     );
 
-    let mut fid_docid_facet_exists_sorter = create_sorter(
-        merge_cbo_roaring_bitmaps,
-        indexer.chunk_compression_type,
-        indexer.chunk_compression_level,
-        indexer.max_nb_chunks,
-        max_memory.map(|m| m / 3),
-    );
+    let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
 
     let mut key_buffer = Vec::new();
     let mut cursor = obkv_documents.into_cursor()?;
@@ -65,7 +59,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
             // Here, we know already that the document must be added to the “field id exists” database
             let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
             let document = BEU32::from(document).get();
-            fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?;
+
+            facet_exists_docids.entry(field_id).or_default().insert(document);
 
             // For the other extraction tasks, prefix the key with the field_id and the document_id
             key_buffer.extend_from_slice(&docid_bytes);
@@ -99,7 +94,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     Ok((
         sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?,
         sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?,
-        sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?,
+        facet_exists_docids,
     ))
 }
 
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index bb695a99f..76d968919 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -8,12 +8,13 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
 
-use std::collections::HashSet;
+use std::collections::{BTreeMap, HashSet};
 use std::fs::File;
 
 use crossbeam_channel::Sender;
 use log::debug;
 use rayon::prelude::*;
+use roaring::RoaringBitmap;
 
 use self::extract_docid_word_positions::extract_docid_word_positions;
 use self::extract_facet_number_docids::extract_facet_number_docids;
@@ -72,12 +73,24 @@ pub(crate) fn data_from_obkv_documents(
     let (
         docid_word_positions_chunks,
-        (
-            docid_fid_facet_numbers_chunks,
-            (docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks),
-        ),
+        (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)),
     ) = result?;
 
+    // merge facet_exists_docids hashmaps and send them as a typed chunk
+    {
+        let lmdb_writer_sx = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            let mut all = BTreeMap::default();
+            for facet_exists_docids in facet_exists_docids {
+                for (field_id, docids) in facet_exists_docids {
+                    let docids0 = all.entry(field_id).or_default();
+                    *docids0 |= docids;
+                }
+            }
+            let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all)));
+        });
+    }
+
     spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(), indexer.clone(), @@ -141,12 +154,6 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-number-docids", ); - // spawn extraction task for field-id-facet-exists-docids - rayon::spawn(move || { - let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer); - let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids)); - }); - Ok(()) } @@ -221,7 +228,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, BTreeMap), ), )> { let flattened_documents_chunk = @@ -266,7 +273,7 @@ fn send_and_extract_flattened_documents_data( let ( docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, - docid_fid_facet_exists_chunk, + facet_exists_docids, ) = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer.clone(), @@ -291,7 +298,7 @@ fn send_and_extract_flattened_documents_data( Ok(( docid_fid_facet_numbers_chunk, - (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), + (docid_fid_facet_strings_chunk, facet_exists_docids), )) }, ); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e501e5efd..e1fd8f98d 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,11 +1,12 @@ use std::borrow::Cow; +use std::collections::BTreeMap; use std::convert::TryInto; use std::fs::File; use std::io; use grenad::MergerBuilder; use heed::types::ByteSlice; -use heed::{BytesDecode, RwTxn}; +use heed::{BytesDecode, BytesEncode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ @@ -16,8 +17,8 @@ use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, + error, lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, + GeoPoint, Index, Result, BEU16, }; pub(crate) enum TypedChunk { @@ -35,7 +36,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), - FieldIdFacetExistsDocids(grenad::Reader), + FieldIdFacetExistsDocids(BTreeMap), GeoPoints(grenad::Reader), } @@ -147,16 +148,14 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => { - append_entries_into_database( - facet_id_exists_docids_iter, + TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { + write_sorted_iterator_into_database( + facet_id_exists_docids.into_iter().map(|(k, v)| (BEU16::new(k), v)), &index.facet_id_exists_docids, + "facet-id-exists-docids", wtxn, - index_is_empty, - |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, - ) - .unwrap(); + )?; is_merged_database = true; } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { @@ -270,6 +269,58 @@ fn merge_cbo_roaring_bitmaps( )?) 
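For context, a merge function such as `merge_cbo_roaring_bitmaps` conceptually unions the bitmaps that several chunks encoded under the same key. A loose sketch with plain `RoaringBitmap`s; the real codec also uses a compact encoding for small sets, which is assumed away here:

```rust
use roaring::RoaringBitmap;

// Union every source bitmap into one; this is the essence of a
// "merge bitmaps" callback invoked when two chunks share the same key.
fn merge(sources: &[RoaringBitmap]) -> RoaringBitmap {
    sources.iter().fold(RoaringBitmap::new(), |acc, b| acc | b)
}

fn main() {
    let a: RoaringBitmap = (0u32..3).collect();
    let b: RoaringBitmap = (2u32..5).collect();
    let merged = merge(&[a, b]);
    assert_eq!(merged.len(), 5); // {0, 1, 2, 3, 4}
}
```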
}
 
+fn write_sorted_iterator_into_database<Iter, Key, Value, KeyCodec, ValueCodec, Merge>(
+    mut iterator: Iter,
+    database: &heed::Database<KeyCodec, ValueCodec>,
+    database_name: &'static str,
+    wtxn: &mut RwTxn,
+    merge_values: Merge,
+) -> Result<()>
+where
+    for<'a> KeyCodec: BytesEncode<'a, EItem = Key>,
+    for<'a> ValueCodec: BytesEncode<'a, EItem = Value> + BytesDecode<'a, DItem = Value>,
+    Iter: Iterator<Item = (Key, Value)>,
+    Merge: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
+{
+    if database.is_empty(wtxn)? {
+        let mut database = database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
+
+        while let Some((key, value)) = iterator.next() {
+            let key = KeyCodec::bytes_encode(&key)
+                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
+            if valid_lmdb_key(&key) {
+                let value = ValueCodec::bytes_encode(&value)
+                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
+                unsafe { database.append(&key, &value)? };
+            }
+        }
+
+        Ok(())
+    } else {
+        let database = database.remap_types::<ByteSlice, ByteSlice>();
+        let mut buffer = Vec::new();
+        while let Some((key, value)) = iterator.next() {
+            let key = KeyCodec::bytes_encode(&key)
+                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
+            if valid_lmdb_key(&key) {
+                let value = ValueCodec::bytes_encode(&value)
+                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
+                let value = match database.get(wtxn, &key)? {
+                    Some(prev_value) => {
+                        merge_values(&value, &prev_value, &mut buffer)?;
+                        &buffer[..]
+                    }
+                    None => &value,
+                };
+
+                database.put(wtxn, &key, value)?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database<R, K, V, FS, FM>(
From d0eee5ff7a970b42b1a3723c1e627dc2f2ea8608 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 19 Jul 2022 13:44:36 +0200
Subject: [PATCH 1516/1889] Fix compiler error

---
 milli/src/search/facet/filter.rs | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs
index 7f3b928dd..903f9644f 100644
--- a/milli/src/search/facet/filter.rs
+++ b/milli/src/search/facet/filter.rs
@@ -287,14 +287,7 @@ impl<'a> Filter<'a> {
             Condition::NotExists => {
                 let all_ids = index.documents_ids(rtxn)?;
 
-                let exist = Self::evaluate_operator(
-                    rtxn,
-                    index,
-                    numbers_db,
-                    strings_db,
-                    field_id,
-                    &Condition::Exists,
-                )?;
+                let exist = Self::evaluate_operator(rtxn, index, field_id, &Condition::Exists)?;
 
                 let notexist = all_ids - exist;
                 return Ok(notexist);
From 1506683705b0ee01afdef14d1f07a527d6dadc17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 19 Jul 2022 14:42:35 +0200
Subject: [PATCH 1517/1889] Avoid using too much memory when indexing
 facet-exists-docids

---
 .../extract/extract_fid_docid_facet_values.rs | 19 +++++-
 .../src/update/index_documents/extract/mod.rs | 29 ++++----
 .../src/update/index_documents/typed_chunk.rs | 68 +++----------------
 3 files changed, 40 insertions(+), 76 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 368378792..cf116e6f5 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -5,13 +5,15 @@ use std::io;
 use std::mem::size_of;
 
 use heed::zerocopy::AsBytes;
+use heed::BytesEncode;
 use roaring::RoaringBitmap;
 use 
serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId, Result, BEU32}; +use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. /// @@ -22,7 +24,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, BTreeMap)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -91,10 +93,21 @@ pub fn extract_fid_docid_facet_values( } } + let mut facet_exists_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_exists_docids.into_iter() { + let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; + Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, - facet_exists_docids, + facet_exists_docids_reader, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 76d968919..157886e63 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -8,13 +8,12 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; -use std::collections::{BTreeMap, HashSet}; +use std::collections::HashSet; use std::fs::File; use crossbeam_channel::Sender; use log::debug; use rayon::prelude::*; -use roaring::RoaringBitmap; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; @@ -73,21 +72,25 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)), + ( + docid_fid_facet_numbers_chunks, + (docid_fid_facet_strings_chunks, facet_exists_docids_chunks), + ), ) = result?; - // merge facet_exists_docids hashmaps and send them as a typed chunk + // merge facet_exists_docids and send them as a typed chunk { let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - let mut all = BTreeMap::default(); - for facet_exists_docids in facet_exists_docids { - for (field_id, docids) in facet_exists_docids { - let docids0 = all.entry(field_id).or_default(); - *docids0 |= docids; + debug!("merge {} database", "facet-id-exists-docids"); + match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + Ok(reader) => { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); } } - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all))); }); } @@ -228,7 +231,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, BTreeMap), + (grenad::Reader, grenad::Reader), ), )> { let flattened_documents_chunk = 
@@ -273,7 +276,7 @@ fn send_and_extract_flattened_documents_data( let ( docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, - facet_exists_docids, + fid_facet_exists_docids_chunk, ) = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer.clone(), @@ -298,7 +301,7 @@ fn send_and_extract_flattened_documents_data( Ok(( docid_fid_facet_numbers_chunk, - (docid_fid_facet_strings_chunk, facet_exists_docids), + (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk), )) }, ); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e1fd8f98d..5b7b00c21 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,12 +1,11 @@ use std::borrow::Cow; -use std::collections::BTreeMap; use std::convert::TryInto; use std::fs::File; use std::io; use grenad::MergerBuilder; use heed::types::ByteSlice; -use heed::{BytesDecode, BytesEncode, RwTxn}; +use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ @@ -17,8 +16,8 @@ use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ - error, lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, - GeoPoint, Index, Result, BEU16, + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, + Result, }; pub(crate) enum TypedChunk { @@ -36,7 +35,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), - FieldIdFacetExistsDocids(BTreeMap), + FieldIdFacetExistsDocids(grenad::Reader), GeoPoints(grenad::Reader), } @@ -149,11 +148,12 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { - write_sorted_iterator_into_database( - facet_id_exists_docids.into_iter().map(|(k, v)| (BEU16::new(k), v)), + append_entries_into_database( + facet_id_exists_docids, &index.facet_id_exists_docids, - "facet-id-exists-docids", wtxn, + index_is_empty, + |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -269,58 +269,6 @@ fn merge_cbo_roaring_bitmaps( )?) } -fn write_sorted_iterator_into_database( - mut iterator: Iter, - database: &heed::Database, - database_name: &'static str, - wtxn: &mut RwTxn, - merge_values: Merge, -) -> Result<()> -where - for<'a> KeyCodec: BytesEncode<'a, EItem = Key>, - for<'a> ValueCodec: BytesEncode<'a, EItem = Value> + BytesDecode<'a, DItem = Value>, - Iter: Iterator, - Merge: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, -{ - if database.is_empty(wtxn)? { - let mut database = database.iter_mut(wtxn)?.remap_types::(); - - while let Some((key, value)) = iterator.next() { - let key = KeyCodec::bytes_encode(&key) - .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; - if valid_lmdb_key(&key) { - let value = ValueCodec::bytes_encode(&value) - .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; - unsafe { database.append(&key, &value)? 
};
-            }
-        }
-
-        Ok(())
-    } else {
-        let database = database.remap_types::<ByteSlice, ByteSlice>();
-        let mut buffer = Vec::new();
-        while let Some((key, value)) = iterator.next() {
-            let key = KeyCodec::bytes_encode(&key)
-                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-            if valid_lmdb_key(&key) {
-                let value = ValueCodec::bytes_encode(&value)
-                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-                let value = match database.get(wtxn, &key)? {
-                    Some(prev_value) => {
-                        merge_values(&value, &prev_value, &mut buffer)?;
-                        &buffer[..]
-                    }
-                    None => &value,
-                };
-
-                database.put(wtxn, &key, value)?;
-            }
-        }
-
-        Ok(())
-    }
-}
-
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database<R, K, V, FS, FM>(
From 41a0ce07cb5f07b886df99ddbf1889164486b0a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Wed, 20 Jul 2022 16:20:35 +0200
Subject: [PATCH 1518/1889] Add a code comment, as suggested in PR review

Co-authored-by: Many the fish
---
 milli/src/documents/builder.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs
index dc027e1b7..1a57db34b 100644
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@@ -97,6 +97,7 @@ impl DocumentsBatchBuilder {
             .map(|(k, t)| (self.fields_index.insert(k), t))
             .enumerate()
             .collect();
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
         typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);
 
         let mut record = csv::StringRecord::new();
From cbb3b254595ed126313b7c33899642cbedf83e10 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 21 Jul 2022 10:04:30 +0200
Subject: [PATCH 1519/1889] Fix(Search): Fix phrase search candidates computation

This is an old bug that was hidden by the proximity criterion: phrase
search was always returning an empty candidates list. Before the fix, we
were trying to find any words[n] near words[n] instead of finding any
words[n] near words[n+1]. For example, for the phrase search
'"Hello world"' we were searching for "hello" near "hello" first,
instead of "hello" near "world".

---
 milli/src/search/criteria/mod.rs       |  2 +-
 milli/src/search/criteria/proximity.rs | 44 +++++++++++++++++++-------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs
index 05305d724..4613acb4f 100644
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@@ -335,7 +335,7 @@ pub fn resolve_query_tree<'t>(
                 // Get all the documents with the matching distance for each word pairs.
                 let mut bitmaps = Vec::with_capacity(winsize.pow(2));
                 for (offset, s1) in win.iter().enumerate() {
-                    for (dist, s2) in win.iter().skip(offset).enumerate() {
+                    for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
                         match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? 
{ Some(m) => bitmaps.push(m), // If there are no document for this distance, there will be no diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 2bfa61e85..30919585b 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -192,22 +192,42 @@ fn resolve_candidates<'t>( let most_right = words .last() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let mut candidates = None; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); - match ctx.word_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => match candidates.as_mut() { - Some(candidates) => *candidates &= pair_docids, - None => candidates = Some(pair_docids), - }, - None => { - candidates = None; + let mut candidates = RoaringBitmap::new(); + let mut first_iter = true; + let winsize = words.len().min(7); + + for win in words.windows(winsize) { + // Get all the documents with the matching distance for each word pairs. + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + for (offset, s1) in win.iter().enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this distance, there will be no + // results for the phrase query. + None => return Ok(Default::default()), + } + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. + bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { break; } } } - match (most_left, most_right, candidates) { - (Some(l), Some(r), Some(c)) => vec![(l, r, c)], + match (most_left, most_right) { + (Some(l), Some(r)) => vec![(l, r, candidates)], _otherwise => Default::default(), } } else { From d5e9b7305b0561bdd23aadb39c56e77d132c57bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 21 Jul 2022 13:19:42 +0400 Subject: [PATCH 1520/1889] Update version for next release (v0.32.0) --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b40519d99..896ccd739 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.31.1" +version = "0.32.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index e59710b72..e4de70031 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.31.1" +version = "0.32.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 1a2e46929..8f61796b3 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.31.1" +version = "0.32.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 
d22f7d86d..5cbc35f25 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.31.1" +version = "0.32.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 7e9dd207a..46c50de43 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.31.1" +version = "0.32.0" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 9e8781e55..43e046c11 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.31.1" +version = "0.32.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 49be47c1e..ea1ee9193 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.31.1" +version = "0.32.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index d4ea547fb..a0bb76676 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.31.1" +version = "0.32.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index d980c6041..37c7b7c84 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.31.1" +version = "0.32.0" authors = ["Kerollmops "] edition = "2018" From 1fe224f2c643a20922767f909f0bb0517e174189 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 21 Jul 2022 16:12:01 +0200 Subject: [PATCH 1521/1889] Update filter-parser/fuzz/.gitignore Co-authored-by: Many the fish --- filter-parser/fuzz/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter-parser/fuzz/.gitignore b/filter-parser/fuzz/.gitignore index 9a2e1d58c..084aa18c1 100644 --- a/filter-parser/fuzz/.gitignore +++ b/filter-parser/fuzz/.gitignore @@ -1,3 +1,3 @@ /corpus/ /artifacts/ -/target/ \ No newline at end of file +/target/ From f156d7dd3b45f524a9e716ee88b09b7bb363b280 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Jul 2022 19:15:16 +0200 Subject: [PATCH 1522/1889] Stop reindexing already indexed documents --- milli/src/update/index_documents/transform.rs | 77 ++++++++++++------- 1 file changed, 50 insertions(+), 27 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 0de90924a..705bbb21c 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -200,24 +200,26 @@ impl<'a, 'i> Transform<'a, 'i> { let mut original_docid = None; - let docid = match self.new_external_documents_ids_builder.entry(external_id.into()) { - Entry::Occupied(entry) => *entry.get() as u32, - Entry::Vacant(entry) => { - // If the document was already in the db we mark it as a replaced document. - // It'll be deleted later. We keep its original docid to insert it in the grenad. 
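// In both the old and the reindented shape of this block, the entry API gives
// the mapping from external id to internal docid: an id already seen in this
// batch reuses its docid (Occupied), while a new id allocates the next internal
// docid from `available_documents_ids`, failing with
// `UserError::DocumentLimitReached` once the id space is exhausted.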
- if let Some(docid) = external_documents_ids.get(entry.key()) { - self.replaced_documents_ids.insert(docid); - original_docid = Some(docid); + let docid = + match self.new_external_documents_ids_builder.entry(external_id.clone().into()) { + Entry::Occupied(entry) => *entry.get() as u32, + Entry::Vacant(entry) => { + // If the document was already in the db we mark it as a replaced document. + // It'll be deleted later. We keep its original docid to insert it in the grenad. + if let Some(docid) = external_documents_ids.get(entry.key()) { + self.replaced_documents_ids.insert(docid); + original_docid = Some(docid); + } + let docid = self + .available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?; + entry.insert(docid as u64); + docid } - let docid = self - .available_documents_ids - .next() - .ok_or(UserError::DocumentLimitReached)?; - entry.insert(docid as u64); - docid - } - }; + }; + let mut skip_insertion = false; if let Some(original_docid) = original_docid { let original_key = BEU32::new(original_docid); let base_obkv = self @@ -230,24 +232,39 @@ impl<'a, 'i> Transform<'a, 'i> { key: None, })?; - // we associate the base document with the new key, everything will get merged later. - self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; - match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { - Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, - None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, + // we check if the two documents are exactly equal. If it's the case we can skip this document entirely + if base_obkv == obkv_buffer { + // we're not replacing anything + self.replaced_documents_ids.remove(original_docid); + // and we need to put back the original id as it was before + self.new_external_documents_ids_builder.remove(&*external_id); + skip_insertion = true; + } else { + // we associate the base document with the new key, everything will get merged later. + self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; + match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { + Some(buffer) => { + self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)? + } + None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, + } } } else { self.new_documents_ids.insert(docid); } - // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(&docid.to_be_bytes(), &obkv_buffer)?; - documents_count += 1; + if !skip_insertion { + // We use the extracted/generated user id as the key for this document. + self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; - match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { - Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, - None => self.flattened_sorter.insert(docid.to_be_bytes(), &obkv_buffer)?, + match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => { + self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? 
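// Taken together: when the stored `base_obkv` is byte-identical to the incoming
// document, the replacement bookkeeping is rolled back (the docid is removed
// from `replaced_documents_ids` and the external id keeps its original mapping)
// and neither sorter is fed again, so unchanged documents are no longer
// reindexed; only `documents_count` still advances for the progress callback.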
+ } + } } + documents_count += 1; progress_callback(UpdateIndexingStep::RemapDocumentAddition { documents_seen: documents_count, @@ -394,6 +411,11 @@ impl<'a, 'i> Transform<'a, 'i> { rtxn: &RoTxn, field_distribution: &mut FieldDistribution, ) -> Result<()> { + println!( + "The following documents are going to be deleted from the field distribution: {:?}", + self.replaced_documents_ids + ); + for deleted_docid in self.replaced_documents_ids.iter() { let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, @@ -461,6 +483,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; while let Some((key, val)) = iter.next()? { + println!("Reading a document"); // send a callback to show at which step we are documents_count += 1; progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { From 7fc35c558616abd5178529d2cc73d19ac1e7ccb9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 7 Jul 2022 15:02:06 +0200 Subject: [PATCH 1523/1889] remove the useless prints --- milli/src/update/index_documents/transform.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 705bbb21c..b61395a96 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -411,11 +411,6 @@ impl<'a, 'i> Transform<'a, 'i> { rtxn: &RoTxn, field_distribution: &mut FieldDistribution, ) -> Result<()> { - println!( - "The following documents are going to be deleted from the field distribution: {:?}", - self.replaced_documents_ids - ); - for deleted_docid in self.replaced_documents_ids.iter() { let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, @@ -483,7 +478,6 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; while let Some((key, val)) = iter.next()? { - println!("Reading a document"); // send a callback to show at which step we are documents_count += 1; progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { From d6f9a60a322998590bceb5d095c74f63e3077414 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 3 Aug 2022 11:38:40 +0200 Subject: [PATCH 1524/1889] fix: Remove whitespace trimming during document id validation fix #592 --- milli/src/update/index_documents/enrich.rs | 2 - milli/src/update/index_documents/mod.rs | 47 ++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 7c9a016d8..15fbe9319 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -294,9 +294,7 @@ pub fn fetch_matching_values_in_object( } } -/// Returns a trimmed version of the document id or `None` if it is invalid. 
pub fn validate_document_id(document_id: &str) -> Option<&str> { - let document_id = document_id.trim(); if !document_id.is_empty() && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index c9890f93f..0f0eaca5a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2086,4 +2086,51 @@ mod tests { let (_builder, user_error) = builder.add_documents(doc4).unwrap(); assert!(user_error.is_err()); } + + #[test] + fn primary_key_must_not_contain_whitespace() { + let tmp = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp).unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let indexer_config = IndexerConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &indexer_config, + IndexDocumentsConfig::default(), + |_| (), + ) + .unwrap(); + + let doc1 = documents! {[{ + "id": " 1", + "title": "asdsad", + }]}; + + let doc2 = documents! {[{ + "id": "\t2", + "title": "something", + }]}; + + let doc3 = documents! {[{ + "id": "\r3", + "title": "something", + }]}; + + let doc4 = documents! {[{ + "id": "\n4", + "title": "something", + }]}; + + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + assert!(user_error.is_err()); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + assert!(user_error.is_err()); + let (builder, user_error) = builder.add_documents(doc3).unwrap(); + assert!(user_error.is_err()); + let (_builder, user_error) = builder.add_documents(doc4).unwrap(); + assert!(user_error.is_err()); + } } From acff17fb88fff6cc61ab3ae5392669b59afef262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 2 Aug 2022 15:13:06 +0200 Subject: [PATCH 1525/1889] Simplify indexing tests --- milli/src/documents/mod.rs | 11 + milli/src/index.rs | 229 ++-- milli/src/update/clear_documents.rs | 29 +- milli/src/update/delete_documents.rs | 376 +++---- milli/src/update/index_documents/mod.rs | 1328 +++++++---------------- milli/src/update/settings.rs | 823 ++++++-------- 6 files changed, 1070 insertions(+), 1726 deletions(-) diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index e766e29cf..c1580309a 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -179,6 +179,17 @@ macro_rules! 
documents { }}; } +#[cfg(test)] +pub fn documents_batch_reader_from_objects( + objects: impl IntoIterator, +) -> DocumentsBatchReader>> { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in objects { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(std::io::Cursor::new(builder.into_inner().unwrap())).unwrap() +} + #[cfg(test)] mod test { use std::io::Cursor; diff --git a/milli/src/index.rs b/milli/src/index.rs index 6d95332fd..43888a177 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1182,16 +1182,19 @@ pub(crate) mod tests { use std::ops::Deref; use big_s::S; - use heed::EnvOpenOptions; + use heed::{EnvOpenOptions, RwTxn}; use maplit::btreemap; use tempfile::TempDir; + use crate::documents::DocumentsBatchReader; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; - use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig}; + use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::Index; pub(crate) struct TempIndex { - inner: Index, + pub inner: Index, + pub indexer_config: IndexerConfig, + pub index_documents_config: IndexDocumentsConfig, _tempdir: TempDir, } @@ -1204,43 +1207,88 @@ pub(crate) mod tests { } impl TempIndex { - /// Creates a temporary index, with a default `4096 * 100` size. This should be enough for - /// most tests. - pub fn new() -> Self { + /// Creates a temporary index + pub fn new_with_map_size(size: usize) -> Self { let mut options = EnvOpenOptions::new(); - options.map_size(100 * 4096); + options.map_size(size); let _tempdir = TempDir::new_in(".").unwrap(); let inner = Index::new(options, _tempdir.path()).unwrap(); - Self { inner, _tempdir } + let indexer_config = IndexerConfig::default(); + let index_documents_config = IndexDocumentsConfig::default(); + Self { inner, indexer_config, index_documents_config, _tempdir } + } + /// Creates a temporary index, with a default `4096 * 1000` size. This should be enough for + /// most tests. 
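// A sketch of how the tests below use these helpers (all calls shown appear
// later in this patch):
//
//     let index = TempIndex::new();
//     index
//         .add_documents(documents!([{ "id": 1, "name": "kevin" }]))
//         .unwrap();
//     index
//         .update_settings(|settings| {
//             settings.set_searchable_fields(vec![S("doggo"), S("name")]);
//         })
//         .unwrap();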
+ pub fn new() -> Self { + Self::new_with_map_size(4096 * 1000) + } + pub fn add_documents_using_wtxn<'t, R>( + &'t self, + wtxn: &mut RwTxn<'t, '_>, + documents: DocumentsBatchReader, + ) -> Result<(), crate::error::Error> + where + R: std::io::Read + std::io::Seek, + { + let builder = IndexDocuments::new( + wtxn, + &self, + &self.indexer_config, + self.index_documents_config.clone(), + |_| (), + ) + .unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error?; + builder.execute()?; + Ok(()) + } + pub fn add_documents( + &self, + documents: DocumentsBatchReader, + ) -> Result<(), crate::error::Error> + where + R: std::io::Read + std::io::Seek, + { + let mut wtxn = self.write_txn().unwrap(); + self.add_documents_using_wtxn(&mut wtxn, documents)?; + wtxn.commit().unwrap(); + Ok(()) + } + + pub fn update_settings( + &self, + update: impl Fn(&mut Settings), + ) -> Result<(), crate::error::Error> { + let mut wtxn = self.write_txn().unwrap(); + self.update_settings_using_wtxn(&mut wtxn, update)?; + wtxn.commit().unwrap(); + Ok(()) + } + pub fn update_settings_using_wtxn<'t>( + &'t self, + wtxn: &mut RwTxn<'t, '_>, + update: impl Fn(&mut Settings), + ) -> Result<(), crate::error::Error> { + let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config); + update(&mut builder); + builder.execute(drop)?; + Ok(()) } } #[test] fn initial_field_distribution() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 1, "name": "kevin" }, - { "id": 2, "name": "bob", "age": 20 }, - { "id": 2, "name": "bob", "age": 20 }, - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + let index = TempIndex::new(); + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 }, + ])) + .unwrap(); let rtxn = index.read_txn().unwrap(); - let field_distribution = index.field_distribution(&rtxn).unwrap(); assert_eq!( field_distribution, @@ -1253,19 +1301,13 @@ pub(crate) mod tests { // we add all the documents a second time. 
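// Re-inserting a document with an id that already exists replaces it in place,
// so the per-field counts asserted above must come out identical after this
// second batch.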
we are supposed to get the same // field_distribution in the end - let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let content = documents!([ - { "id": 1, "name": "kevin" }, - { "id": 2, "name": "bob", "age": 20 }, - { "id": 2, "name": "bob", "age": 20 }, - ]); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 }, + ])) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1280,19 +1322,12 @@ pub(crate) mod tests { ); // then we update a document by removing one field and another by adding one field - let content = documents!([ - { "id": 1, "name": "kevin", "has_dog": true }, - { "id": 2, "name": "bob" } - ]); - - let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 1, "name": "kevin", "has_dog": true }, + { "id": 2, "name": "bob" } + ])) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1341,35 +1376,19 @@ pub(crate) mod tests { #[test] fn add_documents_and_set_searchable_fields() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 1, "doggo": "kevin" }, - { "id": 2, "doggo": { "name": "bob", "age": 20 } }, - { "id": 3, "name": "jean", "age": 25 }, - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - // set searchable fields - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - builder.set_searchable_fields(vec![S("doggo"), S("name")]); - - builder.execute(drop).unwrap(); - wtxn.commit().unwrap(); + let index = TempIndex::new(); + index + .add_documents(documents!([ + { "id": 1, "doggo": "kevin" }, + { "id": 2, "doggo": { "name": "bob", "age": 20 } }, + { "id": 3, "name": "jean", "age": 25 }, + ])) + .unwrap(); + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("doggo"), S("name")]); + }) + .unwrap(); // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); @@ -1383,19 +1402,13 @@ pub(crate) mod tests { #[test] fn set_searchable_fields_and_add_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let index = TempIndex::new(); - // set searchable fields - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - 
builder.set_searchable_fields(vec![S("doggo"), S("name")]); - - builder.execute(drop).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("doggo"), S("name")]); + }) + .unwrap(); // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); @@ -1405,21 +1418,13 @@ pub(crate) mod tests { let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 1, "doggo": "kevin" }, - { "id": 2, "doggo": { "name": "bob", "age": 20 } }, - { "id": 3, "name": "jean", "age": 25 }, - ]); - - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 1, "doggo": "kevin" }, + { "id": 2, "doggo": { "name": "bob", "age": 20 } }, + { "id": 3, "name": "jean", "age": 25 }, + ])) + .unwrap(); // ensure we get the right real searchable fields + user defined searchable fields let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 388865d56..5b7dbc57c 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -82,36 +82,25 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { #[cfg(test)] mod tests { - use heed::EnvOpenOptions; - use super::*; - use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig}; + use crate::index::tests::TempIndex; #[test] fn clear_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 0, "name": "kevin", "age": 20 }, - { "id": 1, "name": "kevina" }, - { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } - ]); - let indexing_config = IndexDocumentsConfig::default(); - let config = IndexerConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + index + .add_documents_using_wtxn(&mut wtxn, documents!([ + { "id": 0, "name": "kevin", "age": 20 }, + { "id": 1, "name": "kevina" }, + { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } + ])) + .unwrap(); // Clear all documents from the database. 
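// `ClearDocuments::execute()` returns the number of documents it removed,
// which the assertion below checks against the three documents just inserted.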
let builder = ClearDocuments::new(&mut wtxn, &index); assert_eq!(builder.execute().unwrap(), 3); - wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index f10829454..c981ee061 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -654,26 +654,13 @@ where #[cfg(test)] mod tests { use big_s::S; - use heed::{EnvOpenOptions, RwTxn}; + use heed::RwTxn; use maplit::hashset; use super::*; - use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; + use crate::index::tests::TempIndex; use crate::Filter; - fn insert_documents<'t, R: std::io::Read + std::io::Seek>( - wtxn: &mut RwTxn<'t, '_>, - index: &'t Index, - documents: crate::documents::DocumentsBatchReader, - ) { - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - } - fn delete_documents<'t>( wtxn: &mut RwTxn<'t, '_>, index: &'t Index, @@ -695,24 +682,19 @@ mod tests { #[test] fn delete_documents_with_numbers_as_primary_key() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, - { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, - { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ]), + ) + .unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. 
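// Internal docids are allocated sequentially from 0 in insertion order, which
// is why the three documents above can be deleted by internal id without
// looking up their external ids first.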
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); @@ -730,25 +712,19 @@ mod tests { #[test] fn delete_documents_with_strange_primary_key() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "mysuperid": 0, "name": "kevin" }, - { "mysuperid": 1, "name": "kevina" }, - { "mysuperid": 2, "name": "benoit" } - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "mysuperid": 0, "name": "kevin" }, + { "mysuperid": 1, "name": "kevina" }, + { "mysuperid": 2, "name": "benoit" } + ]), + ) + .unwrap(); // Delete not all of the documents but some of them. let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); @@ -761,42 +737,45 @@ mod tests { #[test] fn filtered_placeholder_search_should_not_return_deleted_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(S("docid")); - builder.set_filterable_fields(hashset! { S("label") }); - builder.execute(|_| ()).unwrap(); - let content = documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } - ]); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + settings.set_filterable_fields(hashset! 
{ S("label") }); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } + ]), + ) + .unwrap(); - insert_documents(&mut wtxn, &index, content); delete_documents(&mut wtxn, &index, &["1_4"]); // Placeholder search with filter @@ -809,41 +788,43 @@ mod tests { #[test] fn placeholder_search_should_not_return_deleted_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(S("docid")); - builder.execute(|_| ()).unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); - let content = documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } - ]); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": 
"1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } + ]), + ) + .unwrap(); - insert_documents(&mut wtxn, &index, content); let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); // Placeholder search @@ -862,41 +843,43 @@ mod tests { #[test] fn search_should_not_return_deleted_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(S("docid")); - builder.execute(|_| ()).unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); - let content = documents!([ - {"docid": "1_4", "label": "sign"}, - {"docid": "1_5", "label": "letter"}, - {"docid": "1_7", "label": "abstract,cartoon,design,pattern"}, - {"docid": "1_36","label": "drawing,painting,pattern"}, - {"docid": "1_37","label": "art,drawing,outdoor"}, - {"docid": "1_38","label": "aquarium,art,drawing"}, - {"docid": "1_39","label": "abstract"}, - {"docid": "1_40","label": "cartoon"}, - {"docid": "1_41","label": "art,drawing"}, - {"docid": "1_42","label": "art,pattern"}, - {"docid": "1_43","label": "abstract,art,drawing,pattern"}, - {"docid": "1_44","label": "drawing"}, - {"docid": "1_45","label": "art"}, - {"docid": "1_46","label": "abstract,colorfulness,pattern"}, - {"docid": "1_47","label": "abstract,pattern"}, - {"docid": "1_52","label": "abstract,cartoon"}, - {"docid": "1_57","label": "abstract,drawing,pattern"}, - {"docid": "1_58","label": "abstract,art,cartoon"}, - {"docid": "1_68","label": "design"}, - {"docid": "1_69","label": "geometry"} - ]); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + {"docid": "1_4", "label": "sign"}, + {"docid": "1_5", "label": "letter"}, + {"docid": "1_7", "label": "abstract,cartoon,design,pattern"}, + {"docid": "1_36","label": "drawing,painting,pattern"}, + {"docid": "1_37","label": "art,drawing,outdoor"}, + {"docid": "1_38","label": "aquarium,art,drawing"}, + {"docid": "1_39","label": "abstract"}, + {"docid": "1_40","label": "cartoon"}, + {"docid": "1_41","label": "art,drawing"}, + {"docid": "1_42","label": "art,pattern"}, + {"docid": "1_43","label": "abstract,art,drawing,pattern"}, + {"docid": "1_44","label": "drawing"}, + {"docid": "1_45","label": "art"}, + {"docid": "1_46","label": "abstract,colorfulness,pattern"}, + {"docid": "1_47","label": "abstract,pattern"}, + {"docid": "1_52","label": "abstract,cartoon"}, + {"docid": "1_57","label": "abstract,drawing,pattern"}, + {"docid": "1_58","label": "abstract,art,cartoon"}, + {"docid": "1_68","label": "design"}, + {"docid": "1_69","label": "geometry"} + ]), + ) + .unwrap(); - insert_documents(&mut wtxn, &index, content); let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); // search for abstract @@ -915,20 +898,18 @@ mod tests { #[test] fn 
geo_filtered_placeholder_search_should_not_return_deleted_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(S("id")); - builder.set_filterable_fields(hashset!(S("_geo"))); - builder.set_sortable_fields(hashset!(S("_geo"))); - builder.execute(|_| ()).unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("id")); + settings.set_filterable_fields(hashset!(S("_geo"))); + settings.set_sortable_fields(hashset!(S("_geo"))); + }) + .unwrap(); - let content = documents!([ + index.add_documents_using_wtxn(&mut wtxn, documents!([ { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, @@ -949,10 +930,9 @@ mod tests { { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } - ]); - let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; + ])).unwrap(); - insert_documents(&mut wtxn, &index, content); + let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); // Placeholder search with geo filter @@ -972,41 +952,43 @@ mod tests { #[test] fn get_documents_should_not_return_deleted_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(S("docid")); - builder.execute(|_| ()).unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); - let content = documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } - ]); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", 
"label": "sign" }, + { "docid": "1_5", "label": "letter" }, + { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, + { "docid": "1_36", "label": "drawing,painting,pattern" }, + { "docid": "1_37", "label": "art,drawing,outdoor" }, + { "docid": "1_38", "label": "aquarium,art,drawing" }, + { "docid": "1_39", "label": "abstract" }, + { "docid": "1_40", "label": "cartoon" }, + { "docid": "1_41", "label": "art,drawing" }, + { "docid": "1_42", "label": "art,pattern" }, + { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, + { "docid": "1_44", "label": "drawing" }, + { "docid": "1_45", "label": "art" }, + { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, + { "docid": "1_47", "label": "abstract,pattern" }, + { "docid": "1_52", "label": "abstract,cartoon" }, + { "docid": "1_57", "label": "abstract,drawing,pattern" }, + { "docid": "1_58", "label": "abstract,art,cartoon" }, + { "docid": "1_68", "label": "design" }, + { "docid": "1_69", "label": "geometry" } + ]), + ) + .unwrap(); - insert_documents(&mut wtxn, &index, content); let deleted_external_ids = ["1_7", "1_52"]; let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); @@ -1042,18 +1024,17 @@ mod tests { #[test] fn stats_should_not_return_deleted_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(S("docid")); - builder.execute(|_| ()).unwrap(); - let content = documents!([ + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ { "docid": "1_4", "label": "sign"}, { "docid": "1_5", "label": "letter"}, { "docid": "1_7", "label": "abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, @@ -1074,9 +1055,8 @@ mod tests { { "docid": "1_58", "label": "abstract,art,cartoon"}, { "docid": "1_68", "label": "design"}, { "docid": "1_69", "label": "geometry"} - ]); + ])).unwrap(); - insert_documents(&mut wtxn, &index, content); delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); // count internal documents diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 63aec1290..b0cae600f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -604,41 +604,27 @@ fn execute_word_prefix_docids( #[cfg(test)] mod tests { - use std::io::Cursor; - use big_s::S; - use heed::EnvOpenOptions; use maplit::hashset; use super::*; - use crate::documents::DocumentsBatchBuilder; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; use crate::update::DeleteDocuments; use crate::BEU16; #[test] fn simple_document_replacement() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); // First we send 3 documents with ids from 1 to 3. 
- let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 1, "name": "kevin" }, - { "id": 2, "name": "kevina" }, - { "id": 3, "name": "benoit" } - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "kevina" }, + { "id": 3, "name": "benoit" } + ])) + .unwrap(); // Check that there is 3 documents now. let rtxn = index.read_txn().unwrap(); @@ -647,15 +633,7 @@ mod tests { drop(rtxn); // Second we send 1 document with id 1, to erase the previous ones. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ { "id": 1, "name": "updated kevin" } ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index.add_documents(documents!([ { "id": 1, "name": "updated kevin" } ])).unwrap(); // Check that there is **always** 3 documents. let rtxn = index.read_txn().unwrap(); @@ -664,18 +642,13 @@ mod tests { drop(rtxn); // Third we send 3 documents again to replace the existing ones. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 1, "name": "updated second kevin" }, - { "id": 2, "name": "updated kevina" }, - { "id": 3, "name": "updated benoit" } - ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 1, "name": "updated second kevin" }, + { "id": 2, "name": "updated kevina" }, + { "id": 3, "name": "updated benoit" } + ])) + .unwrap(); // Check that there is **always** 3 documents. let rtxn = index.read_txn().unwrap(); @@ -686,31 +659,18 @@ mod tests { #[test] fn simple_document_merge() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; // First we send 3 documents with duplicate ids and // change the index method to merge documents. 
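// With `IndexDocumentsMethod::UpdateDocuments`, documents sharing an id are
// merged field by field rather than replaced, so the three "name" values
// collapse into one document and a later `{ "id": 1, "age": 25 }` adds a field
// while keeping the existing name.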
- let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 1, "name": "kevin" }, - { "id": 1, "name": "kevina" }, - { "id": 1, "name": "benoit" } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::UpdateDocuments, - ..Default::default() - }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 1, "name": "kevina" }, + { "id": 1, "name": "benoit" } + ])) + .unwrap(); // Check that there is only 1 document now. let rtxn = index.read_txn().unwrap(); @@ -731,14 +691,7 @@ mod tests { drop(rtxn); // Second we send 1 document with id 1, to force it to be merged with the previous one. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ { "id": 1, "age": 25 } ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index.add_documents(documents!([ { "id": 1, "age": 25 } ])).unwrap(); // Check that there is **always** 1 document. let rtxn = index.read_txn().unwrap(); @@ -763,25 +716,14 @@ mod tests { #[test] fn not_auto_generated_documents_ids() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - // First we send 3 documents with ids from 1 to 3. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ + let result = index.add_documents(documents!([ { "name": "kevin" }, { "name": "kevina" }, { "name": "benoit" } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (_builder, user_error) = builder.add_documents(content).unwrap(); - assert!(user_error.is_err()); - wtxn.commit().unwrap(); + ])); + assert!(result.is_err()); // Check that there is no document. let rtxn = index.read_txn().unwrap(); @@ -792,28 +734,16 @@ mod tests { #[test] fn simple_auto_generated_documents_ids() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; // First we send 3 documents with ids from 1 to 3. 
- let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "name": "kevin" }, - { "name": "kevina" }, - { "name": "benoit" } - ]); - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "name": "kevin" }, + { "name": "kevina" }, + { "name": "benoit" } + ])) + .unwrap(); // Check that there is 3 documents now. let rtxn = index.read_txn().unwrap(); @@ -826,14 +756,7 @@ mod tests { drop(rtxn); // Second we send 1 document with the generated uuid, to erase the previous ones. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index.add_documents(documents!([ { "name": "updated kevin", "id": kevin_uuid } ])).unwrap(); // Check that there is **always** 3 documents. let rtxn = index.read_txn().unwrap(); @@ -857,26 +780,16 @@ mod tests { #[test] fn reordered_auto_generated_documents_ids() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); // First we send 3 documents with ids from 1 to 3. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 1, "name": "kevin" }, - { "id": 2, "name": "kevina" }, - { "id": 3, "name": "benoit" } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "kevina" }, + { "id": 3, "name": "benoit" } + ])) + .unwrap(); // Check that there is 3 documents now. let rtxn = index.read_txn().unwrap(); @@ -885,16 +798,8 @@ mod tests { drop(rtxn); // Second we send 1 document without specifying the id. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ { "name": "new kevin" } ]); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index.index_documents_config.autogenerate_docids = true; + index.add_documents(documents!([ { "name": "new kevin" } ])).unwrap(); // Check that there is 4 documents now. 
let rtxn = index.read_txn().unwrap(); @@ -905,22 +810,10 @@ mod tests { #[test] fn empty_update() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); // First we send 0 documents and only headers. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index.add_documents(documents!([])).unwrap(); // Check that there is no documents. let rtxn = index.read_txn().unwrap(); @@ -931,34 +824,14 @@ mod tests { #[test] fn invalid_documents_ids() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); // First we send 1 document with an invalid id. - let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. - let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (_builder, user_error) = builder.add_documents(content).unwrap(); - assert!(user_error.is_err()); - wtxn.commit().unwrap(); + index.add_documents(documents!([ { "id": "brume bleue", "name": "kevin" } ])).unwrap_err(); - // First we send 1 document with a valid id. - let mut wtxn = index.write_txn().unwrap(); - // There is a space in the document id. - let content = documents!([ { "id": 32, "name": "kevin" } ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + // Then we send 1 document with a valid id. + index.add_documents(documents!([ { "id": 32, "name": "kevin" } ])).unwrap(); // Check that there is 1 document now. let rtxn = index.read_txn().unwrap(); @@ -969,26 +842,16 @@ mod tests { #[test] fn complex_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); // First we send 3 documents with an id for only one of them. 
- let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, - { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, - { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ])) + .unwrap(); // Check that there is 1 documents now. let rtxn = index.read_txn().unwrap(); @@ -1010,126 +873,72 @@ mod tests { #[test] fn simple_documents_replace() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - // First we send 3 documents with an id for only one of them. - let mut wtxn = index.write_txn().unwrap(); - let documents = documents!([ + index.add_documents(documents!([ { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 }, { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 }, { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" }, { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + ])).unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let indexing_config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::UpdateDocuments, - ..Default::default() - }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let documents = documents!([ - { - "id": 2, - "author": "J. Austen", - "date": "1813" - } - ]); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([{ + "id": 2, + "author": "J. 
Austen", + "date": "1813" + }])) + .unwrap(); } #[test] fn mixed_geo_documents() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; // We send 6 documents and mix the ones that have _geo and those that don't have it. - let mut wtxn = index.write_txn().unwrap(); - let documents = documents!([ - { "id": 2, "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, - { "id": 456 }, - { "id": 1 }, - { "id": 1344 }, - { "id": 4 }, - { "id": 42, "_geo": { "lat": 35, "lng": 23 } } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 2, "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, + { "id": 456 }, + { "id": 1 }, + { "id": 1344 }, + { "id": 4 }, + { "id": 42, "_geo": { "lat": 35, "lng": 23 } } + ])) + .unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let faceted_fields = hashset!(S("_geo")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(S("_geo"))); + }) + .unwrap(); } #[test] fn index_all_flavour_of_geo() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(S("_geo"))); + }) + .unwrap(); - builder.set_filterable_fields(hashset!(S("_geo"))); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let indexing_config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; - let mut wtxn = index.write_txn().unwrap(); - - let documents = documents!([ - { "id": 0, "_geo": { "lat": 31, "lng": [42] } }, - { "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 }, - { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, - { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, - ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": 31, "lng": [42] } }, + { "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 }, + { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, + { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, + ])) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1141,90 +950,60 @@ mod tests { 
#[test] fn geo_error() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(S("_geo"))); + }) + .unwrap(); - builder.set_filterable_fields(hashset!(S("_geo"))); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let indexing_config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; - let mut wtxn = index.write_txn().unwrap(); - - let documents = documents!([ - { "id": 0, "_geo": { "lng": 42 } } - ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - let error = builder.execute().unwrap_err(); + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lng": 42 } } + ])) + .unwrap_err(); assert_eq!( &error.to_string(), r#"Could not find latitude in the document with the id: `0`. Was expecting a `_geo.lat` field."# ); - let documents = documents!([ - { "id": 0, "_geo": { "lat": 42 } } - ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - let error = builder.execute().unwrap_err(); + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": 42 } } + ])) + .unwrap_err(); assert_eq!( &error.to_string(), r#"Could not find longitude in the document with the id: `0`. Was expecting a `_geo.lng` field."# ); - let documents = documents!([ - { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } - ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - let error = builder.execute().unwrap_err(); + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } + ])) + .unwrap_err(); assert_eq!( &error.to_string(), r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."# ); - let documents = documents!([ - { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } - ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - let error = builder.execute().unwrap_err(); + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } + ])) + .unwrap_err(); assert_eq!( &error.to_string(), r#"Could not parse latitude in the document with the id: `0`. 
Was expecting a finite number but instead got `[12,13]`."# ); - let documents = documents!([ - { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } - ]); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - let error = builder.execute().unwrap_err(); + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } + ])) + .unwrap_err(); assert_eq!( &error.to_string(), r#"Could not parse longitude in the document with the id: `0`. Was expecting a finite number but instead got `"hello"`."# @@ -1233,27 +1012,17 @@ mod tests { #[test] fn delete_documents_then_insert() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); + index + .add_documents(documents!([ + { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, + { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, + { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + ])) + .unwrap(); let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, - { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, - { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, - { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); // Delete not all of the documents but some of them. 
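Each `index.add_documents(...)` call in the rewritten tests stands in for the whole deleted ritual: open a write transaction, build an `IndexDocuments`, feed it the batch, check the user-facing error, execute, and commit. A sketch of that wrapper on `TempIndex`, reassembled from the deleted steps and assuming milli's `crate::Result` as the error type:

```rust
use std::io::{Read, Seek};

use crate::documents::DocumentsBatchReader;
use crate::update::IndexDocuments;

impl TempIndex {
    /// Index one batch of documents in its own write transaction, turning
    /// both user errors and internal errors into a single `Err`.
    pub fn add_documents<R: Read + Seek>(
        &self,
        documents: DocumentsBatchReader<R>,
    ) -> crate::Result<()> {
        let mut wtxn = self.write_txn()?;
        let builder = IndexDocuments::new(
            &mut wtxn,
            self,
            &self.indexer_config,
            self.index_documents_config.clone(),
            |_| (),
        )?;
        let (builder, user_error) = builder.add_documents(documents)?;
        user_error?;
        builder.execute()?;
        wtxn.commit()?;
        Ok(())
    }
}
```

This is why tests like `geo_error` can simply call `.unwrap_err()` on the whole operation instead of inspecting the builder tuple by hand.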
@@ -1263,42 +1032,29 @@ mod tests { let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_none()); + wtxn.commit().unwrap(); - let content = documents!([ - { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } - ]); + index + .add_documents(documents!([ + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + ])) + .unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + let wtxn = index.write_txn().unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); - - let content = documents!([ - { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } - ]); - - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + + index + .add_documents(documents!([ + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + ])) + .unwrap(); } #[test] fn index_more_than_256_fields() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); + let index = TempIndex::new(); let mut big_object = serde_json::Map::new(); big_object.insert(S("id"), serde_json::Value::from("wow")); @@ -1307,56 +1063,23 @@ mod tests { big_object.insert(key, serde_json::Value::from("I am a text!")); } - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - builder.append_json_object(&big_object).unwrap(); - let vector = builder.into_inner().unwrap(); - let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); + let documents = documents_batch_reader_from_objects([big_object]); + index.add_documents(documents).unwrap(); } #[test] fn index_more_than_1000_positions_in_a_field() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(50 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - - let mut big_object = serde_json::Map::new(); - big_object.insert(S("id"), serde_json::Value::from("wow")); - let content: String = (0..=u16::MAX) - .into_iter() - .map(|p| p.to_string()) - .reduce(|a, b| a + " " + b.as_ref()) + let index = TempIndex::new_with_map_size(4096 * 100_000); // 400 MB + let mut content = String::with_capacity(382101); + for i in 0..=u16::MAX { + content.push_str(&format!("{i} ")); + } + index + .add_documents(documents!({ + "id": "wow", + "content": content + })) .unwrap(); - big_object.insert("content".to_string(), serde_json::Value::from(content)); - - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - 
builder.append_json_object(&big_object).unwrap(); - let vector = builder.into_inner().unwrap(); - let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); let mut rtxn = index.read_txn().unwrap(); @@ -1370,117 +1093,90 @@ mod tests { #[test] fn index_documents_with_zeroes() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "id": 2, - "title": "Prideand Prejudice", - "au{hor": "Jane Austin", - "genre": "romance", - "price$": "3.5$", - }, - { - "id": 456, - "title": "Le Petit Prince", - "au{hor": "Antoine de Saint-Exupéry", - "genre": "adventure", - "price$": "10.0$", - }, - { - "id": 1, - "title": "Wonderland", - "au{hor": "Lewis Carroll", - "genre": "fantasy", - "price$": "25.99$", - }, - { - "id": 4, - "title": "Harry Potter ing fantasy\0lood Prince", - "au{hor": "J. K. Rowling", - "genre": "fantasy\0", - }, - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { + "id": 2, + "title": "Prideand Prejudice", + "au{hor": "Jane Austin", + "genre": "romance", + "price$": "3.5$", + }, + { + "id": 456, + "title": "Le Petit Prince", + "au{hor": "Antoine de Saint-Exupéry", + "genre": "adventure", + "price$": "10.0$", + }, + { + "id": 1, + "title": "Wonderland", + "au{hor": "Lewis Carroll", + "genre": "fantasy", + "price$": "25.99$", + }, + { + "id": 4, + "title": "Harry Potter ing fantasy\0lood Prince", + "au{hor": "J. K. 
Rowling", + "genre": "fantasy\0", + }, + ])) + .unwrap(); } #[test] fn index_documents_with_nested_fields() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "id": 0, - "title": "The zeroth document", - }, - { - "id": 1, - "title": "The first document", - "nested": { - "object": "field", - "machin": "bidule", + index + .add_documents(documents!([ + { + "id": 0, + "title": "The zeroth document", }, - }, - { - "id": 2, - "title": "The second document", - "nested": [ - "array", - { + { + "id": 1, + "title": "The first document", + "nested": { "object": "field", + "machin": "bidule", }, - { - "prout": "truc", - "machin": "lol", - }, - ], - }, - { - "id": 3, - "title": "The third document", - "nested": "I lied", - }, - ]); + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field", + }, + { + "prout": "truc", + "machin": "lol", + }, + ], + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied", + }, + ])) + .unwrap(); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + index + .update_settings(|settings| { + let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; + settings.set_searchable_fields(searchable_fields); - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; - builder.set_searchable_fields(searchable_fields); - - let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); + settings.set_filterable_fields(faceted_fields); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1554,54 +1250,42 @@ mod tests { #[test] fn index_documents_with_nested_primary_key() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key("complex.nested.id".to_owned()); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_primary_key("complex.nested.id".to_owned()); + }) + .unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "complex": { - "nested": { - "id": 0, + index + .add_documents(documents!([ + { + "complex": { + "nested": { + "id": 0, + }, }, + "title": "The zeroth document", }, - "title": "The zeroth document", - }, - { - "complex.nested": { - "id": 1, + { + "complex.nested": { + "id": 1, + }, + "title": "The first document", }, - "title": "The first document", - 
}, - { - "complex": { - "nested.id": 2, + { + "complex": { + "nested.id": 2, + }, + "title": "The second document", }, - "title": "The second document", - }, - { - "complex.nested.id": 3, - "title": "The third document", - }, - ]); - - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + { + "complex.nested.id": 3, + "title": "The third document", + }, + ])) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1630,50 +1314,28 @@ mod tests { #[test] fn retrieve_a_b_nested_document_id() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key("a.b".to_owned()); - builder.execute(|_| ()).unwrap(); - - let content = documents!({ "a" : { "b" : { "c" : 1 }}}); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (_builder, user_error) = builder.add_documents(content).unwrap(); + index + .update_settings(|settings| { + settings.set_primary_key("a.b".to_owned()); + }) + .unwrap(); // There must be an issue with the primary key not present in the given document - user_error.unwrap_err(); + index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap_err(); } #[test] fn retrieve_a_b_c_nested_document_id() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key("a.b.c".to_owned()); - builder.execute(|_| ()).unwrap(); - - let content = documents!({ "a" : { "b" : { "c" : 1 }}}); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_primary_key("a.b.c".to_owned()); + }) + .unwrap(); + index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap(); let rtxn = index.read_txn().unwrap(); let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); @@ -1682,61 +1344,42 @@ mod tests { #[test] fn test_facets_generation() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "id": 0, - "dog": { - "race": { - "bernese mountain": "zeroth", + index + .add_documents(documents!([ + { + "id": 0, + "dog": { + "race": { + "bernese mountain": "zeroth", + }, }, }, - }, - { - 
"id": 1, - "dog.race": { - "bernese mountain": "first", + { + "id": 1, + "dog.race": { + "bernese mountain": "first", + }, }, - }, - { - "id": 2, - "dog.race.bernese mountain": "second", - }, - { - "id": 3, - "dog": { - "race.bernese mountain": "third" + { + "id": 2, + "dog.race.bernese mountain": "second", }, - }, - ]); + { + "id": 3, + "dog": { + "race.bernese mountain": "third" + }, + }, + ])) + .unwrap(); - // index the documents - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - // ---- ADD THE SETTING TO TEST THE FILTERABLE - - // add the settings - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - builder.set_filterable_fields(hashset!(String::from("dog"))); - - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(String::from("dog"))); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1751,17 +1394,12 @@ mod tests { let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); assert_eq!(documents_ids, vec![i]); } - - // ---- RESET THE SETTINGS - - // update the settings - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - builder.reset_filterable_fields(); - - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + // Reset the settings + index + .update_settings(|settings| { + settings.reset_filterable_fields(); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1769,16 +1407,12 @@ mod tests { assert_eq!(facets, hashset!()); - // ---- UPDATE THE SETTINGS TO TEST THE SORTABLE - - // update the settings - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - builder.set_sortable_fields(hashset!(S("dog.race"))); - - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + // update the settings to test the sortable + index + .update_settings(|settings| { + settings.set_sortable_fields(hashset!(S("dog.race"))); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1796,69 +1430,37 @@ mod tests { #[test] fn index_2_times_documents_split_by_zero_document_indexation() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - let content = documents!([ - {"id": 0, "name": "Kerollmops", "score": 78}, - {"id": 1, "name": "ManyTheFish", "score": 75}, - {"id": 2, "name": "Ferdi", "score": 39}, - {"id": 3, "name": "Tommy", "score": 33} - ]); - - let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + {"id": 0, "name": "Kerollmops", "score": 78}, + {"id": 1, "name": "ManyTheFish", "score": 75}, + {"id": 2, "name": 
"Ferdi", "score": 39}, + {"id": 3, "name": "Tommy", "score": 33} + ])) + .unwrap(); // Check that there is 4 document now. let rtxn = index.read_txn().unwrap(); let count = index.number_of_documents(&rtxn).unwrap(); assert_eq!(count, 4); - let content = documents!([]); - - let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index.add_documents(documents!([])).unwrap(); // Check that there is 4 document now. let rtxn = index.read_txn().unwrap(); let count = index.number_of_documents(&rtxn).unwrap(); assert_eq!(count, 4); - let content = documents!([ - {"id": 0, "name": "Kerollmops", "score": 78}, - {"id": 1, "name": "ManyTheFish", "score": 75}, - {"id": 2, "name": "Ferdi", "score": 39}, - {"id": 3, "name": "Tommy", "score": 33} - ]); - - let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + {"id": 0, "name": "Kerollmops", "score": 78}, + {"id": 1, "name": "ManyTheFish", "score": 75}, + {"id": 2, "name": "Ferdi", "score": 39}, + {"id": 3, "name": "Tommy", "score": 33} + ])) + .unwrap(); // Check that there is 4 document now. let rtxn = index.read_txn().unwrap(); @@ -1868,26 +1470,14 @@ mod tests { #[test] fn test_meilisearch_1714() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - let content = documents!([ - {"id": "123", "title": "小化妆包" }, - {"id": "456", "title": "Ipad 包" } - ]); - - let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + {"id": "123", "title": "小化妆包" }, + {"id": "456", "title": "Ipad 包" } + ])) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -1913,35 +1503,20 @@ mod tests { /// it should not return any error. 
#[test] fn text_with_too_long_words() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - let content = documents!([ - {"id": 1, "title": "a".repeat(256) }, - {"id": 2, "title": "b".repeat(512) }, - {"id": 3, "title": format!("{} {}", "c".repeat(250), "d".repeat(250)) }, - ]); - - let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + {"id": 1, "title": "a".repeat(256) }, + {"id": 2, "title": "b".repeat(512) }, + {"id": 3, "title": format!("{} {}", "c".repeat(250), "d".repeat(250)) }, + ])) + .unwrap(); } #[test] fn text_with_too_long_keys() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let script = "https://bug.example.com/meilisearch/milli.saml2?ROLE=Programmer-1337&SAMLRequest=Cy1ytcZT1Po%2L2IY2y9Unru8rgnW4qWfPiI0EpT7P8xjJV8PeQikRL%2E8D9A4pj9tmbymbQCQwGmGjPMK7qwXFPX4DH52JO2b7n6TXjuR7zkIFuYdzdY2rwRNBPgCL7ihclEm9zyIjKZQ%2JTqiwfXxWjnI0KEYQYHdwd6Q%2Fx%28BDLNsvmL54CCY2F4RWeRs4eqWfn%2EHqxlhreFzax4AiQ2tgOtV5thOaaWqrhZD%2Py70nuyZWNTKwciGI43AoHg6PThANsQ5rAY5amzN%2ufbs1swETUXlLZuOut5YGpYPZfY6STJWNp4QYSUOUXBZpdElYsH7UHZ7VhJycgyt%28aTK0GW6GbKne2tJM0hgSczOqndg6RFa9WsnSBi4zMcaEfYur4WlSsHDYInF9ROousKqVMZ6H8%2gbUissaLh1eXRGo8KEJbyEHbhVVKGD%28kx4cfKjx9fT3pkeDTdvDrVn25jIzi9wHyt9l1lWc8ICnCvXCVUPP%2BjBG4wILR29gMV9Ux2QOieQm2%2Fycybhr8sBGCl30mHC7blvWt%2T3mrCHQoS3VK49PZNPqBZO9C7vOjOWoszNkJx4QckWV%2FZFvbpzUUkiBiehr9F%2FvQSxz9lzv68GwbTu9fr638p%2FQM%3D&RelayState=https%3A%2F%example.bug.com%2Fde&SigAlg=http%3A%2F%2Fwww.w3.org%2F2000%2F09%2Fxmldsig%23rsa-sha1&Signature=AZFpkhFFII7PodiewTovaGnLQKUVZp0qOCCcBIUkJ6P5by3lE3Lldj9pKaFu4wz4j%2B015HEhDvF0LlAmwwES85vdGh%2FpD%2cIQPRUEjdCbQkQDd3dy1mMXbpXxSe4QYcv9Ni7tqNTQxekpO1gE7rtg6zC66EU55uM9aj9abGQ034Vly%2F6IJ08bvAq%2B%2FB9KruLstuiNWnlXTfNGsOxGLK7%2BXr94LTkat8m%2FMan6Qr95%2KeR5TmmqaQIE4N9H6o4TopT7mXr5CF2Z3"; // Create 200 documents with a long text @@ -1953,60 +1528,22 @@ mod tests { serde_json::Value::Object(object) => Some(object), _ => None, }); - - let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new()); - for object in documents_iter { - builder.append_json_object(&object).unwrap(); - } - let vector = builder.into_inner().unwrap(); - crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap() + documents_batch_reader_from_objects(documents_iter) }; - // Index those 200 long documents - let mut wtxn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + index.add_documents(content).unwrap(); - // Create one long document - let content = documents!([ - {"id": 400, "script": script }, - ]); - - // Index this 
one long document - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); + // Index one long document + index + .add_documents(documents!([ + {"id": 400, "script": script }, + ])) + .unwrap(); } #[test] fn index_documents_in_multiple_transforms() { - let tmp = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(4096 * 100); - let index = Index::new(options, tmp).unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let indexer_config = IndexerConfig::default(); - let builder = IndexDocuments::new( - &mut wtxn, - &index, - &indexer_config, - IndexDocumentsConfig::default(), - |_| (), - ) - .unwrap(); + let index = TempIndex::new(); let doc1 = documents! {[{ "id": 228142, @@ -2028,12 +1565,10 @@ mod tests { "branch_id_number": 0 }]}; - let (builder, user_error) = builder.add_documents(doc1).unwrap(); - user_error.unwrap(); - let (builder, user_error) = builder.add_documents(doc2).unwrap(); - user_error.unwrap(); + index.add_documents(doc1).unwrap(); + index.add_documents(doc2).unwrap(); - builder.execute().unwrap(); + let wtxn = index.read_txn().unwrap(); let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map(); let ids = map.values().collect::<HashSet<_>>(); @@ -2043,10 +1578,7 @@ mod tests { #[test] fn index_documents_check_exists_database() { - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let faceted_fields = hashset!(S("colour")); let content = || { documents!([ { "id": 0, "colour": 0, }, { "id": 1, "colour": [] }, { "id": 2, "colour": {} }, { "id": 3, "colour": null }, { "id": 4, "colour": [] }, { "id": 5 }, { "id": 6, "colour": { "green": 0 } }, { "id": 7, "colour": { "green": { "blue": [] } } } ]) }; let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); @@ -2132,34 +1640,27 @@ mod tests { .unwrap(); assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![6, 7]); }; + + let faceted_fields = hashset!(S("colour")); - let index = make_index(); - add_documents(&index); - set_filterable_fields(&index); + let index = TempIndex::new(); + index.add_documents(content()).unwrap(); + index.update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }).unwrap(); check_ok(&index); - let index = make_index(); - set_filterable_fields(&index); - add_documents(&index); + let index = TempIndex::new(); + index.update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }).unwrap(); + index.add_documents(content()).unwrap(); check_ok(&index); } #[test] fn primary_key_must_not_contain_floats() { - let 
tmp = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(4096 * 100); - let index = Index::new(options, tmp).unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let indexer_config = IndexerConfig::default(); - let builder = IndexDocuments::new( - &mut wtxn, - &index, - &indexer_config, - IndexDocumentsConfig::default(), - |_| (), - ) - .unwrap(); + let index = TempIndex::new_with_map_size(4096 * 100); let doc1 = documents! {[{ "id": -228142, @@ -2181,32 +1682,15 @@ mod tests { "title": "something", }]}; - let (builder, user_error) = builder.add_documents(doc1).unwrap(); - user_error.unwrap(); - let (builder, user_error) = builder.add_documents(doc2).unwrap(); - assert!(user_error.is_err()); - let (builder, user_error) = builder.add_documents(doc3).unwrap(); - assert!(user_error.is_err()); - let (_builder, user_error) = builder.add_documents(doc4).unwrap(); - assert!(user_error.is_err()); + index.add_documents(doc1).unwrap(); + index.add_documents(doc2).unwrap_err(); + index.add_documents(doc3).unwrap_err(); + index.add_documents(doc4).unwrap_err(); } #[test] fn primary_key_must_not_contain_whitespace() { - let tmp = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(4096 * 100); - let index = Index::new(options, tmp).unwrap(); - let mut wtxn = index.write_txn().unwrap(); - let indexer_config = IndexerConfig::default(); - let builder = IndexDocuments::new( - &mut wtxn, - &index, - &indexer_config, - IndexDocumentsConfig::default(), - |_| (), - ) - .unwrap(); + let index = TempIndex::new(); let doc1 = documents! {[{ "id": " 1", @@ -2228,13 +1712,9 @@ mod tests { "title": "something", }]}; - let (builder, user_error) = builder.add_documents(doc1).unwrap(); - assert!(user_error.is_err()); - let (builder, user_error) = builder.add_documents(doc2).unwrap(); - assert!(user_error.is_err()); - let (builder, user_error) = builder.add_documents(doc3).unwrap(); - assert!(user_error.is_err()); - let (_builder, user_error) = builder.add_documents(doc4).unwrap(); - assert!(user_error.is_err()); + index.add_documents(doc1).unwrap_err(); + index.add_documents(doc2).unwrap_err(); + index.add_documents(doc3).unwrap_err(); + index.add_documents(doc4).unwrap_err(); } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5f39579b7..0f611572e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -709,45 +709,38 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { mod tests { use big_s::S; use heed::types::ByteSlice; - use heed::EnvOpenOptions; use maplit::{btreeset, hashmap, hashset}; use super::*; use crate::error::Error; use crate::index::tests::TempIndex; - use crate::update::IndexDocuments; use crate::{Criterion, Filter, SearchResult}; #[test] fn set_and_reset_searchable_fields() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); // First we send 3 documents with ids from 1 to 3. 
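`update_settings` and `update_settings_using_wtxn`, used throughout the rewritten settings tests from here on, take a closure over the `Settings` builder so a test can set several settings and still execute and commit exactly once. A sketch under the same assumptions as the other `TempIndex` helpers:

```rust
use crate::update::Settings;

impl TempIndex {
    /// Apply a settings update inside a caller-provided write transaction.
    pub fn update_settings_using_wtxn(
        &self,
        wtxn: &mut heed::RwTxn,
        update: impl Fn(&mut Settings),
    ) -> crate::Result<()> {
        let mut settings = Settings::new(wtxn, self, &self.indexer_config);
        update(&mut settings);
        settings.execute(|_| ())?;
        Ok(())
    }

    /// Apply a settings update in its own write transaction and commit it.
    pub fn update_settings(&self, update: impl Fn(&mut Settings)) -> crate::Result<()> {
        let mut wtxn = self.write_txn()?;
        self.update_settings_using_wtxn(&mut wtxn, update)?;
        wtxn.commit()?;
        Ok(())
    }
}
```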
let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "id": 1, "name": "kevin", "age": 23 }, - { "id": 2, "name": "kevina", "age": 21}, - { "id": 3, "name": "benoit", "age": 34 } - ]); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 1, "name": "kevin", "age": 23 }, + { "id": 2, "name": "kevina", "age": 21}, + { "id": 3, "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); // We change the searchable fields to be the "name" field only. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_searchable_fields(vec!["name".into()]); - builder.execute(|_| ()).unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_searchable_fields(vec!["name".into()]); + }) + .unwrap(); + wtxn.commit().unwrap(); // Check that the searchable field is correctly set to "name" only. @@ -766,11 +759,11 @@ mod tests { drop(rtxn); // We change the searchable fields to be the "name" field only. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.reset_searchable_fields(); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.reset_searchable_fields(); + }) + .unwrap(); // Check that the searchable field have been reset and documents are found now. let rtxn = index.read_txn().unwrap(); @@ -784,36 +777,30 @@ mod tests { #[test] fn mixup_searchable_with_displayed_fields() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; - // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "name": "kevin", "age": 23}, - { "name": "kevina", "age": 21 }, - { "name": "benoit", "age": 34 } - ]); - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + // First we send 3 documents with ids from 1 to 3. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); // In the same transaction we change the displayed fields to be only the "age". // We also change the searchable fields to be the "name" field only. 
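This test threads one `wtxn` through both the document addition and the settings change, which is what the `*_using_wtxn` variants are for: neither write is visible to readers until the single commit. Reduced to a hypothetical standalone example:

```rust
let index = TempIndex::new();
let mut wtxn = index.write_txn().unwrap();

index
    .add_documents_using_wtxn(&mut wtxn, documents!([{ "id": 1, "name": "kevin" }]))
    .unwrap();
index
    .update_settings_using_wtxn(&mut wtxn, |settings| {
        settings.set_searchable_fields(vec!["name".into()]);
    })
    .unwrap();

// Both writes become visible atomically here.
wtxn.commit().unwrap();
```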
- let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_displayed_fields(vec!["age".into()]); - builder.set_searchable_fields(vec!["name".into()]); - builder.execute(|_| ()).unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_displayed_fields(vec!["age".into()]); + settings.set_searchable_fields(vec!["name".into()]); + }) + .unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to `None` (default value). @@ -823,11 +810,11 @@ mod tests { drop(rtxn); // We change the searchable fields to be the "name" field only. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.reset_searchable_fields(); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.reset_searchable_fields(); + }) + .unwrap(); // Check that the displayed fields always contains only the "age" field. let rtxn = index.read_txn().unwrap(); @@ -837,28 +824,17 @@ mod tests { #[test] fn default_displayed_fields() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; // First we send 3 documents with ids from 1 to 3. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "name": "kevin", "age": 23}, - { "name": "kevina", "age": 21 }, - { "name": "benoit", "age": 34 } - ]); - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ])) + .unwrap(); // Check that the displayed fields are correctly set to `None` (default value). let rtxn = index.read_txn().unwrap(); @@ -868,32 +844,25 @@ mod tests { #[test] fn set_and_reset_displayed_field() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; - // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "name": "kevin", "age": 23}, - { "name": "kevina", "age": 21 }, - { "name": "benoit", "age": 34 } - ]); - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - - // In the same transaction we change the displayed fields to be only the age. 
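Note that these settings tests index documents that deliberately lack an "id" field, which is why they set `autogenerate_docids` on the `TempIndex` first; with the default configuration the same batch would be rejected because no primary key can be inferred. A hypothetical minimal illustration:

```rust
let mut index = TempIndex::new();

// No "id" field anywhere: with the default config this is a user error...
index.add_documents(documents!([{ "name": "kevin", "age": 23 }])).unwrap_err();

// ...but the same batch indexes fine once docids are autogenerated.
index.index_documents_config.autogenerate_docids = true;
index.add_documents(documents!([{ "name": "kevin", "age": 23 }])).unwrap();
```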
- let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_displayed_fields(vec!["age".into()]); - builder.execute(|_| ()).unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_displayed_fields(vec!["age".into()]); + }) + .unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set to only the "age" field. @@ -903,11 +872,11 @@ mod tests { drop(rtxn); // We reset the fields ids to become `None`, the default value. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.reset_displayed_fields(); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.reset_displayed_fields(); + }) + .unwrap(); // Check that the displayed fields are correctly set to `None` (default value). let rtxn = index.read_txn().unwrap(); @@ -917,34 +886,24 @@ mod tests { #[test] fn set_filterable_fields() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let config = IndexerConfig::default(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; // Set the filterable fields to be the age. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_filterable_fields(hashset! { S("age") }); - builder.execute(|_| ()).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset! { S("age") }); + }) + .unwrap(); // Then index some documents. - let content = documents!([ - { "name": "kevin", "age": 23}, - { "name": "kevina", "age": 21 }, - { "name": "benoit", "age": 34 } - ]); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ])) + .unwrap(); // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); @@ -970,22 +929,13 @@ mod tests { drop(rtxn); // Index a little more documents with new and current facets values. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "name": "kevin2", "age": 23}, - { "name": "kevina2", "age": 21 }, - { "name": "benoit", "age": 35 } - ]); - - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "name": "kevin2", "age": 23}, + { "name": "kevina2", "age": 21 }, + { "name": "benoit", "age": 35 } + ])) + .unwrap(); let rtxn = index.read_txn().unwrap(); // Only count the field_id 0 and level 0 facet values. 
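Declaring a field filterable is what populates the facet databases counted below, and it is also what permits a `Filter` on that field at search time. A hypothetical check in the same spirit (the `age = 21` condition and the expected two hits, kevina and kevina2, are assumptions, not part of this test):

```rust
let rtxn = index.read_txn().unwrap();
let filter = Filter::from_str("age = 21").unwrap().unwrap();
let result = index.search(&rtxn).filter(filter).execute().unwrap();
// Of the six documents indexed above, kevina and kevina2 are both 21.
assert_eq!(result.documents_ids.len(), 2);
```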
@@ -1000,35 +950,25 @@ mod tests { #[test] fn set_asc_desc_field() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; // Set the filterable fields to be the age. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - // Don't display the generated `id` field. - builder.set_displayed_fields(vec![S("name")]); - builder.set_criteria(vec![S("age:asc")]); - builder.execute(|_| ()).unwrap(); + index + .update_settings(|settings| { + settings.set_displayed_fields(vec![S("name")]); + settings.set_criteria(vec![S("age:asc")]); + }) + .unwrap(); // Then index some documents. - let content = documents!([ - { "name": "kevin", "age": 23}, - { "name": "kevina", "age": 21 }, - { "name": "benoit", "age": 34 } - ]); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ])) + .unwrap(); // Run an empty query just to ensure that the search results are ordered. let rtxn = index.read_txn().unwrap(); @@ -1048,39 +988,30 @@ mod tests { #[test] fn set_distinct_field() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; // Set the filterable fields to be the age. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - // Don't display the generated `id` field. - builder.set_displayed_fields(vec![S("name"), S("age")]); - builder.set_distinct_field(S("age")); - builder.execute(|_| ()).unwrap(); + index + .update_settings(|settings| { + // Don't display the generated `id` field. + settings.set_displayed_fields(vec![S("name"), S("age")]); + settings.set_distinct_field(S("age")); + }) + .unwrap(); // Then index some documents. 
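With `age` as the distinct field, the seven documents indexed below share only three distinct ages (23, 21, and 34), so an empty search should collapse to three results. A hypothetical spelled-out version of the check this test performs:

```rust
let rtxn = index.read_txn().unwrap();
let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap();
// 7 documents, but only 3 distinct ages.
assert_eq!(documents_ids.len(), 3);
```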
- let content = documents!([ - { "name": "kevin", "age": 23 }, - { "name": "kevina", "age": 21 }, - { "name": "benoit", "age": 34 }, - { "name": "bernard", "age": 34 }, - { "name": "bertrand", "age": 34 }, - { "name": "bernie", "age": 34 }, - { "name": "ben", "age": 34 } - ]); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "name": "kevin", "age": 23 }, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 }, + { "name": "bernard", "age": 34 }, + { "name": "bertrand", "age": 34 }, + { "name": "bernie", "age": 34 }, + { "name": "ben", "age": 34 } + ])) + .unwrap(); // Run an empty query just to ensure that the search results are ordered. let rtxn = index.read_txn().unwrap(); @@ -1092,39 +1023,30 @@ mod tests { #[test] fn set_nested_distinct_field() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; // Set the filterable fields to be the age. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - // Don't display the generated `id` field. - builder.set_displayed_fields(vec![S("person")]); - builder.set_distinct_field(S("person.age")); - builder.execute(|_| ()).unwrap(); + index + .update_settings(|settings| { + // Don't display the generated `id` field. + settings.set_displayed_fields(vec![S("person")]); + settings.set_distinct_field(S("person.age")); + }) + .unwrap(); // Then index some documents. - let content = documents!([ - { "person": { "name": "kevin", "age": 23 }}, - { "person": { "name": "kevina", "age": 21 }}, - { "person": { "name": "benoit", "age": 34 }}, - { "person": { "name": "bernard", "age": 34 }}, - { "person": { "name": "bertrand", "age": 34 }}, - { "person": { "name": "bernie", "age": 34 }}, - { "person": { "name": "ben", "age": 34 }} - ]); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "person": { "name": "kevin", "age": 23 }}, + { "person": { "name": "kevina", "age": 21 }}, + { "person": { "name": "benoit", "age": 34 }}, + { "person": { "name": "bernard", "age": 34 }}, + { "person": { "name": "bertrand", "age": 34 }}, + { "person": { "name": "bernie", "age": 34 }}, + { "person": { "name": "ben", "age": 34 }} + ])) + .unwrap(); // Run an empty query just to ensure that the search results are ordered. 
let rtxn = index.read_txn().unwrap(); @@ -1136,28 +1058,17 @@ mod tests { #[test] fn default_stop_words() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; // First we send 3 documents with ids from 1 to 3. - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "name": "kevin", "age": 23}, - { "name": "kevina", "age": 21 }, - { "name": "benoit", "age": 34 } - ]); - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + index + .add_documents(documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ])) + .unwrap(); // Ensure there is no stop_words by default let rtxn = index.read_txn().unwrap(); @@ -1167,33 +1078,30 @@ mod tests { #[test] fn set_and_reset_stop_words() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; - // First we send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "name": "kevin", "age": 23, "maxim": "I love dogs" }, - { "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, - { "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, - ]); - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + // First we send 3 documents with ids from 1 to 3. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "kevin", "age": 23, "maxim": "I love dogs" }, + { "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, + { "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, + ]), + ) + .unwrap(); // In the same transaction we provide some stop_words - let mut builder = Settings::new(&mut wtxn, &index, &config); let set = btreeset! 
{ "i".to_string(), "the".to_string(), "are".to_string() }; - builder.set_stop_words(set.clone()); - builder.execute(|_| ()).unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_stop_words(set.clone()); + }) + .unwrap(); + wtxn.commit().unwrap(); // Ensure stop_words are effectively stored @@ -1220,11 +1128,11 @@ mod tests { assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data // now we'll reset the stop_words and ensure it's None - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.reset_stop_words(); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.reset_stop_words(); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); let stop_words = index.stop_words(&rtxn).unwrap(); @@ -1247,36 +1155,32 @@ mod tests { #[test] fn set_and_reset_synonyms() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; - // Send 3 documents with ids from 1 to 3. let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { "name": "kevin", "age": 23, "maxim": "I love dogs"}, - { "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, - { "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, - ]); - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + // Send 3 documents with ids from 1 to 3. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "kevin", "age": 23, "maxim": "I love dogs"}, + { "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, + { "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, + ]), + ) + .unwrap(); // In the same transaction provide some synonyms - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_synonyms(hashmap! { - "blini".to_string() => vec!["crepes".to_string()], - "super like".to_string() => vec!["love".to_string()], - "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()] - }); - builder.execute(|_| ()).unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_synonyms(hashmap! 
{ + "blini".to_string() => vec!["crepes".to_string()], + "super like".to_string() => vec!["love".to_string()], + "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()] + }); + }) + .unwrap(); wtxn.commit().unwrap(); // Ensure synonyms are effectively stored @@ -1293,11 +1197,11 @@ mod tests { assert_eq!(result.documents_ids.len(), 2); // Reset the synonyms - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.reset_synonyms(); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.reset_synonyms(); + }) + .unwrap(); // Ensure synonyms are reset let rtxn = index.read_txn().unwrap(); @@ -1315,20 +1219,16 @@ mod tests { #[test] fn setting_searchable_recomputes_other_settings() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let index = TempIndex::new(); // Set all the settings except searchable - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_displayed_fields(vec!["hello".to_string()]); - builder.set_filterable_fields(hashset! { S("age"), S("toto") }); - builder.set_criteria(vec!["toto:asc".to_string()]); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_displayed_fields(vec!["hello".to_string()]); + settings.set_filterable_fields(hashset! { S("age"), S("toto") }); + settings.set_criteria(vec!["toto:asc".to_string()]); + }) + .unwrap(); // check the output let rtxn = index.read_txn().unwrap(); @@ -1339,11 +1239,11 @@ mod tests { drop(rtxn); // We set toto and age as searchable to force reordering of the fields - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap()); @@ -1353,20 +1253,16 @@ mod tests { #[test] fn setting_not_filterable_cant_filter() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let index = TempIndex::new(); // Set all the settings except searchable - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_displayed_fields(vec!["hello".to_string()]); - // It is only Asc(toto), there is a facet database but it is denied to filter with toto. - builder.set_criteria(vec!["toto:asc".to_string()]); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_displayed_fields(vec!["hello".to_string()]); + // It is only Asc(toto), there is a facet database but it is denied to filter with toto. 
+ settings.set_criteria(vec!["toto:asc".to_string()]); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); let filter = Filter::from_str("toto = 32").unwrap().unwrap(); @@ -1375,76 +1271,71 @@ mod tests { #[test] fn setting_primary_key() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; - // Set the primary key settings let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(S("mykey")); - - builder.execute(|_| ()).unwrap(); + // Set the primary key settings + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("mykey")); + }) + .unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); // Then index some documents with the "mykey" primary key. - let content = documents!([ - { "mykey": 1, "name": "kevin", "age": 23 }, - { "mykey": 2, "name": "kevina", "age": 21 }, - { "mykey": 3, "name": "benoit", "age": 34 }, - { "mykey": 4, "name": "bernard", "age": 34 }, - { "mykey": 5, "name": "bertrand", "age": 34 }, - { "mykey": 6, "name": "bernie", "age": 34 }, - { "mykey": 7, "name": "ben", "age": 34 } - ]); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "mykey": 1, "name": "kevin", "age": 23 }, + { "mykey": 2, "name": "kevina", "age": 21 }, + { "mykey": 3, "name": "benoit", "age": 34 }, + { "mykey": 4, "name": "bernard", "age": 34 }, + { "mykey": 5, "name": "bertrand", "age": 34 }, + { "mykey": 6, "name": "bernie", "age": 34 }, + { "mykey": 7, "name": "ben", "age": 34 } + ]), + ) + .unwrap(); wtxn.commit().unwrap(); - // We now try to reset the primary key let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.reset_primary_key(); - - let err = builder.execute(|_| ()).unwrap_err(); - assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); + let error = index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.reset_primary_key(); + }) + .unwrap_err(); + assert!(matches!(error, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); wtxn.abort().unwrap(); // But if we clear the database... 
let mut wtxn = index.write_txn().unwrap(); let builder = ClearDocuments::new(&mut wtxn, &index); builder.execute().unwrap(); + wtxn.commit().unwrap(); // ...we can change the primary key - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(S("myid")); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_primary_key(S("myid")); + }) + .unwrap(); } #[test] fn setting_impact_relevancy() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; // Set the genres setting - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_filterable_fields(hashset! { S("genres") }); - builder.execute(|_| ()).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset! { S("genres") }); + }) + .unwrap(); - let content = documents!([ + index.add_documents(documents!([ { "id": 11, "title": "Star Wars", @@ -1462,18 +1353,8 @@ mod tests { "poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg", "release_date": 819676800 } - ]); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); + ])).unwrap(); - // We now try to reset the primary key let rtxn = index.read_txn().unwrap(); let SearchResult { documents_ids, .. 
} = index.search(&rtxn).query("S").execute().unwrap(); let first_id = documents_ids[0]; @@ -1490,45 +1371,41 @@ let index = TempIndex::new(); let mut txn = index.write_txn().unwrap(); - let config = IndexerConfig::default(); - assert!(index.authorize_typos(&txn).unwrap()); - let mut builder = Settings::new(&mut txn, &index, &config); - builder.set_autorize_typos(false); - builder.execute(|_| ()).unwrap(); + + index + .update_settings_using_wtxn(&mut txn, |settings| { + settings.set_autorize_typos(false); + }) + .unwrap(); + assert!(!index.authorize_typos(&txn).unwrap()); } #[test] fn update_min_word_len_for_typo() { let index = TempIndex::new(); - let config = IndexerConfig::default(); // Set the minimum word lengths for typos - let mut txn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut txn, &index, &config); - builder.set_min_word_len_one_typo(8); - builder.set_min_word_len_two_typos(8); - builder.execute(|_| ()).unwrap(); - - txn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_min_word_len_one_typo(8); + settings.set_min_word_len_two_typos(8); + }) + .unwrap(); let txn = index.read_txn().unwrap(); - assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8); assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8); - let mut txn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut txn, &index, &config); - - builder.reset_min_word_len_one_typo(); - builder.reset_min_word_len_two_typos(); - builder.execute(|_| ()).unwrap(); - - txn.commit().unwrap(); + index + .update_settings(|settings| { + settings.reset_min_word_len_one_typo(); + settings.reset_min_word_len_two_typos(); + }) + .unwrap(); let txn = index.read_txn().unwrap(); - assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO); assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS); } @@ -1536,28 +1413,29 @@ mod tests { #[test] fn update_invalid_min_word_len_for_typo() { let index = TempIndex::new(); - let config = IndexerConfig::default(); // Set invalid minimum word lengths for typos - let mut txn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut txn, &index, &config); - builder.set_min_word_len_one_typo(10); - builder.set_min_word_len_two_typos(7); - assert!(builder.execute(|_| ()).is_err()); + index + .update_settings(|settings| { + settings.set_min_word_len_one_typo(10); + settings.set_min_word_len_two_typos(7); + }) + .unwrap_err(); } #[test] fn update_exact_words_normalization() { let index = TempIndex::new(); - let config = IndexerConfig::default(); - // Set the genres setting let mut txn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut txn, &index, &config); + // Set the exact words setting + index + .update_settings_using_wtxn(&mut txn, |settings| { + let words = btreeset! { S("Ab"), S("ac") }; + settings.set_exact_words(words); + }) + .unwrap(); - let words = btreeset!
{ S("Ab"), S("ac") }; - builder.set_exact_words(words); - assert!(builder.execute(|_| ()).is_ok()); let exact_words = index.exact_words(&txn).unwrap().unwrap(); for word in exact_words.into_fst().stream().into_str_vec().unwrap() { assert!(word.0 == "ac" || word.0 == "ab"); @@ -1567,47 +1445,48 @@ mod tests { #[test] fn test_correct_settings_init() { let index = TempIndex::new(); - let config = IndexerConfig::default(); - let mut txn = index.write_txn().unwrap(); - let builder = Settings::new(&mut txn, &index, &config); - let Settings { - wtxn: _, - index: _, - indexer_config: _, - searchable_fields, - displayed_fields, - filterable_fields, - sortable_fields, - criteria, - stop_words, - distinct_field, - synonyms, - primary_key, - authorize_typos, - min_word_len_two_typos, - min_word_len_one_typo, - exact_words, - exact_attributes, - max_values_per_facet, - pagination_max_total_hits, - } = builder; - - assert!(matches!(searchable_fields, Setting::NotSet)); - assert!(matches!(displayed_fields, Setting::NotSet)); - assert!(matches!(filterable_fields, Setting::NotSet)); - assert!(matches!(sortable_fields, Setting::NotSet)); - assert!(matches!(criteria, Setting::NotSet)); - assert!(matches!(stop_words, Setting::NotSet)); - assert!(matches!(distinct_field, Setting::NotSet)); - assert!(matches!(synonyms, Setting::NotSet)); - assert!(matches!(primary_key, Setting::NotSet)); - assert!(matches!(authorize_typos, Setting::NotSet)); - assert!(matches!(min_word_len_two_typos, Setting::NotSet)); - assert!(matches!(min_word_len_one_typo, Setting::NotSet)); - assert!(matches!(exact_words, Setting::NotSet)); - assert!(matches!(exact_attributes, Setting::NotSet)); - assert!(matches!(max_values_per_facet, Setting::NotSet)); - assert!(matches!(pagination_max_total_hits, Setting::NotSet)); + index + .update_settings(|settings| { + // we don't actually update the settings, just check their content + let Settings { + wtxn: _, + index: _, + indexer_config: _, + searchable_fields, + displayed_fields, + filterable_fields, + sortable_fields, + criteria, + stop_words, + distinct_field, + synonyms, + primary_key, + authorize_typos, + min_word_len_two_typos, + min_word_len_one_typo, + exact_words, + exact_attributes, + max_values_per_facet, + pagination_max_total_hits, + } = settings; + assert!(matches!(searchable_fields, Setting::NotSet)); + assert!(matches!(displayed_fields, Setting::NotSet)); + assert!(matches!(filterable_fields, Setting::NotSet)); + assert!(matches!(sortable_fields, Setting::NotSet)); + assert!(matches!(criteria, Setting::NotSet)); + assert!(matches!(stop_words, Setting::NotSet)); + assert!(matches!(distinct_field, Setting::NotSet)); + assert!(matches!(synonyms, Setting::NotSet)); + assert!(matches!(primary_key, Setting::NotSet)); + assert!(matches!(authorize_typos, Setting::NotSet)); + assert!(matches!(min_word_len_two_typos, Setting::NotSet)); + assert!(matches!(min_word_len_one_typo, Setting::NotSet)); + assert!(matches!(exact_words, Setting::NotSet)); + assert!(matches!(exact_attributes, Setting::NotSet)); + assert!(matches!(max_values_per_facet, Setting::NotSet)); + assert!(matches!(pagination_max_total_hits, Setting::NotSet)); + }) + .unwrap(); } } From 58cb1c1bda62696ffcc7c4750efc07afeae8410a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 4 Aug 2022 10:46:10 +0200 Subject: [PATCH 1526/1889] Simplify unit tests in facet/filter.rs --- milli/src/search/facet/filter.rs | 138 +++++++++--------------- milli/src/update/index_documents/mod.rs | 19 ++-- 2 files changed, 60 
insertions(+), 97 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 19e86bc91..225d3ea8d 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -494,28 +494,21 @@ mod tests { use big_s::S; use either::Either; - use heed::EnvOpenOptions; use maplit::hashset; - use super::*; - use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; - use crate::Index; + use crate::index::tests::TempIndex; + use crate::Filter; #[test] fn empty_db() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - + let index = TempIndex::new(); // Set the filterable fields to be the channel. - let config = IndexerConfig::default(); - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_searchable_fields(vec![S("PrIcE")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("PrIcE") }); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("PrIcE")]); // to keep the fields order + settings.set_filterable_fields(hashset! { S("PrIcE") }); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -592,10 +585,7 @@ mod tests { #[test] fn not_filterable() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); let rtxn = index.read_txn().unwrap(); let filter = Filter::from_str("_geoRadius(42, 150, 10)").unwrap().unwrap(); @@ -611,14 +601,12 @@ mod tests { )); drop(rtxn); - let config = IndexerConfig::default(); - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_searchable_fields(vec![S("title")]); - builder.set_filterable_fields(hashset! { S("title") }); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("title")]); + settings.set_filterable_fields(hashset! 
{ S("title") }); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); @@ -637,92 +625,64 @@ mod tests { #[test] fn escaped_quote_in_filter_value_2380() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "id": "test_1", - "monitor_diagonal": "27' to 30'" - }, - { - "id": "test_2", - "monitor_diagonal": "27\" to 30\"" - }, - { - "id": "test_3", - "monitor_diagonal": "27\" to 30'" - }, - ]); + index + .add_documents(documents!([ + { + "id": "test_1", + "monitor_diagonal": "27' to 30'" + }, + { + "id": "test_2", + "monitor_diagonal": "27\" to 30\"" + }, + { + "id": "test_3", + "monitor_diagonal": "27\" to 30'" + }, + ])) + .unwrap(); - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - let (builder, user_error) = builder.add_documents(content).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - builder.set_filterable_fields(hashset!(S("monitor_diagonal"))); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(S("monitor_diagonal"))); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); let mut search = crate::Search::new(&rtxn, &index); // this filter is copy pasted from #2380 with the exact same espace sequence - search.filter( - crate::Filter::from_str("monitor_diagonal = '27\" to 30\\''").unwrap().unwrap(), - ); + search.filter(Filter::from_str("monitor_diagonal = '27\" to 30\\''").unwrap().unwrap()); let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); assert_eq!(documents_ids, vec![2]); - search.filter( - crate::Filter::from_str(r#"monitor_diagonal = "27' to 30'" "#).unwrap().unwrap(), - ); + search.filter(Filter::from_str(r#"monitor_diagonal = "27' to 30'" "#).unwrap().unwrap()); let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); assert_eq!(documents_ids, vec![0]); - search.filter( - crate::Filter::from_str(r#"monitor_diagonal = "27\" to 30\"" "#).unwrap().unwrap(), - ); + search.filter(Filter::from_str(r#"monitor_diagonal = "27\" to 30\"" "#).unwrap().unwrap()); let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); assert_eq!(documents_ids, vec![1]); - search.filter( - crate::Filter::from_str(r#"monitor_diagonal = "27\" to 30'" "#).unwrap().unwrap(), - ); + search.filter(Filter::from_str(r#"monitor_diagonal = "27\" to 30'" "#).unwrap().unwrap()); let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); assert_eq!(documents_ids, vec![2]); } #[test] fn geo_radius_error() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); + let index = TempIndex::new(); - let config = IndexerConfig::default(); - // Set the filterable fields to be the channel. 
- let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + settings.set_filterable_fields(hashset! { S("_geo"), S("price") }); + }) + .unwrap(); let rtxn = index.read_txn().unwrap(); - // georadius have a bad latitude let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b0cae600f..114903e39 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1578,7 +1578,6 @@ mod tests { #[test] fn index_documents_check_exists_database() { - let content = || { documents!([ { @@ -1640,20 +1639,24 @@ mod tests { .unwrap(); assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); }; - + let faceted_fields = hashset!(S("colour")); let index = TempIndex::new(); index.add_documents(content()).unwrap(); - index.update_settings(|settings| { - settings.set_filterable_fields(faceted_fields.clone()); - }).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); check_ok(&index); let index = TempIndex::new(); - index.update_settings(|settings| { - settings.set_filterable_fields(faceted_fields.clone()); - }).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); index.add_documents(content()).unwrap(); check_ok(&index); } From b389be48a02c5fe2de20590466077119df82a227 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 8 Aug 2022 10:37:31 +0200 Subject: [PATCH 1527/1889] Factorize phrase computation --- milli/src/search/criteria/mod.rs | 76 +++++++++++++------------- milli/src/search/criteria/proximity.rs | 39 +------------ 2 files changed, 42 insertions(+), 73 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 4613acb4f..ae9e0c218 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -326,43 +326,7 @@ pub fn resolve_query_tree<'t>( } Ok(candidates) } - Phrase(words) => { - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; - let winsize = words.len().min(7); - - for win in words.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { - match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. 
- bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } - Ok(candidates) - } + Phrase(words) => resolve_phrase(ctx, &words), Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { @@ -378,6 +342,44 @@ resolve_operation(ctx, query_tree, wdcache) } +pub fn resolve_phrase<'t>(ctx: &'t dyn Context, phrase: &[String]) -> Result<RoaringBitmap> { + let mut candidates = RoaringBitmap::new(); + let mut first_iter = true; + let winsize = phrase.len().min(7); + + for win in phrase.windows(winsize) { + // Get all the documents with the matching distance for each word pair. + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + for (offset, s1) in win.iter().enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmaps.push(m), + // If there are no documents for this distance, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. + bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { + break; + } + } + } + Ok(candidates) +} + fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>( ctx: &dyn Context, left_words: &[(T, u8)], diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 30919585b..e942a7bef 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -6,8 +6,8 @@ use log::debug; use roaring::RoaringBitmap; use super::{ - query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion, - CriterionParameters, CriterionResult, + query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, + Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; use crate::search::{build_dfa, WordDerivationsCache}; @@ -192,42 +192,9 @@ fn resolve_candidates<'t>( let most_right = words .last() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; - let winsize = words.len().min(7); - for win in words.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { - match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(Default::default()), - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster.
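// Worked example of the pair/distance enumeration in resolve_phrase above,
// assuming the phrase ["new", "york", "city"]: winsize is 3, there is a
// single window, and the nested loops request
//     word_pair_proximity_docids("new",  "york", 1)
//     word_pair_proximity_docids("new",  "city", 2)
//     word_pair_proximity_docids("york", "city", 1)
// The phrase candidates are the intersection of those three bitmaps,
// intersected smallest-first so the loop can break early once it is empty.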
- bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len())); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } match (most_left, most_right) { - (Some(l), Some(r)) => vec![(l, r, candidates)], + (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, &words)?)], _otherwise => Default::default(), } } else { From 8f73251012af42a0384d226f878870fb9f1973e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 12:31:09 +0200 Subject: [PATCH 1528/1889] Use mimalloc for benchmarks on macOS --- benchmarks/Cargo.toml | 4 ++++ benchmarks/benches/formatting.rs | 4 ++++ benchmarks/benches/indexing.rs | 4 ++++ benchmarks/benches/search_geo.rs | 4 ++++ benchmarks/benches/search_songs.rs | 4 ++++ benchmarks/benches/search_wiki.rs | 4 ++++ 6 files changed, 24 insertions(+) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 896ccd739..1dc9941c3 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -13,6 +13,10 @@ csv = "1.1.6" [target.'cfg(target_os = "linux")'.dependencies] jemallocator = "0.3.2" +[target.'cfg(target_os = "macos")'.dependencies] +mimalloc = { version = "0.1.29", default-features = false } + + [dev-dependencies] heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } criterion = { version = "0.3.5", features = ["html_reports"] } diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs index f0ef8ea15..be9d965a9 100644 --- a/benchmarks/benches/formatting.rs +++ b/benchmarks/benches/formatting.rs @@ -6,6 +6,10 @@ use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +#[cfg(target_os = "macos")] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + struct Conf<'a> { name: &'a str, text: &'a str, diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index c756583e6..d0a091298 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -18,6 +18,10 @@ use roaring::RoaringBitmap; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +#[cfg(target_os = "macos")] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + const BENCHMARK_ITERATION: usize = 10; fn setup_dir(path: impl AsRef) { diff --git a/benchmarks/benches/search_geo.rs b/benchmarks/benches/search_geo.rs index 84448c32d..65aeef01e 100644 --- a/benchmarks/benches/search_geo.rs +++ b/benchmarks/benches/search_geo.rs @@ -9,6 +9,10 @@ use utils::Conf; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +#[cfg(target_os = "macos")] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + fn base_conf(builder: &mut Settings) { let displayed_fields = ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"] diff --git a/benchmarks/benches/search_songs.rs b/benchmarks/benches/search_songs.rs index 6b11799ec..05ba39cdd 100644 --- a/benchmarks/benches/search_songs.rs +++ b/benchmarks/benches/search_songs.rs @@ -9,6 +9,10 @@ use utils::Conf; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +#[cfg(target_os = "macos")] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + fn base_conf(builder: &mut Settings) { let displayed_fields = 
["id", "title", "album", "artist", "genre", "country", "released", "duration"] diff --git a/benchmarks/benches/search_wiki.rs b/benchmarks/benches/search_wiki.rs index 9ef75efeb..20d62fba6 100644 --- a/benchmarks/benches/search_wiki.rs +++ b/benchmarks/benches/search_wiki.rs @@ -9,6 +9,10 @@ use utils::Conf; #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +#[cfg(target_os = "macos")] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + fn base_conf(builder: &mut Settings) { let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect(); builder.set_displayed_fields(displayed_fields); From 334098a7e04f31a20a37149ce869fe240381a8cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 3 Aug 2022 08:45:26 +0200 Subject: [PATCH 1529/1889] Add index snapshot test helper function --- milli/Cargo.toml | 3 + milli/src/lib.rs | 4 + milli/src/snapshot_tests.rs | 320 ++++++++++++++++++++++++++++++++++++ 3 files changed, 327 insertions(+) create mode 100644 milli/src/snapshot_tests.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 37c7b7c84..318a2604a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -51,8 +51,11 @@ csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" +insta = "1.17.1" maplit = "1.0.2" +md5 = "0.7.0" rand = "0.8.5" +regex = "1.6.0" [features] default = [] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 09cecb228..85b25cad1 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -13,6 +13,10 @@ pub mod proximity; mod search; pub mod update; +#[cfg(test)] +#[macro_use] +pub mod snapshot_tests; + use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs new file mode 100644 index 000000000..aa1d5cf27 --- /dev/null +++ b/milli/src/snapshot_tests.rs @@ -0,0 +1,320 @@ +use heed::BytesDecode; +use roaring::RoaringBitmap; +use std::path::Path; + +use crate::{ + heed_codec::facet::{ + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FacetStringZeroBoundsValueCodec, + }, + CboRoaringBitmapCodec, ExternalDocumentsIds, Index, +}; + +macro_rules! 
snapshot_index { + ($index:expr, $name:expr) => { + $crate::index::tests::snapshot_index($index, $name, None, None) + }; + ($index:expr, $name:expr, include: $regex:literal) => { + $crate::index::tests::snapshot_index( + $index, + $name, + Some(regex::Regex::new($regex).unwrap()), + None, + ) + }; + ($index:expr, $name:expr, exclude: $regex:literal) => { + $crate::index::tests::snapshot_index( + $index, + $name, + None, + Some(regex::Regex::new($regex).unwrap()), + ) + }; +} + +#[track_caller] +pub fn snapshot_index( + index: &Index, + name: &str, + include: Option, + exclude: Option, +) { + use std::fmt::Write; + + let should_snapshot = |name: &str| -> bool { + include.as_ref().map(|f| f.is_match(name)).unwrap_or(true) + && !exclude.as_ref().map(|f| f.is_match(name)).unwrap_or(false) + }; + + let mut settings = insta::Settings::clone_current(); + settings.set_prepend_module_to_snapshot(false); + let path = Path::new(std::panic::Location::caller().file()); + let path = path.strip_prefix("milli/src").unwrap(); + settings.set_omit_expression(true); + settings.set_snapshot_path(Path::new("snapshots").join(path).join(name)); + let rtxn = index.read_txn().unwrap(); + + let store_whole_snapshot = std::env::var("MILLI_TEST_FULL_SNAPS").unwrap_or("false".to_owned()); + let store_whole_snapshot: bool = store_whole_snapshot.parse().unwrap(); + + macro_rules! snapshot_db { + ($name:ident, |$vars:pat| $push:block) => { + let name_str = stringify!($name); + if should_snapshot(name_str) { + let iter = index.$name.iter(&rtxn).unwrap(); + let mut snap = String::new(); + for x in iter { + let $vars = x.unwrap(); + snap.push_str($push); + snap.push('\n'); + } + if snap.len() < 512 { + insta::assert_snapshot!(name_str, snap); + } else { + if store_whole_snapshot { + insta::assert_snapshot!(format!("{name_str}.full"), snap); + } + let hash = md5::compute(snap.as_bytes()); + let hash_str = format!("{hash:x}"); + insta::assert_snapshot!(format!("{name_str}.hash"), hash_str); + } + } + }; + } + + fn display_bitmap(b: &RoaringBitmap) -> String { + let mut s = String::new(); + s.push('['); + for x in b.into_iter() { + write!(&mut s, "{x}, ").unwrap(); + } + s.push(']'); + s + } + + settings.bind(|| { + snapshot_db!(word_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); + snapshot_db!(exact_word_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); + snapshot_db!(word_prefix_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); + snapshot_db!(exact_word_prefix_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + + snapshot_db!(docid_word_positions, |((idx, s), b)| { + &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) + }); + + snapshot_db!(word_pair_proximity_docids, |((word1, word2, proximity), b)| { + &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) + }); + + snapshot_db!(word_prefix_pair_proximity_docids, |((word1, prefix, proximity), b)| { + &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) + }); + + snapshot_db!(word_position_docids, |((word, position), b)| { + &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) + }); + + snapshot_db!(field_id_word_count_docids, |((field_id, word_count), b)| { + &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) + }); + + snapshot_db!(word_prefix_position_docids, |((word_prefix, position), b)| { + &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&b)) + }); + + snapshot_db!(facet_id_f64_docids, |((facet_id, level, left, right), b)| { + 
&format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) + }); + { + let name_str = stringify!(facet_id_string_docids); + if should_snapshot(name_str) { + let bytes_db = index.facet_id_string_docids.remap_types::(); + let iter = bytes_db.iter(&rtxn).unwrap(); + let mut snap = String::new(); + + for x in iter { + let (key, value) = x.unwrap(); + if let Some((field_id, normalized_str)) = + FacetStringLevelZeroCodec::bytes_decode(key) + { + let (orig_string, docids) = + FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); + snap.push_str(&format!( + "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", + display_bitmap(&docids) + )); + } else if let Some((field_id, level, left, right)) = + FacetLevelValueU32Codec::bytes_decode(key) + { + snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); + let (bounds, docids) = FacetStringZeroBoundsValueCodec::< + CboRoaringBitmapCodec, + >::bytes_decode(value) + .unwrap(); + if let Some((left, right)) = bounds { + snap.push_str(&format!("{left:<8} {right:<8} ")); + } + snap.push_str(&display_bitmap(&docids)); + snap.push('\n'); + } else { + panic!(); + } + } + insta::assert_snapshot!(name_str, snap); + } + } + + // Main - computed settings + { + let mut snap = String::new(); + + macro_rules! write_setting_to_snap { + ($name:ident) => { + if should_snapshot(&format!("settings.{}", stringify!($name))) { + let $name = index.$name(&rtxn).unwrap(); + writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); + } + }; + } + write_setting_to_snap!(primary_key); + write_setting_to_snap!(criteria); + write_setting_to_snap!(displayed_fields); + write_setting_to_snap!(distinct_field); + write_setting_to_snap!(filterable_fields); + write_setting_to_snap!(sortable_fields); + write_setting_to_snap!(synonyms); + write_setting_to_snap!(authorize_typos); + write_setting_to_snap!(min_word_len_one_typo); + write_setting_to_snap!(min_word_len_two_typos); + write_setting_to_snap!(exact_words); + write_setting_to_snap!(exact_attributes); + write_setting_to_snap!(max_values_per_facet); + write_setting_to_snap!(pagination_max_total_hits); + write_setting_to_snap!(searchable_fields); + write_setting_to_snap!(user_defined_searchable_fields); + + if !snap.is_empty() { + insta::assert_snapshot!("settings", snap); + } + } + // Main - others + { + macro_rules! 
snapshot_string { + ($name:ident) => { + if should_snapshot(&format!("{}", stringify!($name))) { + insta::assert_snapshot!(stringify!($name), $name); + } + }; + } + { + let documents_ids = index.documents_ids(&rtxn).unwrap(); + let documents_ids = display_bitmap(&documents_ids); + snapshot_string!(documents_ids); + } + { + let stop_words = index.stop_words(&rtxn).unwrap(); + let stop_words = format!("{stop_words:?}"); + snapshot_string!(stop_words); + } + { + let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); + let soft_deleted_documents_ids = display_bitmap(&soft_deleted_documents_ids); + snapshot_string!(soft_deleted_documents_ids); + } + + { + let mut field_distribution = String::new(); + for (field, count) in index.field_distribution(&rtxn).unwrap() { + writeln!(&mut field_distribution, "{field:<16} {count:<6}").unwrap(); + } + snapshot_string!(field_distribution); + } + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + { + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let name = fields_ids_map.name(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); + } + let fields_ids_map = snap; + snapshot_string!(fields_ids_map); + } + + { + let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); + let geo_faceted_documents_ids = display_bitmap(&geo_faceted_documents_ids); + snapshot_string!(geo_faceted_documents_ids); + } + // let geo_rtree = index.geo_rtree(&rtxn).unwrap(); + { + let ExternalDocumentsIds { soft, hard, .. } = + index.external_documents_ids(&rtxn).unwrap(); + let mut external_documents_ids = String::new(); + let soft_bytes = soft.into_fst().as_bytes().to_owned(); + let mut hex_soft = String::new(); + for byte in soft_bytes { + write!(&mut hex_soft, "{:x}", byte).unwrap(); + } + writeln!(&mut external_documents_ids, "soft: {hex_soft}").unwrap(); + let hard_bytes = hard.into_fst().as_bytes().to_owned(); + let mut hex_hard = String::new(); + for byte in hard_bytes { + write!(&mut hex_hard, "{:x}", byte).unwrap(); + } + writeln!(&mut external_documents_ids, "hard: {hex_hard}").unwrap(); + + snapshot_string!(external_documents_ids); + } + { + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let number_faceted_documents_ids = + index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); + writeln!( + &mut snap, + "{field_id:<3} {}", + display_bitmap(&number_faceted_documents_ids) + ) + .unwrap(); + } + let number_faceted_documents_ids = snap; + snapshot_string!(number_faceted_documents_ids); + } + { + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let string_faceted_documents_ids = + index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); + writeln!( + &mut snap, + "{field_id:<3} {}", + display_bitmap(&string_faceted_documents_ids) + ) + .unwrap(); + } + let string_faceted_documents_ids = snap; + snapshot_string!(string_faceted_documents_ids); + } + { + let words_fst = index.words_fst(&rtxn).unwrap(); + let bytes = words_fst.into_fst().as_bytes().to_owned(); + let mut words_fst = String::new(); + for byte in bytes { + write!(&mut words_fst, "{:x}", byte).unwrap(); + } + snapshot_string!(words_fst); + } + { + let words_prefixes_fst = index.words_prefixes_fst(&rtxn).unwrap(); + let bytes = words_prefixes_fst.into_fst().as_bytes().to_owned(); + let mut words_prefixes_fst = String::new(); + for byte in bytes { + write!(&mut words_prefixes_fst, "{:x}", byte).unwrap(); + } + snapshot_string!(words_prefixes_fst); + 
} + } + }); +} From ef889ade5df9dd9f1d433c326d93c4feff91624e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 3 Aug 2022 16:24:28 +0200 Subject: [PATCH 1530/1889] Refactor snapshot tests --- milli/Cargo.toml | 1 - milli/src/index.rs | 66 +- milli/src/snapshot_tests.rs | 1075 ++++++++++++----- .../1/field_distribution.snap | 7 + .../field_distribution.snap | 7 + 5 files changed, 827 insertions(+), 329 deletions(-) create mode 100644 milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap create mode 100644 milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 318a2604a..b745d970a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -55,7 +55,6 @@ insta = "1.17.1" maplit = "1.0.2" md5 = "0.7.0" rand = "0.8.5" -regex = "1.6.0" [features] default = [] diff --git a/milli/src/index.rs b/milli/src/index.rs index 43888a177..36e15c181 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1183,13 +1183,12 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; - use maplit::btreemap; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; - use crate::Index; + use crate::{db_snap, Index}; pub(crate) struct TempIndex { pub inner: Index, @@ -1288,17 +1287,30 @@ pub(crate) mod tests { ])) .unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_distribution = index.field_distribution(&rtxn).unwrap(); - assert_eq!( - field_distribution, - btreemap! { - "id".to_string() => 2, - "name".to_string() => 2, - "age".to_string() => 1, - } + db_snap!(index, field_distribution, 1); + + db_snap!(index, word_docids, + @r###" + 1 [0, ] + 2 [1, ] + 20 [1, ] + bob [1, ] + kevin [0, ] + "### ); + db_snap!(index, field_distribution); + + db_snap!(index, field_distribution, + @" + age 1 + id 2 + name 2 + " + ); + + // snapshot_index!(&index, "1", include: "^field_distribution$"); + // we add all the documents a second time. we are supposed to get the same // field_distribution in the end index @@ -1309,16 +1321,12 @@ pub(crate) mod tests { ])) .unwrap(); - let rtxn = index.read_txn().unwrap(); - - let field_distribution = index.field_distribution(&rtxn).unwrap(); - assert_eq!( - field_distribution, - btreemap! { - "id".to_string() => 2, - "name".to_string() => 2, - "age".to_string() => 1, - } + db_snap!(index, field_distribution, + @r###" + age 1 + id 2 + name 2 + "### ); // then we update a document by removing one field and another by adding one field @@ -1329,16 +1337,12 @@ pub(crate) mod tests { ])) .unwrap(); - let rtxn = index.read_txn().unwrap(); - - let field_distribution = index.field_distribution(&rtxn).unwrap(); - assert_eq!( - field_distribution, - btreemap! 
{ - "id".to_string() => 2, - "name".to_string() => 2, - "has_dog".to_string() => 1, - } + db_snap!(index, field_distribution, + @r###" + has_dog 1 + id 2 + name 2 + "### ); } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index aa1d5cf27..6f41ddd5b 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -1,320 +1,801 @@ -use heed::BytesDecode; -use roaring::RoaringBitmap; -use std::path::Path; - use crate::{ heed_codec::facet::{ FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, }, - CboRoaringBitmapCodec, ExternalDocumentsIds, Index, + make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index, }; - -macro_rules! snapshot_index { - ($index:expr, $name:expr) => { - $crate::index::tests::snapshot_index($index, $name, None, None) - }; - ($index:expr, $name:expr, include: $regex:literal) => { - $crate::index::tests::snapshot_index( - $index, - $name, - Some(regex::Regex::new($regex).unwrap()), - None, - ) - }; - ($index:expr, $name:expr, exclude: $regex:literal) => { - $crate::index::tests::snapshot_index( - $index, - $name, - None, - Some(regex::Regex::new($regex).unwrap()), - ) - }; -} +use heed::{types::ByteSlice, BytesDecode}; +use roaring::RoaringBitmap; +use std::path::Path; +use std::{borrow::Cow, fmt::Write}; #[track_caller] -pub fn snapshot_index( - index: &Index, - name: &str, - include: Option, - exclude: Option, -) { - use std::fmt::Write; - - let should_snapshot = |name: &str| -> bool { - include.as_ref().map(|f| f.is_match(name)).unwrap_or(true) - && !exclude.as_ref().map(|f| f.is_match(name)).unwrap_or(false) - }; - +pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings { let mut settings = insta::Settings::clone_current(); settings.set_prepend_module_to_snapshot(false); let path = Path::new(std::panic::Location::caller().file()); let path = path.strip_prefix("milli/src").unwrap(); settings.set_omit_expression(true); - settings.set_snapshot_path(Path::new("snapshots").join(path).join(name)); - let rtxn = index.read_txn().unwrap(); + let test_name = std::thread::current().name().unwrap().rsplit("::").next().unwrap().to_owned(); - let store_whole_snapshot = std::env::var("MILLI_TEST_FULL_SNAPS").unwrap_or("false".to_owned()); - let store_whole_snapshot: bool = store_whole_snapshot.parse().unwrap(); + if let Some(name) = name { + settings.set_snapshot_path(Path::new("snapshots").join(path).join(test_name).join(name)); + } else { + settings.set_snapshot_path(Path::new("snapshots").join(path).join(test_name)); + } - macro_rules! snapshot_db { - ($name:ident, |$vars:pat| $push:block) => { - let name_str = stringify!($name); - if should_snapshot(name_str) { - let iter = index.$name.iter(&rtxn).unwrap(); - let mut snap = String::new(); - for x in iter { - let $vars = x.unwrap(); - snap.push_str($push); - snap.push('\n'); - } - if snap.len() < 512 { - insta::assert_snapshot!(name_str, snap); + settings +} + +#[macro_export] +macro_rules! 
db_snap { + ($index:ident, $db_name:ident, $name:literal) => { + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( + &format!("{}", $name), + )); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($index:ident, $db_name:ident) => { + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($index:ident, $db_name:ident, @$inline:literal) => { + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); } else { - if store_whole_snapshot { - insta::assert_snapshot!(format!("{name_str}.full"), snap); - } - let hash = md5::compute(snap.as_bytes()); - let hash_str = format!("{hash:x}"); - insta::assert_snapshot!(format!("{name_str}.hash"), hash_str); + insta::assert_snapshot!(name, snap); } } + }); + }; + ($index:ident, $db_name:ident, $name:literal, @$inline:literal) => { + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(format!("", $name))); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; +} + +pub fn snap_word_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_exact_word_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, exact_word_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_exact_word_prefix_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, exact_word_prefix_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_docid_word_positions(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| { + &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_pair_proximity_docids, |( + (word1, word2, proximity), + b, + )| { + &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( + (word1, prefix, 
proximity), + b, + )| { + &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_position_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { + &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_field_id_word_count_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, field_id_word_count_docids, |( + (field_id, word_count), + b, + )| { + &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_position_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_position_docids, |( + (word_prefix, position), + b, + )| { + &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_facet_id_f64_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( + (facet_id, level, left, right), + b, + )| { + &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_facet_id_string_docids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let bytes_db = index.facet_id_string_docids.remap_types::(); + let iter = bytes_db.iter(&rtxn).unwrap(); + let mut snap = String::new(); + + for x in iter { + let (key, value) = x.unwrap(); + if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { + let (orig_string, docids) = + FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); + snap.push_str(&format!( + "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", + display_bitmap(&docids) + )); + } else if let Some((field_id, level, left, right)) = + FacetLevelValueU32Codec::bytes_decode(key) + { + snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); + let (bounds, docids) = + FacetStringZeroBoundsValueCodec::::bytes_decode(value) + .unwrap(); + if let Some((left, right)) = bounds { + snap.push_str(&format!("{left:<8} {right:<8} ")); + } + snap.push_str(&display_bitmap(&docids)); + snap.push('\n'); + } else { + panic!(); + } + } + snap +} +pub fn snap_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let documents_ids = index.documents_ids(&rtxn).unwrap(); + let snap = display_bitmap(&documents_ids); + snap +} +pub fn snap_stop_words(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + let snap = format!("{stop_words:?}"); + snap +} +pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); + let soft_deleted_documents_ids = display_bitmap(&soft_deleted_documents_ids); + soft_deleted_documents_ids +} +pub fn snap_field_distributions(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let mut snap = String::new(); + for (field, count) in index.field_distribution(&rtxn).unwrap() { + writeln!(&mut snap, "{field:<16} {count:<6}").unwrap(); + } + snap +} +pub fn snap_fields_ids_map(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let name = fields_ids_map.name(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); + } + snap 
+} +pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); + let snap = display_bitmap(&geo_faceted_documents_ids); + snap +} +pub fn snap_external_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap(); + let mut snap = String::new(); + let soft_bytes = soft.into_fst().as_bytes().to_owned(); + let mut hex_soft = String::new(); + for byte in soft_bytes { + write!(&mut hex_soft, "{:x}", byte).unwrap(); + } + writeln!(&mut snap, "soft: {hex_soft}").unwrap(); + let hard_bytes = hard.into_fst().as_bytes().to_owned(); + let mut hex_hard = String::new(); + for byte in hard_bytes { + write!(&mut hex_hard, "{:x}", byte).unwrap(); + } + writeln!(&mut snap, "hard: {hex_hard}").unwrap(); + snap +} +pub fn snap_number_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let number_faceted_documents_ids = + index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) + .unwrap(); + } + snap +} +pub fn snap_string_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let string_faceted_documents_ids = + index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) + .unwrap(); + } + snap +} +pub fn snap_words_fst(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let words_fst = index.words_fst(&rtxn).unwrap(); + let bytes = words_fst.into_fst().as_bytes().to_owned(); + let mut snap = String::new(); + for byte in bytes { + write!(&mut snap, "{:x}", byte).unwrap(); + } + snap +} +pub fn snap_words_prefixes_fst(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let words_prefixes_fst = index.words_prefixes_fst(&rtxn).unwrap(); + let bytes = words_prefixes_fst.into_fst().as_bytes().to_owned(); + let mut snap = String::new(); + for byte in bytes { + write!(&mut snap, "{:x}", byte).unwrap(); + } + snap +} + +pub fn snap_settings(index: &Index) -> String { + let mut snap = String::new(); + let rtxn = index.read_txn().unwrap(); + + macro_rules! 
write_setting_to_snap { + ($name:ident) => { + let $name = index.$name(&rtxn).unwrap(); + writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); }; } - fn display_bitmap(b: &RoaringBitmap) -> String { - let mut s = String::new(); - s.push('['); - for x in b.into_iter() { - write!(&mut s, "{x}, ").unwrap(); - } - s.push(']'); - s - } + write_setting_to_snap!(primary_key); + write_setting_to_snap!(criteria); + write_setting_to_snap!(displayed_fields); + write_setting_to_snap!(distinct_field); + write_setting_to_snap!(filterable_fields); + write_setting_to_snap!(sortable_fields); + write_setting_to_snap!(synonyms); + write_setting_to_snap!(authorize_typos); + write_setting_to_snap!(min_word_len_one_typo); + write_setting_to_snap!(min_word_len_two_typos); + write_setting_to_snap!(exact_words); + write_setting_to_snap!(exact_attributes); + write_setting_to_snap!(max_values_per_facet); + write_setting_to_snap!(pagination_max_total_hits); + write_setting_to_snap!(searchable_fields); + write_setting_to_snap!(user_defined_searchable_fields); - settings.bind(|| { - snapshot_db!(word_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); - snapshot_db!(exact_word_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); - snapshot_db!(word_prefix_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); - snapshot_db!(exact_word_prefix_docids, |(s, b)| { - &format!("{s:<16} {}", display_bitmap(&b)) - }); - - snapshot_db!(docid_word_positions, |((idx, s), b)| { - &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) - }); - - snapshot_db!(word_pair_proximity_docids, |((word1, word2, proximity), b)| { - &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) - }); - - snapshot_db!(word_prefix_pair_proximity_docids, |((word1, prefix, proximity), b)| { - &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) - }); - - snapshot_db!(word_position_docids, |((word, position), b)| { - &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) - }); - - snapshot_db!(field_id_word_count_docids, |((field_id, word_count), b)| { - &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) - }); - - snapshot_db!(word_prefix_position_docids, |((word_prefix, position), b)| { - &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&b)) - }); - - snapshot_db!(facet_id_f64_docids, |((facet_id, level, left, right), b)| { - &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) - }); - { - let name_str = stringify!(facet_id_string_docids); - if should_snapshot(name_str) { - let bytes_db = index.facet_id_string_docids.remap_types::(); - let iter = bytes_db.iter(&rtxn).unwrap(); - let mut snap = String::new(); - - for x in iter { - let (key, value) = x.unwrap(); - if let Some((field_id, normalized_str)) = - FacetStringLevelZeroCodec::bytes_decode(key) - { - let (orig_string, docids) = - FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); - snap.push_str(&format!( - "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", - display_bitmap(&docids) - )); - } else if let Some((field_id, level, left, right)) = - FacetLevelValueU32Codec::bytes_decode(key) - { - snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); - let (bounds, docids) = FacetStringZeroBoundsValueCodec::< - CboRoaringBitmapCodec, - >::bytes_decode(value) - .unwrap(); - if let Some((left, right)) = bounds { - snap.push_str(&format!("{left:<8} {right:<8} ")); - } - snap.push_str(&display_bitmap(&docids)); - 
snap.push('\n'); - } else { - panic!(); - } - } - insta::assert_snapshot!(name_str, snap); - } - } - - // Main - computed settings - { - let mut snap = String::new(); - - macro_rules! write_setting_to_snap { - ($name:ident) => { - if should_snapshot(&format!("settings.{}", stringify!($name))) { - let $name = index.$name(&rtxn).unwrap(); - writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); - } - }; - } - write_setting_to_snap!(primary_key); - write_setting_to_snap!(criteria); - write_setting_to_snap!(displayed_fields); - write_setting_to_snap!(distinct_field); - write_setting_to_snap!(filterable_fields); - write_setting_to_snap!(sortable_fields); - write_setting_to_snap!(synonyms); - write_setting_to_snap!(authorize_typos); - write_setting_to_snap!(min_word_len_one_typo); - write_setting_to_snap!(min_word_len_two_typos); - write_setting_to_snap!(exact_words); - write_setting_to_snap!(exact_attributes); - write_setting_to_snap!(max_values_per_facet); - write_setting_to_snap!(pagination_max_total_hits); - write_setting_to_snap!(searchable_fields); - write_setting_to_snap!(user_defined_searchable_fields); - - if !snap.is_empty() { - insta::assert_snapshot!("settings", snap); - } - } - // Main - others - { - macro_rules! snapshot_string { - ($name:ident) => { - if should_snapshot(&format!("{}", stringify!($name))) { - insta::assert_snapshot!(stringify!($name), $name); - } - }; - } - { - let documents_ids = index.documents_ids(&rtxn).unwrap(); - let documents_ids = display_bitmap(&documents_ids); - snapshot_string!(documents_ids); - } - { - let stop_words = index.stop_words(&rtxn).unwrap(); - let stop_words = format!("{stop_words:?}"); - snapshot_string!(stop_words); - } - { - let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); - let soft_deleted_documents_ids = display_bitmap(&soft_deleted_documents_ids); - snapshot_string!(soft_deleted_documents_ids); - } - - { - let mut field_distribution = String::new(); - for (field, count) in index.field_distribution(&rtxn).unwrap() { - writeln!(&mut field_distribution, "{field:<16} {count:<6}").unwrap(); - } - snapshot_string!(field_distribution); - } - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - { - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let name = fields_ids_map.name(field_id).unwrap(); - writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); - } - let fields_ids_map = snap; - snapshot_string!(fields_ids_map); - } - - { - let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); - let geo_faceted_documents_ids = display_bitmap(&geo_faceted_documents_ids); - snapshot_string!(geo_faceted_documents_ids); - } - // let geo_rtree = index.geo_rtree(&rtxn).unwrap(); - { - let ExternalDocumentsIds { soft, hard, .. 
} = - index.external_documents_ids(&rtxn).unwrap(); - let mut external_documents_ids = String::new(); - let soft_bytes = soft.into_fst().as_bytes().to_owned(); - let mut hex_soft = String::new(); - for byte in soft_bytes { - write!(&mut hex_soft, "{:x}", byte).unwrap(); - } - writeln!(&mut external_documents_ids, "soft: {hex_soft}").unwrap(); - let hard_bytes = hard.into_fst().as_bytes().to_owned(); - let mut hex_hard = String::new(); - for byte in hard_bytes { - write!(&mut hex_hard, "{:x}", byte).unwrap(); - } - writeln!(&mut external_documents_ids, "hard: {hex_hard}").unwrap(); - - snapshot_string!(external_documents_ids); - } - { - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let number_faceted_documents_ids = - index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); - writeln!( - &mut snap, - "{field_id:<3} {}", - display_bitmap(&number_faceted_documents_ids) - ) - .unwrap(); - } - let number_faceted_documents_ids = snap; - snapshot_string!(number_faceted_documents_ids); - } - { - let mut snap = String::new(); - for field_id in fields_ids_map.ids() { - let string_faceted_documents_ids = - index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); - writeln!( - &mut snap, - "{field_id:<3} {}", - display_bitmap(&string_faceted_documents_ids) - ) - .unwrap(); - } - let string_faceted_documents_ids = snap; - snapshot_string!(string_faceted_documents_ids); - } - { - let words_fst = index.words_fst(&rtxn).unwrap(); - let bytes = words_fst.into_fst().as_bytes().to_owned(); - let mut words_fst = String::new(); - for byte in bytes { - write!(&mut words_fst, "{:x}", byte).unwrap(); - } - snapshot_string!(words_fst); - } - { - let words_prefixes_fst = index.words_prefixes_fst(&rtxn).unwrap(); - let bytes = words_prefixes_fst.into_fst().as_bytes().to_owned(); - let mut words_prefixes_fst = String::new(); - for byte in bytes { - write!(&mut words_prefixes_fst, "{:x}", byte).unwrap(); - } - snapshot_string!(words_prefixes_fst); - } - } - }); + snap } + +#[macro_export] +macro_rules! 
full_snap_of_db { + ($index:ident, settings) => {{ + $crate::snapshot_tests::snap_settings(&$index) + }}; + ($index:ident, word_docids) => {{ + $crate::snapshot_tests::snap_word_docids(&$index) + }}; + ($index:ident, exact_word_docids) => {{ + $crate::snapshot_tests::snap_exact_word_docids(&$index) + }}; + ($index:ident, word_prefix_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_docids(&$index) + }}; + ($index:ident, exact_word_prefix_docids) => {{ + $crate::snapshot_tests::snap_exact_word_prefix_docids(&$index) + }}; + ($index:ident, docid_word_positions) => {{ + $crate::snapshot_tests::snap_docid_word_positions(&$index) + }}; + ($index:ident, word_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_word_pair_proximity_docids(&$index) + }}; + ($index:ident, word_prefix_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index) + }}; + ($index:ident, word_position_docids) => {{ + $crate::snapshot_tests::snap_word_position_docids(&$index) + }}; + ($index:ident, field_id_word_count_docids) => {{ + $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) + }}; + ($index:ident, word_prefix_position_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_position_docids(&$index) + }}; + ($index:ident, facet_id_f64_docids) => {{ + $crate::snapshot_tests::snap_facet_id_f64_docids(&$index) + }}; + ($index:ident, facet_id_string_docids) => {{ + $crate::snapshot_tests::snap_facet_id_string_docids(&$index) + }}; + ($index:ident, documents_ids) => {{ + $crate::snapshot_tests::snap_documents_ids(&$index) + }}; + ($index:ident, stop_words) => {{ + $crate::snapshot_tests::snap_stop_words(&$index) + }}; + ($index:ident, soft_deleted_documents_ids) => {{ + $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index) + }}; + ($index:ident, field_distribution) => {{ + $crate::snapshot_tests::snap_field_distributions(&$index) + }}; + ($index:ident, fields_ids_map) => {{ + $crate::snapshot_tests::snap_fields_ids_map(&$index) + }}; + ($index:ident, geo_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index) + }}; + ($index:ident, external_documents_ids) => {{ + $crate::snapshot_tests::snap_external_documents_ids(&$index) + }}; + ($index:ident, number_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index) + }}; + ($index:ident, string_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index) + }}; + ($index:ident, words_fst) => {{ + $crate::snapshot_tests::snap_words_fst(&$index) + }}; + ($index:ident, words_prefixes_fst) => {{ + $crate::snapshot_tests::snap_words_prefixes_fst(&$index) + }}; +} + +pub fn convert_snap_to_hash_if_needed<'snap>( + name: &str, + snap: &'snap str, + inline: bool, +) -> Vec<(String, Cow<'snap, str>)> { + let store_whole_snapshot = std::env::var("MILLI_TEST_FULL_SNAPS").unwrap_or("false".to_owned()); + let store_whole_snapshot: bool = store_whole_snapshot.parse().unwrap(); + + let max_len = if inline { 256 } else { 2048 }; + + if snap.len() < max_len { + vec![(name.to_owned(), Cow::Borrowed(snap))] + } else { + let mut r = vec![]; + if store_whole_snapshot { + r.push((format!("{name}.full"), Cow::Borrowed(snap))); + } + let hash = md5::compute(snap.as_bytes()); + let hash_str = format!("{hash:x}"); + r.push((format!("{name}.hash"), Cow::Owned(hash_str))); + r + } +} + +#[macro_export] +macro_rules! 
make_db_snap_from_iter { + ($index:ident, $name:ident, |$vars:pat| $push:block) => {{ + let rtxn = $index.read_txn().unwrap(); + let iter = $index.$name.iter(&rtxn).unwrap(); + let mut snap = String::new(); + for x in iter { + let $vars = x.unwrap(); + snap.push_str($push); + snap.push('\n'); + } + snap + }}; +} + +pub fn display_bitmap(b: &RoaringBitmap) -> String { + let mut s = String::new(); + s.push('['); + for x in b.into_iter() { + write!(&mut s, "{x}, ").unwrap(); + } + s.push(']'); + s +} + +// #[macro_export] +// macro_rules! snapshot_index { +// ($index:expr, $name:expr) => { +// $crate::snapshot_tests::snapshot_index($index, $name, None, None) +// }; +// ($index:expr, $name:expr, include: $regex:literal) => { +// $crate::snapshot_tests::snapshot_index( +// $index, +// $name, +// Some(regex::Regex::new($regex).unwrap()), +// None, +// ) +// }; +// ($index:expr, $name:expr, exclude: $regex:literal) => { +// $crate::snapshot_tests::snapshot_index( +// $index, +// $name, +// None, +// Some(regex::Regex::new($regex).unwrap()), +// ) +// }; +// } + +// pub fn snap_of_db_settings(index: &Index, include: Option) -> String { +// let should_snapshot = +// |name: &str| -> bool { include.as_ref().map(|f| f.is_match(name)).unwrap_or(true) }; + +// let rtxn = index.read_txn().unwrap(); + +// let mut snap = String::new(); + +// macro_rules! write_setting_to_snap { +// ($name:ident) => { +// if should_snapshot(&format!("settings.{}", stringify!($name))) { +// let $name = index.$name(&rtxn).unwrap(); +// writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); +// } +// }; +// } +// write_setting_to_snap!(primary_key); +// write_setting_to_snap!(criteria); +// write_setting_to_snap!(displayed_fields); +// write_setting_to_snap!(distinct_field); +// write_setting_to_snap!(filterable_fields); +// write_setting_to_snap!(sortable_fields); +// write_setting_to_snap!(synonyms); +// write_setting_to_snap!(authorize_typos); +// write_setting_to_snap!(min_word_len_one_typo); +// write_setting_to_snap!(min_word_len_two_typos); +// write_setting_to_snap!(exact_words); +// write_setting_to_snap!(exact_attributes); +// write_setting_to_snap!(max_values_per_facet); +// write_setting_to_snap!(pagination_max_total_hits); +// write_setting_to_snap!(searchable_fields); +// write_setting_to_snap!(user_defined_searchable_fields); + +// snap +// } + +// #[track_caller] +// pub fn snapshot_index( +// index: &Index, +// name: &str, +// include: Option, +// exclude: Option, +// ) { +// let should_snapshot = |name: &str| -> bool { +// include.as_ref().map(|f| f.is_match(name)).unwrap_or(true) +// && !exclude.as_ref().map(|f| f.is_match(name)).unwrap_or(false) +// }; +// let settings = default_db_snapshot_settings_for_test(Some(name)); +// let rtxn = index.read_txn().unwrap(); + +// let snapshot_hash = |name: &str, snap: &str| { +// let store_whole_snapshot = +// std::env::var("MILLI_TEST_FULL_SNAPS").unwrap_or("false".to_owned()); +// let store_whole_snapshot: bool = store_whole_snapshot.parse().unwrap(); +// if snap.len() < 512 { +// insta::assert_snapshot!(name, snap); +// } else { +// if store_whole_snapshot { +// insta::assert_snapshot!(format!("{name}.full"), snap); +// } +// let hash = md5::compute(snap.as_bytes()); +// let hash_str = format!("{hash:x}"); +// insta::assert_snapshot!(format!("{name}.hash"), hash_str); +// } +// }; + +// macro_rules! 
snapshot_db { +// ($name:ident, |$vars:pat| $push:block) => { +// let name_str = stringify!($name); +// if should_snapshot(name_str) { +// let iter = index.$name.iter(&rtxn).unwrap(); +// let mut snap = String::new(); +// for x in iter { +// let $vars = x.unwrap(); +// snap.push_str($push); +// snap.push('\n'); +// } +// snapshot_hash(name_str, &snap); +// } +// }; +// } + +// fn display_bitmap(b: &RoaringBitmap) -> String { +// let mut s = String::new(); +// s.push('['); +// for x in b.into_iter() { +// write!(&mut s, "{x}, ").unwrap(); +// } +// s.push(']'); +// s +// } + +// settings.bind(|| { +// snapshot_db!(word_docids, |(s, b)| { &format!("{s:<16} {}", $crate::snapshot_tests::display_bitmap(&b)) }); +// snapshot_db!(exact_word_docids, |(s, b)| { &format!("{s:<16} {}", $crate::snapshot_tests::display_bitmap(&b)) }); +// snapshot_db!(word_prefix_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); +// snapshot_db!(exact_word_prefix_docids, |(s, b)| { +// &format!("{s:<16} {}", display_bitmap(&b)) +// }); + +// snapshot_db!(docid_word_positions, |((idx, s), b)| { +// &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) +// }); + +// snapshot_db!(word_pair_proximity_docids, |((word1, word2, proximity), b)| { +// &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) +// }); + +// snapshot_db!(word_prefix_pair_proximity_docids, |((word1, prefix, proximity), b)| { +// &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) +// }); + +// snapshot_db!(word_position_docids, |((word, position), b)| { +// &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) +// }); + +// snapshot_db!(field_id_word_count_docids, |((field_id, word_count), b)| { +// &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) +// }); + +// snapshot_db!(word_prefix_position_docids, |((word_prefix, position), b)| { +// &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&b)) +// }); + +// snapshot_db!(facet_id_f64_docids, |((facet_id, level, left, right), b)| { +// &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) +// }); +// { +// let name_str = stringify!(facet_id_string_docids); +// if should_snapshot(name_str) { +// let bytes_db = index.facet_id_string_docids.remap_types::(); +// let iter = bytes_db.iter(&rtxn).unwrap(); +// let mut snap = String::new(); + +// for x in iter { +// let (key, value) = x.unwrap(); +// if let Some((field_id, normalized_str)) = +// FacetStringLevelZeroCodec::bytes_decode(key) +// { +// let (orig_string, docids) = +// FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); +// snap.push_str(&format!( +// "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", +// display_bitmap(&docids) +// )); +// } else if let Some((field_id, level, left, right)) = +// FacetLevelValueU32Codec::bytes_decode(key) +// { +// snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); +// let (bounds, docids) = FacetStringZeroBoundsValueCodec::< +// CboRoaringBitmapCodec, +// >::bytes_decode(value) +// .unwrap(); +// if let Some((left, right)) = bounds { +// snap.push_str(&format!("{left:<8} {right:<8} ")); +// } +// snap.push_str(&display_bitmap(&docids)); +// snap.push('\n'); +// } else { +// panic!(); +// } +// } +// snapshot_hash(name_str, &snap); +// } +// } + +// // Main - computed settings +// { +// let mut snap = String::new(); + +// macro_rules! 
write_setting_to_snap { +// ($name:ident) => { +// if should_snapshot(&format!("settings.{}", stringify!($name))) { +// let $name = index.$name(&rtxn).unwrap(); +// writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); +// } +// }; +// } +// write_setting_to_snap!(primary_key); +// write_setting_to_snap!(criteria); +// write_setting_to_snap!(displayed_fields); +// write_setting_to_snap!(distinct_field); +// write_setting_to_snap!(filterable_fields); +// write_setting_to_snap!(sortable_fields); +// write_setting_to_snap!(synonyms); +// write_setting_to_snap!(authorize_typos); +// write_setting_to_snap!(min_word_len_one_typo); +// write_setting_to_snap!(min_word_len_two_typos); +// write_setting_to_snap!(exact_words); +// write_setting_to_snap!(exact_attributes); +// write_setting_to_snap!(max_values_per_facet); +// write_setting_to_snap!(pagination_max_total_hits); +// write_setting_to_snap!(searchable_fields); +// write_setting_to_snap!(user_defined_searchable_fields); + +// if !snap.is_empty() { +// insta::assert_snapshot!("settings", snap); +// } +// } +// // Main - others +// { +// macro_rules! snapshot_string { +// ($name:ident) => { +// if should_snapshot(&format!("{}", stringify!($name))) { +// insta::assert_snapshot!(stringify!($name), $name); +// } +// }; +// } +// { +// let documents_ids = index.documents_ids(&rtxn).unwrap(); +// let documents_ids = display_bitmap(&documents_ids); +// snapshot_string!(documents_ids); +// } +// { +// let stop_words = index.stop_words(&rtxn).unwrap(); +// let stop_words = format!("{stop_words:?}"); +// snapshot_string!(stop_words); +// } +// { +// let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); +// let soft_deleted_documents_ids = display_bitmap(&soft_deleted_documents_ids); +// snapshot_string!(soft_deleted_documents_ids); +// } + +// { +// let mut field_distribution = String::new(); +// for (field, count) in index.field_distribution(&rtxn).unwrap() { +// writeln!(&mut field_distribution, "{field:<16} {count:<6}").unwrap(); +// } +// snapshot_string!(field_distribution); +// } +// let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); +// { +// let mut snap = String::new(); +// for field_id in fields_ids_map.ids() { +// let name = fields_ids_map.name(field_id).unwrap(); +// writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); +// } +// let fields_ids_map = snap; +// snapshot_string!(fields_ids_map); +// } + +// { +// let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); +// let geo_faceted_documents_ids = display_bitmap(&geo_faceted_documents_ids); +// snapshot_string!(geo_faceted_documents_ids); +// } +// // let geo_rtree = index.geo_rtree(&rtxn).unwrap(); +// { +// let ExternalDocumentsIds { soft, hard, .. 
} = +// index.external_documents_ids(&rtxn).unwrap(); +// let mut external_documents_ids = String::new(); +// let soft_bytes = soft.into_fst().as_bytes().to_owned(); +// let mut hex_soft = String::new(); +// for byte in soft_bytes { +// write!(&mut hex_soft, "{:x}", byte).unwrap(); +// } +// writeln!(&mut external_documents_ids, "soft: {hex_soft}").unwrap(); +// let hard_bytes = hard.into_fst().as_bytes().to_owned(); +// let mut hex_hard = String::new(); +// for byte in hard_bytes { +// write!(&mut hex_hard, "{:x}", byte).unwrap(); +// } +// writeln!(&mut external_documents_ids, "hard: {hex_hard}").unwrap(); + +// snapshot_string!(external_documents_ids); +// } +// { +// let mut snap = String::new(); +// for field_id in fields_ids_map.ids() { +// let number_faceted_documents_ids = +// index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); +// writeln!( +// &mut snap, +// "{field_id:<3} {}", +// display_bitmap(&number_faceted_documents_ids) +// ) +// .unwrap(); +// } +// let number_faceted_documents_ids = snap; +// snapshot_string!(number_faceted_documents_ids); +// } +// { +// let mut snap = String::new(); +// for field_id in fields_ids_map.ids() { +// let string_faceted_documents_ids = +// index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); +// writeln!( +// &mut snap, +// "{field_id:<3} {}", +// display_bitmap(&string_faceted_documents_ids) +// ) +// .unwrap(); +// } +// let string_faceted_documents_ids = snap; +// snapshot_string!(string_faceted_documents_ids); +// } +// { +// let words_fst = index.words_fst(&rtxn).unwrap(); +// let bytes = words_fst.into_fst().as_bytes().to_owned(); +// let mut words_fst = String::new(); +// for byte in bytes { +// write!(&mut words_fst, "{:x}", byte).unwrap(); +// } +// snapshot_string!(words_fst); +// } +// { +// let words_prefixes_fst = index.words_prefixes_fst(&rtxn).unwrap(); +// let bytes = words_prefixes_fst.into_fst().as_bytes().to_owned(); +// let mut words_prefixes_fst = String::new(); +// for byte in bytes { +// write!(&mut words_prefixes_fst, "{:x}", byte).unwrap(); +// } +// snapshot_string!(words_prefixes_fst); +// } +// } +// }); +// } diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap new file mode 100644 index 000000000..9b074fb59 --- /dev/null +++ b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/index.rs +--- +age 1 +id 2 +name 2 + diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap new file mode 100644 index 000000000..9b074fb59 --- /dev/null +++ b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/index.rs +--- +age 1 +id 2 +name 2 + From b9907997e4e17e3675f970bcccf3fb267f9edc8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 3 Aug 2022 16:25:33 +0200 Subject: [PATCH 1531/1889] Remove old snapshot tests code --- milli/src/snapshot_tests.rs | 342 ------------------------------------ 1 file changed, 342 deletions(-) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 6f41ddd5b..77eeeb159 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -457,345 +457,3 @@ pub fn display_bitmap(b: &RoaringBitmap) -> String { s.push(']'); s } - -// #[macro_export] -// 
macro_rules! snapshot_index { -// ($index:expr, $name:expr) => { -// $crate::snapshot_tests::snapshot_index($index, $name, None, None) -// }; -// ($index:expr, $name:expr, include: $regex:literal) => { -// $crate::snapshot_tests::snapshot_index( -// $index, -// $name, -// Some(regex::Regex::new($regex).unwrap()), -// None, -// ) -// }; -// ($index:expr, $name:expr, exclude: $regex:literal) => { -// $crate::snapshot_tests::snapshot_index( -// $index, -// $name, -// None, -// Some(regex::Regex::new($regex).unwrap()), -// ) -// }; -// } - -// pub fn snap_of_db_settings(index: &Index, include: Option) -> String { -// let should_snapshot = -// |name: &str| -> bool { include.as_ref().map(|f| f.is_match(name)).unwrap_or(true) }; - -// let rtxn = index.read_txn().unwrap(); - -// let mut snap = String::new(); - -// macro_rules! write_setting_to_snap { -// ($name:ident) => { -// if should_snapshot(&format!("settings.{}", stringify!($name))) { -// let $name = index.$name(&rtxn).unwrap(); -// writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); -// } -// }; -// } -// write_setting_to_snap!(primary_key); -// write_setting_to_snap!(criteria); -// write_setting_to_snap!(displayed_fields); -// write_setting_to_snap!(distinct_field); -// write_setting_to_snap!(filterable_fields); -// write_setting_to_snap!(sortable_fields); -// write_setting_to_snap!(synonyms); -// write_setting_to_snap!(authorize_typos); -// write_setting_to_snap!(min_word_len_one_typo); -// write_setting_to_snap!(min_word_len_two_typos); -// write_setting_to_snap!(exact_words); -// write_setting_to_snap!(exact_attributes); -// write_setting_to_snap!(max_values_per_facet); -// write_setting_to_snap!(pagination_max_total_hits); -// write_setting_to_snap!(searchable_fields); -// write_setting_to_snap!(user_defined_searchable_fields); - -// snap -// } - -// #[track_caller] -// pub fn snapshot_index( -// index: &Index, -// name: &str, -// include: Option, -// exclude: Option, -// ) { -// let should_snapshot = |name: &str| -> bool { -// include.as_ref().map(|f| f.is_match(name)).unwrap_or(true) -// && !exclude.as_ref().map(|f| f.is_match(name)).unwrap_or(false) -// }; -// let settings = default_db_snapshot_settings_for_test(Some(name)); -// let rtxn = index.read_txn().unwrap(); - -// let snapshot_hash = |name: &str, snap: &str| { -// let store_whole_snapshot = -// std::env::var("MILLI_TEST_FULL_SNAPS").unwrap_or("false".to_owned()); -// let store_whole_snapshot: bool = store_whole_snapshot.parse().unwrap(); -// if snap.len() < 512 { -// insta::assert_snapshot!(name, snap); -// } else { -// if store_whole_snapshot { -// insta::assert_snapshot!(format!("{name}.full"), snap); -// } -// let hash = md5::compute(snap.as_bytes()); -// let hash_str = format!("{hash:x}"); -// insta::assert_snapshot!(format!("{name}.hash"), hash_str); -// } -// }; - -// macro_rules! 
snapshot_db { -// ($name:ident, |$vars:pat| $push:block) => { -// let name_str = stringify!($name); -// if should_snapshot(name_str) { -// let iter = index.$name.iter(&rtxn).unwrap(); -// let mut snap = String::new(); -// for x in iter { -// let $vars = x.unwrap(); -// snap.push_str($push); -// snap.push('\n'); -// } -// snapshot_hash(name_str, &snap); -// } -// }; -// } - -// fn display_bitmap(b: &RoaringBitmap) -> String { -// let mut s = String::new(); -// s.push('['); -// for x in b.into_iter() { -// write!(&mut s, "{x}, ").unwrap(); -// } -// s.push(']'); -// s -// } - -// settings.bind(|| { -// snapshot_db!(word_docids, |(s, b)| { &format!("{s:<16} {}", $crate::snapshot_tests::display_bitmap(&b)) }); -// snapshot_db!(exact_word_docids, |(s, b)| { &format!("{s:<16} {}", $crate::snapshot_tests::display_bitmap(&b)) }); -// snapshot_db!(word_prefix_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); -// snapshot_db!(exact_word_prefix_docids, |(s, b)| { -// &format!("{s:<16} {}", display_bitmap(&b)) -// }); - -// snapshot_db!(docid_word_positions, |((idx, s), b)| { -// &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) -// }); - -// snapshot_db!(word_pair_proximity_docids, |((word1, word2, proximity), b)| { -// &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) -// }); - -// snapshot_db!(word_prefix_pair_proximity_docids, |((word1, prefix, proximity), b)| { -// &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) -// }); - -// snapshot_db!(word_position_docids, |((word, position), b)| { -// &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) -// }); - -// snapshot_db!(field_id_word_count_docids, |((field_id, word_count), b)| { -// &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) -// }); - -// snapshot_db!(word_prefix_position_docids, |((word_prefix, position), b)| { -// &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&b)) -// }); - -// snapshot_db!(facet_id_f64_docids, |((facet_id, level, left, right), b)| { -// &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) -// }); -// { -// let name_str = stringify!(facet_id_string_docids); -// if should_snapshot(name_str) { -// let bytes_db = index.facet_id_string_docids.remap_types::(); -// let iter = bytes_db.iter(&rtxn).unwrap(); -// let mut snap = String::new(); - -// for x in iter { -// let (key, value) = x.unwrap(); -// if let Some((field_id, normalized_str)) = -// FacetStringLevelZeroCodec::bytes_decode(key) -// { -// let (orig_string, docids) = -// FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); -// snap.push_str(&format!( -// "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", -// display_bitmap(&docids) -// )); -// } else if let Some((field_id, level, left, right)) = -// FacetLevelValueU32Codec::bytes_decode(key) -// { -// snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); -// let (bounds, docids) = FacetStringZeroBoundsValueCodec::< -// CboRoaringBitmapCodec, -// >::bytes_decode(value) -// .unwrap(); -// if let Some((left, right)) = bounds { -// snap.push_str(&format!("{left:<8} {right:<8} ")); -// } -// snap.push_str(&display_bitmap(&docids)); -// snap.push('\n'); -// } else { -// panic!(); -// } -// } -// snapshot_hash(name_str, &snap); -// } -// } - -// // Main - computed settings -// { -// let mut snap = String::new(); - -// macro_rules! 
write_setting_to_snap { -// ($name:ident) => { -// if should_snapshot(&format!("settings.{}", stringify!($name))) { -// let $name = index.$name(&rtxn).unwrap(); -// writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); -// } -// }; -// } -// write_setting_to_snap!(primary_key); -// write_setting_to_snap!(criteria); -// write_setting_to_snap!(displayed_fields); -// write_setting_to_snap!(distinct_field); -// write_setting_to_snap!(filterable_fields); -// write_setting_to_snap!(sortable_fields); -// write_setting_to_snap!(synonyms); -// write_setting_to_snap!(authorize_typos); -// write_setting_to_snap!(min_word_len_one_typo); -// write_setting_to_snap!(min_word_len_two_typos); -// write_setting_to_snap!(exact_words); -// write_setting_to_snap!(exact_attributes); -// write_setting_to_snap!(max_values_per_facet); -// write_setting_to_snap!(pagination_max_total_hits); -// write_setting_to_snap!(searchable_fields); -// write_setting_to_snap!(user_defined_searchable_fields); - -// if !snap.is_empty() { -// insta::assert_snapshot!("settings", snap); -// } -// } -// // Main - others -// { -// macro_rules! snapshot_string { -// ($name:ident) => { -// if should_snapshot(&format!("{}", stringify!($name))) { -// insta::assert_snapshot!(stringify!($name), $name); -// } -// }; -// } -// { -// let documents_ids = index.documents_ids(&rtxn).unwrap(); -// let documents_ids = display_bitmap(&documents_ids); -// snapshot_string!(documents_ids); -// } -// { -// let stop_words = index.stop_words(&rtxn).unwrap(); -// let stop_words = format!("{stop_words:?}"); -// snapshot_string!(stop_words); -// } -// { -// let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); -// let soft_deleted_documents_ids = display_bitmap(&soft_deleted_documents_ids); -// snapshot_string!(soft_deleted_documents_ids); -// } - -// { -// let mut field_distribution = String::new(); -// for (field, count) in index.field_distribution(&rtxn).unwrap() { -// writeln!(&mut field_distribution, "{field:<16} {count:<6}").unwrap(); -// } -// snapshot_string!(field_distribution); -// } -// let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); -// { -// let mut snap = String::new(); -// for field_id in fields_ids_map.ids() { -// let name = fields_ids_map.name(field_id).unwrap(); -// writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); -// } -// let fields_ids_map = snap; -// snapshot_string!(fields_ids_map); -// } - -// { -// let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); -// let geo_faceted_documents_ids = display_bitmap(&geo_faceted_documents_ids); -// snapshot_string!(geo_faceted_documents_ids); -// } -// // let geo_rtree = index.geo_rtree(&rtxn).unwrap(); -// { -// let ExternalDocumentsIds { soft, hard, .. 
} = -// index.external_documents_ids(&rtxn).unwrap(); -// let mut external_documents_ids = String::new(); -// let soft_bytes = soft.into_fst().as_bytes().to_owned(); -// let mut hex_soft = String::new(); -// for byte in soft_bytes { -// write!(&mut hex_soft, "{:x}", byte).unwrap(); -// } -// writeln!(&mut external_documents_ids, "soft: {hex_soft}").unwrap(); -// let hard_bytes = hard.into_fst().as_bytes().to_owned(); -// let mut hex_hard = String::new(); -// for byte in hard_bytes { -// write!(&mut hex_hard, "{:x}", byte).unwrap(); -// } -// writeln!(&mut external_documents_ids, "hard: {hex_hard}").unwrap(); - -// snapshot_string!(external_documents_ids); -// } -// { -// let mut snap = String::new(); -// for field_id in fields_ids_map.ids() { -// let number_faceted_documents_ids = -// index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); -// writeln!( -// &mut snap, -// "{field_id:<3} {}", -// display_bitmap(&number_faceted_documents_ids) -// ) -// .unwrap(); -// } -// let number_faceted_documents_ids = snap; -// snapshot_string!(number_faceted_documents_ids); -// } -// { -// let mut snap = String::new(); -// for field_id in fields_ids_map.ids() { -// let string_faceted_documents_ids = -// index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); -// writeln!( -// &mut snap, -// "{field_id:<3} {}", -// display_bitmap(&string_faceted_documents_ids) -// ) -// .unwrap(); -// } -// let string_faceted_documents_ids = snap; -// snapshot_string!(string_faceted_documents_ids); -// } -// { -// let words_fst = index.words_fst(&rtxn).unwrap(); -// let bytes = words_fst.into_fst().as_bytes().to_owned(); -// let mut words_fst = String::new(); -// for byte in bytes { -// write!(&mut words_fst, "{:x}", byte).unwrap(); -// } -// snapshot_string!(words_fst); -// } -// { -// let words_prefixes_fst = index.words_prefixes_fst(&rtxn).unwrap(); -// let bytes = words_prefixes_fst.into_fst().as_bytes().to_owned(); -// let mut words_prefixes_fst = String::new(); -// for byte in bytes { -// write!(&mut words_prefixes_fst, "{:x}", byte).unwrap(); -// } -// snapshot_string!(words_prefixes_fst); -// } -// } -// }); -// } From 3a734af159a3e01289ece0ff0abd253112e28ac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 3 Aug 2022 16:33:36 +0200 Subject: [PATCH 1532/1889] Add snapshot tests for Facets::execute --- .gitignore | 7 ++ milli/src/documents/mod.rs | 11 +++ milli/src/snapshot_tests.rs | 2 +- milli/src/update/facets.rs | 88 +++++++++++++++++++ .../default/facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../default/facet_id_string_docids.hash.snap | 4 + .../facet_id_string_docids.hash.snap | 4 + 12 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 milli/src/update/snapshots/update/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap create mode 100644 
milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap
 create mode 100644 milli/src/update/snapshots/update/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap
 create mode 100644 milli/src/update/snapshots/update/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap
 create mode 100644 milli/src/update/snapshots/update/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap

diff --git a/.gitignore b/.gitignore
index 107b5bb36..02c4fcd79 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,10 @@
 *.csv
 *.mmdb
 *.svg
+
+# Snapshots
+## ... large
+*.full.snap
+
+## ... unreviewed
+*.snap.new
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index c1580309a..5c83991c2 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -190,6 +190,17 @@ pub fn documents_batch_reader_from_objects(
     DocumentsBatchReader::from_reader(std::io::Cursor::new(builder.into_inner().unwrap())).unwrap()
 }
 
+#[cfg(test)]
+pub fn batch_reader_from_documents(
+    documents: &[Object],
+) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {
+    let mut builder = DocumentsBatchBuilder::new(Vec::new());
+    for object in documents {
+        builder.append_json_object(&object).unwrap();
+    }
+    DocumentsBatchReader::from_reader(std::io::Cursor::new(builder.into_inner().unwrap())).unwrap()
+}
+
 #[cfg(test)]
 mod test {
     use std::io::Cursor;
diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index 77eeeb159..a881a155e 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -30,7 +30,7 @@ pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Setti
 
 #[macro_export]
 macro_rules! db_snap {
-    ($index:ident, $db_name:ident, $name:literal) => {
+    ($index:ident, $db_name:ident, $name:expr) => {
         let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(
             &format!("{}", $name),
         ));
diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs
index 5892123eb..981fa819c 100644
--- a/milli/src/update/facets.rs
+++ b/milli/src/update/facets.rs
@@ -342,3 +342,91 @@ fn write_string_entry(
     writer.insert(&key, &data)?;
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use std::num::NonZeroUsize;
+
+    use crate::{db_snap, documents::batch_reader_from_documents, index::tests::TempIndex};
+
+    #[test]
+    fn test_facets_number() {
+        let test =
+            |name: &str, group_size: Option<NonZeroUsize>, min_level_size: Option<NonZeroUsize>| {
+                let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB
+                index.index_documents_config.autogenerate_docids = true;
+                index.index_documents_config.facet_level_group_size = group_size;
+                index.index_documents_config.facet_min_level_size = min_level_size;
+
+                index
+                    .update_settings(|settings| {
+                        settings.set_filterable_fields(
+                            IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()])
+                                .collect(),
+                        );
+                    })
+                    .unwrap();
+
+                let mut documents = vec![];
+                for i in 0..1_000 {
+                    documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone());
+                }
+                for i in 0..100 {
+                    documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone());
+                }
+                let documents = batch_reader_from_documents(&documents);
+
+                index.add_documents(documents).unwrap();
+
+                db_snap!(index, facet_id_f64_docids, name);
+            };
+
+        test("default", None, None);
+        test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1));
+        test("small_groups_small_levels", NonZeroUsize::new(2), NonZeroUsize::new(2));
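+        // Each `test(...)` call re-indexes the same documents under a different
+        // `facet_level_group_size` / `facet_min_level_size` pair and records the
+        // resulting `facet_id_f64_docids` database through `db_snap!` under the
+        // given name, so every bucketing configuration gets its own reviewable
+        // snapshot instead of sharing a single assertion.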
test("small_groups_large_levels", NonZeroUsize::new(2), NonZeroUsize::new(128)); + test("large_groups_small_levels", NonZeroUsize::new(16), NonZeroUsize::new(2)); + test("large_groups_large_levels", NonZeroUsize::new(16), NonZeroUsize::new(256)); + } + + #[test] + fn test_facets_string() { + let test = |name: &str, + group_size: Option, + min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + + index + .update_settings(|settings| { + settings.set_filterable_fields( + IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) + .collect(), + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..100 { + documents.push( + serde_json::json!({ "facet": format!("s{i:X}") }).as_object().unwrap().clone(), + ); + } + for i in 0..10 { + documents.push( + serde_json::json!({ "facet2": format!("s{i:X}") }).as_object().unwrap().clone(), + ); + } + let documents = batch_reader_from_documents(&documents); + + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_string_docids, name); + }; + + test("default", None, None); + test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + } +} diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/update/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..373455db6 --- /dev/null +++ b/milli/src/update/snapshots/update/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facets.rs +--- +587899707db2848da3f18399e14ed4d0 diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..c3415c320 --- /dev/null +++ b/milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facets.rs +--- +02bbf2ca1663cccea0e4c06d5ad06a45 diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..78dad29f1 --- /dev/null +++ b/milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facets.rs +--- +e68ea591e1af3e53e544dff9a1648e88 diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..61a5908f4 --- /dev/null +++ b/milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facets.rs +--- +12a4bb0f5b95d7629c2b9a915150c0cf diff --git 
a/milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..961346de5 --- /dev/null +++ b/milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facets.rs +--- +6438e94bc7fada13022e0efccdf294e0 diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/update/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..2b7c1ef9c --- /dev/null +++ b/milli/src/update/snapshots/update/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facets.rs +--- +5348bbc46b5384455b6a900666d2a502 diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/update/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..901b86255 --- /dev/null +++ b/milli/src/update/snapshots/update/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facets.rs +--- +faddef9eae5f2efacfec51f20f2e8cd6 diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/update/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..aa6c85461 --- /dev/null +++ b/milli/src/update/snapshots/update/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facets.rs +--- +ddb8fc987c5dc892337682595043858e From 606625668977b7620fd5b1d47ae0b8f0562d3183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 3 Aug 2022 16:49:03 +0200 Subject: [PATCH 1533/1889] Add snapshot tests for indexing of word_prefix_pair_proximity_docids --- milli/src/documents/mod.rs | 11 --- milli/src/update/facets.rs | 8 +- .../word_prefix_pair_proximity_docids.snap | 46 ++++++++++ .../word_prefix_pair_proximity_docids.snap | 56 ++++++++++++ .../word_prefix_pair_proximity_docids.rs | 87 +++++++++++++++++++ 5 files changed, 195 insertions(+), 13 deletions(-) create mode 100644 milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 5c83991c2..c1580309a 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -190,17 +190,6 @@ pub fn documents_batch_reader_from_objects( DocumentsBatchReader::from_reader(std::io::Cursor::new(builder.into_inner().unwrap())).unwrap() } -#[cfg(test)] -pub fn batch_reader_from_documents( - documents: &[Object], -) -> DocumentsBatchReader>> { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - 
DocumentsBatchReader::from_reader(std::io::Cursor::new(builder.into_inner().unwrap())).unwrap() -} - #[cfg(test)] mod test { use std::io::Cursor; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 981fa819c..904f165b1 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -347,7 +347,11 @@ fn write_string_entry( mod tests { use std::num::NonZeroUsize; - use crate::{db_snap, documents::batch_reader_from_documents, index::tests::TempIndex}; + use crate::{ + db_snap, + documents::{batch_reader_from_documents, documents_batch_reader_from_objects}, + index::tests::TempIndex, + }; #[test] fn test_facets_number() { @@ -419,7 +423,7 @@ mod tests { serde_json::json!({ "facet2": format!("s{i:X}") }).as_object().unwrap().clone(), ); } - let documents = batch_reader_from_documents(&documents); + let documents = documents_batch_reader_from_objects(documents); index.add_documents(documents).unwrap(); diff --git a/milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..0a61cf4e8 --- /dev/null +++ b/milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,46 @@ +--- +source: milli/src/update/word_prefix_pair_proximity_docids.rs +--- +5 a 1 [101, ] +5 a 2 [101, ] +5 b 4 [101, ] +5 be 4 [101, ] +am a 3 [101, ] +amazing a 1 [100, ] +amazing a 2 [100, ] +amazing a 3 [100, ] +amazing b 2 [100, ] +amazing be 2 [100, ] +an a 1 [100, ] +an a 2 [100, ] +an b 3 [100, ] +an be 3 [100, ] +and a 2 [100, ] +and a 3 [100, ] +and a 4 [100, ] +and b 1 [100, ] +and be 1 [100, ] +at a 1 [100, ] +at a 2 [100, 101, ] +at a 3 [100, ] +at b 3 [101, ] +at b 4 [100, ] +at be 3 [101, ] +at be 4 [100, ] +beautiful a 2 [100, ] +beautiful a 3 [100, ] +beautiful a 4 [100, ] +bell a 2 [101, ] +bell a 4 [101, ] +house a 3 [100, ] +house a 4 [100, ] +house b 2 [100, ] +house be 2 [100, ] +rings a 1 [101, ] +rings a 3 [101, ] +rings b 2 [101, ] +rings be 2 [101, ] +the a 3 [101, ] +the b 1 [101, ] +the be 1 [101, ] + diff --git a/milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..aabd9ddec --- /dev/null +++ b/milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,56 @@ +--- +source: milli/src/update/word_prefix_pair_proximity_docids.rs +--- +5 a 1 [101, ] +5 a 2 [101, ] +5 am 1 [101, ] +5 b 4 [101, ] +5 be 4 [101, ] +am a 3 [101, ] +amazing a 1 [100, ] +amazing a 2 [100, ] +amazing a 3 [100, ] +amazing b 2 [100, ] +amazing be 2 [100, ] +an a 1 [100, ] +an a 2 [100, 202, ] +an am 1 [100, ] +an b 3 [100, ] +an be 3 [100, ] +and a 2 [100, ] +and a 3 [100, ] +and a 4 [100, ] +and am 2 [100, ] +and b 1 [100, ] +and be 1 [100, ] +at a 1 [100, 202, ] +at a 2 [100, 101, ] +at a 3 [100, ] +at am 2 [100, 101, ] +at b 3 [101, ] +at b 4 [100, ] +at be 3 [101, ] +at be 4 [100, ] +beautiful a 2 [100, ] +beautiful a 3 [100, ] +beautiful a 4 [100, ] +beautiful am 3 [100, ] +bell a 2 [101, ] +bell a 4 [101, ] +bell am 4 [101, ] +extraordinary a 2 [202, ] 
+extraordinary a 3 [202, ]
+house a 3 [100, 202, ]
+house a 4 [100, 202, ]
+house am 4 [100, ]
+house b 2 [100, ]
+house be 2 [100, ]
+rings a 1 [101, ]
+rings a 3 [101, ]
+rings am 3 [101, ]
+rings b 2 [101, ]
+rings be 2 [101, ]
+the a 3 [101, ]
+the b 1 [101, ]
+the be 1 [101, ]
+
diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index 72b41c472..7e5d5c090 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -244,3 +244,90 @@ fn insert_current_prefix_data_in_sorter<'a>(
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use std::io::Cursor;
+
+    use crate::{
+        db_snap,
+        documents::{DocumentsBatchBuilder, DocumentsBatchReader},
+        index::tests::TempIndex,
+    };
+
+    fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
+        let mut documents = Vec::new();
+        for prefix in prefixes {
+            for i in 0..50 {
+                documents.push(
+                    serde_json::json!({
+                        "text": format!("{prefix}{i:x}"),
+                    })
+                    .as_object()
+                    .unwrap()
+                    .clone(),
+                )
+            }
+        }
+        documents
+    }
+
+    #[test]
+    fn test_update() {
+        let mut index = TempIndex::new();
+        index.index_documents_config.words_prefix_threshold = Some(50);
+        index.index_documents_config.autogenerate_docids = true;
+
+        index
+            .update_settings(|settings| {
+                settings.set_searchable_fields(vec!["text".to_owned()]);
+            })
+            .unwrap();
+
+        let batch_reader_from_documents = |documents| {
+            let mut builder = DocumentsBatchBuilder::new(Vec::new());
+            for object in documents {
+                builder.append_json_object(&object).unwrap();
+            }
+            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
+        };
+
+        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
+        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
+        documents.push(
+            serde_json::json!({
+                "text": "At an amazing and beautiful house"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+        documents.push(
+            serde_json::json!({
+                "text": "The bell rings at 5 am"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+
+        let documents = batch_reader_from_documents(documents);
+        index.add_documents(documents).unwrap();
+
+        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
+
+        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
+        documents.push(
+            serde_json::json!({
+                "text": "At an extraordinary house"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+        let documents = batch_reader_from_documents(documents);
+        index.add_documents(documents).unwrap();
+
+        db_snap!(index, word_prefix_pair_proximity_docids, "update");
+    }
+}
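
The 50 documents per prefix produced by
documents_with_enough_different_words_for_prefixes are not arbitrary: the test
sets words_prefix_threshold to Some(50), and a prefix only enters the prefix
databases once at least that many distinct words start with it, which is why
"a" and "be" (and later "am" and "an") show up as prefixes in the snapshots
above. A short self-contained illustration of that counting; qualifies_as_prefix
is an illustrative stand-in, not milli's implementation:

    fn qualifies_as_prefix(distinct_words_with_prefix: usize, threshold: usize) -> bool {
        // mirrors the intent of `words_prefix_threshold = Some(50)` in the test above
        distinct_words_with_prefix >= threshold
    }

    fn main() {
        // format!("{prefix}{i:x}") over 0..50 yields 50 distinct words "a0" ... "a31"
        let words: Vec<String> = (0..50).map(|i| format!("a{i:x}")).collect();
        assert!(qualifies_as_prefix(words.len(), 50));
    }

From 8ac24d3114e43622ba9a194ee2d574f4fab2861f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Thu, 4 Aug 2022 10:50:38 +0200
Subject: [PATCH 1534/1889] Cargo fmt + fix compiler warnings/error

---
 milli/src/search/facet/filter.rs              |  8 +++----
 milli/src/snapshot_tests.rs                   | 22 ++++++++++---------
 milli/src/update/facets.rs                    | 10 ++++-----
 .../word_prefix_pair_proximity_docids.rs      |  8 +++----
 4 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs
index 225d3ea8d..03ec03d39 100644
--- a/milli/src/search/facet/filter.rs
+++ b/milli/src/search/facet/filter.rs
@@ -490,14 +490,12 @@ impl<'a> From<FilterCondition<'a>> for Filter<'a> {
 #[cfg(test)]
 mod tests {
-    use std::fmt::Write;
-
+    use crate::index::tests::TempIndex;
+    use 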
crate::Filter; use big_s::S; use either::Either; use maplit::hashset; - - use crate::index::tests::TempIndex; - use crate::Filter; + use std::fmt::Write; #[test] fn empty_db() { diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index a881a155e..f9929bb22 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -1,14 +1,16 @@ -use crate::{ - heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FacetStringZeroBoundsValueCodec, - }, - make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index, -}; -use heed::{types::ByteSlice, BytesDecode}; -use roaring::RoaringBitmap; +use std::borrow::Cow; +use std::fmt::Write; use std::path::Path; -use std::{borrow::Cow, fmt::Write}; + +use heed::types::ByteSlice; +use heed::BytesDecode; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::{ + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FacetStringZeroBoundsValueCodec, +}; +use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; #[track_caller] pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings { diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 904f165b1..4c4963b56 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -347,11 +347,9 @@ fn write_string_entry( mod tests { use std::num::NonZeroUsize; - use crate::{ - db_snap, - documents::{batch_reader_from_documents, documents_batch_reader_from_objects}, - index::tests::TempIndex, - }; + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; #[test] fn test_facets_number() { @@ -378,7 +376,7 @@ mod tests { for i in 0..100 { documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); } - let documents = batch_reader_from_documents(&documents); + let documents = documents_batch_reader_from_objects(documents); index.add_documents(documents).unwrap(); diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 7e5d5c090..574b49e97 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -249,11 +249,9 @@ fn insert_current_prefix_data_in_sorter<'a>( mod tests { use std::io::Cursor; - use crate::{ - db_snap, - documents::{DocumentsBatchBuilder, DocumentsBatchReader}, - index::tests::TempIndex, - }; + use crate::db_snap; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); From 4bba2f41d784cd2df2ad3fb730deae19718c6f42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 4 Aug 2022 11:00:46 +0200 Subject: [PATCH 1535/1889] Switch to snapshot tests for query_tree.rs --- milli/src/search/query_tree.rs | 537 ++++++++------------------------- 1 file changed, 134 insertions(+), 403 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index e0fac0f43..ace1d9dfe 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -857,30 +857,16 @@ mod test { let query = "hey friends"; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - 
kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "friends".to_string()), - }), - ]), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "heyfriends".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "hey" } + PrefixTolerant { word: "friends", max typo: 1 } + PrefixTolerant { word: "heyfriends", max typo: 1 } + "###); } #[test] @@ -888,30 +874,16 @@ mod test { let query = "hey friends "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "friends".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "heyfriends".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "hey" } + Tolerant { word: "friends", max typo: 1 } + Tolerant { word: "heyfriends", max typo: 1 } + "###); } #[test] @@ -919,62 +891,24 @@ mod test { let query = "hello world "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hi".to_string()), - }), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("good".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("morning".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "hello".to_string()), - }), - ], - ), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("earth".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("nature".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ], - ), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "helloworld".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + OR + Exact { word: "hi" } + AND + Exact { word: "good" } + Exact { word: "morning" } + Tolerant { word: "hello", max typo: 1 } + OR + Exact { word: "earth" } + Exact { word: "nature" } + Tolerant { word: "world", max typo: 1 } + Tolerant { word: "helloworld", max typo: 1 } + "###); } #[test] @@ -982,97 +916,34 @@ mod test { let query = "new york city "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("new".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("york".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("city".to_string()), - }), - ]), - 
Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "yorkcity".to_string()), - }), - ], - ), - ]), - Operation::And(vec![ - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("nyc".to_string()), - }), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("new".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("york".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("city".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "newyork".to_string()), - }), - ], - ), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("city".to_string()), - }), - ]), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("nyc".to_string()), - }), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("new".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("york".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "newyorkcity".to_string()), - }), - ], - ), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "new" } + OR + AND + Exact { word: "york" } + Exact { word: "city" } + Tolerant { word: "yorkcity", max typo: 1 } + AND + OR + Exact { word: "nyc" } + AND + Exact { word: "new" } + Exact { word: "york" } + Exact { word: "city" } + Tolerant { word: "newyork", max typo: 1 } + Exact { word: "city" } + OR + Exact { word: "nyc" } + AND + Exact { word: "new" } + Exact { word: "york" } + Tolerant { word: "newyorkcity", max typo: 1 } + "###); } #[test] @@ -1080,30 +951,16 @@ mod test { let query = "n grams "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("n".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "grams".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "ngrams".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "n" } + Tolerant { word: "grams", max typo: 1 } + Tolerant { word: "ngrams", max typo: 1 } + "###); } #[test] @@ -1111,36 +968,18 @@ mod test { let query = "wordsplit fish "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Or( - false, - vec![ - Operation::Phrase(vec!["word".to_string(), "split".to_string()]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(2, "wordsplit".to_string()), - }), - ], - ), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("fish".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "wordsplitfish".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + 
OR + PHRASE ["word", "split"] + Tolerant { word: "wordsplit", max typo: 2 } + Exact { word: "fish" } + Tolerant { word: "wordsplitfish", max typo: 1 } + "###); } #[test] @@ -1148,15 +987,14 @@ mod test { let query = "\"hey friends\" \" \" \"wooop"; let tokens = query.tokenize(); - let expected = Operation::And(vec![ - Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), - ]); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE ["hey", "friends"] + Exact { word: "wooop" } + "###); } #[test] @@ -1164,15 +1002,14 @@ mod test { let query = "\"hey friends. wooop wooop\""; let tokens = query.tokenize(); - let expected = Operation::And(vec![ - Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), - Operation::Phrase(vec!["wooop".to_string(), "wooop".to_string()]), - ]); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE ["hey", "friends"] + PHRASE ["wooop", "wooop"] + "###); } #[test] @@ -1180,82 +1017,30 @@ mod test { let query = "hey my friend "; let tokens = query.tokenize(); - let expected = Operation::Or( - true, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("my".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "heymy".to_string()), - }), - ], - ), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("my".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "friend".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "myfriend".to_string()), - }), - ], - ), - ]), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "heymy".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "friend".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "heymyfriend".to_string()), - }), - ], - ), - ], - ); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR(WORD) + Exact { word: "hey" } + OR + AND + Exact { word: "hey" } + Exact { word: "my" } + Tolerant { word: "heymy", max typo: 1 } + OR + AND + Exact { word: "hey" } + OR + AND + Exact { word: "my" } + Tolerant { word: "friend", max typo: 1 } + Tolerant { word: "myfriend", max typo: 1 } + AND + Tolerant { word: "heymy", max typo: 1 } + Tolerant { word: "friend", max typo: 1 } + Tolerant { word: "heymyfriend", max typo: 1 } + "###); } #[test] @@ -1263,11 +1048,12 @@ mod test { let query = "\"hey my\""; let tokens = query.tokenize(); - let expected = 
Operation::Phrase(vec!["hey".to_string(), "my".to_string()]); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + PHRASE ["hey", "my"] + "###); } #[test] @@ -1275,68 +1061,27 @@ mod test { let query = r#""hey" my good "friend""#; let tokens = query.tokenize(); - let expected = Operation::Or( - true, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("friend".to_string()), - }), - ]), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("my".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("friend".to_string()), - }), - ]), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("my".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("good".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "mygood".to_string()), - }), - ], - ), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("friend".to_string()), - }), - ]), - ], - ); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR(WORD) + AND + Exact { word: "hey" } + Exact { word: "friend" } + AND + Exact { word: "hey" } + Exact { word: "my" } + Exact { word: "friend" } + AND + Exact { word: "hey" } + OR + AND + Exact { word: "my" } + Exact { word: "good" } + Tolerant { word: "mygood", max typo: 1 } + Exact { word: "friend" } + "###); } #[test] @@ -1344,29 +1089,16 @@ mod test { let query = "hey friends "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("friends".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("heyfriends".to_string()), - }), - ], - ); let (query_tree, _) = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "hey" } + Exact { word: "friends" } + Exact { word: "heyfriends" } + "###); } #[test] @@ -1374,15 +1106,14 @@ mod test { let query = "\"hey my\" good friend"; let tokens = query.tokenize(); - let expected = Operation::And(vec![ - Operation::Phrase(vec!["hey".to_string(), "my".to_string()]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), - ]); - let (query_tree, _) = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE ["hey", "my"] + Exact { word: "good" } + "###); } #[test] From a9c7d8269308e6e145437c5ff7de952fed8bbad3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 4 
Aug 2022 11:04:07 +0200 Subject: [PATCH 1536/1889] Switch to snapshot tests for search/criteria/attribute.rs --- milli/src/search/criteria/attribute.rs | 72 +++++++++++++++++++------- 1 file changed, 53 insertions(+), 19 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 3d67b60c0..d8feeeee9 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -632,25 +632,59 @@ mod tests { ]), ], ); - - let expected = vec![ - vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], - vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }], - vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], - ], - vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], - vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }], - ], - vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], - vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }], - vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], - ], - ]; - let result = flatten_query_tree(&query_tree); - assert_eq!(expected, result); + + insta::assert_debug_snapshot!(result, @r###" + [ + [ + [ + Exact { + word: "manythefish", + }, + ], + ], + [ + [ + Exact { + word: "manythe", + }, + ], + [ + Exact { + word: "fish", + }, + ], + ], + [ + [ + Exact { + word: "many", + }, + ], + [ + Exact { + word: "thefish", + }, + ], + ], + [ + [ + Exact { + word: "many", + }, + ], + [ + Exact { + word: "the", + }, + ], + [ + Exact { + word: "fish", + }, + ], + ], + ] + "###); } } From d2e01528a6e113e6090a670999fbd04452ca71f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 4 Aug 2022 11:18:32 +0200 Subject: [PATCH 1537/1889] Switch to snapshot tests for search/criteria/typo.rs --- milli/src/search/criteria/typo.rs | 206 +++++++++--------------------- milli/src/search/query_tree.rs | 5 - 2 files changed, 59 insertions(+), 152 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 97a9b4e4b..3ba158b3b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -349,22 +349,33 @@ mod test { use super::super::test::TestContext; use super::*; + fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String { + let mut result = String::new(); + while let Some(criterion) = criteria.next(&mut parameters).unwrap() { + result.push_str(&format!("{criterion:?}\n\n")); + } + result + } + #[test] fn initial_placeholder_no_facets() { let context = TestContext::default(); let query_tree = None; let facet_candidates = None; - let mut criterion_parameters = CriterionParameters { + let criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; let parent = Initial::new(query_tree, facet_candidates); - let mut criteria = Typo::new(&context, Box::new(parent)); + let criteria = Typo::new(&context, Box::new(parent)); + + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, bucket_candidates: None } - assert!(criteria.next(&mut criterion_parameters).unwrap().unwrap().candidates.is_none()); - assert!(criteria.next(&mut criterion_parameters).unwrap().is_none()); + "###); } #[test] @@ -390,78 +401,32 @@ mod test { let facet_candidates = None; - let 
mut criterion_parameters = CriterionParameters { + let criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; let parent = Initial::new(Some(query_tree), facet_candidates); - let mut criteria = Typo::new(&context, Box::new(parent)); + let criteria = Typo::new(&context, Box::new(parent)); - let candidates_1 = context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("world").unwrap().unwrap(); - let expected_1 = CriterionResult { - query_tree: Some(Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("world".to_string()), - }), - ])], - )), - candidates: Some(candidates_1.clone()), - bucket_candidates: Some(candidates_1), - filtered_candidates: None, - }; + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + OR + Exact { word: "word" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } - let candidates_2 = (context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("word").unwrap().unwrap()) - - context.word_docids("world").unwrap().unwrap(); - let expected_2 = CriterionResult { - query_tree: Some(Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact_with_typo(1, "word".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("world".to_string()), - }), - ], - ), - ])], - )), - candidates: Some(candidates_2.clone()), - bucket_candidates: Some(candidates_2), - filtered_candidates: None, - }; - - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); + "###); } #[test] @@ -470,25 +435,18 @@ mod test { let query_tree = None; let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criterion_parameters = CriterionParameters { + let criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; let parent = Initial::new(query_tree, Some(facet_candidates.clone())); - let mut criteria = Typo::new(&context, Box::new(parent)); + let criteria = Typo::new(&context, Box::new(parent)); - let expected = CriterionResult { - query_tree: None, - candidates: None, - bucket_candidates: None, - filtered_candidates: Some(facet_candidates.clone()), - }; + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + 
CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), bucket_candidates: None } - // first iteration, returns the facet candidates - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected)); - - // second iteration, returns None because there is no more things to do - assert!(criteria.next(&mut criterion_parameters).unwrap().is_none()); + "###); } #[test] @@ -514,77 +472,31 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criterion_parameters = CriterionParameters { + let criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); - let mut criteria = Typo::new(&context, Box::new(parent)); + let criteria = Typo::new(&context, Box::new(parent)); - let candidates_1 = context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("world").unwrap().unwrap(); - let expected_1 = CriterionResult { - query_tree: Some(Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("world".to_string()), - }), - ])], - )), - candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: Some(&candidates_1 & &facet_candidates), - filtered_candidates: None, - }; + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + OR + Exact { word: "word" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } - let candidates_2 = (context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("word").unwrap().unwrap()) - - context.word_docids("world").unwrap().unwrap(); - let expected_2 = CriterionResult { - query_tree: Some(Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact_with_typo(1, "word".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("world".to_string()), - }), - ], - ), - ])], - )), - candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: Some(&candidates_2 & &facet_candidates), - filtered_candidates: None, - }; - - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); + "###); } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index ace1d9dfe..617d9e4d9 100644 --- 
a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -99,11 +99,6 @@ impl QueryKind { QueryKind::Exact { original_typo: 0, word } } - #[cfg(test)] - pub fn exact_with_typo(original_typo: u8, word: String) -> Self { - QueryKind::Exact { original_typo, word } - } - pub fn tolerant(typo: u8, word: String) -> Self { QueryKind::Tolerant { typo, word } } From 051f24f67467d106ff605e7f7ec43577ac6281f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 4 Aug 2022 11:26:39 +0200 Subject: [PATCH 1538/1889] Switch to snapshot tests for search/matches/mod.rs --- milli/src/search/matches/mod.rs | 142 ++++++++++++++++++++------------ 1 file changed, 88 insertions(+), 54 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 72592c4cb..09ed24080 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -573,15 +573,18 @@ mod tests { let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); - + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." + ); + // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!( - &matcher.format(format_options), - "Natalie risk her future to build a world with the boy she loves." + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves." ); } @@ -602,19 +605,28 @@ mod tests { let text = "Ŵôřlḑôle"; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options), "Ŵôřlḑôle"); + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑôle" + ); // Text containing unicode match. let text = "Ŵôřlḑ"; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options), "Ŵôřlḑ"); + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑ" + ); // Text containing unicode match. let text = "Westfália"; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options), "Westfália"); + insta::assert_snapshot!( + matcher.format(format_options), + @"Westfália" + ); } #[test] @@ -628,83 +640,89 @@ mod tests { // empty text. let text = ""; let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ""); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); // text containing only separators. let text = ":-)"; let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ":-)"); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let mut matcher = builder.build(text); // no highlight should return 10 first words with a marker at the end. 
- assert_eq!( - &matcher.format(format_options), - "A quick brown fox can not jump 32 feet, right…" + insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" ); // Text without any match starting by a separator. let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; let mut matcher = builder.build(text); // no highlight should return 10 first words with a marker at the end. - assert_eq!( - &matcher.format(format_options), - "(A quick brown fox can not jump 32 feet, right…" + insta::assert_snapshot!( + matcher.format(format_options), + @"(A quick brown fox can not jump 32 feet, right…" ); // Test phrase propagation let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; let mut matcher = builder.build(text); // should crop the phrase instead of croping around the match. - assert_eq!( - &matcher.format(format_options), - "… Split The World is a book written by Emily Henry…", + insta::assert_snapshot!( + matcher.format(format_options), + @"… Split The World is a book written by Emily Henry…" ); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let mut matcher = builder.build(text); // no highlight should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…future to build a world with the boy she loves…" + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" ); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let mut matcher = builder.build(text); // no highlight should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…she loves. Emily Henry: The Love That Split The World." + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." ); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…void void void void void split the world void void" + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" ); // Text containing matches with diferent density. let text = "split void the void void world void void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…void void void void void split the world void void" + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" ); // Text containing matches with same word. let text = "split split split split split split void void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. 
- assert_eq!( - &matcher.format(format_options), - "…void void void void void split the world void void" + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" ); } @@ -719,44 +737,53 @@ mod tests { // empty text. let text = ""; let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ""); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); // text containing only separators. let text = ":-)"; let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ":-)"); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let mut matcher = builder.build(text); // both should return 10 first words with a marker at the end. - assert_eq!( - &matcher.format(format_options), - "A quick brown fox can not jump 32 feet, right…" + insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" ); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let mut matcher = builder.build(text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!( - &matcher.format(format_options), - "…future to build a world with the boy she loves…" + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" ); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let mut matcher = builder.build(text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: The Love That Split The World."); + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." + ); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…void void void void void split the world void void" + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" ); } @@ -773,19 +800,28 @@ mod tests { let format_options = FormatOptions { highlight: false, crop: Some(2) }; let mut matcher = builder.build(text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(format_options), "…split the…"); + insta::assert_snapshot!( + matcher.format(format_options), + @"…split the…" + ); // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(1) }; let mut matcher = builder.build(text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(format_options), "…split…"); + insta::assert_snapshot!( + matcher.format(format_options), + @"…split…" + ); // set crop size to 0 let format_options = FormatOptions { highlight: false, crop: Some(0) }; let mut matcher = builder.build(text); // because crop size is 0, crop is ignored. 
- assert_eq!(&matcher.format(format_options), "void void split the world void void."); + insta::assert_snapshot!( + matcher.format(format_options), + @"void void split the world void void." + ); } #[test] @@ -820,11 +856,9 @@ mod tests { let text = "the do or die can't be he do and or isn't he"; let mut matcher = builder.build(text); - assert_eq!( - &matcher.format(format_options), - "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", - "matches: {:?}", - &matcher.matches + insta::assert_snapshot!( + matcher.format(format_options), + @"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_" ); } } From 748bb86b5be3ba942e6b9ed5ce60b8d4430ed754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 4 Aug 2022 11:34:10 +0200 Subject: [PATCH 1539/1889] cargo fmt --- milli/src/search/criteria/typo.rs | 2 +- milli/src/search/facet/filter.rs | 8 +++++--- milli/src/search/matches/mod.rs | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 3ba158b3b..e9e6fb2f5 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -370,7 +370,7 @@ mod test { let parent = Initial::new(query_tree, facet_candidates); let criteria = Typo::new(&context, Box::new(parent)); - + let result = display_criteria(criteria, criterion_parameters); insta::assert_snapshot!(result, @r###" CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, bucket_candidates: None } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 03ec03d39..225d3ea8d 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -490,12 +490,14 @@ impl<'a> From> for Filter<'a> { #[cfg(test)] mod tests { - use crate::index::tests::TempIndex; - use crate::Filter; + use std::fmt::Write; + use big_s::S; use either::Either; use maplit::hashset; - use std::fmt::Write; + + use crate::index::tests::TempIndex; + use crate::Filter; #[test] fn empty_db() { diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 09ed24080..2697405be 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -574,10 +574,10 @@ mod tests { let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. insta::assert_snapshot!( - matcher.format(format_options), + matcher.format(format_options), @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." ); - + // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let mut matcher = builder.build(text); From ce560fdcb5d7b3aa2150be57d5e4e80b0038fd84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 4 Aug 2022 12:25:39 +0200 Subject: [PATCH 1540/1889] Add documentation for `db_snap!` --- milli/src/snapshot_tests.rs | 65 +++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f9929bb22..2b55a7e2c 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -30,6 +30,71 @@ pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Setti settings } +/** +Create a snapshot test of the given database. + +## Arguments +1. The identifier for the `Index` +2. The content of the index to snapshot. 
Available options are: + - `settings` + - `word_docids` + - `exact_word_docids` + - `word_prefix_docids` + - `exact_word_prefix_docids` + - `docid_word_positions` + - `word_pair_proximity_docids` + - `word_prefix_pair_proximity_docids` + - `word_position_docids` + - `field_id_word_count_docids` + - `word_prefix_position_docids` + - `facet_id_f64_docids` + - `facet_id_string_docids` + - `documents_ids` + - `stop_words` + - `soft_deleted_documents_ids` + - `field_distribution` + - `fields_ids_map` + - `geo_faceted_documents_ids` + - `external_documents_ids` + - `number_faceted_documents_ids` + - `string_faceted_documents_ids` + - `words_fst` + - `words_prefixes_fst` + +3. The identifier for the snapshot test (optional) +4. `@""` to write the snapshot inline (optional) + +## Behaviour +The content of the database will be printed either inline or to the file system +at `test_directory/test_file.rs/test_name/db_name.snap`. + +If the database is too large, then only the hash of the database will be saved, with +the name `db_name.hash.snap`. To *also* save the full content of the database anyway, +set the `MILLI_TEST_FULL_SNAPS` environment variable to `true`. The full snapshot will +be saved with the name `db_name.full.snap` but will not be saved to the git repository. + +Running `cargo test` will check whether the old snapshot is identical to the +current one. If they are equal, the test passes. Otherwise, the test fails. + +Use the command line `cargo insta` to approve or reject new snapshots. + +## Example +```ignore +let index = TempIndex::new(); + +// basic usages +db_snap!(index, word_docids); + +// named snapshot to avoid conflicts +db_snap!(index, word_docids, "some_identifier"); + +// write the snapshot inline +db_snap!(index, word_docids, @""); // will be autocompleted by running `cargo insta review` + +// give a name to the inline snapshot +db_snap!(index, word_docids, "some_identifier", @""); +``` +*/ #[macro_export] macro_rules! 
db_snap { ($index:ident, $db_name:ident, $name:expr) => { From 4b7fd4dfae9234492378ad51c0e7a2a0558dfb62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 10:42:54 +0200 Subject: [PATCH 1541/1889] Update insta version --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b745d970a..2bb6a50a1 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -51,7 +51,7 @@ csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" -insta = "1.17.1" +insta = "1.18.1" maplit = "1.0.2" md5 = "0.7.0" rand = "0.8.5" From 12920f2a4f3f3613233801c858ce9cae214fb03b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 12:10:45 +0200 Subject: [PATCH 1542/1889] Fix paths of snapshot tests --- milli/src/snapshot_tests.rs | 7 ++++--- .../default/facet_id_f64_docids.hash.snap | 0 .../facet_id_f64_docids.hash.snap | 0 .../facet_id_f64_docids.hash.snap | 0 .../facet_id_f64_docids.hash.snap | 0 .../facet_id_f64_docids.hash.snap | 0 .../tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap | 0 .../default/facet_id_string_docids.hash.snap | 0 .../facet_id_string_docids.hash.snap | 0 .../initial/word_prefix_pair_proximity_docids.snap | 0 .../update/word_prefix_pair_proximity_docids.snap | 0 11 files changed, 4 insertions(+), 3 deletions(-) rename milli/src/update/snapshots/{update => }/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap (100%) rename milli/src/update/snapshots/{update => }/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap (100%) rename milli/src/update/snapshots/{update => }/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap (100%) rename milli/src/update/snapshots/{update => }/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap (100%) rename milli/src/update/snapshots/{update => }/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap (100%) rename milli/src/update/snapshots/{update => }/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap (100%) rename milli/src/update/snapshots/{update => }/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap (100%) rename milli/src/update/snapshots/{update => }/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap (100%) rename milli/src/update/snapshots/{update => }/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap (100%) rename milli/src/update/snapshots/{update => }/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap (100%) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 2b55a7e2c..c6e99a437 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -17,14 +17,15 @@ pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Setti let mut settings = insta::Settings::clone_current(); settings.set_prepend_module_to_snapshot(false); let path = Path::new(std::panic::Location::caller().file()); - let path = path.strip_prefix("milli/src").unwrap(); + let filename = path.file_name().unwrap().to_str().unwrap(); settings.set_omit_expression(true); let test_name = std::thread::current().name().unwrap().rsplit("::").next().unwrap().to_owned(); if let Some(name) = name { - settings.set_snapshot_path(Path::new("snapshots").join(path).join(test_name).join(name)); + settings 
+ .set_snapshot_path(Path::new("snapshots").join(filename).join(test_name).join(name)); } else { - settings.set_snapshot_path(Path::new("snapshots").join(path).join(test_name)); + settings.set_snapshot_path(Path::new("snapshots").join(filename).join(test_name)); } settings diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap similarity index 100% rename from milli/src/update/snapshots/update/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap rename to milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap similarity index 100% rename from milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap similarity index 100% rename from milli/src/update/snapshots/update/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap similarity index 100% rename from milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap similarity index 100% rename from milli/src/update/snapshots/update/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap similarity index 100% rename from milli/src/update/snapshots/update/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap similarity index 
100% rename from milli/src/update/snapshots/update/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap rename to milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap diff --git a/milli/src/update/snapshots/update/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap similarity index 100% rename from milli/src/update/snapshots/update/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap rename to milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap diff --git a/milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap rename to milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/snapshots/update/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap rename to milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap From 6f4912622360934a1f7a87fef888fc4415201b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 15:53:01 +0200 Subject: [PATCH 1543/1889] Fix db_snap macro with inline parameter --- milli/src/snapshot_tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index c6e99a437..eac3340fd 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -135,7 +135,7 @@ macro_rules! 
db_snap { }); }; ($index:ident, $db_name:ident, $name:literal, @$inline:literal) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(format!("", $name))); + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); From fb2b6c0c28dc6c9135ce5a33f87f3918c6062cfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 16:56:42 +0200 Subject: [PATCH 1544/1889] Use mimalloc for benchmarks on all platforms --- benchmarks/Cargo.toml | 6 ------ benchmarks/benches/formatting.rs | 5 ----- benchmarks/benches/indexing.rs | 5 ----- benchmarks/benches/search_geo.rs | 5 ----- benchmarks/benches/search_songs.rs | 5 ----- benchmarks/benches/search_wiki.rs | 5 ----- 6 files changed, 31 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 1dc9941c3..e63210573 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -9,14 +9,8 @@ milli = { path = "../milli" } anyhow = "1.0.56" serde_json = { version = "1.0.79", features = ["preserve_order"] } csv = "1.1.6" - -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2" - -[target.'cfg(target_os = "macos")'.dependencies] mimalloc = { version = "0.1.29", default-features = false } - [dev-dependencies] heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } criterion = { version = "0.3.5", features = ["html_reports"] } diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs index be9d965a9..25c5a0ba8 100644 --- a/benchmarks/benches/formatting.rs +++ b/benchmarks/benches/formatting.rs @@ -2,11 +2,6 @@ use criterion::{criterion_group, criterion_main}; use milli::tokenizer::TokenizerBuilder; use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; -#[cfg(target_os = "linux")] -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - -#[cfg(target_os = "macos")] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index d0a091298..d532c85d9 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -14,11 +14,6 @@ use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; use roaring::RoaringBitmap; -#[cfg(target_os = "linux")] -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - -#[cfg(target_os = "macos")] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/benchmarks/benches/search_geo.rs b/benchmarks/benches/search_geo.rs index 65aeef01e..faea4e3e0 100644 --- a/benchmarks/benches/search_geo.rs +++ b/benchmarks/benches/search_geo.rs @@ -5,11 +5,6 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; -#[cfg(target_os = "linux")] -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - -#[cfg(target_os = "macos")] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/benchmarks/benches/search_songs.rs b/benchmarks/benches/search_songs.rs index 05ba39cdd..a1245528f 100644 --- a/benchmarks/benches/search_songs.rs +++ b/benchmarks/benches/search_songs.rs @@ -5,11 +5,6 @@ use criterion::{criterion_group, criterion_main}; use 
milli::update::Settings; use utils::Conf; -#[cfg(target_os = "linux")] -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - -#[cfg(target_os = "macos")] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/benchmarks/benches/search_wiki.rs b/benchmarks/benches/search_wiki.rs index 20d62fba6..b792c2645 100644 --- a/benchmarks/benches/search_wiki.rs +++ b/benchmarks/benches/search_wiki.rs @@ -5,11 +5,6 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; -#[cfg(target_os = "linux")] -#[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; - -#[cfg(target_os = "macos")] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; From dea00311b681316caf441b3d4718c6d0e297f504 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 16 Aug 2022 09:19:17 +0200 Subject: [PATCH 1545/1889] Add type annotations to remove compiler error --- milli/src/error.rs | 4 ++-- milli/src/search/facet/filter.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 80c923bd9..c817f64fa 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -99,7 +99,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco )] InvalidDocumentId { document_id: Value }, #[error("Invalid facet distribution, the fields `{}` are not set as filterable.", - .invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ") + .invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ") )] InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> }, #[error(transparent)] @@ -111,7 +111,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco match .valid_fields.is_empty() { true => "This index does not have configured sortable attributes.".to_string(), false => format!("Available sortable attributes are: `{}`.", - valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ") + valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ") ), } )] diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 225d3ea8d..90aab826a 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -45,7 +45,7 @@ impl<'a> Display for FilterError<'a> { attribute, ) } else { - let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(" "); + let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(" "); write!( f, From 20be69e1b94aa375077b8a55630a88b050d652a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 16 Aug 2022 20:09:36 +0200 Subject: [PATCH 1546/1889] Always use mimalloc as the global allocator --- cli/Cargo.toml | 3 +-- cli/src/main.rs | 3 +-- helpers/Cargo.toml | 3 +-- helpers/src/main.rs | 3 +-- http-ui/Cargo.toml | 3 +-- http-ui/src/main.rs | 3 +-- infos/Cargo.toml | 4 +--- infos/src/main.rs | 3 +-- milli/fuzz/Cargo.toml | 4 +--- milli/fuzz/fuzz_targets/indexing.rs | 3 +-- 10 files changed, 10 insertions(+), 22 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index e4de70031..9ca03894b 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -18,6 +18,5 @@ byte-unit = { version = "4.0.14", features = ["serde"] } bimap = "0.6.2" csv = "1.1.6" stderrlog = "0.5.1" +mimalloc = { version = "0.1.29", default-features = false } -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2"
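Each binary crate touched by this patch gets the same treatment as `cli` above: the two cfg-gated allocator declarations are collapsed into a single unconditional one. A minimal sketch of the pattern every binary ends up with (the `main` body here is illustrative only):

```rust
// Unconditional global allocator, as introduced by this patch. It replaces
// the previous #[cfg(target_os = "...")]-gated jemalloc/mimalloc pair.
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

fn main() {
    // Every heap allocation in this binary now goes through mimalloc,
    // regardless of the target platform.
    let words = vec!["word", "prefix", "proximity"];
    assert_eq!(words.len(), 3);
}
```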
diff --git a/cli/src/main.rs b/cli/src/main.rs index 35fef95c6..8485560f5 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -16,9 +16,8 @@ use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerCon use milli::{Index, Object}; use structopt::StructOpt; -#[cfg(target_os = "linux")] #[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; #[derive(Debug, StructOpt)] #[structopt(name = "Milli CLI", about = "A simple CLI to manipulate a milli index.")] diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 46c50de43..4d32dc32a 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -14,5 +14,4 @@ milli = { path = "../milli" } stderrlog = "0.5.1" structopt = { version = "0.3.26", default-features = false } -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2" +mimalloc = { version = "0.1.29", default-features = false } \ No newline at end of file diff --git a/helpers/src/main.rs b/helpers/src/main.rs index b325aef89..0081965ad 100644 --- a/helpers/src/main.rs +++ b/helpers/src/main.rs @@ -5,9 +5,8 @@ use heed::{CompactionOption, Env, EnvOpenOptions}; use structopt::StructOpt; use Command::*; -#[cfg(target_os = "linux")] #[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; #[derive(Debug, StructOpt)] /// Some helpers commands for milli. diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 43e046c11..b3763409e 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -30,6 +30,7 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] } tokio = { version = "1.17.0", features = ["full"] } tokio-stream = { version = "0.1.8", default-features = false, features = ["sync"] } warp = "0.3.2" +mimalloc = { version = "0.1.29", default-features = false } # logging log = "0.4.14" @@ -45,5 +46,3 @@ csv = "1.1.6" maplit = "1.0.2" serde_test = "1.0.136" -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 83fce9a9c..de5d3c5ab 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -42,9 +42,8 @@ use warp::Filter; use self::update_store::UpdateStore; -#[cfg(target_os = "linux")] #[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; static GLOBAL_CONFIG: OnceCell = OnceCell::new(); diff --git a/infos/Cargo.toml b/infos/Cargo.toml index ea1ee9193..7cbe52693 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -15,6 +15,4 @@ roaring = "0.9.0" serde_json = "1.0.79" stderrlog = "0.5.1" structopt = { version = "0.3.26", default-features = false } - -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2" +mimalloc = { version = "0.1.29", default-features = false } diff --git a/infos/src/main.rs b/infos/src/main.rs index feec17557..2862fed7a 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -11,9 +11,8 @@ use milli::{FieldId, Index}; use structopt::StructOpt; use Command::*; -#[cfg(target_os = "linux")] #[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; const ALL_DATABASE_NAMES: &[&str] = &[ MAIN, diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml index e734936fb..7e1bea3c5 100644 --- a/milli/fuzz/Cargo.toml +++ b/milli/fuzz/Cargo.toml @@ -16,9 +16,7 @@ serde_json = { version = "1.0.62", 
features = ["preserve_order"] } anyhow = "1.0" tempfile = "3.3" arbitrary-json = "0.1.0" - -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2" +mimalloc = { version = "0.1.29", default-features = false } [dependencies.milli] path = ".." diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index e4f42655e..a447aebe2 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -12,9 +12,8 @@ use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Setting use milli::Index; use serde_json::{Map, Value}; -#[cfg(target_os = "linux")] #[global_allocator] -static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; /// reads json from input and write an obkv batch to writer. pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { From f20e588ec1a19bcfbbd28de7b0636951edc40d4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 07:44:33 +0200 Subject: [PATCH 1547/1889] Make sure there is one newline at eof in cargo.toml --- cli/Cargo.toml | 1 - helpers/Cargo.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 9ca03894b..e012c8a33 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -19,4 +19,3 @@ bimap = "0.6.2" csv = "1.1.6" stderrlog = "0.5.1" mimalloc = { version = "0.1.29", default-features = false } - diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 4d32dc32a..1167bd353 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -14,4 +14,4 @@ milli = { path = "../milli" } stderrlog = "0.5.1" structopt = { version = "0.3.26", default-features = false } -mimalloc = { version = "0.1.29", default-features = false } \ No newline at end of file +mimalloc = { version = "0.1.29", default-features = false } From 03e679b634b8e70b34cb20069a611f028de84476 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 16 Aug 2022 20:50:08 +0200 Subject: [PATCH 1548/1889] Make binaries faster on release profile through better compile options --- Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 506fd3dc3..f0fc59499 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,8 @@ opt-level = 3 [profile.release] debug = true +codegen-units = 1 +lto = "thin" # Make sure that the build scripts and proc-macros are compiled with # all the optimizations. It speeds up the zip crate that we use in the build.rs. 
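For reference, a sketch of what the `[profile.release]` section looks like once this patch is applied; only the keys visible in the diff context are shown, the rest of the file is elided:

```toml
# Sketch assembled from the diff context above, not the full Cargo.toml.
[profile.release]
debug = true        # keep debug info in release builds (useful for profiling)
codegen-units = 1   # one codegen unit per crate: slower builds, better codegen
lto = "thin"        # thin link-time optimization across crate boundaries
```

Both `codegen-units = 1` and `lto = "thin"` trade compile time for runtime speed, which matches the commit's goal of making the release binaries faster.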
From 5d59bfde8a102f2da004619b8981fd8030316bf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 11:46:56 +0200 Subject: [PATCH 1549/1889] Sort Cargo.toml dependencies --- benchmarks/Cargo.toml | 8 ++++---- cli/Cargo.toml | 18 +++++++++--------- helpers/Cargo.toml | 3 +-- http-ui/Cargo.toml | 6 +++--- infos/Cargo.toml | 2 +- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index e63210573..600525372 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -5,15 +5,15 @@ edition = "2018" publish = false [dependencies] -milli = { path = "../milli" } anyhow = "1.0.56" -serde_json = { version = "1.0.79", features = ["preserve_order"] } csv = "1.1.6" +milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } +serde_json = { version = "1.0.79", features = ["preserve_order"] } [dev-dependencies] -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } criterion = { version = "0.3.5", features = ["html_reports"] } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } rand = "0.8.5" rand_chacha = "0.3.1" roaring = "0.9.0" @@ -21,8 +21,8 @@ roaring = "0.9.0" [build-dependencies] anyhow = "1.0.56" bytes = "1.1.0" -flate2 = "1.0.22" convert_case = "0.5.0" +flate2 = "1.0.22" reqwest = { version = "0.11.9", features = ["blocking", "rustls-tls"], default-features = false } [[bench]] diff --git a/cli/Cargo.toml b/cli/Cargo.toml index e012c8a33..504df712e 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -6,16 +6,16 @@ description = "A CLI to interact with a milli index" publish = false [dependencies] +bimap = "0.6.2" +byte-unit = { version = "4.0.14", features = ["serde"] } +color-eyre = "0.6.1" +csv = "1.1.6" +eyre = "0.6.7" +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } indicatif = "0.16.2" +milli = { path = "../milli" } +mimalloc = { version = "0.1.29", default-features = false } serde = "1.0.136" serde_json = "1.0.79" -structopt = "0.3.26" -milli = { path = "../milli" } -eyre = "0.6.7" -color-eyre = "0.6.1" -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } -byte-unit = { version = "4.0.14", features = ["serde"] } -bimap = "0.6.2" -csv = "1.1.6" stderrlog = "0.5.1" -mimalloc = { version = "0.1.29", default-features = false } +structopt = "0.3.26" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 1167bd353..bd09574f3 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -11,7 +11,6 @@ anyhow = "1.0.56" byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } +mimalloc = { version = "0.1.29", default-features = false } stderrlog = "0.5.1" structopt = { version = "0.3.26", default-features = false } - -mimalloc = { version = "0.1.29", default-features = false } diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index b3763409e..993818f93 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -13,6 +13,7 @@ crossbeam-channel = "0.5.2" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } memmap2 = "0.5.3" milli = { path = "../milli" } +mimalloc = { version = "0.1.29", default-features = false } once_cell = "1.10.0" rayon = "1.5.1" structopt = { version = "0.3.26", default-features = false, 
features = ["wrap_help"] } @@ -30,17 +31,16 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] } tokio = { version = "1.17.0", features = ["full"] } tokio-stream = { version = "0.1.8", default-features = false, features = ["sync"] } warp = "0.3.2" -mimalloc = { version = "0.1.29", default-features = false } # logging +fst = "0.4.7" log = "0.4.14" stderrlog = "0.5.1" -fst = "0.4.7" # Temporary fix for bitvec, remove once fixed. (https://github.com/bitvecto-rs/bitvec/issues/105) -funty = "2.0.0" bimap = "0.6.2" csv = "1.1.6" +funty = "2.0.0" [dev-dependencies] maplit = "1.0.2" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 7cbe52693..8c92ae649 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -11,8 +11,8 @@ byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } csv = "1.1.6" heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } +mimalloc = { version = "0.1.29", default-features = false } roaring = "0.9.0" serde_json = "1.0.79" stderrlog = "0.5.1" structopt = { version = "0.3.26", default-features = false } -mimalloc = { version = "0.1.29", default-features = false } From 306593144d74ed673592cf772e12d2225d1e5518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 6 Jul 2022 18:20:15 +0200 Subject: [PATCH 1550/1889] Refactor word prefix pair proximity indexation --- infos/src/main.rs | 1 + milli/src/heed_codec/mod.rs | 1 + milli/src/heed_codec/str_str_u8_codec.rs | 35 +- milli/src/lib.rs | 2 +- .../extract_word_pair_proximity_docids.rs | 1 + .../word_prefix_pair_proximity_docids/mod.rs | 468 ++++++++++++++++++ .../readme.md | 144 ++++++ 7 files changed, 649 insertions(+), 3 deletions(-) create mode 100644 milli/src/update/word_prefix_pair_proximity_docids/mod.rs create mode 100644 milli/src/update/word_prefix_pair_proximity_docids/readme.md diff --git a/infos/src/main.rs b/infos/src/main.rs index feec17557..4e05ce0a5 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -1153,6 +1153,7 @@ fn word_pair_proximities_docids( prefix.extend_from_slice(word1.as_bytes()); prefix.push(0); prefix.extend_from_slice(word2.as_bytes()); + prefix.push(0); let db = index.word_pair_proximity_docids.as_polymorph(); let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?; diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 2f2a01192..02235f26d 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -16,3 +16,4 @@ pub use self::roaring_bitmap_length::{ }; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::StrStrU8Codec; +pub use self::str_str_u8_codec::UncheckedStrStrU8Codec; diff --git a/milli/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs index 2454e7d56..888e08752 100644 --- a/milli/src/heed_codec/str_str_u8_codec.rs +++ b/milli/src/heed_codec/str_str_u8_codec.rs @@ -9,9 +9,11 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { fn bytes_decode(bytes: &'a [u8]) -> Option { let (n, bytes) = bytes.split_last()?; let s1_end = bytes.iter().position(|b| *b == 0)?; - let (s1_bytes, s2_bytes) = bytes.split_at(s1_end); + let (s1_bytes, rest) = bytes.split_at(s1_end); + let rest = &rest[1..]; let s1 = str::from_utf8(s1_bytes).ok()?; - let s2 = str::from_utf8(&s2_bytes[1..]).ok()?; + let (_, s2_bytes) = rest.split_last()?; + let s2 = str::from_utf8(s2_bytes).ok()?; Some((s1, s2, *n)) } } @@ -24,6 +26,35 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { 
bytes.extend_from_slice(s1.as_bytes()); bytes.push(0); bytes.extend_from_slice(s2.as_bytes()); + bytes.push(0); + bytes.push(*n); + Some(Cow::Owned(bytes)) + } +} +pub struct UncheckedStrStrU8Codec; + +impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec { + type DItem = (&'a [u8], &'a [u8], u8); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (n, bytes) = bytes.split_last()?; + let s1_end = bytes.iter().position(|b| *b == 0)?; + let (s1_bytes, rest) = bytes.split_at(s1_end); + let rest = &rest[1..]; + let (_, s2_bytes) = rest.split_last()?; + Some((s1_bytes, s2_bytes, *n)) + } +} + +impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec { + type EItem = (&'a [u8], &'a [u8], u8); + + fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); + bytes.extend_from_slice(s1); + bytes.push(0); + bytes.extend_from_slice(s2); + bytes.push(0); bytes.push(*n); Some(Cow::Owned(bytes)) } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 85b25cad1..ac88ebdab 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,7 @@ pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, - RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, + RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, UncheckedStrStrU8Codec, }; pub use self::index::Index; pub use self::search::{ diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 90349eb93..5117bfaba 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -142,6 +142,7 @@ fn document_word_positions_into_sorter<'b>( key_buffer.extend_from_slice(w1.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); + key_buffer.push(0); key_buffer.push(prox as u8); word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs new file mode 100644 index 000000000..119c0c53e --- /dev/null +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -0,0 +1,468 @@ +use grenad::CompressionType; +use heed::types::ByteSlice; + +use heed::BytesDecode; +use log::debug; + +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; +use std::time::Instant; + +use crate::update::index_documents::{ + create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, +}; +use crate::{Index, Result, UncheckedStrStrU8Codec}; + +pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, + max_proximity: u8, + max_prefix_length: usize, +} + +impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { + WordPrefixPairProximityDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_nb_chunks: None, + 
max_memory: None, + max_proximity: 4, + max_prefix_length: 2, + } + } + + /// Set the maximum proximity required to make a prefix be part of the words prefixes + /// database. If two words are too far from the threshold the associated documents will + /// not be part of the prefix database. + /// + /// Default value is 4. This value must be lower or equal than 7 and will be clamped + /// to this bound otherwise. + pub fn max_proximity(&mut self, value: u8) -> &mut Self { + self.max_proximity = value.max(7); + self + } + + /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words + /// prefixes database. If the prefix length is higher than the threshold, the associated documents + /// will not be part of the prefix database. + /// + /// Default value is 2. + pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value; + self + } + + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] + pub fn execute<'a>( + mut self, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &'a [String], + common_prefix_fst_words: &[&'a [String]], + del_prefix_fst_words: &HashSet>, + ) -> Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + if !del_prefix_fst_words.is_empty() { + let mut iter = self + .index + .word_prefix_pair_proximity_docids + .remap_data_type::() + .iter_mut(self.wtxn)?; + while let Some(((_, w2, _), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(w2.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } + } + } + + // We construct a Trie of all the prefixes that are smaller than the max prefix length + // This is an optimisation that allows us to iterate over all prefixes of a word quickly. + let new_prefix_fst_words = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + let common_prefix_fst_words = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + let mut allocations = Allocations::default(); + let mut batch = PrefixAndProximityBatch::default(); + + if !common_prefix_fst_words.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + + while let Some((key, data)) = cursor.move_on_next()? 
{ + let (word1, word2, proximity) = + UncheckedStrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + + if proximity <= self.max_proximity { + batch.flush_if_necessary( + word1, + word2, + &mut allocations, + &mut |key, value| { + insert_into_database( + &mut self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + self.insert_word_prefix_pair_proximity_docids_into_batch( + word2, + proximity, + data, + &common_prefix_fst_words, + &mut batch, + &mut allocations, + )?; + } + } + batch.flush(&mut allocations, &mut |key, value| { + insert_into_database( + &mut self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + })?; + } + + if !new_prefix_fst_words.is_empty() { + let mut db_iter = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .remap_data_type::() + .iter(self.wtxn)?; + + let mut writer = create_writer( + self.chunk_compression_type, + self.chunk_compression_level, + tempfile::tempfile()?, + ); + + while let Some(((word1, word2, proximity), data)) = db_iter.next().transpose()? { + if proximity <= self.max_proximity { + batch.flush_if_necessary( + word1, + word2, + &mut allocations, + &mut |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + self.insert_word_prefix_pair_proximity_docids_into_batch( + word2, + proximity, + data, + &new_prefix_fst_words, + &mut batch, + &mut allocations, + )?; + } + } + batch.flush(&mut allocations, &mut |key, value| { + writer.insert(key, value).map_err(|e| e.into()) + })?; + + drop(db_iter); + writer_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + writer, + )?; + } + + Ok(()) + } + + fn insert_word_prefix_pair_proximity_docids_into_batch<'b, 'c>( + &self, + word2: &[u8], + proximity: u8, + data: &'b [u8], + prefixes: &'c PrefixTrieNode, + writer: &'b mut PrefixAndProximityBatch, + allocations: &mut Allocations, + ) -> Result<()> { + let mut prefix_buffer = allocations.take_byte_vector(); + prefixes.for_each_prefix_of(word2, &mut prefix_buffer, |prefix| { + let mut value = allocations.take_byte_vector(); + value.extend_from_slice(&data); + writer.insert(prefix, proximity, value, allocations); + }); + allocations.reclaim_byte_vector(prefix_buffer); + Ok(()) + } +} + +/** +A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). +The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. + +It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. + +A batch is valid only for a specific `word1`. Also, all prefixes stored in the batch start with the same letter. Make sure to +call [`self.flush_if_necessary`](Self::flush_if_necessary) before inserting a list of sorted `(prefix, proximity)` (and where each +`prefix` starts with the same letter) in order to uphold these invariants. + +The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content +can be inserted into the database in sorted order. 
When it is flushed, it calls a user-provided closure with the following arguments: +- key : (word1, prefix, proximity) as bytes +- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes +*/ +#[derive(Default)] +struct PrefixAndProximityBatch { + batch: Vec<(Vec, Vec>)>, + word1: Vec, + word2_start: u8, +} + +impl PrefixAndProximityBatch { + fn insert( + &mut self, + new_prefix: &[u8], + new_proximity: u8, + new_value: Vec, + allocations: &mut Allocations, + ) { + let mut key = allocations.take_byte_vector(); + key.extend_from_slice(new_prefix); + key.push(0); + key.push(new_proximity); + + if let Some(position) = self.batch.iter().position(|(k, _)| k >= &key) { + let (existing_key, existing_data) = &mut self.batch[position]; + if existing_key == &key { + existing_data.push(Cow::Owned(new_value)); + } else { + let mut mergeable_data = allocations.take_mergeable_data_vector(); + mergeable_data.push(Cow::Owned(new_value)); + self.batch.insert(position, (key, mergeable_data)); + } + } else { + let mut mergeable_data = allocations.take_mergeable_data_vector(); + mergeable_data.push(Cow::Owned(new_value)); + self.batch.push((key, mergeable_data)); + } + } + + /// Call [`self.flush`](Self::flush) if `word1` changed or if `word2` begins with a different letter than the + /// previous word2. Update `prev_word1` and `prev_word2_start` with the new values from `word1` and `word2`. + fn flush_if_necessary( + &mut self, + word1: &[u8], + word2: &[u8], + allocations: &mut Allocations, + insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, + ) -> Result<()> { + let word2_start = word2[0]; + if word1 != self.word1 { + self.flush(allocations, insert)?; + self.word1.clear(); + self.word1.extend_from_slice(word1); + if word2_start != self.word2_start { + self.word2_start = word2_start; + } + } + if word2_start != self.word2_start { + self.flush(allocations, insert)?; + self.word2_start = word2_start; + } + Ok(()) + } + + /// Empties the batch, calling `insert` on each element. + /// + /// The key given to insert is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. + fn flush( + &mut self, + allocations: &mut Allocations, + insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, + ) -> Result<()> { + let PrefixAndProximityBatch { batch, word1: prev_word1, word2_start: _ } = self; + let mut buffer = allocations.take_byte_vector(); + buffer.extend_from_slice(prev_word1.as_slice()); + buffer.push(0); + + for (key, mergeable_data) in batch.drain(..) { + buffer.truncate(prev_word1.len() + 1); + buffer.extend_from_slice(key.as_slice()); + let data = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; + insert(buffer.as_slice(), &data)?; + + allocations.reclaim_byte_vector(key); + allocations.reclaim_mergeable_data_vector(mergeable_data); + } + Ok(()) + } +} + +fn insert_into_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + new_key: &[u8], + new_value: &[u8], +) -> Result<()> { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; + match iter.next().transpose()? { + Some((key, old_val)) if new_key == key => { + let val = + merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) + .map_err(|_| { + // TODO just wrap this error? + crate::error::InternalError::IndexingMergingKeys { + process: "get-put-merge", + } + })?; + // safety: we don't keep references from inside the LMDB database. 
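+ // (`put_current` overwrites the value of the entry the iterator is
+ // currently positioned on, so the merged bitmap replaces the old
+ // value in place without re-seeking the key.)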
+ unsafe { iter.put_current(key, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; + } + } + Ok(()) +} + +// This is adapted from `sorter_into_lmdb_database` +pub fn writer_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + writer: grenad::Writer, +) -> Result<()> { + let file = writer.into_inner()?; + let reader = grenad::Reader::new(BufReader::new(file))?; + + let before = Instant::now(); + + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; + } + } else { + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + insert_into_database(wtxn, database, k, v)?; + } + } + + debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); + Ok(()) +} + +struct Allocations { + byte_vectors: Vec>, + mergeable_data_vectors: Vec>>, +} +impl Default for Allocations { + fn default() -> Self { + Self { + byte_vectors: Vec::with_capacity(65_536), + mergeable_data_vectors: Vec::with_capacity(4096), + } + } +} +impl Allocations { + fn take_byte_vector(&mut self) -> Vec { + self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16)) + } + fn take_mergeable_data_vector(&mut self) -> Vec> { + self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8)) + } + + fn reclaim_byte_vector(&mut self, mut data: Vec) { + data.clear(); + self.byte_vectors.push(data); + } + fn reclaim_mergeable_data_vector(&mut self, mut data: Vec>) { + data.clear(); + self.mergeable_data_vectors.push(data); + } +} + +#[derive(Default, Debug)] +struct PrefixTrieNode { + children: Vec<(PrefixTrieNode, u8)>, + is_end_node: bool, +} + +impl PrefixTrieNode { + fn is_empty(&self) -> bool { + self.children.is_empty() + } + fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { + let mut node = PrefixTrieNode::default(); + for prefix in prefixes { + node.insert_sorted_prefix(prefix.as_bytes().into_iter()); + } + node + } + fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { + if let Some(&c) = prefix.next() { + if let Some((node, byte)) = self.children.last_mut() { + if *byte == c { + node.insert_sorted_prefix(prefix); + return; + } + } + let mut new_node = PrefixTrieNode::default(); + new_node.insert_sorted_prefix(prefix); + self.children.push((new_node, c)); + } else { + self.is_end_node = true; + } + } + fn for_each_prefix_of(&self, word: &[u8], buffer: &mut Vec, mut do_fn: impl FnMut(&[u8])) { + let mut cur_node = self; + for &byte in word { + buffer.push(byte); + if let Some((child_node, _)) = cur_node.children.iter().find(|(_, c)| *c == byte) { + cur_node = child_node; + if cur_node.is_end_node { + do_fn(buffer.as_slice()); + } + } else { + break; + } + } + } + // fn print(&self, buffer: &mut String, ident: usize) { + // let mut spaces = String::new(); + // for _ in 0..ident { + // spaces.push(' ') + // } + // for (child, c) in &self.children { + // buffer.push(char::from_u32(*c as u32).unwrap()); + // println!("{spaces}{buffer}:"); + // child.print(buffer, ident + 4); + // buffer.pop(); + // } + // } +} diff --git a/milli/src/update/word_prefix_pair_proximity_docids/readme.md b/milli/src/update/word_prefix_pair_proximity_docids/readme.md new file mode 100644 index 000000000..7e467e92d --- /dev/null +++ 
b/milli/src/update/word_prefix_pair_proximity_docids/readme.md @@ -0,0 +1,144 @@ +## What is WordPrefixPairProximityDocids? +The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. + +The prefixes present in this database are only those that correspond to many different words present in the documents. + +## How is it created/updated? (simplified version) +To compute it, we have access to (mainly) two inputs: + +* a list of sorted prefixes, such as: +``` +c +ca +cat +d +do +dog +``` +Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. + +* a sorted list of word pairs and the distance between them (i.e. proximity), associated with a roaring bitmap, such as: +``` +good dog 3 -> docids1: [2, 5, 6] +good doggo 1 -> docids2: [8] +good dogma 1 -> docids3: [7, 19, 20] +good ghost 2 -> docids4: [1] +horror cathedral 4 -> docids5: [1, 2] +``` + +I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: + +1. ==Outer loop:== First, we iterate over each word pair and its proximity: +``` +word1 : good +word2 : dog +proximity: 3 +``` +2. ==Inner loop:== Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: +``` +Outer loop 1: +------------------------------ +word1 : good +word2 : dog +proximity: 3 +docids : docids1 + +prefixes: [d, do, dog] + +batch: [ + (d, 3) -> [docids1] + (do, 3) -> [docids1] + (dog, 3) -> [docids1] +] +``` +3. For illustration purpose, let's run through a second iteration of the outer loop: +``` +Outer loop 2: +------------------------------ +word1 : good +word2 : doggo +proximity: 1 +docids : docids2 + +prefixes: [d, do, dog] + +batch: [ + (d, 1) -> [docids2] + (d, 3) -> [docids1] + (do, 1) -> [docids2] + (do, 3) -> [docids1] + (dog, 1) -> [docids2] + (dog, 3) -> [docids1] +] +``` +Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. + +4. And a third: +``` +Outer loop 3: +------------------------------ +word1 : good +word2 : dogma +proximity: 1 +docids : docids3 + +prefixes: [d, do, dog] + +batch: [ + (d, 1) -> [docids2, docids3] + (d, 3) -> [docids1] + (do, 1) -> [docids2, docids3] + (do, 3) -> [docids1] + (dog, 1) -> [docids2, docids3] + (dog, 3) -> [docids1] +] +``` +Notice that there were some conflicts which were resolved by merging the conflicting values together. + +5. On the fourth iteration of the outer loop, we have: +``` +Outer loop 4: +------------------------------ +word1 : good +word2 : ghost +proximity: 2 +``` +Because `word2` begins with a different letter than the previous `word2`, we know that: +1. All the prefixes of `word2` are greater than the prefixes of the previous word2 +2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. +Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. 
Flushing the batch should also be done whenever `word1` is different from the previous `word1`. + +6. ==Flushing the batch==: to flush the batch, we look at `word1` and iterate over the elements of the batch in sorted order: +``` +Flushing Batch loop 1: +------------------------------ +word1 : good +word2 : d +proximity: 1 +docids : [docids2, docids3] +``` +We then merge the array of `docids` (of type `Vec<Cow<[u8]>>`) using `merge_cbo_roaring_bitmaps` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. +Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` into the database. + +7. That's it! ... except... + +## How is it created/updated (continued) + +I lied a little bit about the input data. In reality, we get two sets of the inputs described above, which come from different places: + +* For the list of sorted prefixes, we have: + * `new_prefixes`, which are all the prefixes that were not present in the database before the insertion of the new documents + * `common_prefixes`, which are the prefixes that are present both in the database and in the newly added documents + +* For the list of word pairs and proximities, we have: + * `new_word_pairs`, which is the list of word pairs and their proximities present in the newly added documents + * `word_pairs_db`, which is the list of word pairs from the database. **This list includes all elements in `new_word_pairs`** since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. + +To update the prefix database correctly, we call the algorithm described earlier first on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). Thus: + +1. For all the word pairs that were already present in the DB, we insert them again with the `new_prefixes`. Calling the algorithm on them with the `common_prefixes` would not result in any new data. +2. For all the new word pairs, we insert them twice: first with the `common_prefixes`, and then, because they are part of `word_pairs_db`, with the `new_prefixes`. + +Note also that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on (`new_prefixes`, `word_pairs_db`), we insert the computed ((`word`, `prefix`, `proximity`), `docids`) elements into an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements into the database.
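The control flow described above is small enough to sketch in isolation. The following is a simplified, self-contained model of the readme's algorithm, not the actual milli implementation: it swaps the grenad/LMDB iterators for slices, roaring bitmaps for plain `Vec<u32>` docid lists, and omits the `max_proximity`/`max_prefix_length` thresholds and the prefix trie; the names `word_prefix_pairs` and `flush` are invented for the example.

```rust
use std::collections::BTreeMap;

/// Stand-in for a serialized roaring bitmap.
type DocIds = Vec<u32>;

/// Sketch of the readme's algorithm. `word_pairs` must be sorted by
/// (word1, word2), exactly like the real grenad/LMDB inputs.
fn word_prefix_pairs(
    prefixes: &[&str],                        // sorted list of allowed prefixes
    word_pairs: &[(&str, &str, u8, DocIds)],  // (word1, word2, proximity, docids)
) -> BTreeMap<(String, String, u8), DocIds> {
    fn flush(
        word1: Option<&str>,
        batch: &mut BTreeMap<(String, u8), DocIds>,
        out: &mut BTreeMap<(String, String, u8), DocIds>,
    ) {
        // Everything in the batch now sorts before any future key, so it can
        // be moved to the output (the database, in the real implementation).
        if let Some(w1) = word1 {
            for ((prefix, prox), ids) in std::mem::take(batch) {
                out.insert((w1.to_owned(), prefix, prox), ids);
            }
        }
    }

    let mut out = BTreeMap::new();
    let mut batch: BTreeMap<(String, u8), DocIds> = BTreeMap::new();
    let (mut cur_word1, mut cur_start): (Option<&str>, Option<u8>) = (None, None);

    for (w1, w2, prox, ids) in word_pairs {
        // Outer loop: flush whenever word1 changes or word2 starts
        // with a new letter (steps 5 and 6 of the readme).
        if cur_word1 != Some(*w1) || cur_start != w2.bytes().next() {
            flush(cur_word1, &mut batch, &mut out);
            cur_word1 = Some(*w1);
            cur_start = w2.bytes().next();
        }
        // Inner loop: every allowed prefix of word2 accumulates the docids;
        // conflicting (prefix, proximity) keys are merged together (step 4).
        for prefix in prefixes {
            if !w2.starts_with(*prefix) {
                continue;
            }
            batch.entry((prefix.to_string(), *prox)).or_default().extend(ids.iter().copied());
        }
    }
    flush(cur_word1, &mut batch, &mut out);
    out
}
```

Running it over the readme's example pairs produces the same keys as step 6, e.g. `("good", "d", 1)` mapping to the merged docids of `doggo` and `dogma`.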
+ + From 86807ca848e1a3573cd8a7e010230d9852312c01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 13 Jul 2022 19:35:17 +0200 Subject: [PATCH 1551/1889] Refactor word prefix pair proximity indexation further --- .../word_prefix_pair_proximity_docids/mod.rs | 511 +++++++++++------- 1 file changed, 326 insertions(+), 185 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs index 119c0c53e..5b073bb95 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -5,6 +5,7 @@ use heed::BytesDecode; use log::debug; use std::borrow::Cow; +use std::cmp::Ordering; use std::collections::HashSet; use std::io::BufReader; use std::time::Instant; @@ -72,6 +73,84 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { del_prefix_fst_words: &HashSet>, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + let mut allocations = Allocations::default(); + + let mut count = 0; + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + if !prefixes.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + Self::execute_on_word_pairs_and_prefixes( + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.move_on_next()? { + let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key) + .ok_or(heed::Error::Decoding)?; + Ok(Some(((word1, word2, proximity), value))) + } else { + Ok(None) + } + }, + &prefixes, + &mut allocations, + self.max_proximity, + |key, value| { + count += 1; + insert_into_database( + &mut self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + dbg!(count); + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + if !prefixes.is_empty() { + let mut db_iter = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .remap_data_type::() + .iter(self.wtxn)?; + + let mut writer = create_writer( + self.chunk_compression_type, + self.chunk_compression_level, + tempfile::tempfile()?, + ); + + Self::execute_on_word_pairs_and_prefixes( + &mut db_iter, + |db_iter| db_iter.next().transpose().map_err(|e| e.into()), + &prefixes, + &mut allocations, + self.max_proximity, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + writer_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + writer, + )?; + } // All of the word prefix pairs in the database that have a w2 // that is contained in the `suppr_pw` set must be removed as well. @@ -89,131 +168,71 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } } - // We construct a Trie of all the prefixes that are smaller than the max prefix length - // This is an optimisation that allows us to iterate over all prefixes of a word quickly. 
- let new_prefix_fst_words = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words - .into_iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - let common_prefix_fst_words = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - let mut allocations = Allocations::default(); - let mut batch = PrefixAndProximityBatch::default(); - - if !common_prefix_fst_words.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - - while let Some((key, data)) = cursor.move_on_next()? { - let (word1, word2, proximity) = - UncheckedStrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - - if proximity <= self.max_proximity { - batch.flush_if_necessary( - word1, - word2, - &mut allocations, - &mut |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - self.insert_word_prefix_pair_proximity_docids_into_batch( - word2, - proximity, - data, - &common_prefix_fst_words, - &mut batch, - &mut allocations, - )?; - } - } - batch.flush(&mut allocations, &mut |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - })?; - } - - if !new_prefix_fst_words.is_empty() { - let mut db_iter = self - .index - .word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(self.wtxn)?; - - let mut writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); - - while let Some(((word1, word2, proximity), data)) = db_iter.next().transpose()? 
{ - if proximity <= self.max_proximity { - batch.flush_if_necessary( - word1, - word2, - &mut allocations, - &mut |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - self.insert_word_prefix_pair_proximity_docids_into_batch( - word2, - proximity, - data, - &new_prefix_fst_words, - &mut batch, - &mut allocations, - )?; - } - } - batch.flush(&mut allocations, &mut |key, value| { - writer.insert(key, value).map_err(|e| e.into()) - })?; - - drop(db_iter); - writer_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - Ok(()) } - fn insert_word_prefix_pair_proximity_docids_into_batch<'b, 'c>( - &self, - word2: &[u8], - proximity: u8, - data: &'b [u8], - prefixes: &'c PrefixTrieNode, - writer: &'b mut PrefixAndProximityBatch, + fn execute_on_word_pairs_and_prefixes( + iter: &mut Iter, + mut next_word_pair_proximity: impl for<'a> FnMut( + &'a mut Iter, + ) -> Result< + Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, + >, + prefixes: &PrefixTrieNode, allocations: &mut Allocations, + max_proximity: u8, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, ) -> Result<()> { + let mut batch = PrefixAndProximityBatch::default(); + let mut prev_word2_start = 0; + + let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + let mut empty_prefixes = false; + let mut prefix_buffer = allocations.take_byte_vector(); - prefixes.for_each_prefix_of(word2, &mut prefix_buffer, |prefix| { - let mut value = allocations.take_byte_vector(); - value.extend_from_slice(&data); - writer.insert(prefix, proximity, value, allocations); - }); - allocations.reclaim_byte_vector(prefix_buffer); + + while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { + if proximity > max_proximity { + continue; + }; + let word2_start_different_than_prev = word2[0] != prev_word2_start; + if empty_prefixes && !word2_start_different_than_prev { + continue; + } + let word1_different_than_prev = word1 != batch.word1; + if word1_different_than_prev || word2_start_different_than_prev { + batch.flush(allocations, &mut insert)?; + if word1_different_than_prev { + prefix_search_start.0 = 0; + batch.word1.clear(); + batch.word1.extend_from_slice(word1); + } + if word2_start_different_than_prev { + // word2_start_different_than_prev == true + prev_word2_start = word2[0]; + } + empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); + } + + if !empty_prefixes { + prefixes.for_each_prefix_of( + word2, + &mut prefix_buffer, + &prefix_search_start, + |prefix_buffer| { + let mut value = allocations.take_byte_vector(); + value.extend_from_slice(&data); + let prefix_len = prefix_buffer.len(); + prefix_buffer.push(0); + prefix_buffer.push(proximity); + batch.insert(&prefix_buffer, value, allocations); + prefix_buffer.truncate(prefix_len); + }, + ); + prefix_buffer.clear(); + } + } + batch.flush(allocations, &mut insert)?; Ok(()) } } @@ -224,10 +243,6 @@ The keys are sorted and conflicts are resolved by merging the vectors of bitstri It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. -A batch is valid only for a specific `word1`. Also, all prefixes stored in the batch start with the same letter. Make sure to -call [`self.flush_if_necessary`](Self::flush_if_necessary) before inserting a list of sorted `(prefix, proximity)` (and where each -`prefix` starts with the same letter) in order to uphold these invariants. 
- The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: - key : (word1, prefix, proximity) as bytes @@ -235,91 +250,95 @@ can be inserted into the database in sorted order. When it is flushed, it calls */ #[derive(Default)] struct PrefixAndProximityBatch { - batch: Vec<(Vec, Vec>)>, word1: Vec, - word2_start: u8, + batch: Vec<(Vec, Vec>)>, } impl PrefixAndProximityBatch { - fn insert( - &mut self, - new_prefix: &[u8], - new_proximity: u8, - new_value: Vec, - allocations: &mut Allocations, - ) { - let mut key = allocations.take_byte_vector(); - key.extend_from_slice(new_prefix); - key.push(0); - key.push(new_proximity); - - if let Some(position) = self.batch.iter().position(|(k, _)| k >= &key) { - let (existing_key, existing_data) = &mut self.batch[position]; - if existing_key == &key { - existing_data.push(Cow::Owned(new_value)); - } else { + fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { + // this is a macro instead of a closure because the borrow checker will complain + // about the closure moving `new_value` + macro_rules! insert_new_key_value { + () => { + let mut key = allocations.take_byte_vector(); + key.extend_from_slice(new_key); let mut mergeable_data = allocations.take_mergeable_data_vector(); mergeable_data.push(Cow::Owned(new_value)); - self.batch.insert(position, (key, mergeable_data)); + self.batch.push((key, mergeable_data)); + }; + ($idx:expr) => { + let mut key = allocations.take_byte_vector(); + key.extend_from_slice(new_key); + let mut mergeable_data = allocations.take_mergeable_data_vector(); + mergeable_data.push(Cow::Owned(new_value)); + self.batch.insert($idx, (key, mergeable_data)); + }; + } + + if self.batch.is_empty() { + insert_new_key_value!(); + } else if self.batch.len() == 1 { + let (existing_key, existing_data) = &mut self.batch[0]; + match new_key.cmp(&existing_key) { + Ordering::Less => { + insert_new_key_value!(0); + } + Ordering::Equal => { + existing_data.push(Cow::Owned(new_value)); + } + Ordering::Greater => { + insert_new_key_value!(); + } } } else { - let mut mergeable_data = allocations.take_mergeable_data_vector(); - mergeable_data.push(Cow::Owned(new_value)); - self.batch.push((key, mergeable_data)); - } - } - - /// Call [`self.flush`](Self::flush) if `word1` changed or if `word2` begins with a different letter than the - /// previous word2. Update `prev_word1` and `prev_word2_start` with the new values from `word1` and `word2`. - fn flush_if_necessary( - &mut self, - word1: &[u8], - word2: &[u8], - allocations: &mut Allocations, - insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, - ) -> Result<()> { - let word2_start = word2[0]; - if word1 != self.word1 { - self.flush(allocations, insert)?; - self.word1.clear(); - self.word1.extend_from_slice(word1); - if word2_start != self.word2_start { - self.word2_start = word2_start; + match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { + Ok(position) => { + self.batch[position].1.push(Cow::Owned(new_value)); + } + Err(position) => { + insert_new_key_value!(position); + } } } - if word2_start != self.word2_start { - self.flush(allocations, insert)?; - self.word2_start = word2_start; - } - Ok(()) } /// Empties the batch, calling `insert` on each element. 
/// - /// The key given to insert is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. + /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. fn flush( &mut self, allocations: &mut Allocations, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, ) -> Result<()> { - let PrefixAndProximityBatch { batch, word1: prev_word1, word2_start: _ } = self; + let PrefixAndProximityBatch { word1, batch } = self; + if batch.is_empty() { + return Ok(()); + } + let mut buffer = allocations.take_byte_vector(); - buffer.extend_from_slice(prev_word1.as_slice()); + buffer.extend_from_slice(word1); buffer.push(0); for (key, mergeable_data) in batch.drain(..) { - buffer.truncate(prev_word1.len() + 1); + buffer.truncate(word1.len() + 1); buffer.extend_from_slice(key.as_slice()); - let data = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; - insert(buffer.as_slice(), &data)?; - + let merged; + let data = if mergeable_data.len() > 1 { + merged = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; + &merged + } else { + &mergeable_data[0] + }; + insert(buffer.as_slice(), data)?; allocations.reclaim_byte_vector(key); allocations.reclaim_mergeable_data_vector(mergeable_data); } + Ok(()) } } +// This is adapted from `sorter_into_lmdb_database` fn insert_into_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, @@ -356,7 +375,8 @@ pub fn writer_into_lmdb_database( ) -> Result<()> { let file = writer.into_inner()?; let reader = grenad::Reader::new(BufReader::new(file))?; - + let len = reader.len(); + dbg!(len); let before = Instant::now(); if database.is_empty(wtxn)? { @@ -413,10 +433,44 @@ struct PrefixTrieNode { is_end_node: bool, } +#[derive(Debug)] +struct PrefixTrieNodeSearchStart(usize); + impl PrefixTrieNode { fn is_empty(&self) -> bool { self.children.is_empty() } + + /// Returns false if the trie does not contain a prefix of the given word. + /// Returns true if the trie *may* contain a prefix of the given word. + /// + /// Moves the search start to the first node equal to the first letter of the word, + /// or to 0 otherwise. 
+ fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { + let byte = word[0]; + if self.children[search_start.0].1 == byte { + return true; + } else if let Some(position) = + self.children[search_start.0..].iter().position(|(_, c)| *c >= byte) + { + let (_, c) = self.children[search_start.0 + position]; + // dbg!(position, c, byte); + if c == byte { + // dbg!(); + search_start.0 += position; + true + } else { + // dbg!(); + search_start.0 = 0; + false + } + } else { + // dbg!(); + search_start.0 = 0; + false + } + } + fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { let mut node = PrefixTrieNode::default(); for prefix in prefixes { @@ -439,17 +493,41 @@ impl PrefixTrieNode { self.is_end_node = true; } } - fn for_each_prefix_of(&self, word: &[u8], buffer: &mut Vec, mut do_fn: impl FnMut(&[u8])) { + fn for_each_prefix_of( + &self, + word: &[u8], + buffer: &mut Vec, + search_start: &PrefixTrieNodeSearchStart, + mut do_fn: impl FnMut(&mut Vec), + ) { + let first_byte = word[0]; let mut cur_node = self; - for &byte in word { - buffer.push(byte); - if let Some((child_node, _)) = cur_node.children.iter().find(|(_, c)| *c == byte) { + buffer.push(first_byte); + if let Some((child_node, c)) = + cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) + { + if *c == first_byte { cur_node = child_node; if cur_node.is_end_node { - do_fn(buffer.as_slice()); + do_fn(buffer); + } + for &byte in &word[1..] { + buffer.push(byte); + if let Some((child_node, c)) = + cur_node.children.iter().find(|(_, c)| *c >= byte) + { + if *c == byte { + cur_node = child_node; + if cur_node.is_end_node { + do_fn(buffer); + } + } else { + break; + } + } else { + break; + } } - } else { - break; } } } @@ -466,3 +544,66 @@ impl PrefixTrieNode { // } // } } +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_trie() { + let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", + "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", + "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", + "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", + "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", + "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", + "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i", + "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", + "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar", + "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", + "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", + "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", + "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", + "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", + "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", + "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", + ])); + // let mut buffer = String::new(); + // trie.print(&mut buffer, 0); + // buffer.clear(); + let mut search_start = PrefixTrieNodeSearchStart(0); + let mut buffer = vec![]; + + let 
is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); + println!("{search_start:?}"); + println!("is empty: {is_empty}"); + trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| { + let s = std::str::from_utf8(x).unwrap(); + println!("{s}"); + }); + buffer.clear(); + trie.for_each_prefix_of("trans".as_bytes(), &mut buffer, &search_start, |x| { + let s = std::str::from_utf8(x).unwrap(); + println!("{s}"); + }); + buffer.clear(); + + trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| { + let s = std::str::from_utf8(x).unwrap(); + println!("{s}"); + }); + buffer.clear(); + // trie.for_each_prefix_of("1", |x| { + // println!("{x}"); + // }); + // trie.for_each_prefix_of("19", |x| { + // println!("{x}"); + // }); + // trie.for_each_prefix_of("21", |x| { + // println!("{x}"); + // }); + // let mut buffer = vec![]; + // trie.for_each_prefix_of("integ", &mut buffer, |x| { + // println!("{x}"); + // }); + } +} From d3501141596657fbc74b7f38024d7727d6501519 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 14 Jul 2022 11:25:10 +0200 Subject: [PATCH 1552/1889] Add tests for WordPrefixPairProximityDocIds --- .../word_prefix_pair_proximity_docids/mod.rs | 373 ++++++++++++------ 1 file changed, 242 insertions(+), 131 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs index 5b073bb95..a5ece8005 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -88,7 +88,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { if !prefixes.is_empty() { let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - Self::execute_on_word_pairs_and_prefixes( + execute_on_word_pairs_and_prefixes( &mut cursor, |cursor| { if let Some((key, value)) = cursor.move_on_next()? { @@ -113,7 +113,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { }, )?; } - dbg!(count); let prefixes = PrefixTrieNode::from_sorted_prefixes( new_prefix_fst_words @@ -136,7 +135,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { tempfile::tempfile()?, ); - Self::execute_on_word_pairs_and_prefixes( + execute_on_word_pairs_and_prefixes( &mut db_iter, |db_iter| db_iter.next().transpose().map_err(|e| e.into()), &prefixes, @@ -145,7 +144,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { |key, value| writer.insert(key, value).map_err(|e| e.into()), )?; drop(db_iter); - writer_into_lmdb_database( + writer_of_new_elements_into_lmdb_database( self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), writer, @@ -170,73 +169,71 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { Ok(()) } - - fn execute_on_word_pairs_and_prefixes( - iter: &mut Iter, - mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut Iter, - ) -> Result< - Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, - >, - prefixes: &PrefixTrieNode, - allocations: &mut Allocations, - max_proximity: u8, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, - ) -> Result<()> { - let mut batch = PrefixAndProximityBatch::default(); - let mut prev_word2_start = 0; - - let mut prefix_search_start = PrefixTrieNodeSearchStart(0); - let mut empty_prefixes = false; - - let mut prefix_buffer = allocations.take_byte_vector(); - - while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
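The extraction of `execute_on_word_pairs_and_prefixes` into a free function in this patch preserves its most unusual feature: grenad cursors hand out borrows that only live until the next `move_on_next` call, so the function cannot require `Iterator`. Instead it takes the source plus a higher-ranked closure that advances it. A minimal sketch of that shape under stated assumptions (the `Cursor` type and `for_each_item` below are invented for the example; an in-memory cursor stands in for grenad):

```rust
// The `for<'a>` bound lets each returned borrow live exactly as long as the
// corresponding `&mut Iter` loan, mimicking a lending iterator.
fn for_each_item<Iter>(
    iter: &mut Iter,
    mut next: impl for<'a> FnMut(&'a mut Iter) -> Option<&'a [u8]>,
    mut do_fn: impl FnMut(&[u8]),
) {
    while let Some(item) = next(iter) {
        do_fn(item);
    }
}

struct Cursor {
    data: Vec<Vec<u8>>,
    pos: usize,
}

fn main() {
    let mut cursor = Cursor { data: vec![b"good".to_vec(), b"dog".to_vec()], pos: 0 };
    let mut seen = Vec::new();
    for_each_item(
        &mut cursor,
        |c| {
            let item = c.data.get(c.pos)?;
            c.pos += 1;
            Some(item.as_slice())
        },
        |bytes| seen.push(bytes.to_vec()),
    );
    assert_eq!(seen, [b"good".to_vec(), b"dog".to_vec()]);
}
```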
{ - if proximity > max_proximity { - continue; - }; - let word2_start_different_than_prev = word2[0] != prev_word2_start; - if empty_prefixes && !word2_start_different_than_prev { - continue; - } - let word1_different_than_prev = word1 != batch.word1; - if word1_different_than_prev || word2_start_different_than_prev { - batch.flush(allocations, &mut insert)?; - if word1_different_than_prev { - prefix_search_start.0 = 0; - batch.word1.clear(); - batch.word1.extend_from_slice(word1); - } - if word2_start_different_than_prev { - // word2_start_different_than_prev == true - prev_word2_start = word2[0]; - } - empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); - } - - if !empty_prefixes { - prefixes.for_each_prefix_of( - word2, - &mut prefix_buffer, - &prefix_search_start, - |prefix_buffer| { - let mut value = allocations.take_byte_vector(); - value.extend_from_slice(&data); - let prefix_len = prefix_buffer.len(); - prefix_buffer.push(0); - prefix_buffer.push(proximity); - batch.insert(&prefix_buffer, value, allocations); - prefix_buffer.truncate(prefix_len); - }, - ); - prefix_buffer.clear(); - } - } - batch.flush(allocations, &mut insert)?; - Ok(()) - } } +fn execute_on_word_pairs_and_prefixes( + iter: &mut Iter, + mut next_word_pair_proximity: impl for<'a> FnMut( + &'a mut Iter, + ) -> Result< + Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, + >, + prefixes: &PrefixTrieNode, + allocations: &mut Allocations, + max_proximity: u8, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch = PrefixAndProximityBatch::default(); + let mut prev_word2_start = 0; + let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + let mut empty_prefixes = false; + + let mut prefix_buffer = allocations.take_byte_vector(); + + while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { + if proximity > max_proximity { + continue; + }; + let word2_start_different_than_prev = word2[0] != prev_word2_start; + if empty_prefixes && !word2_start_different_than_prev { + continue; + } + let word1_different_than_prev = word1 != batch.word1; + if word1_different_than_prev || word2_start_different_than_prev { + batch.flush(allocations, &mut insert)?; + if word1_different_than_prev { + prefix_search_start.0 = 0; + batch.word1.clear(); + batch.word1.extend_from_slice(word1); + } + if word2_start_different_than_prev { + // word2_start_different_than_prev == true + prev_word2_start = word2[0]; + } + empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); + } + + if !empty_prefixes { + prefixes.for_each_prefix_of( + word2, + &mut prefix_buffer, + &prefix_search_start, + |prefix_buffer| { + let mut value = allocations.take_byte_vector(); + value.extend_from_slice(&data); + let prefix_len = prefix_buffer.len(); + prefix_buffer.push(0); + prefix_buffer.push(proximity); + batch.insert(&prefix_buffer, value, allocations); + prefix_buffer.truncate(prefix_len); + }, + ); + prefix_buffer.clear(); + } + } + batch.flush(allocations, &mut insert)?; + Ok(()) +} /** A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. 
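The doc comment above pins down when the batch may be flushed, and the underlying ordering argument is easy to sanity-check concretely: once the first byte of `word2` advances, every `(prefix, proximity)` key derived from the new word compares greater than anything still in the batch. A tiny standalone check of that claim, using the same null-separated key layout as the surrounding code (the concrete prefixes and proximities are purely illustrative):

```rust
// Keys are prefix ++ 0 ++ proximity, compared lexicographically as bytes.
fn key(prefix: &str, proximity: u8) -> Vec<u8> {
    let mut k = prefix.as_bytes().to_vec();
    k.push(0);
    k.push(proximity);
    k
}

fn main() {
    // batch contents accumulated while word2 was "dog", "doggo", "dogma"
    let batch = [key("d", 3), key("do", 3), key("dog", 3), key("d", 1)];
    // first key derived from the next word2, "ghost"
    let next = key("g", 2);
    // every batched key sorts before it, so the batch can be flushed now
    assert!(batch.iter().all(|k| *k < next));
}
```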
@@ -275,30 +272,32 @@ impl PrefixAndProximityBatch { }; } - if self.batch.is_empty() { - insert_new_key_value!(); - } else if self.batch.len() == 1 { - let (existing_key, existing_data) = &mut self.batch[0]; - match new_key.cmp(&existing_key) { - Ordering::Less => { - insert_new_key_value!(0); - } - Ordering::Equal => { - existing_data.push(Cow::Owned(new_value)); - } - Ordering::Greater => { - insert_new_key_value!(); + match self.batch.len() { + 0 => { + insert_new_key_value!(); + } + 1 => { + let (existing_key, existing_data) = &mut self.batch[0]; + match new_key.cmp(&existing_key) { + Ordering::Less => { + insert_new_key_value!(0); + } + Ordering::Equal => { + existing_data.push(Cow::Owned(new_value)); + } + Ordering::Greater => { + insert_new_key_value!(); + } } } - } else { - match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { + _ => match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { Ok(position) => { self.batch[position].1.push(Cow::Owned(new_value)); } Err(position) => { insert_new_key_value!(position); } - } + }, } } @@ -368,17 +367,13 @@ fn insert_into_database( } // This is adapted from `sorter_into_lmdb_database` -pub fn writer_into_lmdb_database( +pub fn writer_of_new_elements_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, writer: grenad::Writer, ) -> Result<()> { let file = writer.into_inner()?; let reader = grenad::Reader::new(BufReader::new(file))?; - let len = reader.len(); - dbg!(len); - let before = Instant::now(); - if database.is_empty(wtxn)? { let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; let mut cursor = reader.into_cursor()?; @@ -389,11 +384,9 @@ pub fn writer_into_lmdb_database( } else { let mut cursor = reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? 
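The `insert` rewrite above still special-cases batches of length 0 and 1; a later patch in this series collapses it to the binary search alone, which already covers every case. A minimal self-contained sketch of that final shape, keeping the batch as a Vec sorted by key and resolving conflicts by accumulating values for a later merge:

```rust
// Insert-or-accumulate into a Vec kept sorted by key.
fn insert(batch: &mut Vec<(Vec<u8>, Vec<Vec<u8>>)>, new_key: &[u8], new_value: Vec<u8>) {
    match batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
        // key already present: queue the value for merging at flush time
        Ok(pos) => batch[pos].1.push(new_value),
        // key absent: insert at the position that keeps the Vec sorted
        Err(pos) => batch.insert(pos, (new_key.to_vec(), vec![new_value])),
    }
}

fn main() {
    let mut batch = Vec::new();
    insert(&mut batch, b"dog\0\x03", b"docids1".to_vec());
    insert(&mut batch, b"d\0\x01", b"docids2".to_vec());
    insert(&mut batch, b"dog\0\x03", b"docids3".to_vec());
    let keys: Vec<&[u8]> = batch.iter().map(|(k, _)| k.as_slice()).collect();
    assert_eq!(keys, [&b"d\0\x01"[..], &b"dog\0\x03"[..]]);
    assert_eq!(batch[1].1.len(), 2); // conflicting values accumulated for merging
}
```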
{ - insert_into_database(wtxn, database, k, v)?; + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; } } - - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } @@ -454,18 +447,14 @@ impl PrefixTrieNode { self.children[search_start.0..].iter().position(|(_, c)| *c >= byte) { let (_, c) = self.children[search_start.0 + position]; - // dbg!(position, c, byte); if c == byte { - // dbg!(); search_start.0 += position; true } else { - // dbg!(); search_start.0 = 0; false } } else { - // dbg!(); search_start.0 = 0; false } @@ -546,7 +535,26 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { + use roaring::RoaringBitmap; + + use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; + use super::*; + + fn check_prefixes( + trie: &PrefixTrieNode, + search_start: &PrefixTrieNodeSearchStart, + word: &str, + expected_prefixes: &[&str], + ) { + let mut actual_prefixes = vec![]; + trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| { + let s = String::from_utf8(x.to_owned()).unwrap(); + actual_prefixes.push(s); + }); + assert_eq!(actual_prefixes, expected_prefixes); + } + #[test] fn test_trie() { let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ @@ -567,43 +575,146 @@ mod tests { "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", ])); - // let mut buffer = String::new(); - // trie.print(&mut buffer, 0); - // buffer.clear(); + let mut search_start = PrefixTrieNodeSearchStart(0); - let mut buffer = vec![]; let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); - println!("{search_start:?}"); - println!("is empty: {is_empty}"); - trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| { - let s = std::str::from_utf8(x).unwrap(); - println!("{s}"); - }); - buffer.clear(); - trie.for_each_prefix_of("trans".as_bytes(), &mut buffer, &search_start, |x| { - let s = std::str::from_utf8(x).unwrap(); - println!("{s}"); - }); - buffer.clear(); + assert!(!is_empty); + assert_eq!(search_start.0, 2); - trie.for_each_prefix_of("affair".as_bytes(), &mut buffer, &search_start, |x| { - let s = std::str::from_utf8(x).unwrap(); - println!("{s}"); - }); - buffer.clear(); - // trie.for_each_prefix_of("1", |x| { - // println!("{x}"); - // }); - // trie.for_each_prefix_of("19", |x| { - // println!("{x}"); - // }); - // trie.for_each_prefix_of("21", |x| { - // println!("{x}"); - // }); - // let mut buffer = vec![]; - // trie.for_each_prefix_of("integ", &mut buffer, |x| { - // println!("{x}"); - // }); + check_prefixes(&trie, &search_start, "affair", &["a"]); + check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]); + + let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start); + assert!(!is_empty); + assert_eq!(trie.children[search_start.0].1, b'u'); + + check_prefixes(&trie, &search_start, "unique", &["u", "un"]); + + // NOTE: this should fail, because the search start is already beyong 'a' + let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start); + assert!(!is_empty); + // search start is reset + assert_eq!(search_start.0, 0); + + let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "arb", "arbre", "cat", "catto", + ])); + check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]); + check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]); + } + + #[test] + fn test_execute_on_word_pairs_and_prefixes() { + let prefixes = 
PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "arb", "arbre", "cat", "catto", + ])); + + let mut serialised_bitmap123 = vec![]; + let mut bitmap123 = RoaringBitmap::new(); + bitmap123.insert(1); + bitmap123.insert(2); + bitmap123.insert(3); + CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123); + + let mut serialised_bitmap456 = vec![]; + let mut bitmap456 = RoaringBitmap::new(); + bitmap456.insert(4); + bitmap456.insert(5); + bitmap456.insert(6); + CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456); + + let mut serialised_bitmap789 = vec![]; + let mut bitmap789 = RoaringBitmap::new(); + bitmap789.insert(7); + bitmap789.insert(8); + bitmap789.insert(9); + CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789); + + let mut serialised_bitmap_ranges = vec![]; + let mut bitmap_ranges = RoaringBitmap::new(); + bitmap_ranges.insert_range(63_000..65_000); + bitmap_ranges.insert_range(123_000..128_000); + CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); + + let word_pairs = [ + // 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) + (("healthy", "arbre", 2), &serialised_bitmap123), + // not inserted because 3 > max_proximity + (("healthy", "arbre", 3), &serialised_bitmap456), + // 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123) + (("healthy", "arbres", 1), &serialised_bitmap123), + // 1, 3: + (("healthy", "arbres", 2), &serialised_bitmap456), + // not be inserted because 3 > max_proximity + (("healthy", "arbres", 3), &serialised_bitmap789), + // not inserted because no prefixes for boat + (("healthy", "boat", 1), &serialised_bitmap123), + // not inserted because no prefixes for ca + (("healthy", "ca", 1), &serialised_bitmap123), + // 4: (healthy cat 1) with (bitmap456 + bitmap123) + (("healthy", "cats", 1), &serialised_bitmap456), + // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) + (("healthy", "cats", 2), &serialised_bitmap789), + // 4 + 6: (healthy catto 1) with (bitmap123) + (("healthy", "cattos", 1), &serialised_bitmap123), + // 5 + 7: (healthy catto 2) with (bitmap_ranges) + (("healthy", "cattos", 2), &serialised_bitmap_ranges), + // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) + (("jittery", "cat", 1), &serialised_bitmap123), + // 8: + (("jittery", "cata", 1), &serialised_bitmap456), + // 8: + (("jittery", "catb", 1), &serialised_bitmap789), + // 8: + (("jittery", "catc", 1), &serialised_bitmap_ranges), + ]; + + let expected_result = [ + // first batch: + (("healthy", "arb", 1), bitmap123.clone()), + (("healthy", "arb", 2), &bitmap123 | &bitmap456), + (("healthy", "arbre", 1), bitmap123.clone()), + (("healthy", "arbre", 2), &bitmap123 | &bitmap456), + // second batch: + (("healthy", "cat", 1), &bitmap456 | &bitmap123), + (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), + (("healthy", "catto", 1), bitmap123.clone()), + (("healthy", "catto", 2), bitmap_ranges.clone()), + // third batch + (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + ]; + + let mut result = vec![]; + + let mut allocations = Allocations::default(); + let mut iter = + IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { + ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) + }); + execute_on_word_pairs_and_prefixes( + &mut iter, + |iter| Ok(iter.next()), + &prefixes, + &mut allocations, + 2, + |k, v| { + let (word1, prefix, proximity) = 
StrStrU8Codec::bytes_decode(k).unwrap(); + let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); + result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); + Ok(()) + }, + ) + .unwrap(); + + for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { + let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; + let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; + + assert_eq!(actual_word1, expected_word1); + assert_eq!(actual_prefix, expected_prefix); + assert_eq!(actual_proximity, expected_proximity); + assert_eq!(actual_bitmap, expected_bitmap); + } } } From 044356d22165977be4ecaaffc8ee4027ade7026c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 14 Jul 2022 11:53:21 +0200 Subject: [PATCH 1553/1889] Optimise WordPrefixPairProximityDocIds merge operation --- .../word_prefix_pair_proximity_docids/mod.rs | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs index a5ece8005..ad498b5da 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -8,12 +8,11 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::collections::HashSet; use std::io::BufReader; -use std::time::Instant; use crate::update::index_documents::{ create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, }; -use crate::{Index, Result, UncheckedStrStrU8Codec}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -189,6 +188,7 @@ fn execute_on_word_pairs_and_prefixes( let mut empty_prefixes = false; let mut prefix_buffer = allocations.take_byte_vector(); + let mut merge_buffer = allocations.take_byte_vector(); while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { if proximity > max_proximity { @@ -200,7 +200,7 @@ fn execute_on_word_pairs_and_prefixes( } let word1_different_than_prev = word1 != batch.word1; if word1_different_than_prev || word2_start_different_than_prev { - batch.flush(allocations, &mut insert)?; + batch.flush(allocations, &mut merge_buffer, &mut insert)?; if word1_different_than_prev { prefix_search_start.0 = 0; batch.word1.clear(); @@ -231,7 +231,7 @@ fn execute_on_word_pairs_and_prefixes( prefix_buffer.clear(); } } - batch.flush(allocations, &mut insert)?; + batch.flush(allocations, &mut merge_buffer, &mut insert)?; Ok(()) } /** @@ -307,12 +307,14 @@ impl PrefixAndProximityBatch { fn flush( &mut self, allocations: &mut Allocations, + merge_buffer: &mut Vec, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, ) -> Result<()> { let PrefixAndProximityBatch { word1, batch } = self; if batch.is_empty() { return Ok(()); } + merge_buffer.clear(); let mut buffer = allocations.take_byte_vector(); buffer.extend_from_slice(word1); @@ -321,14 +323,15 @@ impl PrefixAndProximityBatch { for (key, mergeable_data) in batch.drain(..) 
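The point of threading `merge_buffer` through `flush` in the patch above is to hoist the output allocation out of the per-key loop. A self-contained sketch of the same idea, with a plain `roaring`-based `merge_into` standing in for `CboRoaringBitmapCodec::merge_into` (whose exact signature is milli-internal, so this is an assumption-laden stand-in):

```rust
use roaring::RoaringBitmap;

// Stand-in merge: union all serialized bitmaps into the caller's buffer.
fn merge_into(values: &[Vec<u8>], out: &mut Vec<u8>) -> std::io::Result<()> {
    let mut union = RoaringBitmap::new();
    for v in values {
        union |= RoaringBitmap::deserialize_from(&v[..])?;
    }
    union.serialize_into(out)
}

fn main() -> std::io::Result<()> {
    let ser = |ids: &[u32]| -> std::io::Result<Vec<u8>> {
        let mut buf = Vec::new();
        ids.iter().copied().collect::<RoaringBitmap>().serialize_into(&mut buf)?;
        Ok(buf)
    };
    let per_key_values = [vec![ser(&[1, 2])?, ser(&[2, 3])?], vec![ser(&[7])?, ser(&[8])?]];

    let mut merge_buffer = Vec::new(); // allocated once for the whole flush
    for mergeable_data in &per_key_values {
        merge_buffer.clear(); // keeps the capacity from previous iterations
        merge_into(mergeable_data, &mut merge_buffer)?;
        let merged = RoaringBitmap::deserialize_from(&merge_buffer[..])?;
        assert!(merged.len() >= 2);
    }
    Ok(())
}
```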
{ buffer.truncate(word1.len() + 1); buffer.extend_from_slice(key.as_slice()); - let merged; + let data = if mergeable_data.len() > 1 { - merged = merge_cbo_roaring_bitmaps(&buffer, &mergeable_data)?; - &merged + CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; + merge_buffer.as_slice() } else { &mergeable_data[0] }; insert(buffer.as_slice(), data)?; + merge_buffer.clear(); allocations.reclaim_byte_vector(key); allocations.reclaim_mergeable_data_vector(mergeable_data); } @@ -443,20 +446,17 @@ impl PrefixTrieNode { let byte = word[0]; if self.children[search_start.0].1 == byte { return true; - } else if let Some(position) = - self.children[search_start.0..].iter().position(|(_, c)| *c >= byte) - { - let (_, c) = self.children[search_start.0 + position]; - if c == byte { - search_start.0 += position; - true - } else { - search_start.0 = 0; - false - } } else { - search_start.0 = 0; - false + match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { + Ok(position) => { + search_start.0 += position; + true + } + Err(_) => { + search_start.0 = 0; + false + } + } } } From 220921628b47c0f9e80db80896f640db4550fe08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 15:34:40 +0200 Subject: [PATCH 1554/1889] Simplify and document WordPrefixPairProximityDocIds::execute --- .../word_prefix_pair_proximity_docids/mod.rs | 103 ++++++++---------- .../readme.md | 8 +- 2 files changed, 50 insertions(+), 61 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs index ad498b5da..6345dd210 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs @@ -1,18 +1,14 @@ -use grenad::CompressionType; -use heed::types::ByteSlice; - -use heed::BytesDecode; -use log::debug; - -use std::borrow::Cow; -use std::cmp::Ordering; -use std::collections::HashSet; -use std::io::BufReader; - use crate::update::index_documents::{ create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, }; use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -72,10 +68,11 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { del_prefix_fst_words: &HashSet>, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + // This is an optimisation, to reuse allocations between loop iterations let mut allocations = Allocations::default(); - let mut count = 0; - + // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length let prefixes = PrefixTrieNode::from_sorted_prefixes( common_prefix_fst_words .into_iter() @@ -85,9 +82,14 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { .filter(|s| s.len() <= self.max_prefix_length), ); + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (word1, common_prefix, proximity) elements + // to insert in the DB if !prefixes.is_empty() { let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + // This is the core of the algorithm execute_on_word_pairs_and_prefixes( + // the first two arguments tell how to iterate over the new word 
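The `set_search_start` change above swaps the linear `position` scan for `binary_search_by_key`, which is valid because a node's children are kept sorted by byte. A minimal standalone sketch of the optimised lookup, with children reduced to `(subtree-name, byte)` pairs for brevity:

```rust
// Same resume/reset contract as before, but the probe is O(log n).
fn set_search_start(children: &[(&str, u8)], byte: u8, start: &mut usize) -> bool {
    if children.get(*start).map(|x| x.1) == Some(byte) {
        return true;
    }
    match children[*start..].binary_search_by_key(&byte, |x| x.1) {
        Ok(position) => {
            *start += position;
            true
        }
        Err(_) => {
            *start = 0;
            false
        }
    }
}

fn main() {
    // (subtree, first byte) pairs, sorted by byte, as in PrefixTrieNode
    let children = [("a-subtree", b'a'), ("c-subtree", b'c'), ("u-subtree", b'u')];
    let mut start = 0;
    assert!(set_search_start(&children, b'c', &mut start));
    assert_eq!(start, 1);
    assert!(!set_search_start(&children, b'b', &mut start)); // 'b' is behind the start
    assert_eq!(start, 0);
}
```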
pairs &mut cursor, |cursor| { if let Some((key, value)) = cursor.move_on_next()? { @@ -101,8 +103,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &prefixes, &mut allocations, self.max_proximity, + // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) |key, value| { - count += 1; insert_into_database( &mut self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), @@ -113,6 +115,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { )?; } + // Now we do the same thing with the new prefixes and all word pairs in the DB + let prefixes = PrefixTrieNode::from_sorted_prefixes( new_prefix_fst_words .into_iter() @@ -128,6 +132,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { .remap_data_type::() .iter(self.wtxn)?; + // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) + // element in an intermediary grenad let mut writer = create_writer( self.chunk_compression_type, self.chunk_compression_level, @@ -143,7 +149,12 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { |key, value| writer.insert(key, value).map_err(|e| e.into()), )?; drop(db_iter); - writer_of_new_elements_into_lmdb_database( + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), writer, @@ -169,6 +180,15 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { Ok(()) } } + +/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// +/// Its main arguments are: +/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements +/// 2. a prefix trie +/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// +/// For more information about the fn execute_on_word_pairs_and_prefixes( iter: &mut Iter, mut next_word_pair_proximity: impl for<'a> FnMut( @@ -252,52 +272,19 @@ struct PrefixAndProximityBatch { } impl PrefixAndProximityBatch { + /// Insert the new key and value into the batch fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { - // this is a macro instead of a closure because the borrow checker will complain - // about the closure moving `new_value` - macro_rules! 
insert_new_key_value { - () => { + match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { + Ok(position) => { + self.batch[position].1.push(Cow::Owned(new_value)); + } + Err(position) => { let mut key = allocations.take_byte_vector(); key.extend_from_slice(new_key); let mut mergeable_data = allocations.take_mergeable_data_vector(); mergeable_data.push(Cow::Owned(new_value)); - self.batch.push((key, mergeable_data)); - }; - ($idx:expr) => { - let mut key = allocations.take_byte_vector(); - key.extend_from_slice(new_key); - let mut mergeable_data = allocations.take_mergeable_data_vector(); - mergeable_data.push(Cow::Owned(new_value)); - self.batch.insert($idx, (key, mergeable_data)); - }; - } - - match self.batch.len() { - 0 => { - insert_new_key_value!(); + self.batch.insert(position, (key, mergeable_data)); } - 1 => { - let (existing_key, existing_data) = &mut self.batch[0]; - match new_key.cmp(&existing_key) { - Ordering::Less => { - insert_new_key_value!(0); - } - Ordering::Equal => { - existing_data.push(Cow::Owned(new_value)); - } - Ordering::Greater => { - insert_new_key_value!(); - } - } - } - _ => match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { - Ok(position) => { - self.batch[position].1.push(Cow::Owned(new_value)); - } - Err(position) => { - insert_new_key_value!(position); - } - }, } } @@ -369,8 +356,10 @@ fn insert_into_database( Ok(()) } -// This is adapted from `sorter_into_lmdb_database` -pub fn writer_of_new_elements_into_lmdb_database( +// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, +// but it uses `append` if the database is empty, and it assumes that the values in the +// writer don't conflict with values in the database. +pub fn write_into_lmdb_database_without_merging( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, writer: grenad::Writer, diff --git a/milli/src/update/word_prefix_pair_proximity_docids/readme.md b/milli/src/update/word_prefix_pair_proximity_docids/readme.md index 7e467e92d..0718fd79c 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids/readme.md +++ b/milli/src/update/word_prefix_pair_proximity_docids/readme.md @@ -1,7 +1,7 @@ ## What is WordPrefixPairProximityDocids? The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. -The prefixes present in this database are only those that correspond to many different words present in the documents. +The prefixes present in this database are only those that correspond to many different words in the documents. ## How is it created/updated? (simplified version) To compute it, we have access to (mainly) two inputs: @@ -28,13 +28,13 @@ horror cathedral 4 -> docids5: [1, 2] I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: -1. ==Outer loop:== First, we iterate over each word pair and its proximity: +1. **Outer loop:** First, we iterate over each word pair and its proximity: ``` word1 : good word2 : dog proximity: 3 ``` -2. ==Inner loop:== Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: +2. 
**Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: ``` Outer loop 1: ------------------------------ @@ -108,7 +108,7 @@ Because `word2` begins with a different letter than the previous `word2`, we kno 2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`. -6. ==Flushing the batch==: to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: +6. **Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: ``` Flushing Batch loop 1: ------------------------------ From ea4a96761c37c5b033e6077822dd5c683374bc3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 15:39:39 +0200 Subject: [PATCH 1555/1889] Move content of readme for WordPrefixPairProximityDocids into the code --- .../word_prefix_pair_proximity_docids.rs | 871 +++++++++++++++--- .../word_prefix_pair_proximity_docids/mod.rs | 709 -------------- .../readme.md | 144 --- 3 files changed, 739 insertions(+), 985 deletions(-) delete mode 100644 milli/src/update/word_prefix_pair_proximity_docids/mod.rs delete mode 100644 milli/src/update/word_prefix_pair_proximity_docids/readme.md diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 574b49e97..4a3a7d13e 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,16 +1,161 @@ -use std::collections::{HashMap, HashSet}; +/*! + ## What is WordPrefixPairProximityDocids? +The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. +The prefixes present in this database are only those that correspond to many different words in the documents. + +## How is it created/updated? (simplified version) +To compute it, we have access to (mainly) two inputs: + +* a list of sorted prefixes, such as: +``` +c +ca +cat +d +do +dog +``` +Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. + +* a sorted list of word pairs and the distance between them (i.e. proximity), associated with a roaring bitmap, such as: +``` +good dog 3 -> docids1: [2, 5, 6] +good doggo 1 -> docids2: [8] +good dogma 1 -> docids3: [7, 19, 20] +good ghost 2 -> docids4: [1] +horror cathedral 4 -> docids5: [1, 2] +``` + +I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: + +1. **Outer loop:** First, we iterate over each word pair and its proximity: +``` +word1 : good +word2 : dog +proximity: 3 +``` +2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. 
And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: +``` +Outer loop 1: +------------------------------ +word1 : good +word2 : dog +proximity: 3 +docids : docids1 + +prefixes: [d, do, dog] + +batch: [ + (d, 3) -> [docids1] + (do, 3) -> [docids1] + (dog, 3) -> [docids1] +] +``` +3. For illustration purpose, let's run through a second iteration of the outer loop: +``` +Outer loop 2: +------------------------------ +word1 : good +word2 : doggo +proximity: 1 +docids : docids2 + +prefixes: [d, do, dog] + +batch: [ + (d, 1) -> [docids2] + (d, 3) -> [docids1] + (do, 1) -> [docids2] + (do, 3) -> [docids1] + (dog, 1) -> [docids2] + (dog, 3) -> [docids1] +] +``` +Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. + +4. And a third: +``` +Outer loop 3: +------------------------------ +word1 : good +word2 : dogma +proximity: 1 +docids : docids3 + +prefixes: [d, do, dog] + +batch: [ + (d, 1) -> [docids2, docids3] + (d, 3) -> [docids1] + (do, 1) -> [docids2, docids3] + (do, 3) -> [docids1] + (dog, 1) -> [docids2, docids3] + (dog, 3) -> [docids1] +] +``` +Notice that there were some conflicts which were resolved by merging the conflicting values together. + +5. On the fourth iteration of the outer loop, we have: +``` +Outer loop 4: +------------------------------ +word1 : good +word2 : ghost +proximity: 2 +``` +Because `word2` begins with a different letter than the previous `word2`, we know that: +1. All the prefixes of `word2` are greater than the prefixes of the previous word2 +2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. +Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`. + +6. **Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: +``` +Flushing Batch loop 1: +------------------------------ +word1 : good +word2 : d +proximity: 1 +docids : [docids2, docids3] +``` +We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. +Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` into the database. + +7. That's it! ... except... + +## How is it created/updated (continued) + +I lied a little bit about the input data. In reality, we get two sets of the inputs described above, which come from different places: + +* For the list of sorted prefixes, we have: + * `new_prefixes`, which are all the prefixes that were not present in the database before the insertion of the new documents + * `common_prefixes` which are the prefixes that are present both in the database and in the newly added documents + +* For the list of word pairs and proximities, we have: + * `new_word_pairs`, which is the list of word pairs and their proximities present in the newly added documents + * `word_pairs_db`, which is the list of word pairs from the database. 
**This list includes all elements in `new_word_pairs`** since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. + +To update the prefix database correctly, we call the algorithm described earlier first on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). Thus: + +1. For all the word pairs that were already present in the DB, we insert them again with the `new_prefixes`. Calling the algorithm on them with the `common_prefixes` would not result in any new data. +3. For all the new word pairs, we insert them twice: first with the `common_prefixes`, and then, because they are part of `word_pairs_db`, with the `new_prefixes`. + +Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on (`new_prefixes`, `word_pairs_db`), we insert the computed ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. + + + +*/ +use crate::update::index_documents::{ + create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, +}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; use grenad::CompressionType; use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; -use slice_group_by::GroupBy; - -use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, - CursorClonableMmap, MergeFn, -}; -use crate::{Index, Result, StrStrU8Codec}; +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -62,94 +207,104 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute( - self, + pub fn execute<'a>( + mut self, new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], + new_prefix_fst_words: &'a [String], + common_prefix_fst_words: &[&'a [String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - let new_prefix_fst_words: Vec<_> = - new_prefix_fst_words.linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect(); + // This is an optimisation, to reuse allocations between loop iterations + let mut allocations = Allocations::default(); - let mut new_wppd_iter = new_word_pair_proximity_docids.into_cursor()?; - let mut word_prefix_pair_proximity_docids_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - self.chunk_compression_type, - self.chunk_compression_level, - self.max_nb_chunks, - self.max_memory, + // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length + let prefixes = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), ); - if !common_prefix_fst_words.is_empty() { - // We compute the prefix docids associated with the common prefixes between - // the old and new word prefix fst. 
- let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some((key, data)) = new_wppd_iter.move_on_next()? { - let (w1, w2, prox) = - StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - if prox > self.max_proximity { - continue; - } - - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - common_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, - )?; - } - - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (word1, common_prefix, proximity) elements + // to insert in the DB + if !prefixes.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + // the first two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.move_on_next()? { + let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key) + .ok_or(heed::Error::Decoding)?; + Ok(Some(((word1, word2, proximity), value))) + } else { + Ok(None) + } + }, + &prefixes, + &mut allocations, + self.max_proximity, + // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) + |key, value| { + insert_into_database( + &mut self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, )?; } - if !new_prefix_fst_words.is_empty() { - // We compute the prefix docids associated with the newly added prefixes - // in the new word prefix fst. + // Now we do the same thing with the new prefixes and all word pairs in the DB + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= self.max_prefix_length), + ); + + if !prefixes.is_empty() { let mut db_iter = self .index .word_pair_proximity_docids + .remap_key_type::() .remap_data_type::() .iter(self.wtxn)?; - let mut buffer = Vec::new(); - let mut current_prefixes: Option<&&[String]> = None; - let mut prefixes_cache = HashMap::new(); - while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? 
{ - if prox > self.max_proximity { - continue; - } + // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) + // element in an intermediary grenad + let mut writer = create_writer( + self.chunk_compression_type, + self.chunk_compression_level, + tempfile::tempfile()?, + ); - insert_current_prefix_data_in_sorter( - &mut buffer, - &mut current_prefixes, - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, - &new_prefix_fst_words, - self.max_prefix_length, - w1, - w2, - prox, - data, - )?; - } + execute_on_word_pairs_and_prefixes( + &mut db_iter, + |db_iter| db_iter.next().transpose().map_err(|e| e.into()), + &prefixes, + &mut allocations, + self.max_proximity, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); - write_prefixes_in_sorter( - &mut prefixes_cache, - &mut word_prefix_pair_proximity_docids_sorter, + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + self.wtxn, + *self.index.word_prefix_pair_proximity_docids.as_polymorph(), + writer, )?; } @@ -169,84 +324,359 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } } - // We finally write and merge the new word prefix pair proximity docids - // in the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - word_prefix_pair_proximity_docids_sorter, - merge_cbo_roaring_bitmaps, - )?; + Ok(()) + } +} + +/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// +/// Its main arguments are: +/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements +/// 2. a prefix trie +/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// +/// For more information about what this function does, read the module documentation. +fn execute_on_word_pairs_and_prefixes( + iter: &mut Iter, + mut next_word_pair_proximity: impl for<'a> FnMut( + &'a mut Iter, + ) -> Result< + Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, + >, + prefixes: &PrefixTrieNode, + allocations: &mut Allocations, + max_proximity: u8, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch = PrefixAndProximityBatch::default(); + let mut prev_word2_start = 0; + + let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + let mut empty_prefixes = false; + + let mut prefix_buffer = allocations.take_byte_vector(); + let mut merge_buffer = allocations.take_byte_vector(); + + while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
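The comment above ("Since we read the DB, we can't write to it directly") names a general pattern: stage derived entries in a temporary store while a read iterator is live, then write them back afterwards. A minimal sketch of that staging pattern under stated assumptions, with a `BTreeMap` and a `Vec` standing in for the LMDB database and the grenad writer respectively (keys are simplified to strings; the real keys are byte-encoded):

```rust
use std::collections::BTreeMap;

fn main() {
    let mut db: BTreeMap<String, u32> = BTreeMap::new();
    db.insert("good dog 1".to_string(), 10);
    db.insert("good ghost 2".to_string(), 20);

    // stage derived entries while holding only a read iterator over db
    let mut staged = Vec::new();
    for (key, value) in &db {
        staged.push((format!("{key} [prefix-derived]"), *value));
    }

    // write back only once the read iterator is gone
    for (key, value) in staged {
        db.insert(key, value);
    }
    assert_eq!(db.len(), 4);
}
```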
{ + if proximity > max_proximity { + continue; + }; + let word2_start_different_than_prev = word2[0] != prev_word2_start; + if empty_prefixes && !word2_start_different_than_prev { + continue; + } + let word1_different_than_prev = word1 != batch.word1; + if word1_different_than_prev || word2_start_different_than_prev { + batch.flush(allocations, &mut merge_buffer, &mut insert)?; + if word1_different_than_prev { + prefix_search_start.0 = 0; + batch.word1.clear(); + batch.word1.extend_from_slice(word1); + } + if word2_start_different_than_prev { + // word2_start_different_than_prev == true + prev_word2_start = word2[0]; + } + empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); + } + + if !empty_prefixes { + prefixes.for_each_prefix_of( + word2, + &mut prefix_buffer, + &prefix_search_start, + |prefix_buffer| { + let mut value = allocations.take_byte_vector(); + value.extend_from_slice(&data); + let prefix_len = prefix_buffer.len(); + prefix_buffer.push(0); + prefix_buffer.push(proximity); + batch.insert(&prefix_buffer, value, allocations); + prefix_buffer.truncate(prefix_len); + }, + ); + prefix_buffer.clear(); + } + } + batch.flush(allocations, &mut merge_buffer, &mut insert)?; + Ok(()) +} +/** +A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). +The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. + +It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. + +The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content +can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: +- key : (word1, prefix, proximity) as bytes +- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes +*/ +#[derive(Default)] +struct PrefixAndProximityBatch { + word1: Vec, + batch: Vec<(Vec, Vec>)>, +} + +impl PrefixAndProximityBatch { + /// Insert the new key and value into the batch + fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { + match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { + Ok(position) => { + self.batch[position].1.push(Cow::Owned(new_value)); + } + Err(position) => { + let mut key = allocations.take_byte_vector(); + key.extend_from_slice(new_key); + let mut mergeable_data = allocations.take_mergeable_data_vector(); + mergeable_data.push(Cow::Owned(new_value)); + self.batch.insert(position, (key, mergeable_data)); + } + } + } + + /// Empties the batch, calling `insert` on each element. + /// + /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. + fn flush( + &mut self, + allocations: &mut Allocations, + merge_buffer: &mut Vec, + insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, + ) -> Result<()> { + let PrefixAndProximityBatch { word1, batch } = self; + if batch.is_empty() { + return Ok(()); + } + merge_buffer.clear(); + + let mut buffer = allocations.take_byte_vector(); + buffer.extend_from_slice(word1); + buffer.push(0); + + for (key, mergeable_data) in batch.drain(..) 
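The buffer juggling in the loop above is easier to follow with the key layout written out: the inner loop keeps one buffer holding `prefix ++ 0 ++ proximity`, and `flush` later prepends `word1 ++ 0`, truncating back to that stem between elements instead of reallocating. A standalone sketch of the construction (the concrete prefixes and proximities are purely illustrative):

```rust
fn main() {
    let word1 = b"good";
    let mut buffer = Vec::new();
    buffer.extend_from_slice(word1);
    buffer.push(0);

    for (prefix, proximity) in [(&b"d"[..], 1u8), (&b"do"[..], 1), (&b"do"[..], 3)] {
        buffer.truncate(word1.len() + 1); // keep "good\0", drop the previous tail
        buffer.extend_from_slice(prefix);
        buffer.push(0);
        buffer.push(proximity);
        println!("key = {:?}", buffer);
    }
}
```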
{ + buffer.truncate(word1.len() + 1); + buffer.extend_from_slice(key.as_slice()); + + let data = if mergeable_data.len() > 1 { + CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; + merge_buffer.as_slice() + } else { + &mergeable_data[0] + }; + insert(buffer.as_slice(), data)?; + merge_buffer.clear(); + allocations.reclaim_byte_vector(key); + allocations.reclaim_mergeable_data_vector(mergeable_data); + } Ok(()) } } -fn write_prefixes_in_sorter( - prefixes: &mut HashMap, Vec>>, - sorter: &mut grenad::Sorter, +// This is adapted from `sorter_into_lmdb_database` +fn insert_into_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + new_key: &[u8], + new_value: &[u8], ) -> Result<()> { - for (key, data_slices) in prefixes.drain() { - for data in data_slices { - if valid_lmdb_key(&key) { - sorter.insert(&key, data)?; - } + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; + match iter.next().transpose()? { + Some((key, old_val)) if new_key == key => { + let val = + merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) + .map_err(|_| { + // TODO just wrap this error? + crate::error::InternalError::IndexingMergingKeys { + process: "get-put-merge", + } + })?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(key, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; } } - Ok(()) } -/// Computes the current prefix based on the previous and the currently iterated value -/// i.e. w1, w2, prox. It also makes sure to follow the `max_prefix_length` setting. -/// -/// Uses the current prefixes values to insert the associated data i.e. RoaringBitmap, -/// into the sorter that will, later, be inserted in the LMDB database. -fn insert_current_prefix_data_in_sorter<'a>( - buffer: &mut Vec, - current_prefixes: &mut Option<&'a &'a [String]>, - prefixes_cache: &mut HashMap, Vec>>, - word_prefix_pair_proximity_docids_sorter: &mut grenad::Sorter, - prefix_fst_keys: &'a [&'a [std::string::String]], - max_prefix_length: usize, - w1: &str, - w2: &str, - prox: u8, - data: &[u8], +// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, +// but it uses `append` if the database is empty, and it assumes that the values in the +// writer don't conflict with values in the database. +pub fn write_into_lmdb_database_without_merging( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + writer: grenad::Writer, ) -> Result<()> { - *current_prefixes = match current_prefixes.take() { - Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes), - _otherwise => { - write_prefixes_in_sorter(prefixes_cache, word_prefix_pair_proximity_docids_sorter)?; - prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) + let file = writer.into_inner()?; + let reader = grenad::Reader::new(BufReader::new(file))?; + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; } - }; + } else { + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? 
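`insert_into_database` above is a get-merge-put: look the key up, merge with the existing value if one is present, otherwise insert the new pair. A minimal sketch with a `BTreeMap` standing in for the LMDB database and concatenation standing in for `merge_cbo_roaring_bitmaps` (both stand-ins, chosen only to keep the example self-contained):

```rust
use std::collections::BTreeMap;

// Stand-in for merge_cbo_roaring_bitmaps: concatenation, for illustration only.
fn merge(old: &[u8], new: &[u8]) -> Vec<u8> {
    let mut out = old.to_vec();
    out.extend_from_slice(new);
    out
}

fn insert_into_database(db: &mut BTreeMap<Vec<u8>, Vec<u8>>, key: &[u8], value: &[u8]) {
    if let Some(old) = db.get_mut(key) {
        // the key exists: replace its value with the merge of old and new
        let merged = merge(old, value);
        *old = merged;
    } else {
        db.insert(key.to_vec(), value.to_vec());
    }
}

fn main() {
    let mut db = BTreeMap::new();
    insert_into_database(&mut db, b"good\0do\0\x01", b"<docids2>");
    insert_into_database(&mut db, b"good\0do\0\x01", b"<docids3>");
    assert_eq!(db[&b"good\0do\0\x01"[..]], b"<docids2><docids3>".to_vec());
}
```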
{ + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + Ok(()) +} - if let Some(prefixes) = current_prefixes { - buffer.clear(); - buffer.extend_from_slice(w1.as_bytes()); - buffer.push(0); - for prefix in prefixes.iter() { - if prefix.len() <= max_prefix_length && w2.starts_with(prefix) { - buffer.truncate(w1.len() + 1); - buffer.extend_from_slice(prefix.as_bytes()); - buffer.push(prox); +struct Allocations { + byte_vectors: Vec>, + mergeable_data_vectors: Vec>>, +} +impl Default for Allocations { + fn default() -> Self { + Self { + byte_vectors: Vec::with_capacity(65_536), + mergeable_data_vectors: Vec::with_capacity(4096), + } + } +} +impl Allocations { + fn take_byte_vector(&mut self) -> Vec { + self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16)) + } + fn take_mergeable_data_vector(&mut self) -> Vec> { + self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8)) + } - match prefixes_cache.get_mut(buffer.as_slice()) { - Some(value) => value.push(data.to_owned()), - None => { - prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); - } + fn reclaim_byte_vector(&mut self, mut data: Vec) { + data.clear(); + self.byte_vectors.push(data); + } + fn reclaim_mergeable_data_vector(&mut self, mut data: Vec>) { + data.clear(); + self.mergeable_data_vectors.push(data); + } +} + +#[derive(Default, Debug)] +struct PrefixTrieNode { + children: Vec<(PrefixTrieNode, u8)>, + is_end_node: bool, +} + +#[derive(Debug)] +struct PrefixTrieNodeSearchStart(usize); + +impl PrefixTrieNode { + fn is_empty(&self) -> bool { + self.children.is_empty() + } + + /// Returns false if the trie does not contain a prefix of the given word. + /// Returns true if the trie *may* contain a prefix of the given word. + /// + /// Moves the search start to the first node equal to the first letter of the word, + /// or to 0 otherwise. + fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { + let byte = word[0]; + if self.children[search_start.0].1 == byte { + return true; + } else { + match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { + Ok(position) => { + search_start.0 += position; + true + } + Err(_) => { + search_start.0 = 0; + false } } } } - Ok(()) + fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { + let mut node = PrefixTrieNode::default(); + for prefix in prefixes { + node.insert_sorted_prefix(prefix.as_bytes().into_iter()); + } + node + } + fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { + if let Some(&c) = prefix.next() { + if let Some((node, byte)) = self.children.last_mut() { + if *byte == c { + node.insert_sorted_prefix(prefix); + return; + } + } + let mut new_node = PrefixTrieNode::default(); + new_node.insert_sorted_prefix(prefix); + self.children.push((new_node, c)); + } else { + self.is_end_node = true; + } + } + fn for_each_prefix_of( + &self, + word: &[u8], + buffer: &mut Vec, + search_start: &PrefixTrieNodeSearchStart, + mut do_fn: impl FnMut(&mut Vec), + ) { + let first_byte = word[0]; + let mut cur_node = self; + buffer.push(first_byte); + if let Some((child_node, c)) = + cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) + { + if *c == first_byte { + cur_node = child_node; + if cur_node.is_end_node { + do_fn(buffer); + } + for &byte in &word[1..] 
{ + buffer.push(byte); + if let Some((child_node, c)) = + cur_node.children.iter().find(|(_, c)| *c >= byte) + { + if *c == byte { + cur_node = child_node; + if cur_node.is_end_node { + do_fn(buffer); + } + } else { + break; + } + } else { + break; + } + } + } + } + } + // fn print(&self, buffer: &mut String, ident: usize) { + // let mut spaces = String::new(); + // for _ in 0..ident { + // spaces.push(' ') + // } + // for (child, c) in &self.children { + // buffer.push(char::from_u32(*c as u32).unwrap()); + // println!("{spaces}{buffer}:"); + // child.print(buffer, ident + 4); + // buffer.pop(); + // } + // } } - #[cfg(test)] mod tests { + use roaring::RoaringBitmap; + + use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; + + use super::*; + use std::io::Cursor; use crate::db_snap; @@ -328,4 +758,181 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "update"); } + + fn check_prefixes( + trie: &PrefixTrieNode, + search_start: &PrefixTrieNodeSearchStart, + word: &str, + expected_prefixes: &[&str], + ) { + let mut actual_prefixes = vec![]; + trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| { + let s = String::from_utf8(x.to_owned()).unwrap(); + actual_prefixes.push(s); + }); + assert_eq!(actual_prefixes, expected_prefixes); + } + + #[test] + fn test_trie() { + let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", + "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", + "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", + "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", + "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", + "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", + "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i", + "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", + "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar", + "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", + "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", + "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", + "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", + "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", + "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", + "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", + ])); + + let mut search_start = PrefixTrieNodeSearchStart(0); + + let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); + assert!(!is_empty); + assert_eq!(search_start.0, 2); + + check_prefixes(&trie, &search_start, "affair", &["a"]); + check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]); + + let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start); + assert!(!is_empty); + assert_eq!(trie.children[search_start.0].1, b'u'); + + check_prefixes(&trie, &search_start, "unique", &["u", "un"]); + + // NOTE: this should fail, because the search start is already beyong 'a' + let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start); + assert!(!is_empty); + // search start is reset + 
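Taken together, `from_sorted_prefixes`, `insert_sorted_prefix` and `for_each_prefix_of` above form a small self-contained data structure. A stripped-down sketch (without the search-start machinery) showing why sorted input makes construction cheap, since only the last child of each node ever needs checking, and how enumeration reports each end-node on the walk:

```rust
#[derive(Default)]
struct Node {
    children: Vec<(Node, u8)>,
    is_end: bool,
}

impl Node {
    // Sorted input guarantees a shared prefix can only continue the last child.
    fn insert_sorted(&mut self, mut prefix: std::slice::Iter<u8>) {
        if let Some(&c) = prefix.next() {
            if let Some((child, byte)) = self.children.last_mut() {
                if *byte == c {
                    child.insert_sorted(prefix);
                    return;
                }
            }
            let mut child = Node::default();
            child.insert_sorted(prefix);
            self.children.push((child, c));
        } else {
            self.is_end = true;
        }
    }

    // Walk the word byte by byte, reporting every node marked as a prefix end.
    fn for_each_prefix_of(&self, word: &[u8], mut do_fn: impl FnMut(&[u8])) {
        let mut node = self;
        for (i, &byte) in word.iter().enumerate() {
            match node.children.iter().find(|(_, c)| *c == byte) {
                Some((child, _)) => {
                    node = child;
                    if node.is_end {
                        do_fn(&word[..=i]);
                    }
                }
                None => break,
            }
        }
    }
}

fn main() {
    let mut trie = Node::default();
    for p in ["arb", "arbre", "cat", "catto"] {
        trie.insert_sorted(p.as_bytes().iter());
    }
    let mut found = Vec::new();
    trie.for_each_prefix_of(b"arbres", |p| found.push(String::from_utf8(p.to_vec()).unwrap()));
    assert_eq!(found, ["arb", "arbre"]);
}
```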
assert_eq!(search_start.0, 0); + + let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "arb", "arbre", "cat", "catto", + ])); + check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]); + check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]); + } + + #[test] + fn test_execute_on_word_pairs_and_prefixes() { + let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ + "arb", "arbre", "cat", "catto", + ])); + + let mut serialised_bitmap123 = vec![]; + let mut bitmap123 = RoaringBitmap::new(); + bitmap123.insert(1); + bitmap123.insert(2); + bitmap123.insert(3); + CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123); + + let mut serialised_bitmap456 = vec![]; + let mut bitmap456 = RoaringBitmap::new(); + bitmap456.insert(4); + bitmap456.insert(5); + bitmap456.insert(6); + CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456); + + let mut serialised_bitmap789 = vec![]; + let mut bitmap789 = RoaringBitmap::new(); + bitmap789.insert(7); + bitmap789.insert(8); + bitmap789.insert(9); + CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789); + + let mut serialised_bitmap_ranges = vec![]; + let mut bitmap_ranges = RoaringBitmap::new(); + bitmap_ranges.insert_range(63_000..65_000); + bitmap_ranges.insert_range(123_000..128_000); + CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); + + let word_pairs = [ + // 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) + (("healthy", "arbre", 2), &serialised_bitmap123), + // not inserted because 3 > max_proximity + (("healthy", "arbre", 3), &serialised_bitmap456), + // 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123) + (("healthy", "arbres", 1), &serialised_bitmap123), + // 1, 3: + (("healthy", "arbres", 2), &serialised_bitmap456), + // not be inserted because 3 > max_proximity + (("healthy", "arbres", 3), &serialised_bitmap789), + // not inserted because no prefixes for boat + (("healthy", "boat", 1), &serialised_bitmap123), + // not inserted because no prefixes for ca + (("healthy", "ca", 1), &serialised_bitmap123), + // 4: (healthy cat 1) with (bitmap456 + bitmap123) + (("healthy", "cats", 1), &serialised_bitmap456), + // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) + (("healthy", "cats", 2), &serialised_bitmap789), + // 4 + 6: (healthy catto 1) with (bitmap123) + (("healthy", "cattos", 1), &serialised_bitmap123), + // 5 + 7: (healthy catto 2) with (bitmap_ranges) + (("healthy", "cattos", 2), &serialised_bitmap_ranges), + // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) + (("jittery", "cat", 1), &serialised_bitmap123), + // 8: + (("jittery", "cata", 1), &serialised_bitmap456), + // 8: + (("jittery", "catb", 1), &serialised_bitmap789), + // 8: + (("jittery", "catc", 1), &serialised_bitmap_ranges), + ]; + + let expected_result = [ + // first batch: + (("healthy", "arb", 1), bitmap123.clone()), + (("healthy", "arb", 2), &bitmap123 | &bitmap456), + (("healthy", "arbre", 1), bitmap123.clone()), + (("healthy", "arbre", 2), &bitmap123 | &bitmap456), + // second batch: + (("healthy", "cat", 1), &bitmap456 | &bitmap123), + (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), + (("healthy", "catto", 1), bitmap123.clone()), + (("healthy", "catto", 2), bitmap_ranges.clone()), + // third batch + (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + ]; + + let mut result = vec![]; + + let mut allocations = 
Allocations::default(); + let mut iter = + IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { + ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) + }); + execute_on_word_pairs_and_prefixes( + &mut iter, + |iter| Ok(iter.next()), + &prefixes, + &mut allocations, + 2, + |k, v| { + let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); + let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); + result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); + Ok(()) + }, + ) + .unwrap(); + + for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { + let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; + let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; + + assert_eq!(actual_word1, expected_word1); + assert_eq!(actual_prefix, expected_prefix); + assert_eq!(actual_proximity, expected_proximity); + assert_eq!(actual_bitmap, expected_bitmap); + } + } } diff --git a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs b/milli/src/update/word_prefix_pair_proximity_docids/mod.rs deleted file mode 100644 index 6345dd210..000000000 --- a/milli/src/update/word_prefix_pair_proximity_docids/mod.rs +++ /dev/null @@ -1,709 +0,0 @@ -use crate::update::index_documents::{ - create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, -}; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::BufReader; - -pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, - max_proximity: u8, - max_prefix_length: usize, -} - -impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { - WordPrefixPairProximityDocids { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - max_nb_chunks: None, - max_memory: None, - max_proximity: 4, - max_prefix_length: 2, - } - } - - /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are too far from the threshold the associated documents will - /// not be part of the prefix database. - /// - /// Default value is 4. This value must be lower or equal than 7 and will be clamped - /// to this bound otherwise. - pub fn max_proximity(&mut self, value: u8) -> &mut Self { - self.max_proximity = value.max(7); - self - } - - /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words - /// prefixes database. If the prefix length is higher than the threshold, the associated documents - /// will not be part of the prefix database. - /// - /// Default value is 2. 
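A note on the `max_proximity` setter shown above: its documentation promises a value "lower or equal than 7", i.e. an upper bound, but `value.max(7)` can only raise the setting. The documented clamp corresponds to `min`, as in this minimal std-only sketch (the `clamp_proximity` helper is hypothetical, for illustration only):

```rust
// `max` returns the larger of two values, so `value.max(7)` never lowers
// anything; `value.min(7)` is the upper clamp the doc comment describes.
fn clamp_proximity(value: u8) -> u8 {
    value.min(7)
}

fn main() {
    assert_eq!(clamp_proximity(4), 4); // in-range values are kept
    assert_eq!(clamp_proximity(9), 7); // out-of-range values are clamped down
}
```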
- pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value; - self - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - mut self, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet>, - ) -> Result<()> { - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - // This is an optimisation, to reuse allocations between loop iterations - let mut allocations = Allocations::default(); - - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length - let prefixes = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (word1, common_prefix, proximity) elements - // to insert in the DB - if !prefixes.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - // the first two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.move_on_next()? { - let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some(((word1, word2, proximity), value))) - } else { - Ok(None) - } - }, - &prefixes, - &mut allocations, - self.max_proximity, - // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) - |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - - let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words - .into_iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - if !prefixes.is_empty() { - let mut db_iter = self - .index - .word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(self.wtxn)?; - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); - - execute_on_word_pairs_and_prefixes( - &mut db_iter, - |db_iter| db_iter.next().transpose().map_err(|e| e.into()), - &prefixes, - &mut allocations, - self.max_proximity, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. 
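The comment above introduces the deletion pass that follows. Its effect can be pictured with plain std collections; in this sketch (hypothetical names, a `BTreeMap` standing in for the LMDB database and `retain` for the `iter_mut`/`del_current` loop), every entry whose `w2` is a prefix that dropped out of the words-prefixes FST is removed:

```rust
// A std-only sketch of the deletion pass: drop every (word1, w2, proximity)
// entry whose w2 is a prefix that no longer exists in the prefix FST.
use std::collections::{BTreeMap, HashSet};

fn main() {
    let mut db: BTreeMap<(String, String, u8), &str> = BTreeMap::new();
    db.insert(("good".to_string(), "do".to_string(), 1), "docids1");
    db.insert(("good".to_string(), "dog".to_string(), 1), "docids2");

    let del_prefix_fst_words: HashSet<&str> = ["dog"].into_iter().collect();
    // Equivalent of iterating with `iter_mut` and calling `del_current`.
    db.retain(|(_, w2, _), _| !del_prefix_fst_words.contains(w2.as_str()));

    assert_eq!(db.len(), 1);
    assert!(db.contains_key(&("good".to_string(), "do".to_string(), 1)));
}
```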
- if !del_prefix_fst_words.is_empty() { - let mut iter = self - .index - .word_prefix_pair_proximity_docids - .remap_data_type::() - .iter_mut(self.wtxn)?; - while let Some(((_, w2, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(w2.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) - } -} - -/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. -/// -/// Its main arguments are: -/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements -/// 2. a prefix trie -/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements -/// -/// For more information about the -fn execute_on_word_pairs_and_prefixes( - iter: &mut Iter, - mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut Iter, - ) -> Result< - Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, - >, - prefixes: &PrefixTrieNode, - allocations: &mut Allocations, - max_proximity: u8, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch = PrefixAndProximityBatch::default(); - let mut prev_word2_start = 0; - - let mut prefix_search_start = PrefixTrieNodeSearchStart(0); - let mut empty_prefixes = false; - - let mut prefix_buffer = allocations.take_byte_vector(); - let mut merge_buffer = allocations.take_byte_vector(); - - while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { - if proximity > max_proximity { - continue; - }; - let word2_start_different_than_prev = word2[0] != prev_word2_start; - if empty_prefixes && !word2_start_different_than_prev { - continue; - } - let word1_different_than_prev = word1 != batch.word1; - if word1_different_than_prev || word2_start_different_than_prev { - batch.flush(allocations, &mut merge_buffer, &mut insert)?; - if word1_different_than_prev { - prefix_search_start.0 = 0; - batch.word1.clear(); - batch.word1.extend_from_slice(word1); - } - if word2_start_different_than_prev { - // word2_start_different_than_prev == true - prev_word2_start = word2[0]; - } - empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); - } - - if !empty_prefixes { - prefixes.for_each_prefix_of( - word2, - &mut prefix_buffer, - &prefix_search_start, - |prefix_buffer| { - let mut value = allocations.take_byte_vector(); - value.extend_from_slice(&data); - let prefix_len = prefix_buffer.len(); - prefix_buffer.push(0); - prefix_buffer.push(proximity); - batch.insert(&prefix_buffer, value, allocations); - prefix_buffer.truncate(prefix_len); - }, - ); - prefix_buffer.clear(); - } - } - batch.flush(allocations, &mut merge_buffer, &mut insert)?; - Ok(()) -} -/** -A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). -The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. - -It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. - -The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content -can be inserted into the database in sorted order. 
When it is flushed, it calls a user-provided closure with the following arguments: -- key : (word1, prefix, proximity) as bytes -- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes -*/ -#[derive(Default)] -struct PrefixAndProximityBatch { - word1: Vec, - batch: Vec<(Vec, Vec>)>, -} - -impl PrefixAndProximityBatch { - /// Insert the new key and value into the batch - fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { - match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { - Ok(position) => { - self.batch[position].1.push(Cow::Owned(new_value)); - } - Err(position) => { - let mut key = allocations.take_byte_vector(); - key.extend_from_slice(new_key); - let mut mergeable_data = allocations.take_mergeable_data_vector(); - mergeable_data.push(Cow::Owned(new_value)); - self.batch.insert(position, (key, mergeable_data)); - } - } - } - - /// Empties the batch, calling `insert` on each element. - /// - /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. - fn flush( - &mut self, - allocations: &mut Allocations, - merge_buffer: &mut Vec, - insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, - ) -> Result<()> { - let PrefixAndProximityBatch { word1, batch } = self; - if batch.is_empty() { - return Ok(()); - } - merge_buffer.clear(); - - let mut buffer = allocations.take_byte_vector(); - buffer.extend_from_slice(word1); - buffer.push(0); - - for (key, mergeable_data) in batch.drain(..) { - buffer.truncate(word1.len() + 1); - buffer.extend_from_slice(key.as_slice()); - - let data = if mergeable_data.len() > 1 { - CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; - merge_buffer.as_slice() - } else { - &mergeable_data[0] - }; - insert(buffer.as_slice(), data)?; - merge_buffer.clear(); - allocations.reclaim_byte_vector(key); - allocations.reclaim_mergeable_data_vector(mergeable_data); - } - - Ok(()) - } -} - -// This is adapted from `sorter_into_lmdb_database` -fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? - crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer, -) -> Result<()> { - let file = writer.into_inner()?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? 
{ - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -} - -struct Allocations { - byte_vectors: Vec>, - mergeable_data_vectors: Vec>>, -} -impl Default for Allocations { - fn default() -> Self { - Self { - byte_vectors: Vec::with_capacity(65_536), - mergeable_data_vectors: Vec::with_capacity(4096), - } - } -} -impl Allocations { - fn take_byte_vector(&mut self) -> Vec { - self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16)) - } - fn take_mergeable_data_vector(&mut self) -> Vec> { - self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8)) - } - - fn reclaim_byte_vector(&mut self, mut data: Vec) { - data.clear(); - self.byte_vectors.push(data); - } - fn reclaim_mergeable_data_vector(&mut self, mut data: Vec>) { - data.clear(); - self.mergeable_data_vectors.push(data); - } -} - -#[derive(Default, Debug)] -struct PrefixTrieNode { - children: Vec<(PrefixTrieNode, u8)>, - is_end_node: bool, -} - -#[derive(Debug)] -struct PrefixTrieNodeSearchStart(usize); - -impl PrefixTrieNode { - fn is_empty(&self) -> bool { - self.children.is_empty() - } - - /// Returns false if the trie does not contain a prefix of the given word. - /// Returns true if the trie *may* contain a prefix of the given word. - /// - /// Moves the search start to the first node equal to the first letter of the word, - /// or to 0 otherwise. - fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { - let byte = word[0]; - if self.children[search_start.0].1 == byte { - return true; - } else { - match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { - Ok(position) => { - search_start.0 += position; - true - } - Err(_) => { - search_start.0 = 0; - false - } - } - } - } - - fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { - let mut node = PrefixTrieNode::default(); - for prefix in prefixes { - node.insert_sorted_prefix(prefix.as_bytes().into_iter()); - } - node - } - fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { - if let Some(&c) = prefix.next() { - if let Some((node, byte)) = self.children.last_mut() { - if *byte == c { - node.insert_sorted_prefix(prefix); - return; - } - } - let mut new_node = PrefixTrieNode::default(); - new_node.insert_sorted_prefix(prefix); - self.children.push((new_node, c)); - } else { - self.is_end_node = true; - } - } - fn for_each_prefix_of( - &self, - word: &[u8], - buffer: &mut Vec, - search_start: &PrefixTrieNodeSearchStart, - mut do_fn: impl FnMut(&mut Vec), - ) { - let first_byte = word[0]; - let mut cur_node = self; - buffer.push(first_byte); - if let Some((child_node, c)) = - cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) - { - if *c == first_byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - for &byte in &word[1..] 
{ - buffer.push(byte); - if let Some((child_node, c)) = - cur_node.children.iter().find(|(_, c)| *c >= byte) - { - if *c == byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - } else { - break; - } - } else { - break; - } - } - } - } - } - // fn print(&self, buffer: &mut String, ident: usize) { - // let mut spaces = String::new(); - // for _ in 0..ident { - // spaces.push(' ') - // } - // for (child, c) in &self.children { - // buffer.push(char::from_u32(*c as u32).unwrap()); - // println!("{spaces}{buffer}:"); - // child.print(buffer, ident + 4); - // buffer.pop(); - // } - // } -} -#[cfg(test)] -mod tests { - use roaring::RoaringBitmap; - - use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; - - use super::*; - - fn check_prefixes( - trie: &PrefixTrieNode, - search_start: &PrefixTrieNodeSearchStart, - word: &str, - expected_prefixes: &[&str], - ) { - let mut actual_prefixes = vec![]; - trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| { - let s = String::from_utf8(x.to_owned()).unwrap(); - actual_prefixes.push(s); - }); - assert_eq!(actual_prefixes, expected_prefixes); - } - - #[test] - fn test_trie() { - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", - "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", - "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", - "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", - "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", - "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", - "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i", - "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", - "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar", - "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", - "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", - "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", - "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", - "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", - "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", - "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", - ])); - - let mut search_start = PrefixTrieNodeSearchStart(0); - - let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(search_start.0, 2); - - check_prefixes(&trie, &search_start, "affair", &["a"]); - check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]); - - let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(trie.children[search_start.0].1, b'u'); - - check_prefixes(&trie, &search_start, "unique", &["u", "un"]); - - // NOTE: this should fail, because the search start is already beyong 'a' - let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start); - assert!(!is_empty); - // search start is reset - assert_eq!(search_start.0, 0); - - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", 
"catto", - ])); - check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]); - check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]); - } - - #[test] - fn test_execute_on_word_pairs_and_prefixes() { - let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - - let mut serialised_bitmap123 = vec![]; - let mut bitmap123 = RoaringBitmap::new(); - bitmap123.insert(1); - bitmap123.insert(2); - bitmap123.insert(3); - CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123); - - let mut serialised_bitmap456 = vec![]; - let mut bitmap456 = RoaringBitmap::new(); - bitmap456.insert(4); - bitmap456.insert(5); - bitmap456.insert(6); - CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456); - - let mut serialised_bitmap789 = vec![]; - let mut bitmap789 = RoaringBitmap::new(); - bitmap789.insert(7); - bitmap789.insert(8); - bitmap789.insert(9); - CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789); - - let mut serialised_bitmap_ranges = vec![]; - let mut bitmap_ranges = RoaringBitmap::new(); - bitmap_ranges.insert_range(63_000..65_000); - bitmap_ranges.insert_range(123_000..128_000); - CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); - - let word_pairs = [ - // 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) - (("healthy", "arbre", 2), &serialised_bitmap123), - // not inserted because 3 > max_proximity - (("healthy", "arbre", 3), &serialised_bitmap456), - // 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123) - (("healthy", "arbres", 1), &serialised_bitmap123), - // 1, 3: - (("healthy", "arbres", 2), &serialised_bitmap456), - // not be inserted because 3 > max_proximity - (("healthy", "arbres", 3), &serialised_bitmap789), - // not inserted because no prefixes for boat - (("healthy", "boat", 1), &serialised_bitmap123), - // not inserted because no prefixes for ca - (("healthy", "ca", 1), &serialised_bitmap123), - // 4: (healthy cat 1) with (bitmap456 + bitmap123) - (("healthy", "cats", 1), &serialised_bitmap456), - // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) - (("healthy", "cats", 2), &serialised_bitmap789), - // 4 + 6: (healthy catto 1) with (bitmap123) - (("healthy", "cattos", 1), &serialised_bitmap123), - // 5 + 7: (healthy catto 2) with (bitmap_ranges) - (("healthy", "cattos", 2), &serialised_bitmap_ranges), - // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) - (("jittery", "cat", 1), &serialised_bitmap123), - // 8: - (("jittery", "cata", 1), &serialised_bitmap456), - // 8: - (("jittery", "catb", 1), &serialised_bitmap789), - // 8: - (("jittery", "catc", 1), &serialised_bitmap_ranges), - ]; - - let expected_result = [ - // first batch: - (("healthy", "arb", 1), bitmap123.clone()), - (("healthy", "arb", 2), &bitmap123 | &bitmap456), - (("healthy", "arbre", 1), bitmap123.clone()), - (("healthy", "arbre", 2), &bitmap123 | &bitmap456), - // second batch: - (("healthy", "cat", 1), &bitmap456 | &bitmap123), - (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), - (("healthy", "catto", 1), bitmap123.clone()), - (("healthy", "catto", 2), bitmap_ranges.clone()), - // third batch - (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), - ]; - - let mut result = vec![]; - - let mut allocations = Allocations::default(); - let mut iter = - IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { - 
((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) - }); - execute_on_word_pairs_and_prefixes( - &mut iter, - |iter| Ok(iter.next()), - &prefixes, - &mut allocations, - 2, - |k, v| { - let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); - let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); - result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); - Ok(()) - }, - ) - .unwrap(); - - for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { - let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; - let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; - - assert_eq!(actual_word1, expected_word1); - assert_eq!(actual_prefix, expected_prefix); - assert_eq!(actual_proximity, expected_proximity); - assert_eq!(actual_bitmap, expected_bitmap); - } - } -} diff --git a/milli/src/update/word_prefix_pair_proximity_docids/readme.md b/milli/src/update/word_prefix_pair_proximity_docids/readme.md deleted file mode 100644 index 0718fd79c..000000000 --- a/milli/src/update/word_prefix_pair_proximity_docids/readme.md +++ /dev/null @@ -1,144 +0,0 @@ -## What is WordPrefixPairProximityDocids? -The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. - -The prefixes present in this database are only those that correspond to many different words in the documents. - -## How is it created/updated? (simplified version) -To compute it, we have access to (mainly) two inputs: - -* a list of sorted prefixes, such as: -``` -c -ca -cat -d -do -dog -``` -Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. - -* a sorted list of word pairs and the distance between them (i.e. proximity), associated with a roaring bitmap, such as: -``` -good dog 3 -> docids1: [2, 5, 6] -good doggo 1 -> docids2: [8] -good dogma 1 -> docids3: [7, 19, 20] -good ghost 2 -> docids4: [1] -horror cathedral 4 -> docids5: [1, 2] -``` - -I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: - -1. **Outer loop:** First, we iterate over each word pair and its proximity: -``` -word1 : good -word2 : dog -proximity: 3 -``` -2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: -``` -Outer loop 1: ------------------------------- -word1 : good -word2 : dog -proximity: 3 -docids : docids1 - -prefixes: [d, do, dog] - -batch: [ - (d, 3) -> [docids1] - (do, 3) -> [docids1] - (dog, 3) -> [docids1] -] -``` -3. 
For illustration purpose, let's run through a second iteration of the outer loop: -``` -Outer loop 2: ------------------------------- -word1 : good -word2 : doggo -proximity: 1 -docids : docids2 - -prefixes: [d, do, dog] - -batch: [ - (d, 1) -> [docids2] - (d, 3) -> [docids1] - (do, 1) -> [docids2] - (do, 3) -> [docids1] - (dog, 1) -> [docids2] - (dog, 3) -> [docids1] -] -``` -Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. - -4. And a third: -``` -Outer loop 3: ------------------------------- -word1 : good -word2 : dogma -proximity: 1 -docids : docids3 - -prefixes: [d, do, dog] - -batch: [ - (d, 1) -> [docids2, docids3] - (d, 3) -> [docids1] - (do, 1) -> [docids2, docids3] - (do, 3) -> [docids1] - (dog, 1) -> [docids2, docids3] - (dog, 3) -> [docids1] -] -``` -Notice that there were some conflicts which were resolved by merging the conflicting values together. - -5. On the fourth iteration of the outer loop, we have: -``` -Outer loop 4: ------------------------------- -word1 : good -word2 : ghost -proximity: 2 -``` -Because `word2` begins with a different letter than the previous `word2`, we know that: -1. All the prefixes of `word2` are greater than the prefixes of the previous word2 -2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. -Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`. - -6. **Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: -``` -Flushing Batch loop 1: ------------------------------- -word1 : good -word2 : d -proximity: 1 -docids : [docids2, docids3] -``` -We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. -Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` into the database. - -7. That's it! ... except... - -## How is it created/updated (continued) - -I lied a little bit about the input data. In reality, we get two sets of the inputs described above, which come from different places: - -* For the list of sorted prefixes, we have: - * `new_prefixes`, which are all the prefixes that were not present in the database before the insertion of the new documents - * `common_prefixes` which are the prefixes that are present both in the database and in the newly added documents - -* For the list of word pairs and proximities, we have: - * `new_word_pairs`, which is the list of word pairs and their proximities present in the newly added documents - * `word_pairs_db`, which is the list of word pairs from the database. **This list includes all elements in `new_word_pairs`** since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. - -To update the prefix database correctly, we call the algorithm described earlier first on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). Thus: - -1. For all the word pairs that were already present in the DB, we insert them again with the `new_prefixes`. 
Calling the algorithm on them with the `common_prefixes` would not result in any new data. -3. For all the new word pairs, we insert them twice: first with the `common_prefixes`, and then, because they are part of `word_pairs_db`, with the `new_prefixes`. - -Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on (`new_prefixes`, `word_pairs_db`), we insert the computed ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. - - From 474500362c76e84dbf17fdbcc4c828a4762763e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:11:09 +0200 Subject: [PATCH 1556/1889] Update wpppd snapshots New snapshot (yes, it's wrong as well, it will get fixed later): --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- 5 a 1 [101, ] 5 a 2 [101, ] 5 am 1 [101, ] 5 b 4 [101, ] 5 be 4 [101, ] am a 3 [101, ] amazing a 1 [100, ] amazing a 2 [100, ] amazing a 3 [100, ] amazing an 1 [100, ] amazing an 2 [100, ] amazing b 2 [100, ] amazing be 2 [100, ] an a 1 [100, ] an a 2 [100, 202, ] an am 1 [100, ] an b 3 [100, ] an be 3 [100, ] and a 2 [100, ] and a 3 [100, ] and a 4 [100, ] and b 1 [100, ] and be 1 [100, ] d\0 0 [100, 202, ] an an 2 [100, ] and am 2 [100, ] and an 3 [100, ] at a 2 [100, 101, ] at a 3 [100, ] at am 2 [100, 101, ] at an 1 [100, 202, ] at an 3 [100, ] at b 3 [101, ] at b 4 [100, ] at be 3 [101, ] at be 4 [100, ] beautiful a 2 [100, ] beautiful a 3 [100, ] beautiful a 4 [100, ] beautiful am 3 [100, ] beautiful an 2 [100, ] beautiful an 4 [100, ] bell a 2 [101, ] bell a 4 [101, ] bell am 4 [101, ] extraordinary a 2 [202, ] extraordinary a 3 [202, ] extraordinary an 2 [202, ] house a 4 [100, 202, ] house a 4 [100, ] house am 4 [100, ] house an 3 [100, 202, ] house b 2 [100, ] house be 2 [100, ] rings a 1 [101, ] rings a 3 [101, ] rings am 3 [101, ] rings b 2 [101, ] rings be 2 [101, ] the a 3 [101, ] the b 1 [101, ] the be 1 [101, ] --- ...ord_prefix_pair_proximity_docids.hash.snap | 4 ++ .../word_prefix_pair_proximity_docids.snap | 56 ------------------- 2 files changed, 4 insertions(+), 56 deletions(-) create mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap delete mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..574cfa72f --- /dev/null +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/word_prefix_pair_proximity_docids.rs +--- +53e42e513b83885139e4f6d817888561 diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap 
b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index aabd9ddec..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,56 +0,0 @@ ---- -source: milli/src/update/word_prefix_pair_proximity_docids.rs ---- -5 a 1 [101, ] -5 a 2 [101, ] -5 am 1 [101, ] -5 b 4 [101, ] -5 be 4 [101, ] -am a 3 [101, ] -amazing a 1 [100, ] -amazing a 2 [100, ] -amazing a 3 [100, ] -amazing b 2 [100, ] -amazing be 2 [100, ] -an a 1 [100, ] -an a 2 [100, 202, ] -an am 1 [100, ] -an b 3 [100, ] -an be 3 [100, ] -and a 2 [100, ] -and a 3 [100, ] -and a 4 [100, ] -and am 2 [100, ] -and b 1 [100, ] -and be 1 [100, ] -at a 1 [100, 202, ] -at a 2 [100, 101, ] -at a 3 [100, ] -at am 2 [100, 101, ] -at b 3 [101, ] -at b 4 [100, ] -at be 3 [101, ] -at be 4 [100, ] -beautiful a 2 [100, ] -beautiful a 3 [100, ] -beautiful a 4 [100, ] -beautiful am 3 [100, ] -bell a 2 [101, ] -bell a 4 [101, ] -bell am 4 [101, ] -extraordinary a 2 [202, ] -extraordinary a 3 [202, ] -house a 3 [100, 202, ] -house a 4 [100, 202, ] -house am 4 [100, ] -house b 2 [100, ] -house be 2 [100, ] -rings a 1 [101, ] -rings a 3 [101, ] -rings am 3 [101, ] -rings b 2 [101, ] -rings be 2 [101, ] -the a 3 [101, ] -the b 1 [101, ] -the be 1 [101, ] - From 06f3fd8c6df0232710ef7e19331499e83193aa58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 15:57:44 +0200 Subject: [PATCH 1557/1889] Add more comments to WordPrefixPairProximityDocids::execute --- .../update/word_prefix_pair_proximity_docids.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 4a3a7d13e..d08646b27 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -351,23 +351,34 @@ fn execute_on_word_pairs_and_prefixes( let mut batch = PrefixAndProximityBatch::default(); let mut prev_word2_start = 0; + // Optimisation: the index at the root of the prefix trie where to search for let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + + // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter let mut empty_prefixes = false; let mut prefix_buffer = allocations.take_byte_vector(); let mut merge_buffer = allocations.take_byte_vector(); while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
{
+        // skip this iteration if the proximity is over the threshold
         if proximity > max_proximity {
             continue;
         };
         let word2_start_different_than_prev = word2[0] != prev_word2_start;
+        // if there were no potential prefixes for the previous word2 based on its first letter,
+        // and if the current word2 starts with the same letter, then there are also no potential
+        // prefixes for the current word2, and we can skip to the next iteration
         if empty_prefixes && !word2_start_different_than_prev {
             continue;
         }
+
+        // if word1 is different than the previous word1 OR if the start of word2 is different
+        // than the previous start of word2, then we'll need to flush the batch
         let word1_different_than_prev = word1 != batch.word1;
         if word1_different_than_prev || word2_start_different_than_prev {
             batch.flush(allocations, &mut merge_buffer, &mut insert)?;
+            // don't forget to reset the value of batch.word1 and prev_word2_start
             if word1_different_than_prev {
                 prefix_search_start.0 = 0;
                 batch.word1.clear();
@@ -377,10 +388,12 @@ fn execute_on_word_pairs_and_prefixes(
             // word2_start_different_than_prev == true
             prev_word2_start = word2[0];
         }
+        // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
         empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
     }

     if !empty_prefixes {
+        // All conditions are satisfied, we can now insert each new prefix of word2 into the batch
         prefixes.for_each_prefix_of(
             word2,
             &mut prefix_buffer,
@@ -618,6 +631,10 @@ impl PrefixTrieNode {
             self.is_end_node = true;
         }
     }
+
+    /// Calls the given closure on each prefix of the word that is contained in the prefix trie.
+    ///
+    /// The search starts from the given `search_start`.
     fn for_each_prefix_of(
         &self,
         word: &[u8],

From 34c991ea02bd3a5f8151de605f4e09849975c889 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 19 Jul 2022 07:03:30 +0200
Subject: [PATCH 1558/1889] Add newlines in documentation of
 word_prefix_pair_proximity_docids

---
 .../word_prefix_pair_proximity_docids.rs      | 95 ++++++++++++++-----
 1 file changed, 71 insertions(+), 24 deletions(-)

diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index d08646b27..0426edef9 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -1,8 +1,12 @@
 /*!
 ## What is WordPrefixPairProximityDocids?
-The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`.
+The word-prefix-pair-proximity-docids database is a database whose keys are of
+the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of
+the documents which contain `word` followed by another word starting with
+`prefix` at a distance of `proximity`.

-The prefixes present in this database are only those that correspond to many different words in the documents.
+The prefixes present in this database are only those that correspond to many
+different words in the documents.

 ## How is it created/updated? (simplified version)
 To compute it, we have access to (mainly) two inputs:
@@ -16,9 +20,11 @@ d
 do
 dog
 ```
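The flush conditions annotated in the first hunk above boil down to: the batch only ever holds keys for a single (`word1`, first letter of `word2`) group, so a change in either one triggers a flush. A compact std-only simulation of that trigger (all names hypothetical):

```rust
// The batch is flushed whenever word1 or the first byte of word2 changes,
// because everything buffered so far is then guaranteed to sort before any
// key derived from the new group.
fn main() {
    let pairs = [("good", "dog"), ("good", "doggo"), ("good", "ghost"), ("horror", "goat")];
    let (mut prev_word1, mut prev_w2_start) = ("", 0u8);
    let mut flushes = Vec::new();
    for (word1, word2) in pairs {
        let w2_start = word2.as_bytes()[0];
        if word1 != prev_word1 || w2_start != prev_w2_start {
            flushes.push(format!("flush before ({word1}, {word2})"));
            prev_word1 = word1;
            prev_w2_start = w2_start;
        }
    }
    // One flush per (word1, first letter) group boundary.
    assert_eq!(flushes.len(), 3);
}
```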
+Note that only prefixes which correspond to more than a certain number of +different words from the database are included in this list. -* a sorted list of word pairs and the distance between them (i.e. proximity), associated with a roaring bitmap, such as: +* a sorted list of word pairs and the distance between them (i.e. proximity), +* associated with a roaring bitmap, such as: ``` good dog 3 -> docids1: [2, 5, 6] good doggo 1 -> docids2: [8] @@ -27,7 +33,8 @@ good ghost 2 -> docids4: [1] horror cathedral 4 -> docids5: [1, 2] ``` -I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below: +I illustrate a simplified version of the algorithm to create the word-prefix +pair-proximity database below: 1. **Outer loop:** First, we iterate over each word pair and its proximity: ``` @@ -35,7 +42,10 @@ word1 : good word2 : dog proximity: 3 ``` -2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: +2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are +in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) +and the value (`docids`) to a sorted map which we call the “batch”. For example, +at the end of the first inner loop, we may have: ``` Outer loop 1: ------------------------------ @@ -72,7 +82,9 @@ batch: [ (dog, 3) -> [docids1] ] ``` -Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. +Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some +of the elements inserted in the second iteration of the outer loop appear +*before* elements from the first iteration. 4. And a third: ``` @@ -94,7 +106,8 @@ batch: [ (dog, 3) -> [docids1] ] ``` -Notice that there were some conflicts which were resolved by merging the conflicting values together. +Notice that there were some conflicts which were resolved by merging the +conflicting values together. 5. On the fourth iteration of the outer loop, we have: ``` @@ -104,12 +117,20 @@ word1 : good word2 : ghost proximity: 2 ``` -Because `word2` begins with a different letter than the previous `word2`, we know that: -1. All the prefixes of `word2` are greater than the prefixes of the previous word2 -2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch. -Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`. +Because `word2` begins with a different letter than the previous `word2`, +we know that: -6. **Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: +1. All the prefixes of `word2` are greater than the prefixes of the previous word2 +2. And therefore, every instance of (`word2`, `prefix`) will be greater than +any element in the batch. + +Therefore, we know that we can insert every element from the batch into the +database before proceeding any further. This operation is called +“flushing the batch”. 
Flushing the batch should also be done whenever `word1` +is different than the previous `word1`. + +6. **Flushing the batch:** to flush the batch, we look at the `word1` and +iterate over the elements of the batch in sorted order: ``` Flushing Batch loop 1: ------------------------------ @@ -118,29 +139,55 @@ word2 : d proximity: 1 docids : [docids2, docids3] ``` -We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. -Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` into the database. +We then merge the array of `docids` (of type `Vec>`) using +`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a +roaring bitmap of all the document ids where `word1` is followed by `prefix` +at a distance of `proximity`. +Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` +into the database. 7. That's it! ... except... ## How is it created/updated (continued) -I lied a little bit about the input data. In reality, we get two sets of the inputs described above, which come from different places: +I lied a little bit about the input data. In reality, we get two sets of the +inputs described above, which come from different places: * For the list of sorted prefixes, we have: - * `new_prefixes`, which are all the prefixes that were not present in the database before the insertion of the new documents - * `common_prefixes` which are the prefixes that are present both in the database and in the newly added documents + 1. `new_prefixes`, which are all the prefixes that were not present in the + database before the insertion of the new documents + + 2. `common_prefixes` which are the prefixes that are present both in the + database and in the newly added documents * For the list of word pairs and proximities, we have: - * `new_word_pairs`, which is the list of word pairs and their proximities present in the newly added documents - * `word_pairs_db`, which is the list of word pairs from the database. **This list includes all elements in `new_word_pairs`** since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. + 1. `new_word_pairs`, which is the list of word pairs and their proximities + present in the newly added documents -To update the prefix database correctly, we call the algorithm described earlier first on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). Thus: + 2. `word_pairs_db`, which is the list of word pairs from the database. + This list includes all elements in `new_word_pairs`** since `new_word_pairs` + was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` + function. -1. For all the word pairs that were already present in the DB, we insert them again with the `new_prefixes`. Calling the algorithm on them with the `common_prefixes` would not result in any new data. -3. For all the new word pairs, we insert them twice: first with the `common_prefixes`, and then, because they are part of `word_pairs_db`, with the `new_prefixes`. +To update the prefix database correctly, we call the algorithm described earlier first +on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). 
+Thus: -Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on (`new_prefixes`, `word_pairs_db`), we insert the computed ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. +1. For all the word pairs that were already present in the DB, we insert them +again with the `new_prefixes`. Calling the algorithm on them with the +`common_prefixes` would not result in any new data. + +2. For all the new word pairs, we insert them twice: first with the `common_prefixes`, +and then, because they are part of `word_pairs_db`, with the `new_prefixes`. + +Note, also, that since we read data from the database when iterating over +`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- +docids from the batch directly into the database (we would have a concurrent +reader and writer). Therefore, when calling the algorithm on +(`new_prefixes`, `word_pairs_db`), we insert the computed +((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad +Writer instead of the DB. At the end of the outer loop, we finally read from +the grenad and insert its elements in the database. From f6f8f543e105dbbc865f3a269c0ab12a3d8e7c67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 07:08:36 +0200 Subject: [PATCH 1559/1889] Run cargo fmt --- milli/src/heed_codec/mod.rs | 3 +-- .../word_prefix_pair_proximity_docids.rs | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 02235f26d..f3691b7d8 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -15,5 +15,4 @@ pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; pub use self::str_beu32_codec::StrBEU32Codec; -pub use self::str_str_u8_codec::StrStrU8Codec; -pub use self::str_str_u8_codec::UncheckedStrStrU8Codec; +pub use self::str_str_u8_codec::{StrStrU8Codec, UncheckedStrStrU8Codec}; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 0426edef9..07908efb5 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -192,17 +192,19 @@ the grenad and insert its elements in the database. 
*/ -use crate::update::index_documents::{ - create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, -}; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; + use grenad::CompressionType; use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::BufReader; + +use crate::update::index_documents::{ + create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, +}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -737,9 +739,8 @@ impl PrefixTrieNode { mod tests { use roaring::RoaringBitmap; - use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; - use super::*; + use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; use std::io::Cursor; From 730911143376a367dacdc59cc6db004c812c4fff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 08:52:01 +0200 Subject: [PATCH 1560/1889] Don't run block code in doc tests of word_pair_proximity_docids --- .../update/word_prefix_pair_proximity_docids.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 07908efb5..90430c0dd 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -12,7 +12,7 @@ different words in the documents. To compute it, we have access to (mainly) two inputs: * a list of sorted prefixes, such as: -``` +```text c ca cat @@ -25,7 +25,7 @@ different words from the database are included in this list. * a sorted list of word pairs and the distance between them (i.e. proximity), * associated with a roaring bitmap, such as: -``` +```text good dog 3 -> docids1: [2, 5, 6] good doggo 1 -> docids2: [8] good dogma 1 -> docids3: [7, 19, 20] @@ -37,7 +37,7 @@ I illustrate a simplified version of the algorithm to create the word-prefix pair-proximity database below: 1. **Outer loop:** First, we iterate over each word pair and its proximity: -``` +```text word1 : good word2 : dog proximity: 3 @@ -46,7 +46,7 @@ proximity: 3 in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: -``` +```text Outer loop 1: ------------------------------ word1 : good @@ -63,7 +63,7 @@ batch: [ ] ``` 3. For illustration purpose, let's run through a second iteration of the outer loop: -``` +```text Outer loop 2: ------------------------------ word1 : good @@ -87,7 +87,7 @@ of the elements inserted in the second iteration of the outer loop appear *before* elements from the first iteration. 4. And a third: -``` +```text Outer loop 3: ------------------------------ word1 : good @@ -110,7 +110,7 @@ Notice that there were some conflicts which were resolved by merging the conflicting values together. 5. On the fourth iteration of the outer loop, we have: -``` +```text Outer loop 4: ------------------------------ word1 : good @@ -131,7 +131,7 @@ is different than the previous `word1`. 6. 
**Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order: -``` +```text Flushing Batch loop 1: ------------------------------ word1 : good From ef75a77464c63f3723e4c7513c5ca53853a5e9cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 12:04:48 +0200 Subject: [PATCH 1561/1889] Fix undefined behaviour caused by reusing key from the database New full snapshot: --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- 5 a 1 [101, ] 5 a 2 [101, ] 5 am 1 [101, ] 5 b 4 [101, ] 5 be 4 [101, ] am a 3 [101, ] amazing a 1 [100, ] amazing a 2 [100, ] amazing a 3 [100, ] amazing an 1 [100, ] amazing an 2 [100, ] amazing b 2 [100, ] amazing be 2 [100, ] an a 1 [100, ] an a 2 [100, 202, ] an am 1 [100, ] an an 2 [100, ] an b 3 [100, ] an be 3 [100, ] and a 2 [100, ] and a 3 [100, ] and a 4 [100, ] and am 2 [100, ] and an 3 [100, ] and b 1 [100, ] and be 1 [100, ] at a 1 [100, 202, ] at a 2 [100, 101, ] at a 3 [100, ] at am 2 [100, 101, ] at an 1 [100, 202, ] at an 3 [100, ] at b 3 [101, ] at b 4 [100, ] at be 3 [101, ] at be 4 [100, ] beautiful a 2 [100, ] beautiful a 3 [100, ] beautiful a 4 [100, ] beautiful am 3 [100, ] beautiful an 2 [100, ] beautiful an 4 [100, ] bell a 2 [101, ] bell a 4 [101, ] bell am 4 [101, ] extraordinary a 2 [202, ] extraordinary a 3 [202, ] extraordinary an 2 [202, ] house a 3 [100, 202, ] house a 4 [100, 202, ] house am 4 [100, ] house an 3 [100, 202, ] house b 2 [100, ] house be 2 [100, ] rings a 1 [101, ] rings a 3 [101, ] rings am 3 [101, ] rings b 2 [101, ] rings be 2 [101, ] the a 3 [101, ] the b 1 [101, ] the be 1 [101, ] --- .../update/word_prefix_pair_proximity_docids.hash.snap | 2 +- milli/src/update/word_prefix_pair_proximity_docids.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap index 574cfa72f..a39ee07b5 100644 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- -53e42e513b83885139e4f6d817888561 +5ed4bf83317b10962a55ade353427bdd diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 90430c0dd..bcd940410 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -554,8 +554,8 @@ fn insert_into_database( process: "get-put-merge", } })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(key, &val)? }; + // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour + unsafe { iter.put_current(new_key, &val)? }; } _ => { drop(iter); @@ -579,7 +579,7 @@ pub fn write_into_lmdb_database_without_merging( let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; let mut cursor = reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? { - // safety: we don't keep references from inside the LMDB database. 
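The hunk above swaps the iterator-borrowed `key` for the caller-owned `new_key` in `put_current`: writing through a cursor while still holding a slice borrowed from the database is what made the old code undefined behaviour. The same discipline, sketched in safe std Rust where the borrow checker enforces what LMDB leaves to `unsafe` (names hypothetical, a `BTreeMap` standing in for the database):

```rust
// Merge using data copied out of the store, then write with the caller's own
// key, so that no reference into the store survives into the write.
use std::collections::BTreeMap;

fn upsert_merge(db: &mut BTreeMap<Vec<u8>, Vec<u8>>, new_key: &[u8], new_value: &[u8]) {
    if let Some(old_val) = db.get(new_key) {
        let mut merged = old_val.clone(); // copy out: the borrow of `db` ends here
        merged.extend_from_slice(new_value);
        db.insert(new_key.to_vec(), merged);
    } else {
        db.insert(new_key.to_vec(), new_value.to_vec());
    }
}

fn main() {
    let mut db = BTreeMap::new();
    upsert_merge(&mut db, b"good dog 1", b"[2,5,6]");
    upsert_merge(&mut db, b"good dog 1", b"+[8]");
    assert_eq!(db[b"good dog 1".as_slice()], b"[2,5,6]+[8]".to_vec());
}
```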
+ // safety: the key comes from the grenad reader, not the database unsafe { out_iter.append(k, v)? }; } } else { From 1bc4788e5998960ce901d092cd5f2b69043f0b0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 12:47:07 +0200 Subject: [PATCH 1562/1889] Remove cached Allocations struct from wpppd indexing --- .../word_prefix_pair_proximity_docids.rs | 63 +++---------------- 1 file changed, 8 insertions(+), 55 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index bcd940410..e8d63acbb 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -265,9 +265,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - // This is an optimisation, to reuse allocations between loop iterations - let mut allocations = Allocations::default(); - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length let prefixes = PrefixTrieNode::from_sorted_prefixes( common_prefix_fst_words @@ -297,7 +294,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { } }, &prefixes, - &mut allocations, self.max_proximity, // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) |key, value| { @@ -340,7 +336,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut db_iter, |db_iter| db_iter.next().transpose().map_err(|e| e.into()), &prefixes, - &mut allocations, self.max_proximity, |key, value| writer.insert(key, value).map_err(|e| e.into()), )?; @@ -393,7 +388,6 @@ fn execute_on_word_pairs_and_prefixes( Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, >, prefixes: &PrefixTrieNode, - allocations: &mut Allocations, max_proximity: u8, mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, ) -> Result<()> { @@ -406,8 +400,8 @@ fn execute_on_word_pairs_and_prefixes( // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter let mut empty_prefixes = false; - let mut prefix_buffer = allocations.take_byte_vector(); - let mut merge_buffer = allocations.take_byte_vector(); + let mut prefix_buffer = Vec::with_capacity(8); + let mut merge_buffer = Vec::with_capacity(65_536); while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
{ // skip this iteration if the proximity is over the threshold @@ -426,7 +420,7 @@ fn execute_on_word_pairs_and_prefixes( // than the previous start of word2, then we'll need to flush the batch let word1_different_than_prev = word1 != batch.word1; if word1_different_than_prev || word2_start_different_than_prev { - batch.flush(allocations, &mut merge_buffer, &mut insert)?; + batch.flush(&mut merge_buffer, &mut insert)?; // don't forget to reset the value of batch.word1 and prev_word2_start if word1_different_than_prev { prefix_search_start.0 = 0; @@ -448,19 +442,17 @@ fn execute_on_word_pairs_and_prefixes( &mut prefix_buffer, &prefix_search_start, |prefix_buffer| { - let mut value = allocations.take_byte_vector(); - value.extend_from_slice(&data); let prefix_len = prefix_buffer.len(); prefix_buffer.push(0); prefix_buffer.push(proximity); - batch.insert(&prefix_buffer, value, allocations); + batch.insert(&prefix_buffer, data.to_vec()); prefix_buffer.truncate(prefix_len); }, ); prefix_buffer.clear(); } } - batch.flush(allocations, &mut merge_buffer, &mut insert)?; + batch.flush(&mut merge_buffer, &mut insert)?; Ok(()) } /** @@ -482,17 +474,13 @@ struct PrefixAndProximityBatch { impl PrefixAndProximityBatch { /// Insert the new key and value into the batch - fn insert(&mut self, new_key: &[u8], new_value: Vec, allocations: &mut Allocations) { + fn insert(&mut self, new_key: &[u8], new_value: Vec) { match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { Ok(position) => { self.batch[position].1.push(Cow::Owned(new_value)); } Err(position) => { - let mut key = allocations.take_byte_vector(); - key.extend_from_slice(new_key); - let mut mergeable_data = allocations.take_mergeable_data_vector(); - mergeable_data.push(Cow::Owned(new_value)); - self.batch.insert(position, (key, mergeable_data)); + self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)])); } } } @@ -502,7 +490,6 @@ impl PrefixAndProximityBatch { /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. 
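    /// Assumes the batch is sorted by key; values that were queued under the
    /// same key are merged into a single bitmap before being passed to the
    /// `insert` callback.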
fn flush( &mut self, - allocations: &mut Allocations, merge_buffer: &mut Vec, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, ) -> Result<()> { @@ -512,7 +499,7 @@ impl PrefixAndProximityBatch { } merge_buffer.clear(); - let mut buffer = allocations.take_byte_vector(); + let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1); buffer.extend_from_slice(word1); buffer.push(0); @@ -528,8 +515,6 @@ impl PrefixAndProximityBatch { }; insert(buffer.as_slice(), data)?; merge_buffer.clear(); - allocations.reclaim_byte_vector(key); - allocations.reclaim_mergeable_data_vector(mergeable_data); } Ok(()) @@ -591,36 +576,6 @@ pub fn write_into_lmdb_database_without_merging( Ok(()) } -struct Allocations { - byte_vectors: Vec>, - mergeable_data_vectors: Vec>>, -} -impl Default for Allocations { - fn default() -> Self { - Self { - byte_vectors: Vec::with_capacity(65_536), - mergeable_data_vectors: Vec::with_capacity(4096), - } - } -} -impl Allocations { - fn take_byte_vector(&mut self) -> Vec { - self.byte_vectors.pop().unwrap_or_else(|| Vec::with_capacity(16)) - } - fn take_mergeable_data_vector(&mut self) -> Vec> { - self.mergeable_data_vectors.pop().unwrap_or_else(|| Vec::with_capacity(8)) - } - - fn reclaim_byte_vector(&mut self, mut data: Vec) { - data.clear(); - self.byte_vectors.push(data); - } - fn reclaim_mergeable_data_vector(&mut self, mut data: Vec>) { - data.clear(); - self.mergeable_data_vectors.push(data); - } -} - #[derive(Default, Debug)] struct PrefixTrieNode { children: Vec<(PrefixTrieNode, u8)>, @@ -970,7 +925,6 @@ mod tests { let mut result = vec![]; - let mut allocations = Allocations::default(); let mut iter = IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) @@ -979,7 +933,6 @@ mod tests { &mut iter, |iter| Ok(iter.next()), &prefixes, - &mut allocations, 2, |k, v| { let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); From 405555b4015635cfb57795108ff18a8d534101d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:21:05 +0200 Subject: [PATCH 1563/1889] Add some documentation to PrefixTrieNode --- .../word_prefix_pair_proximity_docids.rs | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index e8d63acbb..367fdc7ab 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -576,6 +576,28 @@ pub fn write_into_lmdb_database_without_merging( Ok(()) } +/** A prefix trie. Used to iterate quickly over the prefixes of a word that are +within a set. + +## Structure +The trie is made of nodes composed of: +1. a byte character (e.g. 'a') +2. whether the node is an end node or not +3. a list of children nodes, sorted by their byte character + +For example, the trie that stores the strings `[ac, ae, ar, ch, cei, cel, ch, r, rel, ri]` +is drawn below. Nodes with a double border are "end nodes". 
+ +┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗ +│ a │ │ c │ ║ r ║ +└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝ +╔══════╗╔══════╗╔══════╗ ┌─────────┐ ╔═════════╗ ┌─────────┐ ╔══════════╗ +║ c ║║ e ║║ r ║ │ e │ ║ h ║ │ e │ ║ i ║ +╚══════╝╚══════╝╚══════╝ └─────────┘ ╚═════════╝ └─────────┘ ╚══════════╝ + ╔═══╗ ╔═══╗ ╔═══╗ + ║ i ║ ║ l ║ ║ l ║ + ╚═══╝ ╚═══╝ ╚═══╝ +*/ #[derive(Default, Debug)] struct PrefixTrieNode { children: Vec<(PrefixTrieNode, u8)>, From 4f9edf13d7c6ec98fcda2be95be777ecae36a3f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 13:11:38 +0200 Subject: [PATCH 1564/1889] Remove commented-out function --- .../src/update/word_prefix_pair_proximity_docids.rs | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 367fdc7ab..4e25e0c73 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -699,18 +699,6 @@ impl PrefixTrieNode { } } } - // fn print(&self, buffer: &mut String, ident: usize) { - // let mut spaces = String::new(); - // for _ in 0..ident { - // spaces.push(' ') - // } - // for (child, c) in &self.children { - // buffer.push(char::from_u32(*c as u32).unwrap()); - // println!("{spaces}{buffer}:"); - // child.print(buffer, ident + 4); - // buffer.pop(); - // } - // } } #[cfg(test)] mod tests { From 78d9f0622df253b5e90b400e710a83ba2fddb789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:21:00 +0200 Subject: [PATCH 1565/1889] cargo fmt --- milli/src/update/word_prefix_pair_proximity_docids.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 4e25e0c73..cf5e19a5c 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -702,16 +702,14 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { + use std::io::Cursor; + use roaring::RoaringBitmap; use super::*; - use crate::{CboRoaringBitmapCodec, StrStrU8Codec}; - - use std::io::Cursor; - - use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; + use crate::{db_snap, CboRoaringBitmapCodec, StrStrU8Codec}; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); From cf0cd92ed46df14743b90c091055ccec37988e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sat, 16 Jul 2022 20:26:59 +0200 Subject: [PATCH 1566/1889] Refactor Facets::execute to increase performance --- milli/src/update/facets.rs | 249 +++++++++++++++++++++++++++++++++++-- 1 file changed, 236 insertions(+), 13 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 4c4963b56..8899f0485 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,10 +1,11 @@ use std::fs::File; use std::num::{NonZeroU8, NonZeroUsize}; +use std::ops::RangeInclusive; use std::{cmp, mem}; use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesEncode, Error}; +use heed::{BytesDecode, BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use time::OffsetDateTime; @@ -86,13 +87,32 @@ impl<'t, 'u, 
'i> Facets<'t, 'u, 'i> { clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; // Compute and store the faceted numbers documents ids. - let number_documents_ids = compute_faceted_numbers_documents_ids( - self.wtxn, - self.index.facet_id_f64_docids.remap_key_type::(), - field_id, - )?; + // let number_documents_ids = compute_faceted_numbers_documents_ids( + // self.wtxn, + // self.index.facet_id_f64_docids.remap_key_type::(), + // field_id, + // )?; - let facet_number_levels = compute_facet_number_levels( + // let facet_number_levels = compute_facet_number_levels( + // self.wtxn, + // self.index.facet_id_f64_docids, + // self.chunk_compression_type, + // self.chunk_compression_level, + // self.level_group_size, + // self.min_level_size, + // field_id, + // )?; + + // println!("printing 1"); + + // let mut cursor = facet_number_levels.into_cursor().unwrap(); + // while let Some((key, bitmap)) = cursor.move_on_next().unwrap() { + // let key = FacetLevelValueF64Codec::bytes_decode(key).unwrap(); + // let bitmap = CboRoaringBitmapCodec::bytes_decode(bitmap).unwrap(); + // println!("{key:?} {bitmap:?}"); + // } + + let (facet_number_levels_2, number_documents_ids) = compute_facet_number_levels_2( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -102,6 +122,32 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; + // let mut writer = create_writer( + // self.chunk_compression_type, + // self.chunk_compression_level, + // tempfile::tempfile()?, + // ); + // for fnl in facet_number_levels_2 { + // let mut cursor = fnl.into_cursor().unwrap(); + // while let Some((key, bitmap)) = cursor.move_on_next().unwrap() { + // writer.insert(key, bitmap).unwrap(); + // } + // } + // let reader = writer_into_reader(writer)?; + // let mut cursor1 = reader.into_cursor().unwrap(); + // let mut cursor2 = facet_number_levels.into_cursor().unwrap(); + // loop { + // let (c1, c2) = (cursor1.move_on_next().unwrap(), cursor2.move_on_next().unwrap()); + // match (c1, c2) { + // (Some((k1, v1)), Some((k2, v2))) => { + // assert_eq!(k1, k2); + // assert_eq!(v1, v2); + // } + // (None, None) => break, + // _ => panic!(), + // } + // } + self.index.put_string_faceted_documents_ids( self.wtxn, field_id, @@ -113,12 +159,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { &number_documents_ids, )?; - write_into_lmdb_database( - self.wtxn, - *self.index.facet_id_f64_docids.as_polymorph(), - facet_number_levels, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" })?, - )?; + for facet_number_levels in facet_number_levels_2 { + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_f64_docids.as_polymorph(), + facet_number_levels, + |_, _| { + Err(InternalError::IndexingMergingKeys { process: "facet number levels" })? + }, + )?; + } write_into_lmdb_database( self.wtxn, @@ -143,6 +193,177 @@ fn clear_field_number_levels<'t>( db.delete_range(wtxn, &range).map(drop) } +fn compute_facet_number_levels_2<'t>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + field_id: FieldId, +) -> Result<(Vec>, RoaringBitmap)> { + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &field_id.to_be_bytes())? 
+ .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + let level_0_range = { + let left = (field_id, 0, f64::MIN, f64::MIN); + let right = (field_id, 0, f64::MAX, f64::MAX); + left..=right + }; + + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) + .collect::>(); + + // dbg!(first_level_size, min_level_size); + // dbg!(level_group_size); + // dbg!(&group_size_iter); + + let mut number_document_ids = RoaringBitmap::new(); + + if let Some((top_level, _)) = group_size_iter.last() { + let subwriters = recursive_compute_levels( + rtxn, + db, + compression_type, + compression_level, + *top_level, + level_0_range, + level_group_size, + &mut |bitmaps, _, _| { + for bitmap in bitmaps { + number_document_ids |= bitmap; + } + Ok(()) + }, + )?; + Ok((subwriters, number_document_ids)) + } else { + let mut documents_ids = RoaringBitmap::new(); + for result in db.range(rtxn, &level_0_range)? { + let (_key, docids) = result?; + documents_ids |= docids; + } + + Ok((vec![], documents_ids)) + } +} + +fn recursive_compute_levels<'t>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + level: u8, + level_0_range: RangeInclusive<(FieldId, u8, f64, f64)>, + level_group_size: NonZeroUsize, + computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], f64, f64) -> Result<()>, +) -> Result>> { + let (field_id, level_0, first_left, first_right) = level_0_range.start().clone(); + assert_eq!(level_0, 0); + assert_eq!(first_left, first_right); + if level == 0 { + let mut bitmaps = vec![]; + + let mut first_f64_value = first_left; + let mut last_f64_value = first_left; + + let mut first_iteration_for_new_group = true; + for db_result_item in db.range(rtxn, &level_0_range)? 
{ + let ((_field_id, _level, left, _right), docids) = db_result_item?; + // println!("level0: {left}"); + assert_eq!(_level, 0); + assert_eq!(left, _right); + if first_iteration_for_new_group { + first_f64_value = left; + first_iteration_for_new_group = false; + } + last_f64_value = left; + bitmaps.push(docids); + + if bitmaps.len() == level_group_size.get() { + // println!("callback first level with {bitmaps:?} {last_f64_value:?}"); + computed_group_bitmap(&bitmaps, first_f64_value, last_f64_value)?; + first_iteration_for_new_group = true; + bitmaps.clear(); + } + } + if !bitmaps.is_empty() { + // println!("end callback first level with {bitmaps:?} {last_f64_value:?}"); + computed_group_bitmap(&bitmaps, first_f64_value, last_f64_value)?; + bitmaps.clear(); + } + + // level 0 isn't actually stored in this DB, since it contains exactly the same information as that other DB + return Ok(vec![]); + } else { + let mut cur_writer = + create_writer(compression_type, compression_level, tempfile::tempfile()?); + + let mut range_for_bitmaps = vec![]; + let mut bitmaps = vec![]; + + let mut sub_writers = recursive_compute_levels( + rtxn, + db, + compression_type, + compression_level, + level - 1, + level_0_range, + level_group_size, + &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { + let mut combined_bitmap = RoaringBitmap::default(); + for bitmap in sub_bitmaps { + combined_bitmap |= bitmap; + } + range_for_bitmaps.push((start_range, end_range)); + + bitmaps.push(combined_bitmap); + if bitmaps.len() == level_group_size.get() { + let start_range = range_for_bitmaps.first().unwrap().0; + let end_range = range_for_bitmaps.last().unwrap().1; + // println!("callback level {} with {bitmaps:?} {last_f64_value:?}", level + 1); + computed_group_bitmap(&bitmaps, start_range, end_range)?; + for (bitmap, (start_range, end_range)) in + bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) + { + // println!("write {field_id} {level} {start_range} {end_range} {bitmap:?}"); + write_number_entry( + &mut cur_writer, + field_id, + level, + start_range, + end_range, + &bitmap, + )?; + } + } + // println!("end callback level {level}"); + Ok(()) + }, + )?; + if !bitmaps.is_empty() { + let start_range = range_for_bitmaps.first().unwrap().0; + let end_range = range_for_bitmaps.last().unwrap().1; + // println!("end callback level {} with {bitmaps:?} {last_f64_value:?}", level + 1); + computed_group_bitmap(&bitmaps, start_range, end_range)?; + for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { + // println!("end write: {field_id} {level} {left} {right} {bitmap:?}"); + write_number_entry(&mut cur_writer, field_id, level, left, right, &bitmap)?; + } + } + + sub_writers.push(writer_into_reader(cur_writer)?); + return Ok(sub_writers); + } +} + fn compute_facet_number_levels<'t>( rtxn: &'t heed::RoTxn, db: heed::Database, @@ -175,6 +396,7 @@ fn compute_facet_number_levels<'t>( .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); for (level, group_size) in group_size_iter { + // dbg!(level, group_size); let mut left = 0.0; let mut right = 0.0; let mut group_docids = RoaringBitmap::new(); @@ -218,6 +440,7 @@ fn write_number_entry( let key = (field_id, level, left, right); let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + // println!(" w{field_id}-{level}-{left}-{right}"); writer.insert(&key, &data)?; Ok(()) } From 8d4b21a00525cec1095e237b1ec6c0a1473512d7 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 09:59:08 +0200 Subject: [PATCH 1567/1889] Switch string facet levels indexation to new algo Write the algorithm once for both numbers and strings --- milli/src/update/facets.rs | 505 ++++++++++++++----------------------- 1 file changed, 185 insertions(+), 320 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 8899f0485..b3d9f1c58 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,13 +1,12 @@ -use std::fs::File; -use std::num::{NonZeroU8, NonZeroUsize}; -use std::ops::RangeInclusive; -use std::{cmp, mem}; - use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; +use std::cmp; +use std::fs::File; +use std::num::{NonZeroU8, NonZeroUsize}; +use std::ops::RangeFrom; use time::OffsetDateTime; use crate::error::InternalError; @@ -66,14 +65,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - // Compute and store the faceted strings documents ids. - let string_documents_ids = compute_faceted_strings_documents_ids( - self.wtxn, - self.index.facet_id_string_docids.remap_key_type::(), - field_id, - )?; - - let facet_string_levels = compute_facet_string_levels( + let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( self.wtxn, self.index.facet_id_string_docids, self.chunk_compression_type, @@ -83,36 +75,26 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; + self.index.put_string_faceted_documents_ids( + self.wtxn, + field_id, + &string_documents_ids, + )?; + for facet_strings_levels in facet_string_levels { + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + facet_strings_levels, + |_, _| { + Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? + }, + )?; + } + // Clear the facet number levels. clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; - // Compute and store the faceted numbers documents ids. 
- // let number_documents_ids = compute_faceted_numbers_documents_ids( - // self.wtxn, - // self.index.facet_id_f64_docids.remap_key_type::(), - // field_id, - // )?; - - // let facet_number_levels = compute_facet_number_levels( - // self.wtxn, - // self.index.facet_id_f64_docids, - // self.chunk_compression_type, - // self.chunk_compression_level, - // self.level_group_size, - // self.min_level_size, - // field_id, - // )?; - - // println!("printing 1"); - - // let mut cursor = facet_number_levels.into_cursor().unwrap(); - // while let Some((key, bitmap)) = cursor.move_on_next().unwrap() { - // let key = FacetLevelValueF64Codec::bytes_decode(key).unwrap(); - // let bitmap = CboRoaringBitmapCodec::bytes_decode(bitmap).unwrap(); - // println!("{key:?} {bitmap:?}"); - // } - - let (facet_number_levels_2, number_documents_ids) = compute_facet_number_levels_2( + let (facet_number_levels_2, number_documents_ids) = compute_facet_number_levels( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -122,37 +104,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - // let mut writer = create_writer( - // self.chunk_compression_type, - // self.chunk_compression_level, - // tempfile::tempfile()?, - // ); - // for fnl in facet_number_levels_2 { - // let mut cursor = fnl.into_cursor().unwrap(); - // while let Some((key, bitmap)) = cursor.move_on_next().unwrap() { - // writer.insert(key, bitmap).unwrap(); - // } - // } - // let reader = writer_into_reader(writer)?; - // let mut cursor1 = reader.into_cursor().unwrap(); - // let mut cursor2 = facet_number_levels.into_cursor().unwrap(); - // loop { - // let (c1, c2) = (cursor1.move_on_next().unwrap(), cursor2.move_on_next().unwrap()); - // match (c1, c2) { - // (Some((k1, v1)), Some((k2, v2))) => { - // assert_eq!(k1, k2); - // assert_eq!(v1, v2); - // } - // (None, None) => break, - // _ => panic!(), - // } - // } - - self.index.put_string_faceted_documents_ids( - self.wtxn, - field_id, - &string_documents_ids, - )?; self.index.put_number_faceted_documents_ids( self.wtxn, field_id, @@ -169,31 +120,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { }, )?; } - - write_into_lmdb_database( - self.wtxn, - *self.index.facet_id_string_docids.as_polymorph(), - facet_string_levels, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?, - )?; } Ok(()) } } -fn clear_field_number_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, 1, f64::MIN, f64::MIN); - let right = (field_id, u8::MAX, f64::MAX, f64::MAX); - let range = left..=right; - db.delete_range(wtxn, &range).map(drop) -} - -fn compute_facet_number_levels_2<'t>( +fn compute_facet_number_levels<'t>( rtxn: &'t heed::RoTxn, db: heed::Database, compression_type: CompressionType, @@ -208,11 +141,7 @@ fn compute_facet_number_levels_2<'t>( .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_range = { - let left = (field_id, 0, f64::MIN, f64::MIN); - let right = (field_id, 0, f64::MAX, f64::MAX); - left..=right - }; + let level_0_start = (field_id, 0, f64::MIN, f64::MIN); // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. 
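The `group_size_iter` built here drives the whole construction: level `l` groups `level_group_size^l` level-0 entries, and level creation stops once a level would contain fewer than `min_level_size` elements. A self-contained sketch of that computation (plain `usize` values instead of `NonZeroUsize`, not the actual milli code):

```rust
/// For each level height, the number of level-0 entries represented by one
/// element of that level. Stops before a level would hold fewer than
/// `min_level_size` elements.
fn level_group_sizes(
    first_level_size: usize,
    level_group_size: usize,
    min_level_size: usize,
) -> Vec<(u8, usize)> {
    (1u8..)
        .map(|l| (l, level_group_size.pow(l as u32)))
        .take_while(|(_, s)| first_level_size / *s >= min_level_size)
        .collect()
}

// With 1000 level-0 entries, groups of 4 and at least 5 elements per level,
// this yields [(1, 4), (2, 16), (3, 64)]: three levels above level 0.
```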
@@ -221,32 +150,38 @@ fn compute_facet_number_levels_2<'t>( .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) .collect::>(); - // dbg!(first_level_size, min_level_size); - // dbg!(level_group_size); - // dbg!(&group_size_iter); - let mut number_document_ids = RoaringBitmap::new(); if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels( - rtxn, - db, - compression_type, - compression_level, - *top_level, - level_0_range, - level_group_size, - &mut |bitmaps, _, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } - Ok(()) - }, - )?; + let subwriters = + recursive_compute_levels::( + rtxn, + db, + compression_type, + compression_level, + *top_level, + level_0_start, + &(level_0_start..), + first_level_size, + level_group_size, + &mut |bitmaps, _, _| { + for bitmap in bitmaps { + number_document_ids |= bitmap; + } + Ok(()) + }, + &|_i, (_field_id, _level, left, _right)| *left, + &|bitmap| bitmap, + &|writer, level, left, right, docids| { + write_number_entry(writer, field_id, level.get(), left, right, &docids)?; + Ok(()) + }, + )?; + Ok((subwriters, number_document_ids)) } else { let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &level_0_range)? { + for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { let (_key, docids) = result?; documents_ids |= docids; } @@ -255,52 +190,129 @@ fn compute_facet_number_levels_2<'t>( } } -fn recursive_compute_levels<'t>( +fn compute_facet_strings_levels<'t>( rtxn: &'t heed::RoTxn, - db: heed::Database, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + field_id: FieldId, +) -> Result<(Vec>, RoaringBitmap)> { + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &field_id.to_be_bytes())? + .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + let level_0_start = (field_id, ""); + + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) 
+ .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) + .collect::>(); + + let mut strings_document_ids = RoaringBitmap::new(); + + if let Some((top_level, _)) = group_size_iter.last() { + let subwriters = recursive_compute_levels::< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + (u32, &str), + >( + rtxn, + db, + compression_type, + compression_level, + *top_level, + level_0_start, + &(level_0_start..), + first_level_size, + level_group_size, + &mut |bitmaps, _, _| { + for bitmap in bitmaps { + strings_document_ids |= bitmap; + } + Ok(()) + }, + &|i, (_field_id, value)| (i as u32, *value), + &|value| value.1, + &|writer, level, start_bound, end_bound, docids| { + write_string_entry(writer, field_id, level, start_bound, end_bound, docids)?; + Ok(()) + }, + )?; + + Ok((subwriters, strings_document_ids)) + } else { + let mut documents_ids = RoaringBitmap::new(); + for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { + let (_key, (_original_value, docids)) = result?; + documents_ids |= docids; + } + + Ok((vec![], documents_ids)) + } +} + +fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( + rtxn: &'t heed::RoTxn, + db: heed::Database, compression_type: CompressionType, compression_level: Option, level: u8, - level_0_range: RangeInclusive<(FieldId, u8, f64, f64)>, + level_0_start: >::DItem, + level_0_range: &'t RangeFrom<>::DItem>, + level_0_size: usize, level_group_size: NonZeroUsize, - computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], f64, f64) -> Result<()>, -) -> Result>> { - let (field_id, level_0, first_left, first_right) = level_0_range.start().clone(); - assert_eq!(level_0, 0); - assert_eq!(first_left, first_right); + computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], Bound, Bound) -> Result<()>, + bound_from_db_key: &dyn for<'a> Fn(usize, &'a >::DItem) -> Bound, + bitmap_from_db_value: &dyn Fn(>::DItem) -> RoaringBitmap, + write_entry: &dyn Fn(&mut Writer, NonZeroU8, Bound, Bound, RoaringBitmap) -> Result<()>, +) -> Result>> +where + KeyCodec: for<'a> BytesEncode<'a> + + for<'a> BytesDecode<'a, DItem = >::EItem>, + for<'a> >::EItem: Sized, + ValueCodec: for<'a> BytesEncode<'a> + + for<'a> BytesDecode<'a, DItem = >::EItem>, + for<'a> >::EItem: Sized, + Bound: Copy, +{ if level == 0 { + // base case for the recursion + let mut bitmaps = vec![]; - let mut first_f64_value = first_left; - let mut last_f64_value = first_left; - + let mut start_bound = bound_from_db_key(0, &level_0_start); + let mut end_bound = bound_from_db_key(0, &level_0_start); let mut first_iteration_for_new_group = true; - for db_result_item in db.range(rtxn, &level_0_range)? 
{ - let ((_field_id, _level, left, _right), docids) = db_result_item?; - // println!("level0: {left}"); - assert_eq!(_level, 0); - assert_eq!(left, _right); + for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { + let (key, value) = db_result_item?; + + let bound = bound_from_db_key(i, &key); + let docids = bitmap_from_db_value(value); + if first_iteration_for_new_group { - first_f64_value = left; + start_bound = bound; first_iteration_for_new_group = false; } - last_f64_value = left; + end_bound = bound; bitmaps.push(docids); if bitmaps.len() == level_group_size.get() { - // println!("callback first level with {bitmaps:?} {last_f64_value:?}"); - computed_group_bitmap(&bitmaps, first_f64_value, last_f64_value)?; + computed_group_bitmap(&bitmaps, start_bound, end_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); } } if !bitmaps.is_empty() { - // println!("end callback first level with {bitmaps:?} {last_f64_value:?}"); - computed_group_bitmap(&bitmaps, first_f64_value, last_f64_value)?; + computed_group_bitmap(&bitmaps, start_bound, end_bound)?; bitmaps.clear(); } - - // level 0 isn't actually stored in this DB, since it contains exactly the same information as that other DB + // level 0 is already stored in the DB return Ok(vec![]); } else { let mut cur_writer = @@ -315,7 +327,9 @@ fn recursive_compute_levels<'t>( compression_type, compression_level, level - 1, + level_0_start, level_0_range, + level_0_size, level_group_size, &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { let mut combined_bitmap = RoaringBitmap::default(); @@ -326,36 +340,33 @@ fn recursive_compute_levels<'t>( bitmaps.push(combined_bitmap); if bitmaps.len() == level_group_size.get() { - let start_range = range_for_bitmaps.first().unwrap().0; - let end_range = range_for_bitmaps.last().unwrap().1; - // println!("callback level {} with {bitmaps:?} {last_f64_value:?}", level + 1); - computed_group_bitmap(&bitmaps, start_range, end_range)?; - for (bitmap, (start_range, end_range)) in + let start_bound = range_for_bitmaps.first().unwrap().0; + let end_bound = range_for_bitmaps.last().unwrap().1; + computed_group_bitmap(&bitmaps, start_bound, end_bound)?; + for (bitmap, (start_bound, end_bound)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - // println!("write {field_id} {level} {start_range} {end_range} {bitmap:?}"); - write_number_entry( + write_entry( &mut cur_writer, - field_id, - level, - start_range, - end_range, - &bitmap, + NonZeroU8::new(level).unwrap(), + start_bound, + end_bound, + bitmap, )?; } } - // println!("end callback level {level}"); Ok(()) }, + bound_from_db_key, + bitmap_from_db_value, + write_entry, )?; if !bitmaps.is_empty() { let start_range = range_for_bitmaps.first().unwrap().0; let end_range = range_for_bitmaps.last().unwrap().1; - // println!("end callback level {} with {bitmaps:?} {last_f64_value:?}", level + 1); computed_group_bitmap(&bitmaps, start_range, end_range)?; for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - // println!("end write: {field_id} {level} {left} {right} {bitmap:?}"); - write_number_entry(&mut cur_writer, field_id, level, left, right, &bitmap)?; + write_entry(&mut cur_writer, NonZeroU8::new(level).unwrap(), left, right, bitmap)?; } } @@ -364,113 +375,15 @@ fn recursive_compute_levels<'t>( } } -fn compute_facet_number_levels<'t>( - rtxn: &'t heed::RoTxn, +fn clear_field_number_levels<'t>( + wtxn: &'t mut heed::RwTxn, db: heed::Database, - compression_type: 
CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, field_id: FieldId, -) -> Result> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? - .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // therefore we write the facet levels entries into a grenad file before transfering them. - let mut writer = create_writer(compression_type, compression_level, tempfile::tempfile()?); - - let level_0_range = { - let left = (field_id, 0, f64::MIN, f64::MIN); - let right = (field_id, 0, f64::MAX, f64::MAX); - left..=right - }; - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - - for (level, group_size) in group_size_iter { - // dbg!(level, group_size); - let mut left = 0.0; - let mut right = 0.0; - let mut group_docids = RoaringBitmap::new(); - - for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { - let ((_field_id, _level, value, _right), docids) = result?; - - if i == 0 { - left = value; - } else if i % group_size == 0 { - // we found the first bound of the next group, we must store the left - // and right bounds associated with the docids. - write_number_entry(&mut writer, field_id, level, left, right, &group_docids)?; - - // We save the left bound for the new group and also reset the docids. - group_docids = RoaringBitmap::new(); - left = value; - } - - // The right bound is always the bound we run through. - group_docids |= docids; - right = value; - } - - if !group_docids.is_empty() { - write_number_entry(&mut writer, field_id, level, left, right, &group_docids)?; - } - } - - writer_into_reader(writer) -} - -fn write_number_entry( - writer: &mut Writer, - field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; - // println!(" w{field_id}-{level}-{left}-{right}"); - writer.insert(&key, &data)?; - Ok(()) -} - -fn compute_faceted_strings_documents_ids( - rtxn: &heed::RoTxn, - db: heed::Database, - field_id: FieldId, -) -> Result { - let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { - let (_key, (_original_value, docids)) = result?; - documents_ids |= docids; - } - - Ok(documents_ids) -} - -fn compute_faceted_numbers_documents_ids( - rtxn: &heed::RoTxn, - db: heed::Database, - field_id: FieldId, -) -> Result { - let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? 
{ - let (_key, docids) = result?; - documents_ids |= docids; - } - - Ok(documents_ids) +) -> heed::Result<()> { + let left = (field_id, 1, f64::MIN, f64::MIN); + let right = (field_id, u8::MAX, f64::MAX, f64::MAX); + let range = left..=right; + db.delete_range(wtxn, &range).map(drop) } fn clear_field_string_levels<'t>( @@ -484,68 +397,20 @@ fn clear_field_string_levels<'t>( db.remap_key_type::().delete_range(wtxn, &range).map(drop) } -fn compute_facet_string_levels<'t>( - rtxn: &'t heed::RoTxn, - db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, +fn write_number_entry( + writer: &mut Writer, field_id: FieldId, -) -> Result> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? - .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // therefore we write the facet levels entries into a grenad file before transfering them. - let mut writer = create_writer(compression_type, compression_level, tempfile::tempfile()?); - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - - for (level, group_size) in group_size_iter { - let level = NonZeroU8::new(level).unwrap(); - let mut left = (0, ""); - let mut right = (0, ""); - let mut group_docids = RoaringBitmap::new(); - - // Because we know the size of the level 0 we can use a range iterator that starts - // at the first value of the level and goes to the last by simply counting. - for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { - let ((_field_id, value), (_original_value, docids)) = result?; - - if i == 0 { - left = (i as u32, value); - } else if i % group_size == 0 { - // we found the first bound of the next group, we must store the left - // and right bounds associated with the docids. We also reset the docids. - let docids = mem::take(&mut group_docids); - write_string_entry(&mut writer, field_id, level, left, right, docids)?; - - // We save the left bound for the new group. - left = (i as u32, value); - } - - // The right bound is always the bound we run through. 
- group_docids |= docids; - right = (i as u32, value); - } - - if !group_docids.is_empty() { - let docids = mem::take(&mut group_docids); - write_string_entry(&mut writer, field_id, level, left, right, docids)?; - } - } - - writer_into_reader(writer) + level: u8, + left: f64, + right: f64, + ids: &RoaringBitmap, +) -> Result<()> { + let key = (field_id, level, left, right); + let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) } - fn write_string_entry( writer: &mut Writer, field_id: FieldId, From 39687908f1af00264f1bdd1eacdd57c51dfe98cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 20 Jul 2022 09:49:40 +0200 Subject: [PATCH 1568/1889] Add documentation and comments to facets.rs --- milli/src/update/facets.rs | 193 +++++++++++++++++++++++++++++++++++-- 1 file changed, 184 insertions(+), 9 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index b3d9f1c58..56529a3c5 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,12 +1,138 @@ +/*! +This module initialises the databases that are used to quickly get the list +of documents with a faceted field value falling within a certain range. For +example, they can be used to implement filters such as `x >= 3`. + +These databases are `facet_id_string_docids` and `facet_id_f64_docids`. + +## Example with numbers + +In the case of numbers, we start with a sorted list whose keys are +`(field_id, number_value)` and whose value is a roaring bitmap of the document ids +which contain the value `number_value` for the faceted field `field_id`. + +From this list, we want to compute two things: + +1. the bitmap of all documents that contain **any** number for each faceted field +2. a structure that allows us to use a (sort of) binary search to find all documents +containing numbers inside a certain range for a faceted field + +To achieve goal (2), we recursively split the list into chunks. Every time we split it, we +create a new "level" that is several times smaller than the level below it. The base level, +level 0, is the starting list. Level 1 is composed of chunks of up to N elements. Each element +contains a range and a bitmap of docids. Level 2 is composed of chunks up to N^2 elements, etc. + +For example, let's say we have 26 documents which we identify through the letters a-z. +We will focus on a single faceted field. When there are multiple faceted fields, the structure +described below is simply repeated for each field. 
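To see how the levels described above are used at search time, here is a minimal std-only illustration (an assumed simplification, not milli's actual search code) of answering a range filter with such a structure: groups that lie entirely inside the range are taken wholesale, and only the boundary groups are refined one level down:

```rust
use std::collections::BTreeSet;

// One level is a sorted list of (left bound, right bound, docids) groups;
// `levels[0]` is level 0, where left == right for every entry.
type Level = Vec<(f64, f64, BTreeSet<u32>)>;

fn range_docids(levels: &[Level], height: usize, min: f64, max: f64, out: &mut BTreeSet<u32>) {
    for (left, right, docids) in &levels[height] {
        if *right < min || *left > max {
            // group entirely outside the requested range: skip it
        } else if min <= *left && *right <= max {
            // group entirely inside: take its docids wholesale
            out.extend(docids);
        } else if height > 0 {
            // boundary group: refine it with the finer level below (a real
            // implementation seeks directly into the sub-range instead of
            // rescanning the whole level)
            range_docids(levels, height - 1, min.max(*left), max.min(*right), out);
        }
    }
}
```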
+ +What we want to obtain is the following structure for each faceted field: +```text +┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ +│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ +└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ 1.2 – 2 │ 3.4 – 100 │ 102 – 104 │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ 1.2 – 1.3 │ 1.6 – 2 │ 3.4 – 12 │ 12.3 – 100 │ 102 – 104 │ +│Level 1│ │ │ │ │ │ │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ 1.2 │ 1.3 │ 1.6 │ 2 │ 3.4 │ 12 │ 12.3 │ 100 │ 102 │ 104 │ +│Level 0│ │ │ │ │ │ │ │ │ │ │ │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ +``` + +You can read more about this structure (for strings) in `[crate::search::facet::facet_strings]`. + +To create the levels, we use a recursive algorithm which makes sure that we only need to iterate +over the elements of level 0 once. It is implemented by [`recursive_compute_levels`]. + +## Encoding + +### Numbers +For numbers we use the same encoding for level 0 and the other levels. + +The key is given by `FacetLevelValueF64Codec`. It consists of: +1. The field id : u16 +2. The height of the level : u8 +3. The start bound : f64 +4. The end bound : f64 +Note that at level 0, we have start bound == end bound. + +The value is a serialised `RoaringBitmap`. + +### Strings + +For strings, we use a different encoding for level 0 and the other levels. + +At level 0, the key is given by `FacetStringLevelZeroCodec`. It consists of: +1. The field id : u16 +2. The height of the level : u8 <-- always == 0 +3. The normalised string value : &str + +And the value is given by `FacetStringLevelZeroValueCodec`. It consists of: +1. The original string +2. A serialised `RoaringBitmap` + +At level 1, the key is given by `FacetLevelValueU32Codec`. It consists of: +1. The field id : u16 +2. The height of the level : u8 <-- always >= 1 +3. The start bound : u32 +4. The end bound : u32 +where the bounds are indices inside level 0. + +The value is given by `FacetStringZeroBoundsValueCodec`. +If the level is 1, then it consists of: +1. The normalised string of the start bound +2. The normalised string of the end bound +3. A serialised `RoaringBitmap` + +If the level is higher, then it consists only of the serialised roaring bitmap. + +The distinction between the value encoding of level 1 and the levels above it +is to allow us to retrieve the value in level 0 quickly by reading the key of +level 1 (we obtain the string value of the bound and execute a prefix search +in the database). 
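As an aside on the number encoding described above, a rough sketch of that key layout (a hypothetical helper, not `FacetLevelValueF64Codec` itself; note that the real codec must also make the float bytes order-preserving, which raw `to_be_bytes` is not for negative values):

```rust
fn number_level_key(field_id: u16, level: u8, left: f64, right: f64) -> Vec<u8> {
    let mut key = Vec::with_capacity(2 + 1 + 8 + 8);
    key.extend_from_slice(&field_id.to_be_bytes()); // 1. the field id : u16
    key.push(level);                                // 2. the height of the level : u8
    key.extend_from_slice(&left.to_be_bytes());     // 3. the start bound : f64
    key.extend_from_slice(&right.to_be_bytes());    // 4. the end bound : f64
    key
}
```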
+ +Therefore, for strings, the structure for a single faceted field looks more like this: +```text +┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ +│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ +└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ + + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ 0 – 3 │ 4 – 7 │ 8 – 9 │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ 0 – 1 │ 2 – 3 │ 4 – 5 │ 6 – 7 │ 8 – 9 │ +│Level 1│ │ "ab" – "ac" │ "ba" – "bac" │ "gaf" – "gal" │"form" – "wow" │ "woz" – "zz" │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ +│Level 0│ │ "AB" │ " Ac" │ "ba " │ "Bac" │ " GAF"│ "gal" │ "Form"│ " wow"│ "woz" │ "ZZ" │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ + +The first line in a cell is its key (without the field id and level height) and the last two +lines are its values. +``` +*/ + +use std::cmp; +use std::fs::File; +use std::num::{NonZeroU8, NonZeroUsize}; +use std::ops::RangeFrom; + use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use std::cmp; -use std::fs::File; -use std::num::{NonZeroU8, NonZeroUsize}; -use std::ops::RangeFrom; use time::OffsetDateTime; use crate::error::InternalError; @@ -80,11 +206,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, &string_documents_ids, )?; - for facet_strings_levels in facet_string_levels { + for facet_strings_level in facet_string_levels { write_into_lmdb_database( self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), - facet_strings_levels, + facet_strings_level, |_, _| { Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? }, @@ -94,7 +220,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { // Clear the facet number levels. clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; - let (facet_number_levels_2, number_documents_ids) = compute_facet_number_levels( + let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -110,11 +236,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { &number_documents_ids, )?; - for facet_number_levels in facet_number_levels_2 { + for facet_number_level in facet_number_levels { write_into_lmdb_database( self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), - facet_number_levels, + facet_number_level, |_, _| { Err(InternalError::IndexingMergingKeys { process: "facet number levels" })? }, @@ -257,6 +383,43 @@ fn compute_facet_strings_levels<'t>( } } +/** +Compute a level from the levels below it, with the elements of level 0 already existing in the given `db`. + +This function is generic to work with both numbers and strings. The generic type parameters are: +* `KeyCodec`/`ValueCodec`: the codecs used to read the elements of the database. +* `Bound`: part of the range in the levels structure. 
For example, for numbers, the `Bound` is `f64` +because each chunk in a level contains a range such as (1.2 ..= 4.5). + +## Arguments +* `rtxn` : LMDB read transaction +* `db`: a database which already contains a `level 0` +* `compression_type`/`compression_level`: parameters used to create the `grenad::Writer` that +will contain the new levels +* `level` : the height of the level to create, or `0` to read elements from level 0. +* `level_0_start` : a key in the database that points to the beginning of its level 0 +* `level_0_range` : equivalent to `level_0_start..` +* `level_0_size` : the number of elements in level 0 +* `level_group_size` : the number of elements from the level below that are represented by a +* single element of the new level +* `computed_group_bitmap` : a callback that is called whenever at most `level_group_size` elements +from the level below were read/created. Its arguments are: + 0. the list of bitmaps from each read/created element of the level below + 1. the start bound corresponding to the first element + 2. the end bound corresponding to the last element +* `bound_from_db_key` : finds the `Bound` from a key in the database +* `bitmap_from_db_value` : finds the `RoaringBitmap` from a value in the database +* `write_entry` : writes an element of a level into the writer. The arguments are: + 0. the writer + 1. the height of the level + 2. the start bound + 3. the end bound + 4. the docids of all elements between the start and end bound + +## Return +A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` +that must be inserted into the database. +*/ fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( rtxn: &'t heed::RoTxn, db: heed::Database, @@ -284,6 +447,9 @@ where if level == 0 { // base case for the recursion + // we read the elements one by one and + // 1. keep track of the start and end bounds + // 2. 
fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read let mut bitmaps = vec![]; let mut start_bound = bound_from_db_key(0, &level_0_start); @@ -308,6 +474,7 @@ where bitmaps.clear(); } } + // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { computed_group_bitmap(&bitmaps, start_bound, end_bound)?; bitmaps.clear(); @@ -315,12 +482,19 @@ where // level 0 is already stored in the DB return Ok(vec![]); } else { + // level >= 1 + // we compute each element of this level based on the elements of the level below it + // once we have computed `level_group_size` elements, we give the start and end bounds + // of those elements, and their bitmaps, to the level above + let mut cur_writer = create_writer(compression_type, compression_level, tempfile::tempfile()?); let mut range_for_bitmaps = vec![]; let mut bitmaps = vec![]; + // compute the levels below + // in the callback, we fill `cur_writer` with the correct elements for this level let mut sub_writers = recursive_compute_levels( rtxn, db, @@ -361,6 +535,7 @@ where bitmap_from_db_value, write_entry, )?; + // don't forget to insert the leftover elements into the writer as well if !bitmaps.is_empty() { let start_range = range_for_bitmaps.first().unwrap().0; let end_range = range_for_bitmaps.last().unwrap().1; From 258c3dd5637249d31568b83e6f8b490a41c56903 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 20 Jun 2022 18:46:57 +0200 Subject: [PATCH 1569/1889] Make AND+OR filters n-ary (store a vector of subfilters instead of 2) NOTE: The token_at_depth is method is a bit useless now, as the only cases where there would be a toke at depth 1000 are the cases where the parser already stack-overflowed earlier. Example: (((((... (x=1) ...))))) --- filter-parser/src/lib.rs | 117 +++++++++++++++++++------------ http-ui/src/main.rs | 7 +- milli/src/search/facet/filter.rs | 104 +++++++++++---------------- 3 files changed, 118 insertions(+), 110 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 01be432d7..8da3da35f 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -113,8 +113,8 @@ impl<'a> From> for Token<'a> { #[derive(Debug, Clone, PartialEq, Eq)] pub enum FilterCondition<'a> { Condition { fid: Token<'a>, op: Condition<'a> }, - Or(Box, Box), - And(Box, Box), + Or(Vec), + And(Vec), GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, GeoGreaterThan { point: [Token<'a>; 2], radius: Token<'a> }, } @@ -124,13 +124,23 @@ impl<'a> FilterCondition<'a> { pub fn token_at_depth(&self, depth: usize) -> Option<&Token> { match self { FilterCondition::Condition { fid, .. } if depth == 0 => Some(fid), - FilterCondition::Or(left, right) => { + FilterCondition::Or(subfilters) => { let depth = depth.saturating_sub(1); - right.token_at_depth(depth).or_else(|| left.token_at_depth(depth)) + for f in subfilters.iter() { + if let Some(t) = f.token_at_depth(depth) { + return Some(t); + } + } + None } - FilterCondition::And(left, right) => { + FilterCondition::And(subfilters) => { let depth = depth.saturating_sub(1); - right.token_at_depth(depth).or_else(|| left.token_at_depth(depth)) + for f in subfilters.iter() { + if let Some(t) = f.token_at_depth(depth) { + return Some(t); + } + } + None } FilterCondition::GeoLowerThan { point: [point, _], .. } if depth == 0 => Some(point), FilterCondition::GeoGreaterThan { point: [point, _], .. 
} if depth == 0 => Some(point), @@ -144,13 +154,13 @@ impl<'a> FilterCondition<'a> { match self { Condition { fid, op } => match op.negate() { (op, None) => Condition { fid, op }, - (a, Some(b)) => Or( + (a, Some(b)) => Or(vec![ Condition { fid: fid.clone(), op: a }.into(), Condition { fid, op: b }.into(), - ), + ]), }, - Or(a, b) => And(a.negate().into(), b.negate().into()), - And(a, b) => Or(a.negate().into(), b.negate().into()), + Or(subfilters) => And(subfilters.into_iter().map(|x| x.negate().into()).collect()), + And(subfilters) => Or(subfilters.into_iter().map(|x| x.negate().into()).collect()), GeoLowerThan { point, radius } => GeoGreaterThan { point, radius }, GeoGreaterThan { point, radius } => GeoLowerThan { point, radius }, } @@ -172,26 +182,36 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) /// or = and ("OR" WS+ and)* fn parse_or(input: Span) -> IResult { - let (input, lhs) = parse_and(input)?; + let (input, first_filter) = parse_and(input)?; // if we found a `OR` then we MUST find something next - let (input, ors) = many0(preceded(ws(tuple((tag("OR"), multispace1))), cut(parse_and)))(input)?; + let (input, mut ors) = + many0(preceded(ws(tuple((tag("OR"), multispace1))), cut(parse_and)))(input)?; - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); - Ok((input, expr)) + let filter = if ors.is_empty() { + first_filter + } else { + ors.insert(0, first_filter); + FilterCondition::Or(ors) + }; + + Ok((input, filter)) } /// and = not ("AND" not)* fn parse_and(input: Span) -> IResult { - let (input, lhs) = parse_not(input)?; + let (input, first_filter) = parse_not(input)?; // if we found a `AND` then we MUST find something next - let (input, ors) = + let (input, mut ands) = many0(preceded(ws(tuple((tag("AND"), multispace1))), cut(parse_not)))(input)?; - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); - Ok((input, expr)) + + let filter = if ands.is_empty() { + first_filter + } else { + ands.insert(0, first_filter); + FilterCondition::And(ands) + }; + + Ok((input, filter)) } /// not = ("NOT" WS+ not) | primary @@ -477,7 +497,7 @@ pub mod tests { ( "NOT subscribers 100 TO 1000", Fc::Or( - Fc::Condition { + vec![Fc::Condition { fid: rtok("NOT ", "subscribers"), op: Condition::LowerThan(rtok("NOT subscribers ", "100")), } @@ -486,7 +506,7 @@ pub mod tests { fid: rtok("NOT ", "subscribers"), op: Condition::GreaterThan(rtok("NOT subscribers 100 TO ", "1000")), } - .into(), + .into()], ), ), ( @@ -506,7 +526,7 @@ pub mod tests { // test simple `or` and `and` ( "channel = ponce AND 'dog race' != 'bernese mountain'", - Fc::And( + Fc::And(vec![ Fc::Condition { fid: rtok("", "channel"), op: Condition::Equal(rtok("channel = ", "ponce")), @@ -520,11 +540,11 @@ pub mod tests { )), } .into(), - ), + ]), ), ( "channel = ponce OR 'dog race' != 'bernese mountain'", - Fc::Or( + Fc::Or(vec![ Fc::Condition { fid: rtok("", "channel"), op: Condition::Equal(rtok("channel = ", "ponce")), @@ -538,12 +558,12 @@ pub mod tests { )), } .into(), - ), + ]), ), ( "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000", - Fc::Or( - Fc::And( + Fc::Or(vec![ + Fc::And(vec![ Fc::Condition { fid: rtok("", "channel"), op: Condition::Equal(rtok("channel = ", "ponce")), @@ -557,7 +577,7 @@ pub mod tests { )), } .into(), - ) + ]) .into(), Fc::Condition { fid: rtok( @@ -570,30 +590,30 @@ pub mod tests { )), } .into(), - ), + ]), ), // test parenthesis 
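            // (note how a parenthesised OR becomes a single n-ary `Fc::Or` node
            // nested inside the enclosing `Fc::And`, rather than a chain of
            // binary nodes)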
( "channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )", - Fc::And( + Fc::And(vec![ Fc::Condition { fid: rtok("", "channel"), op: Condition::Equal(rtok("channel = ", "ponce")) }.into(), - Fc::Or( - Fc::Condition { fid: rtok("channel = ponce AND ( '", "dog race"), op: Condition::NotEqual(rtok("channel = ponce AND ( 'dog race' != '", "bernese mountain"))}.into(), - Fc::Condition { fid: rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Condition::GreaterThan(rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(), - ).into()), + Fc::Or(vec![ + Fc::Condition { fid: rtok("channel = ponce AND ( '", "dog race"), op: Condition::NotEqual(rtok("channel = ponce AND ( 'dog race' != '", "bernese mountain"))}.into(), + Fc::Condition { fid: rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Condition::GreaterThan(rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(),] + ).into()]), ), ( "(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)", - Fc::And( - Fc::Or( - Fc::And( + Fc::And(vec![ + Fc::Or(vec![ + Fc::And(vec![ Fc::Condition { fid: rtok("(", "channel"), op: Condition::Equal(rtok("(channel = ", "ponce")) }.into(), Fc::Condition { fid: rtok("(channel = ponce AND '", "dog race"), op: Condition::NotEqual(rtok("(channel = ponce AND 'dog race' != '", "bernese mountain")) }.into(), - ).into(), + ]).into(), Fc::Condition { fid: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Condition::GreaterThan(rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(), - ).into(), + ]).into(), Fc::GeoLowerThan { point: [rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(", "12"), rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, ", "13")], radius: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, ", "14") }.into() - ) + ]) ) ]; @@ -657,6 +677,15 @@ pub mod tests { #[test] fn depth() { let filter = FilterCondition::parse("account_ids=1 OR account_ids=2 OR account_ids=3 OR account_ids=4 OR account_ids=5 OR account_ids=6").unwrap().unwrap(); - assert!(filter.token_at_depth(5).is_some()); + assert!(filter.token_at_depth(1).is_some()); + assert!(filter.token_at_depth(2).is_none()); + + let filter = FilterCondition::parse("(account_ids=1 OR (account_ids=2 AND account_ids=3) OR (account_ids=4 AND account_ids=5) OR account_ids=6)").unwrap().unwrap(); + assert!(filter.token_at_depth(2).is_some()); + assert!(filter.token_at_depth(3).is_none()); + + let filter = FilterCondition::parse("account_ids=1 OR account_ids=2 AND account_ids=3 OR account_ids=4 AND account_ids=5 OR account_ids=6").unwrap().unwrap(); + assert!(filter.token_at_depth(2).is_some()); + assert!(filter.token_at_depth(3).is_none()); } } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 83fce9a9c..da5595cc0 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -745,10 +745,9 @@ async fn main() -> anyhow::Result<()> { }; let condition = match (filters, facet_filters) { - (Some(filters), Some(facet_filters)) => Some(FilterCondition::And( - Box::new(filters.into()), - Box::new(facet_filters.into()), - )), + (Some(filters), Some(facet_filters)) => { + 
Some(FilterCondition::And(vec![filters.into(), facet_filters.into()])) + } (Some(condition), None) | (None, Some(condition)) => Some(condition.into()), _otherwise => None, }; diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 90aab826a..d14b33f80 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -89,52 +89,44 @@ impl<'a> Filter<'a> { I: IntoIterator>, J: IntoIterator, { - let mut ands: Option = None; + let mut ands = vec![]; for either in array { match either { Either::Left(array) => { - let mut ors = None; + let mut ors = vec![]; for rule in array { if let Some(filter) = Self::from_str(rule.as_ref())? { - let condition = filter.condition; - ors = match ors.take() { - Some(ors) => { - Some(FilterCondition::Or(Box::new(ors), Box::new(condition))) - } - None => Some(condition), - }; + ors.push(filter.condition); } } - if let Some(rule) = ors { - ands = match ands.take() { - Some(ands) => { - Some(FilterCondition::And(Box::new(ands), Box::new(rule))) - } - None => Some(rule), - }; + if ors.len() > 1 { + ands.push(FilterCondition::Or(ors)); + } else if ors.len() == 1 { + ands.push(ors[0].clone()); } } Either::Right(rule) => { if let Some(filter) = Self::from_str(rule.as_ref())? { - let condition = filter.condition; - ands = match ands.take() { - Some(ands) => { - Some(FilterCondition::And(Box::new(ands), Box::new(condition))) - } - None => Some(condition), - }; + ands.push(filter.condition); } } } } + let and = if ands.is_empty() { + return Ok(None); + } else if ands.len() == 1 { + ands[0].clone() + } else { + FilterCondition::And(ands) + }; - if let Some(token) = ands.as_ref().and_then(|fc| fc.token_at_depth(MAX_FILTER_DEPTH)) { + if let Some(token) = and.token_at_depth(MAX_FILTER_DEPTH) { return Err(token.as_external_error(FilterError::TooDeep).into()); } - Ok(ands.map(|ands| Self { condition: ands })) + Ok(Some(Self { condition: and })) } pub fn from_str(expression: &'a str) -> Result> { @@ -397,38 +389,28 @@ impl<'a> Filter<'a> { } } } - FilterCondition::Or(lhs, rhs) => { - let lhs = Self::inner_evaluate( - &(lhs.as_ref().clone()).into(), - rtxn, - index, - filterable_fields, - )?; - let rhs = Self::inner_evaluate( - &(rhs.as_ref().clone()).into(), - rtxn, - index, - filterable_fields, - )?; - Ok(lhs | rhs) - } - FilterCondition::And(lhs, rhs) => { - let lhs = Self::inner_evaluate( - &(lhs.as_ref().clone()).into(), - rtxn, - index, - filterable_fields, - )?; - if lhs.is_empty() { - return Ok(lhs); + FilterCondition::Or(subfilters) => { + let mut bitmap = RoaringBitmap::new(); + for f in subfilters { + bitmap |= Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; + } + Ok(bitmap) + } + FilterCondition::And(subfilters) => { + let mut subfilters_iter = subfilters.iter(); + if let Some(first_subfilter) = subfilters_iter.next() { + let mut bitmap = + Self::inner_evaluate(&(first_subfilter.clone()).into(), rtxn, index, filterable_fields)?; + for f in subfilters_iter { + if bitmap.is_empty() { + return Ok(bitmap); + } + bitmap &= Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; + } + Ok(bitmap) + } else { + Ok(RoaringBitmap::new()) } - let rhs = Self::inner_evaluate( - &(rhs.as_ref().clone()).into(), - rtxn, - index, - filterable_fields, - )?; - Ok(lhs & rhs) } FilterCondition::GeoLowerThan { point, radius } => { if filterable_fields.contains("_geo") { @@ -732,12 +714,10 @@ mod tests { } } - let error = Filter::from_str(&filter_string).unwrap_err(); - assert!( - 
error.to_string().starts_with("Too many filter conditions"), - "{}", - error.to_string() - ); + // Note: the filter used to be rejected for being too deep, but that is + // no longer the case + let filter = Filter::from_str(&filter_string).unwrap(); + assert!(filter.is_some()); } #[test] From 01675771d5e82b36dbee2bb8f9e61a33b0349f3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 15:08:40 +0200 Subject: [PATCH 1570/1889] Reimplement `!=` filter to select all docids not selected by `=` --- milli/src/search/facet/filter.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d14b33f80..371cf975e 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -309,11 +309,12 @@ impl<'a> Filter<'a> { return Ok(string_docids | number_docids); } Condition::NotEqual(val) => { - let all_numbers_ids = index.number_faceted_documents_ids(rtxn, field_id)?; - let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; - return Ok((all_numbers_ids | all_strings_ids) - docids); + let docids = Self::evaluate_operator( + rtxn, index, field_id, &operator, + )?; + let all_ids = index.documents_ids(rtxn)?; + return Ok(all_ids - docids); } }; From 44744d9e67c0cf1dafd2ed712b8094d4b2b21ebb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 15:15:05 +0200 Subject: [PATCH 1571/1889] Implement the simplified NOT operator --- filter-parser/src/lib.rs | 23 ++++------------------- milli/src/search/facet/filter.rs | 10 ++++++++++ 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 8da3da35f..49eba1c61 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -112,6 +112,7 @@ impl<'a> From> for Token<'a> { #[derive(Debug, Clone, PartialEq, Eq)] pub enum FilterCondition<'a> { + Not(Box), Condition { fid: Token<'a>, op: Condition<'a> }, Or(Vec), And(Vec), @@ -148,24 +149,6 @@ impl<'a> FilterCondition<'a> { } } - pub fn negate(self) -> FilterCondition<'a> { - use FilterCondition::*; - - match self { - Condition { fid, op } => match op.negate() { - (op, None) => Condition { fid, op }, - (a, Some(b)) => Or(vec![ - Condition { fid: fid.clone(), op: a }.into(), - Condition { fid, op: b }.into(), - ]), - }, - Or(subfilters) => And(subfilters.into_iter().map(|x| x.negate().into()).collect()), - And(subfilters) => Or(subfilters.into_iter().map(|x| x.negate().into()).collect()), - GeoLowerThan { point, radius } => GeoGreaterThan { point, radius }, - GeoGreaterThan { point, radius } => GeoLowerThan { point, radius }, - } - } - pub fn parse(input: &'a str) -> Result, Error> { if input.trim().is_empty() { return Ok(None); @@ -219,7 +202,9 @@ fn parse_and(input: Span) -> IResult { /// If we parse a `NOT` we MUST parse something behind. 
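/// Note that `NOT` may be chained: `NOT NOT x` parses to `Not(Not(x))`, and
/// evaluation cancels the two set differences, since
/// all - (all - eval(x)) == eval(x) whenever eval(x) is a subset of all.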
fn parse_not(input: Span) -> IResult<Span, FilterCondition> { alt(( - map(preceded(ws(tuple((tag("NOT"), multispace1))), cut(parse_not)), |e| e.negate()), + map(preceded(ws(tuple((tag("NOT"), multispace1))), cut(parse_not)), |e| { + FilterCondition::Not(Box::new(e)) + }), parse_primary, ))(input) } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 371cf975e..23917e4aa 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -360,6 +360,16 @@ impl<'a> Filter<'a> { filterable_fields: &HashSet<String>, ) -> Result<RoaringBitmap> { match &self.condition { + FilterCondition::Not(f) => { + let all_ids = index.documents_ids(rtxn)?; + let selected = Self::inner_evaluate( + &(f.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; + return Ok(all_ids - selected); + } FilterCondition::Condition { fid, op } => { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; From cc7415bb3185918117f5b60562ffc04704d42994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 15:28:34 +0200 Subject: [PATCH 1572/1889] Simplify FilterCondition code, made possible by the new NOT operator --- filter-parser/src/condition.rs | 25 +++++-------------------- filter-parser/src/lib.rs | 24 +++++++++++------------- milli/src/search/facet/filter.rs | 19 ------------------- 3 files changed, 16 insertions(+), 52 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index cbf73b96a..e967bd074 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -21,30 +21,12 @@ pub enum Condition<'a> { Equal(Token<'a>), NotEqual(Token<'a>), Exists, - NotExists, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, } -impl<'a> Condition<'a> { - /// This method can return two operations in case it must express - /// an OR operation for the between case (i.e. `TO`). - pub fn negate(self) -> (Self, Option<Self>) { - match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), - GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(s) => (NotEqual(s), None), - NotEqual(s) => (Equal(s), None), - Exists => (NotExists, None), - NotExists => (Exists, None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), - } - } -} -/// condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value +/// condition = value ("=" | "!=" | ">" ...)
value pub fn parse_condition(input: Span) -> IResult { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?; @@ -73,7 +55,10 @@ pub fn parse_not_exists(input: Span) -> IResult { let (input, key) = parse_value(input)?; let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; - Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) + Ok(( + input, + FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key.into(), op: Exists })), + )) } /// to = value value "TO" WS+ value diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 49eba1c61..9b33f6d24 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -117,7 +117,6 @@ pub enum FilterCondition<'a> { Or(Vec), And(Vec), GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, - GeoGreaterThan { point: [Token<'a>; 2], radius: Token<'a> }, } impl<'a> FilterCondition<'a> { @@ -144,7 +143,6 @@ impl<'a> FilterCondition<'a> { None } FilterCondition::GeoLowerThan { point: [point, _], .. } if depth == 0 => Some(point), - FilterCondition::GeoGreaterThan { point: [point, _], .. } if depth == 0 => Some(point), _ => None, } } @@ -443,17 +441,17 @@ pub mod tests { ), ( "NOT subscribers EXISTS", - Fc::Condition { + Fc::Not(Box::new(Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::NotExists, - }, + op: Condition::Exists, + })), ), ( "subscribers NOT EXISTS", - Fc::Condition { + Fc::Not(Box::new(Fc::Condition { fid: rtok("", "subscribers"), - op: Condition::NotExists, - }, + op: Condition::Exists, + })), ), ( "NOT subscribers NOT EXISTS", @@ -464,10 +462,10 @@ pub mod tests { ), ( "subscribers NOT EXISTS", - Fc::Condition { + Fc::Not(Box::new(Fc::Condition { fid: rtok("", "subscribers"), - op: Condition::NotExists, - }, + op: Condition::Exists, + })), ), ( "subscribers 100 TO 1000", @@ -503,10 +501,10 @@ pub mod tests { ), ( "NOT _geoRadius(12, 13, 14)", - Fc::GeoGreaterThan { + Fc::Not(Box::new(Fc::GeoLowerThan { point: [rtok("NOT _geoRadius(", "12"), rtok("NOT _geoRadius(12, ", "13")], radius: rtok("NOT _geoRadius(12, 13, ", "14"), - }, + })), ), // test simple `or` and `and` ( diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 23917e4aa..ac3215dea 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -276,14 +276,6 @@ impl<'a> Filter<'a> { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); } - Condition::NotExists => { - let all_ids = index.documents_ids(rtxn)?; - - let exist = Self::evaluate_operator(rtxn, index, field_id, &Condition::Exists)?; - - let notexist = all_ids - exist; - return Ok(notexist); - } Condition::Equal(val) => { let (_original_value, string_docids) = strings_db .get(rtxn, &(field_id, &val.value().to_lowercase()))? 
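// A self-contained sketch of the complement evaluation that both
// `Condition::NotEqual` and `FilterCondition::Not` rely on, with
// `HashSet<u32>` standing in for milli's `RoaringBitmap`:
//
//     use std::collections::HashSet;
//
//     fn complement(all_ids: &HashSet<u32>, selected: &HashSet<u32>) -> HashSet<u32> {
//         // NOT f == every known document id minus the ids selected by f
//         all_ids.difference(selected).copied().collect()
//     }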
@@ -460,17 +452,6 @@ impl<'a> Filter<'a> { }))?; } } - FilterCondition::GeoGreaterThan { point, radius } => { - let result = Self::inner_evaluate( - &FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() } - .into(), - rtxn, - index, - filterable_fields, - )?; - let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; - Ok(geo_faceted_doc_ids - result) - } } } } From 90a304cb074def76a41dd0d3da264fc9ba479633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:48:56 +0200 Subject: [PATCH 1573/1889] Fix tests after simplification of NOT filter --- filter-parser/src/lib.rs | 53 ++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 9b33f6d24..c5eeb84a9 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -371,10 +371,10 @@ pub mod tests { ), ( "NOT channel = ponce", - Fc::Condition { + Fc::Not(Box::new(Fc::Condition { fid: rtok("NOT ", "channel"), - op: Condition::NotEqual(rtok("NOT channel = ", "ponce")), - }, + op: Condition::Equal(rtok("NOT channel = ", "ponce")), + })), ), ( "subscribers < 1000", @@ -406,31 +406,31 @@ pub mod tests { ), ( "NOT subscribers < 1000", - Fc::Condition { + Fc::Not(Box::new(Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::GreaterThanOrEqual(rtok("NOT subscribers < ", "1000")), - }, + op: Condition::LowerThan(rtok("NOT subscribers < ", "1000")), + })), ), ( "NOT subscribers > 1000", - Fc::Condition { + Fc::Not(Box::new(Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::LowerThanOrEqual(rtok("NOT subscribers > ", "1000")), - }, + op: Condition::GreaterThan(rtok("NOT subscribers > ", "1000")), + })), ), ( "NOT subscribers <= 1000", - Fc::Condition { + Fc::Not(Box::new(Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::GreaterThan(rtok("NOT subscribers <= ", "1000")), - }, + op: Condition::LowerThanOrEqual(rtok("NOT subscribers <= ", "1000")), + })), ), ( "NOT subscribers >= 1000", - Fc::Condition { + Fc::Not(Box::new(Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::LowerThan(rtok("NOT subscribers >= ", "1000")), - }, + op: Condition::GreaterThanOrEqual(rtok("NOT subscribers >= ", "1000")), + })), ), ( "subscribers EXISTS", @@ -455,10 +455,10 @@ pub mod tests { ), ( "NOT subscribers NOT EXISTS", - Fc::Condition { + Fc::Not(Box::new(Fc::Not(Box::new(Fc::Condition { fid: rtok("NOT ", "subscribers"), op: Condition::Exists, - }, + })))), ), ( "subscribers NOT EXISTS", @@ -479,18 +479,13 @@ pub mod tests { ), ( "NOT subscribers 100 TO 1000", - Fc::Or( - vec![Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::LowerThan(rtok("NOT subscribers ", "100")), - } - .into(), - Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::GreaterThan(rtok("NOT subscribers 100 TO ", "1000")), - } - .into()], - ), + Fc::Not(Box::new(Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::Between { + from: rtok("NOT subscribers ", "100"), + to: rtok("NOT subscribers 100 TO ", "1000"), + }, + })), ), ( "_geoRadius(12, 13, 14)", From ca97cb0eda3fb18f20928e6526389d2a09be07ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 30 May 2022 13:58:11 +0200 Subject: [PATCH 1574/1889] Implement the IN filter operator --- filter-parser/src/lib.rs | 80 ++++++++++++++++++++++++++++++-- milli/src/search/facet/filter.rs | 26 +++++++++++ 2 files changed, 103 insertions(+), 3 
deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index c5eeb84a9..bfb02d63c 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -6,12 +6,14 @@ //! or = and ("OR" WS+ and)* //! and = not ("AND" WS+ not)* //! not = ("NOT" WS+ not) | primary -//! primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to +//! primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | in | condition | exists | not_exists | to +//! in = value "IN" WS* "[" value_list "]" //! condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value //! exists = value "EXISTS" //! not_exists = value "NOT" WS+ "EXISTS" //! to = value value "TO" WS+ value //! value = WS* ( word | singleQuoted | doubleQuoted) WS+ +//! value_list = (value ("," value)* ","?)? //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ @@ -51,7 +53,7 @@ pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; use nom::character::complete::{char, multispace0, multispace1}; -use nom::combinator::{cut, eof, map}; +use nom::combinator::{cut, eof, map, opt}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, terminated, tuple}; @@ -114,6 +116,7 @@ impl<'a> From> for Token<'a> { pub enum FilterCondition<'a> { Not(Box), Condition { fid: Token<'a>, op: Condition<'a> }, + In { fid: Token<'a>, els: Vec> }, Or(Vec), And(Vec), GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, @@ -161,7 +164,36 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" WS+ and)* + +/// value_list = (value ("," value)* ","?)? 
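+/// For example `[]`, `[x]`, `[x, y]` and `[x, y,]` are all accepted by this
+/// rule, while `[,]` is not, since every comma must be preceded by a value.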
+fn parse_value_list<'a>(input: Span<'a>) -> IResult>> { + let (input, first_value) = opt(parse_value)(input)?; + if let Some(first_value) = first_value { + let value_list_el_parser = preceded(ws(tag(",")), parse_value); + + let (input, mut values) = many0(value_list_el_parser)(input)?; + let (input, _) = opt(ws(tag(",")))(input)?; + values.insert(0, first_value); + + Ok((input, values)) + } else { + Ok((input, vec![])) + } +} + +/// in = value "IN" "[" value_list "]" +fn parse_in(input: Span) -> IResult { + let (input, value) = parse_value(input)?; + let (input, _) = ws(tag("IN"))(input)?; + + let mut els_parser = delimited(tag("["), parse_value_list, tag("]")); + + let (input, content) = els_parser(input)?; + let filter = FilterCondition::In { fid: value, els: content }; + Ok((input, filter)) +} + +/// or = and ("OR" and) fn parse_or(input: Span) -> IResult { let (input, first_filter) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -257,6 +289,7 @@ fn parse_primary(input: Span) -> IResult { }), ), parse_geo_radius, + parse_in, parse_condition, parse_exists, parse_not_exists, @@ -297,6 +330,47 @@ pub mod tests { let test_case = [ // simple test + ( + "colour IN[]", + Fc::In { + fid: rtok("", "colour"), + els: vec![] + } + ), + ( + "colour IN[green]", + Fc::In { + fid: rtok("", "colour"), + els: vec![rtok("colour IN[", "green")] + } + ), + ( + "colour IN[green,]", + Fc::In { + fid: rtok("", "colour"), + els: vec![rtok("colour IN[", "green")] + } + ), + ( + "colour IN[green,blue]", + Fc::In { + fid: rtok("", "colour"), + els: vec![ + rtok("colour IN[", "green"), + rtok("colour IN[green, ", "blue"), + ] + } + ), + ( + " colour IN [ green , blue , ]", + Fc::In { + fid: rtok(" ", "colour"), + els: vec![ + rtok("colour IN [ ", "green"), + rtok("colour IN [ green , ", "blue"), + ] + } + ), ( "channel = Ponce", Fc::Condition { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index ac3215dea..25ffe1842 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -362,6 +362,32 @@ impl<'a> Filter<'a> { )?; return Ok(all_ids - selected); } + FilterCondition::In { fid, els } => { + // TODO: this could be optimised + let filterable_fields = index.filterable_fields(rtxn)?; + + if crate::is_faceted(fid.value(), &filterable_fields) { + let field_ids_map = index.fields_ids_map(rtxn)?; + + if let Some(fid) = field_ids_map.id(fid.value()) { + let mut bitmap = RoaringBitmap::new(); + + for el in els { + let op = Condition::Equal(el.clone()); + let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; + bitmap |= el_bitmap; + } + Ok(bitmap) + } else { + Ok(RoaringBitmap::new()) + } + } else { + return Err(fid.as_external_error(FilterError::AttributeNotFilterable { + attribute: fid.value(), + filterable_fields, + }))?; + } + } FilterCondition::Condition { fid, op } => { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; From 2fd20fadfc12a11b153823d6bc7355ed51786665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:58:47 +0200 Subject: [PATCH 1575/1889] Implement the NOT IN syntax for negated IN filter --- filter-parser/src/lib.rs | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index bfb02d63c..4b78b86b8 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -192,6 +192,19 @@ fn parse_in(input: Span) -> IResult { let filter = 
FilterCondition::In { fid: value, els: content }; Ok((input, filter)) } +/// not_in = value "NOT" WS+ "IN" "[" value_list "]" +fn parse_not_in(input: Span) -> IResult<Span, FilterCondition> { + let (input, value) = parse_value(input)?; + let (input, _) = tag("NOT")(input)?; + let (input, _) = multispace1(input)?; + let (input, _) = ws(tag("IN"))(input)?; + + let mut els_parser = delimited(tag("["), parse_value_list, tag("]")); + + let (input, content) = els_parser(input)?; + let filter = FilterCondition::Not(Box::new(FilterCondition::In { fid: value, els: content })); + Ok((input, filter)) +} /// or = and ("OR" and) fn parse_or(input: Span) -> IResult<Span, FilterCondition> { @@ -290,6 +303,7 @@ fn parse_primary(input: Span) -> IResult<Span, FilterCondition> { ), parse_geo_radius, parse_in, + parse_not_in, parse_condition, parse_exists, parse_not_exists, @@ -361,6 +375,16 @@ pub mod tests { ] } ), + ( + "colour NOT IN[green,blue]", + Fc::Not(Box::new(Fc::In { + fid: rtok("", "colour"), + els: vec![ + rtok("colour NOT IN[", "green"), + rtok("colour NOT IN[green, ", "blue"), + ] + })) + ), ( " colour IN [ green , blue , ]", Fc::In { From 4ecfb95d0ccdeb0aad0fb7ed8cf22c1bc2e48d29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 10:13:34 +0200 Subject: [PATCH 1576/1889] Improve syntax errors for `IN` filter --- filter-parser/src/error.rs | 19 ++++++++++++-- filter-parser/src/lib.rs | 52 +++++++++++++++++++++++++++++++++++--- filter-parser/src/value.rs | 22 +++++++++++++--- 3 files changed, 83 insertions(+), 10 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index a3720f7bf..0d2959126 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -57,6 +57,10 @@ pub enum ErrorKind<'a> { ExpectedEof, ExpectedValue, MalformedValue, + InOpeningBracket, + InClosingBracket, + InExpectedValue, + ReservedKeyword(String), MissingClosingDelimiter(char), Char(char), InternalError(error::ErrorKind), @@ -109,12 +113,11 @@ impl<'a> ParseError<Span<'a>> for Error<'a> { impl<'a> Display for Error<'a> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { let input = self.context.fragment(); - // When printing our error message we want to escape all `\n` to be sure we keep our format with the // first line being the diagnostic and the second line being the incriminated filter. let escaped_input = input.escape_debug(); - match self.kind { + match &self.kind { ErrorKind::ExpectedValue if input.trim().is_empty() => { writeln!(f, "Was expecting a value but instead got nothing.")? } @@ -145,6 +148,18 @@ impl<'a> Display for Error<'a> { ErrorKind::MisusedGeo => { writeln!(f, "The `_geoRadius` filter is an operation and can't be used as a value.")? } + ErrorKind::ReservedKeyword(word) => { + writeln!(f, "`{word}` is a reserved keyword and thus cannot be used as a field name unless it is put inside quotes. Use \"{word}\" or \'{word}\' instead.")? + } + ErrorKind::InOpeningBracket => { + writeln!(f, "Expected `[` after `IN` keyword.")? + } + ErrorKind::InClosingBracket => { + writeln!(f, "Expected matching `]` after the list of field names given to `IN[`")? + } + ErrorKind::InExpectedValue => { + writeln!(f, "Expected only comma-separated field names inside `IN[..]` but instead found `{escaped_input}`")?
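+                // Illustrative inputs for the three `IN` variants above,
+                // matching the parser tests added further down in this patch:
+                //   `colour IN blue, green]`       -> InOpeningBracket
+                //   `colour IN [blue, green`       -> InClosingBracket
+                //   `colour IN [blue, green, AND]` -> InExpectedValue (reports `AND]`)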
+ } ErrorKind::Char(c) => { panic!("Tried to display a char error with `{}`", c) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 4b78b86b8..12edd56c8 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -167,6 +167,12 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) /// value_list = (value ("," value)* ","?)? fn parse_value_list<'a>(input: Span<'a>) -> IResult>> { + + // TODO: here, I should return a failure with a clear explanation whenever possible + // for example: + // * expected the name of a field, but got `AND` + // * expected closing square bracket, but got `AND` + let (input, first_value) = opt(parse_value)(input)?; if let Some(first_value) = first_value { let value_list_el_parser = preceded(ws(tag(",")), parse_value); @@ -186,9 +192,22 @@ fn parse_in(input: Span) -> IResult { let (input, value) = parse_value(input)?; let (input, _) = ws(tag("IN"))(input)?; - let mut els_parser = delimited(tag("["), parse_value_list, tag("]")); + // everything after `IN` can be a failure + let (input, _) = cut_with_err(tag("["), |_| { + Error::new_from_kind(input, ErrorKind::InOpeningBracket) + })(input)?; + + let (input, content) = cut(parse_value_list)(input)?; + + // everything after `IN` can be a failure + let (input, _) = cut_with_err(tag("]"), |_| { + if eof::<_, ()>(input).is_ok() { + Error::new_from_kind(input, ErrorKind::InClosingBracket) + } else { + Error::new_from_kind(input, ErrorKind::InExpectedValue) + } + })(input)?; - let (input, content) = els_parser(input)?; let filter = FilterCondition::In { fid: value, els: content }; Ok((input, filter)) } @@ -199,9 +218,19 @@ fn parse_not_in(input: Span) -> IResult { let (input, _) = multispace1(input)?; let (input, _) = ws(tag("IN"))(input)?; - let mut els_parser = delimited(tag("["), parse_value_list, tag("]")); - let (input, content) = els_parser(input)?; + // everything after `IN` can be a failure + let (input, _) = cut_with_err(tag("["), |_| { + Error::new_from_kind(input, ErrorKind::InOpeningBracket) + })(input)?; + + let (input, content) = cut(parse_value_list)(input)?; + + // everything after `IN` can be a failure + let (input, _) = cut_with_err(tag("]"), |_| { + Error::new_from_kind(input, ErrorKind::InClosingBracket) + })(input)?; + let filter = FilterCondition::Not(Box::new(FilterCondition::In { fid: value, els: content })); Ok((input, filter)) } @@ -313,6 +342,9 @@ fn parse_primary(input: Span) -> IResult { ))(input) // if the inner parsers did not match enough information to return an accurate error .map_err(|e| e.map_err(|_| Error::new_from_kind(input, ErrorKind::InvalidPrimary))) + + // TODO: if the filter starts with a reserved keyword that is not NOT, then we should return the reserved keyword error + // TODO: if the filter is x = reserved, idem } /// expression = or @@ -344,6 +376,13 @@ pub mod tests { let test_case = [ // simple test + ( + "x = AND", + Fc::Not(Box::new(Fc::Not(Box::new(Fc::In { + fid: rtok("NOT NOT", "colour"), + els: vec![] + })))) + ), ( "colour IN[]", Fc::In { @@ -734,6 +773,11 @@ pub mod tests { ("subscribers 100 TO1000", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`."), ("channel = ponce ORdog != 'bernese mountain'", "Found unexpected characters at the end of the filter: `ORdog != \\'bernese mountain\\'`. 
You probably forgot an `OR` or an `AND` rule."), ("channel = ponce AND'dog' != 'bernese mountain'", "Found unexpected characters at the end of the filter: `AND\\'dog\\' != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."), + ("colour IN blue, green]", "Expected `[` after `IN` keyword."), + ("colour IN [blue, green, 'blue' > 2]", "Expected only comma-separated field names inside `IN[..]` but instead found `> 2]`"), + ("colour IN [blue, green, AND]", "Expected only comma-separated field names inside `IN[..]` but instead found `AND]`"), + ("colour IN [blue, green", "Expected matching `]` after the list of field names given to `IN[`"), + ("colour IN ['blue, green", "Expression `\\'blue, green` is missing the following closing delimiter: `'`."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 22da6a0df..8a7e8f586 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -71,9 +71,17 @@ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { _ => (), } - // word = (alphanumeric | _ | - | .)+ + // word = (alphanumeric | _ | - | .)+ except for reserved keywords let word = |input: Span<'a>| -> IResult> { - take_while1(is_value_component)(input).map(|(s, t)| (s, t.into())) + let (input, word): (_, Token<'a>) = + take_while1(is_value_component)(input).map(|(s, t)| (s, t.into()))?; + if is_keyword(word.value()) { + return Err(nom::Err::Error(Error::new_from_kind( + input, + ErrorKind::ReservedKeyword(word.value().to_owned()), + ))); + } + Ok((input, word)) }; // this parser is only used when an error is encountered and it parse the @@ -85,7 +93,7 @@ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // when we create the errors from the output of the alt we have spaces everywhere let error_word = take_till::<_, _, Error>(is_syntax_component); - terminated( + let (input, value) = terminated( alt(( delimited(char('\''), cut(|input| quoted_by('\'', input)), cut(char('\''))), delimited(char('"'), cut(|input| quoted_by('"', input)), cut(char('"'))), @@ -107,7 +115,9 @@ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { failure } }) - }) + })?; + + Ok((input, value)) } fn is_value_component(c: char) -> bool { @@ -118,6 +128,10 @@ fn is_syntax_component(c: char) -> bool { c.is_whitespace() || ['(', ')', '=', '<', '>', '!'].contains(&c) } +fn is_keyword(s: &str) -> bool { + matches!(s, "AND" | "OR" | "IN" | "NOT" | "TO" | "EXISTS" | "_geoRadius") +} + #[cfg(test)] pub mod test { use nom::Finish; From d10d78d520d6d92ddbfca9800e7215bc1625f2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 10:27:04 +0200 Subject: [PATCH 1577/1889] Add integration tests for the IN filter --- milli/tests/assets/test_set.ndjson | 26 +++++++++++++------------- milli/tests/search/filters.rs | 4 +++- milli/tests/search/mod.rs | 12 +++++++++++- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 427daca8c..2e77f9faf 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null]} 
-{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": []} -{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E"} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"]} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null], "tag_in": 1} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": [], "tag_in": 2} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null, "tag_in": 3} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4, "tag_in": "four"} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello 
kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E", "tag_in": "five"} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"], "tag_in": null} {"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8]} -{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] } -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8], "tag_in": 8} 
+{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":"", "tag_in": "nine"} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}, "tag_in": 10} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] , "tag_in": "eleven"} +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}, "tag_in": 12} {"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}} } -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]] } +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}}} +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]]} 
{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16} {"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 1700a1478..5451a9076 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -85,4 +85,6 @@ test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); -test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]); +test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]); +test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]); +test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 728a4eb0b..0e1d43d2a 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -44,7 +44,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("asc_desc_rank"), S("_geo"), S("opt1"), - S("opt1.opt2") + S("opt1.opt2"), + S("tag_in") }); builder.set_sortable_fields(hashset! { S("tag"), @@ -205,6 +206,15 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { } else if let Some(opt1) = &document.opt1 { id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); } + } else if matches!( + filter, + "tag_in IN[1, 2, 3, four, five]" | "NOT tag_in NOT IN[1, 2, 3, four, five]" + ) { + id = matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E") + .then(|| document.id.clone()); + } else if matches!(filter, "tag_in NOT IN[1, 2, 3, four, five]") { + id = (!matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E")) + .then(|| document.id.clone()); } id } From 196f79115a4367ef57fc96ca2cc28c22be10fbe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 17:09:52 +0200 Subject: [PATCH 1578/1889] Run cargo fmt --- filter-parser/src/lib.rs | 70 ++++++++++++++++---------------- milli/src/search/facet/filter.rs | 22 ++++++---- 2 files changed, 50 insertions(+), 42 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 12edd56c8..31ca2919a 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -164,10 +164,8 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } - /// value_list = (value ("," value)* ","?)? 
fn parse_value_list<'a>(input: Span<'a>) -> IResult>> { - // TODO: here, I should return a failure with a clear explanation whenever possible // for example: // * expected the name of a field, but got `AND` @@ -193,12 +191,13 @@ fn parse_in(input: Span) -> IResult { let (input, _) = ws(tag("IN"))(input)?; // everything after `IN` can be a failure - let (input, _) = cut_with_err(tag("["), |_| { - Error::new_from_kind(input, ErrorKind::InOpeningBracket) - })(input)?; + let (input, _) = + cut_with_err(tag("["), |_| Error::new_from_kind(input, ErrorKind::InOpeningBracket))( + input, + )?; let (input, content) = cut(parse_value_list)(input)?; - + // everything after `IN` can be a failure let (input, _) = cut_with_err(tag("]"), |_| { if eof::<_, ()>(input).is_ok() { @@ -218,18 +217,19 @@ fn parse_not_in(input: Span) -> IResult { let (input, _) = multispace1(input)?; let (input, _) = ws(tag("IN"))(input)?; - // everything after `IN` can be a failure - let (input, _) = cut_with_err(tag("["), |_| { - Error::new_from_kind(input, ErrorKind::InOpeningBracket) - })(input)?; + let (input, _) = + cut_with_err(tag("["), |_| Error::new_from_kind(input, ErrorKind::InOpeningBracket))( + input, + )?; let (input, content) = cut(parse_value_list)(input)?; - + // everything after `IN` can be a failure - let (input, _) = cut_with_err(tag("]"), |_| { - Error::new_from_kind(input, ErrorKind::InClosingBracket) - })(input)?; + let (input, _) = + cut_with_err(tag("]"), |_| Error::new_from_kind(input, ErrorKind::InClosingBracket))( + input, + )?; let filter = FilterCondition::Not(Box::new(FilterCondition::In { fid: value, els: content })); Ok((input, filter)) @@ -378,60 +378,60 @@ pub mod tests { // simple test ( "x = AND", - Fc::Not(Box::new(Fc::Not(Box::new(Fc::In { - fid: rtok("NOT NOT", "colour"), - els: vec![] + Fc::Not(Box::new(Fc::Not(Box::new(Fc::In { + fid: rtok("NOT NOT", "colour"), + els: vec![] })))) ), ( "colour IN[]", - Fc::In { - fid: rtok("", "colour"), - els: vec![] + Fc::In { + fid: rtok("", "colour"), + els: vec![] } ), ( "colour IN[green]", - Fc::In { - fid: rtok("", "colour"), - els: vec![rtok("colour IN[", "green")] + Fc::In { + fid: rtok("", "colour"), + els: vec![rtok("colour IN[", "green")] } ), ( "colour IN[green,]", - Fc::In { - fid: rtok("", "colour"), - els: vec![rtok("colour IN[", "green")] + Fc::In { + fid: rtok("", "colour"), + els: vec![rtok("colour IN[", "green")] } ), ( "colour IN[green,blue]", - Fc::In { - fid: rtok("", "colour"), + Fc::In { + fid: rtok("", "colour"), els: vec![ rtok("colour IN[", "green"), rtok("colour IN[green, ", "blue"), - ] + ] } ), ( "colour NOT IN[green,blue]", - Fc::Not(Box::new(Fc::In { - fid: rtok("", "colour"), + Fc::Not(Box::new(Fc::In { + fid: rtok("", "colour"), els: vec![ rtok("colour NOT IN[", "green"), rtok("colour NOT IN[green, ", "blue"), - ] + ] })) ), ( " colour IN [ green , blue , ]", - Fc::In { - fid: rtok(" ", "colour"), + Fc::In { + fid: rtok(" ", "colour"), els: vec![ rtok("colour IN [ ", "green"), rtok("colour IN [ green , ", "blue"), - ] + ] } ), ( diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 25ffe1842..487676f4a 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -302,9 +302,7 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator( - rtxn, index, field_id, &operator, - )?; + let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; let all_ids = 
index.documents_ids(rtxn)?; return Ok(all_ids - docids); } @@ -421,20 +419,30 @@ impl<'a> Filter<'a> { FilterCondition::Or(subfilters) => { let mut bitmap = RoaringBitmap::new(); for f in subfilters { - bitmap |= Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; + bitmap |= + Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; } Ok(bitmap) } FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); if let Some(first_subfilter) = subfilters_iter.next() { - let mut bitmap = - Self::inner_evaluate(&(first_subfilter.clone()).into(), rtxn, index, filterable_fields)?; + let mut bitmap = Self::inner_evaluate( + &(first_subfilter.clone()).into(), + rtxn, + index, + filterable_fields, + )?; for f in subfilters_iter { if bitmap.is_empty() { return Ok(bitmap); } - bitmap &= Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; + bitmap &= Self::inner_evaluate( + &(f.clone()).into(), + rtxn, + index, + filterable_fields, + )?; } Ok(bitmap) } else { From 93252769af3096971fdef83c6ab02c122e7beb41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:41:22 +0200 Subject: [PATCH 1579/1889] Apply review suggestions --- milli/src/update/word_prefix_pair_proximity_docids.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index cf5e19a5c..724858e4f 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -165,7 +165,7 @@ inputs described above, which come from different places: present in the newly added documents 2. `word_pairs_db`, which is the list of word pairs from the database. - This list includes all elements in `new_word_pairs`** since `new_word_pairs` + This list includes all elements in `new_word_pairs` since `new_word_pairs` was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` function. @@ -380,10 +380,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { /// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements /// /// For more information about what this function does, read the module documentation. -fn execute_on_word_pairs_and_prefixes( - iter: &mut Iter, +fn execute_on_word_pairs_and_prefixes( + iter: &mut I, mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut Iter, + &'a mut I, ) -> Result< Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, >, From 6cc975704d48060602f50b337b672244baec4f3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:58:11 +0200 Subject: [PATCH 1580/1889] Add some documentation to facets.rs --- milli/src/update/facets.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 56529a3c5..108acae4f 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -165,11 +165,15 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { } } + /// The number of elements from the level below that are represented by a single element in the level above + /// + /// This setting is always greater than or equal to 2. 
pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); self } + /// The minimum number of elements that a level is allowed to have. pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { self.min_level_size = value; self @@ -252,6 +256,12 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { } } +/// Compute the content of the database levels from its level 0 for the given field id. +/// +/// ## Returns: +/// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` +/// that must be inserted into the database. +/// 2. a roaring bitmap of all the document ids present in the database fn compute_facet_number_levels<'t>( rtxn: &'t heed::RoTxn, db: heed::Database, @@ -316,6 +326,12 @@ fn compute_facet_number_levels<'t>( } } +/// Compute the content of the database levels from its level 0 for the given field id. +/// +/// ## Returns: +/// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` +/// that must be inserted into the database. +/// 2. a roaring bitmap of all the document ids present in the database fn compute_facet_strings_levels<'t>( rtxn: &'t heed::RoTxn, db: heed::Database, @@ -401,7 +417,7 @@ will contain the new levels * `level_0_range` : equivalent to `level_0_start..` * `level_0_size` : the number of elements in level 0 * `level_group_size` : the number of elements from the level below that are represented by a -* single element of the new level +single element of the new level * `computed_group_bitmap` : a callback that is called whenever at most `level_group_size` elements from the level below were read/created. Its arguments are: 0. the list of bitmaps from each read/created element of the level below From 7384650d856aab4705a0a3308a36422a796418bf Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Aug 2022 15:03:08 +0200 Subject: [PATCH 1581/1889] Update test to showcase the bug --- milli/src/update/index_documents/mod.rs | 27 ++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 114903e39..0951cf227 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -654,6 +654,9 @@ mod tests { let rtxn = index.read_txn().unwrap(); let count = index.number_of_documents(&rtxn).unwrap(); assert_eq!(count, 3); + let count = index.all_documents(&rtxn).unwrap().count(); + assert_eq!(count, 3); + drop(rtxn); } @@ -888,12 +891,26 @@ mod tests { index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index - .add_documents(documents!([{ - "id": 2, - "author": "J. Austen", - "date": "1813" - }])) + .add_documents(documents!([ + {"id":4,"title":"Harry Potter and the Half-Blood Princess"}, + {"id":456,"title":"The Little Prince"} + ])) .unwrap(); + + index + .add_documents(documents!([ + { "id": 2, "author": "J. Austen", "date": "1813" } + ])) + .unwrap(); + + // Check that there is **always** 3 documents. 
+ let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 6); + let count = index.all_documents(&rtxn).unwrap().count(); + assert_eq!(count, 6); + + drop(rtxn); } #[test] From 2668f841d18fef109d885f7f5f2cea26e807d6ed Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Aug 2022 15:03:37 +0200 Subject: [PATCH 1582/1889] Fix update indexing --- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 0951cf227..493935a79 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -407,7 +407,7 @@ where // We write the external documents ids into the main database. self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; - let all_documents_ids = index_documents_ids | new_documents_ids | replaced_documents_ids; + let all_documents_ids = index_documents_ids | new_documents_ids; self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; self.execute_prefix_databases( diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b61395a96..8818909a3 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -249,11 +249,10 @@ impl<'a, 'i> Transform<'a, 'i> { None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, } } - } else { - self.new_documents_ids.insert(docid); } if !skip_insertion { + self.new_documents_ids.insert(docid); // We use the extracted/generated user id as the key for this document. self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; From e9e2349ce67381c9550b47fc43916a8afc6472f1 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Aug 2022 15:09:48 +0200 Subject: [PATCH 1583/1889] Fix typo in comment --- milli/src/update/index_documents/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 493935a79..d1f030fdd 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -903,7 +903,7 @@ mod tests { ])) .unwrap(); - // Check that there is **always** 3 documents. + // Check that there is **always** 6 documents. 
let rtxn = index.read_txn().unwrap(); let count = index.number_of_documents(&rtxn).unwrap(); assert_eq!(count, 6); From 8c3f1a9c39620d26adaf1a7c3e25f1e1dc5dd133 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 17 Aug 2022 15:20:43 +0200 Subject: [PATCH 1584/1889] Remove useless lifetime declaration --- milli/src/search/criteria/mod.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index ae9e0c218..f48865ba5 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -293,13 +293,13 @@ impl<'t> CriteriaBuilder<'t> { } } -pub fn resolve_query_tree<'t>( - ctx: &'t dyn Context, +pub fn resolve_query_tree( + ctx: &dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, ) -> Result { - fn resolve_operation<'t>( - ctx: &'t dyn Context, + fn resolve_operation( + ctx: &dyn Context, query_tree: &Operation, wdcache: &mut WordDerivationsCache, ) -> Result { @@ -342,7 +342,7 @@ pub fn resolve_query_tree<'t>( resolve_operation(ctx, query_tree, wdcache) } -pub fn resolve_phrase<'t>(ctx: &'t dyn Context, phrase: &[String]) -> Result { +pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = phrase.len().min(7); From b09a8f1b915911631e94925cf0420b569a6db577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 16:06:29 +0200 Subject: [PATCH 1585/1889] Filters: add explicit error message when using a keyword as value --- filter-parser/src/error.rs | 29 +++++++++++++++++------- filter-parser/src/lib.rs | 45 ++++++++++++++++++++++++-------------- filter-parser/src/value.rs | 14 ++++++++++-- 3 files changed, 62 insertions(+), 26 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 0d2959126..ce9470ff8 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -48,6 +48,12 @@ pub struct Error<'a> { kind: ErrorKind<'a>, } +#[derive(Debug)] +pub enum ExpectedValueKind { + ReservedKeyword, + Other, +} + #[derive(Debug)] pub enum ErrorKind<'a> { ReservedGeo(&'a str), @@ -55,11 +61,11 @@ pub enum ErrorKind<'a> { MisusedGeo, InvalidPrimary, ExpectedEof, - ExpectedValue, + ExpectedValue(ExpectedValueKind), MalformedValue, InOpeningBracket, InClosingBracket, - InExpectedValue, + InExpectedValue(ExpectedValueKind), ReservedKeyword(String), MissingClosingDelimiter(char), Char(char), @@ -118,18 +124,22 @@ impl<'a> Display for Error<'a> { let escaped_input = input.escape_debug(); match &self.kind { - ErrorKind::ExpectedValue if input.trim().is_empty() => { + ErrorKind::ExpectedValue(_) if input.trim().is_empty() => { writeln!(f, "Was expecting a value but instead got nothing.")? } + ErrorKind::ExpectedValue(ExpectedValueKind::ReservedKeyword) => { + writeln!(f, "Was expecting a value but instead got `{escaped_input}`, which is a reserved keyword. To use `{escaped_input}` as a field name or a value, surround it by quotes.")? + } + ErrorKind::ExpectedValue(ExpectedValueKind::Other) => { + writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? + } ErrorKind::MalformedValue => { writeln!(f, "Malformed value: `{}`.", escaped_input)? } ErrorKind::MissingClosingDelimiter(c) => { writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? 
} - ErrorKind::ExpectedValue => { - writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? - } + ErrorKind::InvalidPrimary if input.trim().is_empty() => { writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? } @@ -157,8 +167,11 @@ impl<'a> Display for Error<'a> { ErrorKind::InClosingBracket => { writeln!(f, "Expected matching `]` after the list of field names given to `IN[`")? } - ErrorKind::InExpectedValue => { - writeln!(f, "Expected only comma-separated field names inside `IN[..]` but instead found `{escaped_input}`")? + ErrorKind::InExpectedValue(ExpectedValueKind::ReservedKeyword) => { + writeln!(f, "Expected only comma-separated field names inside `IN[..]` but instead found `{escaped_input}`, which is a keyword. To use `{escaped_input}` as a field name or a value, surround it by quotes.")? + } + ErrorKind::InExpectedValue(ExpectedValueKind::Other) => { + writeln!(f, "Expected only comma-separated field names inside `IN[..]` but instead found `{escaped_input}`.")? } ErrorKind::Char(c) => { panic!("Tried to display a char error with `{}`", c) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 31ca2919a..d0c2d8531 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -48,7 +48,7 @@ use std::str::FromStr; pub use condition::{parse_condition, parse_to, Condition}; use condition::{parse_exists, parse_not_exists}; -use error::{cut_with_err, NomErrorExt}; +use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; @@ -166,11 +166,6 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) /// value_list = (value ("," value)* ","?)? 
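/// e.g. ``, `green`, `green,`, `green, blue` and `green, blue,` all match this
/// grammar: both the empty list and a trailing comma are accepted (see the
/// `colour IN[]` and `colour IN[green,]` test cases below).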
fn parse_value_list<'a>(input: Span<'a>) -> IResult>> { - // TODO: here, I should return a failure with a clear explanation whenever possible - // for example: - // * expected the name of a field, but got `AND` - // * expected closing square bracket, but got `AND` - let (input, first_value) = opt(parse_value)(input)?; if let Some(first_value) = first_value { let value_list_el_parser = preceded(ws(tag(",")), parse_value); @@ -203,7 +198,14 @@ fn parse_in(input: Span) -> IResult { if eof::<_, ()>(input).is_ok() { Error::new_from_kind(input, ErrorKind::InClosingBracket) } else { - Error::new_from_kind(input, ErrorKind::InExpectedValue) + let expected_value_kind = match parse_value(input) { + Err(nom::Err::Error(e)) => match e.kind() { + ErrorKind::ReservedKeyword(_) => ExpectedValueKind::ReservedKeyword, + _ => ExpectedValueKind::Other, + }, + _ => ExpectedValueKind::Other, + }; + Error::new_from_kind(input, ErrorKind::InExpectedValue(expected_value_kind)) } })(input)?; @@ -319,6 +321,21 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } +fn parse_error_reserved_keyword(input: Span) -> IResult { + match parse_condition(input) { + Ok(result) => Ok(result), + Err(nom::Err::Error(inner) | nom::Err::Failure(inner)) => match inner.kind() { + ErrorKind::ExpectedValue(ExpectedValueKind::ReservedKeyword) => { + return Err(nom::Err::Failure(inner)); + } + _ => return Err(nom::Err::Error(inner)), + }, + Err(e) => { + return Err(e); + } + } +} + /// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to fn parse_primary(input: Span) -> IResult { alt(( @@ -339,6 +356,7 @@ fn parse_primary(input: Span) -> IResult { parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, + parse_error_reserved_keyword, ))(input) // if the inner parsers did not match enough information to return an accurate error .map_err(|e| e.map_err(|_| Error::new_from_kind(input, ErrorKind::InvalidPrimary))) @@ -376,13 +394,6 @@ pub mod tests { let test_case = [ // simple test - ( - "x = AND", - Fc::Not(Box::new(Fc::Not(Box::new(Fc::In { - fid: rtok("NOT NOT", "colour"), - els: vec![] - })))) - ), ( "colour IN[]", Fc::In { @@ -756,8 +767,8 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), + ("'OR'", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `\\'OR\\'`."), + ("OR", "Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes."), ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), ("channel = Ponce OR", "Found unexpected characters at the end of the filter: `OR`. 
You probably forgot an `OR` or an `AND` rule."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), @@ -778,6 +789,8 @@ pub mod tests { ("colour IN [blue, green, AND]", "Expected only comma-separated field names inside `IN[..]` but instead found `AND]`"), ("colour IN [blue, green", "Expected matching `]` after the list of field names given to `IN[`"), ("colour IN ['blue, green", "Expression `\\'blue, green` is missing the following closing delimiter: `'`."), + ("x = EXISTS", "Was expecting a value but instead got `EXISTS`, which is a reserved keyword. To use `EXISTS` as a field name or a value, surround it by quotes."), + ("AND = 8", "Was expecting a value but instead got `AND`, which is a reserved keyword. To use `AND` as a field name or a value, surround it by quotes."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 8a7e8f586..bfa3c2730 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -5,7 +5,7 @@ use nom::combinator::cut; use nom::sequence::{delimited, terminated}; use nom::{InputIter, InputLength, InputTake, Slice}; -use crate::error::NomErrorExt; +use crate::error::{ExpectedValueKind, NomErrorExt}; use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span, Token}; /// This function goes through all characters in the [Span] if it finds any escaped character (`\`). @@ -103,7 +103,17 @@ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { )(input) // if we found nothing in the alt it means the user specified something that was not recognized as a value .map_err(|e: nom::Err| { - e.map_err(|_| Error::new_from_kind(error_word(input).unwrap().1, ErrorKind::ExpectedValue)) + e.map_err(|error| { + let expected_value_kind = if matches!(error.kind(), ErrorKind::ReservedKeyword(_)) { + ExpectedValueKind::ReservedKeyword + } else { + ExpectedValueKind::Other + }; + Error::new_from_kind( + error_word(input).unwrap().1, + ErrorKind::ExpectedValue(expected_value_kind), + ) + }) }) .map_err(|e| { e.map_fail(|failure| { From 238a7be58d03d3b646e4b49d2457cd3c6c60c02f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 16:53:40 +0200 Subject: [PATCH 1586/1889] Fix filter parser handling of keywords and surrounding spaces Now the following fragments are allowed: AND(field = AND'field' = AND"field" = --- filter-parser/src/lib.rs | 29 ++++++++++++------------ filter-parser/src/value.rs | 46 +++++++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index d0c2d8531..09aa252e1 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -52,7 +52,7 @@ use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; -use nom::character::complete::{char, multispace0, multispace1}; +use nom::character::complete::{char, multispace0}; use nom::combinator::{cut, eof, map, opt}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; @@ -60,6 +60,7 @@ use nom::sequence::{delimited, preceded, terminated, tuple}; use nom::Finish; use nom_locate::LocatedSpan; pub(crate) use value::parse_value; +use value::word_exact; pub type Span<'a> = LocatedSpan<&'a str, &'a str>; @@ -183,7 +184,7 @@ fn parse_value_list<'a>(input: Span<'a>) -> IResult>> { /// in = value "IN" "[" value_list "]" fn 
parse_in(input: Span) -> IResult { let (input, value) = parse_value(input)?; - let (input, _) = ws(tag("IN"))(input)?; + let (input, _) = ws(word_exact("IN"))(input)?; // everything after `IN` can be a failure let (input, _) = @@ -215,9 +216,8 @@ fn parse_in(input: Span) -> IResult { /// in = value "NOT" WS* "IN" "[" value_list "]" fn parse_not_in(input: Span) -> IResult { let (input, value) = parse_value(input)?; - let (input, _) = tag("NOT")(input)?; - let (input, _) = multispace1(input)?; - let (input, _) = ws(tag("IN"))(input)?; + let (input, _) = word_exact("NOT")(input)?; + let (input, _) = ws(word_exact("IN"))(input)?; // everything after `IN` can be a failure let (input, _) = @@ -241,8 +241,7 @@ fn parse_not_in(input: Span) -> IResult { fn parse_or(input: Span) -> IResult { let (input, first_filter) = parse_and(input)?; // if we found a `OR` then we MUST find something next - let (input, mut ors) = - many0(preceded(ws(tuple((tag("OR"), multispace1))), cut(parse_and)))(input)?; + let (input, mut ors) = many0(preceded(ws(word_exact("OR")), cut(parse_and)))(input)?; let filter = if ors.is_empty() { first_filter @@ -258,8 +257,7 @@ fn parse_or(input: Span) -> IResult { fn parse_and(input: Span) -> IResult { let (input, first_filter) = parse_not(input)?; // if we found a `AND` then we MUST find something next - let (input, mut ands) = - many0(preceded(ws(tuple((tag("AND"), multispace1))), cut(parse_not)))(input)?; + let (input, mut ands) = many0(preceded(ws(word_exact("AND")), cut(parse_not)))(input)?; let filter = if ands.is_empty() { first_filter @@ -276,9 +274,7 @@ fn parse_and(input: Span) -> IResult { /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { alt(( - map(preceded(ws(tuple((tag("NOT"), multispace1))), cut(parse_not)), |e| { - FilterCondition::Not(Box::new(e)) - }), + map(preceded(ws(word_exact("NOT")), cut(parse_not)), |e| FilterCondition::Not(Box::new(e))), parse_primary, ))(input) } @@ -288,7 +284,7 @@ fn parse_not(input: Span) -> IResult { fn parse_geo_radius(input: Span) -> IResult { // we want to allow space BEFORE the _geoRadius but not after let parsed = preceded( - tuple((multispace0, tag("_geoRadius"))), + tuple((multispace0, word_exact("_geoRadius"))), // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure cut(delimited(char('('), separated_list1(tag(","), ws(recognize_float)), char(')'))), )(input) @@ -741,6 +737,10 @@ pub mod tests { Fc::GeoLowerThan { point: [rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(", "12"), rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, ", "13")], radius: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, ", "14") }.into() ]) ) + // ( + // ("channel = ponce AND'dog' != 'bernese mountain'", ), + // ("channel = ponce AND('dog' != 'bernese mountain')", ), + // ) ]; for (input, expected) in test_case { @@ -770,7 +770,7 @@ pub mod tests { ("'OR'", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `\\'OR\\'`."), ("OR", "Was expecting a value but instead got `OR`, which is a reserved keyword. 
To use `OR` as a field name or a value, surround it by quotes."), ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Found unexpected characters at the end of the filter: `OR`. You probably forgot an `OR` or an `AND` rule."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), @@ -783,7 +783,6 @@ pub mod tests { ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), ("subscribers 100 TO1000", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`."), ("channel = ponce ORdog != 'bernese mountain'", "Found unexpected characters at the end of the filter: `ORdog != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."), - ("channel = ponce AND'dog' != 'bernese mountain'", "Found unexpected characters at the end of the filter: `AND\\'dog\\' != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."), ("colour IN blue, green]", "Expected `[` after `IN` keyword."), ("colour IN [blue, green, 'blue' > 2]", "Expected only comma-separated field names inside `IN[..]` but instead found `> 2]`"), ("colour IN [blue, green, AND]", "Expected only comma-separated field names inside `IN[..]` but instead found `AND]`"), diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index bfa3c2730..90dc44604 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -3,7 +3,7 @@ use nom::bytes::complete::{take_till, take_while, take_while1}; use nom::character::complete::{char, multispace0}; use nom::combinator::cut; use nom::sequence::{delimited, terminated}; -use nom::{InputIter, InputLength, InputTake, Slice}; +use nom::{error, InputIter, InputLength, InputTake, Slice}; use crate::error::{ExpectedValueKind, NomErrorExt}; use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span, Token}; @@ -48,6 +48,35 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } +// word = (alphanumeric | _ | - | .)+ except for reserved keywords +pub fn word_not_keyword<'a>(input: Span<'a>) -> IResult> { + let (input, word): (_, Token<'a>) = + take_while1(is_value_component)(input).map(|(s, t)| (s, t.into()))?; + if is_keyword(word.value()) { + return Err(nom::Err::Error(Error::new_from_kind( + input, + ErrorKind::ReservedKeyword(word.value().to_owned()), + ))); + } + Ok((input, word)) +} + +// word = {tag} +pub fn word_exact<'a, 'b: 'a>(tag: &'b str) -> impl Fn(Span<'a>) -> IResult<'a, Token<'a>> { + move |input| { + let (input, word): (_, Token<'a>) = + take_while1(is_value_component)(input).map(|(s, t)| (s, t.into()))?; + if word.value() == tag { + Ok((input, word)) + } else { + Err(nom::Err::Error(Error::new_from_kind( + input, + ErrorKind::InternalError(nom::error::ErrorKind::Tag), + ))) + } + } +} + /// value = 
WS* ( word | singleQuoted | doubleQuoted) WS+ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now @@ -71,19 +100,6 @@ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { _ => (), } - // word = (alphanumeric | _ | - | .)+ except for reserved keywords - let word = |input: Span<'a>| -> IResult> { - let (input, word): (_, Token<'a>) = - take_while1(is_value_component)(input).map(|(s, t)| (s, t.into()))?; - if is_keyword(word.value()) { - return Err(nom::Err::Error(Error::new_from_kind( - input, - ErrorKind::ReservedKeyword(word.value().to_owned()), - ))); - } - Ok((input, word)) - }; - // this parser is only used when an error is encountered and it parse the // largest string possible that do not contain any “language” syntax. // If we try to parse `name = 🦀 AND language = rust` we want to return an @@ -97,7 +113,7 @@ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { alt(( delimited(char('\''), cut(|input| quoted_by('\'', input)), cut(char('\''))), delimited(char('"'), cut(|input| quoted_by('"', input)), cut(char('"'))), - word, + word_not_keyword, )), multispace0, )(input) From e96b852107221f58a91ec1f023d606eeefce99af Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 10 Aug 2022 16:25:24 +0200 Subject: [PATCH 1587/1889] bump heed --- milli/Cargo.toml | 3 ++- milli/src/error.rs | 3 +++ milli/src/update/index_documents/mod.rs | 29 ++++++++++++++----------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2bb6a50a1..fbe756ac6 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,7 +18,8 @@ fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +# heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/meilisearch/heed", branch = "compute_size", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memmap2 = "0.5.3" diff --git a/milli/src/error.rs b/milli/src/error.rs index c817f64fa..d3f0a179f 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -116,6 +116,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco } )] InvalidSortableAttribute { field: String, valid_fields: BTreeSet }, + #[error("{}", HeedError::BadOpenOptions)] + InvalidLmdbOpenOptions, #[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")] SortRankingRuleMissing, #[error("The database file is in an invalid state.")] @@ -244,6 +246,7 @@ impl From for Error { HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })), HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), HeedError::DatabaseClosing => InternalError(DatabaseClosing), + HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions), } } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index d1f030fdd..f5e04435d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -278,27 +278,30 @@ where let 
stop_words = self.index.stop_words(self.wtxn)?; let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; + let pool_params = GrenadParameters { + chunk_compression_type: self.indexer_config.chunk_compression_type, + chunk_compression_level: self.indexer_config.chunk_compression_level, + max_memory: self.indexer_config.max_memory, + max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. + }; + let documents_chunk_size = + self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB + let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; + // Run extraction pipeline in parallel. pool.install(|| { - let params = GrenadParameters { - chunk_compression_type: self.indexer_config.chunk_compression_type, - chunk_compression_level: self.indexer_config.chunk_compression_level, - max_memory: self.indexer_config.max_memory, - max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. - }; - // split obkv file into several chunks let original_chunk_iter = grenad_obkv_into_chunks( original_documents, - params.clone(), - self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB + pool_params.clone(), + documents_chunk_size, ); // split obkv file into several chunks let flattened_chunk_iter = grenad_obkv_into_chunks( flattened_documents, - params.clone(), - self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB + pool_params.clone(), + documents_chunk_size, ); let result = original_chunk_iter @@ -308,14 +311,14 @@ where extract::data_from_obkv_documents( original_chunk, flattened_chunk, - params, + pool_params, lmdb_writer_sx.clone(), searchable_fields, faceted_fields, primary_key_id, geo_fields_ids, stop_words, - self.indexer_config.max_positions_per_attributes, + max_positions_per_attributes, exact_attributes, ) }); From 4aae07d5f557fca6d4441e8214194943899290e9 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Thu, 11 Aug 2022 11:15:46 +0200 Subject: [PATCH 1588/1889] expose the size methods --- benchmarks/benches/indexing.rs | 2 +- benchmarks/benches/utils.rs | 2 +- cli/Cargo.toml | 1 - cli/src/main.rs | 2 +- helpers/Cargo.toml | 1 - helpers/src/main.rs | 2 +- http-ui/Cargo.toml | 1 - http-ui/src/main.rs | 2 +- http-ui/src/update_store.rs | 1 + infos/Cargo.toml | 1 - infos/src/main.rs | 2 +- milli/Cargo.toml | 3 +-- milli/src/index.rs | 10 ++++++++++ milli/src/update/delete_documents.rs | 26 +++++++++++++++++++++----- 14 files changed, 39 insertions(+), 17 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index d532c85d9..a409e1343 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -5,7 +5,7 @@ use std::fs::{create_dir_all, remove_dir_all}; use std::path::Path; use criterion::{criterion_group, criterion_main, Criterion}; -use heed::{EnvOpenOptions, RwTxn}; +use milli::heed::{EnvOpenOptions, RwTxn}; use milli::update::{ DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, }; diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index fba05edbe..8c556b383 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -6,8 +6,8 @@ use std::num::ParseFloatError; use std::path::Path; use criterion::BenchmarkId; -use heed::EnvOpenOptions; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::heed::EnvOpenOptions; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, 
Settings, }; diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 504df712e..e45fb3344 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -11,7 +11,6 @@ byte-unit = { version = "4.0.14", features = ["serde"] } color-eyre = "0.6.1" csv = "1.1.6" eyre = "0.6.7" -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } indicatif = "0.16.2" milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } diff --git a/cli/src/main.rs b/cli/src/main.rs index 8485560f5..e3bbced3e 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -13,7 +13,7 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; -use milli::{Index, Object}; +use milli::{heed, Index, Object}; use structopt::StructOpt; #[global_allocator] diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index bd09574f3..9a8496e28 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -9,7 +9,6 @@ publish = false [dependencies] anyhow = "1.0.56" byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } stderrlog = "0.5.1" diff --git a/helpers/src/main.rs b/helpers/src/main.rs index 0081965ad..d1050e937 100644 --- a/helpers/src/main.rs +++ b/helpers/src/main.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use byte_unit::Byte; -use heed::{CompactionOption, Env, EnvOpenOptions}; +use milli::heed::{CompactionOption, Env, EnvOpenOptions}; use structopt::StructOpt; use Command::*; diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 993818f93..6d902f5b3 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,7 +10,6 @@ publish = false anyhow = "1.0.56" byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } crossbeam-channel = "0.5.2" -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } memmap2 = "0.5.3" milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index de5d3c5ab..afde8cc1a 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -17,8 +17,8 @@ use byte_unit::Byte; use either::Either; use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; -use heed::EnvOpenOptions; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::heed::EnvOpenOptions; use milli::tokenizer::TokenizerBuilder; use milli::update::UpdateIndexingStep::*; use milli::update::{ diff --git a/http-ui/src/update_store.rs b/http-ui/src/update_store.rs index b77057fda..bbbff25c8 100644 --- a/http-ui/src/update_store.rs +++ b/http-ui/src/update_store.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use crossbeam_channel::Sender; use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson}; use heed::{Database, Env, EnvOpenOptions}; +use milli::heed; use serde::{Deserialize, Serialize}; pub type BEU64 = heed::zerocopy::U64; diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 8c92ae649..7c17782c3 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -9,7 +9,6 @@ publish = false anyhow = "1.0.56" byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } csv = "1.1.6" -heed = { git = 
"https://github.com/meilisearch/heed", tag = "v0.12.1" } milli = { path = "../milli" } mimalloc = { version = "0.1.29", default-features = false } roaring = "0.9.0" diff --git a/infos/src/main.rs b/infos/src/main.rs index 1fbd50889..f5fdcf94a 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -7,7 +7,7 @@ use byte_unit::Byte; use heed::EnvOpenOptions; use milli::facet::FacetType; use milli::index::db_name::*; -use milli::{FieldId, Index}; +use milli::{heed, FieldId, Index}; use structopt::StructOpt; use Command::*; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index fbe756ac6..1441461f3 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,8 +18,7 @@ fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } -# heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } -heed = { git = "https://github.com/meilisearch/heed", branch = "compute_size", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memmap2 = "0.5.3" diff --git a/milli/src/index.rs b/milli/src/index.rs index 36e15c181..0dccabf03 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -223,6 +223,16 @@ impl Index { self.env.path() } + /// Returns the size used by the index without the cached pages. + pub fn used_size(&self) -> Result { + Ok(self.env.non_free_pages_size()?) + } + + /// Returns the real size used by the index. + pub fn on_disk_size(&self) -> Result { + Ok(self.env.real_disk_size()?) + } + pub fn copy_to_path>(&self, path: P, option: CompactionOption) -> Result { self.env.copy_to_path(path, option).map_err(Into::into) } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index c981ee061..eae473f51 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -20,10 +20,6 @@ use crate::{ RoaringBitmapCodec, SmallString32, BEU32, }; -/// The threshold we use to determine after which number of documents we want to clear the -/// soft-deleted database and delete documents for real. -const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000; - pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, @@ -129,7 +125,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // if we have less documents to delete than the threshold we simply save them in // the `soft_deleted_documents_ids` bitmap and early exit. - if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD { + let size_used = self.index.used_size()?; + let map_size = self.index.env.map_size()? as u64; + let nb_documents = self.index.number_of_documents(&self.wtxn)?; + let nb_soft_deleted = soft_deleted_docids.len(); + + let percentage_available = 100 - (size_used * 100 / map_size); + let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); + let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted; + let percentage_used_by_soft_deleted_documents = + estimated_size_used_by_soft_deleted * 100 / map_size; + + // if we have more than 10% of disk space available and the soft deleted + // documents uses less than 10% of the total space available, + // we skip the deletion. Eg. 
+ // - With 100Go of disk and 20Go used including 5Go of soft-deleted documents + // We don’t delete anything. + // - With 100Go of disk and 95Go used including 1mo of soft-deleted documents + // We run the deletion. + // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents + // We run the deletion. + if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; return Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), From 497f9817a227b20567af95996dc99fdd0839ca81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 17:25:31 +0200 Subject: [PATCH 1589/1889] Use snapshot testing for the filter parser --- filter-parser/Cargo.toml | 3 + filter-parser/src/lib.rs | 666 ++++++++++++++----------------------- filter-parser/src/value.rs | 2 +- 3 files changed, 259 insertions(+), 412 deletions(-) diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 8f61796b3..21676f960 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -8,3 +8,6 @@ publish = false [dependencies] nom = "7.1.0" nom_locate = "4.0.0" + +[dev-dependencies] +insta = "1.18.2" diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 09aa252e1..b898c264c 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -43,9 +43,6 @@ mod condition; mod error; mod value; -use std::fmt::Debug; -use std::str::FromStr; - pub use condition::{parse_condition, parse_to, Condition}; use condition::{parse_exists, parse_not_exists}; use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; @@ -59,6 +56,8 @@ use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, terminated, tuple}; use nom::Finish; use nom_locate::LocatedSpan; +use std::fmt::Debug; +use std::str::FromStr; pub(crate) use value::parse_value; use value::word_exact; @@ -388,422 +387,211 @@ pub mod tests { fn parse() { use FilterCondition as Fc; - let test_case = [ - // simple test - ( - "colour IN[]", - Fc::In { - fid: rtok("", "colour"), - els: vec![] - } - ), - ( - "colour IN[green]", - Fc::In { - fid: rtok("", "colour"), - els: vec![rtok("colour IN[", "green")] - } - ), - ( - "colour IN[green,]", - Fc::In { - fid: rtok("", "colour"), - els: vec![rtok("colour IN[", "green")] - } - ), - ( - "colour IN[green,blue]", - Fc::In { - fid: rtok("", "colour"), - els: vec![ - rtok("colour IN[", "green"), - rtok("colour IN[green, ", "blue"), - ] - } - ), - ( - "colour NOT IN[green,blue]", - Fc::Not(Box::new(Fc::In { - fid: rtok("", "colour"), - els: vec![ - rtok("colour NOT IN[", "green"), - rtok("colour NOT IN[green, ", "blue"), - ] - })) - ), - ( - " colour IN [ green , blue , ]", - Fc::In { - fid: rtok(" ", "colour"), - els: vec![ - rtok("colour IN [ ", "green"), - rtok("colour IN [ green , ", "blue"), - ] - } - ), - ( - "channel = Ponce", - Fc::Condition { - fid: rtok("", "channel"), - op: Condition::Equal(rtok("channel = ", "Ponce")), - }, - ), - ( - "subscribers = 12", - Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::Equal(rtok("subscribers = ", "12")), - }, - ), - // test all the quotes and simple quotes - ( - "channel = 'Mister Mv'", - Fc::Condition { - fid: rtok("", "channel"), - op: Condition::Equal(rtok("channel = '", "Mister Mv")), - }, - ), - ( - "channel = \"Mister Mv\"", - Fc::Condition { - fid: rtok("", "channel"), - op: Condition::Equal(rtok("channel = \"", "Mister Mv")), - }, - ), - ( - "'dog race' 
= Borzoi", - Fc::Condition { - fid: rtok("'", "dog race"), - op: Condition::Equal(rtok("'dog race' = ", "Borzoi")), - }, - ), - ( - "\"dog race\" = Chusky", - Fc::Condition { - fid: rtok("\"", "dog race"), - op: Condition::Equal(rtok("\"dog race\" = ", "Chusky")), - }, - ), - ( - "\"dog race\" = \"Bernese Mountain\"", - Fc::Condition { - fid: rtok("\"", "dog race"), - op: Condition::Equal(rtok("\"dog race\" = \"", "Bernese Mountain")), - }, - ), - ( - "'dog race' = 'Bernese Mountain'", - Fc::Condition { - fid: rtok("'", "dog race"), - op: Condition::Equal(rtok("'dog race' = '", "Bernese Mountain")), - }, - ), - ( - "\"dog race\" = 'Bernese Mountain'", - Fc::Condition { - fid: rtok("\"", "dog race"), - op: Condition::Equal(rtok("\"dog race\" = \"", "Bernese Mountain")), - }, - ), - // test all the operators - ( - "channel != ponce", - Fc::Condition { - fid: rtok("", "channel"), - op: Condition::NotEqual(rtok("channel != ", "ponce")), - }, - ), - ( - "NOT channel = ponce", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("NOT ", "channel"), - op: Condition::Equal(rtok("NOT channel = ", "ponce")), - })), - ), - ( - "subscribers < 1000", - Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::LowerThan(rtok("subscribers < ", "1000")), - }, - ), - ( - "subscribers > 1000", - Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::GreaterThan(rtok("subscribers > ", "1000")), - }, - ), - ( - "subscribers <= 1000", - Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::LowerThanOrEqual(rtok("subscribers <= ", "1000")), - }, - ), - ( - "subscribers >= 1000", - Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::GreaterThanOrEqual(rtok("subscribers >= ", "1000")), - }, - ), - ( - "NOT subscribers < 1000", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::LowerThan(rtok("NOT subscribers < ", "1000")), - })), - ), - ( - "NOT subscribers > 1000", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::GreaterThan(rtok("NOT subscribers > ", "1000")), - })), - ), - ( - "NOT subscribers <= 1000", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::LowerThanOrEqual(rtok("NOT subscribers <= ", "1000")), - })), - ), - ( - "NOT subscribers >= 1000", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::GreaterThanOrEqual(rtok("NOT subscribers >= ", "1000")), - })), - ), - ( - "subscribers EXISTS", - Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::Exists, - }, - ), - ( - "NOT subscribers EXISTS", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::Exists, - })), - ), - ( - "subscribers NOT EXISTS", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::Exists, - })), - ), - ( - "NOT subscribers NOT EXISTS", - Fc::Not(Box::new(Fc::Not(Box::new(Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::Exists, - })))), - ), - ( - "subscribers NOT EXISTS", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::Exists, - })), - ), - ( - "subscribers 100 TO 1000", - Fc::Condition { - fid: rtok("", "subscribers"), - op: Condition::Between { - from: rtok("subscribers ", "100"), - to: rtok("subscribers 100 TO ", "1000"), - }, - }, - ), - ( - "NOT subscribers 100 TO 1000", - Fc::Not(Box::new(Fc::Condition { - fid: rtok("NOT ", "subscribers"), - op: Condition::Between { - from: rtok("NOT subscribers ", "100"), - to: 
rtok("NOT subscribers 100 TO ", "1000"), - }, - })), - ), - ( - "_geoRadius(12, 13, 14)", - Fc::GeoLowerThan { - point: [rtok("_geoRadius(", "12"), rtok("_geoRadius(12, ", "13")], - radius: rtok("_geoRadius(12, 13, ", "14"), - }, - ), - ( - "NOT _geoRadius(12, 13, 14)", - Fc::Not(Box::new(Fc::GeoLowerThan { - point: [rtok("NOT _geoRadius(", "12"), rtok("NOT _geoRadius(12, ", "13")], - radius: rtok("NOT _geoRadius(12, 13, ", "14"), - })), - ), - // test simple `or` and `and` - ( - "channel = ponce AND 'dog race' != 'bernese mountain'", - Fc::And(vec![ - Fc::Condition { - fid: rtok("", "channel"), - op: Condition::Equal(rtok("channel = ", "ponce")), - } - .into(), - Fc::Condition { - fid: rtok("channel = ponce AND '", "dog race"), - op: Condition::NotEqual(rtok( - "channel = ponce AND 'dog race' != '", - "bernese mountain", - )), - } - .into(), - ]), - ), - ( - "channel = ponce OR 'dog race' != 'bernese mountain'", - Fc::Or(vec![ - Fc::Condition { - fid: rtok("", "channel"), - op: Condition::Equal(rtok("channel = ", "ponce")), - } - .into(), - Fc::Condition { - fid: rtok("channel = ponce OR '", "dog race"), - op: Condition::NotEqual(rtok( - "channel = ponce OR 'dog race' != '", - "bernese mountain", - )), - } - .into(), - ]), - ), - ( - "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000", - Fc::Or(vec![ - Fc::And(vec![ - Fc::Condition { - fid: rtok("", "channel"), - op: Condition::Equal(rtok("channel = ", "ponce")), - } - .into(), - Fc::Condition { - fid: rtok("channel = ponce AND '", "dog race"), - op: Condition::NotEqual(rtok( - "channel = ponce AND 'dog race' != '", - "bernese mountain", - )), - } - .into(), - ]) - .into(), - Fc::Condition { - fid: rtok( - "channel = ponce AND 'dog race' != 'bernese mountain' OR ", - "subscribers", - ), - op: Condition::GreaterThan(rtok( - "channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > ", - "1000", - )), - } - .into(), - ]), - ), - // test parenthesis - ( - "channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )", - Fc::And(vec![ - Fc::Condition { fid: rtok("", "channel"), op: Condition::Equal(rtok("channel = ", "ponce")) }.into(), - Fc::Or(vec![ - Fc::Condition { fid: rtok("channel = ponce AND ( '", "dog race"), op: Condition::NotEqual(rtok("channel = ponce AND ( 'dog race' != '", "bernese mountain"))}.into(), - Fc::Condition { fid: rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Condition::GreaterThan(rtok("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(),] - ).into()]), - ), - ( - "(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)", - Fc::And(vec![ - Fc::Or(vec![ - Fc::And(vec![ - Fc::Condition { fid: rtok("(", "channel"), op: Condition::Equal(rtok("(channel = ", "ponce")) }.into(), - Fc::Condition { fid: rtok("(channel = ponce AND '", "dog race"), op: Condition::NotEqual(rtok("(channel = ponce AND 'dog race' != '", "bernese mountain")) }.into(), - ]).into(), - Fc::Condition { fid: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR ", "subscribers"), op: Condition::GreaterThan(rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > ", "1000")) }.into(), - ]).into(), - Fc::GeoLowerThan { point: [rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(", "12"), rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, ", "13")], 
radius: rtok("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, ", "14") }.into() - ]) - ) - // ( - // ("channel = ponce AND'dog' != 'bernese mountain'", ), - // ("channel = ponce AND('dog' != 'bernese mountain')", ), - // ) - ]; - - for (input, expected) in test_case { - let result = Fc::parse(input); - - assert!( - result.is_ok(), - "Filter `{:?}` was supposed to be parsed but failed with the following error: `{}`", - expected, - result.unwrap_err() - ); - let filter = result.unwrap(); - assert_eq!(filter, Some(expected), "Filter `{}` failed.", input); + macro_rules! p { + ($s:literal) => { + Fc::parse($s).unwrap().unwrap() + }; } + + // Test equal + insta::assert_display_snapshot!(p!("channel = Ponce"), @"{channel} = {Ponce}"); + insta::assert_display_snapshot!(p!("subscribers = 12"), @"{subscribers} = {12}"); + insta::assert_display_snapshot!(p!("channel = 'Mister Mv'"), @"{channel} = {Mister Mv}"); + insta::assert_display_snapshot!(p!("channel = \"Mister Mv\""), @"{channel} = {Mister Mv}"); + insta::assert_display_snapshot!(p!("'dog race' = Borzoi"), @"{dog race} = {Borzoi}"); + insta::assert_display_snapshot!(p!("\"dog race\" = Chusky"), @"{dog race} = {Chusky}"); + insta::assert_display_snapshot!(p!("\"dog race\" = \"Bernese Mountain\""), @"{dog race} = {Bernese Mountain}"); + insta::assert_display_snapshot!(p!("'dog race' = 'Bernese Mountain'"), @"{dog race} = {Bernese Mountain}"); + insta::assert_display_snapshot!(p!("\"dog race\" = 'Bernese Mountain'"), @"{dog race} = {Bernese Mountain}"); + + // Test IN + insta::assert_display_snapshot!(p!("colour IN[]"), @"{colour} IN[]"); + insta::assert_display_snapshot!(p!("colour IN[green]"), @"{colour} IN[{green}, ]"); + insta::assert_display_snapshot!(p!("colour IN[green,]"), @"{colour} IN[{green}, ]"); + insta::assert_display_snapshot!(p!("colour NOT IN[green,blue]"), @"NOT ({colour} IN[{green}, {blue}, ])"); + insta::assert_display_snapshot!(p!(" colour IN [ green , blue , ]"), @"{colour} IN[{green}, {blue}, ]"); + + // Test conditions + insta::assert_display_snapshot!(p!("channel != ponce"), @"{channel} != {ponce}"); + insta::assert_display_snapshot!(p!("NOT channel = ponce"), @"NOT ({channel} = {ponce})"); + insta::assert_display_snapshot!(p!("subscribers < 1000"), @"{subscribers} < {1000}"); + insta::assert_display_snapshot!(p!("subscribers > 1000"), @"{subscribers} > {1000}"); + insta::assert_display_snapshot!(p!("subscribers <= 1000"), @"{subscribers} <= {1000}"); + insta::assert_display_snapshot!(p!("subscribers >= 1000"), @"{subscribers} >= {1000}"); + insta::assert_display_snapshot!(p!("subscribers <= 1000"), @"{subscribers} <= {1000}"); + insta::assert_display_snapshot!(p!("subscribers 100 TO 1000"), @"{subscribers} {100} TO {1000}"); + + // Test NOT + EXISTS + insta::assert_display_snapshot!(p!("subscribers EXISTS"), @"{subscribers} EXISTS"); + insta::assert_display_snapshot!(p!("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})"); + insta::assert_display_snapshot!(p!("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p!("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p!("NOT subscribers NOT EXISTS"), @"NOT (NOT ({subscribers} EXISTS))"); + insta::assert_display_snapshot!(p!("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p!("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); + + // Test geo radius + 
insta::assert_display_snapshot!(p!("_geoRadius(12, 13, 14)"), @"_geoRadius({12}, {13}, {14})"); + insta::assert_display_snapshot!(p!("NOT _geoRadius(12, 13, 14)"), @"NOT (_geoRadius({12}, {13}, {14}))"); + + // Test OR + AND + insta::assert_display_snapshot!(p!("channel = ponce AND 'dog race' != 'bernese mountain'"), @"AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ]"); + insta::assert_display_snapshot!(p!("channel = ponce OR 'dog race' != 'bernese mountain'"), @"OR[{channel} = {ponce}, {dog race} != {bernese mountain}, ]"); + insta::assert_display_snapshot!(p!("channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000"), @"OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ]"); + insta::assert_display_snapshot!( + p!("channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000 OR colour = red OR colour = blue AND size = 7"), + @"OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, {colour} = {red}, AND[{colour} = {blue}, {size} = {7}, ], ]" + ); + + // test parentheses + insta::assert_display_snapshot!(p!("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )"), @"AND[{channel} = {ponce}, OR[{dog race} != {bernese mountain}, {subscribers} > {1000}, ], ]"); + insta::assert_display_snapshot!(p!("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)"), @"AND[OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ], _geoRadius({12}, {13}, {14}), ]"); } #[test] fn error() { use FilterCondition as Fc; - let test_case = [ - // simple test - ("channel = Ponce = 12", "Found unexpected characters at the end of the filter: `= 12`. You probably forgot an `OR` or an `AND` rule."), - ("channel = ", "Was expecting a value but instead got nothing."), - ("channel = 🐻", "Was expecting a value but instead got `🐻`."), - ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("'OR'", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `\\'OR\\'`."), - ("OR", "Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), - ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), - ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), - ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), - ("position <= _geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), - ("position <= _geoRadius(12, 13, 14)", "The `_geoRadius` filter is an operation and can't be used as a value."), - ("channel = 'ponce", "Expression `\\'ponce` is missing the following closing delimiter: `'`."), - ("channel = \"ponce", "Expression `\\\"ponce` is missing the following closing delimiter: `\"`."), - ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), - ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule."), - ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), - ("subscribers 100 TO1000", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`."), - ("channel = ponce ORdog != 'bernese mountain'", "Found unexpected characters at the end of the filter: `ORdog != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."), - ("colour IN blue, green]", "Expected `[` after `IN` keyword."), - ("colour IN [blue, green, 'blue' > 2]", "Expected only comma-separated field names inside `IN[..]` but instead found `> 2]`"), - ("colour IN [blue, green, AND]", "Expected only comma-separated field names inside `IN[..]` but instead found `AND]`"), - ("colour IN [blue, green", "Expected matching `]` after the list of field names given to `IN[`"), - ("colour IN ['blue, green", "Expression `\\'blue, green` is missing the following closing delimiter: `'`."), - ("x = EXISTS", "Was expecting a value but instead got `EXISTS`, which is a reserved keyword. To use `EXISTS` as a field name or a value, surround it by quotes."), - ("AND = 8", "Was expecting a value but instead got `AND`, which is a reserved keyword. To use `AND` as a field name or a value, surround it by quotes."), - ]; - - for (input, expected) in test_case { - let result = Fc::parse(input); - - assert!( - result.is_err(), - "Filter `{}` wasn't supposed to be parsed but it did with the following result: `{:?}`", - input, - result.unwrap() - ); - let filter = result.unwrap_err().to_string(); - assert!(filter.starts_with(expected), "Filter `{:?}` was supposed to return the following error:\n{}\n, but instead returned\n{}\n.", input, expected, filter); + macro_rules! p { + ($s:literal) => { + Fc::parse($s).unwrap_err().to_string() + }; } + + insta::assert_display_snapshot!(p!("channel = Ponce = 12"), @r###" + Found unexpected characters at the end of the filter: `= 12`. You probably forgot an `OR` or an `AND` rule. + 17:21 channel = Ponce = 12 + "###); + + insta::assert_display_snapshot!(p!("channel = "), @r###" + Was expecting a value but instead got nothing. + 14:14 channel = + "###); + + insta::assert_display_snapshot!(p!("channel = 🐻"), @r###" + Was expecting a value but instead got `🐻`. + 11:12 channel = 🐻 + "###); + + insta::assert_display_snapshot!(p!("channel = 🐻 AND followers < 100"), @r###" + Was expecting a value but instead got `🐻`. + 11:12 channel = 🐻 AND followers < 100 + "###); + + insta::assert_display_snapshot!(p!("'OR'"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `\'OR\'`. 
+ 1:5 'OR' + "###); + + insta::assert_display_snapshot!(p!("OR"), @r###" + Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. + 1:3 OR + "###); + + insta::assert_display_snapshot!(p!("channel Ponce"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`. + 1:14 channel Ponce + "###); + + insta::assert_display_snapshot!(p!("channel = Ponce OR"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing. + 19:19 channel = Ponce OR + "###); + + insta::assert_display_snapshot!(p!("_geoRadius"), @r###" + The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`. + 1:11 _geoRadius + "###); + + insta::assert_display_snapshot!(p!("_geoRadius = 12"), @r###" + The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`. + 1:16 _geoRadius = 12 + "###); + + insta::assert_display_snapshot!(p!("_geoPoint(12, 13, 14)"), @r###" + `_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates. + 1:22 _geoPoint(12, 13, 14) + "###); + + insta::assert_display_snapshot!(p!("position <= _geoPoint(12, 13, 14)"), @r###" + `_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates. + 13:34 position <= _geoPoint(12, 13, 14) + "###); + + insta::assert_display_snapshot!(p!("position <= _geoRadius(12, 13, 14)"), @r###" + The `_geoRadius` filter is an operation and can't be used as a value. + 13:35 position <= _geoRadius(12, 13, 14) + "###); + + insta::assert_display_snapshot!(p!("channel = 'ponce"), @r###" + Expression `\'ponce` is missing the following closing delimiter: `'`. + 11:17 channel = 'ponce + "###); + + insta::assert_display_snapshot!(p!("channel = \"ponce"), @r###" + Expression `\"ponce` is missing the following closing delimiter: `"`. + 11:17 channel = "ponce + "###); + + insta::assert_display_snapshot!(p!("channel = mv OR (followers >= 1000"), @r###" + Expression `(followers >= 1000` is missing the following closing delimiter: `)`. + 17:35 channel = mv OR (followers >= 1000 + "###); + + insta::assert_display_snapshot!(p!("channel = mv OR followers >= 1000)"), @r###" + Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule. + 34:35 channel = mv OR followers >= 1000) + "###); + + insta::assert_display_snapshot!(p!("colour NOT EXIST"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`. + 1:17 colour NOT EXIST + "###); + + insta::assert_display_snapshot!(p!("subscribers 100 TO1000"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`. + 1:23 subscribers 100 TO1000 + "###); + + insta::assert_display_snapshot!(p!("channel = ponce ORdog != 'bernese mountain'"), @r###" + Found unexpected characters at the end of the filter: `ORdog != \'bernese mountain\'`. You probably forgot an `OR` or an `AND` rule. 
+ 17:44 channel = ponce ORdog != 'bernese mountain' + "###); + + insta::assert_display_snapshot!(p!("colour IN blue, green]"), @r###" + Expected `[` after `IN` keyword. + 11:23 colour IN blue, green] + "###); + + insta::assert_display_snapshot!(p!("colour IN [blue, green, 'blue' > 2]"), @r###" + Expected only comma-separated field names inside `IN[..]` but instead found `> 2]`. + 32:36 colour IN [blue, green, 'blue' > 2] + "###); + + insta::assert_display_snapshot!(p!("colour IN [blue, green, AND]"), @r###" + Expected only comma-separated field names inside `IN[..]` but instead found `AND]`. + 25:29 colour IN [blue, green, AND] + "###); + + insta::assert_display_snapshot!(p!("colour IN [blue, green"), @r###" + Expected matching `]` after the list of field names given to `IN[` + 23:23 colour IN [blue, green + "###); + + insta::assert_display_snapshot!(p!("colour IN ['blue, green"), @r###" + Expression `\'blue, green` is missing the following closing delimiter: `'`. + 12:24 colour IN ['blue, green + "###); + + insta::assert_display_snapshot!(p!("x = EXISTS"), @r###" + Was expecting a value but instead got `EXISTS`, which is a reserved keyword. To use `EXISTS` as a field name or a value, surround it by quotes. + 5:11 x = EXISTS + "###); + + insta::assert_display_snapshot!(p!("AND = 8"), @r###" + Was expecting a value but instead got `AND`, which is a reserved keyword. To use `AND` as a field name or a value, surround it by quotes. + 1:4 AND = 8 + "###); } #[test] @@ -821,3 +609,59 @@ pub mod tests { assert!(filter.token_at_depth(3).is_none()); } } + +impl<'a> std::fmt::Display for FilterCondition<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + FilterCondition::Not(filter) => { + write!(f, "NOT ({filter})") + } + FilterCondition::Condition { fid, op } => { + write!(f, "{fid} {op}") + } + FilterCondition::In { fid, els } => { + write!(f, "{fid} IN[")?; + for el in els { + write!(f, "{el}, ")?; + } + write!(f, "]") + } + FilterCondition::Or(els) => { + write!(f, "OR[")?; + for el in els { + write!(f, "{el}, ")?; + } + write!(f, "]") + } + FilterCondition::And(els) => { + write!(f, "AND[")?; + for el in els { + write!(f, "{el}, ")?; + } + write!(f, "]") + } + FilterCondition::GeoLowerThan { point, radius } => { + write!(f, "_geoRadius({}, {}, {})", point[0], point[1], radius) + } + } + } +} +impl<'a> std::fmt::Display for Condition<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Condition::GreaterThan(token) => write!(f, "> {token}"), + Condition::GreaterThanOrEqual(token) => write!(f, ">= {token}"), + Condition::Equal(token) => write!(f, "= {token}"), + Condition::NotEqual(token) => write!(f, "!= {token}"), + Condition::Exists => write!(f, "EXISTS"), + Condition::LowerThan(token) => write!(f, "< {token}"), + Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), + Condition::Between { from, to } => write!(f, "{from} TO {to}"), + } + } +} +impl<'a> std::fmt::Display for Token<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{{{}}}", self.value()) + } +} diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 90dc44604..d015018c1 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -3,7 +3,7 @@ use nom::bytes::complete::{take_till, take_while, take_while1}; use nom::character::complete::{char, multispace0}; use nom::combinator::cut; use nom::sequence::{delimited, terminated}; -use nom::{error, InputIter, InputLength, 
InputTake, Slice}; +use nom::{InputIter, InputLength, InputTake, Slice}; use crate::error::{ExpectedValueKind, NomErrorExt}; use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span, Token}; From 84a784834e5fb4044597e90cbc98f483b6ea7233 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Wed, 17 Aug 2022 19:25:05 +0200 Subject: [PATCH 1590/1889] retry downloading the benchmarks datasets --- benchmarks/build.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/benchmarks/build.rs b/benchmarks/build.rs index c15123b37..d7b99db37 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -80,7 +80,7 @@ fn main() -> anyhow::Result<()> { } let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension); eprintln!("downloading: {}", url); - let bytes = download_dataset(url.clone())?; + let bytes = retry(|| download_dataset(url.clone()), 10)?; eprintln!("{} downloaded successfully", url); eprintln!("uncompressing in {}", out_file.display()); uncompress_in_file(bytes, &out_file)?; @@ -89,6 +89,15 @@ fn main() -> anyhow::Result<()> { Ok(()) } +fn retry(fun: impl Fn() -> Result, times: usize) -> Result { + for _ in 0..times { + if let ok @ Ok(_) = fun() { + return ok; + } + } + fun() +} + fn download_dataset(url: U) -> anyhow::Result> { let bytes = reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?; From b030efdc832ec23df63ad3020bdea758d3357469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 10:58:04 +0200 Subject: [PATCH 1591/1889] Fix parsing of IN[] filter followed by whitespace + factorise its impl --- filter-parser/src/lib.rs | 47 +++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index b898c264c..6dc176283 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -180,9 +180,8 @@ fn parse_value_list<'a>(input: Span<'a>) -> IResult>> { } } -/// in = value "IN" "[" value_list "]" -fn parse_in(input: Span) -> IResult { - let (input, value) = parse_value(input)?; +/// "IN" "[" value_list "]" +fn parse_in_body(input: Span) -> IResult> { let (input, _) = ws(word_exact("IN"))(input)?; // everything after `IN` can be a failure @@ -194,7 +193,7 @@ fn parse_in(input: Span) -> IResult { let (input, content) = cut(parse_value_list)(input)?; // everything after `IN` can be a failure - let (input, _) = cut_with_err(tag("]"), |_| { + let (input, _) = cut_with_err(ws(tag("]")), |_| { if eof::<_, ()>(input).is_ok() { Error::new_from_kind(input, ErrorKind::InClosingBracket) } else { @@ -209,28 +208,23 @@ fn parse_in(input: Span) -> IResult { } })(input)?; + Ok((input, content)) +} + +/// in = value "IN" "[" value_list "]" +fn parse_in(input: Span) -> IResult { + let (input, value) = parse_value(input)?; + let (input, content) = parse_in_body(input)?; + let filter = FilterCondition::In { fid: value, els: content }; Ok((input, filter)) } + /// in = value "NOT" WS* "IN" "[" value_list "]" fn parse_not_in(input: Span) -> IResult { let (input, value) = parse_value(input)?; let (input, _) = word_exact("NOT")(input)?; - let (input, _) = ws(word_exact("IN"))(input)?; - - // everything after `IN` can be a failure - let (input, _) = - cut_with_err(tag("["), |_| Error::new_from_kind(input, ErrorKind::InOpeningBracket))( - input, - )?; - - let (input, content) = cut(parse_value_list)(input)?; - - // everything after `IN` can be a failure - let (input, _) = - cut_with_err(tag("]"), |_| 
Error::new_from_kind(input, ErrorKind::InClosingBracket))( - input, - )?; + let (input, content) = parse_in_body(input)?; let filter = FilterCondition::Not(Box::new(FilterCondition::In { fid: value, els: content })); Ok((input, filter)) @@ -355,9 +349,6 @@ fn parse_primary(input: Span) -> IResult { ))(input) // if the inner parsers did not match enough information to return an accurate error .map_err(|e| e.map_err(|_| Error::new_from_kind(input, ErrorKind::InvalidPrimary))) - - // TODO: if the filter starts with a reserved keyword that is not NOT, then we should return the reserved keyword error - // TODO: if the filter is x = reserved, idem } /// expression = or @@ -411,6 +402,18 @@ pub mod tests { insta::assert_display_snapshot!(p!("colour NOT IN[green,blue]"), @"NOT ({colour} IN[{green}, {blue}, ])"); insta::assert_display_snapshot!(p!(" colour IN [ green , blue , ]"), @"{colour} IN[{green}, {blue}, ]"); + // Test IN + OR/AND/() + insta::assert_display_snapshot!(p!(" colour IN [green, blue] AND color = green "), @"AND[{colour} IN[{green}, {blue}, ], {color} = {green}, ]"); + insta::assert_display_snapshot!(p!("NOT (colour IN [green, blue]) AND color = green "), @"AND[NOT ({colour} IN[{green}, {blue}, ]), {color} = {green}, ]"); + insta::assert_display_snapshot!(p!("x = 1 OR NOT (colour IN [green, blue] OR color = green) "), @"OR[{x} = {1}, NOT (OR[{colour} IN[{green}, {blue}, ], {color} = {green}, ]), ]"); + + // Test whitespace start/end + insta::assert_display_snapshot!(p!(" colour = green "), @"{colour} = {green}"); + insta::assert_display_snapshot!(p!(" (colour = green OR colour = red) "), @"OR[{colour} = {green}, {colour} = {red}, ]"); + insta::assert_display_snapshot!(p!(" colour IN [green, blue] AND color = green "), @"AND[{colour} IN[{green}, {blue}, ], {color} = {green}, ]"); + insta::assert_display_snapshot!(p!(" colour NOT IN [green, blue] "), @"NOT ({colour} IN[{green}, {blue}, ])"); + insta::assert_display_snapshot!(p!(" colour IN [green, blue] "), @"{colour} IN[{green}, {blue}, ]"); + // Test conditions insta::assert_display_snapshot!(p!("channel != ponce"), @"{channel} != {ponce}"); insta::assert_display_snapshot!(p!("NOT channel = ponce"), @"NOT ({channel} = {ponce})"); From 98f0da6b383bf60cde4a82fc9132cac3f05dbdbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 10:58:24 +0200 Subject: [PATCH 1592/1889] Simplify representation of nested NOT filters --- filter-parser/src/lib.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 6dc176283..aac8d7d35 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -267,7 +267,10 @@ fn parse_and(input: Span) -> IResult { /// If we parse a `NOT` we MUST parse something behind. 
fn parse_not(input: Span) -> IResult { alt(( - map(preceded(ws(word_exact("NOT")), cut(parse_not)), |e| FilterCondition::Not(Box::new(e))), + map(preceded(ws(word_exact("NOT")), cut(parse_not)), |e| match e { + FilterCondition::Not(e) => *e, + _ => FilterCondition::Not(Box::new(e)), + }), parse_primary, ))(input) } @@ -429,10 +432,14 @@ pub mod tests { insta::assert_display_snapshot!(p!("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})"); insta::assert_display_snapshot!(p!("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p!("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); - insta::assert_display_snapshot!(p!("NOT subscribers NOT EXISTS"), @"NOT (NOT ({subscribers} EXISTS))"); + insta::assert_display_snapshot!(p!("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS"); insta::assert_display_snapshot!(p!("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); insta::assert_display_snapshot!(p!("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); + // Test nested NOT + insta::assert_display_snapshot!(p!("NOT NOT NOT NOT x = 5"), @"{x} = {5}"); + insta::assert_display_snapshot!(p!("NOT NOT (NOT NOT x = 5)"), @"{x} = {5}"); + // Test geo radius insta::assert_display_snapshot!(p!("_geoRadius(12, 13, 14)"), @"_geoRadius({12}, {13}, {14})"); insta::assert_display_snapshot!(p!("NOT _geoRadius(12, 13, 14)"), @"NOT (_geoRadius({12}, {13}, {14}))"); From c51dcad51b1d225f24ef5f519c1c94daa7088c4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 10:59:21 +0200 Subject: [PATCH 1593/1889] Don't recompute filterable fields in evaluation of IN[] filter --- milli/src/search/facet/filter.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 487676f4a..2f6fb5b00 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -361,10 +361,7 @@ impl<'a> Filter<'a> { return Ok(all_ids - selected); } FilterCondition::In { fid, els } => { - // TODO: this could be optimised - let filterable_fields = index.filterable_fields(rtxn)?; - - if crate::is_faceted(fid.value(), &filterable_fields) { + if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { @@ -382,7 +379,7 @@ impl<'a> Filter<'a> { } else { return Err(fid.as_external_error(FilterError::AttributeNotFilterable { attribute: fid.value(), - filterable_fields, + filterable_fields: filterable_fields.clone(), }))?; } } From 9af69c151be63327116208ae7cfc7385c729464c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 11:27:39 +0200 Subject: [PATCH 1594/1889] Limit the maximum depth of filters This should have no impact on the user but is there to safeguard meilisearch against malicious inputs. 
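The guard is the classic bounded-recursion pattern: every recursive rule now threads a `depth` counter and bails out once it passes `MAX_FILTER_DEPTH`, instead of letting an adversarial filter blow the stack. A minimal, self-contained sketch of that pattern follows; the toy `NOT`-only grammar, function name, and error string are illustrative stand-ins, and only the `MAX_FILTER_DEPTH` constant and the overall shape come from the actual diff below:

    const MAX_FILTER_DEPTH: usize = 200;

    /// Toy grammar: an expression is a bare condition preceded by any number
    /// of `NOT `. Returns how many `NOT`s were consumed.
    fn parse_not(input: &str, depth: usize) -> Result<usize, String> {
        // Refuse to recurse any deeper instead of risking a stack overflow.
        if depth > MAX_FILTER_DEPTH {
            return Err("The filter exceeded the maximum depth limit.".to_string());
        }
        match input.strip_prefix("NOT ") {
            Some(rest) => parse_not(rest, depth + 1).map(|n| n + 1),
            None => Ok(0),
        }
    }

    fn main() {
        // Shallow nesting parses fine...
        assert_eq!(parse_not("NOT NOT x = 1", 0), Ok(2));
        // ...but pathological nesting is rejected instead of crashing.
        let deep = "NOT ".repeat(500) + "x = 1";
        assert!(parse_not(&deep, 0).is_err());
    }

Failing fast with an error on inputs like a long run of leading `NOT`s is exactly the behaviour the snapshot tests added below pin down.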
--- filter-parser/src/error.rs | 5 +++ filter-parser/src/lib.rs | 79 +++++++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index ce9470ff8..05fc3a276 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -70,6 +70,7 @@ pub enum ErrorKind<'a> { MissingClosingDelimiter(char), Char(char), InternalError(error::ErrorKind), + DepthLimitReached, External(String), } @@ -176,6 +177,10 @@ impl<'a> Display for Error<'a> { ErrorKind::Char(c) => { panic!("Tried to display a char error with `{}`", c) } + ErrorKind::DepthLimitReached => writeln!( + f, + "The filter exceeded the maximum depth limit. Try rewriting the filter so that it contains fewer nested conditions." + )?, ErrorKind::InternalError(kind) => writeln!( f, "Encountered an internal `{:?}` error while parsing your filter. Please fill an issue", kind diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index aac8d7d35..b8d3272c8 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -65,6 +65,8 @@ pub type Span<'a> = LocatedSpan<&'a str, &'a str>; type IResult<'a, Ret> = nom::IResult<Span<'a>, Ret, Error<'a>>; +const MAX_FILTER_DEPTH: usize = 200; + #[derive(Debug, Clone, Eq)] pub struct Token<'a> { /// The token in the original input, it should be used when possible. @@ -231,10 +233,14 @@ fn parse_not_in(input: Span) -> IResult<FilterCondition> { } /// or = and ("OR" and) -fn parse_or(input: Span) -> IResult<FilterCondition> { - let (input, first_filter) = parse_and(input)?; +fn parse_or(input: Span, depth: usize) -> IResult<FilterCondition> { + if depth > MAX_FILTER_DEPTH { + return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); + } + let (input, first_filter) = parse_and(input, depth + 1)?; // if we found a `OR` then we MUST find something next - let (input, mut ors) = many0(preceded(ws(word_exact("OR")), cut(parse_and)))(input)?; + let (input, mut ors) = + many0(preceded(ws(word_exact("OR")), cut(|input| parse_and(input, depth + 1))))(input)?; let filter = if ors.is_empty() { first_filter @@ -247,10 +253,14 @@ } /// and = not ("AND" not)* -fn parse_and(input: Span) -> IResult<FilterCondition> { - let (input, first_filter) = parse_not(input)?; +fn parse_and(input: Span, depth: usize) -> IResult<FilterCondition> { + if depth > MAX_FILTER_DEPTH { + return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); + } + let (input, first_filter) = parse_not(input, depth + 1)?; // if we found a `AND` then we MUST find something next - let (input, mut ands) = many0(preceded(ws(word_exact("AND")), cut(parse_not)))(input)?; + let (input, mut ands) = + many0(preceded(ws(word_exact("AND")), cut(|input| parse_not(input, depth + 1))))(input)?; let filter = if ands.is_empty() { first_filter @@ -265,13 +275,19 @@ /// not = ("NOT" WS+ not) | primary /// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. 
-fn parse_not(input: Span) -> IResult { +fn parse_not(input: Span, depth: usize) -> IResult { + if depth > MAX_FILTER_DEPTH { + return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); + } alt(( - map(preceded(ws(word_exact("NOT")), cut(parse_not)), |e| match e { - FilterCondition::Not(e) => *e, - _ => FilterCondition::Not(Box::new(e)), - }), - parse_primary, + map( + preceded(ws(word_exact("NOT")), cut(|input| parse_not(input, depth + 1))), + |e| match e { + FilterCondition::Not(e) => *e, + _ => FilterCondition::Not(Box::new(e)), + }, + ), + |input| parse_primary(input, depth + 1), ))(input) } @@ -329,12 +345,15 @@ fn parse_error_reserved_keyword(input: Span) -> IResult { } /// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to -fn parse_primary(input: Span) -> IResult { +fn parse_primary(input: Span, depth: usize) -> IResult { + if depth > MAX_FILTER_DEPTH { + return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); + } alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis delimited( ws(char('(')), - cut(parse_expression), + cut(|input| parse_expression(input, depth + 1)), cut_with_err(ws(char(')')), |c| { Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(c.char())) }), @@ -355,13 +374,13 @@ fn parse_primary(input: Span) -> IResult { } /// expression = or -pub fn parse_expression(input: Span) -> IResult { - parse_or(input) +pub fn parse_expression(input: Span, depth: usize) -> IResult { + parse_or(input, depth) } /// filter = expression EOF pub fn parse_filter(input: Span) -> IResult { - terminated(parse_expression, eof)(input) + terminated(|input| parse_expression(input, 0), eof)(input) } #[cfg(test)] @@ -453,9 +472,20 @@ pub mod tests { @"OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, {colour} = {red}, AND[{colour} = {blue}, {size} = {7}, ], ]" ); - // test parentheses + // Test parentheses insta::assert_display_snapshot!(p!("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )"), @"AND[{channel} = {ponce}, OR[{dog race} != {bernese mountain}, {subscribers} > {1000}, ], ]"); insta::assert_display_snapshot!(p!("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)"), @"AND[OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ], _geoRadius({12}, {13}, {14}), ]"); + + // Test recursion + // This is the most that is allowed + insta::assert_display_snapshot!( + p!("(((((((((((((((((((((((((((((((((((((((((((((((((x = 1)))))))))))))))))))))))))))))))))))))))))))))))))"), + @"{x} = {1}" + ); + insta::assert_display_snapshot!( + p!("NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT 
NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1"), + @"NOT ({x} = {1})" + ); } #[test] @@ -602,6 +632,19 @@ pub mod tests { Was expecting a value but instead got `AND`, which is a reserved keyword. To use `AND` as a field name or a value, surround it by quotes. 1:4 AND = 8 "###); + + insta::assert_display_snapshot!(p!("((((((((((((((((((((((((((((((((((((((((((((((((((x = 1))))))))))))))))))))))))))))))))))))))))))))))))))"), @r###" + The filter exceeded the maximum depth limit. Try rewriting the filter so that it contains fewer nested conditions. + 51:106 ((((((((((((((((((((((((((((((((((((((((((((((((((x = 1)))))))))))))))))))))))))))))))))))))))))))))))))) + "###); + + insta::assert_display_snapshot!( + p!("NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1"), + @r###" + The filter exceeded the maximum depth limit. Try rewriting the filter so that it contains fewer nested conditions. + 797:802 NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1 + "### + ); } #[test] From 5d74ebd5e56c411b46be9e737de664dbb94ff6b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 11:36:38 +0200 Subject: [PATCH 1595/1889] Cargo fmt --- filter-parser/src/error.rs | 1 - filter-parser/src/lib.rs | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 05fc3a276..d5d36bd8e 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -140,7 +140,6 @@ impl<'a> Display for Error<'a> { ErrorKind::MissingClosingDelimiter(c) => { writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? } - ErrorKind::InvalidPrimary if input.trim().is_empty() => { writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? 
} diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index b8d3272c8..014b76bc5 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -43,6 +43,9 @@ mod condition; mod error; mod value; +use std::fmt::Debug; +use std::str::FromStr; + pub use condition::{parse_condition, parse_to, Condition}; use condition::{parse_exists, parse_not_exists}; use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; @@ -56,8 +59,6 @@ use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, terminated, tuple}; use nom::Finish; use nom_locate::LocatedSpan; -use std::fmt::Debug; -use std::str::FromStr; pub(crate) use value::parse_value; use value::word_exact; @@ -182,7 +183,7 @@ fn parse_value_list<'a>(input: Span<'a>) -> IResult>> { } } -/// "IN" "[" value_list "]" +/// "IN" WS* "[" value_list "]" fn parse_in_body(input: Span) -> IResult> { let (input, _) = ws(word_exact("IN"))(input)?; From dd34dbaca5dae0241d205766d63796bbb23876f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 11:55:01 +0200 Subject: [PATCH 1596/1889] Add more filter parser tests --- filter-parser/src/lib.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 014b76bc5..36307446f 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -487,6 +487,9 @@ pub mod tests { p!("NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1"), @"NOT ({x} = {1})" ); + + // Confusing keywords + insta::assert_display_snapshot!(p!(r#"NOT "OR" EXISTS AND "EXISTS" NOT EXISTS"#), @"AND[NOT ({OR} EXISTS), NOT ({EXISTS} EXISTS), ]"); } #[test] @@ -646,6 +649,11 @@ pub mod tests { 797:802 NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1 "### ); + + insta::assert_display_snapshot!(p!(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" + Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. 
+ 5:7 NOT OR EXISTS AND EXISTS NOT EXISTS + "###); } #[test] From 8a271223a9996ebf1593a1d29dcec85a7c12d5fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 13:03:55 +0200 Subject: [PATCH 1597/1889] Change a macro_rules to a function in filter parser --- filter-parser/src/lib.rs | 170 +++++++++++++++++++-------------------- 1 file changed, 83 insertions(+), 87 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 36307446f..ddb218759 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -401,90 +401,88 @@ pub mod tests { fn parse() { use FilterCondition as Fc; - macro_rules! p { - ($s:literal) => { - Fc::parse($s).unwrap().unwrap() - }; + fn p(s: &str) -> impl std::fmt::Display { + Fc::parse(s).unwrap().unwrap() } // Test equal - insta::assert_display_snapshot!(p!("channel = Ponce"), @"{channel} = {Ponce}"); - insta::assert_display_snapshot!(p!("subscribers = 12"), @"{subscribers} = {12}"); - insta::assert_display_snapshot!(p!("channel = 'Mister Mv'"), @"{channel} = {Mister Mv}"); - insta::assert_display_snapshot!(p!("channel = \"Mister Mv\""), @"{channel} = {Mister Mv}"); - insta::assert_display_snapshot!(p!("'dog race' = Borzoi"), @"{dog race} = {Borzoi}"); - insta::assert_display_snapshot!(p!("\"dog race\" = Chusky"), @"{dog race} = {Chusky}"); - insta::assert_display_snapshot!(p!("\"dog race\" = \"Bernese Mountain\""), @"{dog race} = {Bernese Mountain}"); - insta::assert_display_snapshot!(p!("'dog race' = 'Bernese Mountain'"), @"{dog race} = {Bernese Mountain}"); - insta::assert_display_snapshot!(p!("\"dog race\" = 'Bernese Mountain'"), @"{dog race} = {Bernese Mountain}"); + insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}"); + insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}"); + insta::assert_display_snapshot!(p("channel = 'Mister Mv'"), @"{channel} = {Mister Mv}"); + insta::assert_display_snapshot!(p("channel = \"Mister Mv\""), @"{channel} = {Mister Mv}"); + insta::assert_display_snapshot!(p("'dog race' = Borzoi"), @"{dog race} = {Borzoi}"); + insta::assert_display_snapshot!(p("\"dog race\" = Chusky"), @"{dog race} = {Chusky}"); + insta::assert_display_snapshot!(p("\"dog race\" = \"Bernese Mountain\""), @"{dog race} = {Bernese Mountain}"); + insta::assert_display_snapshot!(p("'dog race' = 'Bernese Mountain'"), @"{dog race} = {Bernese Mountain}"); + insta::assert_display_snapshot!(p("\"dog race\" = 'Bernese Mountain'"), @"{dog race} = {Bernese Mountain}"); // Test IN - insta::assert_display_snapshot!(p!("colour IN[]"), @"{colour} IN[]"); - insta::assert_display_snapshot!(p!("colour IN[green]"), @"{colour} IN[{green}, ]"); - insta::assert_display_snapshot!(p!("colour IN[green,]"), @"{colour} IN[{green}, ]"); - insta::assert_display_snapshot!(p!("colour NOT IN[green,blue]"), @"NOT ({colour} IN[{green}, {blue}, ])"); - insta::assert_display_snapshot!(p!(" colour IN [ green , blue , ]"), @"{colour} IN[{green}, {blue}, ]"); + insta::assert_display_snapshot!(p("colour IN[]"), @"{colour} IN[]"); + insta::assert_display_snapshot!(p("colour IN[green]"), @"{colour} IN[{green}, ]"); + insta::assert_display_snapshot!(p("colour IN[green,]"), @"{colour} IN[{green}, ]"); + insta::assert_display_snapshot!(p("colour NOT IN[green,blue]"), @"NOT ({colour} IN[{green}, {blue}, ])"); + insta::assert_display_snapshot!(p(" colour IN [ green , blue , ]"), @"{colour} IN[{green}, {blue}, ]"); // Test IN + OR/AND/() - insta::assert_display_snapshot!(p!(" colour IN [green, 
blue] AND color = green "), @"AND[{colour} IN[{green}, {blue}, ], {color} = {green}, ]"); - insta::assert_display_snapshot!(p!("NOT (colour IN [green, blue]) AND color = green "), @"AND[NOT ({colour} IN[{green}, {blue}, ]), {color} = {green}, ]"); - insta::assert_display_snapshot!(p!("x = 1 OR NOT (colour IN [green, blue] OR color = green) "), @"OR[{x} = {1}, NOT (OR[{colour} IN[{green}, {blue}, ], {color} = {green}, ]), ]"); + insta::assert_display_snapshot!(p(" colour IN [green, blue] AND color = green "), @"AND[{colour} IN[{green}, {blue}, ], {color} = {green}, ]"); + insta::assert_display_snapshot!(p("NOT (colour IN [green, blue]) AND color = green "), @"AND[NOT ({colour} IN[{green}, {blue}, ]), {color} = {green}, ]"); + insta::assert_display_snapshot!(p("x = 1 OR NOT (colour IN [green, blue] OR color = green) "), @"OR[{x} = {1}, NOT (OR[{colour} IN[{green}, {blue}, ], {color} = {green}, ]), ]"); // Test whitespace start/end - insta::assert_display_snapshot!(p!(" colour = green "), @"{colour} = {green}"); - insta::assert_display_snapshot!(p!(" (colour = green OR colour = red) "), @"OR[{colour} = {green}, {colour} = {red}, ]"); - insta::assert_display_snapshot!(p!(" colour IN [green, blue] AND color = green "), @"AND[{colour} IN[{green}, {blue}, ], {color} = {green}, ]"); - insta::assert_display_snapshot!(p!(" colour NOT IN [green, blue] "), @"NOT ({colour} IN[{green}, {blue}, ])"); - insta::assert_display_snapshot!(p!(" colour IN [green, blue] "), @"{colour} IN[{green}, {blue}, ]"); + insta::assert_display_snapshot!(p(" colour = green "), @"{colour} = {green}"); + insta::assert_display_snapshot!(p(" (colour = green OR colour = red) "), @"OR[{colour} = {green}, {colour} = {red}, ]"); + insta::assert_display_snapshot!(p(" colour IN [green, blue] AND color = green "), @"AND[{colour} IN[{green}, {blue}, ], {color} = {green}, ]"); + insta::assert_display_snapshot!(p(" colour NOT IN [green, blue] "), @"NOT ({colour} IN[{green}, {blue}, ])"); + insta::assert_display_snapshot!(p(" colour IN [green, blue] "), @"{colour} IN[{green}, {blue}, ]"); // Test conditions - insta::assert_display_snapshot!(p!("channel != ponce"), @"{channel} != {ponce}"); - insta::assert_display_snapshot!(p!("NOT channel = ponce"), @"NOT ({channel} = {ponce})"); - insta::assert_display_snapshot!(p!("subscribers < 1000"), @"{subscribers} < {1000}"); - insta::assert_display_snapshot!(p!("subscribers > 1000"), @"{subscribers} > {1000}"); - insta::assert_display_snapshot!(p!("subscribers <= 1000"), @"{subscribers} <= {1000}"); - insta::assert_display_snapshot!(p!("subscribers >= 1000"), @"{subscribers} >= {1000}"); - insta::assert_display_snapshot!(p!("subscribers <= 1000"), @"{subscribers} <= {1000}"); - insta::assert_display_snapshot!(p!("subscribers 100 TO 1000"), @"{subscribers} {100} TO {1000}"); + insta::assert_display_snapshot!(p("channel != ponce"), @"{channel} != {ponce}"); + insta::assert_display_snapshot!(p("NOT channel = ponce"), @"NOT ({channel} = {ponce})"); + insta::assert_display_snapshot!(p("subscribers < 1000"), @"{subscribers} < {1000}"); + insta::assert_display_snapshot!(p("subscribers > 1000"), @"{subscribers} > {1000}"); + insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}"); + insta::assert_display_snapshot!(p("subscribers >= 1000"), @"{subscribers} >= {1000}"); + insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}"); + insta::assert_display_snapshot!(p("subscribers 100 TO 1000"), @"{subscribers} {100} TO {1000}"); // Test NOT + EXISTS - 
insta::assert_display_snapshot!(p!("subscribers EXISTS"), @"{subscribers} EXISTS"); - insta::assert_display_snapshot!(p!("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})"); - insta::assert_display_snapshot!(p!("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); - insta::assert_display_snapshot!(p!("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); - insta::assert_display_snapshot!(p!("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS"); - insta::assert_display_snapshot!(p!("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); - insta::assert_display_snapshot!(p!("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); + insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); + insta::assert_display_snapshot!(p("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})"); + insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS"); + insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); // Test nested NOT - insta::assert_display_snapshot!(p!("NOT NOT NOT NOT x = 5"), @"{x} = {5}"); - insta::assert_display_snapshot!(p!("NOT NOT (NOT NOT x = 5)"), @"{x} = {5}"); + insta::assert_display_snapshot!(p("NOT NOT NOT NOT x = 5"), @"{x} = {5}"); + insta::assert_display_snapshot!(p("NOT NOT (NOT NOT x = 5)"), @"{x} = {5}"); // Test geo radius - insta::assert_display_snapshot!(p!("_geoRadius(12, 13, 14)"), @"_geoRadius({12}, {13}, {14})"); - insta::assert_display_snapshot!(p!("NOT _geoRadius(12, 13, 14)"), @"NOT (_geoRadius({12}, {13}, {14}))"); + insta::assert_display_snapshot!(p("_geoRadius(12, 13, 14)"), @"_geoRadius({12}, {13}, {14})"); + insta::assert_display_snapshot!(p("NOT _geoRadius(12, 13, 14)"), @"NOT (_geoRadius({12}, {13}, {14}))"); // Test OR + AND - insta::assert_display_snapshot!(p!("channel = ponce AND 'dog race' != 'bernese mountain'"), @"AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ]"); - insta::assert_display_snapshot!(p!("channel = ponce OR 'dog race' != 'bernese mountain'"), @"OR[{channel} = {ponce}, {dog race} != {bernese mountain}, ]"); - insta::assert_display_snapshot!(p!("channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000"), @"OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ]"); + insta::assert_display_snapshot!(p("channel = ponce AND 'dog race' != 'bernese mountain'"), @"AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ]"); + insta::assert_display_snapshot!(p("channel = ponce OR 'dog race' != 'bernese mountain'"), @"OR[{channel} = {ponce}, {dog race} != {bernese mountain}, ]"); + insta::assert_display_snapshot!(p("channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000"), @"OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ]"); insta::assert_display_snapshot!( - p!("channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000 OR colour = red OR colour = blue AND size = 7"), + p("channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000 OR colour = red OR colour = blue AND size = 7"), @"OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > 
{1000}, {colour} = {red}, AND[{colour} = {blue}, {size} = {7}, ], ]" ); // Test parentheses - insta::assert_display_snapshot!(p!("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )"), @"AND[{channel} = {ponce}, OR[{dog race} != {bernese mountain}, {subscribers} > {1000}, ], ]"); - insta::assert_display_snapshot!(p!("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)"), @"AND[OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ], _geoRadius({12}, {13}, {14}), ]"); + insta::assert_display_snapshot!(p("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )"), @"AND[{channel} = {ponce}, OR[{dog race} != {bernese mountain}, {subscribers} > {1000}, ], ]"); + insta::assert_display_snapshot!(p("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)"), @"AND[OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ], _geoRadius({12}, {13}, {14}), ]"); // Test recursion // This is the most that is allowed insta::assert_display_snapshot!( - p!("(((((((((((((((((((((((((((((((((((((((((((((((((x = 1)))))))))))))))))))))))))))))))))))))))))))))))))"), + p("(((((((((((((((((((((((((((((((((((((((((((((((((x = 1)))))))))))))))))))))))))))))))))))))))))))))))))"), @"{x} = {1}" ); insta::assert_display_snapshot!( - p!("NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1"), + p("NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1"), @"NOT ({x} = {1})" ); @@ -496,161 +494,159 @@ pub mod tests { fn error() { use FilterCondition as Fc; - macro_rules! p { - ($s:literal) => { - Fc::parse($s).unwrap_err().to_string() - }; + fn p(s: &str) -> impl std::fmt::Display { + Fc::parse(s).unwrap_err().to_string() } - insta::assert_display_snapshot!(p!("channel = Ponce = 12"), @r###" + insta::assert_display_snapshot!(p("channel = Ponce = 12"), @r###" Found unexpected characters at the end of the filter: `= 12`. 
You probably forgot an `OR` or an `AND` rule. 17:21 channel = Ponce = 12 "###); - insta::assert_display_snapshot!(p!("channel = "), @r###" + insta::assert_display_snapshot!(p("channel = "), @r###" Was expecting a value but instead got nothing. 14:14 channel = "###); - insta::assert_display_snapshot!(p!("channel = 🐻"), @r###" + insta::assert_display_snapshot!(p("channel = 🐻"), @r###" Was expecting a value but instead got `🐻`. 11:12 channel = 🐻 "###); - insta::assert_display_snapshot!(p!("channel = 🐻 AND followers < 100"), @r###" + insta::assert_display_snapshot!(p("channel = 🐻 AND followers < 100"), @r###" Was expecting a value but instead got `🐻`. 11:12 channel = 🐻 AND followers < 100 "###); - insta::assert_display_snapshot!(p!("'OR'"), @r###" + insta::assert_display_snapshot!(p("'OR'"), @r###" Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `\'OR\'`. 1:5 'OR' "###); - insta::assert_display_snapshot!(p!("OR"), @r###" + insta::assert_display_snapshot!(p("OR"), @r###" Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. 1:3 OR "###); - insta::assert_display_snapshot!(p!("channel Ponce"), @r###" + insta::assert_display_snapshot!(p("channel Ponce"), @r###" Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`. 1:14 channel Ponce "###); - insta::assert_display_snapshot!(p!("channel = Ponce OR"), @r###" + insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###" Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing. 19:19 channel = Ponce OR "###); - insta::assert_display_snapshot!(p!("_geoRadius"), @r###" + insta::assert_display_snapshot!(p("_geoRadius"), @r###" The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`. 1:11 _geoRadius "###); - insta::assert_display_snapshot!(p!("_geoRadius = 12"), @r###" + insta::assert_display_snapshot!(p("_geoRadius = 12"), @r###" The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`. 1:16 _geoRadius = 12 "###); - insta::assert_display_snapshot!(p!("_geoPoint(12, 13, 14)"), @r###" + insta::assert_display_snapshot!(p("_geoPoint(12, 13, 14)"), @r###" `_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates. 1:22 _geoPoint(12, 13, 14) "###); - insta::assert_display_snapshot!(p!("position <= _geoPoint(12, 13, 14)"), @r###" + insta::assert_display_snapshot!(p("position <= _geoPoint(12, 13, 14)"), @r###" `_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates. 13:34 position <= _geoPoint(12, 13, 14) "###); - insta::assert_display_snapshot!(p!("position <= _geoRadius(12, 13, 14)"), @r###" + insta::assert_display_snapshot!(p("position <= _geoRadius(12, 13, 14)"), @r###" The `_geoRadius` filter is an operation and can't be used as a value. 13:35 position <= _geoRadius(12, 13, 14) "###); - insta::assert_display_snapshot!(p!("channel = 'ponce"), @r###" + insta::assert_display_snapshot!(p("channel = 'ponce"), @r###" Expression `\'ponce` is missing the following closing delimiter: `'`. 
11:17 channel = 'ponce "###); - insta::assert_display_snapshot!(p!("channel = \"ponce"), @r###" + insta::assert_display_snapshot!(p("channel = \"ponce"), @r###" Expression `\"ponce` is missing the following closing delimiter: `"`. 11:17 channel = "ponce "###); - insta::assert_display_snapshot!(p!("channel = mv OR (followers >= 1000"), @r###" + insta::assert_display_snapshot!(p("channel = mv OR (followers >= 1000"), @r###" Expression `(followers >= 1000` is missing the following closing delimiter: `)`. 17:35 channel = mv OR (followers >= 1000 "###); - insta::assert_display_snapshot!(p!("channel = mv OR followers >= 1000)"), @r###" + insta::assert_display_snapshot!(p("channel = mv OR followers >= 1000)"), @r###" Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule. 34:35 channel = mv OR followers >= 1000) "###); - insta::assert_display_snapshot!(p!("colour NOT EXIST"), @r###" + insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###" Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`. 1:17 colour NOT EXIST "###); - insta::assert_display_snapshot!(p!("subscribers 100 TO1000"), @r###" + insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###" Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`. 1:23 subscribers 100 TO1000 "###); - insta::assert_display_snapshot!(p!("channel = ponce ORdog != 'bernese mountain'"), @r###" + insta::assert_display_snapshot!(p("channel = ponce ORdog != 'bernese mountain'"), @r###" Found unexpected characters at the end of the filter: `ORdog != \'bernese mountain\'`. You probably forgot an `OR` or an `AND` rule. 17:44 channel = ponce ORdog != 'bernese mountain' "###); - insta::assert_display_snapshot!(p!("colour IN blue, green]"), @r###" + insta::assert_display_snapshot!(p("colour IN blue, green]"), @r###" Expected `[` after `IN` keyword. 11:23 colour IN blue, green] "###); - insta::assert_display_snapshot!(p!("colour IN [blue, green, 'blue' > 2]"), @r###" + insta::assert_display_snapshot!(p("colour IN [blue, green, 'blue' > 2]"), @r###" Expected only comma-separated field names inside `IN[..]` but instead found `> 2]`. 32:36 colour IN [blue, green, 'blue' > 2] "###); - insta::assert_display_snapshot!(p!("colour IN [blue, green, AND]"), @r###" + insta::assert_display_snapshot!(p("colour IN [blue, green, AND]"), @r###" Expected only comma-separated field names inside `IN[..]` but instead found `AND]`. 25:29 colour IN [blue, green, AND] "###); - insta::assert_display_snapshot!(p!("colour IN [blue, green"), @r###" + insta::assert_display_snapshot!(p("colour IN [blue, green"), @r###" Expected matching `]` after the list of field names given to `IN[` 23:23 colour IN [blue, green "###); - insta::assert_display_snapshot!(p!("colour IN ['blue, green"), @r###" + insta::assert_display_snapshot!(p("colour IN ['blue, green"), @r###" Expression `\'blue, green` is missing the following closing delimiter: `'`. 12:24 colour IN ['blue, green "###); - insta::assert_display_snapshot!(p!("x = EXISTS"), @r###" + insta::assert_display_snapshot!(p("x = EXISTS"), @r###" Was expecting a value but instead got `EXISTS`, which is a reserved keyword. To use `EXISTS` as a field name or a value, surround it by quotes. 
5:11 x = EXISTS "###); - insta::assert_display_snapshot!(p!("AND = 8"), @r###" + insta::assert_display_snapshot!(p("AND = 8"), @r###" Was expecting a value but instead got `AND`, which is a reserved keyword. To use `AND` as a field name or a value, surround it by quotes. 1:4 AND = 8 "###); - insta::assert_display_snapshot!(p!("((((((((((((((((((((((((((((((((((((((((((((((((((x = 1))))))))))))))))))))))))))))))))))))))))))))))))))"), @r###" + insta::assert_display_snapshot!(p("((((((((((((((((((((((((((((((((((((((((((((((((((x = 1))))))))))))))))))))))))))))))))))))))))))))))))))"), @r###" The filter exceeded the maximum depth limit. Try rewriting the filter so that it contains fewer nested conditions. 51:106 ((((((((((((((((((((((((((((((((((((((((((((((((((x = 1)))))))))))))))))))))))))))))))))))))))))))))))))) "###); insta::assert_display_snapshot!( - p!("NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1"), + p("NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1"), @r###" The filter exceeded the maximum depth limit. Try rewriting the filter so that it contains fewer nested conditions. 
797:802 NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT NOT x = 1 "### ); - insta::assert_display_snapshot!(p!(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" + insta::assert_display_snapshot!(p(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. 5:7 NOT OR EXISTS AND EXISTS NOT EXISTS "###); From 9b6602cba2edd365ca1afa8786f8ce723f9f873d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 13:06:57 +0200 Subject: [PATCH 1598/1889] Avoid cloning FilterCondition in filter array parsing --- milli/src/search/facet/filter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 2f6fb5b00..7241dab2b 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -104,7 +104,7 @@ impl<'a> Filter<'a> { if ors.len() > 1 { ands.push(FilterCondition::Or(ors)); } else if ors.len() == 1 { - ands.push(ors[0].clone()); + ands.push(ors.pop().unwrap()); } } Either::Right(rule) => { @@ -117,7 +117,7 @@ impl<'a> Filter<'a> { let and = if ands.is_empty() { return Ok(None); } else if ands.len() == 1 { - ands[0].clone() + ands.pop().unwrap() } else { FilterCondition::And(ands) }; From c7a86b56efddda4b647be6fce10fef9c8322e140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 18 Aug 2022 13:16:56 +0200 Subject: [PATCH 1599/1889] Fix filter parser compilation error --- filter-parser/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index ddb218759..33025e6e9 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -401,7 +401,7 @@ pub mod tests { fn parse() { use FilterCondition as Fc; - fn p(s: &str) -> impl std::fmt::Display { + fn p<'a>(s: &'a str) -> impl std::fmt::Display + 'a { Fc::parse(s).unwrap().unwrap() } @@ -487,14 +487,14 @@ pub mod tests { ); // Confusing keywords - insta::assert_display_snapshot!(p!(r#"NOT "OR" EXISTS AND "EXISTS" NOT EXISTS"#), @"AND[NOT ({OR} EXISTS), NOT ({EXISTS} EXISTS), ]"); + insta::assert_display_snapshot!(p(r#"NOT "OR" EXISTS AND "EXISTS" NOT EXISTS"#), @"AND[NOT ({OR} EXISTS), NOT ({EXISTS} EXISTS), ]"); } #[test] fn error() { use FilterCondition as Fc; - fn p(s: &str) -> impl std::fmt::Display { + fn p<'a>(s: &'a str) -> impl std::fmt::Display + 'a { Fc::parse(s).unwrap_err().to_string() } From 9640976c797c8fba8f2aa4bcb313dd860556c237 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 18 Aug 2022 17:36:08 +0200 Subject: [PATCH 1600/1889] Rename TermMatchingPolicies --- milli/src/lib.rs | 2 +- milli/src/search/mod.rs | 28 ++- 
milli/src/search/query_tree.rs | 228 ++++++++++++++++-------- milli/src/update/index_documents/mod.rs | 7 +- milli/tests/search/distinct.rs | 27 +-- milli/tests/search/filters.rs | 13 +- milli/tests/search/mod.rs | 8 +- milli/tests/search/query_criteria.rs | 8 +- milli/tests/search/sort.rs | 4 +- milli/tests/search/typo_tolerance.rs | 22 +-- 10 files changed, 222 insertions(+), 125 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ac88ebdab..517d28ccc 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -42,7 +42,7 @@ pub use self::heed_codec::{ pub use self::index::Index; pub use self::search::{ FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, - MatchingWords, Search, SearchResult, DEFAULT_VALUES_PER_FACET, + MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; pub type Result<T> = std::result::Result<T, Error>; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 1930091ef..3da8823dc 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -44,7 +44,7 @@ pub struct Search<'a> { offset: usize, limit: usize, sort_criteria: Option<Vec<AscDesc>>, - optional_words: bool, + optional_words: TermsMatchingStrategy, authorize_typos: bool, words_limit: usize, rtxn: &'a heed::RoTxn<'a>, @@ -59,7 +59,7 @@ impl<'a> Search<'a> { offset: 0, limit: 20, sort_criteria: None, - optional_words: true, + optional_words: TermsMatchingStrategy::default(), authorize_typos: true, words_limit: 10, rtxn, @@ -87,7 +87,7 @@ impl<'a> Search<'a> { self } - pub fn optional_words(&mut self, value: bool) -> &mut Search<'a> { + pub fn optional_words(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> { self.optional_words = value; self } @@ -286,6 +286,28 @@ pub struct SearchResult { pub documents_ids: Vec<u32>, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TermsMatchingStrategy { + // remove last word first + Last, + // remove first word first + First, + // remove the most frequent word first + Frequency, + // remove smallest word first + Size, + // only one of the words is mandatory + Any, + // all words are mandatory + All, +} + +impl Default for TermsMatchingStrategy { + fn default() -> Self { + Self::Last + } +} + pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>; pub fn word_derivations<'c>( diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 617d9e4d9..c3d2f3669 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::cmp::min; use std::{cmp, fmt, mem}; use charabia::classifier::ClassifiedTokenIter; @@ -8,6 +9,7 @@ use roaring::RoaringBitmap; use slice_group_by::GroupBy; use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; +use crate::search::TermsMatchingStrategy; use crate::{Index, MatchingWords, Result}; type IsOptionalWord = bool; @@ -62,6 +64,13 @@ impl Operation { if ops.len() == 1 { ops.pop().unwrap() } else { + let ops = ops + .into_iter() + .flat_map(|o| match o { + Operation::Or(wb, children) if wb == word_branch => children, + op => vec![op], + }) + .collect(); Self::Or(word_branch, ops) } } @@ -153,7 +162,7 @@ trait Context { pub struct QueryTreeBuilder<'a> { rtxn: &'a heed::RoTxn<'a>, index: &'a Index, - optional_words: bool, + optional_words: TermsMatchingStrategy, authorize_typos: bool, words_limit: Option<usize>, exact_words: Option<fst::Set<Cow<'a, [u8]>>>, @@ -190,7 +199,7 @@ impl<'a> QueryTreeBuilder<'a> { Ok(Self { rtxn, index, - optional_words: true, + optional_words: 
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 617d9e4d9..c3d2f3669 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::cmp::min;
 use std::{cmp, fmt, mem};
 
 use charabia::classifier::ClassifiedTokenIter;
@@ -8,6 +9,7 @@ use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
 
 use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
+use crate::search::TermsMatchingStrategy;
 use crate::{Index, MatchingWords, Result};
 
 type IsOptionalWord = bool;
@@ -62,6 +64,13 @@ impl Operation {
         if ops.len() == 1 {
             ops.pop().unwrap()
         } else {
+            let ops = ops
+                .into_iter()
+                .flat_map(|o| match o {
+                    Operation::Or(wb, children) if wb == word_branch => children,
+                    op => vec![op],
+                })
+                .collect();
             Self::Or(word_branch, ops)
         }
     }
@@ -153,7 +162,7 @@ trait Context {
 pub struct QueryTreeBuilder<'a> {
     rtxn: &'a heed::RoTxn<'a>,
     index: &'a Index,
-    optional_words: bool,
+    optional_words: TermsMatchingStrategy,
     authorize_typos: bool,
     words_limit: Option<usize>,
     exact_words: Option<fst::Set<Cow<'a, [u8]>>>,
@@ -190,7 +199,7 @@ impl<'a> QueryTreeBuilder<'a> {
         Ok(Self {
             rtxn,
             index,
-            optional_words: true,
+            optional_words: TermsMatchingStrategy::default(),
             authorize_typos: true,
             words_limit: None,
             exact_words: index.exact_words(rtxn)?,
@@ -201,7 +210,7 @@ impl<'a> QueryTreeBuilder<'a> {
     /// generated forcing all query words to be present in each matching documents
     /// (the criterion `words` will be ignored).
     /// default value if not called: `true`
-    pub fn optional_words(&mut self, optional_words: bool) -> &mut Self {
+    pub fn optional_words(&mut self, optional_words: TermsMatchingStrategy) -> &mut Self {
         self.optional_words = optional_words;
         self
     }
@@ -323,7 +332,7 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Vec<
 
 fn create_query_tree(
     ctx: &impl Context,
-    optional_words: bool,
+    optional_words: TermsMatchingStrategy,
     authorize_typos: bool,
     query: &[PrimitiveQueryPart],
 ) -> Result<Operation> {
@@ -363,6 +372,7 @@ fn create_query_tree(
         ctx: &impl Context,
         authorize_typos: bool,
         query: &[PrimitiveQueryPart],
+        any_words: bool,
     ) -> Result<Operation> {
@@ -415,57 +425,93 @@ fn create_query_tree(
             }
 
             if !is_last {
-                let ngrams = ngrams(ctx, authorize_typos, tail)?;
+                let ngrams = ngrams(ctx, authorize_typos, tail, any_words)?;
                 and_op_children.push(ngrams);
             }
-            or_op_children.push(Operation::and(and_op_children));
+
+            if any_words {
+                or_op_children.push(Operation::or(false, and_op_children));
+            } else {
+                or_op_children.push(Operation::and(and_op_children));
+            }
         }
     }
     op_children.push(Operation::or(false, or_op_children));
 }
 
-    Ok(Operation::and(op_children))
-    }
-
-    /// Create a new branch removing the last non-phrase query parts.
-    fn optional_word(
-        ctx: &impl Context,
-        authorize_typos: bool,
-        query: PrimitiveQuery,
-    ) -> Result<Operation> {
-        let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
-        let mut operation_children = Vec::new();
-
-        let start = number_phrases + (number_phrases == 0) as usize;
-        for len in start..=query.len() {
-            let mut word_count = len - number_phrases;
-            let query: Vec<_> = query
-                .iter()
-                .filter(|p| {
-                    if p.is_phrase() {
-                        true
-                    } else if word_count != 0 {
-                        word_count -= 1;
-                        true
-                    } else {
-                        false
-                    }
-                })
-                .cloned()
-                .collect();
-
-            let ngrams = ngrams(ctx, authorize_typos, &query)?;
-            operation_children.push(ngrams);
+    if any_words {
+        Ok(Operation::or(false, op_children))
+    } else {
+        Ok(Operation::and(op_children))
     }
-
-        Ok(Operation::or(true, operation_children))
     }
 
-    if optional_words {
-        optional_word(ctx, authorize_typos, query.to_vec())
-    } else {
-        ngrams(ctx, authorize_typos, query)
+    let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
+    let remove_count = query.len() - min(number_phrases, 1);
+    if remove_count == 0 {
+        return ngrams(ctx, authorize_typos, query, false);
     }
+
+    let mut operation_children = Vec::new();
+    let mut query = query.to_vec();
+    for _ in 0..remove_count {
+        let pos = match optional_words {
+            TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false),
+            TermsMatchingStrategy::Any => {
+                let operation = Operation::Or(
+                    true,
+                    vec![
+                        // branch allowing matching documents to contain any query word.
+                        ngrams(ctx, authorize_typos, &query, true)?,
+                        // branch forcing matching documents to contain all the query words,
+                        // keeping these documents at the top of the resulting list.
+ ngrams(ctx, authorize_typos, &query, false)?, + ], + ); + + return Ok(operation); + } + TermsMatchingStrategy::Last => query + .iter() + .enumerate() + .filter(|(_, part)| !part.is_phrase()) + .last() + .map(|(pos, _)| pos), + TermsMatchingStrategy::First => { + query.iter().enumerate().find(|(_, part)| !part.is_phrase()).map(|(pos, _)| pos) + } + TermsMatchingStrategy::Size => query + .iter() + .enumerate() + .filter(|(_, part)| !part.is_phrase()) + .min_by_key(|(_, part)| match part { + PrimitiveQueryPart::Word(s, _) => s.len(), + _ => unreachable!(), + }) + .map(|(pos, _)| pos), + TermsMatchingStrategy::Frequency => query + .iter() + .enumerate() + .filter(|(_, part)| !part.is_phrase()) + .max_by_key(|(_, part)| match part { + PrimitiveQueryPart::Word(s, _) => { + ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value()) + } + _ => unreachable!(), + }) + .map(|(pos, _)| pos), + }; + + // compute and push the current branch on the front + operation_children.insert(0, ngrams(ctx, authorize_typos, &query, false)?); + // remove word from query before creating an new branch + match pos { + Some(pos) => query.remove(pos), + None => break, + }; + } + + Ok(Operation::Or(true, operation_children)) } /// Main function that matchings words used for crop and highlight. @@ -750,7 +796,7 @@ mod test { impl TestContext { fn build>( &self, - optional_words: bool, + optional_words: TermsMatchingStrategy, authorize_typos: bool, words_limit: Option, query: ClassifiedTokenIter, @@ -852,8 +898,10 @@ mod test { let query = "hey friends"; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR @@ -869,8 +917,10 @@ mod test { let query = "hey friends "; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR @@ -886,8 +936,10 @@ mod test { let query = "hello world "; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR @@ -911,8 +963,10 @@ mod test { let query = "new york city "; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR @@ -932,12 +986,11 @@ mod test { Exact { word: "city" } Tolerant { word: "newyork", max typo: 1 } Exact { word: "city" } - OR - Exact { word: "nyc" } - AND - Exact { word: "new" } - Exact { word: "york" } - Tolerant { word: "newyorkcity", max typo: 1 } + Exact { word: "nyc" } + AND + Exact { word: "new" } + Exact { word: "york" } + Tolerant { word: "newyorkcity", max typo: 1 } "###); } @@ -946,8 +999,10 @@ mod test { let query = "n grams "; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, true, None, 
tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR @@ -963,8 +1018,10 @@ mod test { let query = "wordsplit fish "; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR @@ -982,8 +1039,10 @@ mod test { let query = "\"hey friends\" \" \" \"wooop"; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" AND @@ -997,8 +1056,10 @@ mod test { let query = "\"hey friends. wooop wooop\""; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" AND @@ -1012,8 +1073,10 @@ mod test { let query = "hey my friend "; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::default(), true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR(WORD) @@ -1043,8 +1106,10 @@ mod test { let query = "\"hey my\""; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::default(), true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" PHRASE ["hey", "my"] @@ -1056,8 +1121,10 @@ mod test { let query = r#""hey" my good "friend""#; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::default(), true, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR(WORD) @@ -1084,8 +1151,10 @@ mod test { let query = "hey friends "; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, false, None, tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" OR @@ -1101,8 +1170,10 @@ mod test { let query = "\"hey my\" good friend"; let tokens = query.tokenize(); - let (query_tree, _) = - TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, false, Some(2), tokens) + .unwrap() + .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" AND @@ -1145,7 +1216,8 @@ mod test { let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); let context = 
TestContext { exact_words, ..Default::default() }; - let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap(); + let (query_tree, _) = + context.build(TermsMatchingStrategy::All, true, Some(2), tokens).unwrap().unwrap(); assert!(matches!( query_tree, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f5e04435d..3a8f961ac 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -613,6 +613,7 @@ mod tests { use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::search::TermsMatchingStrategy; use crate::update::DeleteDocuments; use crate::BEU16; @@ -1207,7 +1208,7 @@ mod tests { let mut search = crate::Search::new(&rtxn, &index); search.query("document"); search.authorize_typos(true); - search.optional_words(true); + search.optional_words(TermsMatchingStrategy::default()); // all documents should be returned let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); assert_eq!(documents_ids.len(), 4); @@ -1313,7 +1314,7 @@ mod tests { let mut search = crate::Search::new(&rtxn, &index); search.query("document"); search.authorize_typos(true); - search.optional_words(true); + search.optional_words(TermsMatchingStrategy::default()); // all documents should be returned let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); assert_eq!(documents_ids.len(), 4); @@ -1512,7 +1513,7 @@ mod tests { let mut search = crate::Search::new(&rtxn, &index); search.query("化妆包"); search.authorize_typos(true); - search.optional_words(true); + search.optional_words(TermsMatchingStrategy::default()); // only 1 document should be returned let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index 022724fde..9e9905c3f 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -2,7 +2,7 @@ use std::collections::HashSet; use big_s::S; use milli::update::Settings; -use milli::{Criterion, Search, SearchResult}; +use milli::{Criterion, Search, SearchResult, TermsMatchingStrategy}; use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -28,24 +28,25 @@ macro_rules! test_distinct { search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); - search.optional_words(true); + search.optional_words(TermsMatchingStrategy::default()); let SearchResult { documents_ids, candidates, .. 
} = search.execute().unwrap(); assert_eq!(candidates.len(), $n_res); let mut distinct_values = HashSet::new(); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) - .into_iter() - .filter_map(|d| { - if distinct_values.contains(&d.$distinct) { - None - } else { - distinct_values.insert(d.$distinct.to_owned()); - Some(d.id) - } - }) - .collect(); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[]) + .into_iter() + .filter_map(|d| { + if distinct_values.contains(&d.$distinct) { + None + } else { + distinct_values.insert(d.$distinct.to_owned()); + Some(d.id) + } + }) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 5451a9076..675004b56 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -1,5 +1,5 @@ use either::{Either, Left, Right}; -use milli::{Criterion, Filter, Search, SearchResult}; +use milli::{Criterion, Filter, Search, SearchResult, TermsMatchingStrategy}; use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -19,16 +19,17 @@ macro_rules! test_filter { search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); - search.optional_words(true); + search.optional_words(TermsMatchingStrategy::default()); search.filter(filter_conditions); let SearchResult { documents_ids, .. } = search.execute().unwrap(); let filtered_ids = search::expected_filtered_ids($filter); - let expected_external_ids: Vec<_> = search::expected_order(&criteria, true, true, &[]) - .into_iter() - .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) - .collect(); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[]) + .into_iter() + .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) + .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); assert_eq!(documents_ids, expected_external_ids); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 0e1d43d2a..4ec1aeb83 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -8,7 +8,7 @@ use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object}; +use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy}; use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; @@ -96,7 +96,7 @@ pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> V pub fn expected_order( criteria: &[Criterion], authorize_typo: bool, - optional_words: bool, + optional_words: TermsMatchingStrategy, sort_by: &[AscDesc], ) -> Vec { let dataset = @@ -155,9 +155,9 @@ pub fn expected_order( groups = std::mem::take(&mut new_groups); } - if authorize_typo && optional_words { + if authorize_typo && optional_words == TermsMatchingStrategy::default() { groups.into_iter().flatten().collect() - } else if optional_words { + } else if optional_words == TermsMatchingStrategy::default() { groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect() } else if authorize_typo { 
groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect() diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index a96366f5e..0fce7c6df 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -7,7 +7,7 @@ use itertools::Itertools; use maplit::hashset; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; +use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; use rand::Rng; use Criterion::*; @@ -15,8 +15,8 @@ use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; const ALLOW_TYPOS: bool = true; const DISALLOW_TYPOS: bool = false; -const ALLOW_OPTIONAL_WORDS: bool = true; -const DISALLOW_OPTIONAL_WORDS: bool = false; +const ALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::Last; +const DISALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::All; const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000; macro_rules! test_criterion { @@ -359,7 +359,7 @@ fn criteria_mixup() { let SearchResult { documents_ids, .. } = search.execute().unwrap(); let expected_external_ids: Vec<_> = - search::expected_order(&criteria, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, &[]) + search::expected_order(&criteria, ALLOW_TYPOS, ALLOW_OPTIONAL_WORDS, &[]) .into_iter() .map(|d| d.id) .collect(); diff --git a/milli/tests/search/sort.rs b/milli/tests/search/sort.rs index 86404bb99..eca0d2986 100644 --- a/milli/tests/search/sort.rs +++ b/milli/tests/search/sort.rs @@ -1,6 +1,6 @@ use big_s::S; use milli::Criterion::{Attribute, Exactness, Proximity, Typo, Words}; -use milli::{AscDesc, Error, Member, Search, UserError}; +use milli::{AscDesc, Error, Member, Search, TermsMatchingStrategy, UserError}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; @@ -15,7 +15,7 @@ fn sort_ranking_rule_missing() { search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); - search.optional_words(true); + search.optional_words(TermsMatchingStrategy::default()); search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]); let result = search.execute(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 7c4cf8971..7719cf34d 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -2,7 +2,7 @@ use std::collections::BTreeSet; use heed::EnvOpenOptions; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{Criterion, Index, Search}; +use milli::{Criterion, Index, Search, TermsMatchingStrategy}; use serde_json::json; use tempfile::tempdir; use Criterion::*; @@ -20,7 +20,7 @@ fn test_typo_tolerance_one_typo() { search.query("zeal"); search.limit(10); search.authorize_typos(true); - search.optional_words(true); + search.optional_words(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); @@ -29,7 +29,7 @@ fn test_typo_tolerance_one_typo() { search.query("zean"); search.limit(10); search.authorize_typos(true); - search.optional_words(true); + search.optional_words(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 0); @@ -47,7 +47,7 @@ fn test_typo_tolerance_one_typo() { search.query("zean"); search.limit(10); 
    search.authorize_typos(true);
-    search.optional_words(true);
+    search.optional_words(TermsMatchingStrategy::default());
 
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 1);
@@ -66,7 +66,7 @@ fn test_typo_tolerance_two_typo() {
     search.query("zealand");
     search.limit(10);
     search.authorize_typos(true);
-    search.optional_words(true);
+    search.optional_words(TermsMatchingStrategy::default());
 
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 1);
@@ -75,7 +75,7 @@ fn test_typo_tolerance_two_typo() {
     search.query("zealemd");
     search.limit(10);
     search.authorize_typos(true);
-    search.optional_words(true);
+    search.optional_words(TermsMatchingStrategy::default());
 
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 0);
@@ -93,7 +93,7 @@ fn test_typo_tolerance_two_typo() {
     search.query("zealemd");
     search.limit(10);
     search.authorize_typos(true);
-    search.optional_words(true);
+    search.optional_words(TermsMatchingStrategy::default());
 
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 1);
@@ -142,7 +142,7 @@ fn test_typo_disabled_on_word() {
     search.query("zealand");
     search.limit(10);
     search.authorize_typos(true);
-    search.optional_words(true);
+    search.optional_words(TermsMatchingStrategy::default());
 
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 2);
@@ -162,7 +162,7 @@ fn test_typo_disabled_on_word() {
     search.query("zealand");
     search.limit(10);
     search.authorize_typos(true);
-    search.optional_words(true);
+    search.optional_words(TermsMatchingStrategy::default());
 
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 1);
@@ -182,7 +182,7 @@ fn test_disable_typo_on_attribute() {
     search.query("antebelum");
     search.limit(10);
     search.authorize_typos(true);
-    search.optional_words(true);
+    search.optional_words(TermsMatchingStrategy::default());
 
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 1);
@@ -200,7 +200,7 @@ fn test_disable_typo_on_attribute() {
     search.query("antebelum");
     search.limit(10);
     search.authorize_typos(true);
-    search.optional_words(true);
+    search.optional_words(TermsMatchingStrategy::default());
 
     let result = search.execute().unwrap();
     assert_eq!(result.documents_ids.len(), 0);

From bff9653050070305f4b4905315e44c2c26937fd5 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 28 Jul 2022 14:09:06 +0200
Subject: [PATCH 1601/1889] Fix remove count

---
 milli/src/search/query_tree.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index c3d2f3669..4ea594b71 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -454,7 +454,7 @@ fn create_query_tree(
 
     let mut operation_children = Vec::new();
     let mut query = query.to_vec();
-    for _ in 0..remove_count {
+    for _ in 0..=remove_count {
         let pos = match optional_words {
             TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false),
             TermsMatchingStrategy::Any => {
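The loop bound is adjusted again in the very next patch; what stays fixed is the shape of the output. The standalone sketch below — not the milli implementation, simplified to plain strings and ignoring phrases, n-grams and the other strategies — shows what the `Last` strategy is driving at: one OR branch per progressively shorter prefix of the query, with the newest (shortest) branch inserted at the front, which is why single-word branches appear first in the test snapshots:

    // Sketch of the `Last` strategy: "new york city" yields the branches
    // ["new"], ["new", "york"], ["new", "york", "city"], shortest first
    // because each new branch is inserted at the front.
    fn last_strategy_branches<'a>(query: &[&'a str]) -> Vec<Vec<&'a str>> {
        let mut query = query.to_vec();
        let mut branches = Vec::new();
        while !query.is_empty() {
            branches.insert(0, query.clone());
            query.pop(); // `Last` gives up the last word first
        }
        branches
    }

The off-by-one being fixed here, and revisited below, is exactly how many of these branches the real loop emits once phrases are excluded from the removal count.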
From 993aa1321c34b872d2d10dc9c1ba65be54e54e68 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Thu, 18 Aug 2022 17:56:06 +0200
Subject: [PATCH 1602/1889] Fix query tree building

---
 milli/src/search/query_tree.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 4ea594b71..39377d8f9 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -454,7 +454,7 @@ fn create_query_tree(
 
     let mut operation_children = Vec::new();
     let mut query = query.to_vec();
-    for _ in 0..=remove_count {
+    for _ in 0..remove_count {
         let pos = match optional_words {
             TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false),
             TermsMatchingStrategy::Any => {
@@ -511,7 +511,7 @@ fn create_query_tree(
         };
     }
 
-    Ok(Operation::Or(true, operation_children))
+    Ok(Operation::or(true, operation_children))
 }
 
 /// Main function that matches words used for crop and highlight.

From e7624abe63b1d0ed8cab9f33f50acfd3ce5fa7ad Mon Sep 17 00:00:00 2001
From: Irevoire
Date: Fri, 19 Aug 2022 11:23:41 +0200
Subject: [PATCH 1603/1889] share heed between all sub-crates

---
 benchmarks/Cargo.toml               | 1 -
 milli/fuzz/Cargo.toml               | 1 -
 milli/fuzz/fuzz_targets/indexing.rs | 4 ++--
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index 600525372..e8aa18165 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -13,7 +13,6 @@ serde_json = { version = "1.0.79", features = ["preserve_order"] }
 
 [dev-dependencies]
 criterion = { version = "0.3.5", features = ["html_reports"] }
-heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
 rand = "0.8.5"
 rand_chacha = "0.3.1"
 roaring = "0.9.0"
diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml
index 7e1bea3c5..6bf7b2c6d 100644
--- a/milli/fuzz/Cargo.toml
+++ b/milli/fuzz/Cargo.toml
@@ -11,7 +11,6 @@ cargo-fuzz = true
 [dependencies]
 arbitrary = "1.0"
 libfuzzer-sys = "0.4"
-heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" }
 serde_json = { version = "1.0.62", features = ["preserve_order"] }
 anyhow = "1.0"
 tempfile = "3.3"
diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs
index a447aebe2..8ce470718 100644
--- a/milli/fuzz/fuzz_targets/indexing.rs
+++ b/milli/fuzz/fuzz_targets/indexing.rs
@@ -5,11 +5,11 @@ use std::io::{BufWriter, Cursor, Read, Seek, Write};
 
 use anyhow::{bail, Result};
 use arbitrary_json::ArbitraryValue;
-use heed::EnvOpenOptions;
 use libfuzzer_sys::fuzz_target;
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
+use milli::heed::EnvOpenOptions;
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-use milli::Index;
+use milli::{Index, Object};
 use serde_json::{Map, Value};
 
 #[global_allocator]

From 5943e1c3b2a128c7116cc576dddcc9b54d87de38 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Mon, 22 Aug 2022 13:55:01 +0200
Subject: [PATCH 1604/1889] Update log dependency

---
 http-ui/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 6d902f5b3..d809f74e3 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -33,7 +33,7 @@ warp = "0.3.2"
 
 # logging
 fst = "0.4.7"
-log = "0.4.14"
+log = "0.4.17"
 stderrlog = "0.5.1"
 
 # Temporary fix for bitvec, remove once fixed.
(https://github.com/bitvecto-rs/bitvec/issues/105) From ba5ca8a3624e2889b5aa5a6408195d879846548c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 22 Aug 2022 14:38:00 +0200 Subject: [PATCH 1605/1889] Upgrade charabia v0.6.0 --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1441461f3..15201d075 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "0.2.17" byteorder = "1.4.3" -charabia = "0.5.1" +charabia = "0.6.0" concat-arrays = "0.1.2" crossbeam-channel = "0.5.2" either = "1.6.1" From a5b9a35c508838197b26b746b731b67ac2b1a39d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 22 Aug 2022 14:39:16 +0200 Subject: [PATCH 1606/1889] Activate char_map for highlighting --- http-ui/src/main.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index c20210443..3b14889cc 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -784,8 +784,10 @@ async fn main() -> anyhow::Result<()> { None => fields_ids_map.iter().map(|(_, name)| name).map(String::from).collect(), }; - let mut matcher_builder = - MatcherBuilder::new(matching_words, TokenizerBuilder::default().build()); + let mut matcher_builder = MatcherBuilder::new( + matching_words, + TokenizerBuilder::default().create_char_map(true).build(), + ); matcher_builder.highlight_prefix("".to_string()); matcher_builder.highlight_suffix("".to_string()); let highlighter = Highlighter::new(matcher_builder); From f9029727e0cc7e5a2814397dab51ac6a1e8b2782 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 22 Aug 2022 14:55:53 +0200 Subject: [PATCH 1607/1889] Fix benchmarks --- benchmarks/benches/utils.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 8c556b383..81e50d1bb 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -11,7 +11,7 @@ use milli::heed::EnvOpenOptions; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; -use milli::{Filter, Index, Object}; +use milli::{Filter, Index, Object, TermsMatchingStrategy}; use serde_json::Value; pub struct Conf<'a> { @@ -119,7 +119,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { b.iter(|| { let rtxn = index.read_txn().unwrap(); let mut search = index.search(&rtxn); - search.query(query).optional_words(conf.optional_words); + search.query(query).optional_words(TermsMatchingStrategy::default()); if let Some(filter) = conf.filter { let filter = Filter::from_str(filter).unwrap().unwrap(); search.filter(filter); From 5391e3842c75afc4b903e548e1db2367e47e194d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 22 Aug 2022 17:37:36 +0200 Subject: [PATCH 1608/1889] replace optional_words by term_matching_strategy --- benchmarks/benches/utils.rs | 2 +- milli/src/search/mod.rs | 14 +++++------ milli/src/search/query_tree.rs | 33 +++++++++++++++---------- milli/src/update/index_documents/mod.rs | 6 ++--- milli/tests/search/distinct.rs | 2 +- milli/tests/search/filters.rs | 2 +- milli/tests/search/query_criteria.rs | 4 +-- milli/tests/search/sort.rs | 2 +- milli/tests/search/typo_tolerance.rs | 20 +++++++-------- 9 files changed, 46 insertions(+), 39 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 81e50d1bb..a240ce299 100644 --- 
a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -119,7 +119,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
             b.iter(|| {
                 let rtxn = index.read_txn().unwrap();
                 let mut search = index.search(&rtxn);
-                search.query(query).optional_words(TermsMatchingStrategy::default());
+                search.query(query).terms_matching_strategy(TermsMatchingStrategy::default());
                 if let Some(filter) = conf.filter {
                     let filter = Filter::from_str(filter).unwrap().unwrap();
                     search.filter(filter);
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 3da8823dc..7145c1445 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -44,7 +44,7 @@ pub struct Search<'a> {
     offset: usize,
     limit: usize,
     sort_criteria: Option<Vec<AscDesc>>,
-    optional_words: TermsMatchingStrategy,
+    terms_matching_strategy: TermsMatchingStrategy,
     authorize_typos: bool,
     words_limit: usize,
     rtxn: &'a heed::RoTxn<'a>,
@@ -59,7 +59,7 @@ impl<'a> Search<'a> {
             offset: 0,
             limit: 20,
             sort_criteria: None,
-            optional_words: TermsMatchingStrategy::default(),
+            terms_matching_strategy: TermsMatchingStrategy::default(),
             authorize_typos: true,
             words_limit: 10,
             rtxn,
@@ -87,8 +87,8 @@ impl<'a> Search<'a> {
         self
     }
 
-    pub fn optional_words(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> {
-        self.optional_words = value;
+    pub fn terms_matching_strategy(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> {
+        self.terms_matching_strategy = value;
         self
     }
 
@@ -119,7 +119,7 @@ impl<'a> Search<'a> {
         let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
             Some(query) => {
                 let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?;
-                builder.optional_words(self.optional_words);
+                builder.terms_matching_strategy(self.terms_matching_strategy);
 
                 builder.authorize_typos(self.is_typo_authorized()?);
 
@@ -259,7 +259,7 @@ impl fmt::Debug for Search<'_> {
             offset,
             limit,
             sort_criteria,
-            optional_words,
+            terms_matching_strategy,
             authorize_typos,
             words_limit,
             rtxn: _,
@@ -271,7 +271,7 @@
             .field("offset", offset)
             .field("limit", limit)
            .field("sort_criteria", sort_criteria)
-            .field("optional_words", optional_words)
+            .field("terms_matching_strategy", terms_matching_strategy)
             .field("authorize_typos", authorize_typos)
             .field("words_limit", words_limit)
             .finish()
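For code built on top of milli the rename is mechanical. Under the same assumptions as the sketch earlier in this series (an open `index` and `rtxn`), a call site now reads:

    let mut search = index.search(&rtxn);
    search.query("hello world");
    // was: search.optional_words(TermsMatchingStrategy::All);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    let SearchResult { documents_ids, .. } = search.execute()?;

The new name describes what the value actually controls — how terms are allowed to stop matching — rather than the historical boolean it replaced.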
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 39377d8f9..51774d8b4 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -162,7 +162,7 @@ trait Context {
 pub struct QueryTreeBuilder<'a> {
     rtxn: &'a heed::RoTxn<'a>,
     index: &'a Index,
-    optional_words: TermsMatchingStrategy,
+    terms_matching_strategy: TermsMatchingStrategy,
     authorize_typos: bool,
     words_limit: Option<usize>,
     exact_words: Option<fst::Set<Cow<'a, [u8]>>>,
@@ -199,19 +199,22 @@ impl<'a> QueryTreeBuilder<'a> {
         Ok(Self {
             rtxn,
             index,
-            optional_words: TermsMatchingStrategy::default(),
+            terms_matching_strategy: TermsMatchingStrategy::default(),
             authorize_typos: true,
             words_limit: None,
             exact_words: index.exact_words(rtxn)?,
         })
     }
 
-    /// if `optional_words` is set to `false` the query tree will be
+    /// if `terms_matching_strategy` is set to `All` the query tree will be
     /// generated forcing all query words to be present in each matching documents
     /// (the criterion `words` will be ignored).
-    /// default value if not called: `true`
-    pub fn optional_words(&mut self, optional_words: TermsMatchingStrategy) -> &mut Self {
-        self.optional_words = optional_words;
+    /// default value if not called: `Last`
+    pub fn terms_matching_strategy(
+        &mut self,
+        terms_matching_strategy: TermsMatchingStrategy,
+    ) -> &mut Self {
+        self.terms_matching_strategy = terms_matching_strategy;
         self
     }
 
@@ -232,7 +235,7 @@ impl<'a> QueryTreeBuilder<'a> {
     }
 
     /// Build the query tree:
-    /// - if `optional_words` is set to `false` the query tree will be
+    /// - if `terms_matching_strategy` is set to `All` the query tree will be
     /// generated forcing all query words to be present in each matching documents
     /// (the criterion `words` will be ignored)
     /// - if `authorize_typos` is set to `false` the query tree will be generated
@@ -247,7 +250,7 @@ impl<'a> QueryTreeBuilder<'a> {
         if !primitive_query.is_empty() {
             let qt = create_query_tree(
                 self,
-                self.optional_words,
+                self.terms_matching_strategy,
                 self.authorize_typos,
                 &primitive_query,
             )?;
@@ -332,7 +335,7 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Vec<
 
 fn create_query_tree(
     ctx: &impl Context,
-    optional_words: TermsMatchingStrategy,
+    terms_matching_strategy: TermsMatchingStrategy,
     authorize_typos: bool,
     query: &[PrimitiveQueryPart],
 ) -> Result<Operation> {
@@ -455,7 +458,7 @@ fn create_query_tree(
     let mut operation_children = Vec::new();
     let mut query = query.to_vec();
     for _ in 0..remove_count {
-        let pos = match optional_words {
+        let pos = match terms_matching_strategy {
@@ -796,15 +799,19 @@ mod test {
 impl TestContext {
     fn build<A: AsRef<[u8]>>(
         &self,
-        optional_words: TermsMatchingStrategy,
+        terms_matching_strategy: TermsMatchingStrategy,
         authorize_typos: bool,
         words_limit: Option<usize>,
         query: ClassifiedTokenIter<A>,
     ) -> Result<Option<(Operation, PrimitiveQuery)>> {
         let primitive_query = create_primitive_query(query, None, words_limit);
         if !primitive_query.is_empty() {
-            let qt =
-                create_query_tree(self, optional_words, authorize_typos, &primitive_query)?;
+            let qt = create_query_tree(
+                self,
+                terms_matching_strategy,
+                authorize_typos,
+                &primitive_query,
+            )?;
             Ok(Some((qt, primitive_query)))
         } else {
             Ok(None)
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 3a8f961ac..23618b478 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1208,7 +1208,7 @@ mod tests {
         let mut search = crate::Search::new(&rtxn, &index);
         search.query("document");
         search.authorize_typos(true);
-        search.optional_words(TermsMatchingStrategy::default());
+        search.terms_matching_strategy(TermsMatchingStrategy::default());
 
         // all documents should be returned
         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
         assert_eq!(documents_ids.len(), 4);
@@ -1314,7 +1314,7 @@ mod tests {
         let mut search = crate::Search::new(&rtxn, &index);
         search.query("document");
         search.authorize_typos(true);
-        search.optional_words(TermsMatchingStrategy::default());
+        search.terms_matching_strategy(TermsMatchingStrategy::default());
 
         // all documents should be returned
         let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
         assert_eq!(documents_ids.len(), 4);
@@ -1513,7 +1513,7 @@ mod tests {
         let mut search = crate::Search::new(&rtxn, &index);
         search.query("化妆包");
         search.authorize_typos(true);
-        search.optional_words(TermsMatchingStrategy::default());
+        search.terms_matching_strategy(TermsMatchingStrategy::default());
 
         // only 1 document should be returned
         let crate::SearchResult { documents_ids, ..
} = search.execute().unwrap(); diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index 9e9905c3f..64dd16f09 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -28,7 +28,7 @@ macro_rules! test_distinct { search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let SearchResult { documents_ids, candidates, .. } = search.execute().unwrap(); diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index 675004b56..18de24ac3 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -19,7 +19,7 @@ macro_rules! test_filter { search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); search.filter(filter_conditions); let SearchResult { documents_ids, .. } = search.execute().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 0fce7c6df..8b72c8420 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -31,7 +31,7 @@ macro_rules! test_criterion { search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos($authorize_typos); - search.optional_words($optional_word); + search.terms_matching_strategy($optional_word); search.sort_criteria($sort_criteria); let SearchResult { documents_ids, .. } = search.execute().unwrap(); @@ -353,7 +353,7 @@ fn criteria_mixup() { let mut search = Search::new(&mut rtxn, &index); search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); - search.optional_words(ALLOW_OPTIONAL_WORDS); + search.terms_matching_strategy(ALLOW_OPTIONAL_WORDS); search.authorize_typos(ALLOW_TYPOS); let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); diff --git a/milli/tests/search/sort.rs b/milli/tests/search/sort.rs index eca0d2986..16d21eac8 100644 --- a/milli/tests/search/sort.rs +++ b/milli/tests/search/sort.rs @@ -15,7 +15,7 @@ fn sort_ranking_rule_missing() { search.query(search::TEST_QUERY); search.limit(EXTERNAL_DOCUMENTS_IDS.len()); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]); let result = search.execute(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 7719cf34d..7dc6b0c4f 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -20,7 +20,7 @@ fn test_typo_tolerance_one_typo() { search.query("zeal"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); @@ -29,7 +29,7 @@ fn test_typo_tolerance_one_typo() { search.query("zean"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 0); @@ -47,7 +47,7 @@ fn test_typo_tolerance_one_typo() { search.query("zean"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); @@ -66,7 +66,7 @@ fn test_typo_tolerance_two_typo() { search.query("zealand"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); @@ -75,7 +75,7 @@ fn test_typo_tolerance_two_typo() { search.query("zealemd"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 0); @@ -93,7 +93,7 @@ fn test_typo_tolerance_two_typo() { search.query("zealemd"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); @@ -142,7 +142,7 @@ fn test_typo_disabled_on_word() { search.query("zealand"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 2); @@ -162,7 +162,7 @@ fn test_typo_disabled_on_word() { search.query("zealand"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); @@ -182,7 +182,7 @@ fn test_disable_typo_on_attribute() { 
search.query("antebelum"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 1); @@ -200,7 +200,7 @@ fn test_disable_typo_on_attribute() { search.query("antebelum"); search.limit(10); search.authorize_typos(true); - search.optional_words(TermsMatchingStrategy::default()); + search.terms_matching_strategy(TermsMatchingStrategy::default()); let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 0); From e140227065750a50541c6ee6a36da16db8290643 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 23 Aug 2022 11:45:29 +0200 Subject: [PATCH 1609/1889] Remove Bors required test for Windows --- bors.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bors.toml b/bors.toml index 9b75c79d5..e388a1d4b 100644 --- a/bors.toml +++ b/bors.toml @@ -1,7 +1,7 @@ status = [ 'Tests on ubuntu-18.04', 'Tests on macos-latest', - 'Tests on windows-latest', + # 'Tests on windows-latest', 'Run Rustfmt', ] # 3 hours timeout From 9ed732499562bd9772b33dc258b0c027dba78344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 23 Aug 2022 11:47:48 +0200 Subject: [PATCH 1610/1889] Update version for next release (v0.33.0) --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index e8aa18165..3a832defb 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.32.0" +version = "0.33.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index e45fb3344..5e27943b8 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.32.0" +version = "0.33.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 21676f960..6900f1fa9 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.32.0" +version = "0.33.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 5cbc35f25..9d26902dd 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.32.0" +version = "0.33.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 9a8496e28..4084b29cd 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.32.0" +version = "0.33.0" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 6d902f5b3..0e39c0ca1 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the 
milli search engine" -version = "0.32.0" +version = "0.33.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 7c17782c3..6b6b1f300 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.32.0" +version = "0.33.0" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index a0bb76676..eddc44600 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.32.0" +version = "0.33.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 1441461f3..65e540360 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.32.0" +version = "0.33.0" authors = ["Kerollmops "] edition = "2018" From f6024b32697612500936f1d31f163f2b31daa7af Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 23 Aug 2022 16:10:38 +0200 Subject: [PATCH 1611/1889] Remove the artifacts of the past --- milli/src/search/facet/grammar.pest | 33 ----------------------------- 1 file changed, 33 deletions(-) delete mode 100644 milli/src/search/facet/grammar.pest diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest deleted file mode 100644 index 8bfdeb667..000000000 --- a/milli/src/search/facet/grammar.pest +++ /dev/null @@ -1,33 +0,0 @@ -key = _{reserved | quoted | word } -value = _{quoted | word } -quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP } -string = {char*} -word = ${(LETTER | NUMBER | "_" | "-" | ".")+} - -char = _{ !(PEEK | "\\") ~ ANY - | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") - | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} - -reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" } -// we deliberately choose to allow empty parameters to generate more specific error message later -parameters = {("(" ~ (value ~ ",")* ~ value? 
~ ")") | ""} -condition = _{between | eq | greater | less | geq | leq | neq} -between = {key ~ value ~ "TO" ~ value} -geq = {key ~ ">=" ~ value} -leq = {key ~ "<=" ~ value} -neq = {key ~ "!=" ~ value} -eq = {key ~ "=" ~ value} -greater = {key ~ ">" ~ value} -less = {key ~ "<" ~ value} -geo_radius = {"_geoRadius" ~ parameters } - -prgm = {SOI ~ expr ~ EOI} -expr = _{ ( term ~ (operation ~ term)* ) } -term = { ("(" ~ expr ~ ")") | condition | not | geo_radius } -operation = _{ and | or } -and = {"AND"} -or = {"OR"} - -not = {"NOT" ~ term} - -WHITESPACE = _{ " " } From 7f92116b516b174f5c0009caadd3a1eabad28ff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 31 Aug 2022 10:56:39 +0200 Subject: [PATCH 1612/1889] Accept again integers as document ids --- milli/src/documents/builder.rs | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 1a57db34b..9fda31cf0 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -118,6 +118,8 @@ impl DocumentsBatchBuilder { AllowedType::Number => { if value.trim().is_empty() { to_writer(&mut self.value_buffer, &Value::Null)?; + } else if let Ok(integer) = value.trim().parse::() { + to_writer(&mut self.value_buffer, &integer)?; } else { match value.trim().parse::() { Ok(float) => { @@ -359,7 +361,34 @@ mod test { json!({ "city": "Boston", "country": "United States", - "pop": 4628910.0, + "pop": 4628910, + }) + ); + } + + #[test] + fn integer_as_id() { + let csv_content = r#""id:number","title:string","comment:string" +"1239","Pride and Prejudice","A great book""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "id": 1239, + "title": "Pride and Prejudice", + "comment": "A great book", }) ); } From c3363706c5e846b0d3643fbdcc5e64181497440a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Wed, 31 Aug 2022 11:36:34 +0200 Subject: [PATCH 1613/1889] Update version for next release (v0.33.1) in Cargo.toml --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 3a832defb..570ed2a2a 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.33.0" +version = "0.33.1" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 5e27943b8..fe3dc7c96 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.33.0" +version = "0.33.1" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 6900f1fa9..b8d159355 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ 
[package] name = "filter-parser" -version = "0.33.0" +version = "0.33.1" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 9d26902dd..e965efe5e 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.33.0" +version = "0.33.1" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 4084b29cd..552ec863c 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.33.0" +version = "0.33.1" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 1972e0db7..87044d540 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.33.0" +version = "0.33.1" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 6b6b1f300..b161d7a63 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.33.0" +version = "0.33.1" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index eddc44600..0d869062b 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.33.0" +version = "0.33.1" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 56a1da5cd..639734757 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.33.0" +version = "0.33.1" authors = ["Kerollmops "] edition = "2018" From 97a04887a3e676cf328d58dbc4d9f755020e47c7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 1 Sep 2022 11:47:23 +0200 Subject: [PATCH 1614/1889] Update version for next release (v0.33.2) in Cargo.toml --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 570ed2a2a..78d08b77f 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.33.1" +version = "0.33.2" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index fe3dc7c96..52cad5696 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.33.1" +version = "0.33.2" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index b8d159355..f44d72370 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.33.1" +version = "0.33.2" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff 
--git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index e965efe5e..1d556a243 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.33.1" +version = "0.33.2" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 552ec863c..f5a89eaaa 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.33.1" +version = "0.33.2" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 87044d540..2d9540e21 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.33.1" +version = "0.33.2" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index b161d7a63..60474c829 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.33.1" +version = "0.33.2" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 0d869062b..510eaf985 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.33.1" +version = "0.33.2" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 639734757..547c65dfb 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.33.1" +version = "0.33.2" authors = ["Kerollmops "] edition = "2018" From a38608fe59c1e169d698c1324fe966dc0cd069b3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 1 Sep 2022 12:02:10 +0200 Subject: [PATCH 1615/1889] Add test mixing phrased and no-phrased words --- milli/src/search/query_tree.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 51774d8b4..5fd004c99 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1058,6 +1058,26 @@ mod test { "###); } + #[test] + fn phrase_2() { + // https://github.com/meilisearch/meilisearch/issues/2722 + let query = "coco \"harry\""; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::default(), true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR(WORD) + Exact { word: "harry" } + AND + Exact { word: "coco" } + Exact { word: "harry" } + "###); + } + #[test] fn phrase_with_hard_separator() { let query = "\"hey friends. 
wooop wooop\""; From bf750e45a11031c33f746f34224439bbdfaca46b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 1 Sep 2022 12:10:47 +0200 Subject: [PATCH 1616/1889] Fix word removal issue --- milli/src/search/query_tree.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 5fd004c99..1c60e41f7 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::cmp::min; +use std::cmp::max; use std::{cmp, fmt, mem}; use charabia::classifier::ClassifiedTokenIter; @@ -450,14 +450,14 @@ fn create_query_tree( } let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); - let remove_count = query.len() - min(number_phrases, 1); + let remove_count = query.len() - max(number_phrases, 1); if remove_count == 0 { return ngrams(ctx, authorize_typos, query, false); } let mut operation_children = Vec::new(); let mut query = query.to_vec(); - for _ in 0..remove_count { + for _ in 0..=remove_count { let pos = match terms_matching_strategy { TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false), TermsMatchingStrategy::Any => { From 0639b14906242f2ffac65eb3ffd41adc8f63cd19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Sun, 4 Sep 2022 11:19:50 +0200 Subject: [PATCH 1617/1889] Add CI to update the Milli version --- .../workflows/update-cargo-toml-version.yml | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 .github/workflows/update-cargo-toml-version.yml diff --git a/.github/workflows/update-cargo-toml-version.yml b/.github/workflows/update-cargo-toml-version.yml new file mode 100644 index 000000000..cb3d360e0 --- /dev/null +++ b/.github/workflows/update-cargo-toml-version.yml @@ -0,0 +1,43 @@ +name: Update Milli version in all Cargo.toml files + +on: + workflow_dispatch: + inputs: + new_version: + description: 'The new version (vX.Y.Z)' + required: true + +env: + NEW_VERSION: ${{ github.event.inputs.new_version }} + GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} + +jobs: + update-version-cargo-toml: + name: Update version in cargo.toml files + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + - name: Install sd + run: cargo install sd + - name: Update files + run: | + echo "$GITHUB_REF_NAME" + raw_new_version=$(echo $NEW_VERSION | cut -d 'v' -f 2) + new_string="version = \"$raw_new_version\"" + sd '^version = "\d+.\d+.\w+"$' "$new_string" */Cargo.toml + - name: Commits and push the changes to the ${{ github.ref_name }} branch + uses: EndBug/add-and-commit@v9 + with: + message: "Update version for the next release (${{ env.NEW_VERSION }}) in Cargo.toml files" + new_branch: update-version-${{ env.NEW_VERSION }} + - name: Create the PR + run: | + gh pr create \ + --title "Update version for the next release ($NEW_VERSION) in Cargo.toml files" \ + --body '⚠️ This PR is automatically generated. Check the new version is the expected one before merging.' 
\ + --label 'skip changelog' From 61abc61a695d3ebb7690ffd35139bd5e86d8e598 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 5 Sep 2022 16:01:32 +0200 Subject: [PATCH 1618/1889] Minor fixes in the just added update-version CI --- .github/workflows/update-cargo-toml-version.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/update-cargo-toml-version.yml b/.github/workflows/update-cargo-toml-version.yml index cb3d360e0..45c611c29 100644 --- a/.github/workflows/update-cargo-toml-version.yml +++ b/.github/workflows/update-cargo-toml-version.yml @@ -9,11 +9,12 @@ on: env: NEW_VERSION: ${{ github.event.inputs.new_version }} + NEW_BRANCH: update-version-${{ github.event.inputs.new_version }} GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} jobs: update-version-cargo-toml: - name: Update version in cargo.toml files + name: Update version in Cargo.toml file runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 @@ -24,18 +25,17 @@ jobs: override: true - name: Install sd run: cargo install sd - - name: Update files + - name: Update all Cargo.toml files run: | - echo "$GITHUB_REF_NAME" raw_new_version=$(echo $NEW_VERSION | cut -d 'v' -f 2) new_string="version = \"$raw_new_version\"" sd '^version = "\d+.\d+.\w+"$' "$new_string" */Cargo.toml - - name: Commits and push the changes to the ${{ github.ref_name }} branch + - name: Commit and push the changes to the ${{ env.NEW_BRANCH }} branch uses: EndBug/add-and-commit@v9 with: message: "Update version for the next release (${{ env.NEW_VERSION }}) in Cargo.toml files" - new_branch: update-version-${{ env.NEW_VERSION }} - - name: Create the PR + new_branch: ${{ env.NEW_BRANCH }} + - name: Create the PR pointing to ${{ github.ref_name }} run: | gh pr create \ --title "Update version for the next release ($NEW_VERSION) in Cargo.toml files" \ From 44192d754fb17ec34b468fd798a995a7b4d6452f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Tue, 6 Sep 2022 17:54:05 +0200 Subject: [PATCH 1619/1889] Add dependabot for GHA --- .github/dependabot.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 000000000..9a812c779 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,12 @@ +# Set update schedule for GitHub Actions only + +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + labels: + - 'skip changelog' + - 'dependencies' + rebase-strategy: disabled From 5e85059a719c674463799ad5f3c62f3ddd2fde64 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Sep 2022 16:08:48 +0000 Subject: [PATCH 1620/1889] Bump Swatinem/rust-cache from 1.3.0 to 2.0.0 Bumps [Swatinem/rust-cache](https://github.com/Swatinem/rust-cache) from 1.3.0 to 2.0.0. - [Release notes](https://github.com/Swatinem/rust-cache/releases) - [Changelog](https://github.com/Swatinem/rust-cache/blob/master/CHANGELOG.md) - [Commits](https://github.com/Swatinem/rust-cache/compare/v1.3.0...v2.0.0) --- updated-dependencies: - dependency-name: Swatinem/rust-cache dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 09cd99b80..a316bb95d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -36,7 +36,7 @@ jobs: toolchain: stable override: true - name: Cache dependencies - uses: Swatinem/rust-cache@v1.3.0 + uses: Swatinem/rust-cache@v2.0.0 - name: Run cargo check uses: actions-rs/cargo@v1 with: @@ -60,7 +60,7 @@ jobs: override: true components: rustfmt - name: Cache dependencies - uses: Swatinem/rust-cache@v1.3.0 + uses: Swatinem/rust-cache@v2.0.0 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate From b308463022df07441e209c80c1d4cb2981b5864f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Sep 2022 16:08:51 +0000 Subject: [PATCH 1621/1889] Bump actions/checkout from 2 to 3 Bumps [actions/checkout](https://github.com/actions/checkout) from 2 to 3. - [Release notes](https://github.com/actions/checkout/releases) - [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md) - [Commits](https://github.com/actions/checkout/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/checkout dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/manual_benchmarks.yml | 2 +- .github/workflows/push_benchmarks_indexing.yml | 2 +- .github/workflows/push_benchmarks_search_geo.yml | 2 +- .github/workflows/push_benchmarks_search_songs.yml | 2 +- .github/workflows/push_benchmarks_search_wiki.yml | 2 +- .github/workflows/rust.yml | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml index 456e87168..d85a6c07b 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/manual_benchmarks.yml @@ -17,7 +17,7 @@ jobs: runs-on: benchmarks timeout-minutes: 4320 # 72h steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions-rs/toolchain@v1 with: profile: minimal diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml index f00542001..c53de93da 100644 --- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -15,7 +15,7 @@ jobs: runs-on: benchmarks timeout-minutes: 4320 # 72h steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions-rs/toolchain@v1 with: profile: minimal diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml index cdf9264a7..8a79ce14d 100644 --- a/.github/workflows/push_benchmarks_search_geo.yml +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -14,7 +14,7 @@ jobs: name: Run and upload benchmarks runs-on: benchmarks steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions-rs/toolchain@v1 with: profile: minimal diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml index cb2eddd46..8e6f2de75 100644 --- a/.github/workflows/push_benchmarks_search_songs.yml +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -14,7 +14,7 
@@ jobs: name: Run and upload benchmarks runs-on: benchmarks steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions-rs/toolchain@v1 with: profile: minimal diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml index 71eb89c97..91718b1ce 100644 --- a/.github/workflows/push_benchmarks_search_wiki.yml +++ b/.github/workflows/push_benchmarks_search_wiki.yml @@ -14,7 +14,7 @@ jobs: name: Run and upload benchmarks runs-on: benchmarks steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions-rs/toolchain@v1 with: profile: minimal diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 09cd99b80..ce6ff59e4 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -20,7 +20,7 @@ jobs: matrix: os: [ubuntu-18.04, macos-latest, windows-latest] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Run test with Rust nightly if: github.event_name == 'schedule' uses: actions-rs/toolchain@v1 @@ -52,7 +52,7 @@ name: Run Rustfmt runs-on: ubuntu-18.04 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: actions-rs/toolchain@v1 with: profile: minimal From e3400a05d3d32e55fec898873e96e52db842472e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 6 Sep 2022 16:08:54 +0000 Subject: [PATCH 1622/1889] Bump yogevbd/enforce-label-action from 2.1.0 to 2.2.2 Bumps [yogevbd/enforce-label-action](https://github.com/yogevbd/enforce-label-action) from 2.1.0 to 2.2.2. - [Release notes](https://github.com/yogevbd/enforce-label-action/releases) - [Commits](https://github.com/yogevbd/enforce-label-action/compare/2.1.0...2.2.2) --- updated-dependencies: - dependency-name: yogevbd/enforce-label-action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/enforce-label.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/enforce-label.yml b/.github/workflows/enforce-label.yml index f8d8bc4e6..61038d91d 100644 --- a/.github/workflows/enforce-label.yml +++ b/.github/workflows/enforce-label.yml @@ -9,6 +9,6 @@ jobs: name: Specify breaking runs-on: ubuntu-latest steps: - - uses: yogevbd/enforce-label-action@2.1.0 + - uses: yogevbd/enforce-label-action@2.2.2 with: REQUIRED_LABELS_ANY: 'no breaking,DB breaking,API breaking,skip changelog' From c83c3cd7967c2833d0b4fab63b8c7528532c7421 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 7 Sep 2022 14:11:44 +0200 Subject: [PATCH 1623/1889] Add a test to make sure that long words are correctly skipped --- milli/src/update/index_documents/mod.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 23618b478..365b0d024 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1741,4 +1741,22 @@ mod tests { index.add_documents(doc3).unwrap_err(); index.add_documents(doc4).unwrap_err(); } + + #[test] + fn long_words_must_be_skipped() { + let index = TempIndex::new(); + + // this is obviously too long + let long_word = "lol".repeat(1000); + let doc1 = documents!
{[{ + "id": "1", + "title": long_word.clone(), + }]}; + + index.add_documents(doc1).unwrap(); + + let rtxn = index.read_txn().unwrap(); + let words_fst = index.words_fst(&rtxn).unwrap(); + assert!(!words_fst.contains(&long_word)); + } } From fe3973a51c53613d24a263a25fb0417be55d7cbe Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 7 Sep 2022 14:12:08 +0200 Subject: [PATCH 1624/1889] Make sure that long words are correctly skipped --- .../index_documents/extract/extract_docid_word_positions.rs | 6 ++++-- milli/src/update/index_documents/helpers/mod.rs | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 9a6060805..3cc842b00 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -7,7 +7,9 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; +use super::helpers::{ + concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH, +}; use crate::error::{InternalError, SerializationError}; use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; @@ -68,7 +70,7 @@ pub fn extract_docid_word_positions( for (index, token) in tokens { let token = token.lemma().trim(); - if !token.is_empty() { + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(token.as_bytes()); diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 79d0d0466..6466a636b 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -18,8 +18,11 @@ pub use merge_functions::{ roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, }; +/// The maximum length a word can be +pub const MAX_WORD_LENGTH: usize = 250; + pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { - key.as_ref().len() <= 511 && !key.as_ref().is_empty() + key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() } /// Divides one slice into two at an index, returns `None` if mid is out of bounds. 
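The pair of patches above enforce the same length cap at two layers: `extract_docid_word_positions` now drops any token longer than `MAX_WORD_LENGTH` bytes before it reaches the sorters, and `valid_lmdb_key` rejects keys longer than `MAX_WORD_LENGTH * 2` bytes (the factor of two presumably leaves room for keys that pack two words, e.g. word pairs; that rationale is an inference, not stated in the patch). Below is a minimal, self-contained sketch of this guard logic; `keep_token` and the `main` driver are illustrative names, not part of the milli API:

```rust
/// The maximum length a word can be, mirroring the constant added to
/// milli/src/update/index_documents/helpers/mod.rs above.
const MAX_WORD_LENGTH: usize = 250;

/// Sketch of the extraction-time check: empty or over-long tokens are
/// skipped and never enter the word databases. Note that `str::len()`
/// counts bytes, so multi-byte UTF-8 words hit the cap sooner.
fn keep_token(token: &str) -> bool {
    !token.is_empty() && token.len() <= MAX_WORD_LENGTH
}

/// The LMDB key guard from the patch: keys are now capped at twice the
/// maximum word length instead of the previous hardcoded 511 bytes.
fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
    key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
}

fn main() {
    // Same input as the `long_words_must_be_skipped` test: 3000 bytes.
    let long_word = "lol".repeat(1000);
    assert!(!keep_token(&long_word)); // dropped, so it never reaches the words FST
    assert!(keep_token("lol"));
    assert!(valid_lmdb_key(b"lol"));
    assert!(!valid_lmdb_key(&[0u8; 501])); // 501 > MAX_WORD_LENGTH * 2
}
```

The net effect, which the `long_words_must_be_skipped` test asserts, is that a 3000-byte token is filtered out during extraction instead of producing an invalid LMDB key later in the pipeline.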
From 077dcd20020a4332b75de82ca5e08789bfb412ce Mon Sep 17 00:00:00 2001 From: curquiza Date: Wed, 7 Sep 2022 15:48:53 +0000 Subject: [PATCH 1625/1889] Update version for the next release (v0.33.3) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 78d08b77f..bc6ffed42 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.33.2" +version = "0.33.3" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 52cad5696..5d3cfa356 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.33.2" +version = "0.33.3" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index f44d72370..c1579ad05 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.33.2" +version = "0.33.3" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 1d556a243..e4f1ac639 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.33.2" +version = "0.33.3" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index f5a89eaaa..c602bc535 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.33.2" +version = "0.33.3" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 2d9540e21..a42c688a4 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.33.2" +version = "0.33.3" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 60474c829..432e23779 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.33.2" +version = "0.33.3" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 510eaf985..723979ec2 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.33.2" +version = "0.33.3" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 547c65dfb..421a7f839 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.33.2" +version = "0.33.3" authors = ["Kerollmops "] edition = "2018" From 3af3d3f7d9f3730840a50aff9ee7ff25dc4238d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Wed, 7 Sep 2022 18:36:10 +0200 
Subject: [PATCH 1626/1889] Revert "Remove Bors required test for Windows" --- bors.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bors.toml b/bors.toml index e388a1d4b..9b75c79d5 100644 --- a/bors.toml +++ b/bors.toml @@ -1,7 +1,7 @@ status = [ 'Tests on ubuntu-18.04', 'Tests on macos-latest', - # 'Tests on windows-latest', + 'Tests on windows-latest', 'Run Rustfmt', ] # 3 hours timeout From 5e07ea79c233c7354e33eeabebf4612fa35c02d0 Mon Sep 17 00:00:00 2001 From: Vincent Herlemont Date: Wed, 7 Sep 2022 20:54:31 +0200 Subject: [PATCH 1627/1889] Make charabia default feature optional --- milli/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 421a7f839..8e87b1616 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "0.2.17" byteorder = "1.4.3" -charabia = "0.6.0" +charabia = { version = "0.6.0", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.2" either = "1.6.1" @@ -57,4 +57,4 @@ md5 = "0.7.0" rand = "0.8.5" [features] -default = [] +default = [ "charabia/default" ] \ No newline at end of file From 8cd5200f48b5be9629cc7107aebdcc06bcf27d73 Mon Sep 17 00:00:00 2001 From: Vincent Herlemont Date: Thu, 8 Sep 2022 12:19:44 +0200 Subject: [PATCH 1628/1889] Make charabia languages configurable --- milli/Cargo.toml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 8e87b1616..016711198 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,4 +57,16 @@ md5 = "0.7.0" rand = "0.8.5" [features] -default = [ "charabia/default" ] \ No newline at end of file +default = [ "charabia/default" ] + +# allow chinese specialized tokenization +chinese = ["charabia/chinese"] + +# allow hebrew specialized tokenization +hebrew = ["charabia/hebrew"] + +# allow japanese specialized tokenization +japanese = ["charabia/japanese"] + +# allow thai specialized tokenization +thai = ["charabia/thai"] From 69b2d31b719fe2c63ed00b403ab4667a99f9b3d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 8 Sep 2022 14:58:06 +0200 Subject: [PATCH 1629/1889] Upgrade ubuntu-18.04 to 20.04 --- .github/workflows/rust.yml | 4 ++-- bors.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 913cb90ad..9939d3f24 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-18.04, macos-latest, windows-latest] + os: [ubuntu-20.04, macos-latest, windows-latest] steps: - uses: actions/checkout@v3 - name: Run test with Rust nightly @@ -50,7 +50,7 @@ jobs: fmt: name: Run Rustfmt - runs-on: ubuntu-18.04 + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 - uses: actions-rs/toolchain@v1 diff --git a/bors.toml b/bors.toml index 9b75c79d5..73324892f 100644 --- a/bors.toml +++ b/bors.toml @@ -1,5 +1,5 @@ status = [ - 'Tests on ubuntu-18.04', + 'Tests on ubuntu-20.04', 'Tests on macos-latest', 'Tests on windows-latest', 'Run Rustfmt', From d4d7c9d577cbf23c3821df40efaeecc9c6799d2e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 13 Sep 2022 14:03:00 +0200 Subject: [PATCH 1630/1889] We avoid skipping errors in the indexing pipeline --- milli/src/update/index_documents/mod.rs | 35 ++++++++++++------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git 
a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 365b0d024..be45499c6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -304,24 +304,23 @@ where documents_chunk_size, ); - let result = original_chunk_iter - .and_then(|original_chunk_iter| Ok((original_chunk_iter, flattened_chunk_iter?))) - .map(|(original_chunk, flattened_chunk)| { - // extract all databases from the chunked obkv documents - extract::data_from_obkv_documents( - original_chunk, - flattened_chunk, - pool_params, - lmdb_writer_sx.clone(), - searchable_fields, - faceted_fields, - primary_key_id, - geo_fields_ids, - stop_words, - max_positions_per_attributes, - exact_attributes, - ) - }); + let result = original_chunk_iter.and_then(|original_chunk| { + let flattened_chunk = flattened_chunk_iter?; + // extract all databases from the chunked obkv documents + extract::data_from_obkv_documents( + original_chunk, + flattened_chunk, + pool_params, + lmdb_writer_sx.clone(), + searchable_fields, + faceted_fields, + primary_key_id, + geo_fields_ids, + stop_words, + max_positions_per_attributes, + exact_attributes, + ) + }); if let Err(e) = result { let _ = lmdb_writer_sx.send(Err(e)); From 379496233013830c5f22da373cdbd720c171d51a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 13 Sep 2022 10:40:37 +0200 Subject: [PATCH 1631/1889] Use an unstable algorithm for grenad::Sorter when possible --- milli/Cargo.toml | 2 +- .../index_documents/extract/extract_docid_word_positions.rs | 1 + .../index_documents/extract/extract_facet_number_docids.rs | 1 + .../index_documents/extract/extract_facet_string_docids.rs | 1 + .../index_documents/extract/extract_fid_docid_facet_values.rs | 2 ++ .../index_documents/extract/extract_fid_word_count_docids.rs | 1 + milli/src/update/index_documents/extract/extract_word_docids.rs | 2 ++ .../extract/extract_word_pair_proximity_docids.rs | 1 + .../index_documents/extract/extract_word_position_docids.rs | 1 + milli/src/update/index_documents/helpers/grenad_helpers.rs | 2 ++ milli/src/update/index_documents/mod.rs | 1 + milli/src/update/index_documents/transform.rs | 2 ++ milli/src/update/word_prefix_docids.rs | 1 + milli/src/update/words_prefix_position_docids.rs | 1 + 14 files changed, 18 insertions(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 016711198..c9853548c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" -grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } +grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 3cc842b00..e067623e2 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -32,6 +32,7 @@ pub fn extract_docid_word_positions( let mut documents_ids = RoaringBitmap::new(); let mut docid_word_positions_sorter = create_sorter(
grenad::SortAlgorithm::Stable, concat_u32s_array, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index fa63d9549..61157fa35 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -21,6 +21,7 @@ pub fn extract_facet_number_docids( let max_memory = indexer.max_memory_by_thread(); let mut facet_number_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 8209d817b..f7aa3730c 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -23,6 +23,7 @@ pub fn extract_facet_string_docids( let max_memory = indexer.max_memory_by_thread(); let mut facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, keep_first_prefix_value_merge_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index cf116e6f5..f9d1443d5 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -28,6 +28,7 @@ pub fn extract_fid_docid_facet_values( let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( + grenad::SortAlgorithm::Stable, keep_first, indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -36,6 +37,7 @@ pub fn extract_fid_docid_facet_values( ); let mut fid_docid_facet_strings_sorter = create_sorter( + grenad::SortAlgorithm::Stable, keep_first, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 85a65ee14..d425e8d14 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -25,6 +25,7 @@ pub fn extract_fid_word_count_docids( let max_memory = indexer.max_memory_by_thread(); let mut fid_word_count_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index f3a44162b..4b965e9a8 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -30,6 +30,7 @@ pub fn extract_word_docids( let max_memory = indexer.max_memory_by_thread(); let mut word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, @@ -38,6 +39,7 @@ pub fn extract_word_docids( ); let mut 
exact_word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 5117bfaba..6add9d980 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -24,6 +24,7 @@ pub fn extract_word_pair_proximity_docids( let max_memory = indexer.max_memory_by_thread(); let mut word_pair_proximity_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index a4720ba2b..c1661072a 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -21,6 +21,7 @@ pub fn extract_word_position_docids( let max_memory = indexer.max_memory_by_thread(); let mut word_position_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 9d5a67d78..202e689f8 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -27,6 +27,7 @@ pub fn create_writer( } pub fn create_sorter( + sort_algorithm: grenad::SortAlgorithm, merge: MergeFn, chunk_compression_type: grenad::CompressionType, chunk_compression_level: Option, @@ -45,6 +46,7 @@ pub fn create_sorter( builder.dump_threshold(memory); builder.allow_realloc(false); } + builder.sort_algorithm(sort_algorithm); builder.build() } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 365b0d024..f69a4e893 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1489,6 +1489,7 @@ mod tests { assert_eq!(count, 4); } + #[cfg(feature = "default")] #[test] fn test_meilisearch_1714() { let index = TempIndex::new(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8818909a3..f52d5c7af 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -99,6 +99,7 @@ impl<'a, 'i> Transform<'a, 'i> { // We initialize the sorter with the user indexing settings. let original_sorter = create_sorter( + grenad::SortAlgorithm::Stable, merge_function, indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_level, @@ -108,6 +109,7 @@ impl<'a, 'i> Transform<'a, 'i> { // We initialize the sorter with the user indexing settings. 
let flattened_sorter = create_sorter( + grenad::SortAlgorithm::Stable, merge_function, indexer_settings.chunk_compression_type, indexer_settings.chunk_compression_level, diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 1002c13cf..976ff3dd0 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -48,6 +48,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // It is forbidden to keep a mutable reference into the database // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs index b2b24084d..5dbc9f89b 100644 --- a/milli/src/update/words_prefix_position_docids.rs +++ b/milli/src/update/words_prefix_position_docids.rs @@ -65,6 +65,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { debug!("Computing and writing the word levels positions docids into LMDB on disk..."); let mut prefix_position_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, From 753e76d4510576af9c48115a0abc6b0eacf40860 Mon Sep 17 00:00:00 2001 From: curquiza Date: Tue, 13 Sep 2022 13:55:50 +0000 Subject: [PATCH 1632/1889] Update version for the next release (v0.33.4) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index bc6ffed42..9c1e83663 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.33.3" +version = "0.33.4" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 5d3cfa356..3e5df29c5 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.33.3" +version = "0.33.4" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index c1579ad05..684ef44f0 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.33.3" +version = "0.33.4" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index e4f1ac639..27da77b78 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.33.3" +version = "0.33.4" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index c602bc535..b1034d092 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.33.3" +version = "0.33.4" authors = ["Clément Renault "] edition = "2018" description = "A small tool to do operations on the database" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 
a42c688a4..8d4db3a04 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.33.3" +version = "0.33.4" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 432e23779..23d21f042 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.33.3" +version = "0.33.4" authors = ["Clément Renault "] edition = "2018" publish = false diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 723979ec2..460f4a582 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.33.3" +version = "0.33.4" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 016711198..f6e1913e6 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.33.3" +version = "0.33.4" authors = ["Kerollmops "] edition = "2018" From add96f921b4046aaec72e2e844eed51ad1daa37d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 13 Sep 2022 10:46:56 +0200 Subject: [PATCH 1633/1889] Remove unused infos/ http-ui/ and fuzz/ crates --- CONTRIBUTING.md | 13 - Cargo.toml | 2 +- README.md | 4 +- http-ui/Cargo.toml | 47 - http-ui/public/bulma-prefers-dark.min.css | 1 - http-ui/public/bulma.min.css | 1 - http-ui/public/filesize.min.js | 5 - http-ui/public/jquery-3.4.1.min.js | 2 - http-ui/public/logo-black.svg | 6 - http-ui/public/logo-white.svg | 6 - http-ui/public/script.js | 154 --- http-ui/public/style.css | 144 --- http-ui/public/updates-script.js | 102 -- http-ui/src/main.rs | 1176 -------------------- http-ui/src/update_store.rs | 362 ------ http-ui/templates/index.html | 102 -- http-ui/templates/updates.html | 95 -- infos/Cargo.toml | 17 - infos/src/main.rs | 1221 --------------------- milli/README.md | 26 - milli/fuzz/.gitignore | 5 - milli/fuzz/Cargo.toml | 34 - milli/fuzz/fuzz_targets/indexing.rs | 114 -- 23 files changed, 2 insertions(+), 3637 deletions(-) delete mode 100644 http-ui/Cargo.toml delete mode 100644 http-ui/public/bulma-prefers-dark.min.css delete mode 100644 http-ui/public/bulma.min.css delete mode 100644 http-ui/public/filesize.min.js delete mode 100644 http-ui/public/jquery-3.4.1.min.js delete mode 100644 http-ui/public/logo-black.svg delete mode 100644 http-ui/public/logo-white.svg delete mode 100644 http-ui/public/script.js delete mode 100644 http-ui/public/style.css delete mode 100644 http-ui/public/updates-script.js delete mode 100644 http-ui/src/main.rs delete mode 100644 http-ui/src/update_store.rs delete mode 100644 http-ui/templates/index.html delete mode 100644 http-ui/templates/updates.html delete mode 100644 infos/Cargo.toml delete mode 100644 infos/src/main.rs delete mode 100644 milli/README.md delete mode 100644 milli/fuzz/.gitignore delete mode 100644 milli/fuzz/Cargo.toml delete mode 100644 milli/fuzz/fuzz_targets/indexing.rs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9e7ff8c90..daf2a8892 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -45,19 +45,6 @@ We recommend using the `--release` flag to test the full performance. cargo test ``` -### Querying the engine via the web interface - -To help you develop your feature you might need to use a web interface! 
You can query the engine by going to [the HTML page itself](http://127.0.0.1:9700). - -### Compile and run the HTTP debug server - -You can specify the number of threads to use to index documents and many other settings too. - -```bash -cd http-ui -cargo run --release -- --db my-database.mdb -vvv --indexing-jobs 8 -``` - ### Index your documents It can index a massive number of documents in very little time; I have already managed to index: diff --git a/Cargo.toml b/Cargo.toml index f0fc59499..6a618c381 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [workspace] resolver = "2" -members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "http-ui", "benchmarks", "infos", "helpers", "cli"] +members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "benchmarks", "helpers", "cli"] default-members = ["milli"] [profile.dev] diff --git a/README.md b/README.md index 5e916905d..93f4b2e6c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

- <img alt="the milli logo" src="…"> + <img alt="the milli logo" src="…">

a concurrent indexer combined with fast and relevant search algorithms

@@ -19,8 +19,6 @@ This repository contains crates to quickly debug the engine: - The `filter-parser` crate contains the parser for the Meilisearch filter syntax. - The `flatten-serde-json` crate contains the library that flattens serde-json `Value` objects like Elasticsearch does. - The `helpers` crate is only used to do operations on the database. - - The `http-ui` crate is a simple HTTP dashboard to test the features like for real! - - The `infos` crate is used to dump the internal data-structure and ensure correctness. - The `json-depth-checker` crate is used to indicate if a JSON must be flattened. ## How to use it? diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml deleted file mode 100644 index 8d4db3a04..000000000 --- a/http-ui/Cargo.toml +++ /dev/null @@ -1,47 +0,0 @@ -[package] -name = "http-ui" -description = "The HTTP user interface of the milli search engine" -version = "0.33.4" -authors = ["Clément Renault "] -edition = "2018" -publish = false - -[dependencies] -anyhow = "1.0.56" -byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } -crossbeam-channel = "0.5.2" -memmap2 = "0.5.3" -milli = { path = "../milli" } -mimalloc = { version = "0.1.29", default-features = false } -once_cell = "1.10.0" -rayon = "1.5.1" -structopt = { version = "0.3.26", default-features = false, features = ["wrap_help"] } -tempfile = "3.3.0" - -# http server -askama = "0.11.1" -askama_warp = "0.12.0" -bytes = "1.1.0" -either = "1.6.1" -flate2 = "1.0.22" -futures = "0.3.21" -serde = { version = "1.0.136", features = ["derive"] } -serde_json = { version = "1.0.79", features = ["preserve_order"] } -tokio = { version = "1.17.0", features = ["full"] } -tokio-stream = { version = "0.1.8", default-features = false, features = ["sync"] } -warp = "0.3.2" - -# logging -fst = "0.4.7" -log = "0.4.17" -stderrlog = "0.5.1" - -# Temporary fix for bitvec, remove once fixed. 
(https://github.com/bitvecto-rs/bitvec/issues/105) -bimap = "0.6.2" -csv = "1.1.6" -funty = "2.0.0" - -[dev-dependencies] -maplit = "1.0.2" -serde_test = "1.0.136" - diff --git a/http-ui/public/bulma-prefers-dark.min.css b/http-ui/public/bulma-prefers-dark.min.css deleted file mode 100644 index 7ebab0105..000000000 --- a/http-ui/public/bulma-prefers-dark.min.css +++ /dev/null @@ -1 +0,0 @@ -@media (prefers-color-scheme:dark){html{background-color:#17181c}body{color:#b5b5b5}a{color:#5ea3e4}a:hover{color:#dbdbdb}code{background-color:#242424;color:#eb002f}hr{background-color:#242424}strong{color:#dbdbdb}pre{background-color:#242424;color:#b5b5b5}table th{color:#dbdbdb}.has-text-white-dark{color:#fff!important}a.has-text-white-dark:focus,a.has-text-white-dark:hover{color:#fff!important}.has-background-white-dark{background-color:#fff!important}.has-text-black-dark{color:#0a0a0a!important}a.has-text-black-dark:focus,a.has-text-black-dark:hover{color:#242424!important}.has-background-black-dark{background-color:#0a0a0a!important}.has-text-light-dark{color:#f5f5f5!important}a.has-text-light-dark:focus,a.has-text-light-dark:hover{color:#fff!important}.has-background-light-dark{background-color:#f5f5f5!important}.has-text-dark-dark{color:#363636!important}a.has-text-dark-dark:focus,a.has-text-dark-dark:hover{color:#4f4f4f!important}.has-background-dark-dark{background-color:#363636!important}.has-text-primary-dark{color:#00d1b2!important}a.has-text-primary-dark:focus,a.has-text-primary-dark:hover{color:#05ffda!important}.has-background-primary-dark{background-color:#00d1b2!important}.has-text-link-dark{color:#3273dc!important}a.has-text-link-dark:focus,a.has-text-link-dark:hover{color:#5e91e4!important}.has-background-link-dark{background-color:#3273dc!important}.has-text-info-dark{color:#209cee!important}a.has-text-info-dark:focus,a.has-text-info-dark:hover{color:#50b1f2!important}.has-background-info-dark{background-color:#209cee!important}.has-text-success-dark{color:#23d160!important}a.has-text-success-dark:focus,a.has-text-success-dark:hover{color:#48e07d!important}.has-background-success-dark{background-color:#23d160!important}.has-text-warning-dark{color:#ffdd57!important}a.has-text-warning-dark:focus,a.has-text-warning-dark:hover{color:#ffe88a!important}.has-background-warning-dark{background-color:#ffdd57!important}.has-text-danger-dark{color:#ff3860!important}a.has-text-danger-dark:focus,a.has-text-danger-dark:hover{color:#ff6b89!important}.has-background-danger-dark{background-color:#ff3860!important}.has-text-black-bis-dark{color:#121212!important}.has-background-black-bis-dark{background-color:#121212!important}.has-text-black-ter-dark{color:#242424!important}.has-background-black-ter-dark{background-color:#242424!important}.has-text-grey-darker-dark{color:#363636!important}.has-background-grey-darker-dark{background-color:#363636!important}.has-text-grey-dark-dark{color:#4a4a4a!important}.has-background-grey-dark-dark{background-color:#4a4a4a!important}.has-text-grey-dark{color:#7a7a7a!important}.has-background-grey-dark{background-color:#7a7a7a!important}.has-text-grey-light-dark{color:#b5b5b5!important}.has-background-grey-light-dark{background-color:#b5b5b5!important}.has-text-grey-lighter-dark{color:#dbdbdb!important}.has-background-grey-lighter-dark{background-color:#dbdbdb!important}.has-text-white-ter-dark{color:#f5f5f5!important}.has-background-white-ter-dark{background-color:#f5f5f5!important}.has-text-white-bis-dark{color:#fafafa!important}.has-background-white-bis-dark{back
ground-color:#fafafa!important}.box{background-color:#0a0a0a;box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1);color:#b5b5b5}a.box:focus,a.box:hover{box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px #5ea3e4}a.box:active{box-shadow:inset 0 1px 2px rgba(255,255,255,.2),0 0 0 1px #5ea3e4}.button{background-color:#0a0a0a;border-color:#363636;color:#dbdbdb}.button.is-hovered,.button:hover{border-color:#4a4a4a;color:#dbdbdb}.button.is-focused,.button:focus{border-color:#5ea3e4;color:#dbdbdb}.button.is-focused:not(:active),.button:focus:not(:active){box-shadow:0 0 0 .125em rgba(94,163,228,.25)}.button.is-active,.button:active{border-color:#b5b5b5;color:#dbdbdb}.button.is-text{color:#b5b5b5}.button.is-text.is-focused,.button.is-text.is-hovered,.button.is-text:focus,.button.is-text:hover{background-color:#242424;color:#dbdbdb}.button.is-text.is-active,.button.is-text:active{background-color:#171717;color:#dbdbdb}.button.is-white{background-color:#e6e6e6;border-color:transparent;color:#0a0a0a}.button.is-white.is-hovered,.button.is-white:hover{background-color:#dfdfdf;border-color:transparent;color:#0a0a0a}.button.is-white.is-focused,.button.is-white:focus{border-color:transparent;color:#0a0a0a}.button.is-white.is-focused:not(:active),.button.is-white:focus:not(:active){box-shadow:0 0 0 .125em rgba(230,230,230,.25)}.button.is-white.is-active,.button.is-white:active{background-color:#d9d9d9;border-color:transparent;color:#0a0a0a}.button.is-white[disabled],fieldset[disabled] .button.is-white{background-color:#e6e6e6;border-color:transparent;box-shadow:none}.button.is-white.is-inverted{background-color:#0a0a0a;color:#e6e6e6}.button.is-white.is-inverted:hover{background-color:#000}.button.is-white.is-inverted[disabled],fieldset[disabled] .button.is-white.is-inverted{background-color:#0a0a0a;border-color:transparent;box-shadow:none;color:#e6e6e6}.button.is-white.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-white.is-outlined{background-color:transparent;border-color:#e6e6e6;color:#e6e6e6}.button.is-white.is-outlined:focus,.button.is-white.is-outlined:hover{background-color:#e6e6e6;border-color:#e6e6e6;color:#0a0a0a}.button.is-white.is-outlined.is-loading::after{border-color:transparent transparent #e6e6e6 #e6e6e6!important}.button.is-white.is-outlined[disabled],fieldset[disabled] .button.is-white.is-outlined{background-color:transparent;border-color:#e6e6e6;box-shadow:none;color:#e6e6e6}.button.is-white.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-white.is-inverted.is-outlined:focus,.button.is-white.is-inverted.is-outlined:hover{background-color:#0a0a0a;color:#e6e6e6}.button.is-white.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-white.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black{background-color:#000;border-color:transparent;color:#fff}.button.is-black.is-hovered,.button.is-black:hover{background-color:#000;border-color:transparent;color:#fff}.button.is-black.is-focused,.button.is-black:focus{border-color:transparent;color:#fff}.button.is-black.is-focused:not(:active),.button.is-black:focus:not(:active){box-shadow:0 0 0 .125em rgba(0,0,0,.25)}.button.is-black.is-active,.button.is-black:active{background-color:#000;border-color:transparent;color:#fff}.button.is-black[disabled],fieldset[disabled] 
.button.is-black{background-color:#000;border-color:transparent;box-shadow:none}.button.is-black.is-inverted{background-color:#fff;color:#000}.button.is-black.is-inverted:hover{background-color:#f2f2f2}.button.is-black.is-inverted[disabled],fieldset[disabled] .button.is-black.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#000}.button.is-black.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-black.is-outlined{background-color:transparent;border-color:#000;color:#000}.button.is-black.is-outlined:focus,.button.is-black.is-outlined:hover{background-color:#000;border-color:#000;color:#fff}.button.is-black.is-outlined.is-loading::after{border-color:transparent transparent #000 #000!important}.button.is-black.is-outlined[disabled],fieldset[disabled] .button.is-black.is-outlined{background-color:transparent;border-color:#000;box-shadow:none;color:#000}.button.is-black.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-black.is-inverted.is-outlined:focus,.button.is-black.is-inverted.is-outlined:hover{background-color:#fff;color:#000}.button.is-black.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-black.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-light{background-color:#dbdbdb;border-color:transparent;color:#363636}.button.is-light.is-hovered,.button.is-light:hover{background-color:#d5d5d5;border-color:transparent;color:#363636}.button.is-light.is-focused,.button.is-light:focus{border-color:transparent;color:#363636}.button.is-light.is-focused:not(:active),.button.is-light:focus:not(:active){box-shadow:0 0 0 .125em rgba(219,219,219,.25)}.button.is-light.is-active,.button.is-light:active{background-color:#cfcfcf;border-color:transparent;color:#363636}.button.is-light[disabled],fieldset[disabled] .button.is-light{background-color:#dbdbdb;border-color:transparent;box-shadow:none}.button.is-light.is-inverted{background-color:#363636;color:#dbdbdb}.button.is-light.is-inverted:hover{background-color:#292929}.button.is-light.is-inverted[disabled],fieldset[disabled] .button.is-light.is-inverted{background-color:#363636;border-color:transparent;box-shadow:none;color:#dbdbdb}.button.is-light.is-loading::after{border-color:transparent transparent #363636 #363636!important}.button.is-light.is-outlined{background-color:transparent;border-color:#dbdbdb;color:#dbdbdb}.button.is-light.is-outlined:focus,.button.is-light.is-outlined:hover{background-color:#dbdbdb;border-color:#dbdbdb;color:#363636}.button.is-light.is-outlined.is-loading::after{border-color:transparent transparent #dbdbdb #dbdbdb!important}.button.is-light.is-outlined[disabled],fieldset[disabled] .button.is-light.is-outlined{background-color:transparent;border-color:#dbdbdb;box-shadow:none;color:#dbdbdb}.button.is-light.is-inverted.is-outlined{background-color:transparent;border-color:#363636;color:#363636}.button.is-light.is-inverted.is-outlined:focus,.button.is-light.is-inverted.is-outlined:hover{background-color:#363636;color:#dbdbdb}.button.is-light.is-inverted.is-outlined[disabled],fieldset[disabled] 
.button.is-light.is-inverted.is-outlined{background-color:transparent;border-color:#363636;box-shadow:none;color:#363636}.button.is-dark{background-color:#1c1c1c;border-color:transparent;color:#f5f5f5}.button.is-dark.is-hovered,.button.is-dark:hover{background-color:#161616;border-color:transparent;color:#f5f5f5}.button.is-dark.is-focused,.button.is-dark:focus{border-color:transparent;color:#f5f5f5}.button.is-dark.is-focused:not(:active),.button.is-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(28,28,28,.25)}.button.is-dark.is-active,.button.is-dark:active{background-color:#0f0f0f;border-color:transparent;color:#f5f5f5}.button.is-dark[disabled],fieldset[disabled] .button.is-dark{background-color:#1c1c1c;border-color:transparent;box-shadow:none}.button.is-dark.is-inverted{background-color:#f5f5f5;color:#1c1c1c}.button.is-dark.is-inverted:hover{background-color:#e8e8e8}.button.is-dark.is-inverted[disabled],fieldset[disabled] .button.is-dark.is-inverted{background-color:#f5f5f5;border-color:transparent;box-shadow:none;color:#1c1c1c}.button.is-dark.is-loading::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-dark.is-outlined{background-color:transparent;border-color:#1c1c1c;color:#1c1c1c}.button.is-dark.is-outlined:focus,.button.is-dark.is-outlined:hover{background-color:#1c1c1c;border-color:#1c1c1c;color:#f5f5f5}.button.is-dark.is-outlined.is-loading::after{border-color:transparent transparent #1c1c1c #1c1c1c!important}.button.is-dark.is-outlined[disabled],fieldset[disabled] .button.is-dark.is-outlined{background-color:transparent;border-color:#1c1c1c;box-shadow:none;color:#1c1c1c}.button.is-dark.is-inverted.is-outlined{background-color:transparent;border-color:#f5f5f5;color:#f5f5f5}.button.is-dark.is-inverted.is-outlined:focus,.button.is-dark.is-inverted.is-outlined:hover{background-color:#f5f5f5;color:#1c1c1c}.button.is-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-dark.is-inverted.is-outlined{background-color:transparent;border-color:#f5f5f5;box-shadow:none;color:#f5f5f5}.button.is-primary{background-color:#009e86;border-color:transparent;color:#fff}.button.is-primary.is-hovered,.button.is-primary:hover{background-color:#00917c;border-color:transparent;color:#fff}.button.is-primary.is-focused,.button.is-primary:focus{border-color:transparent;color:#fff}.button.is-primary.is-focused:not(:active),.button.is-primary:focus:not(:active){box-shadow:0 0 0 .125em rgba(0,158,134,.25)}.button.is-primary.is-active,.button.is-primary:active{background-color:#008571;border-color:transparent;color:#fff}.button.is-primary[disabled],fieldset[disabled] .button.is-primary{background-color:#009e86;border-color:transparent;box-shadow:none}.button.is-primary.is-inverted{background-color:#fff;color:#009e86}.button.is-primary.is-inverted:hover{background-color:#f2f2f2}.button.is-primary.is-inverted[disabled],fieldset[disabled] .button.is-primary.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#009e86}.button.is-primary.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-primary.is-outlined{background-color:transparent;border-color:#009e86;color:#009e86}.button.is-primary.is-outlined:focus,.button.is-primary.is-outlined:hover{background-color:#009e86;border-color:#009e86;color:#fff}.button.is-primary.is-outlined.is-loading::after{border-color:transparent transparent #009e86 #009e86!important}.button.is-primary.is-outlined[disabled],fieldset[disabled] 
.button.is-primary.is-outlined{background-color:transparent;border-color:#009e86;box-shadow:none;color:#009e86}.button.is-primary.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-primary.is-inverted.is-outlined:focus,.button.is-primary.is-inverted.is-outlined:hover{background-color:#fff;color:#009e86}.button.is-primary.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-primary.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-link{background-color:#205bbc;border-color:transparent;color:#fff}.button.is-link.is-hovered,.button.is-link:hover{background-color:#1e56b1;border-color:transparent;color:#fff}.button.is-link.is-focused,.button.is-link:focus{border-color:transparent;color:#fff}.button.is-link.is-focused:not(:active),.button.is-link:focus:not(:active){box-shadow:0 0 0 .125em rgba(32,91,188,.25)}.button.is-link.is-active,.button.is-link:active{background-color:#1c51a6;border-color:transparent;color:#fff}.button.is-link[disabled],fieldset[disabled] .button.is-link{background-color:#205bbc;border-color:transparent;box-shadow:none}.button.is-link.is-inverted{background-color:#fff;color:#205bbc}.button.is-link.is-inverted:hover{background-color:#f2f2f2}.button.is-link.is-inverted[disabled],fieldset[disabled] .button.is-link.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#205bbc}.button.is-link.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-link.is-outlined{background-color:transparent;border-color:#205bbc;color:#205bbc}.button.is-link.is-outlined:focus,.button.is-link.is-outlined:hover{background-color:#205bbc;border-color:#205bbc;color:#fff}.button.is-link.is-outlined.is-loading::after{border-color:transparent transparent #205bbc #205bbc!important}.button.is-link.is-outlined[disabled],fieldset[disabled] .button.is-link.is-outlined{background-color:transparent;border-color:#205bbc;box-shadow:none;color:#205bbc}.button.is-link.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-link.is-inverted.is-outlined:focus,.button.is-link.is-inverted.is-outlined:hover{background-color:#fff;color:#205bbc}.button.is-link.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-link.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-info{background-color:#0f81cc;border-color:transparent;color:#fff}.button.is-info.is-hovered,.button.is-info:hover{background-color:#0e79c0;border-color:transparent;color:#fff}.button.is-info.is-focused,.button.is-info:focus{border-color:transparent;color:#fff}.button.is-info.is-focused:not(:active),.button.is-info:focus:not(:active){box-shadow:0 0 0 .125em rgba(15,129,204,.25)}.button.is-info.is-active,.button.is-info:active{background-color:#0e72b4;border-color:transparent;color:#fff}.button.is-info[disabled],fieldset[disabled] .button.is-info{background-color:#0f81cc;border-color:transparent;box-shadow:none}.button.is-info.is-inverted{background-color:#fff;color:#0f81cc}.button.is-info.is-inverted:hover{background-color:#f2f2f2}.button.is-info.is-inverted[disabled],fieldset[disabled] .button.is-info.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#0f81cc}.button.is-info.is-loading::after{border-color:transparent transparent #fff 
#fff!important}.button.is-info.is-outlined{background-color:transparent;border-color:#0f81cc;color:#0f81cc}.button.is-info.is-outlined:focus,.button.is-info.is-outlined:hover{background-color:#0f81cc;border-color:#0f81cc;color:#fff}.button.is-info.is-outlined.is-loading::after{border-color:transparent transparent #0f81cc #0f81cc!important}.button.is-info.is-outlined[disabled],fieldset[disabled] .button.is-info.is-outlined{background-color:transparent;border-color:#0f81cc;box-shadow:none;color:#0f81cc}.button.is-info.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-info.is-inverted.is-outlined:focus,.button.is-info.is-inverted.is-outlined:hover{background-color:#fff;color:#0f81cc}.button.is-info.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-info.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-success{background-color:#1ca64c;border-color:transparent;color:#fff}.button.is-success.is-hovered,.button.is-success:hover{background-color:#1a9b47;border-color:transparent;color:#fff}.button.is-success.is-focused,.button.is-success:focus{border-color:transparent;color:#fff}.button.is-success.is-focused:not(:active),.button.is-success:focus:not(:active){box-shadow:0 0 0 .125em rgba(28,166,76,.25)}.button.is-success.is-active,.button.is-success:active{background-color:#189042;border-color:transparent;color:#fff}.button.is-success[disabled],fieldset[disabled] .button.is-success{background-color:#1ca64c;border-color:transparent;box-shadow:none}.button.is-success.is-inverted{background-color:#fff;color:#1ca64c}.button.is-success.is-inverted:hover{background-color:#f2f2f2}.button.is-success.is-inverted[disabled],fieldset[disabled] .button.is-success.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#1ca64c}.button.is-success.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-success.is-outlined{background-color:transparent;border-color:#1ca64c;color:#1ca64c}.button.is-success.is-outlined:focus,.button.is-success.is-outlined:hover{background-color:#1ca64c;border-color:#1ca64c;color:#fff}.button.is-success.is-outlined.is-loading::after{border-color:transparent transparent #1ca64c #1ca64c!important}.button.is-success.is-outlined[disabled],fieldset[disabled] .button.is-success.is-outlined{background-color:transparent;border-color:#1ca64c;box-shadow:none;color:#1ca64c}.button.is-success.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-success.is-inverted.is-outlined:focus,.button.is-success.is-inverted.is-outlined:hover{background-color:#fff;color:#1ca64c}.button.is-success.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-success.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-warning{background-color:#ffd324;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-hovered,.button.is-warning:hover{background-color:#ffd117;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-focused,.button.is-warning:focus{border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-focused:not(:active),.button.is-warning:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,211,36,.25)}.button.is-warning.is-active,.button.is-warning:active{background-color:#ffce0a;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning[disabled],fieldset[disabled] 
.button.is-warning{background-color:#ffd324;border-color:transparent;box-shadow:none}.button.is-warning.is-inverted{background-color:rgba(0,0,0,.7);color:#ffd324}.button.is-warning.is-inverted:hover{background-color:rgba(0,0,0,.7)}.button.is-warning.is-inverted[disabled],fieldset[disabled] .button.is-warning.is-inverted{background-color:rgba(0,0,0,.7);border-color:transparent;box-shadow:none;color:#ffd324}.button.is-warning.is-loading::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-warning.is-outlined{background-color:transparent;border-color:#ffd324;color:#ffd324}.button.is-warning.is-outlined:focus,.button.is-warning.is-outlined:hover{background-color:#ffd324;border-color:#ffd324;color:rgba(0,0,0,.7)}.button.is-warning.is-outlined.is-loading::after{border-color:transparent transparent #ffd324 #ffd324!important}.button.is-warning.is-outlined[disabled],fieldset[disabled] .button.is-warning.is-outlined{background-color:transparent;border-color:#ffd324;box-shadow:none;color:#ffd324}.button.is-warning.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);color:rgba(0,0,0,.7)}.button.is-warning.is-inverted.is-outlined:focus,.button.is-warning.is-inverted.is-outlined:hover{background-color:rgba(0,0,0,.7);color:#ffd324}.button.is-warning.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-warning.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);box-shadow:none;color:rgba(0,0,0,.7)}.button.is-danger{background-color:#ff0537;border-color:transparent;color:#fff}.button.is-danger.is-hovered,.button.is-danger:hover{background-color:#f70031;border-color:transparent;color:#fff}.button.is-danger.is-focused,.button.is-danger:focus{border-color:transparent;color:#fff}.button.is-danger.is-focused:not(:active),.button.is-danger:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,5,55,.25)}.button.is-danger.is-active,.button.is-danger:active{background-color:#eb002f;border-color:transparent;color:#fff}.button.is-danger[disabled],fieldset[disabled] .button.is-danger{background-color:#ff0537;border-color:transparent;box-shadow:none}.button.is-danger.is-inverted{background-color:#fff;color:#ff0537}.button.is-danger.is-inverted:hover{background-color:#f2f2f2}.button.is-danger.is-inverted[disabled],fieldset[disabled] .button.is-danger.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#ff0537}.button.is-danger.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-danger.is-outlined{background-color:transparent;border-color:#ff0537;color:#ff0537}.button.is-danger.is-outlined:focus,.button.is-danger.is-outlined:hover{background-color:#ff0537;border-color:#ff0537;color:#fff}.button.is-danger.is-outlined.is-loading::after{border-color:transparent transparent #ff0537 #ff0537!important}.button.is-danger.is-outlined[disabled],fieldset[disabled] .button.is-danger.is-outlined{background-color:transparent;border-color:#ff0537;box-shadow:none;color:#ff0537}.button.is-danger.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-danger.is-inverted.is-outlined:focus,.button.is-danger.is-inverted.is-outlined:hover{background-color:#fff;color:#ff0537}.button.is-danger.is-inverted.is-outlined[disabled],fieldset[disabled] 
.button.is-danger.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-white-dark{background-color:#fff;border-color:transparent;color:#0a0a0a}.button.is-white-dark.is-hovered,.button.is-white-dark:hover{background-color:#f9f9f9;border-color:transparent;color:#0a0a0a}.button.is-white-dark.is-focused,.button.is-white-dark:focus{border-color:transparent;color:#0a0a0a}.button.is-white-dark.is-focused:not(:active),.button.is-white-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.button.is-white-dark.is-active,.button.is-white-dark:active{background-color:#f2f2f2;border-color:transparent;color:#0a0a0a}.button.is-white-dark[disabled],fieldset[disabled] .button.is-white-dark{background-color:#fff;border-color:transparent;box-shadow:none}.button.is-white-dark.is-inverted{background-color:#0a0a0a;color:#fff}.button.is-white-dark.is-inverted:hover{background-color:#000}.button.is-white-dark.is-inverted[disabled],fieldset[disabled] .button.is-white-dark.is-inverted{background-color:#0a0a0a;border-color:transparent;box-shadow:none;color:#fff}.button.is-white-dark.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-white-dark.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-white-dark.is-outlined:focus,.button.is-white-dark.is-outlined:hover{background-color:#fff;border-color:#fff;color:#0a0a0a}.button.is-white-dark.is-outlined.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-white-dark.is-outlined[disabled],fieldset[disabled] .button.is-white-dark.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-white-dark.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-white-dark.is-inverted.is-outlined:focus,.button.is-white-dark.is-inverted.is-outlined:hover{background-color:#0a0a0a;color:#fff}.button.is-white-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-white-dark.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black-dark{background-color:#0a0a0a;border-color:transparent;color:#fff}.button.is-black-dark.is-hovered,.button.is-black-dark:hover{background-color:#040404;border-color:transparent;color:#fff}.button.is-black-dark.is-focused,.button.is-black-dark:focus{border-color:transparent;color:#fff}.button.is-black-dark.is-focused:not(:active),.button.is-black-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.button.is-black-dark.is-active,.button.is-black-dark:active{background-color:#000;border-color:transparent;color:#fff}.button.is-black-dark[disabled],fieldset[disabled] .button.is-black-dark{background-color:#0a0a0a;border-color:transparent;box-shadow:none}.button.is-black-dark.is-inverted{background-color:#fff;color:#0a0a0a}.button.is-black-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-black-dark.is-inverted[disabled],fieldset[disabled] .button.is-black-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#0a0a0a}.button.is-black-dark.is-loading::after{border-color:transparent transparent #fff 
#fff!important}.button.is-black-dark.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-black-dark.is-outlined:focus,.button.is-black-dark.is-outlined:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.button.is-black-dark.is-outlined.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-black-dark.is-outlined[disabled],fieldset[disabled] .button.is-black-dark.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-black-dark.is-inverted.is-outlined:focus,.button.is-black-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#0a0a0a}.button.is-black-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-black-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-light-dark{background-color:#f5f5f5;border-color:transparent;color:#363636}.button.is-light-dark.is-hovered,.button.is-light-dark:hover{background-color:#eee;border-color:transparent;color:#363636}.button.is-light-dark.is-focused,.button.is-light-dark:focus{border-color:transparent;color:#363636}.button.is-light-dark.is-focused:not(:active),.button.is-light-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.button.is-light-dark.is-active,.button.is-light-dark:active{background-color:#e8e8e8;border-color:transparent;color:#363636}.button.is-light-dark[disabled],fieldset[disabled] .button.is-light-dark{background-color:#f5f5f5;border-color:transparent;box-shadow:none}.button.is-light-dark.is-inverted{background-color:#363636;color:#f5f5f5}.button.is-light-dark.is-inverted:hover{background-color:#292929}.button.is-light-dark.is-inverted[disabled],fieldset[disabled] .button.is-light-dark.is-inverted{background-color:#363636;border-color:transparent;box-shadow:none;color:#f5f5f5}.button.is-light-dark.is-loading::after{border-color:transparent transparent #363636 #363636!important}.button.is-light-dark.is-outlined{background-color:transparent;border-color:#f5f5f5;color:#f5f5f5}.button.is-light-dark.is-outlined:focus,.button.is-light-dark.is-outlined:hover{background-color:#f5f5f5;border-color:#f5f5f5;color:#363636}.button.is-light-dark.is-outlined.is-loading::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-light-dark.is-outlined[disabled],fieldset[disabled] .button.is-light-dark.is-outlined{background-color:transparent;border-color:#f5f5f5;box-shadow:none;color:#f5f5f5}.button.is-light-dark.is-inverted.is-outlined{background-color:transparent;border-color:#363636;color:#363636}.button.is-light-dark.is-inverted.is-outlined:focus,.button.is-light-dark.is-inverted.is-outlined:hover{background-color:#363636;color:#f5f5f5}.button.is-light-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-light-dark.is-inverted.is-outlined{background-color:transparent;border-color:#363636;box-shadow:none;color:#363636}.button.is-dark-dark{background-color:#363636;border-color:transparent;color:#f5f5f5}.button.is-dark-dark.is-hovered,.button.is-dark-dark:hover{background-color:#2f2f2f;border-color:transparent;color:#f5f5f5}.button.is-dark-dark.is-focused,.button.is-dark-dark:focus{border-color:transparent;color:#f5f5f5}.button.is-dark-dark.is-focused:not(:active),.button.is-dark-dark:focus:not(:active){box-shadow:0 0 0 .125em 
rgba(54,54,54,.25)}.button.is-dark-dark.is-active,.button.is-dark-dark:active{background-color:#292929;border-color:transparent;color:#f5f5f5}.button.is-dark-dark[disabled],fieldset[disabled] .button.is-dark-dark{background-color:#363636;border-color:transparent;box-shadow:none}.button.is-dark-dark.is-inverted{background-color:#f5f5f5;color:#363636}.button.is-dark-dark.is-inverted:hover{background-color:#e8e8e8}.button.is-dark-dark.is-inverted[disabled],fieldset[disabled] .button.is-dark-dark.is-inverted{background-color:#f5f5f5;border-color:transparent;box-shadow:none;color:#363636}.button.is-dark-dark.is-loading::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-dark-dark.is-outlined{background-color:transparent;border-color:#363636;color:#363636}.button.is-dark-dark.is-outlined:focus,.button.is-dark-dark.is-outlined:hover{background-color:#363636;border-color:#363636;color:#f5f5f5}.button.is-dark-dark.is-outlined.is-loading::after{border-color:transparent transparent #363636 #363636!important}.button.is-dark-dark.is-outlined[disabled],fieldset[disabled] .button.is-dark-dark.is-outlined{background-color:transparent;border-color:#363636;box-shadow:none;color:#363636}.button.is-dark-dark.is-inverted.is-outlined{background-color:transparent;border-color:#f5f5f5;color:#f5f5f5}.button.is-dark-dark.is-inverted.is-outlined:focus,.button.is-dark-dark.is-inverted.is-outlined:hover{background-color:#f5f5f5;color:#363636}.button.is-dark-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-dark-dark.is-inverted.is-outlined{background-color:transparent;border-color:#f5f5f5;box-shadow:none;color:#f5f5f5}.button.is-primary-dark{background-color:#00d1b2;border-color:transparent;color:#fff}.button.is-primary-dark.is-hovered,.button.is-primary-dark:hover{background-color:#00c4a7;border-color:transparent;color:#fff}.button.is-primary-dark.is-focused,.button.is-primary-dark:focus{border-color:transparent;color:#fff}.button.is-primary-dark.is-focused:not(:active),.button.is-primary-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.button.is-primary-dark.is-active,.button.is-primary-dark:active{background-color:#00b89c;border-color:transparent;color:#fff}.button.is-primary-dark[disabled],fieldset[disabled] .button.is-primary-dark{background-color:#00d1b2;border-color:transparent;box-shadow:none}.button.is-primary-dark.is-inverted{background-color:#fff;color:#00d1b2}.button.is-primary-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-primary-dark.is-inverted[disabled],fieldset[disabled] .button.is-primary-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#00d1b2}.button.is-primary-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-primary-dark.is-outlined{background-color:transparent;border-color:#00d1b2;color:#00d1b2}.button.is-primary-dark.is-outlined:focus,.button.is-primary-dark.is-outlined:hover{background-color:#00d1b2;border-color:#00d1b2;color:#fff}.button.is-primary-dark.is-outlined.is-loading::after{border-color:transparent transparent #00d1b2 #00d1b2!important}.button.is-primary-dark.is-outlined[disabled],fieldset[disabled] 
.button.is-primary-dark.is-outlined{background-color:transparent;border-color:#00d1b2;box-shadow:none;color:#00d1b2}.button.is-primary-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-primary-dark.is-inverted.is-outlined:focus,.button.is-primary-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#00d1b2}.button.is-primary-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-primary-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-link-dark{background-color:#3273dc;border-color:transparent;color:#fff}.button.is-link-dark.is-hovered,.button.is-link-dark:hover{background-color:#276cda;border-color:transparent;color:#fff}.button.is-link-dark.is-focused,.button.is-link-dark:focus{border-color:transparent;color:#fff}.button.is-link-dark.is-focused:not(:active),.button.is-link-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.button.is-link-dark.is-active,.button.is-link-dark:active{background-color:#2366d1;border-color:transparent;color:#fff}.button.is-link-dark[disabled],fieldset[disabled] .button.is-link-dark{background-color:#3273dc;border-color:transparent;box-shadow:none}.button.is-link-dark.is-inverted{background-color:#fff;color:#3273dc}.button.is-link-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-link-dark.is-inverted[disabled],fieldset[disabled] .button.is-link-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#3273dc}.button.is-link-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-link-dark.is-outlined{background-color:transparent;border-color:#3273dc;color:#3273dc}.button.is-link-dark.is-outlined:focus,.button.is-link-dark.is-outlined:hover{background-color:#3273dc;border-color:#3273dc;color:#fff}.button.is-link-dark.is-outlined.is-loading::after{border-color:transparent transparent #3273dc #3273dc!important}.button.is-link-dark.is-outlined[disabled],fieldset[disabled] .button.is-link-dark.is-outlined{background-color:transparent;border-color:#3273dc;box-shadow:none;color:#3273dc}.button.is-link-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-link-dark.is-inverted.is-outlined:focus,.button.is-link-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#3273dc}.button.is-link-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-link-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-info-dark{background-color:#209cee;border-color:transparent;color:#fff}.button.is-info-dark.is-hovered,.button.is-info-dark:hover{background-color:#1496ed;border-color:transparent;color:#fff}.button.is-info-dark.is-focused,.button.is-info-dark:focus{border-color:transparent;color:#fff}.button.is-info-dark.is-focused:not(:active),.button.is-info-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(32,156,238,.25)}.button.is-info-dark.is-active,.button.is-info-dark:active{background-color:#118fe4;border-color:transparent;color:#fff}.button.is-info-dark[disabled],fieldset[disabled] .button.is-info-dark{background-color:#209cee;border-color:transparent;box-shadow:none}.button.is-info-dark.is-inverted{background-color:#fff;color:#209cee}.button.is-info-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-info-dark.is-inverted[disabled],fieldset[disabled] 
.button.is-info-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#209cee}.button.is-info-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-info-dark.is-outlined{background-color:transparent;border-color:#209cee;color:#209cee}.button.is-info-dark.is-outlined:focus,.button.is-info-dark.is-outlined:hover{background-color:#209cee;border-color:#209cee;color:#fff}.button.is-info-dark.is-outlined.is-loading::after{border-color:transparent transparent #209cee #209cee!important}.button.is-info-dark.is-outlined[disabled],fieldset[disabled] .button.is-info-dark.is-outlined{background-color:transparent;border-color:#209cee;box-shadow:none;color:#209cee}.button.is-info-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-info-dark.is-inverted.is-outlined:focus,.button.is-info-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#209cee}.button.is-info-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-info-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-success-dark{background-color:#23d160;border-color:transparent;color:#fff}.button.is-success-dark.is-hovered,.button.is-success-dark:hover{background-color:#22c65b;border-color:transparent;color:#fff}.button.is-success-dark.is-focused,.button.is-success-dark:focus{border-color:transparent;color:#fff}.button.is-success-dark.is-focused:not(:active),.button.is-success-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(35,209,96,.25)}.button.is-success-dark.is-active,.button.is-success-dark:active{background-color:#20bc56;border-color:transparent;color:#fff}.button.is-success-dark[disabled],fieldset[disabled] .button.is-success-dark{background-color:#23d160;border-color:transparent;box-shadow:none}.button.is-success-dark.is-inverted{background-color:#fff;color:#23d160}.button.is-success-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-success-dark.is-inverted[disabled],fieldset[disabled] .button.is-success-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#23d160}.button.is-success-dark.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-success-dark.is-outlined{background-color:transparent;border-color:#23d160;color:#23d160}.button.is-success-dark.is-outlined:focus,.button.is-success-dark.is-outlined:hover{background-color:#23d160;border-color:#23d160;color:#fff}.button.is-success-dark.is-outlined.is-loading::after{border-color:transparent transparent #23d160 #23d160!important}.button.is-success-dark.is-outlined[disabled],fieldset[disabled] .button.is-success-dark.is-outlined{background-color:transparent;border-color:#23d160;box-shadow:none;color:#23d160}.button.is-success-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-success-dark.is-inverted.is-outlined:focus,.button.is-success-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#23d160}.button.is-success-dark.is-inverted.is-outlined[disabled],fieldset[disabled] 
.button.is-success-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-warning-dark{background-color:#ffdd57;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning-dark.is-hovered,.button.is-warning-dark:hover{background-color:#ffdb4a;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning-dark.is-focused,.button.is-warning-dark:focus{border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning-dark.is-focused:not(:active),.button.is-warning-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.button.is-warning-dark.is-active,.button.is-warning-dark:active{background-color:#ffd83d;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning-dark[disabled],fieldset[disabled] .button.is-warning-dark{background-color:#ffdd57;border-color:transparent;box-shadow:none}.button.is-warning-dark.is-inverted{background-color:rgba(0,0,0,.7);color:#ffdd57}.button.is-warning-dark.is-inverted:hover{background-color:rgba(0,0,0,.7)}.button.is-warning-dark.is-inverted[disabled],fieldset[disabled] .button.is-warning-dark.is-inverted{background-color:rgba(0,0,0,.7);border-color:transparent;box-shadow:none;color:#ffdd57}.button.is-warning-dark.is-loading::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-warning-dark.is-outlined{background-color:transparent;border-color:#ffdd57;color:#ffdd57}.button.is-warning-dark.is-outlined:focus,.button.is-warning-dark.is-outlined:hover{background-color:#ffdd57;border-color:#ffdd57;color:rgba(0,0,0,.7)}.button.is-warning-dark.is-outlined.is-loading::after{border-color:transparent transparent #ffdd57 #ffdd57!important}.button.is-warning-dark.is-outlined[disabled],fieldset[disabled] .button.is-warning-dark.is-outlined{background-color:transparent;border-color:#ffdd57;box-shadow:none;color:#ffdd57}.button.is-warning-dark.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);color:rgba(0,0,0,.7)}.button.is-warning-dark.is-inverted.is-outlined:focus,.button.is-warning-dark.is-inverted.is-outlined:hover{background-color:rgba(0,0,0,.7);color:#ffdd57}.button.is-warning-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-warning-dark.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);box-shadow:none;color:rgba(0,0,0,.7)}.button.is-danger-dark{background-color:#ff3860;border-color:transparent;color:#fff}.button.is-danger-dark.is-hovered,.button.is-danger-dark:hover{background-color:#ff2b56;border-color:transparent;color:#fff}.button.is-danger-dark.is-focused,.button.is-danger-dark:focus{border-color:transparent;color:#fff}.button.is-danger-dark.is-focused:not(:active),.button.is-danger-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,56,96,.25)}.button.is-danger-dark.is-active,.button.is-danger-dark:active{background-color:#ff1f4b;border-color:transparent;color:#fff}.button.is-danger-dark[disabled],fieldset[disabled] .button.is-danger-dark{background-color:#ff3860;border-color:transparent;box-shadow:none}.button.is-danger-dark.is-inverted{background-color:#fff;color:#ff3860}.button.is-danger-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-danger-dark.is-inverted[disabled],fieldset[disabled] .button.is-danger-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#ff3860}.button.is-danger-dark.is-loading::after{border-color:transparent transparent #fff 
#fff!important}.button.is-danger-dark.is-outlined{background-color:transparent;border-color:#ff3860;color:#ff3860}.button.is-danger-dark.is-outlined:focus,.button.is-danger-dark.is-outlined:hover{background-color:#ff3860;border-color:#ff3860;color:#fff}.button.is-danger-dark.is-outlined.is-loading::after{border-color:transparent transparent #ff3860 #ff3860!important}.button.is-danger-dark.is-outlined[disabled],fieldset[disabled] .button.is-danger-dark.is-outlined{background-color:transparent;border-color:#ff3860;box-shadow:none;color:#ff3860}.button.is-danger-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-danger-dark.is-inverted.is-outlined:focus,.button.is-danger-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#ff3860}.button.is-danger-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-danger-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button[disabled],fieldset[disabled] .button{background-color:#0a0a0a;border-color:#363636}.button.is-static{background-color:#f5f5f5;border-color:#363636;color:#7a7a7a}.content h1,.content h2,.content h3,.content h4,.content h5,.content h6{color:#dbdbdb}.content blockquote{background-color:#242424;border-left:5px solid #363636}.content table td,.content table th{border:1px solid #363636}.content table th{color:#dbdbdb}.content table thead td,.content table thead th{color:#dbdbdb}.content table tfoot td,.content table tfoot th{color:#dbdbdb}.input,.textarea{background-color:#0a0a0a;border-color:#363636;color:#dbdbdb;box-shadow:inset 0 1px 2px rgba(255,255,255,.1)}.input::-moz-placeholder,.textarea::-moz-placeholder{color:rgba(219,219,219,.3)}.input::-webkit-input-placeholder,.textarea::-webkit-input-placeholder{color:rgba(219,219,219,.3)}.input:-moz-placeholder,.textarea:-moz-placeholder{color:rgba(219,219,219,.3)}.input:-ms-input-placeholder,.textarea:-ms-input-placeholder{color:rgba(219,219,219,.3)}.input.is-hovered,.input:hover,.textarea.is-hovered,.textarea:hover{border-color:#4a4a4a}.input.is-active,.input.is-focused,.input:active,.input:focus,.textarea.is-active,.textarea.is-focused,.textarea:active,.textarea:focus{border-color:#5ea3e4;box-shadow:0 0 0 .125em rgba(94,163,228,.25)}.input[disabled],.textarea[disabled],fieldset[disabled] .input,fieldset[disabled] .textarea{background-color:#242424;border-color:#242424;color:#b5b5b5}.input[disabled]::-moz-placeholder,.textarea[disabled]::-moz-placeholder,fieldset[disabled] .input::-moz-placeholder,fieldset[disabled] .textarea::-moz-placeholder{color:rgba(181,181,181,.3)}.input[disabled]::-webkit-input-placeholder,.textarea[disabled]::-webkit-input-placeholder,fieldset[disabled] .input::-webkit-input-placeholder,fieldset[disabled] .textarea::-webkit-input-placeholder{color:rgba(181,181,181,.3)}.input[disabled]:-moz-placeholder,.textarea[disabled]:-moz-placeholder,fieldset[disabled] .input:-moz-placeholder,fieldset[disabled] .textarea:-moz-placeholder{color:rgba(181,181,181,.3)}.input[disabled]:-ms-input-placeholder,.textarea[disabled]:-ms-input-placeholder,fieldset[disabled] .input:-ms-input-placeholder,fieldset[disabled] .textarea:-ms-input-placeholder{color:rgba(181,181,181,.3)}.input.is-white,.textarea.is-white{border-color:#e6e6e6}.input.is-white.is-active,.input.is-white.is-focused,.input.is-white:active,.input.is-white:focus,.textarea.is-white.is-active,.textarea.is-white.is-focused,.textarea.is-white:active,.textarea.is-white:focus{box-shadow:0 0 0 .125em 
rgba(230,230,230,.25)}.input.is-black,.textarea.is-black{border-color:#000}.input.is-black.is-active,.input.is-black.is-focused,.input.is-black:active,.input.is-black:focus,.textarea.is-black.is-active,.textarea.is-black.is-focused,.textarea.is-black:active,.textarea.is-black:focus{box-shadow:0 0 0 .125em rgba(0,0,0,.25)}.input.is-light,.textarea.is-light{border-color:#dbdbdb}.input.is-light.is-active,.input.is-light.is-focused,.input.is-light:active,.input.is-light:focus,.textarea.is-light.is-active,.textarea.is-light.is-focused,.textarea.is-light:active,.textarea.is-light:focus{box-shadow:0 0 0 .125em rgba(219,219,219,.25)}.input.is-dark,.textarea.is-dark{border-color:#1c1c1c}.input.is-dark.is-active,.input.is-dark.is-focused,.input.is-dark:active,.input.is-dark:focus,.textarea.is-dark.is-active,.textarea.is-dark.is-focused,.textarea.is-dark:active,.textarea.is-dark:focus{box-shadow:0 0 0 .125em rgba(28,28,28,.25)}.input.is-primary,.textarea.is-primary{border-color:#009e86}.input.is-primary.is-active,.input.is-primary.is-focused,.input.is-primary:active,.input.is-primary:focus,.textarea.is-primary.is-active,.textarea.is-primary.is-focused,.textarea.is-primary:active,.textarea.is-primary:focus{box-shadow:0 0 0 .125em rgba(0,158,134,.25)}.input.is-link,.textarea.is-link{border-color:#205bbc}.input.is-link.is-active,.input.is-link.is-focused,.input.is-link:active,.input.is-link:focus,.textarea.is-link.is-active,.textarea.is-link.is-focused,.textarea.is-link:active,.textarea.is-link:focus{box-shadow:0 0 0 .125em rgba(32,91,188,.25)}.input.is-info,.textarea.is-info{border-color:#0f81cc}.input.is-info.is-active,.input.is-info.is-focused,.input.is-info:active,.input.is-info:focus,.textarea.is-info.is-active,.textarea.is-info.is-focused,.textarea.is-info:active,.textarea.is-info:focus{box-shadow:0 0 0 .125em rgba(15,129,204,.25)}.input.is-success,.textarea.is-success{border-color:#1ca64c}.input.is-success.is-active,.input.is-success.is-focused,.input.is-success:active,.input.is-success:focus,.textarea.is-success.is-active,.textarea.is-success.is-focused,.textarea.is-success:active,.textarea.is-success:focus{box-shadow:0 0 0 .125em rgba(28,166,76,.25)}.input.is-warning,.textarea.is-warning{border-color:#ffd324}.input.is-warning.is-active,.input.is-warning.is-focused,.input.is-warning:active,.input.is-warning:focus,.textarea.is-warning.is-active,.textarea.is-warning.is-focused,.textarea.is-warning:active,.textarea.is-warning:focus{box-shadow:0 0 0 .125em rgba(255,211,36,.25)}.input.is-danger,.textarea.is-danger{border-color:#ff0537}.input.is-danger.is-active,.input.is-danger.is-focused,.input.is-danger:active,.input.is-danger:focus,.textarea.is-danger.is-active,.textarea.is-danger.is-focused,.textarea.is-danger:active,.textarea.is-danger:focus{box-shadow:0 0 0 .125em rgba(255,5,55,.25)}.input.is-white-dark,.textarea.is-white-dark{border-color:#fff}.input.is-white-dark.is-active,.input.is-white-dark.is-focused,.input.is-white-dark:active,.input.is-white-dark:focus,.textarea.is-white-dark.is-active,.textarea.is-white-dark.is-focused,.textarea.is-white-dark:active,.textarea.is-white-dark:focus{box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.input.is-black-dark,.textarea.is-black-dark{border-color:#0a0a0a}.input.is-black-dark.is-active,.input.is-black-dark.is-focused,.input.is-black-dark:active,.input.is-black-dark:focus,.textarea.is-black-dark.is-active,.textarea.is-black-dark.is-focused,.textarea.is-black-dark:active,.textarea.is-black-dark:focus{box-shadow:0 0 0 .125em 
rgba(10,10,10,.25)}.input.is-light-dark,.textarea.is-light-dark{border-color:#f5f5f5}.input.is-light-dark.is-active,.input.is-light-dark.is-focused,.input.is-light-dark:active,.input.is-light-dark:focus,.textarea.is-light-dark.is-active,.textarea.is-light-dark.is-focused,.textarea.is-light-dark:active,.textarea.is-light-dark:focus{box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.input.is-dark-dark,.textarea.is-dark-dark{border-color:#363636}.input.is-dark-dark.is-active,.input.is-dark-dark.is-focused,.input.is-dark-dark:active,.input.is-dark-dark:focus,.textarea.is-dark-dark.is-active,.textarea.is-dark-dark.is-focused,.textarea.is-dark-dark:active,.textarea.is-dark-dark:focus{box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.input.is-primary-dark,.textarea.is-primary-dark{border-color:#00d1b2}.input.is-primary-dark.is-active,.input.is-primary-dark.is-focused,.input.is-primary-dark:active,.input.is-primary-dark:focus,.textarea.is-primary-dark.is-active,.textarea.is-primary-dark.is-focused,.textarea.is-primary-dark:active,.textarea.is-primary-dark:focus{box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.input.is-link-dark,.textarea.is-link-dark{border-color:#3273dc}.input.is-link-dark.is-active,.input.is-link-dark.is-focused,.input.is-link-dark:active,.input.is-link-dark:focus,.textarea.is-link-dark.is-active,.textarea.is-link-dark.is-focused,.textarea.is-link-dark:active,.textarea.is-link-dark:focus{box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.input.is-info-dark,.textarea.is-info-dark{border-color:#209cee}.input.is-info-dark.is-active,.input.is-info-dark.is-focused,.input.is-info-dark:active,.input.is-info-dark:focus,.textarea.is-info-dark.is-active,.textarea.is-info-dark.is-focused,.textarea.is-info-dark:active,.textarea.is-info-dark:focus{box-shadow:0 0 0 .125em rgba(32,156,238,.25)}.input.is-success-dark,.textarea.is-success-dark{border-color:#23d160}.input.is-success-dark.is-active,.input.is-success-dark.is-focused,.input.is-success-dark:active,.input.is-success-dark:focus,.textarea.is-success-dark.is-active,.textarea.is-success-dark.is-focused,.textarea.is-success-dark:active,.textarea.is-success-dark:focus{box-shadow:0 0 0 .125em rgba(35,209,96,.25)}.input.is-warning-dark,.textarea.is-warning-dark{border-color:#ffdd57}.input.is-warning-dark.is-active,.input.is-warning-dark.is-focused,.input.is-warning-dark:active,.input.is-warning-dark:focus,.textarea.is-warning-dark.is-active,.textarea.is-warning-dark.is-focused,.textarea.is-warning-dark:active,.textarea.is-warning-dark:focus{box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.input.is-danger-dark,.textarea.is-danger-dark{border-color:#ff3860}.input.is-danger-dark.is-active,.input.is-danger-dark.is-focused,.input.is-danger-dark:active,.input.is-danger-dark:focus,.textarea.is-danger-dark.is-active,.textarea.is-danger-dark.is-focused,.textarea.is-danger-dark:active,.textarea.is-danger-dark:focus{box-shadow:0 0 0 .125em rgba(255,56,96,.25)}.checkbox:hover,.radio:hover{color:#dbdbdb}.checkbox[disabled],.radio[disabled],fieldset[disabled] .checkbox,fieldset[disabled] .radio{color:#b5b5b5}.select:not(.is-multiple):not(.is-loading)::after{border-color:#5ea3e4}.select select[disabled]:hover,fieldset[disabled] .select select:hover{border-color:#242424}.select:not(.is-multiple):not(.is-loading):hover::after{border-color:#dbdbdb}.select.is-white:not(:hover)::after{border-color:#e6e6e6}.select.is-white select{border-color:#e6e6e6}.select.is-white select.is-hovered,.select.is-white select:hover{border-color:#d9d9d9}.select.is-white select.is-active,.select.is-white 
select.is-focused,.select.is-white select:active,.select.is-white select:focus{box-shadow:0 0 0 .125em rgba(230,230,230,.25)}.select.is-black:not(:hover)::after{border-color:#000}.select.is-black select{border-color:#000}.select.is-black select.is-hovered,.select.is-black select:hover{border-color:#000}.select.is-black select.is-active,.select.is-black select.is-focused,.select.is-black select:active,.select.is-black select:focus{box-shadow:0 0 0 .125em rgba(0,0,0,.25)}.select.is-light:not(:hover)::after{border-color:#dbdbdb}.select.is-light select{border-color:#dbdbdb}.select.is-light select.is-hovered,.select.is-light select:hover{border-color:#cfcfcf}.select.is-light select.is-active,.select.is-light select.is-focused,.select.is-light select:active,.select.is-light select:focus{box-shadow:0 0 0 .125em rgba(219,219,219,.25)}.select.is-dark:not(:hover)::after{border-color:#1c1c1c}.select.is-dark select{border-color:#1c1c1c}.select.is-dark select.is-hovered,.select.is-dark select:hover{border-color:#0f0f0f}.select.is-dark select.is-active,.select.is-dark select.is-focused,.select.is-dark select:active,.select.is-dark select:focus{box-shadow:0 0 0 .125em rgba(28,28,28,.25)}.select.is-primary:not(:hover)::after{border-color:#009e86}.select.is-primary select{border-color:#009e86}.select.is-primary select.is-hovered,.select.is-primary select:hover{border-color:#008571}.select.is-primary select.is-active,.select.is-primary select.is-focused,.select.is-primary select:active,.select.is-primary select:focus{box-shadow:0 0 0 .125em rgba(0,158,134,.25)}.select.is-link:not(:hover)::after{border-color:#205bbc}.select.is-link select{border-color:#205bbc}.select.is-link select.is-hovered,.select.is-link select:hover{border-color:#1c51a6}.select.is-link select.is-active,.select.is-link select.is-focused,.select.is-link select:active,.select.is-link select:focus{box-shadow:0 0 0 .125em rgba(32,91,188,.25)}.select.is-info:not(:hover)::after{border-color:#0f81cc}.select.is-info select{border-color:#0f81cc}.select.is-info select.is-hovered,.select.is-info select:hover{border-color:#0e72b4}.select.is-info select.is-active,.select.is-info select.is-focused,.select.is-info select:active,.select.is-info select:focus{box-shadow:0 0 0 .125em rgba(15,129,204,.25)}.select.is-success:not(:hover)::after{border-color:#1ca64c}.select.is-success select{border-color:#1ca64c}.select.is-success select.is-hovered,.select.is-success select:hover{border-color:#189042}.select.is-success select.is-active,.select.is-success select.is-focused,.select.is-success select:active,.select.is-success select:focus{box-shadow:0 0 0 .125em rgba(28,166,76,.25)}.select.is-warning:not(:hover)::after{border-color:#ffd324}.select.is-warning select{border-color:#ffd324}.select.is-warning select.is-hovered,.select.is-warning select:hover{border-color:#ffce0a}.select.is-warning select.is-active,.select.is-warning select.is-focused,.select.is-warning select:active,.select.is-warning select:focus{box-shadow:0 0 0 .125em rgba(255,211,36,.25)}.select.is-danger:not(:hover)::after{border-color:#ff0537}.select.is-danger select{border-color:#ff0537}.select.is-danger select.is-hovered,.select.is-danger select:hover{border-color:#eb002f}.select.is-danger select.is-active,.select.is-danger select.is-focused,.select.is-danger select:active,.select.is-danger select:focus{box-shadow:0 0 0 .125em rgba(255,5,55,.25)}.select.is-white-dark:not(:hover)::after{border-color:#fff}.select.is-white-dark select{border-color:#fff}.select.is-white-dark 
select.is-hovered,.select.is-white-dark select:hover{border-color:#f2f2f2}.select.is-white-dark select.is-active,.select.is-white-dark select.is-focused,.select.is-white-dark select:active,.select.is-white-dark select:focus{box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.select.is-black-dark:not(:hover)::after{border-color:#0a0a0a}.select.is-black-dark select{border-color:#0a0a0a}.select.is-black-dark select.is-hovered,.select.is-black-dark select:hover{border-color:#000}.select.is-black-dark select.is-active,.select.is-black-dark select.is-focused,.select.is-black-dark select:active,.select.is-black-dark select:focus{box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.select.is-light-dark:not(:hover)::after{border-color:#f5f5f5}.select.is-light-dark select{border-color:#f5f5f5}.select.is-light-dark select.is-hovered,.select.is-light-dark select:hover{border-color:#e8e8e8}.select.is-light-dark select.is-active,.select.is-light-dark select.is-focused,.select.is-light-dark select:active,.select.is-light-dark select:focus{box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.select.is-dark-dark:not(:hover)::after{border-color:#363636}.select.is-dark-dark select{border-color:#363636}.select.is-dark-dark select.is-hovered,.select.is-dark-dark select:hover{border-color:#292929}.select.is-dark-dark select.is-active,.select.is-dark-dark select.is-focused,.select.is-dark-dark select:active,.select.is-dark-dark select:focus{box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.select.is-primary-dark:not(:hover)::after{border-color:#00d1b2}.select.is-primary-dark select{border-color:#00d1b2}.select.is-primary-dark select.is-hovered,.select.is-primary-dark select:hover{border-color:#00b89c}.select.is-primary-dark select.is-active,.select.is-primary-dark select.is-focused,.select.is-primary-dark select:active,.select.is-primary-dark select:focus{box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.select.is-link-dark:not(:hover)::after{border-color:#3273dc}.select.is-link-dark select{border-color:#3273dc}.select.is-link-dark select.is-hovered,.select.is-link-dark select:hover{border-color:#2366d1}.select.is-link-dark select.is-active,.select.is-link-dark select.is-focused,.select.is-link-dark select:active,.select.is-link-dark select:focus{box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.select.is-info-dark:not(:hover)::after{border-color:#209cee}.select.is-info-dark select{border-color:#209cee}.select.is-info-dark select.is-hovered,.select.is-info-dark select:hover{border-color:#118fe4}.select.is-info-dark select.is-active,.select.is-info-dark select.is-focused,.select.is-info-dark select:active,.select.is-info-dark select:focus{box-shadow:0 0 0 .125em rgba(32,156,238,.25)}.select.is-success-dark:not(:hover)::after{border-color:#23d160}.select.is-success-dark select{border-color:#23d160}.select.is-success-dark select.is-hovered,.select.is-success-dark select:hover{border-color:#20bc56}.select.is-success-dark select.is-active,.select.is-success-dark select.is-focused,.select.is-success-dark select:active,.select.is-success-dark select:focus{box-shadow:0 0 0 .125em rgba(35,209,96,.25)}.select.is-warning-dark:not(:hover)::after{border-color:#ffdd57}.select.is-warning-dark select{border-color:#ffdd57}.select.is-warning-dark select.is-hovered,.select.is-warning-dark select:hover{border-color:#ffd83d}.select.is-warning-dark select.is-active,.select.is-warning-dark select.is-focused,.select.is-warning-dark select:active,.select.is-warning-dark select:focus{box-shadow:0 0 0 .125em 
rgba(255,221,87,.25)}.select.is-danger-dark:not(:hover)::after{border-color:#ff3860}.select.is-danger-dark select{border-color:#ff3860}.select.is-danger-dark select.is-hovered,.select.is-danger-dark select:hover{border-color:#ff1f4b}.select.is-danger-dark select.is-active,.select.is-danger-dark select.is-focused,.select.is-danger-dark select:active,.select.is-danger-dark select:focus{box-shadow:0 0 0 .125em rgba(255,56,96,.25)}.select.is-disabled::after{border-color:#b5b5b5}.file.is-white .file-cta{background-color:#e6e6e6;color:#0a0a0a}.file.is-white.is-hovered .file-cta,.file.is-white:hover .file-cta{background-color:#dfdfdf;color:#0a0a0a}.file.is-white.is-focused .file-cta,.file.is-white:focus .file-cta{box-shadow:0 0 .5em rgba(230,230,230,.25);color:#0a0a0a}.file.is-white.is-active .file-cta,.file.is-white:active .file-cta{background-color:#d9d9d9;color:#0a0a0a}.file.is-black .file-cta{background-color:#000;color:#fff}.file.is-black.is-hovered .file-cta,.file.is-black:hover .file-cta{background-color:#000;color:#fff}.file.is-black.is-focused .file-cta,.file.is-black:focus .file-cta{box-shadow:0 0 .5em rgba(0,0,0,.25);color:#fff}.file.is-black.is-active .file-cta,.file.is-black:active .file-cta{background-color:#000;color:#fff}.file.is-light .file-cta{background-color:#dbdbdb;color:#363636}.file.is-light.is-hovered .file-cta,.file.is-light:hover .file-cta{background-color:#d5d5d5;color:#363636}.file.is-light.is-focused .file-cta,.file.is-light:focus .file-cta{box-shadow:0 0 .5em rgba(219,219,219,.25);color:#363636}.file.is-light.is-active .file-cta,.file.is-light:active .file-cta{background-color:#cfcfcf;color:#363636}.file.is-dark .file-cta{background-color:#1c1c1c;color:#f5f5f5}.file.is-dark.is-hovered .file-cta,.file.is-dark:hover .file-cta{background-color:#161616;color:#f5f5f5}.file.is-dark.is-focused .file-cta,.file.is-dark:focus .file-cta{box-shadow:0 0 .5em rgba(28,28,28,.25);color:#f5f5f5}.file.is-dark.is-active .file-cta,.file.is-dark:active .file-cta{background-color:#0f0f0f;color:#f5f5f5}.file.is-primary .file-cta{background-color:#009e86;color:#fff}.file.is-primary.is-hovered .file-cta,.file.is-primary:hover .file-cta{background-color:#00917c;color:#fff}.file.is-primary.is-focused .file-cta,.file.is-primary:focus .file-cta{box-shadow:0 0 .5em rgba(0,158,134,.25);color:#fff}.file.is-primary.is-active .file-cta,.file.is-primary:active .file-cta{background-color:#008571;color:#fff}.file.is-link .file-cta{background-color:#205bbc;color:#fff}.file.is-link.is-hovered .file-cta,.file.is-link:hover .file-cta{background-color:#1e56b1;color:#fff}.file.is-link.is-focused .file-cta,.file.is-link:focus .file-cta{box-shadow:0 0 .5em rgba(32,91,188,.25);color:#fff}.file.is-link.is-active .file-cta,.file.is-link:active .file-cta{background-color:#1c51a6;color:#fff}.file.is-info .file-cta{background-color:#0f81cc;color:#fff}.file.is-info.is-hovered .file-cta,.file.is-info:hover .file-cta{background-color:#0e79c0;color:#fff}.file.is-info.is-focused .file-cta,.file.is-info:focus .file-cta{box-shadow:0 0 .5em rgba(15,129,204,.25);color:#fff}.file.is-info.is-active .file-cta,.file.is-info:active .file-cta{background-color:#0e72b4;color:#fff}.file.is-success .file-cta{background-color:#1ca64c;color:#fff}.file.is-success.is-hovered .file-cta,.file.is-success:hover .file-cta{background-color:#1a9b47;color:#fff}.file.is-success.is-focused .file-cta,.file.is-success:focus .file-cta{box-shadow:0 0 .5em rgba(28,166,76,.25);color:#fff}.file.is-success.is-active .file-cta,.file.is-success:active 
.file-cta{background-color:#189042;color:#fff}.file.is-warning .file-cta{background-color:#ffd324;color:rgba(0,0,0,.7)}.file.is-warning.is-hovered .file-cta,.file.is-warning:hover .file-cta{background-color:#ffd117;color:rgba(0,0,0,.7)}.file.is-warning.is-focused .file-cta,.file.is-warning:focus .file-cta{box-shadow:0 0 .5em rgba(255,211,36,.25);color:rgba(0,0,0,.7)}.file.is-warning.is-active .file-cta,.file.is-warning:active .file-cta{background-color:#ffce0a;color:rgba(0,0,0,.7)}.file.is-danger .file-cta{background-color:#ff0537;color:#fff}.file.is-danger.is-hovered .file-cta,.file.is-danger:hover .file-cta{background-color:#f70031;color:#fff}.file.is-danger.is-focused .file-cta,.file.is-danger:focus .file-cta{box-shadow:0 0 .5em rgba(255,5,55,.25);color:#fff}.file.is-danger.is-active .file-cta,.file.is-danger:active .file-cta{background-color:#eb002f;color:#fff}.file.is-white-dark .file-cta{background-color:#fff;color:#0a0a0a}.file.is-white-dark.is-hovered .file-cta,.file.is-white-dark:hover .file-cta{background-color:#f9f9f9;color:#0a0a0a}.file.is-white-dark.is-focused .file-cta,.file.is-white-dark:focus .file-cta{box-shadow:0 0 .5em rgba(255,255,255,.25);color:#0a0a0a}.file.is-white-dark.is-active .file-cta,.file.is-white-dark:active .file-cta{background-color:#f2f2f2;color:#0a0a0a}.file.is-black-dark .file-cta{background-color:#0a0a0a;color:#fff}.file.is-black-dark.is-hovered .file-cta,.file.is-black-dark:hover .file-cta{background-color:#040404;color:#fff}.file.is-black-dark.is-focused .file-cta,.file.is-black-dark:focus .file-cta{box-shadow:0 0 .5em rgba(10,10,10,.25);color:#fff}.file.is-black-dark.is-active .file-cta,.file.is-black-dark:active .file-cta{background-color:#000;color:#fff}.file.is-light-dark .file-cta{background-color:#f5f5f5;color:#363636}.file.is-light-dark.is-hovered .file-cta,.file.is-light-dark:hover .file-cta{background-color:#eee;color:#363636}.file.is-light-dark.is-focused .file-cta,.file.is-light-dark:focus .file-cta{box-shadow:0 0 .5em rgba(245,245,245,.25);color:#363636}.file.is-light-dark.is-active .file-cta,.file.is-light-dark:active .file-cta{background-color:#e8e8e8;color:#363636}.file.is-dark-dark .file-cta{background-color:#363636;color:#f5f5f5}.file.is-dark-dark.is-hovered .file-cta,.file.is-dark-dark:hover .file-cta{background-color:#2f2f2f;color:#f5f5f5}.file.is-dark-dark.is-focused .file-cta,.file.is-dark-dark:focus .file-cta{box-shadow:0 0 .5em rgba(54,54,54,.25);color:#f5f5f5}.file.is-dark-dark.is-active .file-cta,.file.is-dark-dark:active .file-cta{background-color:#292929;color:#f5f5f5}.file.is-primary-dark .file-cta{background-color:#00d1b2;color:#fff}.file.is-primary-dark.is-hovered .file-cta,.file.is-primary-dark:hover .file-cta{background-color:#00c4a7;color:#fff}.file.is-primary-dark.is-focused .file-cta,.file.is-primary-dark:focus .file-cta{box-shadow:0 0 .5em rgba(0,209,178,.25);color:#fff}.file.is-primary-dark.is-active .file-cta,.file.is-primary-dark:active .file-cta{background-color:#00b89c;color:#fff}.file.is-link-dark .file-cta{background-color:#3273dc;color:#fff}.file.is-link-dark.is-hovered .file-cta,.file.is-link-dark:hover .file-cta{background-color:#276cda;color:#fff}.file.is-link-dark.is-focused .file-cta,.file.is-link-dark:focus .file-cta{box-shadow:0 0 .5em rgba(50,115,220,.25);color:#fff}.file.is-link-dark.is-active .file-cta,.file.is-link-dark:active .file-cta{background-color:#2366d1;color:#fff}.file.is-info-dark .file-cta{background-color:#209cee;color:#fff}.file.is-info-dark.is-hovered 
.file-cta,.file.is-info-dark:hover .file-cta{background-color:#1496ed;color:#fff}.file.is-info-dark.is-focused .file-cta,.file.is-info-dark:focus .file-cta{box-shadow:0 0 .5em rgba(32,156,238,.25);color:#fff}.file.is-info-dark.is-active .file-cta,.file.is-info-dark:active .file-cta{background-color:#118fe4;color:#fff}.file.is-success-dark .file-cta{background-color:#23d160;color:#fff}.file.is-success-dark.is-hovered .file-cta,.file.is-success-dark:hover .file-cta{background-color:#22c65b;color:#fff}.file.is-success-dark.is-focused .file-cta,.file.is-success-dark:focus .file-cta{box-shadow:0 0 .5em rgba(35,209,96,.25);color:#fff}.file.is-success-dark.is-active .file-cta,.file.is-success-dark:active .file-cta{background-color:#20bc56;color:#fff}.file.is-warning-dark .file-cta{background-color:#ffdd57;color:rgba(0,0,0,.7)}.file.is-warning-dark.is-hovered .file-cta,.file.is-warning-dark:hover .file-cta{background-color:#ffdb4a;color:rgba(0,0,0,.7)}.file.is-warning-dark.is-focused .file-cta,.file.is-warning-dark:focus .file-cta{box-shadow:0 0 .5em rgba(255,221,87,.25);color:rgba(0,0,0,.7)}.file.is-warning-dark.is-active .file-cta,.file.is-warning-dark:active .file-cta{background-color:#ffd83d;color:rgba(0,0,0,.7)}.file.is-danger-dark .file-cta{background-color:#ff3860;color:#fff}.file.is-danger-dark.is-hovered .file-cta,.file.is-danger-dark:hover .file-cta{background-color:#ff2b56;color:#fff}.file.is-danger-dark.is-focused .file-cta,.file.is-danger-dark:focus .file-cta{box-shadow:0 0 .5em rgba(255,56,96,.25);color:#fff}.file.is-danger-dark.is-active .file-cta,.file.is-danger-dark:active .file-cta{background-color:#ff1f4b;color:#fff}.file-label:hover .file-cta{background-color:#1d1d1d;color:#dbdbdb}.file-label:hover .file-name{border-color:#2f2f2f}.file-label:active .file-cta{background-color:#171717;color:#dbdbdb}.file-label:active .file-name{border-color:#292929}.file-cta,.file-name{border-color:#363636}.file-cta{background-color:#242424;color:#b5b5b5}.file-name{border-color:#363636}.label{color:#dbdbdb}.help.is-white{color:#e6e6e6}.help.is-black{color:#000}.help.is-light{color:#dbdbdb}.help.is-dark{color:#1c1c1c}.help.is-primary{color:#009e86}.help.is-link{color:#205bbc}.help.is-info{color:#0f81cc}.help.is-success{color:#1ca64c}.help.is-warning{color:#ffd324}.help.is-danger{color:#ff0537}.help.is-white-dark{color:#fff}.help.is-black-dark{color:#0a0a0a}.help.is-light-dark{color:#f5f5f5}.help.is-dark-dark{color:#363636}.help.is-primary-dark{color:#00d1b2}.help.is-link-dark{color:#3273dc}.help.is-info-dark{color:#209cee}.help.is-success-dark{color:#23d160}.help.is-warning-dark{color:#ffdd57}.help.is-danger-dark{color:#ff3860}.control.has-icons-left .icon,.control.has-icons-right .icon{color:#363636}.notification{background-color:#242424}.notification code,.notification 
pre{background:#0a0a0a}.notification.is-white{background-color:#e6e6e6;color:#0a0a0a}.notification.is-black{background-color:#000;color:#fff}.notification.is-light{background-color:#dbdbdb;color:#363636}.notification.is-dark{background-color:#1c1c1c;color:#f5f5f5}.notification.is-primary{background-color:#009e86;color:#fff}.notification.is-link{background-color:#205bbc;color:#fff}.notification.is-info{background-color:#0f81cc;color:#fff}.notification.is-success{background-color:#1ca64c;color:#fff}.notification.is-warning{background-color:#ffd324;color:rgba(0,0,0,.7)}.notification.is-danger{background-color:#ff0537;color:#fff}.notification.is-white-dark{background-color:#fff;color:#0a0a0a}.notification.is-black-dark{background-color:#0a0a0a;color:#fff}.notification.is-light-dark{background-color:#f5f5f5;color:#363636}.notification.is-dark-dark{background-color:#363636;color:#f5f5f5}.notification.is-primary-dark{background-color:#00d1b2;color:#fff}.notification.is-link-dark{background-color:#3273dc;color:#fff}.notification.is-info-dark{background-color:#209cee;color:#fff}.notification.is-success-dark{background-color:#23d160;color:#fff}.notification.is-warning-dark{background-color:#ffdd57;color:rgba(0,0,0,.7)}.notification.is-danger-dark{background-color:#ff3860;color:#fff}.progress::-webkit-progress-bar{background-color:#363636}.progress::-webkit-progress-value{background-color:#b5b5b5}.progress::-moz-progress-bar{background-color:#b5b5b5}.progress::-ms-fill{background-color:#b5b5b5}.progress:indeterminate{background-color:#363636;background-image:linear-gradient(to right,#4a4a4a 30%,#363636 30%)}.progress.is-white::-webkit-progress-value{background-color:#e6e6e6}.progress.is-white::-moz-progress-bar{background-color:#e6e6e6}.progress.is-white::-ms-fill{background-color:#e6e6e6}.progress.is-white:indeterminate{background-image:linear-gradient(to right,#e6e6e6 30%,#363636 30%)}.progress.is-black::-webkit-progress-value{background-color:#000}.progress.is-black::-moz-progress-bar{background-color:#000}.progress.is-black::-ms-fill{background-color:#000}.progress.is-black:indeterminate{background-image:linear-gradient(to right,#000 30%,#363636 30%)}.progress.is-light::-webkit-progress-value{background-color:#dbdbdb}.progress.is-light::-moz-progress-bar{background-color:#dbdbdb}.progress.is-light::-ms-fill{background-color:#dbdbdb}.progress.is-light:indeterminate{background-image:linear-gradient(to right,#dbdbdb 30%,#363636 30%)}.progress.is-dark::-webkit-progress-value{background-color:#1c1c1c}.progress.is-dark::-moz-progress-bar{background-color:#1c1c1c}.progress.is-dark::-ms-fill{background-color:#1c1c1c}.progress.is-dark:indeterminate{background-image:linear-gradient(to right,#1c1c1c 30%,#363636 30%)}.progress.is-primary::-webkit-progress-value{background-color:#009e86}.progress.is-primary::-moz-progress-bar{background-color:#009e86}.progress.is-primary::-ms-fill{background-color:#009e86}.progress.is-primary:indeterminate{background-image:linear-gradient(to right,#009e86 30%,#363636 30%)}.progress.is-link::-webkit-progress-value{background-color:#205bbc}.progress.is-link::-moz-progress-bar{background-color:#205bbc}.progress.is-link::-ms-fill{background-color:#205bbc}.progress.is-link:indeterminate{background-image:linear-gradient(to right,#205bbc 30%,#363636 
30%)}.progress.is-info::-webkit-progress-value{background-color:#0f81cc}.progress.is-info::-moz-progress-bar{background-color:#0f81cc}.progress.is-info::-ms-fill{background-color:#0f81cc}.progress.is-info:indeterminate{background-image:linear-gradient(to right,#0f81cc 30%,#363636 30%)}.progress.is-success::-webkit-progress-value{background-color:#1ca64c}.progress.is-success::-moz-progress-bar{background-color:#1ca64c}.progress.is-success::-ms-fill{background-color:#1ca64c}.progress.is-success:indeterminate{background-image:linear-gradient(to right,#1ca64c 30%,#363636 30%)}.progress.is-warning::-webkit-progress-value{background-color:#ffd324}.progress.is-warning::-moz-progress-bar{background-color:#ffd324}.progress.is-warning::-ms-fill{background-color:#ffd324}.progress.is-warning:indeterminate{background-image:linear-gradient(to right,#ffd324 30%,#363636 30%)}.progress.is-danger::-webkit-progress-value{background-color:#ff0537}.progress.is-danger::-moz-progress-bar{background-color:#ff0537}.progress.is-danger::-ms-fill{background-color:#ff0537}.progress.is-danger:indeterminate{background-image:linear-gradient(to right,#ff0537 30%,#363636 30%)}.progress.is-white-dark::-webkit-progress-value{background-color:#fff}.progress.is-white-dark::-moz-progress-bar{background-color:#fff}.progress.is-white-dark::-ms-fill{background-color:#fff}.progress.is-white-dark:indeterminate{background-image:linear-gradient(to right,#fff 30%,#363636 30%)}.progress.is-black-dark::-webkit-progress-value{background-color:#0a0a0a}.progress.is-black-dark::-moz-progress-bar{background-color:#0a0a0a}.progress.is-black-dark::-ms-fill{background-color:#0a0a0a}.progress.is-black-dark:indeterminate{background-image:linear-gradient(to right,#0a0a0a 30%,#363636 30%)}.progress.is-light-dark::-webkit-progress-value{background-color:#f5f5f5}.progress.is-light-dark::-moz-progress-bar{background-color:#f5f5f5}.progress.is-light-dark::-ms-fill{background-color:#f5f5f5}.progress.is-light-dark:indeterminate{background-image:linear-gradient(to right,#f5f5f5 30%,#363636 30%)}.progress.is-dark-dark::-webkit-progress-value{background-color:#363636}.progress.is-dark-dark::-moz-progress-bar{background-color:#363636}.progress.is-dark-dark::-ms-fill{background-color:#363636}.progress.is-dark-dark:indeterminate{background-image:linear-gradient(to right,#363636 30%,#363636 30%)}.progress.is-primary-dark::-webkit-progress-value{background-color:#00d1b2}.progress.is-primary-dark::-moz-progress-bar{background-color:#00d1b2}.progress.is-primary-dark::-ms-fill{background-color:#00d1b2}.progress.is-primary-dark:indeterminate{background-image:linear-gradient(to right,#00d1b2 30%,#363636 30%)}.progress.is-link-dark::-webkit-progress-value{background-color:#3273dc}.progress.is-link-dark::-moz-progress-bar{background-color:#3273dc}.progress.is-link-dark::-ms-fill{background-color:#3273dc}.progress.is-link-dark:indeterminate{background-image:linear-gradient(to right,#3273dc 30%,#363636 30%)}.progress.is-info-dark::-webkit-progress-value{background-color:#209cee}.progress.is-info-dark::-moz-progress-bar{background-color:#209cee}.progress.is-info-dark::-ms-fill{background-color:#209cee}.progress.is-info-dark:indeterminate{background-image:linear-gradient(to right,#209cee 30%,#363636 
30%)}.progress.is-success-dark::-webkit-progress-value{background-color:#23d160}.progress.is-success-dark::-moz-progress-bar{background-color:#23d160}.progress.is-success-dark::-ms-fill{background-color:#23d160}.progress.is-success-dark:indeterminate{background-image:linear-gradient(to right,#23d160 30%,#363636 30%)}.progress.is-warning-dark::-webkit-progress-value{background-color:#ffdd57}.progress.is-warning-dark::-moz-progress-bar{background-color:#ffdd57}.progress.is-warning-dark::-ms-fill{background-color:#ffdd57}.progress.is-warning-dark:indeterminate{background-image:linear-gradient(to right,#ffdd57 30%,#363636 30%)}.progress.is-danger-dark::-webkit-progress-value{background-color:#ff3860}.progress.is-danger-dark::-moz-progress-bar{background-color:#ff3860}.progress.is-danger-dark::-ms-fill{background-color:#ff3860}.progress.is-danger-dark:indeterminate{background-image:linear-gradient(to right,#ff3860 30%,#363636 30%)}.table{background-color:#0a0a0a;color:#dbdbdb}.table td,.table th{border:1px solid #363636}.table td.is-white,.table th.is-white{background-color:#e6e6e6;border-color:#e6e6e6;color:#0a0a0a}.table td.is-black,.table th.is-black{background-color:#000;border-color:#000;color:#fff}.table td.is-light,.table th.is-light{background-color:#dbdbdb;border-color:#dbdbdb;color:#363636}.table td.is-dark,.table th.is-dark{background-color:#1c1c1c;border-color:#1c1c1c;color:#f5f5f5}.table td.is-primary,.table th.is-primary{background-color:#009e86;border-color:#009e86;color:#fff}.table td.is-link,.table th.is-link{background-color:#205bbc;border-color:#205bbc;color:#fff}.table td.is-info,.table th.is-info{background-color:#0f81cc;border-color:#0f81cc;color:#fff}.table td.is-success,.table th.is-success{background-color:#1ca64c;border-color:#1ca64c;color:#fff}.table td.is-warning,.table th.is-warning{background-color:#ffd324;border-color:#ffd324;color:rgba(0,0,0,.7)}.table td.is-danger,.table th.is-danger{background-color:#ff0537;border-color:#ff0537;color:#fff}.table td.is-white-dark,.table th.is-white-dark{background-color:#fff;border-color:#fff;color:#0a0a0a}.table td.is-black-dark,.table th.is-black-dark{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.table td.is-light-dark,.table th.is-light-dark{background-color:#f5f5f5;border-color:#f5f5f5;color:#363636}.table td.is-dark-dark,.table th.is-dark-dark{background-color:#363636;border-color:#363636;color:#f5f5f5}.table td.is-primary-dark,.table th.is-primary-dark{background-color:#00d1b2;border-color:#00d1b2;color:#fff}.table td.is-link-dark,.table th.is-link-dark{background-color:#3273dc;border-color:#3273dc;color:#fff}.table td.is-info-dark,.table th.is-info-dark{background-color:#209cee;border-color:#209cee;color:#fff}.table td.is-success-dark,.table th.is-success-dark{background-color:#23d160;border-color:#23d160;color:#fff}.table td.is-warning-dark,.table th.is-warning-dark{background-color:#ffdd57;border-color:#ffdd57;color:rgba(0,0,0,.7)}.table td.is-danger-dark,.table th.is-danger-dark{background-color:#ff3860;border-color:#ff3860;color:#fff}.table td.is-selected,.table th.is-selected{background-color:#009e86;color:#e6e6e6}.table th{color:#dbdbdb}.table tr.is-selected{background-color:#009e86;color:#e6e6e6}.table tr.is-selected td,.table tr.is-selected th{border-color:#e6e6e6}.table thead td,.table thead th{color:#dbdbdb}.table tfoot td,.table tfoot th{color:#dbdbdb}.table.is-hoverable tbody tr:not(.is-selected):hover{background-color:#121212}.table.is-hoverable.is-striped tbody 
tr:not(.is-selected):hover{background-color:#121212}.table.is-hoverable.is-striped tbody tr:not(.is-selected):hover:nth-child(even){background-color:#242424}.table.is-striped tbody tr:not(.is-selected):nth-child(even){background-color:#121212}.tag:not(body){background-color:#242424;color:#b5b5b5}.tag:not(body).is-white{background-color:#e6e6e6;color:#0a0a0a}.tag:not(body).is-black{background-color:#000;color:#fff}.tag:not(body).is-light{background-color:#dbdbdb;color:#363636}.tag:not(body).is-dark{background-color:#1c1c1c;color:#f5f5f5}.tag:not(body).is-primary{background-color:#009e86;color:#fff}.tag:not(body).is-link{background-color:#205bbc;color:#fff}.tag:not(body).is-info{background-color:#0f81cc;color:#fff}.tag:not(body).is-success{background-color:#1ca64c;color:#fff}.tag:not(body).is-warning{background-color:#ffd324;color:rgba(0,0,0,.7)}.tag:not(body).is-danger{background-color:#ff0537;color:#fff}.tag:not(body).is-white-dark{background-color:#fff;color:#0a0a0a}.tag:not(body).is-black-dark{background-color:#0a0a0a;color:#fff}.tag:not(body).is-light-dark{background-color:#f5f5f5;color:#363636}.tag:not(body).is-dark-dark{background-color:#363636;color:#f5f5f5}.tag:not(body).is-primary-dark{background-color:#00d1b2;color:#fff}.tag:not(body).is-link-dark{background-color:#3273dc;color:#fff}.tag:not(body).is-info-dark{background-color:#209cee;color:#fff}.tag:not(body).is-success-dark{background-color:#23d160;color:#fff}.tag:not(body).is-warning-dark{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tag:not(body).is-danger-dark{background-color:#ff3860;color:#fff}.tag:not(body).is-delete:focus,.tag:not(body).is-delete:hover{background-color:#171717}.tag:not(body).is-delete:active{background-color:#0a0a0a}.title{color:#dbdbdb}.subtitle{color:#b5b5b5}.subtitle strong{color:#dbdbdb}.number{background-color:#242424}.breadcrumb a{color:#5ea3e4}.breadcrumb a:hover{color:#dbdbdb}.breadcrumb li.is-active a{color:#dbdbdb}.breadcrumb li+li::before{color:#4a4a4a}.card{background-color:#0a0a0a;box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1);color:#b5b5b5}.card-header{box-shadow:0 1px 2px rgba(255,255,255,.1)}.card-header-title{color:#dbdbdb}.card-footer{border-top:1px solid #363636}.card-footer-item:not(:last-child){border-right:1px solid #363636}.dropdown-content{background-color:#0a0a0a;box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1)}.dropdown-item{color:#b5b5b5}a.dropdown-item:hover,button.dropdown-item:hover{background-color:#242424;color:#fff}a.dropdown-item.is-active,button.dropdown-item.is-active{background-color:#5ea3e4;color:#fff}.dropdown-divider{background-color:#363636}.list{background-color:#0a0a0a;box-shadow:0 2px 3px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1)}.list-item:not(a){color:#b5b5b5}.list-item:not(:last-child){border-bottom:1px solid #363636}.list-item.is-active{background-color:#5ea3e4;color:#fff}a.list-item{background-color:#242424}.media .media{border-top:1px solid rgba(54,54,54,.5)}.media+.media{border-top:1px solid rgba(54,54,54,.5)}.menu-list a{color:#b5b5b5}.menu-list a:hover{background-color:#242424;color:#dbdbdb}.menu-list a.is-active{background-color:#5ea3e4;color:#fff}.menu-list li ul{border-left:1px solid #363636}.message{background-color:#242424}.message.is-white{background-color:#242424}.message.is-white .message-header{background-color:#fff;color:#0a0a0a}.message.is-white .message-body{border-color:#fff;color:#b5b5b5}.message.is-black{background-color:#242424}.message.is-black 
.message-header{background-color:#0a0a0a;color:#fff}.message.is-black .message-body{border-color:#0a0a0a;color:#b5b5b5}.message.is-light{background-color:#242424}.message.is-light .message-header{background-color:#f5f5f5;color:#363636}.message.is-light .message-body{border-color:#f5f5f5;color:#b5b5b5}.message.is-dark{background-color:#242424}.message.is-dark .message-header{background-color:#363636;color:#f5f5f5}.message.is-dark .message-body{border-color:#363636;color:#b5b5b5}.message.is-primary{background-color:#242424}.message.is-primary .message-header{background-color:#00d1b2;color:#fff}.message.is-primary .message-body{border-color:#00d1b2;color:#b5b5b5}.message.is-link{background-color:#242424}.message.is-link .message-header{background-color:#3273dc;color:#fff}.message.is-link .message-body{border-color:#3273dc;color:#b5b5b5}.message.is-info{background-color:#242424}.message.is-info .message-header{background-color:#209cee;color:#fff}.message.is-info .message-body{border-color:#209cee;color:#b5b5b5}.message.is-success{background-color:#242424}.message.is-success .message-header{background-color:#23d160;color:#fff}.message.is-success .message-body{border-color:#23d160;color:#b5b5b5}.message.is-warning{background-color:#242424}.message.is-warning .message-header{background-color:#ffdd57;color:rgba(0,0,0,.7)}.message.is-warning .message-body{border-color:#ffdd57;color:#b5b5b5}.message.is-danger{background-color:#242424}.message.is-danger .message-header{background-color:#ff3860;color:#fff}.message.is-danger .message-body{border-color:#ff3860;color:#b5b5b5}.message.is-white-dark{background-color:#242424}.message.is-white-dark .message-header{background-color:#fff;color:#0a0a0a}.message.is-white-dark .message-body{border-color:#fff;color:#b5b5b5}.message.is-black-dark{background-color:#242424}.message.is-black-dark .message-header{background-color:#0a0a0a;color:#fff}.message.is-black-dark .message-body{border-color:#0a0a0a;color:#b5b5b5}.message.is-light-dark{background-color:#242424}.message.is-light-dark .message-header{background-color:#f5f5f5;color:#363636}.message.is-light-dark .message-body{border-color:#f5f5f5;color:#b5b5b5}.message.is-dark-dark{background-color:#242424}.message.is-dark-dark .message-header{background-color:#363636;color:#f5f5f5}.message.is-dark-dark .message-body{border-color:#363636;color:#b5b5b5}.message.is-primary-dark{background-color:#242424}.message.is-primary-dark .message-header{background-color:#00d1b2;color:#fff}.message.is-primary-dark .message-body{border-color:#00d1b2;color:#b5b5b5}.message.is-link-dark{background-color:#242424}.message.is-link-dark .message-header{background-color:#3273dc;color:#fff}.message.is-link-dark .message-body{border-color:#3273dc;color:#b5b5b5}.message.is-info-dark{background-color:#242424}.message.is-info-dark .message-header{background-color:#209cee;color:#fff}.message.is-info-dark .message-body{border-color:#209cee;color:#b5b5b5}.message.is-success-dark{background-color:#242424}.message.is-success-dark .message-header{background-color:#23d160;color:#fff}.message.is-success-dark .message-body{border-color:#23d160;color:#b5b5b5}.message.is-warning-dark{background-color:#242424}.message.is-warning-dark .message-header{background-color:#ffdd57;color:rgba(0,0,0,.7)}.message.is-warning-dark .message-body{border-color:#ffdd57;color:#b5b5b5}.message.is-danger-dark{background-color:#242424}.message.is-danger-dark .message-header{background-color:#ff3860;color:#fff}.message.is-danger-dark 
.message-body{border-color:#ff3860;color:#b5b5b5}.message-header{background-color:#b5b5b5;color:#fff}.message-body{border-color:#363636;color:#b5b5b5}.message-body code,.message-body pre{background-color:#0a0a0a}.modal-background{background-color:rgba(255,255,255,.86)}.modal-card-foot,.modal-card-head{background-color:#242424}.modal-card-head{border-bottom:1px solid #363636}.modal-card-title{color:#dbdbdb}.modal-card-foot{border-top:1px solid #363636}.modal-card-body{-webkit-overflow-scrolling:touch;background-color:#fff}.navbar{background-color:#17181c}.navbar.is-white{background-color:#e6e6e6;color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link,.navbar.is-white .navbar-brand>.navbar-item{color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link.is-active,.navbar.is-white .navbar-brand .navbar-link:hover,.navbar.is-white .navbar-brand>a.navbar-item.is-active,.navbar.is-white .navbar-brand>a.navbar-item:hover{background-color:#d9d9d9;color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link::after{border-color:#0a0a0a}.navbar.is-white .navbar-burger{color:#0a0a0a}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-white .navbar-end .navbar-link,.navbar.is-white .navbar-end>.navbar-item,.navbar.is-white .navbar-start .navbar-link,.navbar.is-white .navbar-start>.navbar-item{color:#0a0a0a}.navbar.is-white .navbar-end .navbar-link.is-active,.navbar.is-white .navbar-end .navbar-link:hover,.navbar.is-white .navbar-end>a.navbar-item.is-active,.navbar.is-white .navbar-end>a.navbar-item:hover,.navbar.is-white .navbar-start .navbar-link.is-active,.navbar.is-white .navbar-start .navbar-link:hover,.navbar.is-white .navbar-start>a.navbar-item.is-active,.navbar.is-white .navbar-start>a.navbar-item:hover{background-color:#d9d9d9;color:#0a0a0a}.navbar.is-white .navbar-end .navbar-link::after,.navbar.is-white .navbar-start .navbar-link::after{border-color:#0a0a0a}.navbar.is-white .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-white .navbar-item.has-dropdown:hover .navbar-link{background-color:#d9d9d9;color:#0a0a0a}.navbar.is-white .navbar-dropdown a.navbar-item.is-active{background-color:#e6e6e6;color:#0a0a0a}}@media (prefers-color-scheme:dark){.navbar.is-black{background-color:#000;color:#fff}.navbar.is-black .navbar-brand .navbar-link,.navbar.is-black .navbar-brand>.navbar-item{color:#fff}.navbar.is-black .navbar-brand .navbar-link.is-active,.navbar.is-black .navbar-brand .navbar-link:hover,.navbar.is-black .navbar-brand>a.navbar-item.is-active,.navbar.is-black .navbar-brand>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-black .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-black .navbar-end .navbar-link,.navbar.is-black .navbar-end>.navbar-item,.navbar.is-black .navbar-start .navbar-link,.navbar.is-black .navbar-start>.navbar-item{color:#fff}.navbar.is-black .navbar-end .navbar-link.is-active,.navbar.is-black .navbar-end .navbar-link:hover,.navbar.is-black .navbar-end>a.navbar-item.is-active,.navbar.is-black .navbar-end>a.navbar-item:hover,.navbar.is-black .navbar-start .navbar-link.is-active,.navbar.is-black .navbar-start .navbar-link:hover,.navbar.is-black .navbar-start>a.navbar-item.is-active,.navbar.is-black .navbar-start>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black .navbar-end .navbar-link::after,.navbar.is-black .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-black 
.navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-black .navbar-item.has-dropdown:hover .navbar-link{background-color:#000;color:#fff}.navbar.is-black .navbar-dropdown a.navbar-item.is-active{background-color:#000;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-light{background-color:#dbdbdb;color:#363636}.navbar.is-light .navbar-brand .navbar-link,.navbar.is-light .navbar-brand>.navbar-item{color:#363636}.navbar.is-light .navbar-brand .navbar-link.is-active,.navbar.is-light .navbar-brand .navbar-link:hover,.navbar.is-light .navbar-brand>a.navbar-item.is-active,.navbar.is-light .navbar-brand>a.navbar-item:hover{background-color:#cfcfcf;color:#363636}.navbar.is-light .navbar-brand .navbar-link::after{border-color:#363636}.navbar.is-light .navbar-burger{color:#363636}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-light .navbar-end .navbar-link,.navbar.is-light .navbar-end>.navbar-item,.navbar.is-light .navbar-start .navbar-link,.navbar.is-light .navbar-start>.navbar-item{color:#363636}.navbar.is-light .navbar-end .navbar-link.is-active,.navbar.is-light .navbar-end .navbar-link:hover,.navbar.is-light .navbar-end>a.navbar-item.is-active,.navbar.is-light .navbar-end>a.navbar-item:hover,.navbar.is-light .navbar-start .navbar-link.is-active,.navbar.is-light .navbar-start .navbar-link:hover,.navbar.is-light .navbar-start>a.navbar-item.is-active,.navbar.is-light .navbar-start>a.navbar-item:hover{background-color:#cfcfcf;color:#363636}.navbar.is-light .navbar-end .navbar-link::after,.navbar.is-light .navbar-start .navbar-link::after{border-color:#363636}.navbar.is-light .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-light .navbar-item.has-dropdown:hover .navbar-link{background-color:#cfcfcf;color:#363636}.navbar.is-light .navbar-dropdown a.navbar-item.is-active{background-color:#dbdbdb;color:#363636}}@media (prefers-color-scheme:dark){.navbar.is-dark{background-color:#1c1c1c;color:#f5f5f5}.navbar.is-dark .navbar-brand .navbar-link,.navbar.is-dark .navbar-brand>.navbar-item{color:#f5f5f5}.navbar.is-dark .navbar-brand .navbar-link.is-active,.navbar.is-dark .navbar-brand .navbar-link:hover,.navbar.is-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-dark .navbar-brand>a.navbar-item:hover{background-color:#0f0f0f;color:#f5f5f5}.navbar.is-dark .navbar-brand .navbar-link::after{border-color:#f5f5f5}.navbar.is-dark .navbar-burger{color:#f5f5f5}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-dark .navbar-end .navbar-link,.navbar.is-dark .navbar-end>.navbar-item,.navbar.is-dark .navbar-start .navbar-link,.navbar.is-dark .navbar-start>.navbar-item{color:#f5f5f5}.navbar.is-dark .navbar-end .navbar-link.is-active,.navbar.is-dark .navbar-end .navbar-link:hover,.navbar.is-dark .navbar-end>a.navbar-item.is-active,.navbar.is-dark .navbar-end>a.navbar-item:hover,.navbar.is-dark .navbar-start .navbar-link.is-active,.navbar.is-dark .navbar-start .navbar-link:hover,.navbar.is-dark .navbar-start>a.navbar-item.is-active,.navbar.is-dark .navbar-start>a.navbar-item:hover{background-color:#0f0f0f;color:#f5f5f5}.navbar.is-dark .navbar-end .navbar-link::after,.navbar.is-dark .navbar-start .navbar-link::after{border-color:#f5f5f5}.navbar.is-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#0f0f0f;color:#f5f5f5}.navbar.is-dark .navbar-dropdown a.navbar-item.is-active{background-color:#1c1c1c;color:#f5f5f5}}@media 
(prefers-color-scheme:dark){.navbar.is-primary{background-color:#009e86;color:#fff}.navbar.is-primary .navbar-brand .navbar-link,.navbar.is-primary .navbar-brand>.navbar-item{color:#fff}.navbar.is-primary .navbar-brand .navbar-link.is-active,.navbar.is-primary .navbar-brand .navbar-link:hover,.navbar.is-primary .navbar-brand>a.navbar-item.is-active,.navbar.is-primary .navbar-brand>a.navbar-item:hover{background-color:#008571;color:#fff}.navbar.is-primary .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-primary .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-primary .navbar-end .navbar-link,.navbar.is-primary .navbar-end>.navbar-item,.navbar.is-primary .navbar-start .navbar-link,.navbar.is-primary .navbar-start>.navbar-item{color:#fff}.navbar.is-primary .navbar-end .navbar-link.is-active,.navbar.is-primary .navbar-end .navbar-link:hover,.navbar.is-primary .navbar-end>a.navbar-item.is-active,.navbar.is-primary .navbar-end>a.navbar-item:hover,.navbar.is-primary .navbar-start .navbar-link.is-active,.navbar.is-primary .navbar-start .navbar-link:hover,.navbar.is-primary .navbar-start>a.navbar-item.is-active,.navbar.is-primary .navbar-start>a.navbar-item:hover{background-color:#008571;color:#fff}.navbar.is-primary .navbar-end .navbar-link::after,.navbar.is-primary .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-primary .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-primary .navbar-item.has-dropdown:hover .navbar-link{background-color:#008571;color:#fff}.navbar.is-primary .navbar-dropdown a.navbar-item.is-active{background-color:#009e86;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-link{background-color:#205bbc;color:#fff}.navbar.is-link .navbar-brand .navbar-link,.navbar.is-link .navbar-brand>.navbar-item{color:#fff}.navbar.is-link .navbar-brand .navbar-link.is-active,.navbar.is-link .navbar-brand .navbar-link:hover,.navbar.is-link .navbar-brand>a.navbar-item.is-active,.navbar.is-link .navbar-brand>a.navbar-item:hover{background-color:#1c51a6;color:#fff}.navbar.is-link .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-link .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-link .navbar-end .navbar-link,.navbar.is-link .navbar-end>.navbar-item,.navbar.is-link .navbar-start .navbar-link,.navbar.is-link .navbar-start>.navbar-item{color:#fff}.navbar.is-link .navbar-end .navbar-link.is-active,.navbar.is-link .navbar-end .navbar-link:hover,.navbar.is-link .navbar-end>a.navbar-item.is-active,.navbar.is-link .navbar-end>a.navbar-item:hover,.navbar.is-link .navbar-start .navbar-link.is-active,.navbar.is-link .navbar-start .navbar-link:hover,.navbar.is-link .navbar-start>a.navbar-item.is-active,.navbar.is-link .navbar-start>a.navbar-item:hover{background-color:#1c51a6;color:#fff}.navbar.is-link .navbar-end .navbar-link::after,.navbar.is-link .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-link .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-link .navbar-item.has-dropdown:hover .navbar-link{background-color:#1c51a6;color:#fff}.navbar.is-link .navbar-dropdown a.navbar-item.is-active{background-color:#205bbc;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-info{background-color:#0f81cc;color:#fff}.navbar.is-info .navbar-brand .navbar-link,.navbar.is-info .navbar-brand>.navbar-item{color:#fff}.navbar.is-info .navbar-brand .navbar-link.is-active,.navbar.is-info .navbar-brand 
.navbar-link:hover,.navbar.is-info .navbar-brand>a.navbar-item.is-active,.navbar.is-info .navbar-brand>a.navbar-item:hover{background-color:#0e72b4;color:#fff}.navbar.is-info .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-info .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-info .navbar-end .navbar-link,.navbar.is-info .navbar-end>.navbar-item,.navbar.is-info .navbar-start .navbar-link,.navbar.is-info .navbar-start>.navbar-item{color:#fff}.navbar.is-info .navbar-end .navbar-link.is-active,.navbar.is-info .navbar-end .navbar-link:hover,.navbar.is-info .navbar-end>a.navbar-item.is-active,.navbar.is-info .navbar-end>a.navbar-item:hover,.navbar.is-info .navbar-start .navbar-link.is-active,.navbar.is-info .navbar-start .navbar-link:hover,.navbar.is-info .navbar-start>a.navbar-item.is-active,.navbar.is-info .navbar-start>a.navbar-item:hover{background-color:#0e72b4;color:#fff}.navbar.is-info .navbar-end .navbar-link::after,.navbar.is-info .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-info .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-info .navbar-item.has-dropdown:hover .navbar-link{background-color:#0e72b4;color:#fff}.navbar.is-info .navbar-dropdown a.navbar-item.is-active{background-color:#0f81cc;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-success{background-color:#1ca64c;color:#fff}.navbar.is-success .navbar-brand .navbar-link,.navbar.is-success .navbar-brand>.navbar-item{color:#fff}.navbar.is-success .navbar-brand .navbar-link.is-active,.navbar.is-success .navbar-brand .navbar-link:hover,.navbar.is-success .navbar-brand>a.navbar-item.is-active,.navbar.is-success .navbar-brand>a.navbar-item:hover{background-color:#189042;color:#fff}.navbar.is-success .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-success .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-success .navbar-end .navbar-link,.navbar.is-success .navbar-end>.navbar-item,.navbar.is-success .navbar-start .navbar-link,.navbar.is-success .navbar-start>.navbar-item{color:#fff}.navbar.is-success .navbar-end .navbar-link.is-active,.navbar.is-success .navbar-end .navbar-link:hover,.navbar.is-success .navbar-end>a.navbar-item.is-active,.navbar.is-success .navbar-end>a.navbar-item:hover,.navbar.is-success .navbar-start .navbar-link.is-active,.navbar.is-success .navbar-start .navbar-link:hover,.navbar.is-success .navbar-start>a.navbar-item.is-active,.navbar.is-success .navbar-start>a.navbar-item:hover{background-color:#189042;color:#fff}.navbar.is-success .navbar-end .navbar-link::after,.navbar.is-success .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-success .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-success .navbar-item.has-dropdown:hover .navbar-link{background-color:#189042;color:#fff}.navbar.is-success .navbar-dropdown a.navbar-item.is-active{background-color:#1ca64c;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-warning{background-color:#ffd324;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link,.navbar.is-warning .navbar-brand>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link.is-active,.navbar.is-warning .navbar-brand .navbar-link:hover,.navbar.is-warning .navbar-brand>a.navbar-item.is-active,.navbar.is-warning .navbar-brand>a.navbar-item:hover{background-color:#ffce0a;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand 
.navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-burger{color:rgba(0,0,0,.7)}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-warning .navbar-end .navbar-link,.navbar.is-warning .navbar-end>.navbar-item,.navbar.is-warning .navbar-start .navbar-link,.navbar.is-warning .navbar-start>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-end .navbar-link.is-active,.navbar.is-warning .navbar-end .navbar-link:hover,.navbar.is-warning .navbar-end>a.navbar-item.is-active,.navbar.is-warning .navbar-end>a.navbar-item:hover,.navbar.is-warning .navbar-start .navbar-link.is-active,.navbar.is-warning .navbar-start .navbar-link:hover,.navbar.is-warning .navbar-start>a.navbar-item.is-active,.navbar.is-warning .navbar-start>a.navbar-item:hover{background-color:#ffce0a;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-end .navbar-link::after,.navbar.is-warning .navbar-start .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-warning .navbar-item.has-dropdown:hover .navbar-link{background-color:#ffce0a;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-dropdown a.navbar-item.is-active{background-color:#ffd324;color:rgba(0,0,0,.7)}}@media (prefers-color-scheme:dark){.navbar.is-danger{background-color:#ff0537;color:#fff}.navbar.is-danger .navbar-brand .navbar-link,.navbar.is-danger .navbar-brand>.navbar-item{color:#fff}.navbar.is-danger .navbar-brand .navbar-link.is-active,.navbar.is-danger .navbar-brand .navbar-link:hover,.navbar.is-danger .navbar-brand>a.navbar-item.is-active,.navbar.is-danger .navbar-brand>a.navbar-item:hover{background-color:#eb002f;color:#fff}.navbar.is-danger .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-danger .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-danger .navbar-end .navbar-link,.navbar.is-danger .navbar-end>.navbar-item,.navbar.is-danger .navbar-start .navbar-link,.navbar.is-danger .navbar-start>.navbar-item{color:#fff}.navbar.is-danger .navbar-end .navbar-link.is-active,.navbar.is-danger .navbar-end .navbar-link:hover,.navbar.is-danger .navbar-end>a.navbar-item.is-active,.navbar.is-danger .navbar-end>a.navbar-item:hover,.navbar.is-danger .navbar-start .navbar-link.is-active,.navbar.is-danger .navbar-start .navbar-link:hover,.navbar.is-danger .navbar-start>a.navbar-item.is-active,.navbar.is-danger .navbar-start>a.navbar-item:hover{background-color:#eb002f;color:#fff}.navbar.is-danger .navbar-end .navbar-link::after,.navbar.is-danger .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-danger .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-danger .navbar-item.has-dropdown:hover .navbar-link{background-color:#eb002f;color:#fff}.navbar.is-danger .navbar-dropdown a.navbar-item.is-active{background-color:#ff0537;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-white-dark{background-color:#fff;color:#0a0a0a}.navbar.is-white-dark .navbar-brand .navbar-link,.navbar.is-white-dark .navbar-brand>.navbar-item{color:#0a0a0a}.navbar.is-white-dark .navbar-brand .navbar-link.is-active,.navbar.is-white-dark .navbar-brand .navbar-link:hover,.navbar.is-white-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-white-dark .navbar-brand>a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white-dark .navbar-brand .navbar-link::after{border-color:#0a0a0a}.navbar.is-white-dark .navbar-burger{color:#0a0a0a}}@media screen and 
(prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-white-dark .navbar-end .navbar-link,.navbar.is-white-dark .navbar-end>.navbar-item,.navbar.is-white-dark .navbar-start .navbar-link,.navbar.is-white-dark .navbar-start>.navbar-item{color:#0a0a0a}.navbar.is-white-dark .navbar-end .navbar-link.is-active,.navbar.is-white-dark .navbar-end .navbar-link:hover,.navbar.is-white-dark .navbar-end>a.navbar-item.is-active,.navbar.is-white-dark .navbar-end>a.navbar-item:hover,.navbar.is-white-dark .navbar-start .navbar-link.is-active,.navbar.is-white-dark .navbar-start .navbar-link:hover,.navbar.is-white-dark .navbar-start>a.navbar-item.is-active,.navbar.is-white-dark .navbar-start>a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white-dark .navbar-end .navbar-link::after,.navbar.is-white-dark .navbar-start .navbar-link::after{border-color:#0a0a0a}.navbar.is-white-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-white-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white-dark .navbar-dropdown a.navbar-item.is-active{background-color:#fff;color:#0a0a0a}}@media (prefers-color-scheme:dark){.navbar.is-black-dark{background-color:#0a0a0a;color:#fff}.navbar.is-black-dark .navbar-brand .navbar-link,.navbar.is-black-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-black-dark .navbar-brand .navbar-link.is-active,.navbar.is-black-dark .navbar-brand .navbar-link:hover,.navbar.is-black-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-black-dark .navbar-brand>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-black-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-black-dark .navbar-end .navbar-link,.navbar.is-black-dark .navbar-end>.navbar-item,.navbar.is-black-dark .navbar-start .navbar-link,.navbar.is-black-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-black-dark .navbar-end .navbar-link.is-active,.navbar.is-black-dark .navbar-end .navbar-link:hover,.navbar.is-black-dark .navbar-end>a.navbar-item.is-active,.navbar.is-black-dark .navbar-end>a.navbar-item:hover,.navbar.is-black-dark .navbar-start .navbar-link.is-active,.navbar.is-black-dark .navbar-start .navbar-link:hover,.navbar.is-black-dark .navbar-start>a.navbar-item.is-active,.navbar.is-black-dark .navbar-start>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black-dark .navbar-end .navbar-link::after,.navbar.is-black-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-black-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-black-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#000;color:#fff}.navbar.is-black-dark .navbar-dropdown a.navbar-item.is-active{background-color:#0a0a0a;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-light-dark{background-color:#f5f5f5;color:#363636}.navbar.is-light-dark .navbar-brand .navbar-link,.navbar.is-light-dark .navbar-brand>.navbar-item{color:#363636}.navbar.is-light-dark .navbar-brand .navbar-link.is-active,.navbar.is-light-dark .navbar-brand .navbar-link:hover,.navbar.is-light-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-light-dark .navbar-brand>a.navbar-item:hover{background-color:#e8e8e8;color:#363636}.navbar.is-light-dark .navbar-brand .navbar-link::after{border-color:#363636}.navbar.is-light-dark .navbar-burger{color:#363636}}@media screen and (prefers-color-scheme:dark) 
and (min-width:1088px){.navbar.is-light-dark .navbar-end .navbar-link,.navbar.is-light-dark .navbar-end>.navbar-item,.navbar.is-light-dark .navbar-start .navbar-link,.navbar.is-light-dark .navbar-start>.navbar-item{color:#363636}.navbar.is-light-dark .navbar-end .navbar-link.is-active,.navbar.is-light-dark .navbar-end .navbar-link:hover,.navbar.is-light-dark .navbar-end>a.navbar-item.is-active,.navbar.is-light-dark .navbar-end>a.navbar-item:hover,.navbar.is-light-dark .navbar-start .navbar-link.is-active,.navbar.is-light-dark .navbar-start .navbar-link:hover,.navbar.is-light-dark .navbar-start>a.navbar-item.is-active,.navbar.is-light-dark .navbar-start>a.navbar-item:hover{background-color:#e8e8e8;color:#363636}.navbar.is-light-dark .navbar-end .navbar-link::after,.navbar.is-light-dark .navbar-start .navbar-link::after{border-color:#363636}.navbar.is-light-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-light-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#e8e8e8;color:#363636}.navbar.is-light-dark .navbar-dropdown a.navbar-item.is-active{background-color:#f5f5f5;color:#363636}}@media (prefers-color-scheme:dark){.navbar.is-dark-dark{background-color:#363636;color:#f5f5f5}.navbar.is-dark-dark .navbar-brand .navbar-link,.navbar.is-dark-dark .navbar-brand>.navbar-item{color:#f5f5f5}.navbar.is-dark-dark .navbar-brand .navbar-link.is-active,.navbar.is-dark-dark .navbar-brand .navbar-link:hover,.navbar.is-dark-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-dark-dark .navbar-brand>a.navbar-item:hover{background-color:#292929;color:#f5f5f5}.navbar.is-dark-dark .navbar-brand .navbar-link::after{border-color:#f5f5f5}.navbar.is-dark-dark .navbar-burger{color:#f5f5f5}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-dark-dark .navbar-end .navbar-link,.navbar.is-dark-dark .navbar-end>.navbar-item,.navbar.is-dark-dark .navbar-start .navbar-link,.navbar.is-dark-dark .navbar-start>.navbar-item{color:#f5f5f5}.navbar.is-dark-dark .navbar-end .navbar-link.is-active,.navbar.is-dark-dark .navbar-end .navbar-link:hover,.navbar.is-dark-dark .navbar-end>a.navbar-item.is-active,.navbar.is-dark-dark .navbar-end>a.navbar-item:hover,.navbar.is-dark-dark .navbar-start .navbar-link.is-active,.navbar.is-dark-dark .navbar-start .navbar-link:hover,.navbar.is-dark-dark .navbar-start>a.navbar-item.is-active,.navbar.is-dark-dark .navbar-start>a.navbar-item:hover{background-color:#292929;color:#f5f5f5}.navbar.is-dark-dark .navbar-end .navbar-link::after,.navbar.is-dark-dark .navbar-start .navbar-link::after{border-color:#f5f5f5}.navbar.is-dark-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-dark-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#292929;color:#f5f5f5}.navbar.is-dark-dark .navbar-dropdown a.navbar-item.is-active{background-color:#363636;color:#f5f5f5}}@media (prefers-color-scheme:dark){.navbar.is-primary-dark{background-color:#00d1b2;color:#fff}.navbar.is-primary-dark .navbar-brand .navbar-link,.navbar.is-primary-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-primary-dark .navbar-brand .navbar-link.is-active,.navbar.is-primary-dark .navbar-brand .navbar-link:hover,.navbar.is-primary-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-primary-dark .navbar-brand>a.navbar-item:hover{background-color:#00b89c;color:#fff}.navbar.is-primary-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-primary-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and 
(min-width:1088px){.navbar.is-primary-dark .navbar-end .navbar-link,.navbar.is-primary-dark .navbar-end>.navbar-item,.navbar.is-primary-dark .navbar-start .navbar-link,.navbar.is-primary-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-primary-dark .navbar-end .navbar-link.is-active,.navbar.is-primary-dark .navbar-end .navbar-link:hover,.navbar.is-primary-dark .navbar-end>a.navbar-item.is-active,.navbar.is-primary-dark .navbar-end>a.navbar-item:hover,.navbar.is-primary-dark .navbar-start .navbar-link.is-active,.navbar.is-primary-dark .navbar-start .navbar-link:hover,.navbar.is-primary-dark .navbar-start>a.navbar-item.is-active,.navbar.is-primary-dark .navbar-start>a.navbar-item:hover{background-color:#00b89c;color:#fff}.navbar.is-primary-dark .navbar-end .navbar-link::after,.navbar.is-primary-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-primary-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-primary-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#00b89c;color:#fff}.navbar.is-primary-dark .navbar-dropdown a.navbar-item.is-active{background-color:#00d1b2;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-link-dark{background-color:#3273dc;color:#fff}.navbar.is-link-dark .navbar-brand .navbar-link,.navbar.is-link-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-link-dark .navbar-brand .navbar-link.is-active,.navbar.is-link-dark .navbar-brand .navbar-link:hover,.navbar.is-link-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-link-dark .navbar-brand>a.navbar-item:hover{background-color:#2366d1;color:#fff}.navbar.is-link-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-link-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-link-dark .navbar-end .navbar-link,.navbar.is-link-dark .navbar-end>.navbar-item,.navbar.is-link-dark .navbar-start .navbar-link,.navbar.is-link-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-link-dark .navbar-end .navbar-link.is-active,.navbar.is-link-dark .navbar-end .navbar-link:hover,.navbar.is-link-dark .navbar-end>a.navbar-item.is-active,.navbar.is-link-dark .navbar-end>a.navbar-item:hover,.navbar.is-link-dark .navbar-start .navbar-link.is-active,.navbar.is-link-dark .navbar-start .navbar-link:hover,.navbar.is-link-dark .navbar-start>a.navbar-item.is-active,.navbar.is-link-dark .navbar-start>a.navbar-item:hover{background-color:#2366d1;color:#fff}.navbar.is-link-dark .navbar-end .navbar-link::after,.navbar.is-link-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-link-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-link-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#2366d1;color:#fff}.navbar.is-link-dark .navbar-dropdown a.navbar-item.is-active{background-color:#3273dc;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-info-dark{background-color:#209cee;color:#fff}.navbar.is-info-dark .navbar-brand .navbar-link,.navbar.is-info-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-info-dark .navbar-brand .navbar-link.is-active,.navbar.is-info-dark .navbar-brand .navbar-link:hover,.navbar.is-info-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-info-dark .navbar-brand>a.navbar-item:hover{background-color:#118fe4;color:#fff}.navbar.is-info-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-info-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-info-dark 
.navbar-end .navbar-link,.navbar.is-info-dark .navbar-end>.navbar-item,.navbar.is-info-dark .navbar-start .navbar-link,.navbar.is-info-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-info-dark .navbar-end .navbar-link.is-active,.navbar.is-info-dark .navbar-end .navbar-link:hover,.navbar.is-info-dark .navbar-end>a.navbar-item.is-active,.navbar.is-info-dark .navbar-end>a.navbar-item:hover,.navbar.is-info-dark .navbar-start .navbar-link.is-active,.navbar.is-info-dark .navbar-start .navbar-link:hover,.navbar.is-info-dark .navbar-start>a.navbar-item.is-active,.navbar.is-info-dark .navbar-start>a.navbar-item:hover{background-color:#118fe4;color:#fff}.navbar.is-info-dark .navbar-end .navbar-link::after,.navbar.is-info-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-info-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-info-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#118fe4;color:#fff}.navbar.is-info-dark .navbar-dropdown a.navbar-item.is-active{background-color:#209cee;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-success-dark{background-color:#23d160;color:#fff}.navbar.is-success-dark .navbar-brand .navbar-link,.navbar.is-success-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-success-dark .navbar-brand .navbar-link.is-active,.navbar.is-success-dark .navbar-brand .navbar-link:hover,.navbar.is-success-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-success-dark .navbar-brand>a.navbar-item:hover{background-color:#20bc56;color:#fff}.navbar.is-success-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-success-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-success-dark .navbar-end .navbar-link,.navbar.is-success-dark .navbar-end>.navbar-item,.navbar.is-success-dark .navbar-start .navbar-link,.navbar.is-success-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-success-dark .navbar-end .navbar-link.is-active,.navbar.is-success-dark .navbar-end .navbar-link:hover,.navbar.is-success-dark .navbar-end>a.navbar-item.is-active,.navbar.is-success-dark .navbar-end>a.navbar-item:hover,.navbar.is-success-dark .navbar-start .navbar-link.is-active,.navbar.is-success-dark .navbar-start .navbar-link:hover,.navbar.is-success-dark .navbar-start>a.navbar-item.is-active,.navbar.is-success-dark .navbar-start>a.navbar-item:hover{background-color:#20bc56;color:#fff}.navbar.is-success-dark .navbar-end .navbar-link::after,.navbar.is-success-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-success-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-success-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#20bc56;color:#fff}.navbar.is-success-dark .navbar-dropdown a.navbar-item.is-active{background-color:#23d160;color:#fff}}@media (prefers-color-scheme:dark){.navbar.is-warning-dark{background-color:#ffdd57;color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-brand .navbar-link,.navbar.is-warning-dark .navbar-brand>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-brand .navbar-link.is-active,.navbar.is-warning-dark .navbar-brand .navbar-link:hover,.navbar.is-warning-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-warning-dark .navbar-brand>a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-brand .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-burger{color:rgba(0,0,0,.7)}}@media screen and 
(prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-warning-dark .navbar-end .navbar-link,.navbar.is-warning-dark .navbar-end>.navbar-item,.navbar.is-warning-dark .navbar-start .navbar-link,.navbar.is-warning-dark .navbar-start>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-end .navbar-link.is-active,.navbar.is-warning-dark .navbar-end .navbar-link:hover,.navbar.is-warning-dark .navbar-end>a.navbar-item.is-active,.navbar.is-warning-dark .navbar-end>a.navbar-item:hover,.navbar.is-warning-dark .navbar-start .navbar-link.is-active,.navbar.is-warning-dark .navbar-start .navbar-link:hover,.navbar.is-warning-dark .navbar-start>a.navbar-item.is-active,.navbar.is-warning-dark .navbar-start>a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-end .navbar-link::after,.navbar.is-warning-dark .navbar-start .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-warning-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning-dark .navbar-dropdown a.navbar-item.is-active{background-color:#ffdd57;color:rgba(0,0,0,.7)}}@media (prefers-color-scheme:dark){.navbar.is-danger-dark{background-color:#ff3860;color:#fff}.navbar.is-danger-dark .navbar-brand .navbar-link,.navbar.is-danger-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-danger-dark .navbar-brand .navbar-link.is-active,.navbar.is-danger-dark .navbar-brand .navbar-link:hover,.navbar.is-danger-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-danger-dark .navbar-brand>a.navbar-item:hover{background-color:#ff1f4b;color:#fff}.navbar.is-danger-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-danger-dark .navbar-burger{color:#fff}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-danger-dark .navbar-end .navbar-link,.navbar.is-danger-dark .navbar-end>.navbar-item,.navbar.is-danger-dark .navbar-start .navbar-link,.navbar.is-danger-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-danger-dark .navbar-end .navbar-link.is-active,.navbar.is-danger-dark .navbar-end .navbar-link:hover,.navbar.is-danger-dark .navbar-end>a.navbar-item.is-active,.navbar.is-danger-dark .navbar-end>a.navbar-item:hover,.navbar.is-danger-dark .navbar-start .navbar-link.is-active,.navbar.is-danger-dark .navbar-start .navbar-link:hover,.navbar.is-danger-dark .navbar-start>a.navbar-item.is-active,.navbar.is-danger-dark .navbar-start>a.navbar-item:hover{background-color:#ff1f4b;color:#fff}.navbar.is-danger-dark .navbar-end .navbar-link::after,.navbar.is-danger-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-danger-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-danger-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#ff1f4b;color:#fff}.navbar.is-danger-dark .navbar-dropdown a.navbar-item.is-active{background-color:#ff3860;color:#fff}}@media (prefers-color-scheme:dark){.navbar.has-shadow{box-shadow:0 2px 0 0 #242424}.navbar.is-fixed-bottom.has-shadow{box-shadow:0 -2px 0 0 
#242424}.navbar-burger{color:#b5b5b5}.navbar-item,.navbar-link{color:#b5b5b5}.navbar-link.is-active,.navbar-link:hover,a.navbar-item.is-active,a.navbar-item:hover{background-color:#121212;color:#5ea3e4}.navbar-item:hover{border-bottom-color:#5ea3e4}.navbar-item.is-active{border-bottom-color:#5ea3e4;color:#5ea3e4}.navbar-link:not(.is-arrowless)::after{border-color:#5ea3e4}.navbar-divider{background-color:#242424}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.navbar-menu{background-color:#17181c;box-shadow:0 8px 16px rgba(255,255,255,.1)}.navbar.is-fixed-bottom-touch.has-shadow{box-shadow:0 -2px 3px rgba(255,255,255,.1)}}@media screen and (prefers-color-scheme:dark) and (min-width:1088px){.navbar.is-transparent .navbar-dropdown a.navbar-item:hover{background-color:#242424;color:#fff}.navbar.is-transparent .navbar-dropdown a.navbar-item.is-active{background-color:#242424;color:#5ea3e4}.navbar-item.has-dropdown-up .navbar-dropdown{border-bottom:2px solid #363636;box-shadow:0 -8px 8px rgba(255,255,255,.1)}.navbar-dropdown{background-color:#0a0a0a;border-top:2px solid #363636;box-shadow:0 8px 8px rgba(255,255,255,.1)}.navbar-dropdown a.navbar-item:hover{background-color:#242424;color:#fff}.navbar-dropdown a.navbar-item.is-active{background-color:#242424;color:#5ea3e4}.navbar-dropdown.is-boxed,.navbar.is-spaced .navbar-dropdown{box-shadow:0 8px 8px rgba(255,255,255,.1),0 0 0 1px rgba(255,255,255,.1)}.navbar.is-fixed-bottom-desktop.has-shadow{box-shadow:0 -2px 3px rgba(255,255,255,.1)}.navbar-link.is-active,a.navbar-item.is-active{color:#fff}.navbar-item.has-dropdown.is-active .navbar-link,.navbar-item.has-dropdown:hover .navbar-link{background-color:#121212}}@media (prefers-color-scheme:dark){.pagination-link,.pagination-next,.pagination-previous{border-color:#363636;color:#dbdbdb}.pagination-link:hover,.pagination-next:hover,.pagination-previous:hover{border-color:#4a4a4a;color:#dbdbdb}.pagination-link:focus,.pagination-next:focus,.pagination-previous:focus{border-color:#5ea3e4}.pagination-link:active,.pagination-next:active,.pagination-previous:active{box-shadow:inset 0 1px 2px rgba(255,255,255,.2)}.pagination-link[disabled],.pagination-next[disabled],.pagination-previous[disabled]{background-color:#363636;border-color:#363636;color:#7a7a7a}.pagination-link.is-current{background-color:#5ea3e4;border-color:#5ea3e4;color:#fff}.pagination-ellipsis{color:#4a4a4a}.panel-block,.panel-heading,.panel-tabs{border-bottom:1px solid #363636;border-left:1px solid #363636;border-right:1px solid #363636}.panel-block:first-child,.panel-heading:first-child,.panel-tabs:first-child{border-top:1px solid #363636}.panel-heading{background-color:#242424;color:#dbdbdb}.panel-tabs a{border-bottom:1px solid #363636}.panel-tabs a.is-active{border-bottom-color:#b5b5b5;color:#dbdbdb}.panel-list a{color:#b5b5b5}.panel-list a:hover{color:#5ea3e4}.panel-block{color:#dbdbdb}.panel-block.is-active{border-left-color:#5ea3e4;color:#dbdbdb}.panel-block.is-active .panel-icon{color:#5ea3e4}a.panel-block:hover,label.panel-block:hover{background-color:#242424}.tabs a{border-bottom-color:#363636;color:#b5b5b5}.tabs a:hover{border-bottom-color:#dbdbdb;color:#dbdbdb}.tabs li.is-active a{border-bottom-color:#5ea3e4;color:#5ea3e4}.tabs ul{border-bottom-color:#363636}.tabs.is-boxed a:hover{background-color:#242424;border-bottom-color:#363636}.tabs.is-boxed li.is-active a{background-color:#0a0a0a;border-color:#363636}.tabs.is-toggle a{border-color:#363636}.tabs.is-toggle 
a:hover{background-color:#242424;border-color:#4a4a4a}.tabs.is-toggle li.is-active a{background-color:#5ea3e4;border-color:#5ea3e4;color:#fff}.hero.is-white,.hero.is-white-dark{background-color:#e6e6e6;color:#0a0a0a}.hero.is-white a:not(.button):not(.dropdown-item):not(.tag),.hero.is-white strong,.hero.is-white-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-white-dark strong{color:inherit}.hero.is-white .title,.hero.is-white-dark .title{color:#0a0a0a}.hero.is-white .subtitle,.hero.is-white-dark .subtitle{color:rgba(10,10,10,.9)}.hero.is-white .subtitle a:not(.button),.hero.is-white .subtitle strong,.hero.is-white-dark .subtitle a:not(.button),.hero.is-white-dark .subtitle strong{color:#0a0a0a}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-white .navbar-menu,.hero.is-white-dark .navbar-menu{background-color:#e6e6e6}}@media (prefers-color-scheme:dark){.hero.is-white .navbar-item,.hero.is-white .navbar-link,.hero.is-white-dark .navbar-item,.hero.is-white-dark .navbar-link{color:rgba(10,10,10,.7)}.hero.is-white .navbar-link.is-active,.hero.is-white .navbar-link:hover,.hero.is-white a.navbar-item.is-active,.hero.is-white a.navbar-item:hover,.hero.is-white-dark .navbar-link.is-active,.hero.is-white-dark .navbar-link:hover,.hero.is-white-dark a.navbar-item.is-active,.hero.is-white-dark a.navbar-item:hover{background-color:#d9d9d9;color:#0a0a0a}.hero.is-white .tabs a,.hero.is-white-dark .tabs a{color:#0a0a0a;opacity:.9}.hero.is-white .tabs a:hover,.hero.is-white-dark .tabs a:hover{opacity:1}.hero.is-white .tabs li.is-active a,.hero.is-white-dark .tabs li.is-active a{opacity:1}.hero.is-white .tabs.is-boxed a,.hero.is-white .tabs.is-toggle a,.hero.is-white-dark .tabs.is-boxed a,.hero.is-white-dark .tabs.is-toggle a{color:#0a0a0a}.hero.is-white .tabs.is-boxed a:hover,.hero.is-white .tabs.is-toggle a:hover,.hero.is-white-dark .tabs.is-boxed a:hover,.hero.is-white-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-white .tabs.is-boxed li.is-active a,.hero.is-white .tabs.is-boxed li.is-active a:hover,.hero.is-white .tabs.is-toggle li.is-active a,.hero.is-white .tabs.is-toggle li.is-active a:hover,.hero.is-white-dark .tabs.is-boxed li.is-active a,.hero.is-white-dark .tabs.is-boxed li.is-active a:hover,.hero.is-white-dark .tabs.is-toggle li.is-active a,.hero.is-white-dark .tabs.is-toggle li.is-active a:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#e6e6e6}.hero.is-white-dark.is-bold,.hero.is-white.is-bold{background-image:linear-gradient(141deg,#d1c7c9 0,#e6e6e6 71%,#f3f2f2 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-white-dark.is-bold .navbar-menu,.hero.is-white.is-bold .navbar-menu{background-image:linear-gradient(141deg,#d1c7c9 0,#e6e6e6 71%,#f3f2f2 100%)}}@media (prefers-color-scheme:dark){.hero.is-black,.hero.is-black-dark{background-color:#000;color:#fff}.hero.is-black a:not(.button):not(.dropdown-item):not(.tag),.hero.is-black strong,.hero.is-black-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-black-dark strong{color:inherit}.hero.is-black .title,.hero.is-black-dark .title{color:#fff}.hero.is-black .subtitle,.hero.is-black-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-black .subtitle a:not(.button),.hero.is-black .subtitle strong,.hero.is-black-dark .subtitle a:not(.button),.hero.is-black-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-black .navbar-menu,.hero.is-black-dark 
.navbar-menu{background-color:#000}}@media (prefers-color-scheme:dark){.hero.is-black .navbar-item,.hero.is-black .navbar-link,.hero.is-black-dark .navbar-item,.hero.is-black-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-black .navbar-link.is-active,.hero.is-black .navbar-link:hover,.hero.is-black a.navbar-item.is-active,.hero.is-black a.navbar-item:hover,.hero.is-black-dark .navbar-link.is-active,.hero.is-black-dark .navbar-link:hover,.hero.is-black-dark a.navbar-item.is-active,.hero.is-black-dark a.navbar-item:hover{background-color:#000;color:#fff}.hero.is-black .tabs a,.hero.is-black-dark .tabs a{color:#fff;opacity:.9}.hero.is-black .tabs a:hover,.hero.is-black-dark .tabs a:hover{opacity:1}.hero.is-black .tabs li.is-active a,.hero.is-black-dark .tabs li.is-active a{opacity:1}.hero.is-black .tabs.is-boxed a,.hero.is-black .tabs.is-toggle a,.hero.is-black-dark .tabs.is-boxed a,.hero.is-black-dark .tabs.is-toggle a{color:#fff}.hero.is-black .tabs.is-boxed a:hover,.hero.is-black .tabs.is-toggle a:hover,.hero.is-black-dark .tabs.is-boxed a:hover,.hero.is-black-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-black .tabs.is-boxed li.is-active a,.hero.is-black .tabs.is-boxed li.is-active a:hover,.hero.is-black .tabs.is-toggle li.is-active a,.hero.is-black .tabs.is-toggle li.is-active a:hover,.hero.is-black-dark .tabs.is-boxed li.is-active a,.hero.is-black-dark .tabs.is-boxed li.is-active a:hover,.hero.is-black-dark .tabs.is-toggle li.is-active a,.hero.is-black-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#000}.hero.is-black-dark.is-bold,.hero.is-black.is-bold{background-image:linear-gradient(141deg,#000 0,#000 71%,#0d0d0d 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-black-dark.is-bold .navbar-menu,.hero.is-black.is-bold .navbar-menu{background-image:linear-gradient(141deg,#000 0,#000 71%,#0d0d0d 100%)}}@media (prefers-color-scheme:dark){.hero.is-light,.hero.is-light-dark{background-color:#dbdbdb;color:#363636}.hero.is-light a:not(.button):not(.dropdown-item):not(.tag),.hero.is-light strong,.hero.is-light-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-light-dark strong{color:inherit}.hero.is-light .title,.hero.is-light-dark .title{color:#363636}.hero.is-light .subtitle,.hero.is-light-dark .subtitle{color:rgba(54,54,54,.9)}.hero.is-light .subtitle a:not(.button),.hero.is-light .subtitle strong,.hero.is-light-dark .subtitle a:not(.button),.hero.is-light-dark .subtitle strong{color:#363636}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-light .navbar-menu,.hero.is-light-dark .navbar-menu{background-color:#dbdbdb}}@media (prefers-color-scheme:dark){.hero.is-light .navbar-item,.hero.is-light .navbar-link,.hero.is-light-dark .navbar-item,.hero.is-light-dark .navbar-link{color:rgba(54,54,54,.7)}.hero.is-light .navbar-link.is-active,.hero.is-light .navbar-link:hover,.hero.is-light a.navbar-item.is-active,.hero.is-light a.navbar-item:hover,.hero.is-light-dark .navbar-link.is-active,.hero.is-light-dark .navbar-link:hover,.hero.is-light-dark a.navbar-item.is-active,.hero.is-light-dark a.navbar-item:hover{background-color:#cfcfcf;color:#363636}.hero.is-light .tabs a,.hero.is-light-dark .tabs a{color:#363636;opacity:.9}.hero.is-light .tabs a:hover,.hero.is-light-dark .tabs a:hover{opacity:1}.hero.is-light .tabs li.is-active a,.hero.is-light-dark .tabs li.is-active a{opacity:1}.hero.is-light .tabs.is-boxed a,.hero.is-light .tabs.is-toggle 
a,.hero.is-light-dark .tabs.is-boxed a,.hero.is-light-dark .tabs.is-toggle a{color:#363636}.hero.is-light .tabs.is-boxed a:hover,.hero.is-light .tabs.is-toggle a:hover,.hero.is-light-dark .tabs.is-boxed a:hover,.hero.is-light-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-light .tabs.is-boxed li.is-active a,.hero.is-light .tabs.is-boxed li.is-active a:hover,.hero.is-light .tabs.is-toggle li.is-active a,.hero.is-light .tabs.is-toggle li.is-active a:hover,.hero.is-light-dark .tabs.is-boxed li.is-active a,.hero.is-light-dark .tabs.is-boxed li.is-active a:hover,.hero.is-light-dark .tabs.is-toggle li.is-active a,.hero.is-light-dark .tabs.is-toggle li.is-active a:hover{background-color:#363636;border-color:#363636;color:#dbdbdb}.hero.is-light-dark.is-bold,.hero.is-light.is-bold{background-image:linear-gradient(141deg,#c8bcbe 0,#dbdbdb 71%,#e9e7e7 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-light-dark.is-bold .navbar-menu,.hero.is-light.is-bold .navbar-menu{background-image:linear-gradient(141deg,#c8bcbe 0,#dbdbdb 71%,#e9e7e7 100%)}}@media (prefers-color-scheme:dark){.hero.is-dark,.hero.is-dark-dark{background-color:#1c1c1c;color:#f5f5f5}.hero.is-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-dark strong,.hero.is-dark-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-dark-dark strong{color:inherit}.hero.is-dark .title,.hero.is-dark-dark .title{color:#f5f5f5}.hero.is-dark .subtitle,.hero.is-dark-dark .subtitle{color:rgba(245,245,245,.9)}.hero.is-dark .subtitle a:not(.button),.hero.is-dark .subtitle strong,.hero.is-dark-dark .subtitle a:not(.button),.hero.is-dark-dark .subtitle strong{color:#f5f5f5}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-dark .navbar-menu,.hero.is-dark-dark .navbar-menu{background-color:#1c1c1c}}@media (prefers-color-scheme:dark){.hero.is-dark .navbar-item,.hero.is-dark .navbar-link,.hero.is-dark-dark .navbar-item,.hero.is-dark-dark .navbar-link{color:rgba(245,245,245,.7)}.hero.is-dark .navbar-link.is-active,.hero.is-dark .navbar-link:hover,.hero.is-dark a.navbar-item.is-active,.hero.is-dark a.navbar-item:hover,.hero.is-dark-dark .navbar-link.is-active,.hero.is-dark-dark .navbar-link:hover,.hero.is-dark-dark a.navbar-item.is-active,.hero.is-dark-dark a.navbar-item:hover{background-color:#0f0f0f;color:#f5f5f5}.hero.is-dark .tabs a,.hero.is-dark-dark .tabs a{color:#f5f5f5;opacity:.9}.hero.is-dark .tabs a:hover,.hero.is-dark-dark .tabs a:hover{opacity:1}.hero.is-dark .tabs li.is-active a,.hero.is-dark-dark .tabs li.is-active a{opacity:1}.hero.is-dark .tabs.is-boxed a,.hero.is-dark .tabs.is-toggle a,.hero.is-dark-dark .tabs.is-boxed a,.hero.is-dark-dark .tabs.is-toggle a{color:#f5f5f5}.hero.is-dark .tabs.is-boxed a:hover,.hero.is-dark .tabs.is-toggle a:hover,.hero.is-dark-dark .tabs.is-boxed a:hover,.hero.is-dark-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-dark .tabs.is-boxed li.is-active a,.hero.is-dark .tabs.is-boxed li.is-active a:hover,.hero.is-dark .tabs.is-toggle li.is-active a,.hero.is-dark .tabs.is-toggle li.is-active a:hover,.hero.is-dark-dark .tabs.is-boxed li.is-active a,.hero.is-dark-dark .tabs.is-boxed li.is-active a:hover,.hero.is-dark-dark .tabs.is-toggle li.is-active a,.hero.is-dark-dark .tabs.is-toggle li.is-active a:hover{background-color:#f5f5f5;border-color:#f5f5f5;color:#1c1c1c}.hero.is-dark-dark.is-bold,.hero.is-dark.is-bold{background-image:linear-gradient(141deg,#030202 0,#1c1c1c 71%,#2b2727 100%)}}@media 
screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-dark-dark.is-bold .navbar-menu,.hero.is-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#030202 0,#1c1c1c 71%,#2b2727 100%)}}@media (prefers-color-scheme:dark){.hero.is-primary,.hero.is-primary-dark{background-color:#009e86;color:#fff}.hero.is-primary a:not(.button):not(.dropdown-item):not(.tag),.hero.is-primary strong,.hero.is-primary-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-primary-dark strong{color:inherit}.hero.is-primary .title,.hero.is-primary-dark .title{color:#fff}.hero.is-primary .subtitle,.hero.is-primary-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-primary .subtitle a:not(.button),.hero.is-primary .subtitle strong,.hero.is-primary-dark .subtitle a:not(.button),.hero.is-primary-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-primary .navbar-menu,.hero.is-primary-dark .navbar-menu{background-color:#009e86}}@media (prefers-color-scheme:dark){.hero.is-primary .navbar-item,.hero.is-primary .navbar-link,.hero.is-primary-dark .navbar-item,.hero.is-primary-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-primary .navbar-link.is-active,.hero.is-primary .navbar-link:hover,.hero.is-primary a.navbar-item.is-active,.hero.is-primary a.navbar-item:hover,.hero.is-primary-dark .navbar-link.is-active,.hero.is-primary-dark .navbar-link:hover,.hero.is-primary-dark a.navbar-item.is-active,.hero.is-primary-dark a.navbar-item:hover{background-color:#008571;color:#fff}.hero.is-primary .tabs a,.hero.is-primary-dark .tabs a{color:#fff;opacity:.9}.hero.is-primary .tabs a:hover,.hero.is-primary-dark .tabs a:hover{opacity:1}.hero.is-primary .tabs li.is-active a,.hero.is-primary-dark .tabs li.is-active a{opacity:1}.hero.is-primary .tabs.is-boxed a,.hero.is-primary .tabs.is-toggle a,.hero.is-primary-dark .tabs.is-boxed a,.hero.is-primary-dark .tabs.is-toggle a{color:#fff}.hero.is-primary .tabs.is-boxed a:hover,.hero.is-primary .tabs.is-toggle a:hover,.hero.is-primary-dark .tabs.is-boxed a:hover,.hero.is-primary-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-primary .tabs.is-boxed li.is-active a,.hero.is-primary .tabs.is-boxed li.is-active a:hover,.hero.is-primary .tabs.is-toggle li.is-active a,.hero.is-primary .tabs.is-toggle li.is-active a:hover,.hero.is-primary-dark .tabs.is-boxed li.is-active a,.hero.is-primary-dark .tabs.is-boxed li.is-active a:hover,.hero.is-primary-dark .tabs.is-toggle li.is-active a,.hero.is-primary-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#009e86}.hero.is-primary-dark.is-bold,.hero.is-primary.is-bold{background-image:linear-gradient(141deg,#006b49 0,#009e86 71%,#00b5b8 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-primary-dark.is-bold .navbar-menu,.hero.is-primary.is-bold .navbar-menu{background-image:linear-gradient(141deg,#006b49 0,#009e86 71%,#00b5b8 100%)}}@media (prefers-color-scheme:dark){.hero.is-link,.hero.is-link-dark{background-color:#205bbc;color:#fff}.hero.is-link a:not(.button):not(.dropdown-item):not(.tag),.hero.is-link strong,.hero.is-link-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-link-dark strong{color:inherit}.hero.is-link .title,.hero.is-link-dark .title{color:#fff}.hero.is-link .subtitle,.hero.is-link-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-link .subtitle a:not(.button),.hero.is-link .subtitle strong,.hero.is-link-dark .subtitle 
a:not(.button),.hero.is-link-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-link .navbar-menu,.hero.is-link-dark .navbar-menu{background-color:#205bbc}}@media (prefers-color-scheme:dark){.hero.is-link .navbar-item,.hero.is-link .navbar-link,.hero.is-link-dark .navbar-item,.hero.is-link-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-link .navbar-link.is-active,.hero.is-link .navbar-link:hover,.hero.is-link a.navbar-item.is-active,.hero.is-link a.navbar-item:hover,.hero.is-link-dark .navbar-link.is-active,.hero.is-link-dark .navbar-link:hover,.hero.is-link-dark a.navbar-item.is-active,.hero.is-link-dark a.navbar-item:hover{background-color:#1c51a6;color:#fff}.hero.is-link .tabs a,.hero.is-link-dark .tabs a{color:#fff;opacity:.9}.hero.is-link .tabs a:hover,.hero.is-link-dark .tabs a:hover{opacity:1}.hero.is-link .tabs li.is-active a,.hero.is-link-dark .tabs li.is-active a{opacity:1}.hero.is-link .tabs.is-boxed a,.hero.is-link .tabs.is-toggle a,.hero.is-link-dark .tabs.is-boxed a,.hero.is-link-dark .tabs.is-toggle a{color:#fff}.hero.is-link .tabs.is-boxed a:hover,.hero.is-link .tabs.is-toggle a:hover,.hero.is-link-dark .tabs.is-boxed a:hover,.hero.is-link-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-link .tabs.is-boxed li.is-active a,.hero.is-link .tabs.is-boxed li.is-active a:hover,.hero.is-link .tabs.is-toggle li.is-active a,.hero.is-link .tabs.is-toggle li.is-active a:hover,.hero.is-link-dark .tabs.is-boxed li.is-active a,.hero.is-link-dark .tabs.is-boxed li.is-active a:hover,.hero.is-link-dark .tabs.is-toggle li.is-active a,.hero.is-link-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#205bbc}.hero.is-link-dark.is-bold,.hero.is-link.is-bold{background-image:linear-gradient(141deg,#105b98 0,#205bbc 71%,#1d46d7 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-link-dark.is-bold .navbar-menu,.hero.is-link.is-bold .navbar-menu{background-image:linear-gradient(141deg,#105b98 0,#205bbc 71%,#1d46d7 100%)}}@media (prefers-color-scheme:dark){.hero.is-info,.hero.is-info-dark{background-color:#0f81cc;color:#fff}.hero.is-info a:not(.button):not(.dropdown-item):not(.tag),.hero.is-info strong,.hero.is-info-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-info-dark strong{color:inherit}.hero.is-info .title,.hero.is-info-dark .title{color:#fff}.hero.is-info .subtitle,.hero.is-info-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-info .subtitle a:not(.button),.hero.is-info .subtitle strong,.hero.is-info-dark .subtitle a:not(.button),.hero.is-info-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-info .navbar-menu,.hero.is-info-dark .navbar-menu{background-color:#0f81cc}}@media (prefers-color-scheme:dark){.hero.is-info .navbar-item,.hero.is-info .navbar-link,.hero.is-info-dark .navbar-item,.hero.is-info-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-info .navbar-link.is-active,.hero.is-info .navbar-link:hover,.hero.is-info a.navbar-item.is-active,.hero.is-info a.navbar-item:hover,.hero.is-info-dark .navbar-link.is-active,.hero.is-info-dark .navbar-link:hover,.hero.is-info-dark a.navbar-item.is-active,.hero.is-info-dark a.navbar-item:hover{background-color:#0e72b4;color:#fff}.hero.is-info .tabs a,.hero.is-info-dark .tabs a{color:#fff;opacity:.9}.hero.is-info .tabs a:hover,.hero.is-info-dark .tabs a:hover{opacity:1}.hero.is-info .tabs li.is-active 
a,.hero.is-info-dark .tabs li.is-active a{opacity:1}.hero.is-info .tabs.is-boxed a,.hero.is-info .tabs.is-toggle a,.hero.is-info-dark .tabs.is-boxed a,.hero.is-info-dark .tabs.is-toggle a{color:#fff}.hero.is-info .tabs.is-boxed a:hover,.hero.is-info .tabs.is-toggle a:hover,.hero.is-info-dark .tabs.is-boxed a:hover,.hero.is-info-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-info .tabs.is-boxed li.is-active a,.hero.is-info .tabs.is-boxed li.is-active a:hover,.hero.is-info .tabs.is-toggle li.is-active a,.hero.is-info .tabs.is-toggle li.is-active a:hover,.hero.is-info-dark .tabs.is-boxed li.is-active a,.hero.is-info-dark .tabs.is-boxed li.is-active a:hover,.hero.is-info-dark .tabs.is-toggle li.is-active a,.hero.is-info-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#0f81cc}.hero.is-info-dark.is-bold,.hero.is-info.is-bold{background-image:linear-gradient(141deg,#037fa5 0,#0f81cc 71%,#0b6cea 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-info-dark.is-bold .navbar-menu,.hero.is-info.is-bold .navbar-menu{background-image:linear-gradient(141deg,#037fa5 0,#0f81cc 71%,#0b6cea 100%)}}@media (prefers-color-scheme:dark){.hero.is-success,.hero.is-success-dark{background-color:#1ca64c;color:#fff}.hero.is-success a:not(.button):not(.dropdown-item):not(.tag),.hero.is-success strong,.hero.is-success-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-success-dark strong{color:inherit}.hero.is-success .title,.hero.is-success-dark .title{color:#fff}.hero.is-success .subtitle,.hero.is-success-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-success .subtitle a:not(.button),.hero.is-success .subtitle strong,.hero.is-success-dark .subtitle a:not(.button),.hero.is-success-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-success .navbar-menu,.hero.is-success-dark .navbar-menu{background-color:#1ca64c}}@media (prefers-color-scheme:dark){.hero.is-success .navbar-item,.hero.is-success .navbar-link,.hero.is-success-dark .navbar-item,.hero.is-success-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-success .navbar-link.is-active,.hero.is-success .navbar-link:hover,.hero.is-success a.navbar-item.is-active,.hero.is-success a.navbar-item:hover,.hero.is-success-dark .navbar-link.is-active,.hero.is-success-dark .navbar-link:hover,.hero.is-success-dark a.navbar-item.is-active,.hero.is-success-dark a.navbar-item:hover{background-color:#189042;color:#fff}.hero.is-success .tabs a,.hero.is-success-dark .tabs a{color:#fff;opacity:.9}.hero.is-success .tabs a:hover,.hero.is-success-dark .tabs a:hover{opacity:1}.hero.is-success .tabs li.is-active a,.hero.is-success-dark .tabs li.is-active a{opacity:1}.hero.is-success .tabs.is-boxed a,.hero.is-success .tabs.is-toggle a,.hero.is-success-dark .tabs.is-boxed a,.hero.is-success-dark .tabs.is-toggle a{color:#fff}.hero.is-success .tabs.is-boxed a:hover,.hero.is-success .tabs.is-toggle a:hover,.hero.is-success-dark .tabs.is-boxed a:hover,.hero.is-success-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-success .tabs.is-boxed li.is-active a,.hero.is-success .tabs.is-boxed li.is-active a:hover,.hero.is-success .tabs.is-toggle li.is-active a,.hero.is-success .tabs.is-toggle li.is-active a:hover,.hero.is-success-dark .tabs.is-boxed li.is-active a,.hero.is-success-dark .tabs.is-boxed li.is-active a:hover,.hero.is-success-dark .tabs.is-toggle li.is-active a,.hero.is-success-dark .tabs.is-toggle 
li.is-active a:hover{background-color:#fff;border-color:#fff;color:#1ca64c}.hero.is-success-dark.is-bold,.hero.is-success.is-bold{background-image:linear-gradient(141deg,#0e8123 0,#1ca64c 71%,#1ac170 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-success-dark.is-bold .navbar-menu,.hero.is-success.is-bold .navbar-menu{background-image:linear-gradient(141deg,#0e8123 0,#1ca64c 71%,#1ac170 100%)}}@media (prefers-color-scheme:dark){.hero.is-warning,.hero.is-warning-dark{background-color:#ffd324;color:rgba(0,0,0,.7)}.hero.is-warning a:not(.button):not(.dropdown-item):not(.tag),.hero.is-warning strong,.hero.is-warning-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-warning-dark strong{color:inherit}.hero.is-warning .title,.hero.is-warning-dark .title{color:rgba(0,0,0,.7)}.hero.is-warning .subtitle,.hero.is-warning-dark .subtitle{color:rgba(0,0,0,.9)}.hero.is-warning .subtitle a:not(.button),.hero.is-warning .subtitle strong,.hero.is-warning-dark .subtitle a:not(.button),.hero.is-warning-dark .subtitle strong{color:rgba(0,0,0,.7)}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-warning .navbar-menu,.hero.is-warning-dark .navbar-menu{background-color:#ffd324}}@media (prefers-color-scheme:dark){.hero.is-warning .navbar-item,.hero.is-warning .navbar-link,.hero.is-warning-dark .navbar-item,.hero.is-warning-dark .navbar-link{color:rgba(0,0,0,.7)}.hero.is-warning .navbar-link.is-active,.hero.is-warning .navbar-link:hover,.hero.is-warning a.navbar-item.is-active,.hero.is-warning a.navbar-item:hover,.hero.is-warning-dark .navbar-link.is-active,.hero.is-warning-dark .navbar-link:hover,.hero.is-warning-dark a.navbar-item.is-active,.hero.is-warning-dark a.navbar-item:hover{background-color:#ffce0a;color:rgba(0,0,0,.7)}.hero.is-warning .tabs a,.hero.is-warning-dark .tabs a{color:rgba(0,0,0,.7);opacity:.9}.hero.is-warning .tabs a:hover,.hero.is-warning-dark .tabs a:hover{opacity:1}.hero.is-warning .tabs li.is-active a,.hero.is-warning-dark .tabs li.is-active a{opacity:1}.hero.is-warning .tabs.is-boxed a,.hero.is-warning .tabs.is-toggle a,.hero.is-warning-dark .tabs.is-boxed a,.hero.is-warning-dark .tabs.is-toggle a{color:rgba(0,0,0,.7)}.hero.is-warning .tabs.is-boxed a:hover,.hero.is-warning .tabs.is-toggle a:hover,.hero.is-warning-dark .tabs.is-boxed a:hover,.hero.is-warning-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-warning .tabs.is-boxed li.is-active a,.hero.is-warning .tabs.is-boxed li.is-active a:hover,.hero.is-warning .tabs.is-toggle li.is-active a,.hero.is-warning .tabs.is-toggle li.is-active a:hover,.hero.is-warning-dark .tabs.is-boxed li.is-active a,.hero.is-warning-dark .tabs.is-boxed li.is-active a:hover,.hero.is-warning-dark .tabs.is-toggle li.is-active a,.hero.is-warning-dark .tabs.is-toggle li.is-active a:hover{background-color:rgba(0,0,0,.7);border-color:rgba(0,0,0,.7);color:#ffd324}.hero.is-warning-dark.is-bold,.hero.is-warning.is-bold{background-image:linear-gradient(141deg,#f09800 0,#ffd324 71%,#fff93d 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-warning-dark.is-bold .navbar-menu,.hero.is-warning.is-bold .navbar-menu{background-image:linear-gradient(141deg,#f09800 0,#ffd324 71%,#fff93d 100%)}}@media (prefers-color-scheme:dark){.hero.is-danger,.hero.is-danger-dark{background-color:#ff0537;color:#fff}.hero.is-danger a:not(.button):not(.dropdown-item):not(.tag),.hero.is-danger strong,.hero.is-danger-dark 
a:not(.button):not(.dropdown-item):not(.tag),.hero.is-danger-dark strong{color:inherit}.hero.is-danger .title,.hero.is-danger-dark .title{color:#fff}.hero.is-danger .subtitle,.hero.is-danger-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-danger .subtitle a:not(.button),.hero.is-danger .subtitle strong,.hero.is-danger-dark .subtitle a:not(.button),.hero.is-danger-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-danger .navbar-menu,.hero.is-danger-dark .navbar-menu{background-color:#ff0537}}@media (prefers-color-scheme:dark){.hero.is-danger .navbar-item,.hero.is-danger .navbar-link,.hero.is-danger-dark .navbar-item,.hero.is-danger-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-danger .navbar-link.is-active,.hero.is-danger .navbar-link:hover,.hero.is-danger a.navbar-item.is-active,.hero.is-danger a.navbar-item:hover,.hero.is-danger-dark .navbar-link.is-active,.hero.is-danger-dark .navbar-link:hover,.hero.is-danger-dark a.navbar-item.is-active,.hero.is-danger-dark a.navbar-item:hover{background-color:#eb002f;color:#fff}.hero.is-danger .tabs a,.hero.is-danger-dark .tabs a{color:#fff;opacity:.9}.hero.is-danger .tabs a:hover,.hero.is-danger-dark .tabs a:hover{opacity:1}.hero.is-danger .tabs li.is-active a,.hero.is-danger-dark .tabs li.is-active a{opacity:1}.hero.is-danger .tabs.is-boxed a,.hero.is-danger .tabs.is-toggle a,.hero.is-danger-dark .tabs.is-boxed a,.hero.is-danger-dark .tabs.is-toggle a{color:#fff}.hero.is-danger .tabs.is-boxed a:hover,.hero.is-danger .tabs.is-toggle a:hover,.hero.is-danger-dark .tabs.is-boxed a:hover,.hero.is-danger-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-danger .tabs.is-boxed li.is-active a,.hero.is-danger .tabs.is-boxed li.is-active a:hover,.hero.is-danger .tabs.is-toggle li.is-active a,.hero.is-danger .tabs.is-toggle li.is-active a:hover,.hero.is-danger-dark .tabs.is-boxed li.is-active a,.hero.is-danger-dark .tabs.is-boxed li.is-active a:hover,.hero.is-danger-dark .tabs.is-toggle li.is-active a,.hero.is-danger-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#ff0537}.hero.is-danger-dark.is-bold,.hero.is-danger.is-bold{background-image:linear-gradient(141deg,#d1004d 0,#ff0537 71%,#ff1f26 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-danger-dark.is-bold .navbar-menu,.hero.is-danger.is-bold .navbar-menu{background-image:linear-gradient(141deg,#d1004d 0,#ff0537 71%,#ff1f26 100%)}}@media (prefers-color-scheme:dark){.hero.is-white-dark{background-color:#fff;color:#0a0a0a}.hero.is-white-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-white-dark strong{color:inherit}.hero.is-white-dark .title{color:#0a0a0a}.hero.is-white-dark .subtitle{color:rgba(10,10,10,.9)}.hero.is-white-dark .subtitle a:not(.button),.hero.is-white-dark .subtitle strong{color:#0a0a0a}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-white-dark .navbar-menu{background-color:#fff}}@media (prefers-color-scheme:dark){.hero.is-white-dark .navbar-item,.hero.is-white-dark .navbar-link{color:rgba(10,10,10,.7)}.hero.is-white-dark .navbar-link.is-active,.hero.is-white-dark .navbar-link:hover,.hero.is-white-dark a.navbar-item.is-active,.hero.is-white-dark a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.hero.is-white-dark .tabs a{color:#0a0a0a;opacity:.9}.hero.is-white-dark .tabs a:hover{opacity:1}.hero.is-white-dark .tabs li.is-active a{opacity:1}.hero.is-white-dark .tabs.is-boxed 
a,.hero.is-white-dark .tabs.is-toggle a{color:#0a0a0a}.hero.is-white-dark .tabs.is-boxed a:hover,.hero.is-white-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-white-dark .tabs.is-boxed li.is-active a,.hero.is-white-dark .tabs.is-boxed li.is-active a:hover,.hero.is-white-dark .tabs.is-toggle li.is-active a,.hero.is-white-dark .tabs.is-toggle li.is-active a:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.hero.is-white-dark.is-bold{background-image:linear-gradient(141deg,#e6e6e6 0,#fff 71%,#fff 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-white-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#e6e6e6 0,#fff 71%,#fff 100%)}}@media (prefers-color-scheme:dark){.hero.is-black-dark{background-color:#0a0a0a;color:#fff}.hero.is-black-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-black-dark strong{color:inherit}.hero.is-black-dark .title{color:#fff}.hero.is-black-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-black-dark .subtitle a:not(.button),.hero.is-black-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-black-dark .navbar-menu{background-color:#0a0a0a}}@media (prefers-color-scheme:dark){.hero.is-black-dark .navbar-item,.hero.is-black-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-black-dark .navbar-link.is-active,.hero.is-black-dark .navbar-link:hover,.hero.is-black-dark a.navbar-item.is-active,.hero.is-black-dark a.navbar-item:hover{background-color:#000;color:#fff}.hero.is-black-dark .tabs a{color:#fff;opacity:.9}.hero.is-black-dark .tabs a:hover{opacity:1}.hero.is-black-dark .tabs li.is-active a{opacity:1}.hero.is-black-dark .tabs.is-boxed a,.hero.is-black-dark .tabs.is-toggle a{color:#fff}.hero.is-black-dark .tabs.is-boxed a:hover,.hero.is-black-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-black-dark .tabs.is-boxed li.is-active a,.hero.is-black-dark .tabs.is-boxed li.is-active a:hover,.hero.is-black-dark .tabs.is-toggle li.is-active a,.hero.is-black-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#0a0a0a}.hero.is-black-dark.is-bold{background-image:linear-gradient(141deg,#000 0,#0a0a0a 71%,#181616 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-black-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#000 0,#0a0a0a 71%,#181616 100%)}}@media (prefers-color-scheme:dark){.hero.is-light-dark{background-color:#f5f5f5;color:#363636}.hero.is-light-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-light-dark strong{color:inherit}.hero.is-light-dark .title{color:#363636}.hero.is-light-dark .subtitle{color:rgba(54,54,54,.9)}.hero.is-light-dark .subtitle a:not(.button),.hero.is-light-dark .subtitle strong{color:#363636}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-light-dark .navbar-menu{background-color:#f5f5f5}}@media (prefers-color-scheme:dark){.hero.is-light-dark .navbar-item,.hero.is-light-dark .navbar-link{color:rgba(54,54,54,.7)}.hero.is-light-dark .navbar-link.is-active,.hero.is-light-dark .navbar-link:hover,.hero.is-light-dark a.navbar-item.is-active,.hero.is-light-dark a.navbar-item:hover{background-color:#e8e8e8;color:#363636}.hero.is-light-dark .tabs a{color:#363636;opacity:.9}.hero.is-light-dark .tabs a:hover{opacity:1}.hero.is-light-dark .tabs li.is-active a{opacity:1}.hero.is-light-dark .tabs.is-boxed a,.hero.is-light-dark .tabs.is-toggle 
a{color:#363636}.hero.is-light-dark .tabs.is-boxed a:hover,.hero.is-light-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-light-dark .tabs.is-boxed li.is-active a,.hero.is-light-dark .tabs.is-boxed li.is-active a:hover,.hero.is-light-dark .tabs.is-toggle li.is-active a,.hero.is-light-dark .tabs.is-toggle li.is-active a:hover{background-color:#363636;border-color:#363636;color:#f5f5f5}.hero.is-light-dark.is-bold{background-image:linear-gradient(141deg,#dfd8d9 0,#f5f5f5 71%,#fff 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-light-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#dfd8d9 0,#f5f5f5 71%,#fff 100%)}}@media (prefers-color-scheme:dark){.hero.is-dark-dark{background-color:#363636;color:#f5f5f5}.hero.is-dark-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-dark-dark strong{color:inherit}.hero.is-dark-dark .title{color:#f5f5f5}.hero.is-dark-dark .subtitle{color:rgba(245,245,245,.9)}.hero.is-dark-dark .subtitle a:not(.button),.hero.is-dark-dark .subtitle strong{color:#f5f5f5}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-dark-dark .navbar-menu{background-color:#363636}}@media (prefers-color-scheme:dark){.hero.is-dark-dark .navbar-item,.hero.is-dark-dark .navbar-link{color:rgba(245,245,245,.7)}.hero.is-dark-dark .navbar-link.is-active,.hero.is-dark-dark .navbar-link:hover,.hero.is-dark-dark a.navbar-item.is-active,.hero.is-dark-dark a.navbar-item:hover{background-color:#292929;color:#f5f5f5}.hero.is-dark-dark .tabs a{color:#f5f5f5;opacity:.9}.hero.is-dark-dark .tabs a:hover{opacity:1}.hero.is-dark-dark .tabs li.is-active a{opacity:1}.hero.is-dark-dark .tabs.is-boxed a,.hero.is-dark-dark .tabs.is-toggle a{color:#f5f5f5}.hero.is-dark-dark .tabs.is-boxed a:hover,.hero.is-dark-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-dark-dark .tabs.is-boxed li.is-active a,.hero.is-dark-dark .tabs.is-boxed li.is-active a:hover,.hero.is-dark-dark .tabs.is-toggle li.is-active a,.hero.is-dark-dark .tabs.is-toggle li.is-active a:hover{background-color:#f5f5f5;border-color:#f5f5f5;color:#363636}.hero.is-dark-dark.is-bold{background-image:linear-gradient(141deg,#1f191a 0,#363636 71%,#46403f 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-dark-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#1f191a 0,#363636 71%,#46403f 100%)}}@media (prefers-color-scheme:dark){.hero.is-primary-dark{background-color:#00d1b2;color:#fff}.hero.is-primary-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-primary-dark strong{color:inherit}.hero.is-primary-dark .title{color:#fff}.hero.is-primary-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-primary-dark .subtitle a:not(.button),.hero.is-primary-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-primary-dark .navbar-menu{background-color:#00d1b2}}@media (prefers-color-scheme:dark){.hero.is-primary-dark .navbar-item,.hero.is-primary-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-primary-dark .navbar-link.is-active,.hero.is-primary-dark .navbar-link:hover,.hero.is-primary-dark a.navbar-item.is-active,.hero.is-primary-dark a.navbar-item:hover{background-color:#00b89c;color:#fff}.hero.is-primary-dark .tabs a{color:#fff;opacity:.9}.hero.is-primary-dark .tabs a:hover{opacity:1}.hero.is-primary-dark .tabs li.is-active a{opacity:1}.hero.is-primary-dark .tabs.is-boxed a,.hero.is-primary-dark .tabs.is-toggle 
a{color:#fff}.hero.is-primary-dark .tabs.is-boxed a:hover,.hero.is-primary-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-primary-dark .tabs.is-boxed li.is-active a,.hero.is-primary-dark .tabs.is-boxed li.is-active a:hover,.hero.is-primary-dark .tabs.is-toggle li.is-active a,.hero.is-primary-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#00d1b2}.hero.is-primary-dark.is-bold{background-image:linear-gradient(141deg,#009e6c 0,#00d1b2 71%,#00e7eb 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-primary-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#009e6c 0,#00d1b2 71%,#00e7eb 100%)}}@media (prefers-color-scheme:dark){.hero.is-link-dark{background-color:#3273dc;color:#fff}.hero.is-link-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-link-dark strong{color:inherit}.hero.is-link-dark .title{color:#fff}.hero.is-link-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-link-dark .subtitle a:not(.button),.hero.is-link-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-link-dark .navbar-menu{background-color:#3273dc}}@media (prefers-color-scheme:dark){.hero.is-link-dark .navbar-item,.hero.is-link-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-link-dark .navbar-link.is-active,.hero.is-link-dark .navbar-link:hover,.hero.is-link-dark a.navbar-item.is-active,.hero.is-link-dark a.navbar-item:hover{background-color:#2366d1;color:#fff}.hero.is-link-dark .tabs a{color:#fff;opacity:.9}.hero.is-link-dark .tabs a:hover{opacity:1}.hero.is-link-dark .tabs li.is-active a{opacity:1}.hero.is-link-dark .tabs.is-boxed a,.hero.is-link-dark .tabs.is-toggle a{color:#fff}.hero.is-link-dark .tabs.is-boxed a:hover,.hero.is-link-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-link-dark .tabs.is-boxed li.is-active a,.hero.is-link-dark .tabs.is-boxed li.is-active a:hover,.hero.is-link-dark .tabs.is-toggle li.is-active a,.hero.is-link-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#3273dc}.hero.is-link-dark.is-bold{background-image:linear-gradient(141deg,#1577c6 0,#3273dc 71%,#4366e5 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-link-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#1577c6 0,#3273dc 71%,#4366e5 100%)}}@media (prefers-color-scheme:dark){.hero.is-info-dark{background-color:#209cee;color:#fff}.hero.is-info-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-info-dark strong{color:inherit}.hero.is-info-dark .title{color:#fff}.hero.is-info-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-info-dark .subtitle a:not(.button),.hero.is-info-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-info-dark .navbar-menu{background-color:#209cee}}@media (prefers-color-scheme:dark){.hero.is-info-dark .navbar-item,.hero.is-info-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-info-dark .navbar-link.is-active,.hero.is-info-dark .navbar-link:hover,.hero.is-info-dark a.navbar-item.is-active,.hero.is-info-dark a.navbar-item:hover{background-color:#118fe4;color:#fff}.hero.is-info-dark .tabs a{color:#fff;opacity:.9}.hero.is-info-dark .tabs a:hover{opacity:1}.hero.is-info-dark .tabs li.is-active a{opacity:1}.hero.is-info-dark .tabs.is-boxed a,.hero.is-info-dark .tabs.is-toggle a{color:#fff}.hero.is-info-dark .tabs.is-boxed 
a:hover,.hero.is-info-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-info-dark .tabs.is-boxed li.is-active a,.hero.is-info-dark .tabs.is-boxed li.is-active a:hover,.hero.is-info-dark .tabs.is-toggle li.is-active a,.hero.is-info-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#209cee}.hero.is-info-dark.is-bold{background-image:linear-gradient(141deg,#04a6d7 0,#209cee 71%,#3287f5 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-info-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#04a6d7 0,#209cee 71%,#3287f5 100%)}}@media (prefers-color-scheme:dark){.hero.is-success-dark{background-color:#23d160;color:#fff}.hero.is-success-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-success-dark strong{color:inherit}.hero.is-success-dark .title{color:#fff}.hero.is-success-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-success-dark .subtitle a:not(.button),.hero.is-success-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-success-dark .navbar-menu{background-color:#23d160}}@media (prefers-color-scheme:dark){.hero.is-success-dark .navbar-item,.hero.is-success-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-success-dark .navbar-link.is-active,.hero.is-success-dark .navbar-link:hover,.hero.is-success-dark a.navbar-item.is-active,.hero.is-success-dark a.navbar-item:hover{background-color:#20bc56;color:#fff}.hero.is-success-dark .tabs a{color:#fff;opacity:.9}.hero.is-success-dark .tabs a:hover{opacity:1}.hero.is-success-dark .tabs li.is-active a{opacity:1}.hero.is-success-dark .tabs.is-boxed a,.hero.is-success-dark .tabs.is-toggle a{color:#fff}.hero.is-success-dark .tabs.is-boxed a:hover,.hero.is-success-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-success-dark .tabs.is-boxed li.is-active a,.hero.is-success-dark .tabs.is-boxed li.is-active a:hover,.hero.is-success-dark .tabs.is-toggle li.is-active a,.hero.is-success-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#23d160}.hero.is-success-dark.is-bold{background-image:linear-gradient(141deg,#12af2f 0,#23d160 71%,#2ce28a 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-success-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#12af2f 0,#23d160 71%,#2ce28a 100%)}}@media (prefers-color-scheme:dark){.hero.is-warning-dark{background-color:#ffdd57;color:rgba(0,0,0,.7)}.hero.is-warning-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-warning-dark strong{color:inherit}.hero.is-warning-dark .title{color:rgba(0,0,0,.7)}.hero.is-warning-dark .subtitle{color:rgba(0,0,0,.9)}.hero.is-warning-dark .subtitle a:not(.button),.hero.is-warning-dark .subtitle strong{color:rgba(0,0,0,.7)}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-warning-dark .navbar-menu{background-color:#ffdd57}}@media (prefers-color-scheme:dark){.hero.is-warning-dark .navbar-item,.hero.is-warning-dark .navbar-link{color:rgba(0,0,0,.7)}.hero.is-warning-dark .navbar-link.is-active,.hero.is-warning-dark .navbar-link:hover,.hero.is-warning-dark a.navbar-item.is-active,.hero.is-warning-dark a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.hero.is-warning-dark .tabs a{color:rgba(0,0,0,.7);opacity:.9}.hero.is-warning-dark .tabs a:hover{opacity:1}.hero.is-warning-dark .tabs li.is-active a{opacity:1}.hero.is-warning-dark .tabs.is-boxed 
a,.hero.is-warning-dark .tabs.is-toggle a{color:rgba(0,0,0,.7)}.hero.is-warning-dark .tabs.is-boxed a:hover,.hero.is-warning-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-warning-dark .tabs.is-boxed li.is-active a,.hero.is-warning-dark .tabs.is-boxed li.is-active a:hover,.hero.is-warning-dark .tabs.is-toggle li.is-active a,.hero.is-warning-dark .tabs.is-toggle li.is-active a:hover{background-color:rgba(0,0,0,.7);border-color:rgba(0,0,0,.7);color:#ffdd57}.hero.is-warning-dark.is-bold{background-image:linear-gradient(141deg,#ffaf24 0,#ffdd57 71%,#fffa70 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-warning-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#ffaf24 0,#ffdd57 71%,#fffa70 100%)}}@media (prefers-color-scheme:dark){.hero.is-danger-dark{background-color:#ff3860;color:#fff}.hero.is-danger-dark a:not(.button):not(.dropdown-item):not(.tag),.hero.is-danger-dark strong{color:inherit}.hero.is-danger-dark .title{color:#fff}.hero.is-danger-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-danger-dark .subtitle a:not(.button),.hero.is-danger-dark .subtitle strong{color:#fff}}@media screen and (prefers-color-scheme:dark) and (max-width:1087px){.hero.is-danger-dark .navbar-menu{background-color:#ff3860}}@media (prefers-color-scheme:dark){.hero.is-danger-dark .navbar-item,.hero.is-danger-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-danger-dark .navbar-link.is-active,.hero.is-danger-dark .navbar-link:hover,.hero.is-danger-dark a.navbar-item.is-active,.hero.is-danger-dark a.navbar-item:hover{background-color:#ff1f4b;color:#fff}.hero.is-danger-dark .tabs a{color:#fff;opacity:.9}.hero.is-danger-dark .tabs a:hover{opacity:1}.hero.is-danger-dark .tabs li.is-active a{opacity:1}.hero.is-danger-dark .tabs.is-boxed a,.hero.is-danger-dark .tabs.is-toggle a{color:#fff}.hero.is-danger-dark .tabs.is-boxed a:hover,.hero.is-danger-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-danger-dark .tabs.is-boxed li.is-active a,.hero.is-danger-dark .tabs.is-boxed li.is-active a:hover,.hero.is-danger-dark .tabs.is-toggle li.is-active a,.hero.is-danger-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#ff3860}.hero.is-danger-dark.is-bold{background-image:linear-gradient(141deg,#ff0561 0,#ff3860 71%,#ff5257 100%)}}@media screen and (prefers-color-scheme:dark) and (max-width:768px){.hero.is-danger-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#ff0561 0,#ff3860 71%,#ff5257 100%)}}@media (prefers-color-scheme:dark){.footer{background-color:#121212}}
diff --git a/http-ui/public/bulma.min.css b/http-ui/public/bulma.min.css
deleted file mode 100644
index 8340d7fba..000000000
--- a/http-ui/public/bulma.min.css
+++ /dev/null
@@ -1 +0,0 @@
-/*!
bulma.io v0.9.0 | MIT License | github.com/jgthms/bulma */@-webkit-keyframes spinAround{from{transform:rotate(0)}to{transform:rotate(359deg)}}@keyframes spinAround{from{transform:rotate(0)}to{transform:rotate(359deg)}}.breadcrumb,.button,.delete,.file,.is-unselectable,.modal-close,.pagination-ellipsis,.pagination-link,.pagination-next,.pagination-previous,.tabs{-webkit-touch-callout:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none}.navbar-link:not(.is-arrowless)::after,.select:not(.is-multiple):not(.is-loading)::after{border:3px solid transparent;border-radius:2px;border-right:0;border-top:0;content:" ";display:block;height:.625em;margin-top:-.4375em;pointer-events:none;position:absolute;top:50%;transform:rotate(-45deg);transform-origin:center;width:.625em}.block:not(:last-child),.box:not(:last-child),.breadcrumb:not(:last-child),.content:not(:last-child),.highlight:not(:last-child),.level:not(:last-child),.message:not(:last-child),.notification:not(:last-child),.pagination:not(:last-child),.progress:not(:last-child),.subtitle:not(:last-child),.table-container:not(:last-child),.table:not(:last-child),.tabs:not(:last-child),.title:not(:last-child){margin-bottom:1.5rem}.delete,.modal-close{-moz-appearance:none;-webkit-appearance:none;background-color:rgba(10,10,10,.2);border:none;border-radius:290486px;cursor:pointer;pointer-events:auto;display:inline-block;flex-grow:0;flex-shrink:0;font-size:0;height:20px;max-height:20px;max-width:20px;min-height:20px;min-width:20px;outline:0;position:relative;vertical-align:top;width:20px}.delete::after,.delete::before,.modal-close::after,.modal-close::before{background-color:#fff;content:"";display:block;left:50%;position:absolute;top:50%;transform:translateX(-50%) translateY(-50%) rotate(45deg);transform-origin:center center}.delete::before,.modal-close::before{height:2px;width:50%}.delete::after,.modal-close::after{height:50%;width:2px}.delete:focus,.delete:hover,.modal-close:focus,.modal-close:hover{background-color:rgba(10,10,10,.3)}.delete:active,.modal-close:active{background-color:rgba(10,10,10,.4)}.is-small.delete,.is-small.modal-close{height:16px;max-height:16px;max-width:16px;min-height:16px;min-width:16px;width:16px}.is-medium.delete,.is-medium.modal-close{height:24px;max-height:24px;max-width:24px;min-height:24px;min-width:24px;width:24px}.is-large.delete,.is-large.modal-close{height:32px;max-height:32px;max-width:32px;min-height:32px;min-width:32px;width:32px}.button.is-loading::after,.control.is-loading::after,.loader,.select.is-loading::after{-webkit-animation:spinAround .5s infinite linear;animation:spinAround .5s infinite linear;border:2px solid #dbdbdb;border-radius:290486px;border-right-color:transparent;border-top-color:transparent;content:"";display:block;height:1em;position:relative;width:1em}.hero-video,.image.is-16by9 .has-ratio,.image.is-16by9 img,.image.is-1by1 .has-ratio,.image.is-1by1 img,.image.is-1by2 .has-ratio,.image.is-1by2 img,.image.is-1by3 .has-ratio,.image.is-1by3 img,.image.is-2by1 .has-ratio,.image.is-2by1 img,.image.is-2by3 .has-ratio,.image.is-2by3 img,.image.is-3by1 .has-ratio,.image.is-3by1 img,.image.is-3by2 .has-ratio,.image.is-3by2 img,.image.is-3by4 .has-ratio,.image.is-3by4 img,.image.is-3by5 .has-ratio,.image.is-3by5 img,.image.is-4by3 .has-ratio,.image.is-4by3 img,.image.is-4by5 .has-ratio,.image.is-4by5 img,.image.is-5by3 .has-ratio,.image.is-5by3 img,.image.is-5by4 .has-ratio,.image.is-5by4 img,.image.is-9by16 .has-ratio,.image.is-9by16 img,.image.is-square 
.has-ratio,.image.is-square img,.is-overlay,.modal,.modal-background{bottom:0;left:0;position:absolute;right:0;top:0}.button,.file-cta,.file-name,.input,.pagination-ellipsis,.pagination-link,.pagination-next,.pagination-previous,.select select,.textarea{-moz-appearance:none;-webkit-appearance:none;align-items:center;border:1px solid transparent;border-radius:4px;box-shadow:none;display:inline-flex;font-size:1rem;height:2.5em;justify-content:flex-start;line-height:1.5;padding-bottom:calc(.5em - 1px);padding-left:calc(.75em - 1px);padding-right:calc(.75em - 1px);padding-top:calc(.5em - 1px);position:relative;vertical-align:top}.button:active,.button:focus,.file-cta:active,.file-cta:focus,.file-name:active,.file-name:focus,.input:active,.input:focus,.is-active.button,.is-active.file-cta,.is-active.file-name,.is-active.input,.is-active.pagination-ellipsis,.is-active.pagination-link,.is-active.pagination-next,.is-active.pagination-previous,.is-active.textarea,.is-focused.button,.is-focused.file-cta,.is-focused.file-name,.is-focused.input,.is-focused.pagination-ellipsis,.is-focused.pagination-link,.is-focused.pagination-next,.is-focused.pagination-previous,.is-focused.textarea,.pagination-ellipsis:active,.pagination-ellipsis:focus,.pagination-link:active,.pagination-link:focus,.pagination-next:active,.pagination-next:focus,.pagination-previous:active,.pagination-previous:focus,.select select.is-active,.select select.is-focused,.select select:active,.select select:focus,.textarea:active,.textarea:focus{outline:0}.button[disabled],.file-cta[disabled],.file-name[disabled],.input[disabled],.pagination-ellipsis[disabled],.pagination-link[disabled],.pagination-next[disabled],.pagination-previous[disabled],.select fieldset[disabled] select,.select select[disabled],.textarea[disabled],fieldset[disabled] .button,fieldset[disabled] .file-cta,fieldset[disabled] .file-name,fieldset[disabled] .input,fieldset[disabled] .pagination-ellipsis,fieldset[disabled] .pagination-link,fieldset[disabled] .pagination-next,fieldset[disabled] .pagination-previous,fieldset[disabled] .select select,fieldset[disabled] .textarea{cursor:not-allowed}/*! 
minireset.css v0.0.6 | MIT License | github.com/jgthms/minireset.css */blockquote,body,dd,dl,dt,fieldset,figure,h1,h2,h3,h4,h5,h6,hr,html,iframe,legend,li,ol,p,pre,textarea,ul{margin:0;padding:0}h1,h2,h3,h4,h5,h6{font-size:100%;font-weight:400}ul{list-style:none}button,input,select,textarea{margin:0}html{box-sizing:border-box}*,::after,::before{box-sizing:inherit}img,video{height:auto;max-width:100%}iframe{border:0}table{border-collapse:collapse;border-spacing:0}td,th{padding:0}td:not([align]),th:not([align]){text-align:inherit}html{background-color:#fff;font-size:16px;-moz-osx-font-smoothing:grayscale;-webkit-font-smoothing:antialiased;min-width:300px;overflow-x:hidden;overflow-y:scroll;text-rendering:optimizeLegibility;-webkit-text-size-adjust:100%;-moz-text-size-adjust:100%;-ms-text-size-adjust:100%;text-size-adjust:100%}article,aside,figure,footer,header,hgroup,section{display:block}body,button,input,select,textarea{font-family:BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif}code,pre{-moz-osx-font-smoothing:auto;-webkit-font-smoothing:auto;font-family:monospace}body{color:#4a4a4a;font-size:1em;font-weight:400;line-height:1.5}a{color:#3273dc;cursor:pointer;text-decoration:none}a strong{color:currentColor}a:hover{color:#363636}code{background-color:#f5f5f5;color:#f14668;font-size:.875em;font-weight:400;padding:.25em .5em .25em}hr{background-color:#f5f5f5;border:none;display:block;height:2px;margin:1.5rem 0}img{height:auto;max-width:100%}input[type=checkbox],input[type=radio]{vertical-align:baseline}small{font-size:.875em}span{font-style:inherit;font-weight:inherit}strong{color:#363636;font-weight:700}fieldset{border:none}pre{-webkit-overflow-scrolling:touch;background-color:#f5f5f5;color:#4a4a4a;font-size:.875em;overflow-x:auto;padding:1.25rem 1.5rem;white-space:pre;word-wrap:normal}pre code{background-color:transparent;color:currentColor;font-size:1em;padding:0}table td,table th{vertical-align:top}table td:not([align]),table th:not([align]){text-align:inherit}table th{color:#363636}.box{background-color:#fff;border-radius:6px;box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.02);color:#4a4a4a;display:block;padding:1.25rem}a.box:focus,a.box:hover{box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px #3273dc}a.box:active{box-shadow:inset 0 1px 2px rgba(10,10,10,.2),0 0 0 1px #3273dc}.button{background-color:#fff;border-color:#dbdbdb;border-width:1px;color:#363636;cursor:pointer;justify-content:center;padding-bottom:calc(.5em - 1px);padding-left:1em;padding-right:1em;padding-top:calc(.5em - 1px);text-align:center;white-space:nowrap}.button strong{color:inherit}.button .icon,.button .icon.is-large,.button .icon.is-medium,.button .icon.is-small{height:1.5em;width:1.5em}.button .icon:first-child:not(:last-child){margin-right:calc(-.5em - 1px);margin-left:.25em}.button .icon:last-child:not(:first-child){margin-right:.25em;margin-left:calc(-.5em - 1px)}.button .icon:first-child:last-child{margin-left:calc(-.5em - 1px);margin-right:calc(-.5em - 1px)}.button.is-hovered,.button:hover{border-color:#b5b5b5;color:#363636}.button.is-focused,.button:focus{border-color:#3273dc;color:#363636}.button.is-focused:not(:active),.button:focus:not(:active){box-shadow:0 0 0 .125em 
rgba(50,115,220,.25)}.button.is-active,.button:active{border-color:#4a4a4a;color:#363636}.button.is-text{background-color:transparent;border-color:transparent;color:#4a4a4a;text-decoration:underline}.button.is-text.is-focused,.button.is-text.is-hovered,.button.is-text:focus,.button.is-text:hover{background-color:#f5f5f5;color:#363636}.button.is-text.is-active,.button.is-text:active{background-color:#e8e8e8;color:#363636}.button.is-text[disabled],fieldset[disabled] .button.is-text{background-color:transparent;border-color:transparent;box-shadow:none}.button.is-white{background-color:#fff;border-color:transparent;color:#0a0a0a}.button.is-white.is-hovered,.button.is-white:hover{background-color:#f9f9f9;border-color:transparent;color:#0a0a0a}.button.is-white.is-focused,.button.is-white:focus{border-color:transparent;color:#0a0a0a}.button.is-white.is-focused:not(:active),.button.is-white:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.button.is-white.is-active,.button.is-white:active{background-color:#f2f2f2;border-color:transparent;color:#0a0a0a}.button.is-white[disabled],fieldset[disabled] .button.is-white{background-color:#fff;border-color:transparent;box-shadow:none}.button.is-white.is-inverted{background-color:#0a0a0a;color:#fff}.button.is-white.is-inverted.is-hovered,.button.is-white.is-inverted:hover{background-color:#000}.button.is-white.is-inverted[disabled],fieldset[disabled] .button.is-white.is-inverted{background-color:#0a0a0a;border-color:transparent;box-shadow:none;color:#fff}.button.is-white.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-white.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-white.is-outlined.is-focused,.button.is-white.is-outlined.is-hovered,.button.is-white.is-outlined:focus,.button.is-white.is-outlined:hover{background-color:#fff;border-color:#fff;color:#0a0a0a}.button.is-white.is-outlined.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-white.is-outlined.is-loading.is-focused::after,.button.is-white.is-outlined.is-loading.is-hovered::after,.button.is-white.is-outlined.is-loading:focus::after,.button.is-white.is-outlined.is-loading:hover::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-white.is-outlined[disabled],fieldset[disabled] .button.is-white.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-white.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-white.is-inverted.is-outlined.is-focused,.button.is-white.is-inverted.is-outlined.is-hovered,.button.is-white.is-inverted.is-outlined:focus,.button.is-white.is-inverted.is-outlined:hover{background-color:#0a0a0a;color:#fff}.button.is-white.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-white.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-white.is-inverted.is-outlined.is-loading:focus::after,.button.is-white.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-white.is-inverted.is-outlined[disabled],fieldset[disabled] 
.button.is-white.is-inverted.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black{background-color:#0a0a0a;border-color:transparent;color:#fff}.button.is-black.is-hovered,.button.is-black:hover{background-color:#040404;border-color:transparent;color:#fff}.button.is-black.is-focused,.button.is-black:focus{border-color:transparent;color:#fff}.button.is-black.is-focused:not(:active),.button.is-black:focus:not(:active){box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.button.is-black.is-active,.button.is-black:active{background-color:#000;border-color:transparent;color:#fff}.button.is-black[disabled],fieldset[disabled] .button.is-black{background-color:#0a0a0a;border-color:transparent;box-shadow:none}.button.is-black.is-inverted{background-color:#fff;color:#0a0a0a}.button.is-black.is-inverted.is-hovered,.button.is-black.is-inverted:hover{background-color:#f2f2f2}.button.is-black.is-inverted[disabled],fieldset[disabled] .button.is-black.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#0a0a0a}.button.is-black.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-black.is-outlined{background-color:transparent;border-color:#0a0a0a;color:#0a0a0a}.button.is-black.is-outlined.is-focused,.button.is-black.is-outlined.is-hovered,.button.is-black.is-outlined:focus,.button.is-black.is-outlined:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.button.is-black.is-outlined.is-loading::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-black.is-outlined.is-loading.is-focused::after,.button.is-black.is-outlined.is-loading.is-hovered::after,.button.is-black.is-outlined.is-loading:focus::after,.button.is-black.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-black.is-outlined[disabled],fieldset[disabled] .button.is-black.is-outlined{background-color:transparent;border-color:#0a0a0a;box-shadow:none;color:#0a0a0a}.button.is-black.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-black.is-inverted.is-outlined.is-focused,.button.is-black.is-inverted.is-outlined.is-hovered,.button.is-black.is-inverted.is-outlined:focus,.button.is-black.is-inverted.is-outlined:hover{background-color:#fff;color:#0a0a0a}.button.is-black.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-black.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-black.is-inverted.is-outlined.is-loading:focus::after,.button.is-black.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #0a0a0a #0a0a0a!important}.button.is-black.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-black.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-light{background-color:#f5f5f5;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-light.is-hovered,.button.is-light:hover{background-color:#eee;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-light.is-focused,.button.is-light:focus{border-color:transparent;color:rgba(0,0,0,.7)}.button.is-light.is-focused:not(:active),.button.is-light:focus:not(:active){box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.button.is-light.is-active,.button.is-light:active{background-color:#e8e8e8;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-light[disabled],fieldset[disabled] 
.button.is-light{background-color:#f5f5f5;border-color:transparent;box-shadow:none}.button.is-light.is-inverted{background-color:rgba(0,0,0,.7);color:#f5f5f5}.button.is-light.is-inverted.is-hovered,.button.is-light.is-inverted:hover{background-color:rgba(0,0,0,.7)}.button.is-light.is-inverted[disabled],fieldset[disabled] .button.is-light.is-inverted{background-color:rgba(0,0,0,.7);border-color:transparent;box-shadow:none;color:#f5f5f5}.button.is-light.is-loading::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-light.is-outlined{background-color:transparent;border-color:#f5f5f5;color:#f5f5f5}.button.is-light.is-outlined.is-focused,.button.is-light.is-outlined.is-hovered,.button.is-light.is-outlined:focus,.button.is-light.is-outlined:hover{background-color:#f5f5f5;border-color:#f5f5f5;color:rgba(0,0,0,.7)}.button.is-light.is-outlined.is-loading::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-light.is-outlined.is-loading.is-focused::after,.button.is-light.is-outlined.is-loading.is-hovered::after,.button.is-light.is-outlined.is-loading:focus::after,.button.is-light.is-outlined.is-loading:hover::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-light.is-outlined[disabled],fieldset[disabled] .button.is-light.is-outlined{background-color:transparent;border-color:#f5f5f5;box-shadow:none;color:#f5f5f5}.button.is-light.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);color:rgba(0,0,0,.7)}.button.is-light.is-inverted.is-outlined.is-focused,.button.is-light.is-inverted.is-outlined.is-hovered,.button.is-light.is-inverted.is-outlined:focus,.button.is-light.is-inverted.is-outlined:hover{background-color:rgba(0,0,0,.7);color:#f5f5f5}.button.is-light.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-light.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-light.is-inverted.is-outlined.is-loading:focus::after,.button.is-light.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #f5f5f5 #f5f5f5!important}.button.is-light.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-light.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);box-shadow:none;color:rgba(0,0,0,.7)}.button.is-dark{background-color:#363636;border-color:transparent;color:#fff}.button.is-dark.is-hovered,.button.is-dark:hover{background-color:#2f2f2f;border-color:transparent;color:#fff}.button.is-dark.is-focused,.button.is-dark:focus{border-color:transparent;color:#fff}.button.is-dark.is-focused:not(:active),.button.is-dark:focus:not(:active){box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.button.is-dark.is-active,.button.is-dark:active{background-color:#292929;border-color:transparent;color:#fff}.button.is-dark[disabled],fieldset[disabled] .button.is-dark{background-color:#363636;border-color:transparent;box-shadow:none}.button.is-dark.is-inverted{background-color:#fff;color:#363636}.button.is-dark.is-inverted.is-hovered,.button.is-dark.is-inverted:hover{background-color:#f2f2f2}.button.is-dark.is-inverted[disabled],fieldset[disabled] .button.is-dark.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#363636}.button.is-dark.is-loading::after{border-color:transparent transparent #fff 
#fff!important}.button.is-dark.is-outlined{background-color:transparent;border-color:#363636;color:#363636}.button.is-dark.is-outlined.is-focused,.button.is-dark.is-outlined.is-hovered,.button.is-dark.is-outlined:focus,.button.is-dark.is-outlined:hover{background-color:#363636;border-color:#363636;color:#fff}.button.is-dark.is-outlined.is-loading::after{border-color:transparent transparent #363636 #363636!important}.button.is-dark.is-outlined.is-loading.is-focused::after,.button.is-dark.is-outlined.is-loading.is-hovered::after,.button.is-dark.is-outlined.is-loading:focus::after,.button.is-dark.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-dark.is-outlined[disabled],fieldset[disabled] .button.is-dark.is-outlined{background-color:transparent;border-color:#363636;box-shadow:none;color:#363636}.button.is-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-dark.is-inverted.is-outlined.is-focused,.button.is-dark.is-inverted.is-outlined.is-hovered,.button.is-dark.is-inverted.is-outlined:focus,.button.is-dark.is-inverted.is-outlined:hover{background-color:#fff;color:#363636}.button.is-dark.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-dark.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-dark.is-inverted.is-outlined.is-loading:focus::after,.button.is-dark.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #363636 #363636!important}.button.is-dark.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-dark.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-primary{background-color:#00d1b2;border-color:transparent;color:#fff}.button.is-primary.is-hovered,.button.is-primary:hover{background-color:#00c4a7;border-color:transparent;color:#fff}.button.is-primary.is-focused,.button.is-primary:focus{border-color:transparent;color:#fff}.button.is-primary.is-focused:not(:active),.button.is-primary:focus:not(:active){box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.button.is-primary.is-active,.button.is-primary:active{background-color:#00b89c;border-color:transparent;color:#fff}.button.is-primary[disabled],fieldset[disabled] .button.is-primary{background-color:#00d1b2;border-color:transparent;box-shadow:none}.button.is-primary.is-inverted{background-color:#fff;color:#00d1b2}.button.is-primary.is-inverted.is-hovered,.button.is-primary.is-inverted:hover{background-color:#f2f2f2}.button.is-primary.is-inverted[disabled],fieldset[disabled] .button.is-primary.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#00d1b2}.button.is-primary.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-primary.is-outlined{background-color:transparent;border-color:#00d1b2;color:#00d1b2}.button.is-primary.is-outlined.is-focused,.button.is-primary.is-outlined.is-hovered,.button.is-primary.is-outlined:focus,.button.is-primary.is-outlined:hover{background-color:#00d1b2;border-color:#00d1b2;color:#fff}.button.is-primary.is-outlined.is-loading::after{border-color:transparent transparent #00d1b2 #00d1b2!important}.button.is-primary.is-outlined.is-loading.is-focused::after,.button.is-primary.is-outlined.is-loading.is-hovered::after,.button.is-primary.is-outlined.is-loading:focus::after,.button.is-primary.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff 
#fff!important}.button.is-primary.is-outlined[disabled],fieldset[disabled] .button.is-primary.is-outlined{background-color:transparent;border-color:#00d1b2;box-shadow:none;color:#00d1b2}.button.is-primary.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-primary.is-inverted.is-outlined.is-focused,.button.is-primary.is-inverted.is-outlined.is-hovered,.button.is-primary.is-inverted.is-outlined:focus,.button.is-primary.is-inverted.is-outlined:hover{background-color:#fff;color:#00d1b2}.button.is-primary.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-primary.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-primary.is-inverted.is-outlined.is-loading:focus::after,.button.is-primary.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #00d1b2 #00d1b2!important}.button.is-primary.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-primary.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-primary.is-light{background-color:#ebfffc;color:#00947e}.button.is-primary.is-light.is-hovered,.button.is-primary.is-light:hover{background-color:#defffa;border-color:transparent;color:#00947e}.button.is-primary.is-light.is-active,.button.is-primary.is-light:active{background-color:#d1fff8;border-color:transparent;color:#00947e}.button.is-link{background-color:#3273dc;border-color:transparent;color:#fff}.button.is-link.is-hovered,.button.is-link:hover{background-color:#276cda;border-color:transparent;color:#fff}.button.is-link.is-focused,.button.is-link:focus{border-color:transparent;color:#fff}.button.is-link.is-focused:not(:active),.button.is-link:focus:not(:active){box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.button.is-link.is-active,.button.is-link:active{background-color:#2366d1;border-color:transparent;color:#fff}.button.is-link[disabled],fieldset[disabled] .button.is-link{background-color:#3273dc;border-color:transparent;box-shadow:none}.button.is-link.is-inverted{background-color:#fff;color:#3273dc}.button.is-link.is-inverted.is-hovered,.button.is-link.is-inverted:hover{background-color:#f2f2f2}.button.is-link.is-inverted[disabled],fieldset[disabled] .button.is-link.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#3273dc}.button.is-link.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-link.is-outlined{background-color:transparent;border-color:#3273dc;color:#3273dc}.button.is-link.is-outlined.is-focused,.button.is-link.is-outlined.is-hovered,.button.is-link.is-outlined:focus,.button.is-link.is-outlined:hover{background-color:#3273dc;border-color:#3273dc;color:#fff}.button.is-link.is-outlined.is-loading::after{border-color:transparent transparent #3273dc #3273dc!important}.button.is-link.is-outlined.is-loading.is-focused::after,.button.is-link.is-outlined.is-loading.is-hovered::after,.button.is-link.is-outlined.is-loading:focus::after,.button.is-link.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-link.is-outlined[disabled],fieldset[disabled] 
.button.is-link.is-outlined{background-color:transparent;border-color:#3273dc;box-shadow:none;color:#3273dc}.button.is-link.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-link.is-inverted.is-outlined.is-focused,.button.is-link.is-inverted.is-outlined.is-hovered,.button.is-link.is-inverted.is-outlined:focus,.button.is-link.is-inverted.is-outlined:hover{background-color:#fff;color:#3273dc}.button.is-link.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-link.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-link.is-inverted.is-outlined.is-loading:focus::after,.button.is-link.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #3273dc #3273dc!important}.button.is-link.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-link.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-link.is-light{background-color:#eef3fc;color:#2160c4}.button.is-link.is-light.is-hovered,.button.is-link.is-light:hover{background-color:#e3ecfa;border-color:transparent;color:#2160c4}.button.is-link.is-light.is-active,.button.is-link.is-light:active{background-color:#d8e4f8;border-color:transparent;color:#2160c4}.button.is-info{background-color:#3298dc;border-color:transparent;color:#fff}.button.is-info.is-hovered,.button.is-info:hover{background-color:#2793da;border-color:transparent;color:#fff}.button.is-info.is-focused,.button.is-info:focus{border-color:transparent;color:#fff}.button.is-info.is-focused:not(:active),.button.is-info:focus:not(:active){box-shadow:0 0 0 .125em rgba(50,152,220,.25)}.button.is-info.is-active,.button.is-info:active{background-color:#238cd1;border-color:transparent;color:#fff}.button.is-info[disabled],fieldset[disabled] .button.is-info{background-color:#3298dc;border-color:transparent;box-shadow:none}.button.is-info.is-inverted{background-color:#fff;color:#3298dc}.button.is-info.is-inverted.is-hovered,.button.is-info.is-inverted:hover{background-color:#f2f2f2}.button.is-info.is-inverted[disabled],fieldset[disabled] .button.is-info.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#3298dc}.button.is-info.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-info.is-outlined{background-color:transparent;border-color:#3298dc;color:#3298dc}.button.is-info.is-outlined.is-focused,.button.is-info.is-outlined.is-hovered,.button.is-info.is-outlined:focus,.button.is-info.is-outlined:hover{background-color:#3298dc;border-color:#3298dc;color:#fff}.button.is-info.is-outlined.is-loading::after{border-color:transparent transparent #3298dc #3298dc!important}.button.is-info.is-outlined.is-loading.is-focused::after,.button.is-info.is-outlined.is-loading.is-hovered::after,.button.is-info.is-outlined.is-loading:focus::after,.button.is-info.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-info.is-outlined[disabled],fieldset[disabled] 
.button.is-info.is-outlined{background-color:transparent;border-color:#3298dc;box-shadow:none;color:#3298dc}.button.is-info.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-info.is-inverted.is-outlined.is-focused,.button.is-info.is-inverted.is-outlined.is-hovered,.button.is-info.is-inverted.is-outlined:focus,.button.is-info.is-inverted.is-outlined:hover{background-color:#fff;color:#3298dc}.button.is-info.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-info.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-info.is-inverted.is-outlined.is-loading:focus::after,.button.is-info.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #3298dc #3298dc!important}.button.is-info.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-info.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-info.is-light{background-color:#eef6fc;color:#1d72aa}.button.is-info.is-light.is-hovered,.button.is-info.is-light:hover{background-color:#e3f1fa;border-color:transparent;color:#1d72aa}.button.is-info.is-light.is-active,.button.is-info.is-light:active{background-color:#d8ebf8;border-color:transparent;color:#1d72aa}.button.is-success{background-color:#48c774;border-color:transparent;color:#fff}.button.is-success.is-hovered,.button.is-success:hover{background-color:#3ec46d;border-color:transparent;color:#fff}.button.is-success.is-focused,.button.is-success:focus{border-color:transparent;color:#fff}.button.is-success.is-focused:not(:active),.button.is-success:focus:not(:active){box-shadow:0 0 0 .125em rgba(72,199,116,.25)}.button.is-success.is-active,.button.is-success:active{background-color:#3abb67;border-color:transparent;color:#fff}.button.is-success[disabled],fieldset[disabled] .button.is-success{background-color:#48c774;border-color:transparent;box-shadow:none}.button.is-success.is-inverted{background-color:#fff;color:#48c774}.button.is-success.is-inverted.is-hovered,.button.is-success.is-inverted:hover{background-color:#f2f2f2}.button.is-success.is-inverted[disabled],fieldset[disabled] .button.is-success.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#48c774}.button.is-success.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-success.is-outlined{background-color:transparent;border-color:#48c774;color:#48c774}.button.is-success.is-outlined.is-focused,.button.is-success.is-outlined.is-hovered,.button.is-success.is-outlined:focus,.button.is-success.is-outlined:hover{background-color:#48c774;border-color:#48c774;color:#fff}.button.is-success.is-outlined.is-loading::after{border-color:transparent transparent #48c774 #48c774!important}.button.is-success.is-outlined.is-loading.is-focused::after,.button.is-success.is-outlined.is-loading.is-hovered::after,.button.is-success.is-outlined.is-loading:focus::after,.button.is-success.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-success.is-outlined[disabled],fieldset[disabled] 
.button.is-success.is-outlined{background-color:transparent;border-color:#48c774;box-shadow:none;color:#48c774}.button.is-success.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-success.is-inverted.is-outlined.is-focused,.button.is-success.is-inverted.is-outlined.is-hovered,.button.is-success.is-inverted.is-outlined:focus,.button.is-success.is-inverted.is-outlined:hover{background-color:#fff;color:#48c774}.button.is-success.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-success.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-success.is-inverted.is-outlined.is-loading:focus::after,.button.is-success.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #48c774 #48c774!important}.button.is-success.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-success.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-success.is-light{background-color:#effaf3;color:#257942}.button.is-success.is-light.is-hovered,.button.is-success.is-light:hover{background-color:#e6f7ec;border-color:transparent;color:#257942}.button.is-success.is-light.is-active,.button.is-success.is-light:active{background-color:#dcf4e4;border-color:transparent;color:#257942}.button.is-warning{background-color:#ffdd57;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-hovered,.button.is-warning:hover{background-color:#ffdb4a;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-focused,.button.is-warning:focus{border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning.is-focused:not(:active),.button.is-warning:focus:not(:active){box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.button.is-warning.is-active,.button.is-warning:active{background-color:#ffd83d;border-color:transparent;color:rgba(0,0,0,.7)}.button.is-warning[disabled],fieldset[disabled] .button.is-warning{background-color:#ffdd57;border-color:transparent;box-shadow:none}.button.is-warning.is-inverted{background-color:rgba(0,0,0,.7);color:#ffdd57}.button.is-warning.is-inverted.is-hovered,.button.is-warning.is-inverted:hover{background-color:rgba(0,0,0,.7)}.button.is-warning.is-inverted[disabled],fieldset[disabled] .button.is-warning.is-inverted{background-color:rgba(0,0,0,.7);border-color:transparent;box-shadow:none;color:#ffdd57}.button.is-warning.is-loading::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-warning.is-outlined{background-color:transparent;border-color:#ffdd57;color:#ffdd57}.button.is-warning.is-outlined.is-focused,.button.is-warning.is-outlined.is-hovered,.button.is-warning.is-outlined:focus,.button.is-warning.is-outlined:hover{background-color:#ffdd57;border-color:#ffdd57;color:rgba(0,0,0,.7)}.button.is-warning.is-outlined.is-loading::after{border-color:transparent transparent #ffdd57 #ffdd57!important}.button.is-warning.is-outlined.is-loading.is-focused::after,.button.is-warning.is-outlined.is-loading.is-hovered::after,.button.is-warning.is-outlined.is-loading:focus::after,.button.is-warning.is-outlined.is-loading:hover::after{border-color:transparent transparent rgba(0,0,0,.7) rgba(0,0,0,.7)!important}.button.is-warning.is-outlined[disabled],fieldset[disabled] 
.button.is-warning.is-outlined{background-color:transparent;border-color:#ffdd57;box-shadow:none;color:#ffdd57}.button.is-warning.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);color:rgba(0,0,0,.7)}.button.is-warning.is-inverted.is-outlined.is-focused,.button.is-warning.is-inverted.is-outlined.is-hovered,.button.is-warning.is-inverted.is-outlined:focus,.button.is-warning.is-inverted.is-outlined:hover{background-color:rgba(0,0,0,.7);color:#ffdd57}.button.is-warning.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-warning.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-warning.is-inverted.is-outlined.is-loading:focus::after,.button.is-warning.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #ffdd57 #ffdd57!important}.button.is-warning.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-warning.is-inverted.is-outlined{background-color:transparent;border-color:rgba(0,0,0,.7);box-shadow:none;color:rgba(0,0,0,.7)}.button.is-warning.is-light{background-color:#fffbeb;color:#947600}.button.is-warning.is-light.is-hovered,.button.is-warning.is-light:hover{background-color:#fff8de;border-color:transparent;color:#947600}.button.is-warning.is-light.is-active,.button.is-warning.is-light:active{background-color:#fff6d1;border-color:transparent;color:#947600}.button.is-danger{background-color:#f14668;border-color:transparent;color:#fff}.button.is-danger.is-hovered,.button.is-danger:hover{background-color:#f03a5f;border-color:transparent;color:#fff}.button.is-danger.is-focused,.button.is-danger:focus{border-color:transparent;color:#fff}.button.is-danger.is-focused:not(:active),.button.is-danger:focus:not(:active){box-shadow:0 0 0 .125em rgba(241,70,104,.25)}.button.is-danger.is-active,.button.is-danger:active{background-color:#ef2e55;border-color:transparent;color:#fff}.button.is-danger[disabled],fieldset[disabled] .button.is-danger{background-color:#f14668;border-color:transparent;box-shadow:none}.button.is-danger.is-inverted{background-color:#fff;color:#f14668}.button.is-danger.is-inverted.is-hovered,.button.is-danger.is-inverted:hover{background-color:#f2f2f2}.button.is-danger.is-inverted[disabled],fieldset[disabled] .button.is-danger.is-inverted{background-color:#fff;border-color:transparent;box-shadow:none;color:#f14668}.button.is-danger.is-loading::after{border-color:transparent transparent #fff #fff!important}.button.is-danger.is-outlined{background-color:transparent;border-color:#f14668;color:#f14668}.button.is-danger.is-outlined.is-focused,.button.is-danger.is-outlined.is-hovered,.button.is-danger.is-outlined:focus,.button.is-danger.is-outlined:hover{background-color:#f14668;border-color:#f14668;color:#fff}.button.is-danger.is-outlined.is-loading::after{border-color:transparent transparent #f14668 #f14668!important}.button.is-danger.is-outlined.is-loading.is-focused::after,.button.is-danger.is-outlined.is-loading.is-hovered::after,.button.is-danger.is-outlined.is-loading:focus::after,.button.is-danger.is-outlined.is-loading:hover::after{border-color:transparent transparent #fff #fff!important}.button.is-danger.is-outlined[disabled],fieldset[disabled] 
.button.is-danger.is-outlined{background-color:transparent;border-color:#f14668;box-shadow:none;color:#f14668}.button.is-danger.is-inverted.is-outlined{background-color:transparent;border-color:#fff;color:#fff}.button.is-danger.is-inverted.is-outlined.is-focused,.button.is-danger.is-inverted.is-outlined.is-hovered,.button.is-danger.is-inverted.is-outlined:focus,.button.is-danger.is-inverted.is-outlined:hover{background-color:#fff;color:#f14668}.button.is-danger.is-inverted.is-outlined.is-loading.is-focused::after,.button.is-danger.is-inverted.is-outlined.is-loading.is-hovered::after,.button.is-danger.is-inverted.is-outlined.is-loading:focus::after,.button.is-danger.is-inverted.is-outlined.is-loading:hover::after{border-color:transparent transparent #f14668 #f14668!important}.button.is-danger.is-inverted.is-outlined[disabled],fieldset[disabled] .button.is-danger.is-inverted.is-outlined{background-color:transparent;border-color:#fff;box-shadow:none;color:#fff}.button.is-danger.is-light{background-color:#feecf0;color:#cc0f35}.button.is-danger.is-light.is-hovered,.button.is-danger.is-light:hover{background-color:#fde0e6;border-color:transparent;color:#cc0f35}.button.is-danger.is-light.is-active,.button.is-danger.is-light:active{background-color:#fcd4dc;border-color:transparent;color:#cc0f35}.button.is-small{border-radius:2px;font-size:.75rem}.button.is-normal{font-size:1rem}.button.is-medium{font-size:1.25rem}.button.is-large{font-size:1.5rem}.button[disabled],fieldset[disabled] .button{background-color:#fff;border-color:#dbdbdb;box-shadow:none;opacity:.5}.button.is-fullwidth{display:flex;width:100%}.button.is-loading{color:transparent!important;pointer-events:none}.button.is-loading::after{position:absolute;left:calc(50% - (1em / 2));top:calc(50% - (1em / 2));position:absolute!important}.button.is-static{background-color:#f5f5f5;border-color:#dbdbdb;color:#7a7a7a;box-shadow:none;pointer-events:none}.button.is-rounded{border-radius:290486px;padding-left:calc(1em + .25em);padding-right:calc(1em + .25em)}.buttons{align-items:center;display:flex;flex-wrap:wrap;justify-content:flex-start}.buttons .button{margin-bottom:.5rem}.buttons .button:not(:last-child):not(.is-fullwidth){margin-left:.5rem}.buttons:last-child{margin-bottom:-.5rem}.buttons:not(:last-child){margin-bottom:1rem}.buttons.are-small .button:not(.is-normal):not(.is-medium):not(.is-large){border-radius:2px;font-size:.75rem}.buttons.are-medium .button:not(.is-small):not(.is-normal):not(.is-large){font-size:1.25rem}.buttons.are-large .button:not(.is-small):not(.is-normal):not(.is-medium){font-size:1.5rem}.buttons.has-addons .button:not(:first-child){border-bottom-left-radius:0;border-top-left-radius:0}.buttons.has-addons .button:not(:last-child){border-bottom-right-radius:0;border-top-right-radius:0;margin-left:-1px}.buttons.has-addons .button:last-child{margin-left:0}.buttons.has-addons .button.is-hovered,.buttons.has-addons .button:hover{z-index:2}.buttons.has-addons .button.is-active,.buttons.has-addons .button.is-focused,.buttons.has-addons .button.is-selected,.buttons.has-addons .button:active,.buttons.has-addons .button:focus{z-index:3}.buttons.has-addons .button.is-active:hover,.buttons.has-addons .button.is-focused:hover,.buttons.has-addons .button.is-selected:hover,.buttons.has-addons .button:active:hover,.buttons.has-addons .button:focus:hover{z-index:4}.buttons.has-addons .button.is-expanded{flex-grow:1;flex-shrink:1}.buttons.is-centered{justify-content:center}.buttons.is-centered:not(.has-addons) 
.button:not(.is-fullwidth){margin-left:.25rem;margin-right:.25rem}.buttons.is-right{justify-content:flex-end}.buttons.is-right:not(.has-addons) .button:not(.is-fullwidth){margin-left:.25rem;margin-right:.25rem}.container{flex-grow:1;margin:0 auto;position:relative;width:auto}.container.is-fluid{max-width:none;padding-left:32px;padding-right:32px;width:100%}@media screen and (min-width:1024px){.container{max-width:960px}}@media screen and (max-width:1215px){.container.is-widescreen{max-width:1152px}}@media screen and (max-width:1407px){.container.is-fullhd{max-width:1344px}}@media screen and (min-width:1216px){.container{max-width:1152px}}@media screen and (min-width:1408px){.container{max-width:1344px}}.content li+li{margin-top:.25em}.content blockquote:not(:last-child),.content dl:not(:last-child),.content ol:not(:last-child),.content p:not(:last-child),.content pre:not(:last-child),.content table:not(:last-child),.content ul:not(:last-child){margin-bottom:1em}.content h1,.content h2,.content h3,.content h4,.content h5,.content h6{color:#363636;font-weight:600;line-height:1.125}.content h1{font-size:2em;margin-bottom:.5em}.content h1:not(:first-child){margin-top:1em}.content h2{font-size:1.75em;margin-bottom:.5714em}.content h2:not(:first-child){margin-top:1.1428em}.content h3{font-size:1.5em;margin-bottom:.6666em}.content h3:not(:first-child){margin-top:1.3333em}.content h4{font-size:1.25em;margin-bottom:.8em}.content h5{font-size:1.125em;margin-bottom:.8888em}.content h6{font-size:1em;margin-bottom:1em}.content blockquote{background-color:#f5f5f5;border-right:5px solid #dbdbdb;padding:1.25em 1.5em}.content ol{list-style-position:outside;margin-right:2em;margin-top:1em}.content ol:not([type]){list-style-type:decimal}.content ol:not([type]).is-lower-alpha{list-style-type:lower-alpha}.content ol:not([type]).is-lower-roman{list-style-type:lower-roman}.content ol:not([type]).is-upper-alpha{list-style-type:upper-alpha}.content ol:not([type]).is-upper-roman{list-style-type:upper-roman}.content ul{list-style:disc outside;margin-right:2em;margin-top:1em}.content ul ul{list-style-type:circle;margin-top:.5em}.content ul ul ul{list-style-type:square}.content dd{margin-right:2em}.content figure{margin-left:2em;margin-right:2em;text-align:center}.content figure:not(:first-child){margin-top:2em}.content figure:not(:last-child){margin-bottom:2em}.content figure img{display:inline-block}.content figure figcaption{font-style:italic}.content pre{-webkit-overflow-scrolling:touch;overflow-x:auto;padding:1.25em 1.5em;white-space:pre;word-wrap:normal}.content sub,.content sup{font-size:75%}.content table{width:100%}.content table td,.content table th{border:1px solid #dbdbdb;border-width:0 0 1px;padding:.5em .75em;vertical-align:top}.content table th{color:#363636}.content table th:not([align]){text-align:inherit}.content table thead td,.content table thead th{border-width:0 0 2px;color:#363636}.content table tfoot td,.content table tfoot th{border-width:2px 0 0;color:#363636}.content table tbody tr:last-child td,.content table tbody tr:last-child th{border-bottom-width:0}.content .tabs li+li{margin-top:0}.content.is-small{font-size:.75rem}.content.is-medium{font-size:1.25rem}.content.is-large{font-size:1.5rem}.icon{align-items:center;display:inline-flex;justify-content:center;height:1.5rem;width:1.5rem}.icon.is-small{height:1rem;width:1rem}.icon.is-medium{height:2rem;width:2rem}.icon.is-large{height:3rem;width:3rem}.image{display:block;position:relative}.image img{display:block;height:auto;width:100%}.image 
img.is-rounded{border-radius:290486px}.image.is-fullwidth{width:100%}.image.is-16by9 .has-ratio,.image.is-16by9 img,.image.is-1by1 .has-ratio,.image.is-1by1 img,.image.is-1by2 .has-ratio,.image.is-1by2 img,.image.is-1by3 .has-ratio,.image.is-1by3 img,.image.is-2by1 .has-ratio,.image.is-2by1 img,.image.is-2by3 .has-ratio,.image.is-2by3 img,.image.is-3by1 .has-ratio,.image.is-3by1 img,.image.is-3by2 .has-ratio,.image.is-3by2 img,.image.is-3by4 .has-ratio,.image.is-3by4 img,.image.is-3by5 .has-ratio,.image.is-3by5 img,.image.is-4by3 .has-ratio,.image.is-4by3 img,.image.is-4by5 .has-ratio,.image.is-4by5 img,.image.is-5by3 .has-ratio,.image.is-5by3 img,.image.is-5by4 .has-ratio,.image.is-5by4 img,.image.is-9by16 .has-ratio,.image.is-9by16 img,.image.is-square .has-ratio,.image.is-square img{height:100%;width:100%}.image.is-1by1,.image.is-square{padding-top:100%}.image.is-5by4{padding-top:80%}.image.is-4by3{padding-top:75%}.image.is-3by2{padding-top:66.6666%}.image.is-5by3{padding-top:60%}.image.is-16by9{padding-top:56.25%}.image.is-2by1{padding-top:50%}.image.is-3by1{padding-top:33.3333%}.image.is-4by5{padding-top:125%}.image.is-3by4{padding-top:133.3333%}.image.is-2by3{padding-top:150%}.image.is-3by5{padding-top:166.6666%}.image.is-9by16{padding-top:177.7777%}.image.is-1by2{padding-top:200%}.image.is-1by3{padding-top:300%}.image.is-16x16{height:16px;width:16px}.image.is-24x24{height:24px;width:24px}.image.is-32x32{height:32px;width:32px}.image.is-48x48{height:48px;width:48px}.image.is-64x64{height:64px;width:64px}.image.is-96x96{height:96px;width:96px}.image.is-128x128{height:128px;width:128px}.notification{background-color:#f5f5f5;border-radius:4px;position:relative;padding:1.25rem 1.5rem 1.25rem 2.5rem}.notification a:not(.button):not(.dropdown-item){color:currentColor;text-decoration:underline}.notification strong{color:currentColor}.notification code,.notification pre{background:#fff}.notification pre code{background:0 0}.notification>.delete{left:.5rem;position:absolute;top:.5rem}.notification .content,.notification .subtitle,.notification 
.title{color:currentColor}.notification.is-white{background-color:#fff;color:#0a0a0a}.notification.is-black{background-color:#0a0a0a;color:#fff}.notification.is-light{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.notification.is-dark{background-color:#363636;color:#fff}.notification.is-primary{background-color:#00d1b2;color:#fff}.notification.is-primary.is-light{background-color:#ebfffc;color:#00947e}.notification.is-link{background-color:#3273dc;color:#fff}.notification.is-link.is-light{background-color:#eef3fc;color:#2160c4}.notification.is-info{background-color:#3298dc;color:#fff}.notification.is-info.is-light{background-color:#eef6fc;color:#1d72aa}.notification.is-success{background-color:#48c774;color:#fff}.notification.is-success.is-light{background-color:#effaf3;color:#257942}.notification.is-warning{background-color:#ffdd57;color:rgba(0,0,0,.7)}.notification.is-warning.is-light{background-color:#fffbeb;color:#947600}.notification.is-danger{background-color:#f14668;color:#fff}.notification.is-danger.is-light{background-color:#feecf0;color:#cc0f35}.progress{-moz-appearance:none;-webkit-appearance:none;border:none;border-radius:290486px;display:block;height:1rem;overflow:hidden;padding:0;width:100%}.progress::-webkit-progress-bar{background-color:#ededed}.progress::-webkit-progress-value{background-color:#4a4a4a}.progress::-moz-progress-bar{background-color:#4a4a4a}.progress::-ms-fill{background-color:#4a4a4a;border:none}.progress.is-white::-webkit-progress-value{background-color:#fff}.progress.is-white::-moz-progress-bar{background-color:#fff}.progress.is-white::-ms-fill{background-color:#fff}.progress.is-white:indeterminate{background-image:linear-gradient(to right,#fff 30%,#ededed 30%)}.progress.is-black::-webkit-progress-value{background-color:#0a0a0a}.progress.is-black::-moz-progress-bar{background-color:#0a0a0a}.progress.is-black::-ms-fill{background-color:#0a0a0a}.progress.is-black:indeterminate{background-image:linear-gradient(to right,#0a0a0a 30%,#ededed 30%)}.progress.is-light::-webkit-progress-value{background-color:#f5f5f5}.progress.is-light::-moz-progress-bar{background-color:#f5f5f5}.progress.is-light::-ms-fill{background-color:#f5f5f5}.progress.is-light:indeterminate{background-image:linear-gradient(to right,#f5f5f5 30%,#ededed 30%)}.progress.is-dark::-webkit-progress-value{background-color:#363636}.progress.is-dark::-moz-progress-bar{background-color:#363636}.progress.is-dark::-ms-fill{background-color:#363636}.progress.is-dark:indeterminate{background-image:linear-gradient(to right,#363636 30%,#ededed 30%)}.progress.is-primary::-webkit-progress-value{background-color:#00d1b2}.progress.is-primary::-moz-progress-bar{background-color:#00d1b2}.progress.is-primary::-ms-fill{background-color:#00d1b2}.progress.is-primary:indeterminate{background-image:linear-gradient(to right,#00d1b2 30%,#ededed 30%)}.progress.is-link::-webkit-progress-value{background-color:#3273dc}.progress.is-link::-moz-progress-bar{background-color:#3273dc}.progress.is-link::-ms-fill{background-color:#3273dc}.progress.is-link:indeterminate{background-image:linear-gradient(to right,#3273dc 30%,#ededed 30%)}.progress.is-info::-webkit-progress-value{background-color:#3298dc}.progress.is-info::-moz-progress-bar{background-color:#3298dc}.progress.is-info::-ms-fill{background-color:#3298dc}.progress.is-info:indeterminate{background-image:linear-gradient(to right,#3298dc 30%,#ededed 
30%)}.progress.is-success::-webkit-progress-value{background-color:#48c774}.progress.is-success::-moz-progress-bar{background-color:#48c774}.progress.is-success::-ms-fill{background-color:#48c774}.progress.is-success:indeterminate{background-image:linear-gradient(to right,#48c774 30%,#ededed 30%)}.progress.is-warning::-webkit-progress-value{background-color:#ffdd57}.progress.is-warning::-moz-progress-bar{background-color:#ffdd57}.progress.is-warning::-ms-fill{background-color:#ffdd57}.progress.is-warning:indeterminate{background-image:linear-gradient(to right,#ffdd57 30%,#ededed 30%)}.progress.is-danger::-webkit-progress-value{background-color:#f14668}.progress.is-danger::-moz-progress-bar{background-color:#f14668}.progress.is-danger::-ms-fill{background-color:#f14668}.progress.is-danger:indeterminate{background-image:linear-gradient(to right,#f14668 30%,#ededed 30%)}.progress:indeterminate{-webkit-animation-duration:1.5s;animation-duration:1.5s;-webkit-animation-iteration-count:infinite;animation-iteration-count:infinite;-webkit-animation-name:moveIndeterminate;animation-name:moveIndeterminate;-webkit-animation-timing-function:linear;animation-timing-function:linear;background-color:#ededed;background-image:linear-gradient(to right,#4a4a4a 30%,#ededed 30%);background-position:top left;background-repeat:no-repeat;background-size:150% 150%}.progress:indeterminate::-webkit-progress-bar{background-color:transparent}.progress:indeterminate::-moz-progress-bar{background-color:transparent}.progress.is-small{height:.75rem}.progress.is-medium{height:1.25rem}.progress.is-large{height:1.5rem}@-webkit-keyframes moveIndeterminate{from{background-position:200% 0}to{background-position:-200% 0}}@keyframes moveIndeterminate{from{background-position:200% 0}to{background-position:-200% 0}}.table{background-color:#fff;color:#363636}.table td,.table th{border:1px solid #dbdbdb;border-width:0 0 1px;padding:.5em .75em;vertical-align:top}.table td.is-white,.table th.is-white{background-color:#fff;border-color:#fff;color:#0a0a0a}.table td.is-black,.table th.is-black{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.table td.is-light,.table th.is-light{background-color:#f5f5f5;border-color:#f5f5f5;color:rgba(0,0,0,.7)}.table td.is-dark,.table th.is-dark{background-color:#363636;border-color:#363636;color:#fff}.table td.is-primary,.table th.is-primary{background-color:#00d1b2;border-color:#00d1b2;color:#fff}.table td.is-link,.table th.is-link{background-color:#3273dc;border-color:#3273dc;color:#fff}.table td.is-info,.table th.is-info{background-color:#3298dc;border-color:#3298dc;color:#fff}.table td.is-success,.table th.is-success{background-color:#48c774;border-color:#48c774;color:#fff}.table td.is-warning,.table th.is-warning{background-color:#ffdd57;border-color:#ffdd57;color:rgba(0,0,0,.7)}.table td.is-danger,.table th.is-danger{background-color:#f14668;border-color:#f14668;color:#fff}.table td.is-narrow,.table th.is-narrow{white-space:nowrap;width:1%}.table td.is-selected,.table th.is-selected{background-color:#00d1b2;color:#fff}.table td.is-selected a,.table td.is-selected strong,.table th.is-selected a,.table th.is-selected strong{color:currentColor}.table td.is-vcentered,.table th.is-vcentered{vertical-align:middle}.table th{color:#363636}.table th:not([align]){text-align:inherit}.table tr.is-selected{background-color:#00d1b2;color:#fff}.table tr.is-selected a,.table tr.is-selected strong{color:currentColor}.table tr.is-selected td,.table tr.is-selected 
th{border-color:#fff;color:currentColor}.table thead{background-color:transparent}.table thead td,.table thead th{border-width:0 0 2px;color:#363636}.table tfoot{background-color:transparent}.table tfoot td,.table tfoot th{border-width:2px 0 0;color:#363636}.table tbody{background-color:transparent}.table tbody tr:last-child td,.table tbody tr:last-child th{border-bottom-width:0}.table.is-bordered td,.table.is-bordered th{border-width:1px}.table.is-bordered tr:last-child td,.table.is-bordered tr:last-child th{border-bottom-width:1px}.table.is-fullwidth{width:100%}.table.is-hoverable tbody tr:not(.is-selected):hover{background-color:#fafafa}.table.is-hoverable.is-striped tbody tr:not(.is-selected):hover{background-color:#fafafa}.table.is-hoverable.is-striped tbody tr:not(.is-selected):hover:nth-child(even){background-color:#f5f5f5}.table.is-narrow td,.table.is-narrow th{padding:.25em .5em}.table.is-striped tbody tr:not(.is-selected):nth-child(even){background-color:#fafafa}.table-container{-webkit-overflow-scrolling:touch;overflow:auto;overflow-y:hidden;max-width:100%}.tags{align-items:center;display:flex;flex-wrap:wrap;justify-content:flex-start}.tags .tag{margin-bottom:.5rem}.tags .tag:not(:last-child){margin-left:.5rem}.tags:last-child{margin-bottom:-.5rem}.tags:not(:last-child){margin-bottom:1rem}.tags.are-medium .tag:not(.is-normal):not(.is-large){font-size:1rem}.tags.are-large .tag:not(.is-normal):not(.is-medium){font-size:1.25rem}.tags.is-centered{justify-content:center}.tags.is-centered .tag{margin-right:.25rem;margin-left:.25rem}.tags.is-right{justify-content:flex-end}.tags.is-right .tag:not(:first-child){margin-left:.5rem}.tags.is-right .tag:not(:last-child){margin-right:0}.tags.has-addons .tag{margin-left:0}.tags.has-addons .tag:not(:first-child){margin-right:0;border-top-right-radius:0;border-bottom-right-radius:0}.tags.has-addons .tag:not(:last-child){border-top-left-radius:0;border-bottom-left-radius:0}.tag:not(body){align-items:center;background-color:#f5f5f5;border-radius:4px;color:#4a4a4a;display:inline-flex;font-size:.75rem;height:2em;justify-content:center;line-height:1.5;padding-left:.75em;padding-right:.75em;white-space:nowrap}.tag:not(body) .delete{margin-right:.25rem;margin-left:-.375rem}.tag:not(body).is-white{background-color:#fff;color:#0a0a0a}.tag:not(body).is-black{background-color:#0a0a0a;color:#fff}.tag:not(body).is-light{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.tag:not(body).is-dark{background-color:#363636;color:#fff}.tag:not(body).is-primary{background-color:#00d1b2;color:#fff}.tag:not(body).is-primary.is-light{background-color:#ebfffc;color:#00947e}.tag:not(body).is-link{background-color:#3273dc;color:#fff}.tag:not(body).is-link.is-light{background-color:#eef3fc;color:#2160c4}.tag:not(body).is-info{background-color:#3298dc;color:#fff}.tag:not(body).is-info.is-light{background-color:#eef6fc;color:#1d72aa}.tag:not(body).is-success{background-color:#48c774;color:#fff}.tag:not(body).is-success.is-light{background-color:#effaf3;color:#257942}.tag:not(body).is-warning{background-color:#ffdd57;color:rgba(0,0,0,.7)}.tag:not(body).is-warning.is-light{background-color:#fffbeb;color:#947600}.tag:not(body).is-danger{background-color:#f14668;color:#fff}.tag:not(body).is-danger.is-light{background-color:#feecf0;color:#cc0f35}.tag:not(body).is-normal{font-size:.75rem}.tag:not(body).is-medium{font-size:1rem}.tag:not(body).is-large{font-size:1.25rem}.tag:not(body) .icon:first-child:not(:last-child){margin-right:-.375em;margin-left:.1875em}.tag:not(body) 
.icon:last-child:not(:first-child){margin-right:.1875em;margin-left:-.375em}.tag:not(body) .icon:first-child:last-child{margin-right:-.375em;margin-left:-.375em}.tag:not(body).is-delete{margin-right:1px;padding:0;position:relative;width:2em}.tag:not(body).is-delete::after,.tag:not(body).is-delete::before{background-color:currentColor;content:"";display:block;left:50%;position:absolute;top:50%;transform:translateX(-50%) translateY(-50%) rotate(45deg);transform-origin:center center}.tag:not(body).is-delete::before{height:1px;width:50%}.tag:not(body).is-delete::after{height:50%;width:1px}.tag:not(body).is-delete:focus,.tag:not(body).is-delete:hover{background-color:#e8e8e8}.tag:not(body).is-delete:active{background-color:#dbdbdb}.tag:not(body).is-rounded{border-radius:290486px}a.tag:hover{text-decoration:underline}.subtitle,.title{word-break:break-word}.subtitle em,.subtitle span,.title em,.title span{font-weight:inherit}.subtitle sub,.title sub{font-size:.75em}.subtitle sup,.title sup{font-size:.75em}.subtitle .tag,.title .tag{vertical-align:middle}.title{color:#363636;font-size:2rem;font-weight:600;line-height:1.125}.title strong{color:inherit;font-weight:inherit}.title+.highlight{margin-top:-.75rem}.title:not(.is-spaced)+.subtitle{margin-top:-1.25rem}.title.is-1{font-size:3rem}.title.is-2{font-size:2.5rem}.title.is-3{font-size:2rem}.title.is-4{font-size:1.5rem}.title.is-5{font-size:1.25rem}.title.is-6{font-size:1rem}.title.is-7{font-size:.75rem}.subtitle{color:#4a4a4a;font-size:1.25rem;font-weight:400;line-height:1.25}.subtitle strong{color:#363636;font-weight:600}.subtitle:not(.is-spaced)+.title{margin-top:-1.25rem}.subtitle.is-1{font-size:3rem}.subtitle.is-2{font-size:2.5rem}.subtitle.is-3{font-size:2rem}.subtitle.is-4{font-size:1.5rem}.subtitle.is-5{font-size:1.25rem}.subtitle.is-6{font-size:1rem}.subtitle.is-7{font-size:.75rem}.heading{display:block;font-size:11px;letter-spacing:1px;margin-bottom:5px;text-transform:uppercase}.highlight{font-weight:400;max-width:100%;overflow:hidden;padding:0}.highlight pre{overflow:auto;max-width:100%}.number{align-items:center;background-color:#f5f5f5;border-radius:290486px;display:inline-flex;font-size:1.25rem;height:2em;justify-content:center;margin-right:1.5rem;min-width:2.5em;padding:.25rem .5rem;text-align:center;vertical-align:top}.input,.select select,.textarea{background-color:#fff;border-color:#dbdbdb;border-radius:4px;color:#363636}.input::-moz-placeholder,.select select::-moz-placeholder,.textarea::-moz-placeholder{color:rgba(54,54,54,.3)}.input::-webkit-input-placeholder,.select select::-webkit-input-placeholder,.textarea::-webkit-input-placeholder{color:rgba(54,54,54,.3)}.input:-moz-placeholder,.select select:-moz-placeholder,.textarea:-moz-placeholder{color:rgba(54,54,54,.3)}.input:-ms-input-placeholder,.select select:-ms-input-placeholder,.textarea:-ms-input-placeholder{color:rgba(54,54,54,.3)}.input:hover,.is-hovered.input,.is-hovered.textarea,.select select.is-hovered,.select select:hover,.textarea:hover{border-color:#b5b5b5}.input:active,.input:focus,.is-active.input,.is-active.textarea,.is-focused.input,.is-focused.textarea,.select select.is-active,.select select.is-focused,.select select:active,.select select:focus,.textarea:active,.textarea:focus{border-color:#3273dc;box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.input[disabled],.select fieldset[disabled] select,.select select[disabled],.textarea[disabled],fieldset[disabled] .input,fieldset[disabled] .select select,fieldset[disabled] 
.textarea{background-color:#f5f5f5;border-color:#f5f5f5;box-shadow:none;color:#7a7a7a}.input[disabled]::-moz-placeholder,.select fieldset[disabled] select::-moz-placeholder,.select select[disabled]::-moz-placeholder,.textarea[disabled]::-moz-placeholder,fieldset[disabled] .input::-moz-placeholder,fieldset[disabled] .select select::-moz-placeholder,fieldset[disabled] .textarea::-moz-placeholder{color:rgba(122,122,122,.3)}.input[disabled]::-webkit-input-placeholder,.select fieldset[disabled] select::-webkit-input-placeholder,.select select[disabled]::-webkit-input-placeholder,.textarea[disabled]::-webkit-input-placeholder,fieldset[disabled] .input::-webkit-input-placeholder,fieldset[disabled] .select select::-webkit-input-placeholder,fieldset[disabled] .textarea::-webkit-input-placeholder{color:rgba(122,122,122,.3)}.input[disabled]:-moz-placeholder,.select fieldset[disabled] select:-moz-placeholder,.select select[disabled]:-moz-placeholder,.textarea[disabled]:-moz-placeholder,fieldset[disabled] .input:-moz-placeholder,fieldset[disabled] .select select:-moz-placeholder,fieldset[disabled] .textarea:-moz-placeholder{color:rgba(122,122,122,.3)}.input[disabled]:-ms-input-placeholder,.select fieldset[disabled] select:-ms-input-placeholder,.select select[disabled]:-ms-input-placeholder,.textarea[disabled]:-ms-input-placeholder,fieldset[disabled] .input:-ms-input-placeholder,fieldset[disabled] .select select:-ms-input-placeholder,fieldset[disabled] .textarea:-ms-input-placeholder{color:rgba(122,122,122,.3)}.input,.textarea{box-shadow:inset 0 .0625em .125em rgba(10,10,10,.05);max-width:100%;width:100%}.input[readonly],.textarea[readonly]{box-shadow:none}.is-white.input,.is-white.textarea{border-color:#fff}.is-white.input:active,.is-white.input:focus,.is-white.is-active.input,.is-white.is-active.textarea,.is-white.is-focused.input,.is-white.is-focused.textarea,.is-white.textarea:active,.is-white.textarea:focus{box-shadow:0 0 0 .125em rgba(255,255,255,.25)}.is-black.input,.is-black.textarea{border-color:#0a0a0a}.is-black.input:active,.is-black.input:focus,.is-black.is-active.input,.is-black.is-active.textarea,.is-black.is-focused.input,.is-black.is-focused.textarea,.is-black.textarea:active,.is-black.textarea:focus{box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.is-light.input,.is-light.textarea{border-color:#f5f5f5}.is-light.input:active,.is-light.input:focus,.is-light.is-active.input,.is-light.is-active.textarea,.is-light.is-focused.input,.is-light.is-focused.textarea,.is-light.textarea:active,.is-light.textarea:focus{box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.is-dark.input,.is-dark.textarea{border-color:#363636}.is-dark.input:active,.is-dark.input:focus,.is-dark.is-active.input,.is-dark.is-active.textarea,.is-dark.is-focused.input,.is-dark.is-focused.textarea,.is-dark.textarea:active,.is-dark.textarea:focus{box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.is-primary.input,.is-primary.textarea{border-color:#00d1b2}.is-primary.input:active,.is-primary.input:focus,.is-primary.is-active.input,.is-primary.is-active.textarea,.is-primary.is-focused.input,.is-primary.is-focused.textarea,.is-primary.textarea:active,.is-primary.textarea:focus{box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.is-link.input,.is-link.textarea{border-color:#3273dc}.is-link.input:active,.is-link.input:focus,.is-link.is-active.input,.is-link.is-active.textarea,.is-link.is-focused.input,.is-link.is-focused.textarea,.is-link.textarea:active,.is-link.textarea:focus{box-shadow:0 0 0 .125em 
rgba(50,115,220,.25)}.is-info.input,.is-info.textarea{border-color:#3298dc}.is-info.input:active,.is-info.input:focus,.is-info.is-active.input,.is-info.is-active.textarea,.is-info.is-focused.input,.is-info.is-focused.textarea,.is-info.textarea:active,.is-info.textarea:focus{box-shadow:0 0 0 .125em rgba(50,152,220,.25)}.is-success.input,.is-success.textarea{border-color:#48c774}.is-success.input:active,.is-success.input:focus,.is-success.is-active.input,.is-success.is-active.textarea,.is-success.is-focused.input,.is-success.is-focused.textarea,.is-success.textarea:active,.is-success.textarea:focus{box-shadow:0 0 0 .125em rgba(72,199,116,.25)}.is-warning.input,.is-warning.textarea{border-color:#ffdd57}.is-warning.input:active,.is-warning.input:focus,.is-warning.is-active.input,.is-warning.is-active.textarea,.is-warning.is-focused.input,.is-warning.is-focused.textarea,.is-warning.textarea:active,.is-warning.textarea:focus{box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.is-danger.input,.is-danger.textarea{border-color:#f14668}.is-danger.input:active,.is-danger.input:focus,.is-danger.is-active.input,.is-danger.is-active.textarea,.is-danger.is-focused.input,.is-danger.is-focused.textarea,.is-danger.textarea:active,.is-danger.textarea:focus{box-shadow:0 0 0 .125em rgba(241,70,104,.25)}.is-small.input,.is-small.textarea{border-radius:2px;font-size:.75rem}.is-medium.input,.is-medium.textarea{font-size:1.25rem}.is-large.input,.is-large.textarea{font-size:1.5rem}.is-fullwidth.input,.is-fullwidth.textarea{display:block;width:100%}.is-inline.input,.is-inline.textarea{display:inline;width:auto}.input.is-rounded{border-radius:290486px;padding-left:calc(calc(.75em - 1px) + .375em);padding-right:calc(calc(.75em - 1px) + .375em)}.input.is-static{background-color:transparent;border-color:transparent;box-shadow:none;padding-left:0;padding-right:0}.textarea{display:block;max-width:100%;min-width:100%;padding:calc(.75em - 1px);resize:vertical}.textarea:not([rows]){max-height:40em;min-height:8em}.textarea[rows]{height:initial}.textarea.has-fixed-size{resize:none}.checkbox,.radio{cursor:pointer;display:inline-block;line-height:1.25;position:relative}.checkbox input,.radio input{cursor:pointer}.checkbox:hover,.radio:hover{color:#363636}.checkbox[disabled],.radio[disabled],fieldset[disabled] .checkbox,fieldset[disabled] .radio{color:#7a7a7a;cursor:not-allowed}.radio+.radio{margin-right:.5em}.select{display:inline-block;max-width:100%;position:relative;vertical-align:top}.select:not(.is-multiple){height:2.5em}.select:not(.is-multiple):not(.is-loading)::after{border-color:#3273dc;left:1.125em;z-index:4}.select.is-rounded select{border-radius:290486px;padding-right:1em}.select select{cursor:pointer;display:block;font-size:1em;max-width:100%;outline:0}.select select::-ms-expand{display:none}.select select[disabled]:hover,fieldset[disabled] .select select:hover{border-color:#f5f5f5}.select select:not([multiple]){padding-left:2.5em}.select select[multiple]{height:auto;padding:0}.select select[multiple] option{padding:.5em 1em}.select:not(.is-multiple):not(.is-loading):hover::after{border-color:#363636}.select.is-white:not(:hover)::after{border-color:#fff}.select.is-white select{border-color:#fff}.select.is-white select.is-hovered,.select.is-white select:hover{border-color:#f2f2f2}.select.is-white select.is-active,.select.is-white select.is-focused,.select.is-white select:active,.select.is-white select:focus{box-shadow:0 0 0 .125em 
rgba(255,255,255,.25)}.select.is-black:not(:hover)::after{border-color:#0a0a0a}.select.is-black select{border-color:#0a0a0a}.select.is-black select.is-hovered,.select.is-black select:hover{border-color:#000}.select.is-black select.is-active,.select.is-black select.is-focused,.select.is-black select:active,.select.is-black select:focus{box-shadow:0 0 0 .125em rgba(10,10,10,.25)}.select.is-light:not(:hover)::after{border-color:#f5f5f5}.select.is-light select{border-color:#f5f5f5}.select.is-light select.is-hovered,.select.is-light select:hover{border-color:#e8e8e8}.select.is-light select.is-active,.select.is-light select.is-focused,.select.is-light select:active,.select.is-light select:focus{box-shadow:0 0 0 .125em rgba(245,245,245,.25)}.select.is-dark:not(:hover)::after{border-color:#363636}.select.is-dark select{border-color:#363636}.select.is-dark select.is-hovered,.select.is-dark select:hover{border-color:#292929}.select.is-dark select.is-active,.select.is-dark select.is-focused,.select.is-dark select:active,.select.is-dark select:focus{box-shadow:0 0 0 .125em rgba(54,54,54,.25)}.select.is-primary:not(:hover)::after{border-color:#00d1b2}.select.is-primary select{border-color:#00d1b2}.select.is-primary select.is-hovered,.select.is-primary select:hover{border-color:#00b89c}.select.is-primary select.is-active,.select.is-primary select.is-focused,.select.is-primary select:active,.select.is-primary select:focus{box-shadow:0 0 0 .125em rgba(0,209,178,.25)}.select.is-link:not(:hover)::after{border-color:#3273dc}.select.is-link select{border-color:#3273dc}.select.is-link select.is-hovered,.select.is-link select:hover{border-color:#2366d1}.select.is-link select.is-active,.select.is-link select.is-focused,.select.is-link select:active,.select.is-link select:focus{box-shadow:0 0 0 .125em rgba(50,115,220,.25)}.select.is-info:not(:hover)::after{border-color:#3298dc}.select.is-info select{border-color:#3298dc}.select.is-info select.is-hovered,.select.is-info select:hover{border-color:#238cd1}.select.is-info select.is-active,.select.is-info select.is-focused,.select.is-info select:active,.select.is-info select:focus{box-shadow:0 0 0 .125em rgba(50,152,220,.25)}.select.is-success:not(:hover)::after{border-color:#48c774}.select.is-success select{border-color:#48c774}.select.is-success select.is-hovered,.select.is-success select:hover{border-color:#3abb67}.select.is-success select.is-active,.select.is-success select.is-focused,.select.is-success select:active,.select.is-success select:focus{box-shadow:0 0 0 .125em rgba(72,199,116,.25)}.select.is-warning:not(:hover)::after{border-color:#ffdd57}.select.is-warning select{border-color:#ffdd57}.select.is-warning select.is-hovered,.select.is-warning select:hover{border-color:#ffd83d}.select.is-warning select.is-active,.select.is-warning select.is-focused,.select.is-warning select:active,.select.is-warning select:focus{box-shadow:0 0 0 .125em rgba(255,221,87,.25)}.select.is-danger:not(:hover)::after{border-color:#f14668}.select.is-danger select{border-color:#f14668}.select.is-danger select.is-hovered,.select.is-danger select:hover{border-color:#ef2e55}.select.is-danger select.is-active,.select.is-danger select.is-focused,.select.is-danger select:active,.select.is-danger select:focus{box-shadow:0 0 0 .125em rgba(241,70,104,.25)}.select.is-small{border-radius:2px;font-size:.75rem}.select.is-medium{font-size:1.25rem}.select.is-large{font-size:1.5rem}.select.is-disabled::after{border-color:#7a7a7a}.select.is-fullwidth{width:100%}.select.is-fullwidth 
select{width:100%}.select.is-loading::after{margin-top:0;position:absolute;left:.625em;top:.625em;transform:none}.select.is-loading.is-small:after{font-size:.75rem}.select.is-loading.is-medium:after{font-size:1.25rem}.select.is-loading.is-large:after{font-size:1.5rem}.file{align-items:stretch;display:flex;justify-content:flex-start;position:relative}.file.is-white .file-cta{background-color:#fff;border-color:transparent;color:#0a0a0a}.file.is-white.is-hovered .file-cta,.file.is-white:hover .file-cta{background-color:#f9f9f9;border-color:transparent;color:#0a0a0a}.file.is-white.is-focused .file-cta,.file.is-white:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(255,255,255,.25);color:#0a0a0a}.file.is-white.is-active .file-cta,.file.is-white:active .file-cta{background-color:#f2f2f2;border-color:transparent;color:#0a0a0a}.file.is-black .file-cta{background-color:#0a0a0a;border-color:transparent;color:#fff}.file.is-black.is-hovered .file-cta,.file.is-black:hover .file-cta{background-color:#040404;border-color:transparent;color:#fff}.file.is-black.is-focused .file-cta,.file.is-black:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(10,10,10,.25);color:#fff}.file.is-black.is-active .file-cta,.file.is-black:active .file-cta{background-color:#000;border-color:transparent;color:#fff}.file.is-light .file-cta{background-color:#f5f5f5;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-light.is-hovered .file-cta,.file.is-light:hover .file-cta{background-color:#eee;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-light.is-focused .file-cta,.file.is-light:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(245,245,245,.25);color:rgba(0,0,0,.7)}.file.is-light.is-active .file-cta,.file.is-light:active .file-cta{background-color:#e8e8e8;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-dark .file-cta{background-color:#363636;border-color:transparent;color:#fff}.file.is-dark.is-hovered .file-cta,.file.is-dark:hover .file-cta{background-color:#2f2f2f;border-color:transparent;color:#fff}.file.is-dark.is-focused .file-cta,.file.is-dark:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(54,54,54,.25);color:#fff}.file.is-dark.is-active .file-cta,.file.is-dark:active .file-cta{background-color:#292929;border-color:transparent;color:#fff}.file.is-primary .file-cta{background-color:#00d1b2;border-color:transparent;color:#fff}.file.is-primary.is-hovered .file-cta,.file.is-primary:hover .file-cta{background-color:#00c4a7;border-color:transparent;color:#fff}.file.is-primary.is-focused .file-cta,.file.is-primary:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(0,209,178,.25);color:#fff}.file.is-primary.is-active .file-cta,.file.is-primary:active .file-cta{background-color:#00b89c;border-color:transparent;color:#fff}.file.is-link .file-cta{background-color:#3273dc;border-color:transparent;color:#fff}.file.is-link.is-hovered .file-cta,.file.is-link:hover .file-cta{background-color:#276cda;border-color:transparent;color:#fff}.file.is-link.is-focused .file-cta,.file.is-link:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(50,115,220,.25);color:#fff}.file.is-link.is-active .file-cta,.file.is-link:active .file-cta{background-color:#2366d1;border-color:transparent;color:#fff}.file.is-info .file-cta{background-color:#3298dc;border-color:transparent;color:#fff}.file.is-info.is-hovered .file-cta,.file.is-info:hover .file-cta{background-color:#2793da;border-color:transparent;color:#fff}.file.is-info.is-focused 
.file-cta,.file.is-info:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(50,152,220,.25);color:#fff}.file.is-info.is-active .file-cta,.file.is-info:active .file-cta{background-color:#238cd1;border-color:transparent;color:#fff}.file.is-success .file-cta{background-color:#48c774;border-color:transparent;color:#fff}.file.is-success.is-hovered .file-cta,.file.is-success:hover .file-cta{background-color:#3ec46d;border-color:transparent;color:#fff}.file.is-success.is-focused .file-cta,.file.is-success:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(72,199,116,.25);color:#fff}.file.is-success.is-active .file-cta,.file.is-success:active .file-cta{background-color:#3abb67;border-color:transparent;color:#fff}.file.is-warning .file-cta{background-color:#ffdd57;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-warning.is-hovered .file-cta,.file.is-warning:hover .file-cta{background-color:#ffdb4a;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-warning.is-focused .file-cta,.file.is-warning:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(255,221,87,.25);color:rgba(0,0,0,.7)}.file.is-warning.is-active .file-cta,.file.is-warning:active .file-cta{background-color:#ffd83d;border-color:transparent;color:rgba(0,0,0,.7)}.file.is-danger .file-cta{background-color:#f14668;border-color:transparent;color:#fff}.file.is-danger.is-hovered .file-cta,.file.is-danger:hover .file-cta{background-color:#f03a5f;border-color:transparent;color:#fff}.file.is-danger.is-focused .file-cta,.file.is-danger:focus .file-cta{border-color:transparent;box-shadow:0 0 .5em rgba(241,70,104,.25);color:#fff}.file.is-danger.is-active .file-cta,.file.is-danger:active .file-cta{background-color:#ef2e55;border-color:transparent;color:#fff}.file.is-small{font-size:.75rem}.file.is-medium{font-size:1.25rem}.file.is-medium .file-icon .fa{font-size:21px}.file.is-large{font-size:1.5rem}.file.is-large .file-icon .fa{font-size:28px}.file.has-name .file-cta{border-bottom-right-radius:0;border-top-right-radius:0}.file.has-name .file-name{border-bottom-left-radius:0;border-top-left-radius:0}.file.has-name.is-empty .file-cta{border-radius:4px}.file.has-name.is-empty .file-name{display:none}.file.is-boxed .file-label{flex-direction:column}.file.is-boxed .file-cta{flex-direction:column;height:auto;padding:1em 3em}.file.is-boxed .file-name{border-width:0 1px 1px}.file.is-boxed .file-icon{height:1.5em;width:1.5em}.file.is-boxed .file-icon .fa{font-size:21px}.file.is-boxed.is-small .file-icon .fa{font-size:14px}.file.is-boxed.is-medium .file-icon .fa{font-size:28px}.file.is-boxed.is-large .file-icon .fa{font-size:35px}.file.is-boxed.has-name .file-cta{border-radius:4px 4px 0 0}.file.is-boxed.has-name .file-name{border-radius:0 0 4px 4px;border-width:0 1px 1px}.file.is-centered{justify-content:center}.file.is-fullwidth .file-label{width:100%}.file.is-fullwidth .file-name{flex-grow:1;max-width:none}.file.is-right{justify-content:flex-end}.file.is-right .file-cta{border-radius:0 4px 4px 0}.file.is-right .file-name{border-radius:4px 0 0 4px;border-width:1px 0 1px 1px;order:-1}.file-label{align-items:stretch;display:flex;cursor:pointer;justify-content:flex-start;overflow:hidden;position:relative}.file-label:hover .file-cta{background-color:#eee;color:#363636}.file-label:hover .file-name{border-color:#d5d5d5}.file-label:active .file-cta{background-color:#e8e8e8;color:#363636}.file-label:active 
.file-name{border-color:#cfcfcf}.file-input{height:100%;left:0;opacity:0;outline:0;position:absolute;top:0;width:100%}.file-cta,.file-name{border-color:#dbdbdb;border-radius:4px;font-size:1em;padding-left:1em;padding-right:1em;white-space:nowrap}.file-cta{background-color:#f5f5f5;color:#4a4a4a}.file-name{border-color:#dbdbdb;border-style:solid;border-width:1px 1px 1px 0;display:block;max-width:16em;overflow:hidden;text-align:inherit;text-overflow:ellipsis}.file-icon{align-items:center;display:flex;height:1em;justify-content:center;margin-left:.5em;width:1em}.file-icon .fa{font-size:14px}.label{color:#363636;display:block;font-size:1rem;font-weight:700}.label:not(:last-child){margin-bottom:.5em}.label.is-small{font-size:.75rem}.label.is-medium{font-size:1.25rem}.label.is-large{font-size:1.5rem}.help{display:block;font-size:.75rem;margin-top:.25rem}.help.is-white{color:#fff}.help.is-black{color:#0a0a0a}.help.is-light{color:#f5f5f5}.help.is-dark{color:#363636}.help.is-primary{color:#00d1b2}.help.is-link{color:#3273dc}.help.is-info{color:#3298dc}.help.is-success{color:#48c774}.help.is-warning{color:#ffdd57}.help.is-danger{color:#f14668}.field:not(:last-child){margin-bottom:.75rem}.field.has-addons{display:flex;justify-content:flex-start}.field.has-addons .control:not(:last-child){margin-left:-1px}.field.has-addons .control:not(:first-child):not(:last-child) .button,.field.has-addons .control:not(:first-child):not(:last-child) .input,.field.has-addons .control:not(:first-child):not(:last-child) .select select{border-radius:0}.field.has-addons .control:first-child:not(:only-child) .button,.field.has-addons .control:first-child:not(:only-child) .input,.field.has-addons .control:first-child:not(:only-child) .select select{border-bottom-left-radius:0;border-top-left-radius:0}.field.has-addons .control:last-child:not(:only-child) .button,.field.has-addons .control:last-child:not(:only-child) .input,.field.has-addons .control:last-child:not(:only-child) .select select{border-bottom-right-radius:0;border-top-right-radius:0}.field.has-addons .control .button:not([disabled]).is-hovered,.field.has-addons .control .button:not([disabled]):hover,.field.has-addons .control .input:not([disabled]).is-hovered,.field.has-addons .control .input:not([disabled]):hover,.field.has-addons .control .select select:not([disabled]).is-hovered,.field.has-addons .control .select select:not([disabled]):hover{z-index:2}.field.has-addons .control .button:not([disabled]).is-active,.field.has-addons .control .button:not([disabled]).is-focused,.field.has-addons .control .button:not([disabled]):active,.field.has-addons .control .button:not([disabled]):focus,.field.has-addons .control .input:not([disabled]).is-active,.field.has-addons .control .input:not([disabled]).is-focused,.field.has-addons .control .input:not([disabled]):active,.field.has-addons .control .input:not([disabled]):focus,.field.has-addons .control .select select:not([disabled]).is-active,.field.has-addons .control .select select:not([disabled]).is-focused,.field.has-addons .control .select select:not([disabled]):active,.field.has-addons .control .select select:not([disabled]):focus{z-index:3}.field.has-addons .control .button:not([disabled]).is-active:hover,.field.has-addons .control .button:not([disabled]).is-focused:hover,.field.has-addons .control .button:not([disabled]):active:hover,.field.has-addons .control .button:not([disabled]):focus:hover,.field.has-addons .control .input:not([disabled]).is-active:hover,.field.has-addons .control 
.input:not([disabled]).is-focused:hover,.field.has-addons .control .input:not([disabled]):active:hover,.field.has-addons .control .input:not([disabled]):focus:hover,.field.has-addons .control .select select:not([disabled]).is-active:hover,.field.has-addons .control .select select:not([disabled]).is-focused:hover,.field.has-addons .control .select select:not([disabled]):active:hover,.field.has-addons .control .select select:not([disabled]):focus:hover{z-index:4}.field.has-addons .control.is-expanded{flex-grow:1;flex-shrink:1}.field.has-addons.has-addons-centered{justify-content:center}.field.has-addons.has-addons-right{justify-content:flex-end}.field.has-addons.has-addons-fullwidth .control{flex-grow:1;flex-shrink:0}.field.is-grouped{display:flex;justify-content:flex-start}.field.is-grouped>.control{flex-shrink:0}.field.is-grouped>.control:not(:last-child){margin-bottom:0;margin-left:.75rem}.field.is-grouped>.control.is-expanded{flex-grow:1;flex-shrink:1}.field.is-grouped.is-grouped-centered{justify-content:center}.field.is-grouped.is-grouped-right{justify-content:flex-end}.field.is-grouped.is-grouped-multiline{flex-wrap:wrap}.field.is-grouped.is-grouped-multiline>.control:last-child,.field.is-grouped.is-grouped-multiline>.control:not(:last-child){margin-bottom:.75rem}.field.is-grouped.is-grouped-multiline:last-child{margin-bottom:-.75rem}.field.is-grouped.is-grouped-multiline:not(:last-child){margin-bottom:0}@media screen and (min-width:769px),print{.field.is-horizontal{display:flex}}.field-label .label{font-size:inherit}@media screen and (max-width:768px){.field-label{margin-bottom:.5rem}}@media screen and (min-width:769px),print{.field-label{flex-basis:0;flex-grow:1;flex-shrink:0;margin-left:1.5rem;text-align:right}.field-label.is-small{font-size:.75rem;padding-top:.375em}.field-label.is-normal{padding-top:.375em}.field-label.is-medium{font-size:1.25rem;padding-top:.375em}.field-label.is-large{font-size:1.5rem;padding-top:.375em}}.field-body .field .field{margin-bottom:0}@media screen and (min-width:769px),print{.field-body{display:flex;flex-basis:0;flex-grow:5;flex-shrink:1}.field-body .field{margin-bottom:0}.field-body>.field{flex-shrink:1}.field-body>.field:not(.is-narrow){flex-grow:1}.field-body>.field:not(:last-child){margin-left:.75rem}}.control{box-sizing:border-box;clear:both;font-size:1rem;position:relative;text-align:inherit}.control.has-icons-left .input:focus~.icon,.control.has-icons-left .select:focus~.icon,.control.has-icons-right .input:focus~.icon,.control.has-icons-right .select:focus~.icon{color:#4a4a4a}.control.has-icons-left .input.is-small~.icon,.control.has-icons-left .select.is-small~.icon,.control.has-icons-right .input.is-small~.icon,.control.has-icons-right .select.is-small~.icon{font-size:.75rem}.control.has-icons-left .input.is-medium~.icon,.control.has-icons-left .select.is-medium~.icon,.control.has-icons-right .input.is-medium~.icon,.control.has-icons-right .select.is-medium~.icon{font-size:1.25rem}.control.has-icons-left .input.is-large~.icon,.control.has-icons-left .select.is-large~.icon,.control.has-icons-right .input.is-large~.icon,.control.has-icons-right .select.is-large~.icon{font-size:1.5rem}.control.has-icons-left .icon,.control.has-icons-right .icon{color:#dbdbdb;height:2.5em;pointer-events:none;position:absolute;top:0;width:2.5em;z-index:4}.control.has-icons-left .input,.control.has-icons-left .select select{padding-left:2.5em}.control.has-icons-left .icon.is-left{left:0}.control.has-icons-right .input,.control.has-icons-right .select 
select{padding-right:2.5em}.control.has-icons-right .icon.is-right{right:0}.control.is-loading::after{position:absolute!important;left:.625em;top:.625em;z-index:4}.control.is-loading.is-small:after{font-size:.75rem}.control.is-loading.is-medium:after{font-size:1.25rem}.control.is-loading.is-large:after{font-size:1.5rem}.breadcrumb{font-size:1rem;white-space:nowrap}.breadcrumb a{align-items:center;color:#3273dc;display:flex;justify-content:center;padding:0 .75em}.breadcrumb a:hover{color:#363636}.breadcrumb li{align-items:center;display:flex}.breadcrumb li:first-child a{padding-right:0}.breadcrumb li.is-active a{color:#363636;cursor:default;pointer-events:none}.breadcrumb li+li::before{color:#b5b5b5;content:"\0002f"}.breadcrumb ol,.breadcrumb ul{align-items:flex-start;display:flex;flex-wrap:wrap;justify-content:flex-start}.breadcrumb .icon:first-child{margin-left:.5em}.breadcrumb .icon:last-child{margin-right:.5em}.breadcrumb.is-centered ol,.breadcrumb.is-centered ul{justify-content:center}.breadcrumb.is-right ol,.breadcrumb.is-right ul{justify-content:flex-end}.breadcrumb.is-small{font-size:.75rem}.breadcrumb.is-medium{font-size:1.25rem}.breadcrumb.is-large{font-size:1.5rem}.breadcrumb.has-arrow-separator li+li::before{content:"\02192"}.breadcrumb.has-bullet-separator li+li::before{content:"\02022"}.breadcrumb.has-dot-separator li+li::before{content:"\000b7"}.breadcrumb.has-succeeds-separator li+li::before{content:"\0227B"}.card{background-color:#fff;box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.02);color:#4a4a4a;max-width:100%;position:relative}.card-header{background-color:transparent;align-items:stretch;box-shadow:0 .125em .25em rgba(10,10,10,.1);display:flex}.card-header-title{align-items:center;color:#363636;display:flex;flex-grow:1;font-weight:700;padding:.75rem 1rem}.card-header-title.is-centered{justify-content:center}.card-header-icon{align-items:center;cursor:pointer;display:flex;justify-content:center;padding:.75rem 1rem}.card-image{display:block;position:relative}.card-content{background-color:transparent;padding:1.5rem}.card-footer{background-color:transparent;border-top:1px solid #ededed;align-items:stretch;display:flex}.card-footer-item{align-items:center;display:flex;flex-basis:0;flex-grow:1;flex-shrink:0;justify-content:center;padding:.75rem}.card-footer-item:not(:last-child){border-left:1px solid #ededed}.card .media:not(:last-child){margin-bottom:1.5rem}.dropdown{display:inline-flex;position:relative;vertical-align:top}.dropdown.is-active .dropdown-menu,.dropdown.is-hoverable:hover .dropdown-menu{display:block}.dropdown.is-right .dropdown-menu{left:auto;right:0}.dropdown.is-up .dropdown-menu{bottom:100%;padding-bottom:4px;padding-top:initial;top:auto}.dropdown-menu{display:none;right:0;min-width:12rem;padding-top:4px;position:absolute;top:100%;z-index:20}.dropdown-content{background-color:#fff;border-radius:4px;box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.02);padding-bottom:.5rem;padding-top:.5rem}.dropdown-item{color:#4a4a4a;display:block;font-size:.875rem;line-height:1.5;padding:.375rem 1rem;position:relative}a.dropdown-item,button.dropdown-item{padding-left:3rem;text-align:inherit;white-space:nowrap;width:100%}a.dropdown-item:hover,button.dropdown-item:hover{background-color:#f5f5f5;color:#0a0a0a}a.dropdown-item.is-active,button.dropdown-item.is-active{background-color:#3273dc;color:#fff}.dropdown-divider{background-color:#ededed;border:none;display:block;height:1px;margin:.5rem 
0}.level{align-items:center;justify-content:space-between}.level code{border-radius:4px}.level img{display:inline-block;vertical-align:top}.level.is-mobile{display:flex}.level.is-mobile .level-left,.level.is-mobile .level-right{display:flex}.level.is-mobile .level-left+.level-right{margin-top:0}.level.is-mobile .level-item:not(:last-child){margin-bottom:0;margin-left:.75rem}.level.is-mobile .level-item:not(.is-narrow){flex-grow:1}@media screen and (min-width:769px),print{.level{display:flex}.level>.level-item:not(.is-narrow){flex-grow:1}}.level-item{align-items:center;display:flex;flex-basis:auto;flex-grow:0;flex-shrink:0;justify-content:center}.level-item .subtitle,.level-item .title{margin-bottom:0}@media screen and (max-width:768px){.level-item:not(:last-child){margin-bottom:.75rem}}.level-left,.level-right{flex-basis:auto;flex-grow:0;flex-shrink:0}.level-left .level-item.is-flexible,.level-right .level-item.is-flexible{flex-grow:1}@media screen and (min-width:769px),print{.level-left .level-item:not(:last-child),.level-right .level-item:not(:last-child){margin-left:.75rem}}.level-left{align-items:center;justify-content:flex-start}@media screen and (max-width:768px){.level-left+.level-right{margin-top:1.5rem}}@media screen and (min-width:769px),print{.level-left{display:flex}}.level-right{align-items:center;justify-content:flex-end}@media screen and (min-width:769px),print{.level-right{display:flex}}.media{align-items:flex-start;display:flex;text-align:inherit}.media .content:not(:last-child){margin-bottom:.75rem}.media .media{border-top:1px solid rgba(219,219,219,.5);display:flex;padding-top:.75rem}.media .media .content:not(:last-child),.media .media .control:not(:last-child){margin-bottom:.5rem}.media .media .media{padding-top:.5rem}.media .media .media+.media{margin-top:.5rem}.media+.media{border-top:1px solid rgba(219,219,219,.5);margin-top:1rem;padding-top:1rem}.media.is-large+.media{margin-top:1.5rem;padding-top:1.5rem}.media-left,.media-right{flex-basis:auto;flex-grow:0;flex-shrink:0}.media-left{margin-left:1rem}.media-right{margin-right:1rem}.media-content{flex-basis:auto;flex-grow:1;flex-shrink:1;text-align:inherit}@media screen and (max-width:768px){.media-content{overflow-x:auto}}.menu{font-size:1rem}.menu.is-small{font-size:.75rem}.menu.is-medium{font-size:1.25rem}.menu.is-large{font-size:1.5rem}.menu-list{line-height:1.25}.menu-list a{border-radius:2px;color:#4a4a4a;display:block;padding:.5em .75em}.menu-list a:hover{background-color:#f5f5f5;color:#363636}.menu-list a.is-active{background-color:#3273dc;color:#fff}.menu-list li ul{border-right:1px solid #dbdbdb;margin:.75em;padding-right:.75em}.menu-label{color:#7a7a7a;font-size:.75em;letter-spacing:.1em;text-transform:uppercase}.menu-label:not(:first-child){margin-top:1em}.menu-label:not(:last-child){margin-bottom:1em}.message{background-color:#f5f5f5;border-radius:4px;font-size:1rem}.message strong{color:currentColor}.message a:not(.button):not(.tag):not(.dropdown-item){color:currentColor;text-decoration:underline}.message.is-small{font-size:.75rem}.message.is-medium{font-size:1.25rem}.message.is-large{font-size:1.5rem}.message.is-white{background-color:#fff}.message.is-white .message-header{background-color:#fff;color:#0a0a0a}.message.is-white .message-body{border-color:#fff}.message.is-black{background-color:#fafafa}.message.is-black .message-header{background-color:#0a0a0a;color:#fff}.message.is-black .message-body{border-color:#0a0a0a}.message.is-light{background-color:#fafafa}.message.is-light 
.message-header{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.message.is-light .message-body{border-color:#f5f5f5}.message.is-dark{background-color:#fafafa}.message.is-dark .message-header{background-color:#363636;color:#fff}.message.is-dark .message-body{border-color:#363636}.message.is-primary{background-color:#ebfffc}.message.is-primary .message-header{background-color:#00d1b2;color:#fff}.message.is-primary .message-body{border-color:#00d1b2;color:#00947e}.message.is-link{background-color:#eef3fc}.message.is-link .message-header{background-color:#3273dc;color:#fff}.message.is-link .message-body{border-color:#3273dc;color:#2160c4}.message.is-info{background-color:#eef6fc}.message.is-info .message-header{background-color:#3298dc;color:#fff}.message.is-info .message-body{border-color:#3298dc;color:#1d72aa}.message.is-success{background-color:#effaf3}.message.is-success .message-header{background-color:#48c774;color:#fff}.message.is-success .message-body{border-color:#48c774;color:#257942}.message.is-warning{background-color:#fffbeb}.message.is-warning .message-header{background-color:#ffdd57;color:rgba(0,0,0,.7)}.message.is-warning .message-body{border-color:#ffdd57;color:#947600}.message.is-danger{background-color:#feecf0}.message.is-danger .message-header{background-color:#f14668;color:#fff}.message.is-danger .message-body{border-color:#f14668;color:#cc0f35}.message-header{align-items:center;background-color:#4a4a4a;border-radius:4px 4px 0 0;color:#fff;display:flex;font-weight:700;justify-content:space-between;line-height:1.25;padding:.75em 1em;position:relative}.message-header .delete{flex-grow:0;flex-shrink:0;margin-right:.75em}.message-header+.message-body{border-width:0;border-top-left-radius:0;border-top-right-radius:0}.message-body{border-color:#dbdbdb;border-radius:4px;border-style:solid;border-width:0 0 0 4px;color:#4a4a4a;padding:1.25em 1.5em}.message-body code,.message-body pre{background-color:#fff}.message-body pre code{background-color:transparent}.modal{align-items:center;display:none;flex-direction:column;justify-content:center;overflow:hidden;position:fixed;z-index:40}.modal.is-active{display:flex}.modal-background{background-color:rgba(10,10,10,.86)}.modal-card,.modal-content{margin:0 20px;max-height:calc(100vh - 160px);overflow:auto;position:relative;width:100%}@media screen and (min-width:769px),print{.modal-card,.modal-content{margin:0 auto;max-height:calc(100vh - 40px);width:640px}}.modal-close{background:0 0;height:40px;position:fixed;left:20px;top:20px;width:40px}.modal-card{display:flex;flex-direction:column;max-height:calc(100vh - 40px);overflow:hidden;-ms-overflow-y:visible}.modal-card-foot,.modal-card-head{align-items:center;background-color:#f5f5f5;display:flex;flex-shrink:0;justify-content:flex-start;padding:20px;position:relative}.modal-card-head{border-bottom:1px solid #dbdbdb;border-top-left-radius:6px;border-top-right-radius:6px}.modal-card-title{color:#363636;flex-grow:1;flex-shrink:0;font-size:1.5rem;line-height:1}.modal-card-foot{border-bottom-left-radius:6px;border-bottom-right-radius:6px;border-top:1px solid #dbdbdb}.modal-card-foot .button:not(:last-child){margin-left:.5em}.modal-card-body{-webkit-overflow-scrolling:touch;background-color:#fff;flex-grow:1;flex-shrink:1;overflow:auto;padding:20px}.navbar{background-color:#fff;min-height:3.25rem;position:relative;z-index:30}.navbar.is-white{background-color:#fff;color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link,.navbar.is-white .navbar-brand>.navbar-item{color:#0a0a0a}.navbar.is-white 
.navbar-brand .navbar-link.is-active,.navbar.is-white .navbar-brand .navbar-link:focus,.navbar.is-white .navbar-brand .navbar-link:hover,.navbar.is-white .navbar-brand>a.navbar-item.is-active,.navbar.is-white .navbar-brand>a.navbar-item:focus,.navbar.is-white .navbar-brand>a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white .navbar-brand .navbar-link::after{border-color:#0a0a0a}.navbar.is-white .navbar-burger{color:#0a0a0a}@media screen and (min-width:1024px){.navbar.is-white .navbar-end .navbar-link,.navbar.is-white .navbar-end>.navbar-item,.navbar.is-white .navbar-start .navbar-link,.navbar.is-white .navbar-start>.navbar-item{color:#0a0a0a}.navbar.is-white .navbar-end .navbar-link.is-active,.navbar.is-white .navbar-end .navbar-link:focus,.navbar.is-white .navbar-end .navbar-link:hover,.navbar.is-white .navbar-end>a.navbar-item.is-active,.navbar.is-white .navbar-end>a.navbar-item:focus,.navbar.is-white .navbar-end>a.navbar-item:hover,.navbar.is-white .navbar-start .navbar-link.is-active,.navbar.is-white .navbar-start .navbar-link:focus,.navbar.is-white .navbar-start .navbar-link:hover,.navbar.is-white .navbar-start>a.navbar-item.is-active,.navbar.is-white .navbar-start>a.navbar-item:focus,.navbar.is-white .navbar-start>a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white .navbar-end .navbar-link::after,.navbar.is-white .navbar-start .navbar-link::after{border-color:#0a0a0a}.navbar.is-white .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-white .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-white .navbar-item.has-dropdown:hover .navbar-link{background-color:#f2f2f2;color:#0a0a0a}.navbar.is-white .navbar-dropdown a.navbar-item.is-active{background-color:#fff;color:#0a0a0a}}.navbar.is-black{background-color:#0a0a0a;color:#fff}.navbar.is-black .navbar-brand .navbar-link,.navbar.is-black .navbar-brand>.navbar-item{color:#fff}.navbar.is-black .navbar-brand .navbar-link.is-active,.navbar.is-black .navbar-brand .navbar-link:focus,.navbar.is-black .navbar-brand .navbar-link:hover,.navbar.is-black .navbar-brand>a.navbar-item.is-active,.navbar.is-black .navbar-brand>a.navbar-item:focus,.navbar.is-black .navbar-brand>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-black .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-black .navbar-end .navbar-link,.navbar.is-black .navbar-end>.navbar-item,.navbar.is-black .navbar-start .navbar-link,.navbar.is-black .navbar-start>.navbar-item{color:#fff}.navbar.is-black .navbar-end .navbar-link.is-active,.navbar.is-black .navbar-end .navbar-link:focus,.navbar.is-black .navbar-end .navbar-link:hover,.navbar.is-black .navbar-end>a.navbar-item.is-active,.navbar.is-black .navbar-end>a.navbar-item:focus,.navbar.is-black .navbar-end>a.navbar-item:hover,.navbar.is-black .navbar-start .navbar-link.is-active,.navbar.is-black .navbar-start .navbar-link:focus,.navbar.is-black .navbar-start .navbar-link:hover,.navbar.is-black .navbar-start>a.navbar-item.is-active,.navbar.is-black .navbar-start>a.navbar-item:focus,.navbar.is-black .navbar-start>a.navbar-item:hover{background-color:#000;color:#fff}.navbar.is-black .navbar-end .navbar-link::after,.navbar.is-black .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-black .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-black .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-black .navbar-item.has-dropdown:hover 
.navbar-link{background-color:#000;color:#fff}.navbar.is-black .navbar-dropdown a.navbar-item.is-active{background-color:#0a0a0a;color:#fff}}.navbar.is-light{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.navbar.is-light .navbar-brand .navbar-link,.navbar.is-light .navbar-brand>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-light .navbar-brand .navbar-link.is-active,.navbar.is-light .navbar-brand .navbar-link:focus,.navbar.is-light .navbar-brand .navbar-link:hover,.navbar.is-light .navbar-brand>a.navbar-item.is-active,.navbar.is-light .navbar-brand>a.navbar-item:focus,.navbar.is-light .navbar-brand>a.navbar-item:hover{background-color:#e8e8e8;color:rgba(0,0,0,.7)}.navbar.is-light .navbar-brand .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-light .navbar-burger{color:rgba(0,0,0,.7)}@media screen and (min-width:1024px){.navbar.is-light .navbar-end .navbar-link,.navbar.is-light .navbar-end>.navbar-item,.navbar.is-light .navbar-start .navbar-link,.navbar.is-light .navbar-start>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-light .navbar-end .navbar-link.is-active,.navbar.is-light .navbar-end .navbar-link:focus,.navbar.is-light .navbar-end .navbar-link:hover,.navbar.is-light .navbar-end>a.navbar-item.is-active,.navbar.is-light .navbar-end>a.navbar-item:focus,.navbar.is-light .navbar-end>a.navbar-item:hover,.navbar.is-light .navbar-start .navbar-link.is-active,.navbar.is-light .navbar-start .navbar-link:focus,.navbar.is-light .navbar-start .navbar-link:hover,.navbar.is-light .navbar-start>a.navbar-item.is-active,.navbar.is-light .navbar-start>a.navbar-item:focus,.navbar.is-light .navbar-start>a.navbar-item:hover{background-color:#e8e8e8;color:rgba(0,0,0,.7)}.navbar.is-light .navbar-end .navbar-link::after,.navbar.is-light .navbar-start .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-light .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-light .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-light .navbar-item.has-dropdown:hover .navbar-link{background-color:#e8e8e8;color:rgba(0,0,0,.7)}.navbar.is-light .navbar-dropdown a.navbar-item.is-active{background-color:#f5f5f5;color:rgba(0,0,0,.7)}}.navbar.is-dark{background-color:#363636;color:#fff}.navbar.is-dark .navbar-brand .navbar-link,.navbar.is-dark .navbar-brand>.navbar-item{color:#fff}.navbar.is-dark .navbar-brand .navbar-link.is-active,.navbar.is-dark .navbar-brand .navbar-link:focus,.navbar.is-dark .navbar-brand .navbar-link:hover,.navbar.is-dark .navbar-brand>a.navbar-item.is-active,.navbar.is-dark .navbar-brand>a.navbar-item:focus,.navbar.is-dark .navbar-brand>a.navbar-item:hover{background-color:#292929;color:#fff}.navbar.is-dark .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-dark .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-dark .navbar-end .navbar-link,.navbar.is-dark .navbar-end>.navbar-item,.navbar.is-dark .navbar-start .navbar-link,.navbar.is-dark .navbar-start>.navbar-item{color:#fff}.navbar.is-dark .navbar-end .navbar-link.is-active,.navbar.is-dark .navbar-end .navbar-link:focus,.navbar.is-dark .navbar-end .navbar-link:hover,.navbar.is-dark .navbar-end>a.navbar-item.is-active,.navbar.is-dark .navbar-end>a.navbar-item:focus,.navbar.is-dark .navbar-end>a.navbar-item:hover,.navbar.is-dark .navbar-start .navbar-link.is-active,.navbar.is-dark .navbar-start .navbar-link:focus,.navbar.is-dark .navbar-start .navbar-link:hover,.navbar.is-dark .navbar-start>a.navbar-item.is-active,.navbar.is-dark .navbar-start>a.navbar-item:focus,.navbar.is-dark 
.navbar-start>a.navbar-item:hover{background-color:#292929;color:#fff}.navbar.is-dark .navbar-end .navbar-link::after,.navbar.is-dark .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-dark .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-dark .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-dark .navbar-item.has-dropdown:hover .navbar-link{background-color:#292929;color:#fff}.navbar.is-dark .navbar-dropdown a.navbar-item.is-active{background-color:#363636;color:#fff}}.navbar.is-primary{background-color:#00d1b2;color:#fff}.navbar.is-primary .navbar-brand .navbar-link,.navbar.is-primary .navbar-brand>.navbar-item{color:#fff}.navbar.is-primary .navbar-brand .navbar-link.is-active,.navbar.is-primary .navbar-brand .navbar-link:focus,.navbar.is-primary .navbar-brand .navbar-link:hover,.navbar.is-primary .navbar-brand>a.navbar-item.is-active,.navbar.is-primary .navbar-brand>a.navbar-item:focus,.navbar.is-primary .navbar-brand>a.navbar-item:hover{background-color:#00b89c;color:#fff}.navbar.is-primary .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-primary .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-primary .navbar-end .navbar-link,.navbar.is-primary .navbar-end>.navbar-item,.navbar.is-primary .navbar-start .navbar-link,.navbar.is-primary .navbar-start>.navbar-item{color:#fff}.navbar.is-primary .navbar-end .navbar-link.is-active,.navbar.is-primary .navbar-end .navbar-link:focus,.navbar.is-primary .navbar-end .navbar-link:hover,.navbar.is-primary .navbar-end>a.navbar-item.is-active,.navbar.is-primary .navbar-end>a.navbar-item:focus,.navbar.is-primary .navbar-end>a.navbar-item:hover,.navbar.is-primary .navbar-start .navbar-link.is-active,.navbar.is-primary .navbar-start .navbar-link:focus,.navbar.is-primary .navbar-start .navbar-link:hover,.navbar.is-primary .navbar-start>a.navbar-item.is-active,.navbar.is-primary .navbar-start>a.navbar-item:focus,.navbar.is-primary .navbar-start>a.navbar-item:hover{background-color:#00b89c;color:#fff}.navbar.is-primary .navbar-end .navbar-link::after,.navbar.is-primary .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-primary .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-primary .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-primary .navbar-item.has-dropdown:hover .navbar-link{background-color:#00b89c;color:#fff}.navbar.is-primary .navbar-dropdown a.navbar-item.is-active{background-color:#00d1b2;color:#fff}}.navbar.is-link{background-color:#3273dc;color:#fff}.navbar.is-link .navbar-brand .navbar-link,.navbar.is-link .navbar-brand>.navbar-item{color:#fff}.navbar.is-link .navbar-brand .navbar-link.is-active,.navbar.is-link .navbar-brand .navbar-link:focus,.navbar.is-link .navbar-brand .navbar-link:hover,.navbar.is-link .navbar-brand>a.navbar-item.is-active,.navbar.is-link .navbar-brand>a.navbar-item:focus,.navbar.is-link .navbar-brand>a.navbar-item:hover{background-color:#2366d1;color:#fff}.navbar.is-link .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-link .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-link .navbar-end .navbar-link,.navbar.is-link .navbar-end>.navbar-item,.navbar.is-link .navbar-start .navbar-link,.navbar.is-link .navbar-start>.navbar-item{color:#fff}.navbar.is-link .navbar-end .navbar-link.is-active,.navbar.is-link .navbar-end .navbar-link:focus,.navbar.is-link .navbar-end .navbar-link:hover,.navbar.is-link .navbar-end>a.navbar-item.is-active,.navbar.is-link 
.navbar-end>a.navbar-item:focus,.navbar.is-link .navbar-end>a.navbar-item:hover,.navbar.is-link .navbar-start .navbar-link.is-active,.navbar.is-link .navbar-start .navbar-link:focus,.navbar.is-link .navbar-start .navbar-link:hover,.navbar.is-link .navbar-start>a.navbar-item.is-active,.navbar.is-link .navbar-start>a.navbar-item:focus,.navbar.is-link .navbar-start>a.navbar-item:hover{background-color:#2366d1;color:#fff}.navbar.is-link .navbar-end .navbar-link::after,.navbar.is-link .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-link .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-link .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-link .navbar-item.has-dropdown:hover .navbar-link{background-color:#2366d1;color:#fff}.navbar.is-link .navbar-dropdown a.navbar-item.is-active{background-color:#3273dc;color:#fff}}.navbar.is-info{background-color:#3298dc;color:#fff}.navbar.is-info .navbar-brand .navbar-link,.navbar.is-info .navbar-brand>.navbar-item{color:#fff}.navbar.is-info .navbar-brand .navbar-link.is-active,.navbar.is-info .navbar-brand .navbar-link:focus,.navbar.is-info .navbar-brand .navbar-link:hover,.navbar.is-info .navbar-brand>a.navbar-item.is-active,.navbar.is-info .navbar-brand>a.navbar-item:focus,.navbar.is-info .navbar-brand>a.navbar-item:hover{background-color:#238cd1;color:#fff}.navbar.is-info .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-info .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-info .navbar-end .navbar-link,.navbar.is-info .navbar-end>.navbar-item,.navbar.is-info .navbar-start .navbar-link,.navbar.is-info .navbar-start>.navbar-item{color:#fff}.navbar.is-info .navbar-end .navbar-link.is-active,.navbar.is-info .navbar-end .navbar-link:focus,.navbar.is-info .navbar-end .navbar-link:hover,.navbar.is-info .navbar-end>a.navbar-item.is-active,.navbar.is-info .navbar-end>a.navbar-item:focus,.navbar.is-info .navbar-end>a.navbar-item:hover,.navbar.is-info .navbar-start .navbar-link.is-active,.navbar.is-info .navbar-start .navbar-link:focus,.navbar.is-info .navbar-start .navbar-link:hover,.navbar.is-info .navbar-start>a.navbar-item.is-active,.navbar.is-info .navbar-start>a.navbar-item:focus,.navbar.is-info .navbar-start>a.navbar-item:hover{background-color:#238cd1;color:#fff}.navbar.is-info .navbar-end .navbar-link::after,.navbar.is-info .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-info .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-info .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-info .navbar-item.has-dropdown:hover .navbar-link{background-color:#238cd1;color:#fff}.navbar.is-info .navbar-dropdown a.navbar-item.is-active{background-color:#3298dc;color:#fff}}.navbar.is-success{background-color:#48c774;color:#fff}.navbar.is-success .navbar-brand .navbar-link,.navbar.is-success .navbar-brand>.navbar-item{color:#fff}.navbar.is-success .navbar-brand .navbar-link.is-active,.navbar.is-success .navbar-brand .navbar-link:focus,.navbar.is-success .navbar-brand .navbar-link:hover,.navbar.is-success .navbar-brand>a.navbar-item.is-active,.navbar.is-success .navbar-brand>a.navbar-item:focus,.navbar.is-success .navbar-brand>a.navbar-item:hover{background-color:#3abb67;color:#fff}.navbar.is-success .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-success .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-success .navbar-end .navbar-link,.navbar.is-success .navbar-end>.navbar-item,.navbar.is-success .navbar-start 
.navbar-link,.navbar.is-success .navbar-start>.navbar-item{color:#fff}.navbar.is-success .navbar-end .navbar-link.is-active,.navbar.is-success .navbar-end .navbar-link:focus,.navbar.is-success .navbar-end .navbar-link:hover,.navbar.is-success .navbar-end>a.navbar-item.is-active,.navbar.is-success .navbar-end>a.navbar-item:focus,.navbar.is-success .navbar-end>a.navbar-item:hover,.navbar.is-success .navbar-start .navbar-link.is-active,.navbar.is-success .navbar-start .navbar-link:focus,.navbar.is-success .navbar-start .navbar-link:hover,.navbar.is-success .navbar-start>a.navbar-item.is-active,.navbar.is-success .navbar-start>a.navbar-item:focus,.navbar.is-success .navbar-start>a.navbar-item:hover{background-color:#3abb67;color:#fff}.navbar.is-success .navbar-end .navbar-link::after,.navbar.is-success .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-success .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-success .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-success .navbar-item.has-dropdown:hover .navbar-link{background-color:#3abb67;color:#fff}.navbar.is-success .navbar-dropdown a.navbar-item.is-active{background-color:#48c774;color:#fff}}.navbar.is-warning{background-color:#ffdd57;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link,.navbar.is-warning .navbar-brand>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link.is-active,.navbar.is-warning .navbar-brand .navbar-link:focus,.navbar.is-warning .navbar-brand .navbar-link:hover,.navbar.is-warning .navbar-brand>a.navbar-item.is-active,.navbar.is-warning .navbar-brand>a.navbar-item:focus,.navbar.is-warning .navbar-brand>a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-brand .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-burger{color:rgba(0,0,0,.7)}@media screen and (min-width:1024px){.navbar.is-warning .navbar-end .navbar-link,.navbar.is-warning .navbar-end>.navbar-item,.navbar.is-warning .navbar-start .navbar-link,.navbar.is-warning .navbar-start>.navbar-item{color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-end .navbar-link.is-active,.navbar.is-warning .navbar-end .navbar-link:focus,.navbar.is-warning .navbar-end .navbar-link:hover,.navbar.is-warning .navbar-end>a.navbar-item.is-active,.navbar.is-warning .navbar-end>a.navbar-item:focus,.navbar.is-warning .navbar-end>a.navbar-item:hover,.navbar.is-warning .navbar-start .navbar-link.is-active,.navbar.is-warning .navbar-start .navbar-link:focus,.navbar.is-warning .navbar-start .navbar-link:hover,.navbar.is-warning .navbar-start>a.navbar-item.is-active,.navbar.is-warning .navbar-start>a.navbar-item:focus,.navbar.is-warning .navbar-start>a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-end .navbar-link::after,.navbar.is-warning .navbar-start .navbar-link::after{border-color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-warning .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-warning .navbar-item.has-dropdown:hover .navbar-link{background-color:#ffd83d;color:rgba(0,0,0,.7)}.navbar.is-warning .navbar-dropdown a.navbar-item.is-active{background-color:#ffdd57;color:rgba(0,0,0,.7)}}.navbar.is-danger{background-color:#f14668;color:#fff}.navbar.is-danger .navbar-brand .navbar-link,.navbar.is-danger .navbar-brand>.navbar-item{color:#fff}.navbar.is-danger .navbar-brand .navbar-link.is-active,.navbar.is-danger .navbar-brand .navbar-link:focus,.navbar.is-danger 
.navbar-brand .navbar-link:hover,.navbar.is-danger .navbar-brand>a.navbar-item.is-active,.navbar.is-danger .navbar-brand>a.navbar-item:focus,.navbar.is-danger .navbar-brand>a.navbar-item:hover{background-color:#ef2e55;color:#fff}.navbar.is-danger .navbar-brand .navbar-link::after{border-color:#fff}.navbar.is-danger .navbar-burger{color:#fff}@media screen and (min-width:1024px){.navbar.is-danger .navbar-end .navbar-link,.navbar.is-danger .navbar-end>.navbar-item,.navbar.is-danger .navbar-start .navbar-link,.navbar.is-danger .navbar-start>.navbar-item{color:#fff}.navbar.is-danger .navbar-end .navbar-link.is-active,.navbar.is-danger .navbar-end .navbar-link:focus,.navbar.is-danger .navbar-end .navbar-link:hover,.navbar.is-danger .navbar-end>a.navbar-item.is-active,.navbar.is-danger .navbar-end>a.navbar-item:focus,.navbar.is-danger .navbar-end>a.navbar-item:hover,.navbar.is-danger .navbar-start .navbar-link.is-active,.navbar.is-danger .navbar-start .navbar-link:focus,.navbar.is-danger .navbar-start .navbar-link:hover,.navbar.is-danger .navbar-start>a.navbar-item.is-active,.navbar.is-danger .navbar-start>a.navbar-item:focus,.navbar.is-danger .navbar-start>a.navbar-item:hover{background-color:#ef2e55;color:#fff}.navbar.is-danger .navbar-end .navbar-link::after,.navbar.is-danger .navbar-start .navbar-link::after{border-color:#fff}.navbar.is-danger .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-danger .navbar-item.has-dropdown:focus .navbar-link,.navbar.is-danger .navbar-item.has-dropdown:hover .navbar-link{background-color:#ef2e55;color:#fff}.navbar.is-danger .navbar-dropdown a.navbar-item.is-active{background-color:#f14668;color:#fff}}.navbar>.container{align-items:stretch;display:flex;min-height:3.25rem;width:100%}.navbar.has-shadow{box-shadow:0 2px 0 0 #f5f5f5}.navbar.is-fixed-bottom,.navbar.is-fixed-top{left:0;position:fixed;right:0;z-index:30}.navbar.is-fixed-bottom{bottom:0}.navbar.is-fixed-bottom.has-shadow{box-shadow:0 -2px 0 0 #f5f5f5}.navbar.is-fixed-top{top:0}body.has-navbar-fixed-top,html.has-navbar-fixed-top{padding-top:3.25rem}body.has-navbar-fixed-bottom,html.has-navbar-fixed-bottom{padding-bottom:3.25rem}.navbar-brand,.navbar-tabs{align-items:stretch;display:flex;flex-shrink:0;min-height:3.25rem}.navbar-brand a.navbar-item:focus,.navbar-brand a.navbar-item:hover{background-color:transparent}.navbar-tabs{-webkit-overflow-scrolling:touch;max-width:100vw;overflow-x:auto;overflow-y:hidden}.navbar-burger{color:#4a4a4a;cursor:pointer;display:block;height:3.25rem;position:relative;width:3.25rem;margin-right:auto}.navbar-burger span{background-color:currentColor;display:block;height:1px;left:calc(50% - 8px);position:absolute;transform-origin:center;transition-duration:86ms;transition-property:background-color,opacity,transform;transition-timing-function:ease-out;width:16px}.navbar-burger span:nth-child(1){top:calc(50% - 6px)}.navbar-burger span:nth-child(2){top:calc(50% - 1px)}.navbar-burger span:nth-child(3){top:calc(50% + 4px)}.navbar-burger:hover{background-color:rgba(0,0,0,.05)}.navbar-burger.is-active span:nth-child(1){transform:translateY(5px) rotate(45deg)}.navbar-burger.is-active span:nth-child(2){opacity:0}.navbar-burger.is-active span:nth-child(3){transform:translateY(-5px) rotate(-45deg)}.navbar-menu{display:none}.navbar-item,.navbar-link{color:#4a4a4a;display:block;line-height:1.5;padding:.5rem .75rem;position:relative}.navbar-item .icon:only-child,.navbar-link 
.icon:only-child{margin-left:-.25rem;margin-right:-.25rem}.navbar-link,a.navbar-item{cursor:pointer}.navbar-link.is-active,.navbar-link:focus,.navbar-link:focus-within,.navbar-link:hover,a.navbar-item.is-active,a.navbar-item:focus,a.navbar-item:focus-within,a.navbar-item:hover{background-color:#fafafa;color:#3273dc}.navbar-item{flex-grow:0;flex-shrink:0}.navbar-item img{max-height:1.75rem}.navbar-item.has-dropdown{padding:0}.navbar-item.is-expanded{flex-grow:1;flex-shrink:1}.navbar-item.is-tab{border-bottom:1px solid transparent;min-height:3.25rem;padding-bottom:calc(.5rem - 1px)}.navbar-item.is-tab:focus,.navbar-item.is-tab:hover{background-color:transparent;border-bottom-color:#3273dc}.navbar-item.is-tab.is-active{background-color:transparent;border-bottom-color:#3273dc;border-bottom-style:solid;border-bottom-width:3px;color:#3273dc;padding-bottom:calc(.5rem - 3px)}.navbar-content{flex-grow:1;flex-shrink:1}.navbar-link:not(.is-arrowless){padding-left:2.5em}.navbar-link:not(.is-arrowless)::after{border-color:#3273dc;margin-top:-.375em;left:1.125em}.navbar-dropdown{font-size:.875rem;padding-bottom:.5rem;padding-top:.5rem}.navbar-dropdown .navbar-item{padding-left:1.5rem;padding-right:1.5rem}.navbar-divider{background-color:#f5f5f5;border:none;display:none;height:2px;margin:.5rem 0}@media screen and (max-width:1023px){.navbar>.container{display:block}.navbar-brand .navbar-item,.navbar-tabs .navbar-item{align-items:center;display:flex}.navbar-link::after{display:none}.navbar-menu{background-color:#fff;box-shadow:0 8px 16px rgba(10,10,10,.1);padding:.5rem 0}.navbar-menu.is-active{display:block}.navbar.is-fixed-bottom-touch,.navbar.is-fixed-top-touch{left:0;position:fixed;right:0;z-index:30}.navbar.is-fixed-bottom-touch{bottom:0}.navbar.is-fixed-bottom-touch.has-shadow{box-shadow:0 -2px 3px rgba(10,10,10,.1)}.navbar.is-fixed-top-touch{top:0}.navbar.is-fixed-top .navbar-menu,.navbar.is-fixed-top-touch .navbar-menu{-webkit-overflow-scrolling:touch;max-height:calc(100vh - 3.25rem);overflow:auto}body.has-navbar-fixed-top-touch,html.has-navbar-fixed-top-touch{padding-top:3.25rem}body.has-navbar-fixed-bottom-touch,html.has-navbar-fixed-bottom-touch{padding-bottom:3.25rem}}@media screen and (min-width:1024px){.navbar,.navbar-end,.navbar-menu,.navbar-start{align-items:stretch;display:flex}.navbar{min-height:3.25rem}.navbar.is-spaced{padding:1rem 2rem}.navbar.is-spaced .navbar-end,.navbar.is-spaced .navbar-start{align-items:center}.navbar.is-spaced .navbar-link,.navbar.is-spaced a.navbar-item{border-radius:4px}.navbar.is-transparent .navbar-link.is-active,.navbar.is-transparent .navbar-link:focus,.navbar.is-transparent .navbar-link:hover,.navbar.is-transparent a.navbar-item.is-active,.navbar.is-transparent a.navbar-item:focus,.navbar.is-transparent a.navbar-item:hover{background-color:transparent!important}.navbar.is-transparent .navbar-item.has-dropdown.is-active .navbar-link,.navbar.is-transparent .navbar-item.has-dropdown.is-hoverable:focus .navbar-link,.navbar.is-transparent .navbar-item.has-dropdown.is-hoverable:focus-within .navbar-link,.navbar.is-transparent .navbar-item.has-dropdown.is-hoverable:hover .navbar-link{background-color:transparent!important}.navbar.is-transparent .navbar-dropdown a.navbar-item:focus,.navbar.is-transparent .navbar-dropdown a.navbar-item:hover{background-color:#f5f5f5;color:#0a0a0a}.navbar.is-transparent .navbar-dropdown 
a.navbar-item.is-active{background-color:#f5f5f5;color:#3273dc}.navbar-burger{display:none}.navbar-item,.navbar-link{align-items:center;display:flex}.navbar-item.has-dropdown{align-items:stretch}.navbar-item.has-dropdown-up .navbar-link::after{transform:rotate(135deg) translate(.25em,-.25em)}.navbar-item.has-dropdown-up .navbar-dropdown{border-bottom:2px solid #dbdbdb;border-radius:6px 6px 0 0;border-top:none;bottom:100%;box-shadow:0 -8px 8px rgba(10,10,10,.1);top:auto}.navbar-item.is-active .navbar-dropdown,.navbar-item.is-hoverable:focus .navbar-dropdown,.navbar-item.is-hoverable:focus-within .navbar-dropdown,.navbar-item.is-hoverable:hover .navbar-dropdown{display:block}.navbar-item.is-active .navbar-dropdown.is-boxed,.navbar-item.is-hoverable:focus .navbar-dropdown.is-boxed,.navbar-item.is-hoverable:focus-within .navbar-dropdown.is-boxed,.navbar-item.is-hoverable:hover .navbar-dropdown.is-boxed,.navbar.is-spaced .navbar-item.is-active .navbar-dropdown,.navbar.is-spaced .navbar-item.is-hoverable:focus .navbar-dropdown,.navbar.is-spaced .navbar-item.is-hoverable:focus-within .navbar-dropdown,.navbar.is-spaced .navbar-item.is-hoverable:hover .navbar-dropdown{opacity:1;pointer-events:auto;transform:translateY(0)}.navbar-menu{flex-grow:1;flex-shrink:0}.navbar-start{justify-content:flex-start;margin-left:auto}.navbar-end{justify-content:flex-end;margin-right:auto}.navbar-dropdown{background-color:#fff;border-bottom-left-radius:6px;border-bottom-right-radius:6px;border-top:2px solid #dbdbdb;box-shadow:0 8px 8px rgba(10,10,10,.1);display:none;font-size:.875rem;right:0;min-width:100%;position:absolute;top:100%;z-index:20}.navbar-dropdown .navbar-item{padding:.375rem 1rem;white-space:nowrap}.navbar-dropdown a.navbar-item{padding-left:3rem}.navbar-dropdown a.navbar-item:focus,.navbar-dropdown a.navbar-item:hover{background-color:#f5f5f5;color:#0a0a0a}.navbar-dropdown a.navbar-item.is-active{background-color:#f5f5f5;color:#3273dc}.navbar-dropdown.is-boxed,.navbar.is-spaced .navbar-dropdown{border-radius:6px;border-top:none;box-shadow:0 8px 8px rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.1);display:block;opacity:0;pointer-events:none;top:calc(100% + (-4px));transform:translateY(-5px);transition-duration:86ms;transition-property:opacity,transform}.navbar-dropdown.is-right{left:auto;right:0}.navbar-divider{display:block}.container>.navbar .navbar-brand,.navbar>.container .navbar-brand{margin-right:-.75rem}.container>.navbar .navbar-menu,.navbar>.container .navbar-menu{margin-left:-.75rem}.navbar.is-fixed-bottom-desktop,.navbar.is-fixed-top-desktop{left:0;position:fixed;right:0;z-index:30}.navbar.is-fixed-bottom-desktop{bottom:0}.navbar.is-fixed-bottom-desktop.has-shadow{box-shadow:0 -2px 3px rgba(10,10,10,.1)}.navbar.is-fixed-top-desktop{top:0}body.has-navbar-fixed-top-desktop,html.has-navbar-fixed-top-desktop{padding-top:3.25rem}body.has-navbar-fixed-bottom-desktop,html.has-navbar-fixed-bottom-desktop{padding-bottom:3.25rem}body.has-spaced-navbar-fixed-top,html.has-spaced-navbar-fixed-top{padding-top:5.25rem}body.has-spaced-navbar-fixed-bottom,html.has-spaced-navbar-fixed-bottom{padding-bottom:5.25rem}.navbar-link.is-active,a.navbar-item.is-active{color:#0a0a0a}.navbar-link.is-active:not(:focus):not(:hover),a.navbar-item.is-active:not(:focus):not(:hover){background-color:transparent}.navbar-item.has-dropdown.is-active .navbar-link,.navbar-item.has-dropdown:focus .navbar-link,.navbar-item.has-dropdown:hover .navbar-link{background-color:#fafafa}}.hero.is-fullheight-with-navbar{min-height:calc(100vh - 
3.25rem)}.pagination{font-size:1rem;margin:-.25rem}.pagination.is-small{font-size:.75rem}.pagination.is-medium{font-size:1.25rem}.pagination.is-large{font-size:1.5rem}.pagination.is-rounded .pagination-next,.pagination.is-rounded .pagination-previous{padding-left:1em;padding-right:1em;border-radius:290486px}.pagination.is-rounded .pagination-link{border-radius:290486px}.pagination,.pagination-list{align-items:center;display:flex;justify-content:center;text-align:center}.pagination-ellipsis,.pagination-link,.pagination-next,.pagination-previous{font-size:1em;justify-content:center;margin:.25rem;padding-left:.5em;padding-right:.5em;text-align:center}.pagination-link,.pagination-next,.pagination-previous{border-color:#dbdbdb;color:#363636;min-width:2.5em}.pagination-link:hover,.pagination-next:hover,.pagination-previous:hover{border-color:#b5b5b5;color:#363636}.pagination-link:focus,.pagination-next:focus,.pagination-previous:focus{border-color:#3273dc}.pagination-link:active,.pagination-next:active,.pagination-previous:active{box-shadow:inset 0 1px 2px rgba(10,10,10,.2)}.pagination-link[disabled],.pagination-next[disabled],.pagination-previous[disabled]{background-color:#dbdbdb;border-color:#dbdbdb;box-shadow:none;color:#7a7a7a;opacity:.5}.pagination-next,.pagination-previous{padding-left:.75em;padding-right:.75em;white-space:nowrap}.pagination-link.is-current{background-color:#3273dc;border-color:#3273dc;color:#fff}.pagination-ellipsis{color:#b5b5b5;pointer-events:none}.pagination-list{flex-wrap:wrap}@media screen and (max-width:768px){.pagination{flex-wrap:wrap}.pagination-next,.pagination-previous{flex-grow:1;flex-shrink:1}.pagination-list li{flex-grow:1;flex-shrink:1}}@media screen and (min-width:769px),print{.pagination-list{flex-grow:1;flex-shrink:1;justify-content:flex-start;order:1}.pagination-previous{order:2}.pagination-next{order:3}.pagination{justify-content:space-between}.pagination.is-centered .pagination-previous{order:1}.pagination.is-centered .pagination-list{justify-content:center;order:2}.pagination.is-centered .pagination-next{order:3}.pagination.is-right .pagination-previous{order:1}.pagination.is-right .pagination-next{order:2}.pagination.is-right .pagination-list{justify-content:flex-end;order:3}}.panel{border-radius:6px;box-shadow:0 .5em 1em -.125em rgba(10,10,10,.1),0 0 0 1px rgba(10,10,10,.02);font-size:1rem}.panel:not(:last-child){margin-bottom:1.5rem}.panel.is-white .panel-heading{background-color:#fff;color:#0a0a0a}.panel.is-white .panel-tabs a.is-active{border-bottom-color:#fff}.panel.is-white .panel-block.is-active .panel-icon{color:#fff}.panel.is-black .panel-heading{background-color:#0a0a0a;color:#fff}.panel.is-black .panel-tabs a.is-active{border-bottom-color:#0a0a0a}.panel.is-black .panel-block.is-active .panel-icon{color:#0a0a0a}.panel.is-light .panel-heading{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.panel.is-light .panel-tabs a.is-active{border-bottom-color:#f5f5f5}.panel.is-light .panel-block.is-active .panel-icon{color:#f5f5f5}.panel.is-dark .panel-heading{background-color:#363636;color:#fff}.panel.is-dark .panel-tabs a.is-active{border-bottom-color:#363636}.panel.is-dark .panel-block.is-active .panel-icon{color:#363636}.panel.is-primary .panel-heading{background-color:#00d1b2;color:#fff}.panel.is-primary .panel-tabs a.is-active{border-bottom-color:#00d1b2}.panel.is-primary .panel-block.is-active .panel-icon{color:#00d1b2}.panel.is-link .panel-heading{background-color:#3273dc;color:#fff}.panel.is-link .panel-tabs 
a.is-active{border-bottom-color:#3273dc}.panel.is-link .panel-block.is-active .panel-icon{color:#3273dc}.panel.is-info .panel-heading{background-color:#3298dc;color:#fff}.panel.is-info .panel-tabs a.is-active{border-bottom-color:#3298dc}.panel.is-info .panel-block.is-active .panel-icon{color:#3298dc}.panel.is-success .panel-heading{background-color:#48c774;color:#fff}.panel.is-success .panel-tabs a.is-active{border-bottom-color:#48c774}.panel.is-success .panel-block.is-active .panel-icon{color:#48c774}.panel.is-warning .panel-heading{background-color:#ffdd57;color:rgba(0,0,0,.7)}.panel.is-warning .panel-tabs a.is-active{border-bottom-color:#ffdd57}.panel.is-warning .panel-block.is-active .panel-icon{color:#ffdd57}.panel.is-danger .panel-heading{background-color:#f14668;color:#fff}.panel.is-danger .panel-tabs a.is-active{border-bottom-color:#f14668}.panel.is-danger .panel-block.is-active .panel-icon{color:#f14668}.panel-block:not(:last-child),.panel-tabs:not(:last-child){border-bottom:1px solid #ededed}.panel-heading{background-color:#ededed;border-radius:6px 6px 0 0;color:#363636;font-size:1.25em;font-weight:700;line-height:1.25;padding:.75em 1em}.panel-tabs{align-items:flex-end;display:flex;font-size:.875em;justify-content:center}.panel-tabs a{border-bottom:1px solid #dbdbdb;margin-bottom:-1px;padding:.5em}.panel-tabs a.is-active{border-bottom-color:#4a4a4a;color:#363636}.panel-list a{color:#4a4a4a}.panel-list a:hover{color:#3273dc}.panel-block{align-items:center;color:#363636;display:flex;justify-content:flex-start;padding:.5em .75em}.panel-block input[type=checkbox]{margin-left:.75em}.panel-block>.control{flex-grow:1;flex-shrink:1;width:100%}.panel-block.is-wrapped{flex-wrap:wrap}.panel-block.is-active{border-left-color:#3273dc;color:#363636}.panel-block.is-active .panel-icon{color:#3273dc}.panel-block:last-child{border-bottom-left-radius:6px;border-bottom-right-radius:6px}a.panel-block,label.panel-block{cursor:pointer}a.panel-block:hover,label.panel-block:hover{background-color:#f5f5f5}.panel-icon{display:inline-block;font-size:14px;height:1em;line-height:1em;text-align:center;vertical-align:top;width:1em;color:#7a7a7a;margin-left:.75em}.panel-icon .fa{font-size:inherit;line-height:inherit}.tabs{-webkit-overflow-scrolling:touch;align-items:stretch;display:flex;font-size:1rem;justify-content:space-between;overflow:hidden;overflow-x:auto;white-space:nowrap}.tabs a{align-items:center;border-bottom-color:#dbdbdb;border-bottom-style:solid;border-bottom-width:1px;color:#4a4a4a;display:flex;justify-content:center;margin-bottom:-1px;padding:.5em 1em;vertical-align:top}.tabs a:hover{border-bottom-color:#363636;color:#363636}.tabs li{display:block}.tabs li.is-active a{border-bottom-color:#3273dc;color:#3273dc}.tabs ul{align-items:center;border-bottom-color:#dbdbdb;border-bottom-style:solid;border-bottom-width:1px;display:flex;flex-grow:1;flex-shrink:0;justify-content:flex-start}.tabs ul.is-left{padding-right:.75em}.tabs ul.is-center{flex:none;justify-content:center;padding-left:.75em;padding-right:.75em}.tabs ul.is-right{justify-content:flex-end;padding-left:.75em}.tabs .icon:first-child{margin-left:.5em}.tabs .icon:last-child{margin-right:.5em}.tabs.is-centered ul{justify-content:center}.tabs.is-right ul{justify-content:flex-end}.tabs.is-boxed a{border:1px solid transparent;border-radius:0 0 4px 4px}.tabs.is-boxed a:hover{background-color:#f5f5f5;border-bottom-color:#dbdbdb}.tabs.is-boxed li.is-active 
a{background-color:#fff;border-color:#dbdbdb;border-bottom-color:transparent!important}.tabs.is-fullwidth li{flex-grow:1;flex-shrink:0}.tabs.is-toggle a{border-color:#dbdbdb;border-style:solid;border-width:1px;margin-bottom:0;position:relative}.tabs.is-toggle a:hover{background-color:#f5f5f5;border-color:#b5b5b5;z-index:2}.tabs.is-toggle li+li{margin-right:-1px}.tabs.is-toggle li:first-child a{border-top-right-radius:4px;border-bottom-right-radius:4px}.tabs.is-toggle li:last-child a{border-top-left-radius:4px;border-bottom-left-radius:4px}.tabs.is-toggle li.is-active a{background-color:#3273dc;border-color:#3273dc;color:#fff;z-index:1}.tabs.is-toggle ul{border-bottom:none}.tabs.is-toggle.is-toggle-rounded li:first-child a{border-bottom-right-radius:290486px;border-top-right-radius:290486px;padding-right:1.25em}.tabs.is-toggle.is-toggle-rounded li:last-child a{border-bottom-left-radius:290486px;border-top-left-radius:290486px;padding-left:1.25em}.tabs.is-small{font-size:.75rem}.tabs.is-medium{font-size:1.25rem}.tabs.is-large{font-size:1.5rem}.column{display:block;flex-basis:0;flex-grow:1;flex-shrink:1;padding:.75rem}.columns.is-mobile>.column.is-narrow{flex:none}.columns.is-mobile>.column.is-full{flex:none;width:100%}.columns.is-mobile>.column.is-three-quarters{flex:none;width:75%}.columns.is-mobile>.column.is-two-thirds{flex:none;width:66.6666%}.columns.is-mobile>.column.is-half{flex:none;width:50%}.columns.is-mobile>.column.is-one-third{flex:none;width:33.3333%}.columns.is-mobile>.column.is-one-quarter{flex:none;width:25%}.columns.is-mobile>.column.is-one-fifth{flex:none;width:20%}.columns.is-mobile>.column.is-two-fifths{flex:none;width:40%}.columns.is-mobile>.column.is-three-fifths{flex:none;width:60%}.columns.is-mobile>.column.is-four-fifths{flex:none;width:80%}.columns.is-mobile>.column.is-offset-three-quarters{margin-left:75%}.columns.is-mobile>.column.is-offset-two-thirds{margin-left:66.6666%}.columns.is-mobile>.column.is-offset-half{margin-left:50%}.columns.is-mobile>.column.is-offset-one-third{margin-left:33.3333%}.columns.is-mobile>.column.is-offset-one-quarter{margin-left:25%}.columns.is-mobile>.column.is-offset-one-fifth{margin-left:20%}.columns.is-mobile>.column.is-offset-two-fifths{margin-left:40%}.columns.is-mobile>.column.is-offset-three-fifths{margin-left:60%}.columns.is-mobile>.column.is-offset-four-fifths{margin-left:80%}.columns.is-mobile>.column.is-0{flex:none;width:0%}.columns.is-mobile>.column.is-offset-0{margin-left:0}.columns.is-mobile>.column.is-1{flex:none;width:8.33333%}.columns.is-mobile>.column.is-offset-1{margin-left:8.33333%}.columns.is-mobile>.column.is-2{flex:none;width:16.66667%}.columns.is-mobile>.column.is-offset-2{margin-left:16.66667%}.columns.is-mobile>.column.is-3{flex:none;width:25%}.columns.is-mobile>.column.is-offset-3{margin-left:25%}.columns.is-mobile>.column.is-4{flex:none;width:33.33333%}.columns.is-mobile>.column.is-offset-4{margin-left:33.33333%}.columns.is-mobile>.column.is-5{flex:none;width:41.66667%}.columns.is-mobile>.column.is-offset-5{margin-left:41.66667%}.columns.is-mobile>.column.is-6{flex:none;width:50%}.columns.is-mobile>.column.is-offset-6{margin-left:50%}.columns.is-mobile>.column.is-7{flex:none;width:58.33333%}.columns.is-mobile>.column.is-offset-7{margin-left:58.33333%}.columns.is-mobile>.column.is-8{flex:none;width:66.66667%}.columns.is-mobile>.column.is-offset-8{margin-left:66.66667%}.columns.is-mobile>.column.is-9{flex:none;width:75%}.columns.is-mobile>.column.is-offset-9{margin-left:75%}.columns.is-mobile>.column.is-10{flex
:none;width:83.33333%}.columns.is-mobile>.column.is-offset-10{margin-left:83.33333%}.columns.is-mobile>.column.is-11{flex:none;width:91.66667%}.columns.is-mobile>.column.is-offset-11{margin-left:91.66667%}.columns.is-mobile>.column.is-12{flex:none;width:100%}.columns.is-mobile>.column.is-offset-12{margin-left:100%}@media screen and (max-width:768px){.column.is-narrow-mobile{flex:none}.column.is-full-mobile{flex:none;width:100%}.column.is-three-quarters-mobile{flex:none;width:75%}.column.is-two-thirds-mobile{flex:none;width:66.6666%}.column.is-half-mobile{flex:none;width:50%}.column.is-one-third-mobile{flex:none;width:33.3333%}.column.is-one-quarter-mobile{flex:none;width:25%}.column.is-one-fifth-mobile{flex:none;width:20%}.column.is-two-fifths-mobile{flex:none;width:40%}.column.is-three-fifths-mobile{flex:none;width:60%}.column.is-four-fifths-mobile{flex:none;width:80%}.column.is-offset-three-quarters-mobile{margin-left:75%}.column.is-offset-two-thirds-mobile{margin-left:66.6666%}.column.is-offset-half-mobile{margin-left:50%}.column.is-offset-one-third-mobile{margin-left:33.3333%}.column.is-offset-one-quarter-mobile{margin-left:25%}.column.is-offset-one-fifth-mobile{margin-left:20%}.column.is-offset-two-fifths-mobile{margin-left:40%}.column.is-offset-three-fifths-mobile{margin-left:60%}.column.is-offset-four-fifths-mobile{margin-left:80%}.column.is-0-mobile{flex:none;width:0%}.column.is-offset-0-mobile{margin-left:0}.column.is-1-mobile{flex:none;width:8.33333%}.column.is-offset-1-mobile{margin-left:8.33333%}.column.is-2-mobile{flex:none;width:16.66667%}.column.is-offset-2-mobile{margin-left:16.66667%}.column.is-3-mobile{flex:none;width:25%}.column.is-offset-3-mobile{margin-left:25%}.column.is-4-mobile{flex:none;width:33.33333%}.column.is-offset-4-mobile{margin-left:33.33333%}.column.is-5-mobile{flex:none;width:41.66667%}.column.is-offset-5-mobile{margin-left:41.66667%}.column.is-6-mobile{flex:none;width:50%}.column.is-offset-6-mobile{margin-left:50%}.column.is-7-mobile{flex:none;width:58.33333%}.column.is-offset-7-mobile{margin-left:58.33333%}.column.is-8-mobile{flex:none;width:66.66667%}.column.is-offset-8-mobile{margin-left:66.66667%}.column.is-9-mobile{flex:none;width:75%}.column.is-offset-9-mobile{margin-left:75%}.column.is-10-mobile{flex:none;width:83.33333%}.column.is-offset-10-mobile{margin-left:83.33333%}.column.is-11-mobile{flex:none;width:91.66667%}.column.is-offset-11-mobile{margin-left:91.66667%}.column.is-12-mobile{flex:none;width:100%}.column.is-offset-12-mobile{margin-left:100%}}@media screen and 
(min-width:769px),print{.column.is-narrow,.column.is-narrow-tablet{flex:none}.column.is-full,.column.is-full-tablet{flex:none;width:100%}.column.is-three-quarters,.column.is-three-quarters-tablet{flex:none;width:75%}.column.is-two-thirds,.column.is-two-thirds-tablet{flex:none;width:66.6666%}.column.is-half,.column.is-half-tablet{flex:none;width:50%}.column.is-one-third,.column.is-one-third-tablet{flex:none;width:33.3333%}.column.is-one-quarter,.column.is-one-quarter-tablet{flex:none;width:25%}.column.is-one-fifth,.column.is-one-fifth-tablet{flex:none;width:20%}.column.is-two-fifths,.column.is-two-fifths-tablet{flex:none;width:40%}.column.is-three-fifths,.column.is-three-fifths-tablet{flex:none;width:60%}.column.is-four-fifths,.column.is-four-fifths-tablet{flex:none;width:80%}.column.is-offset-three-quarters,.column.is-offset-three-quarters-tablet{margin-left:75%}.column.is-offset-two-thirds,.column.is-offset-two-thirds-tablet{margin-left:66.6666%}.column.is-offset-half,.column.is-offset-half-tablet{margin-left:50%}.column.is-offset-one-third,.column.is-offset-one-third-tablet{margin-left:33.3333%}.column.is-offset-one-quarter,.column.is-offset-one-quarter-tablet{margin-left:25%}.column.is-offset-one-fifth,.column.is-offset-one-fifth-tablet{margin-left:20%}.column.is-offset-two-fifths,.column.is-offset-two-fifths-tablet{margin-left:40%}.column.is-offset-three-fifths,.column.is-offset-three-fifths-tablet{margin-left:60%}.column.is-offset-four-fifths,.column.is-offset-four-fifths-tablet{margin-left:80%}.column.is-0,.column.is-0-tablet{flex:none;width:0%}.column.is-offset-0,.column.is-offset-0-tablet{margin-left:0}.column.is-1,.column.is-1-tablet{flex:none;width:8.33333%}.column.is-offset-1,.column.is-offset-1-tablet{margin-left:8.33333%}.column.is-2,.column.is-2-tablet{flex:none;width:16.66667%}.column.is-offset-2,.column.is-offset-2-tablet{margin-left:16.66667%}.column.is-3,.column.is-3-tablet{flex:none;width:25%}.column.is-offset-3,.column.is-offset-3-tablet{margin-left:25%}.column.is-4,.column.is-4-tablet{flex:none;width:33.33333%}.column.is-offset-4,.column.is-offset-4-tablet{margin-left:33.33333%}.column.is-5,.column.is-5-tablet{flex:none;width:41.66667%}.column.is-offset-5,.column.is-offset-5-tablet{margin-left:41.66667%}.column.is-6,.column.is-6-tablet{flex:none;width:50%}.column.is-offset-6,.column.is-offset-6-tablet{margin-left:50%}.column.is-7,.column.is-7-tablet{flex:none;width:58.33333%}.column.is-offset-7,.column.is-offset-7-tablet{margin-left:58.33333%}.column.is-8,.column.is-8-tablet{flex:none;width:66.66667%}.column.is-offset-8,.column.is-offset-8-tablet{margin-left:66.66667%}.column.is-9,.column.is-9-tablet{flex:none;width:75%}.column.is-offset-9,.column.is-offset-9-tablet{margin-left:75%}.column.is-10,.column.is-10-tablet{flex:none;width:83.33333%}.column.is-offset-10,.column.is-offset-10-tablet{margin-left:83.33333%}.column.is-11,.column.is-11-tablet{flex:none;width:91.66667%}.column.is-offset-11,.column.is-offset-11-tablet{margin-left:91.66667%}.column.is-12,.column.is-12-tablet{flex:none;width:100%}.column.is-offset-12,.column.is-offset-12-tablet{margin-left:100%}}@media screen and 
(max-width:1023px){.column.is-narrow-touch{flex:none}.column.is-full-touch{flex:none;width:100%}.column.is-three-quarters-touch{flex:none;width:75%}.column.is-two-thirds-touch{flex:none;width:66.6666%}.column.is-half-touch{flex:none;width:50%}.column.is-one-third-touch{flex:none;width:33.3333%}.column.is-one-quarter-touch{flex:none;width:25%}.column.is-one-fifth-touch{flex:none;width:20%}.column.is-two-fifths-touch{flex:none;width:40%}.column.is-three-fifths-touch{flex:none;width:60%}.column.is-four-fifths-touch{flex:none;width:80%}.column.is-offset-three-quarters-touch{margin-left:75%}.column.is-offset-two-thirds-touch{margin-left:66.6666%}.column.is-offset-half-touch{margin-left:50%}.column.is-offset-one-third-touch{margin-left:33.3333%}.column.is-offset-one-quarter-touch{margin-left:25%}.column.is-offset-one-fifth-touch{margin-left:20%}.column.is-offset-two-fifths-touch{margin-left:40%}.column.is-offset-three-fifths-touch{margin-left:60%}.column.is-offset-four-fifths-touch{margin-left:80%}.column.is-0-touch{flex:none;width:0%}.column.is-offset-0-touch{margin-left:0}.column.is-1-touch{flex:none;width:8.33333%}.column.is-offset-1-touch{margin-left:8.33333%}.column.is-2-touch{flex:none;width:16.66667%}.column.is-offset-2-touch{margin-left:16.66667%}.column.is-3-touch{flex:none;width:25%}.column.is-offset-3-touch{margin-left:25%}.column.is-4-touch{flex:none;width:33.33333%}.column.is-offset-4-touch{margin-left:33.33333%}.column.is-5-touch{flex:none;width:41.66667%}.column.is-offset-5-touch{margin-left:41.66667%}.column.is-6-touch{flex:none;width:50%}.column.is-offset-6-touch{margin-left:50%}.column.is-7-touch{flex:none;width:58.33333%}.column.is-offset-7-touch{margin-left:58.33333%}.column.is-8-touch{flex:none;width:66.66667%}.column.is-offset-8-touch{margin-left:66.66667%}.column.is-9-touch{flex:none;width:75%}.column.is-offset-9-touch{margin-left:75%}.column.is-10-touch{flex:none;width:83.33333%}.column.is-offset-10-touch{margin-left:83.33333%}.column.is-11-touch{flex:none;width:91.66667%}.column.is-offset-11-touch{margin-left:91.66667%}.column.is-12-touch{flex:none;width:100%}.column.is-offset-12-touch{margin-left:100%}}@media screen and 
(min-width:1024px){.column.is-narrow-desktop{flex:none}.column.is-full-desktop{flex:none;width:100%}.column.is-three-quarters-desktop{flex:none;width:75%}.column.is-two-thirds-desktop{flex:none;width:66.6666%}.column.is-half-desktop{flex:none;width:50%}.column.is-one-third-desktop{flex:none;width:33.3333%}.column.is-one-quarter-desktop{flex:none;width:25%}.column.is-one-fifth-desktop{flex:none;width:20%}.column.is-two-fifths-desktop{flex:none;width:40%}.column.is-three-fifths-desktop{flex:none;width:60%}.column.is-four-fifths-desktop{flex:none;width:80%}.column.is-offset-three-quarters-desktop{margin-left:75%}.column.is-offset-two-thirds-desktop{margin-left:66.6666%}.column.is-offset-half-desktop{margin-left:50%}.column.is-offset-one-third-desktop{margin-left:33.3333%}.column.is-offset-one-quarter-desktop{margin-left:25%}.column.is-offset-one-fifth-desktop{margin-left:20%}.column.is-offset-two-fifths-desktop{margin-left:40%}.column.is-offset-three-fifths-desktop{margin-left:60%}.column.is-offset-four-fifths-desktop{margin-left:80%}.column.is-0-desktop{flex:none;width:0%}.column.is-offset-0-desktop{margin-left:0}.column.is-1-desktop{flex:none;width:8.33333%}.column.is-offset-1-desktop{margin-left:8.33333%}.column.is-2-desktop{flex:none;width:16.66667%}.column.is-offset-2-desktop{margin-left:16.66667%}.column.is-3-desktop{flex:none;width:25%}.column.is-offset-3-desktop{margin-left:25%}.column.is-4-desktop{flex:none;width:33.33333%}.column.is-offset-4-desktop{margin-left:33.33333%}.column.is-5-desktop{flex:none;width:41.66667%}.column.is-offset-5-desktop{margin-left:41.66667%}.column.is-6-desktop{flex:none;width:50%}.column.is-offset-6-desktop{margin-left:50%}.column.is-7-desktop{flex:none;width:58.33333%}.column.is-offset-7-desktop{margin-left:58.33333%}.column.is-8-desktop{flex:none;width:66.66667%}.column.is-offset-8-desktop{margin-left:66.66667%}.column.is-9-desktop{flex:none;width:75%}.column.is-offset-9-desktop{margin-left:75%}.column.is-10-desktop{flex:none;width:83.33333%}.column.is-offset-10-desktop{margin-left:83.33333%}.column.is-11-desktop{flex:none;width:91.66667%}.column.is-offset-11-desktop{margin-left:91.66667%}.column.is-12-desktop{flex:none;width:100%}.column.is-offset-12-desktop{margin-left:100%}}@media screen and 
(min-width:1216px){.column.is-narrow-widescreen{flex:none}.column.is-full-widescreen{flex:none;width:100%}.column.is-three-quarters-widescreen{flex:none;width:75%}.column.is-two-thirds-widescreen{flex:none;width:66.6666%}.column.is-half-widescreen{flex:none;width:50%}.column.is-one-third-widescreen{flex:none;width:33.3333%}.column.is-one-quarter-widescreen{flex:none;width:25%}.column.is-one-fifth-widescreen{flex:none;width:20%}.column.is-two-fifths-widescreen{flex:none;width:40%}.column.is-three-fifths-widescreen{flex:none;width:60%}.column.is-four-fifths-widescreen{flex:none;width:80%}.column.is-offset-three-quarters-widescreen{margin-left:75%}.column.is-offset-two-thirds-widescreen{margin-left:66.6666%}.column.is-offset-half-widescreen{margin-left:50%}.column.is-offset-one-third-widescreen{margin-left:33.3333%}.column.is-offset-one-quarter-widescreen{margin-left:25%}.column.is-offset-one-fifth-widescreen{margin-left:20%}.column.is-offset-two-fifths-widescreen{margin-left:40%}.column.is-offset-three-fifths-widescreen{margin-left:60%}.column.is-offset-four-fifths-widescreen{margin-left:80%}.column.is-0-widescreen{flex:none;width:0%}.column.is-offset-0-widescreen{margin-left:0}.column.is-1-widescreen{flex:none;width:8.33333%}.column.is-offset-1-widescreen{margin-left:8.33333%}.column.is-2-widescreen{flex:none;width:16.66667%}.column.is-offset-2-widescreen{margin-left:16.66667%}.column.is-3-widescreen{flex:none;width:25%}.column.is-offset-3-widescreen{margin-left:25%}.column.is-4-widescreen{flex:none;width:33.33333%}.column.is-offset-4-widescreen{margin-left:33.33333%}.column.is-5-widescreen{flex:none;width:41.66667%}.column.is-offset-5-widescreen{margin-left:41.66667%}.column.is-6-widescreen{flex:none;width:50%}.column.is-offset-6-widescreen{margin-left:50%}.column.is-7-widescreen{flex:none;width:58.33333%}.column.is-offset-7-widescreen{margin-left:58.33333%}.column.is-8-widescreen{flex:none;width:66.66667%}.column.is-offset-8-widescreen{margin-left:66.66667%}.column.is-9-widescreen{flex:none;width:75%}.column.is-offset-9-widescreen{margin-left:75%}.column.is-10-widescreen{flex:none;width:83.33333%}.column.is-offset-10-widescreen{margin-left:83.33333%}.column.is-11-widescreen{flex:none;width:91.66667%}.column.is-offset-11-widescreen{margin-left:91.66667%}.column.is-12-widescreen{flex:none;width:100%}.column.is-offset-12-widescreen{margin-left:100%}}@media screen and 
(min-width:1408px){.column.is-narrow-fullhd{flex:none}.column.is-full-fullhd{flex:none;width:100%}.column.is-three-quarters-fullhd{flex:none;width:75%}.column.is-two-thirds-fullhd{flex:none;width:66.6666%}.column.is-half-fullhd{flex:none;width:50%}.column.is-one-third-fullhd{flex:none;width:33.3333%}.column.is-one-quarter-fullhd{flex:none;width:25%}.column.is-one-fifth-fullhd{flex:none;width:20%}.column.is-two-fifths-fullhd{flex:none;width:40%}.column.is-three-fifths-fullhd{flex:none;width:60%}.column.is-four-fifths-fullhd{flex:none;width:80%}.column.is-offset-three-quarters-fullhd{margin-left:75%}.column.is-offset-two-thirds-fullhd{margin-left:66.6666%}.column.is-offset-half-fullhd{margin-left:50%}.column.is-offset-one-third-fullhd{margin-left:33.3333%}.column.is-offset-one-quarter-fullhd{margin-left:25%}.column.is-offset-one-fifth-fullhd{margin-left:20%}.column.is-offset-two-fifths-fullhd{margin-left:40%}.column.is-offset-three-fifths-fullhd{margin-left:60%}.column.is-offset-four-fifths-fullhd{margin-left:80%}.column.is-0-fullhd{flex:none;width:0%}.column.is-offset-0-fullhd{margin-left:0}.column.is-1-fullhd{flex:none;width:8.33333%}.column.is-offset-1-fullhd{margin-left:8.33333%}.column.is-2-fullhd{flex:none;width:16.66667%}.column.is-offset-2-fullhd{margin-left:16.66667%}.column.is-3-fullhd{flex:none;width:25%}.column.is-offset-3-fullhd{margin-left:25%}.column.is-4-fullhd{flex:none;width:33.33333%}.column.is-offset-4-fullhd{margin-left:33.33333%}.column.is-5-fullhd{flex:none;width:41.66667%}.column.is-offset-5-fullhd{margin-left:41.66667%}.column.is-6-fullhd{flex:none;width:50%}.column.is-offset-6-fullhd{margin-left:50%}.column.is-7-fullhd{flex:none;width:58.33333%}.column.is-offset-7-fullhd{margin-left:58.33333%}.column.is-8-fullhd{flex:none;width:66.66667%}.column.is-offset-8-fullhd{margin-left:66.66667%}.column.is-9-fullhd{flex:none;width:75%}.column.is-offset-9-fullhd{margin-left:75%}.column.is-10-fullhd{flex:none;width:83.33333%}.column.is-offset-10-fullhd{margin-left:83.33333%}.column.is-11-fullhd{flex:none;width:91.66667%}.column.is-offset-11-fullhd{margin-left:91.66667%}.column.is-12-fullhd{flex:none;width:100%}.column.is-offset-12-fullhd{margin-left:100%}}.columns{margin-left:-.75rem;margin-right:-.75rem;margin-top:-.75rem}.columns:last-child{margin-bottom:-.75rem}.columns:not(:last-child){margin-bottom:calc(1.5rem - .75rem)}.columns.is-centered{justify-content:center}.columns.is-gapless{margin-left:0;margin-right:0;margin-top:0}.columns.is-gapless>.column{margin:0;padding:0!important}.columns.is-gapless:not(:last-child){margin-bottom:1.5rem}.columns.is-gapless:last-child{margin-bottom:0}.columns.is-mobile{display:flex}.columns.is-multiline{flex-wrap:wrap}.columns.is-vcentered{align-items:center}@media screen and (min-width:769px),print{.columns:not(.is-desktop){display:flex}}@media screen and (min-width:1024px){.columns.is-desktop{display:flex}}.columns.is-variable{--columnGap:0.75rem;margin-left:calc(-1 * var(--columnGap));margin-right:calc(-1 * var(--columnGap))}.columns.is-variable .column{padding-left:var(--columnGap);padding-right:var(--columnGap)}.columns.is-variable.is-0{--columnGap:0rem}@media screen and (max-width:768px){.columns.is-variable.is-0-mobile{--columnGap:0rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-0-tablet{--columnGap:0rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-0-tablet-only{--columnGap:0rem}}@media screen and (max-width:1023px){.columns.is-variable.is-0-touch{--columnGap:0rem}}@media 
screen and (min-width:1024px){.columns.is-variable.is-0-desktop{--columnGap:0rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-0-desktop-only{--columnGap:0rem}}@media screen and (min-width:1216px){.columns.is-variable.is-0-widescreen{--columnGap:0rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-0-widescreen-only{--columnGap:0rem}}@media screen and (min-width:1408px){.columns.is-variable.is-0-fullhd{--columnGap:0rem}}.columns.is-variable.is-1{--columnGap:0.25rem}@media screen and (max-width:768px){.columns.is-variable.is-1-mobile{--columnGap:0.25rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-1-tablet{--columnGap:0.25rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-1-tablet-only{--columnGap:0.25rem}}@media screen and (max-width:1023px){.columns.is-variable.is-1-touch{--columnGap:0.25rem}}@media screen and (min-width:1024px){.columns.is-variable.is-1-desktop{--columnGap:0.25rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-1-desktop-only{--columnGap:0.25rem}}@media screen and (min-width:1216px){.columns.is-variable.is-1-widescreen{--columnGap:0.25rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-1-widescreen-only{--columnGap:0.25rem}}@media screen and (min-width:1408px){.columns.is-variable.is-1-fullhd{--columnGap:0.25rem}}.columns.is-variable.is-2{--columnGap:0.5rem}@media screen and (max-width:768px){.columns.is-variable.is-2-mobile{--columnGap:0.5rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-2-tablet{--columnGap:0.5rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-2-tablet-only{--columnGap:0.5rem}}@media screen and (max-width:1023px){.columns.is-variable.is-2-touch{--columnGap:0.5rem}}@media screen and (min-width:1024px){.columns.is-variable.is-2-desktop{--columnGap:0.5rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-2-desktop-only{--columnGap:0.5rem}}@media screen and (min-width:1216px){.columns.is-variable.is-2-widescreen{--columnGap:0.5rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-2-widescreen-only{--columnGap:0.5rem}}@media screen and (min-width:1408px){.columns.is-variable.is-2-fullhd{--columnGap:0.5rem}}.columns.is-variable.is-3{--columnGap:0.75rem}@media screen and (max-width:768px){.columns.is-variable.is-3-mobile{--columnGap:0.75rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-3-tablet{--columnGap:0.75rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-3-tablet-only{--columnGap:0.75rem}}@media screen and (max-width:1023px){.columns.is-variable.is-3-touch{--columnGap:0.75rem}}@media screen and (min-width:1024px){.columns.is-variable.is-3-desktop{--columnGap:0.75rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-3-desktop-only{--columnGap:0.75rem}}@media screen and (min-width:1216px){.columns.is-variable.is-3-widescreen{--columnGap:0.75rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-3-widescreen-only{--columnGap:0.75rem}}@media screen and (min-width:1408px){.columns.is-variable.is-3-fullhd{--columnGap:0.75rem}}.columns.is-variable.is-4{--columnGap:1rem}@media screen and (max-width:768px){.columns.is-variable.is-4-mobile{--columnGap:1rem}}@media screen and 
(min-width:769px),print{.columns.is-variable.is-4-tablet{--columnGap:1rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-4-tablet-only{--columnGap:1rem}}@media screen and (max-width:1023px){.columns.is-variable.is-4-touch{--columnGap:1rem}}@media screen and (min-width:1024px){.columns.is-variable.is-4-desktop{--columnGap:1rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-4-desktop-only{--columnGap:1rem}}@media screen and (min-width:1216px){.columns.is-variable.is-4-widescreen{--columnGap:1rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-4-widescreen-only{--columnGap:1rem}}@media screen and (min-width:1408px){.columns.is-variable.is-4-fullhd{--columnGap:1rem}}.columns.is-variable.is-5{--columnGap:1.25rem}@media screen and (max-width:768px){.columns.is-variable.is-5-mobile{--columnGap:1.25rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-5-tablet{--columnGap:1.25rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-5-tablet-only{--columnGap:1.25rem}}@media screen and (max-width:1023px){.columns.is-variable.is-5-touch{--columnGap:1.25rem}}@media screen and (min-width:1024px){.columns.is-variable.is-5-desktop{--columnGap:1.25rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-5-desktop-only{--columnGap:1.25rem}}@media screen and (min-width:1216px){.columns.is-variable.is-5-widescreen{--columnGap:1.25rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-5-widescreen-only{--columnGap:1.25rem}}@media screen and (min-width:1408px){.columns.is-variable.is-5-fullhd{--columnGap:1.25rem}}.columns.is-variable.is-6{--columnGap:1.5rem}@media screen and (max-width:768px){.columns.is-variable.is-6-mobile{--columnGap:1.5rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-6-tablet{--columnGap:1.5rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-6-tablet-only{--columnGap:1.5rem}}@media screen and (max-width:1023px){.columns.is-variable.is-6-touch{--columnGap:1.5rem}}@media screen and (min-width:1024px){.columns.is-variable.is-6-desktop{--columnGap:1.5rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-6-desktop-only{--columnGap:1.5rem}}@media screen and (min-width:1216px){.columns.is-variable.is-6-widescreen{--columnGap:1.5rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-6-widescreen-only{--columnGap:1.5rem}}@media screen and (min-width:1408px){.columns.is-variable.is-6-fullhd{--columnGap:1.5rem}}.columns.is-variable.is-7{--columnGap:1.75rem}@media screen and (max-width:768px){.columns.is-variable.is-7-mobile{--columnGap:1.75rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-7-tablet{--columnGap:1.75rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-7-tablet-only{--columnGap:1.75rem}}@media screen and (max-width:1023px){.columns.is-variable.is-7-touch{--columnGap:1.75rem}}@media screen and (min-width:1024px){.columns.is-variable.is-7-desktop{--columnGap:1.75rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-7-desktop-only{--columnGap:1.75rem}}@media screen and (min-width:1216px){.columns.is-variable.is-7-widescreen{--columnGap:1.75rem}}@media screen and (min-width:1216px) and 
(max-width:1407px){.columns.is-variable.is-7-widescreen-only{--columnGap:1.75rem}}@media screen and (min-width:1408px){.columns.is-variable.is-7-fullhd{--columnGap:1.75rem}}.columns.is-variable.is-8{--columnGap:2rem}@media screen and (max-width:768px){.columns.is-variable.is-8-mobile{--columnGap:2rem}}@media screen and (min-width:769px),print{.columns.is-variable.is-8-tablet{--columnGap:2rem}}@media screen and (min-width:769px) and (max-width:1023px){.columns.is-variable.is-8-tablet-only{--columnGap:2rem}}@media screen and (max-width:1023px){.columns.is-variable.is-8-touch{--columnGap:2rem}}@media screen and (min-width:1024px){.columns.is-variable.is-8-desktop{--columnGap:2rem}}@media screen and (min-width:1024px) and (max-width:1215px){.columns.is-variable.is-8-desktop-only{--columnGap:2rem}}@media screen and (min-width:1216px){.columns.is-variable.is-8-widescreen{--columnGap:2rem}}@media screen and (min-width:1216px) and (max-width:1407px){.columns.is-variable.is-8-widescreen-only{--columnGap:2rem}}@media screen and (min-width:1408px){.columns.is-variable.is-8-fullhd{--columnGap:2rem}}.tile{align-items:stretch;display:block;flex-basis:0;flex-grow:1;flex-shrink:1;min-height:-webkit-min-content;min-height:-moz-min-content;min-height:min-content}.tile.is-ancestor{margin-left:-.75rem;margin-right:-.75rem;margin-top:-.75rem}.tile.is-ancestor:last-child{margin-bottom:-.75rem}.tile.is-ancestor:not(:last-child){margin-bottom:.75rem}.tile.is-child{margin:0!important}.tile.is-parent{padding:.75rem}.tile.is-vertical{flex-direction:column}.tile.is-vertical>.tile.is-child:not(:last-child){margin-bottom:1.5rem!important}@media screen and (min-width:769px),print{.tile:not(.is-child){display:flex}.tile.is-1{flex:none;width:8.33333%}.tile.is-2{flex:none;width:16.66667%}.tile.is-3{flex:none;width:25%}.tile.is-4{flex:none;width:33.33333%}.tile.is-5{flex:none;width:41.66667%}.tile.is-6{flex:none;width:50%}.tile.is-7{flex:none;width:58.33333%}.tile.is-8{flex:none;width:66.66667%}.tile.is-9{flex:none;width:75%}.tile.is-10{flex:none;width:83.33333%}.tile.is-11{flex:none;width:91.66667%}.tile.is-12{flex:none;width:100%}}.has-text-white{color:#fff!important}a.has-text-white:focus,a.has-text-white:hover{color:#e6e6e6!important}.has-background-white{background-color:#fff!important}.has-text-black{color:#0a0a0a!important}a.has-text-black:focus,a.has-text-black:hover{color:#000!important}.has-background-black{background-color:#0a0a0a!important}.has-text-light{color:#f5f5f5!important}a.has-text-light:focus,a.has-text-light:hover{color:#dbdbdb!important}.has-background-light{background-color:#f5f5f5!important}.has-text-dark{color:#363636!important}a.has-text-dark:focus,a.has-text-dark:hover{color:#1c1c1c!important}.has-background-dark{background-color:#363636!important}.has-text-primary{color:#00d1b2!important}a.has-text-primary:focus,a.has-text-primary:hover{color:#009e86!important}.has-background-primary{background-color:#00d1b2!important}.has-text-primary-light{color:#ebfffc!important}a.has-text-primary-light:focus,a.has-text-primary-light:hover{color:#b8fff4!important}.has-background-primary-light{background-color:#ebfffc!important}.has-text-primary-dark{color:#00947e!important}a.has-text-primary-dark:focus,a.has-text-primary-dark:hover{color:#00c7a9!important}.has-background-primary-dark{background-color:#00947e!important}.has-text-link{color:#3273dc!important}a.has-text-link:focus,a.has-text-link:hover{color:#205bbc!important}.has-background-link{background-color:#3273dc!important}.has-text-link-light{color:#eef3
fc!important}a.has-text-link-light:focus,a.has-text-link-light:hover{color:#c2d5f5!important}.has-background-link-light{background-color:#eef3fc!important}.has-text-link-dark{color:#2160c4!important}a.has-text-link-dark:focus,a.has-text-link-dark:hover{color:#3b79de!important}.has-background-link-dark{background-color:#2160c4!important}.has-text-info{color:#3298dc!important}a.has-text-info:focus,a.has-text-info:hover{color:#207dbc!important}.has-background-info{background-color:#3298dc!important}.has-text-info-light{color:#eef6fc!important}a.has-text-info-light:focus,a.has-text-info-light:hover{color:#c2e0f5!important}.has-background-info-light{background-color:#eef6fc!important}.has-text-info-dark{color:#1d72aa!important}a.has-text-info-dark:focus,a.has-text-info-dark:hover{color:#248fd6!important}.has-background-info-dark{background-color:#1d72aa!important}.has-text-success{color:#48c774!important}a.has-text-success:focus,a.has-text-success:hover{color:#34a85c!important}.has-background-success{background-color:#48c774!important}.has-text-success-light{color:#effaf3!important}a.has-text-success-light:focus,a.has-text-success-light:hover{color:#c8eed6!important}.has-background-success-light{background-color:#effaf3!important}.has-text-success-dark{color:#257942!important}a.has-text-success-dark:focus,a.has-text-success-dark:hover{color:#31a058!important}.has-background-success-dark{background-color:#257942!important}.has-text-warning{color:#ffdd57!important}a.has-text-warning:focus,a.has-text-warning:hover{color:#ffd324!important}.has-background-warning{background-color:#ffdd57!important}.has-text-warning-light{color:#fffbeb!important}a.has-text-warning-light:focus,a.has-text-warning-light:hover{color:#fff1b8!important}.has-background-warning-light{background-color:#fffbeb!important}.has-text-warning-dark{color:#947600!important}a.has-text-warning-dark:focus,a.has-text-warning-dark:hover{color:#c79f00!important}.has-background-warning-dark{background-color:#947600!important}.has-text-danger{color:#f14668!important}a.has-text-danger:focus,a.has-text-danger:hover{color:#ee1742!important}.has-background-danger{background-color:#f14668!important}.has-text-danger-light{color:#feecf0!important}a.has-text-danger-light:focus,a.has-text-danger-light:hover{color:#fabdc9!important}.has-background-danger-light{background-color:#feecf0!important}.has-text-danger-dark{color:#cc0f35!important}a.has-text-danger-dark:focus,a.has-text-danger-dark:hover{color:#ee2049!important}.has-background-danger-dark{background-color:#cc0f35!important}.has-text-black-bis{color:#121212!important}.has-background-black-bis{background-color:#121212!important}.has-text-black-ter{color:#242424!important}.has-background-black-ter{background-color:#242424!important}.has-text-grey-darker{color:#363636!important}.has-background-grey-darker{background-color:#363636!important}.has-text-grey-dark{color:#4a4a4a!important}.has-background-grey-dark{background-color:#4a4a4a!important}.has-text-grey{color:#7a7a7a!important}.has-background-grey{background-color:#7a7a7a!important}.has-text-grey-light{color:#b5b5b5!important}.has-background-grey-light{background-color:#b5b5b5!important}.has-text-grey-lighter{color:#dbdbdb!important}.has-background-grey-lighter{background-color:#dbdbdb!important}.has-text-white-ter{color:#f5f5f5!important}.has-background-white-ter{background-color:#f5f5f5!important}.has-text-white-bis{color:#fafafa!important}.has-background-white-bis{background-color:#fafafa!important}.is-clearfix::after{clear:both;content:" 
";display:table}.is-pulled-left{float:left!important}.is-pulled-right{float:right!important}.is-radiusless{border-radius:0!important}.is-shadowless{box-shadow:none!important}.is-clipped{overflow:hidden!important}.is-relative{position:relative!important}.is-marginless{margin:0!important}.is-paddingless{padding:0!important}.mt-0{margin-top:0!important}.mr-0{margin-right:0!important}.mb-0{margin-bottom:0!important}.ml-0{margin-left:0!important}.mx-0{margin-left:0!important;margin-right:0!important}.my-0{margin-top:0!important;margin-bottom:0!important}.mt-1{margin-top:.25rem!important}.mr-1{margin-right:.25rem!important}.mb-1{margin-bottom:.25rem!important}.ml-1{margin-left:.25rem!important}.mx-1{margin-left:.25rem!important;margin-right:.25rem!important}.my-1{margin-top:.25rem!important;margin-bottom:.25rem!important}.mt-2{margin-top:.5rem!important}.mr-2{margin-right:.5rem!important}.mb-2{margin-bottom:.5rem!important}.ml-2{margin-left:.5rem!important}.mx-2{margin-left:.5rem!important;margin-right:.5rem!important}.my-2{margin-top:.5rem!important;margin-bottom:.5rem!important}.mt-3{margin-top:.75rem!important}.mr-3{margin-right:.75rem!important}.mb-3{margin-bottom:.75rem!important}.ml-3{margin-left:.75rem!important}.mx-3{margin-left:.75rem!important;margin-right:.75rem!important}.my-3{margin-top:.75rem!important;margin-bottom:.75rem!important}.mt-4{margin-top:1rem!important}.mr-4{margin-right:1rem!important}.mb-4{margin-bottom:1rem!important}.ml-4{margin-left:1rem!important}.mx-4{margin-left:1rem!important;margin-right:1rem!important}.my-4{margin-top:1rem!important;margin-bottom:1rem!important}.mt-5{margin-top:1.5rem!important}.mr-5{margin-right:1.5rem!important}.mb-5{margin-bottom:1.5rem!important}.ml-5{margin-left:1.5rem!important}.mx-5{margin-left:1.5rem!important;margin-right:1.5rem!important}.my-5{margin-top:1.5rem!important;margin-bottom:1.5rem!important}.mt-6{margin-top:3rem!important}.mr-6{margin-right:3rem!important}.mb-6{margin-bottom:3rem!important}.ml-6{margin-left:3rem!important}.mx-6{margin-left:3rem!important;margin-right:3rem!important}.my-6{margin-top:3rem!important;margin-bottom:3rem!important}.pt-0{padding-top:0!important}.pr-0{padding-right:0!important}.pb-0{padding-bottom:0!important}.pl-0{padding-left:0!important}.px-0{padding-left:0!important;padding-right:0!important}.py-0{padding-top:0!important;padding-bottom:0!important}.pt-1{padding-top:.25rem!important}.pr-1{padding-right:.25rem!important}.pb-1{padding-bottom:.25rem!important}.pl-1{padding-left:.25rem!important}.px-1{padding-left:.25rem!important;padding-right:.25rem!important}.py-1{padding-top:.25rem!important;padding-bottom:.25rem!important}.pt-2{padding-top:.5rem!important}.pr-2{padding-right:.5rem!important}.pb-2{padding-bottom:.5rem!important}.pl-2{padding-left:.5rem!important}.px-2{padding-left:.5rem!important;padding-right:.5rem!important}.py-2{padding-top:.5rem!important;padding-bottom:.5rem!important}.pt-3{padding-top:.75rem!important}.pr-3{padding-right:.75rem!important}.pb-3{padding-bottom:.75rem!important}.pl-3{padding-left:.75rem!important}.px-3{padding-left:.75rem!important;padding-right:.75rem!important}.py-3{padding-top:.75rem!important;padding-bottom:.75rem!important}.pt-4{padding-top:1rem!important}.pr-4{padding-right:1rem!important}.pb-4{padding-bottom:1rem!important}.pl-4{padding-left:1rem!important}.px-4{padding-left:1rem!important;padding-right:1rem!important}.py-4{padding-top:1rem!important;padding-bottom:1rem!important}.pt-5{padding-top:1.5rem!important}.pr-5{padding-right:1.5rem!important}
.pb-5{padding-bottom:1.5rem!important}.pl-5{padding-left:1.5rem!important}.px-5{padding-left:1.5rem!important;padding-right:1.5rem!important}.py-5{padding-top:1.5rem!important;padding-bottom:1.5rem!important}.pt-6{padding-top:3rem!important}.pr-6{padding-right:3rem!important}.pb-6{padding-bottom:3rem!important}.pl-6{padding-left:3rem!important}.px-6{padding-left:3rem!important;padding-right:3rem!important}.py-6{padding-top:3rem!important;padding-bottom:3rem!important}.is-size-1{font-size:3rem!important}.is-size-2{font-size:2.5rem!important}.is-size-3{font-size:2rem!important}.is-size-4{font-size:1.5rem!important}.is-size-5{font-size:1.25rem!important}.is-size-6{font-size:1rem!important}.is-size-7{font-size:.75rem!important}@media screen and (max-width:768px){.is-size-1-mobile{font-size:3rem!important}.is-size-2-mobile{font-size:2.5rem!important}.is-size-3-mobile{font-size:2rem!important}.is-size-4-mobile{font-size:1.5rem!important}.is-size-5-mobile{font-size:1.25rem!important}.is-size-6-mobile{font-size:1rem!important}.is-size-7-mobile{font-size:.75rem!important}}@media screen and (min-width:769px),print{.is-size-1-tablet{font-size:3rem!important}.is-size-2-tablet{font-size:2.5rem!important}.is-size-3-tablet{font-size:2rem!important}.is-size-4-tablet{font-size:1.5rem!important}.is-size-5-tablet{font-size:1.25rem!important}.is-size-6-tablet{font-size:1rem!important}.is-size-7-tablet{font-size:.75rem!important}}@media screen and (max-width:1023px){.is-size-1-touch{font-size:3rem!important}.is-size-2-touch{font-size:2.5rem!important}.is-size-3-touch{font-size:2rem!important}.is-size-4-touch{font-size:1.5rem!important}.is-size-5-touch{font-size:1.25rem!important}.is-size-6-touch{font-size:1rem!important}.is-size-7-touch{font-size:.75rem!important}}@media screen and (min-width:1024px){.is-size-1-desktop{font-size:3rem!important}.is-size-2-desktop{font-size:2.5rem!important}.is-size-3-desktop{font-size:2rem!important}.is-size-4-desktop{font-size:1.5rem!important}.is-size-5-desktop{font-size:1.25rem!important}.is-size-6-desktop{font-size:1rem!important}.is-size-7-desktop{font-size:.75rem!important}}@media screen and (min-width:1216px){.is-size-1-widescreen{font-size:3rem!important}.is-size-2-widescreen{font-size:2.5rem!important}.is-size-3-widescreen{font-size:2rem!important}.is-size-4-widescreen{font-size:1.5rem!important}.is-size-5-widescreen{font-size:1.25rem!important}.is-size-6-widescreen{font-size:1rem!important}.is-size-7-widescreen{font-size:.75rem!important}}@media screen and (min-width:1408px){.is-size-1-fullhd{font-size:3rem!important}.is-size-2-fullhd{font-size:2.5rem!important}.is-size-3-fullhd{font-size:2rem!important}.is-size-4-fullhd{font-size:1.5rem!important}.is-size-5-fullhd{font-size:1.25rem!important}.is-size-6-fullhd{font-size:1rem!important}.is-size-7-fullhd{font-size:.75rem!important}}.has-text-centered{text-align:center!important}.has-text-justified{text-align:justify!important}.has-text-left{text-align:left!important}.has-text-right{text-align:right!important}@media screen and (max-width:768px){.has-text-centered-mobile{text-align:center!important}}@media screen and (min-width:769px),print{.has-text-centered-tablet{text-align:center!important}}@media screen and (min-width:769px) and (max-width:1023px){.has-text-centered-tablet-only{text-align:center!important}}@media screen and (max-width:1023px){.has-text-centered-touch{text-align:center!important}}@media screen and (min-width:1024px){.has-text-centered-desktop{text-align:center!important}}@media screen and 
(min-width:1024px) and (max-width:1215px){.has-text-centered-desktop-only{text-align:center!important}}@media screen and (min-width:1216px){.has-text-centered-widescreen{text-align:center!important}}@media screen and (min-width:1216px) and (max-width:1407px){.has-text-centered-widescreen-only{text-align:center!important}}@media screen and (min-width:1408px){.has-text-centered-fullhd{text-align:center!important}}@media screen and (max-width:768px){.has-text-justified-mobile{text-align:justify!important}}@media screen and (min-width:769px),print{.has-text-justified-tablet{text-align:justify!important}}@media screen and (min-width:769px) and (max-width:1023px){.has-text-justified-tablet-only{text-align:justify!important}}@media screen and (max-width:1023px){.has-text-justified-touch{text-align:justify!important}}@media screen and (min-width:1024px){.has-text-justified-desktop{text-align:justify!important}}@media screen and (min-width:1024px) and (max-width:1215px){.has-text-justified-desktop-only{text-align:justify!important}}@media screen and (min-width:1216px){.has-text-justified-widescreen{text-align:justify!important}}@media screen and (min-width:1216px) and (max-width:1407px){.has-text-justified-widescreen-only{text-align:justify!important}}@media screen and (min-width:1408px){.has-text-justified-fullhd{text-align:justify!important}}@media screen and (max-width:768px){.has-text-left-mobile{text-align:left!important}}@media screen and (min-width:769px),print{.has-text-left-tablet{text-align:left!important}}@media screen and (min-width:769px) and (max-width:1023px){.has-text-left-tablet-only{text-align:left!important}}@media screen and (max-width:1023px){.has-text-left-touch{text-align:left!important}}@media screen and (min-width:1024px){.has-text-left-desktop{text-align:left!important}}@media screen and (min-width:1024px) and (max-width:1215px){.has-text-left-desktop-only{text-align:left!important}}@media screen and (min-width:1216px){.has-text-left-widescreen{text-align:left!important}}@media screen and (min-width:1216px) and (max-width:1407px){.has-text-left-widescreen-only{text-align:left!important}}@media screen and (min-width:1408px){.has-text-left-fullhd{text-align:left!important}}@media screen and (max-width:768px){.has-text-right-mobile{text-align:right!important}}@media screen and (min-width:769px),print{.has-text-right-tablet{text-align:right!important}}@media screen and (min-width:769px) and (max-width:1023px){.has-text-right-tablet-only{text-align:right!important}}@media screen and (max-width:1023px){.has-text-right-touch{text-align:right!important}}@media screen and (min-width:1024px){.has-text-right-desktop{text-align:right!important}}@media screen and (min-width:1024px) and (max-width:1215px){.has-text-right-desktop-only{text-align:right!important}}@media screen and (min-width:1216px){.has-text-right-widescreen{text-align:right!important}}@media screen and (min-width:1216px) and (max-width:1407px){.has-text-right-widescreen-only{text-align:right!important}}@media screen and 
(min-width:1408px){.has-text-right-fullhd{text-align:right!important}}.is-capitalized{text-transform:capitalize!important}.is-lowercase{text-transform:lowercase!important}.is-uppercase{text-transform:uppercase!important}.is-italic{font-style:italic!important}.has-text-weight-light{font-weight:300!important}.has-text-weight-normal{font-weight:400!important}.has-text-weight-medium{font-weight:500!important}.has-text-weight-semibold{font-weight:600!important}.has-text-weight-bold{font-weight:700!important}.is-family-primary{font-family:BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif!important}.is-family-secondary{font-family:BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif!important}.is-family-sans-serif{font-family:BlinkMacSystemFont,-apple-system,"Segoe UI",Roboto,Oxygen,Ubuntu,Cantarell,"Fira Sans","Droid Sans","Helvetica Neue",Helvetica,Arial,sans-serif!important}.is-family-monospace{font-family:monospace!important}.is-family-code{font-family:monospace!important}.is-block{display:block!important}@media screen and (max-width:768px){.is-block-mobile{display:block!important}}@media screen and (min-width:769px),print{.is-block-tablet{display:block!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-block-tablet-only{display:block!important}}@media screen and (max-width:1023px){.is-block-touch{display:block!important}}@media screen and (min-width:1024px){.is-block-desktop{display:block!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-block-desktop-only{display:block!important}}@media screen and (min-width:1216px){.is-block-widescreen{display:block!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-block-widescreen-only{display:block!important}}@media screen and (min-width:1408px){.is-block-fullhd{display:block!important}}.is-flex{display:flex!important}@media screen and (max-width:768px){.is-flex-mobile{display:flex!important}}@media screen and (min-width:769px),print{.is-flex-tablet{display:flex!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-flex-tablet-only{display:flex!important}}@media screen and (max-width:1023px){.is-flex-touch{display:flex!important}}@media screen and (min-width:1024px){.is-flex-desktop{display:flex!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-flex-desktop-only{display:flex!important}}@media screen and (min-width:1216px){.is-flex-widescreen{display:flex!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-flex-widescreen-only{display:flex!important}}@media screen and (min-width:1408px){.is-flex-fullhd{display:flex!important}}.is-inline{display:inline!important}@media screen and (max-width:768px){.is-inline-mobile{display:inline!important}}@media screen and (min-width:769px),print{.is-inline-tablet{display:inline!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-inline-tablet-only{display:inline!important}}@media screen and (max-width:1023px){.is-inline-touch{display:inline!important}}@media screen and (min-width:1024px){.is-inline-desktop{display:inline!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-inline-desktop-only{display:inline!important}}@media screen and (min-width:1216px){.is-inline-widescreen{display:inline!important}}@media screen and (min-width:1216px) and 
(max-width:1407px){.is-inline-widescreen-only{display:inline!important}}@media screen and (min-width:1408px){.is-inline-fullhd{display:inline!important}}.is-inline-block{display:inline-block!important}@media screen and (max-width:768px){.is-inline-block-mobile{display:inline-block!important}}@media screen and (min-width:769px),print{.is-inline-block-tablet{display:inline-block!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-inline-block-tablet-only{display:inline-block!important}}@media screen and (max-width:1023px){.is-inline-block-touch{display:inline-block!important}}@media screen and (min-width:1024px){.is-inline-block-desktop{display:inline-block!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-inline-block-desktop-only{display:inline-block!important}}@media screen and (min-width:1216px){.is-inline-block-widescreen{display:inline-block!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-inline-block-widescreen-only{display:inline-block!important}}@media screen and (min-width:1408px){.is-inline-block-fullhd{display:inline-block!important}}.is-inline-flex{display:inline-flex!important}@media screen and (max-width:768px){.is-inline-flex-mobile{display:inline-flex!important}}@media screen and (min-width:769px),print{.is-inline-flex-tablet{display:inline-flex!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-inline-flex-tablet-only{display:inline-flex!important}}@media screen and (max-width:1023px){.is-inline-flex-touch{display:inline-flex!important}}@media screen and (min-width:1024px){.is-inline-flex-desktop{display:inline-flex!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-inline-flex-desktop-only{display:inline-flex!important}}@media screen and (min-width:1216px){.is-inline-flex-widescreen{display:inline-flex!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-inline-flex-widescreen-only{display:inline-flex!important}}@media screen and (min-width:1408px){.is-inline-flex-fullhd{display:inline-flex!important}}.is-hidden{display:none!important}.is-sr-only{border:none!important;clip:rect(0,0,0,0)!important;height:.01em!important;overflow:hidden!important;padding:0!important;position:absolute!important;white-space:nowrap!important;width:.01em!important}@media screen and (max-width:768px){.is-hidden-mobile{display:none!important}}@media screen and (min-width:769px),print{.is-hidden-tablet{display:none!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-hidden-tablet-only{display:none!important}}@media screen and (max-width:1023px){.is-hidden-touch{display:none!important}}@media screen and (min-width:1024px){.is-hidden-desktop{display:none!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-hidden-desktop-only{display:none!important}}@media screen and (min-width:1216px){.is-hidden-widescreen{display:none!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-hidden-widescreen-only{display:none!important}}@media screen and (min-width:1408px){.is-hidden-fullhd{display:none!important}}.is-invisible{visibility:hidden!important}@media screen and (max-width:768px){.is-invisible-mobile{visibility:hidden!important}}@media screen and (min-width:769px),print{.is-invisible-tablet{visibility:hidden!important}}@media screen and (min-width:769px) and (max-width:1023px){.is-invisible-tablet-only{visibility:hidden!important}}@media screen and 
(max-width:1023px){.is-invisible-touch{visibility:hidden!important}}@media screen and (min-width:1024px){.is-invisible-desktop{visibility:hidden!important}}@media screen and (min-width:1024px) and (max-width:1215px){.is-invisible-desktop-only{visibility:hidden!important}}@media screen and (min-width:1216px){.is-invisible-widescreen{visibility:hidden!important}}@media screen and (min-width:1216px) and (max-width:1407px){.is-invisible-widescreen-only{visibility:hidden!important}}@media screen and (min-width:1408px){.is-invisible-fullhd{visibility:hidden!important}}.hero{align-items:stretch;display:flex;flex-direction:column;justify-content:space-between}.hero .navbar{background:0 0}.hero .tabs ul{border-bottom:none}.hero.is-white{background-color:#fff;color:#0a0a0a}.hero.is-white a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-white strong{color:inherit}.hero.is-white .title{color:#0a0a0a}.hero.is-white .subtitle{color:rgba(10,10,10,.9)}.hero.is-white .subtitle a:not(.button),.hero.is-white .subtitle strong{color:#0a0a0a}@media screen and (max-width:1023px){.hero.is-white .navbar-menu{background-color:#fff}}.hero.is-white .navbar-item,.hero.is-white .navbar-link{color:rgba(10,10,10,.7)}.hero.is-white .navbar-link.is-active,.hero.is-white .navbar-link:hover,.hero.is-white a.navbar-item.is-active,.hero.is-white a.navbar-item:hover{background-color:#f2f2f2;color:#0a0a0a}.hero.is-white .tabs a{color:#0a0a0a;opacity:.9}.hero.is-white .tabs a:hover{opacity:1}.hero.is-white .tabs li.is-active a{opacity:1}.hero.is-white .tabs.is-boxed a,.hero.is-white .tabs.is-toggle a{color:#0a0a0a}.hero.is-white .tabs.is-boxed a:hover,.hero.is-white .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-white .tabs.is-boxed li.is-active a,.hero.is-white .tabs.is-boxed li.is-active a:hover,.hero.is-white .tabs.is-toggle li.is-active a,.hero.is-white .tabs.is-toggle li.is-active a:hover{background-color:#0a0a0a;border-color:#0a0a0a;color:#fff}.hero.is-white.is-bold{background-image:linear-gradient(141deg,#e6e6e6 0,#fff 71%,#fff 100%)}@media screen and (max-width:768px){.hero.is-white.is-bold .navbar-menu{background-image:linear-gradient(141deg,#e6e6e6 0,#fff 71%,#fff 100%)}}.hero.is-black{background-color:#0a0a0a;color:#fff}.hero.is-black a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-black strong{color:inherit}.hero.is-black .title{color:#fff}.hero.is-black .subtitle{color:rgba(255,255,255,.9)}.hero.is-black .subtitle a:not(.button),.hero.is-black .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-black .navbar-menu{background-color:#0a0a0a}}.hero.is-black .navbar-item,.hero.is-black .navbar-link{color:rgba(255,255,255,.7)}.hero.is-black .navbar-link.is-active,.hero.is-black .navbar-link:hover,.hero.is-black a.navbar-item.is-active,.hero.is-black a.navbar-item:hover{background-color:#000;color:#fff}.hero.is-black .tabs a{color:#fff;opacity:.9}.hero.is-black .tabs a:hover{opacity:1}.hero.is-black .tabs li.is-active a{opacity:1}.hero.is-black .tabs.is-boxed a,.hero.is-black .tabs.is-toggle a{color:#fff}.hero.is-black .tabs.is-boxed a:hover,.hero.is-black .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-black .tabs.is-boxed li.is-active a,.hero.is-black .tabs.is-boxed li.is-active a:hover,.hero.is-black .tabs.is-toggle li.is-active a,.hero.is-black .tabs.is-toggle li.is-active 
a:hover{background-color:#fff;border-color:#fff;color:#0a0a0a}.hero.is-black.is-bold{background-image:linear-gradient(141deg,#000 0,#0a0a0a 71%,#181616 100%)}@media screen and (max-width:768px){.hero.is-black.is-bold .navbar-menu{background-image:linear-gradient(141deg,#000 0,#0a0a0a 71%,#181616 100%)}}.hero.is-light{background-color:#f5f5f5;color:rgba(0,0,0,.7)}.hero.is-light a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-light strong{color:inherit}.hero.is-light .title{color:rgba(0,0,0,.7)}.hero.is-light .subtitle{color:rgba(0,0,0,.9)}.hero.is-light .subtitle a:not(.button),.hero.is-light .subtitle strong{color:rgba(0,0,0,.7)}@media screen and (max-width:1023px){.hero.is-light .navbar-menu{background-color:#f5f5f5}}.hero.is-light .navbar-item,.hero.is-light .navbar-link{color:rgba(0,0,0,.7)}.hero.is-light .navbar-link.is-active,.hero.is-light .navbar-link:hover,.hero.is-light a.navbar-item.is-active,.hero.is-light a.navbar-item:hover{background-color:#e8e8e8;color:rgba(0,0,0,.7)}.hero.is-light .tabs a{color:rgba(0,0,0,.7);opacity:.9}.hero.is-light .tabs a:hover{opacity:1}.hero.is-light .tabs li.is-active a{opacity:1}.hero.is-light .tabs.is-boxed a,.hero.is-light .tabs.is-toggle a{color:rgba(0,0,0,.7)}.hero.is-light .tabs.is-boxed a:hover,.hero.is-light .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-light .tabs.is-boxed li.is-active a,.hero.is-light .tabs.is-boxed li.is-active a:hover,.hero.is-light .tabs.is-toggle li.is-active a,.hero.is-light .tabs.is-toggle li.is-active a:hover{background-color:rgba(0,0,0,.7);border-color:rgba(0,0,0,.7);color:#f5f5f5}.hero.is-light.is-bold{background-image:linear-gradient(141deg,#dfd8d9 0,#f5f5f5 71%,#fff 100%)}@media screen and (max-width:768px){.hero.is-light.is-bold .navbar-menu{background-image:linear-gradient(141deg,#dfd8d9 0,#f5f5f5 71%,#fff 100%)}}.hero.is-dark{background-color:#363636;color:#fff}.hero.is-dark a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-dark strong{color:inherit}.hero.is-dark .title{color:#fff}.hero.is-dark .subtitle{color:rgba(255,255,255,.9)}.hero.is-dark .subtitle a:not(.button),.hero.is-dark .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-dark .navbar-menu{background-color:#363636}}.hero.is-dark .navbar-item,.hero.is-dark .navbar-link{color:rgba(255,255,255,.7)}.hero.is-dark .navbar-link.is-active,.hero.is-dark .navbar-link:hover,.hero.is-dark a.navbar-item.is-active,.hero.is-dark a.navbar-item:hover{background-color:#292929;color:#fff}.hero.is-dark .tabs a{color:#fff;opacity:.9}.hero.is-dark .tabs a:hover{opacity:1}.hero.is-dark .tabs li.is-active a{opacity:1}.hero.is-dark .tabs.is-boxed a,.hero.is-dark .tabs.is-toggle a{color:#fff}.hero.is-dark .tabs.is-boxed a:hover,.hero.is-dark .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-dark .tabs.is-boxed li.is-active a,.hero.is-dark .tabs.is-boxed li.is-active a:hover,.hero.is-dark .tabs.is-toggle li.is-active a,.hero.is-dark .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#363636}.hero.is-dark.is-bold{background-image:linear-gradient(141deg,#1f191a 0,#363636 71%,#46403f 100%)}@media screen and (max-width:768px){.hero.is-dark.is-bold .navbar-menu{background-image:linear-gradient(141deg,#1f191a 0,#363636 71%,#46403f 100%)}}.hero.is-primary{background-color:#00d1b2;color:#fff}.hero.is-primary a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-primary 
strong{color:inherit}.hero.is-primary .title{color:#fff}.hero.is-primary .subtitle{color:rgba(255,255,255,.9)}.hero.is-primary .subtitle a:not(.button),.hero.is-primary .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-primary .navbar-menu{background-color:#00d1b2}}.hero.is-primary .navbar-item,.hero.is-primary .navbar-link{color:rgba(255,255,255,.7)}.hero.is-primary .navbar-link.is-active,.hero.is-primary .navbar-link:hover,.hero.is-primary a.navbar-item.is-active,.hero.is-primary a.navbar-item:hover{background-color:#00b89c;color:#fff}.hero.is-primary .tabs a{color:#fff;opacity:.9}.hero.is-primary .tabs a:hover{opacity:1}.hero.is-primary .tabs li.is-active a{opacity:1}.hero.is-primary .tabs.is-boxed a,.hero.is-primary .tabs.is-toggle a{color:#fff}.hero.is-primary .tabs.is-boxed a:hover,.hero.is-primary .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-primary .tabs.is-boxed li.is-active a,.hero.is-primary .tabs.is-boxed li.is-active a:hover,.hero.is-primary .tabs.is-toggle li.is-active a,.hero.is-primary .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#00d1b2}.hero.is-primary.is-bold{background-image:linear-gradient(141deg,#009e6c 0,#00d1b2 71%,#00e7eb 100%)}@media screen and (max-width:768px){.hero.is-primary.is-bold .navbar-menu{background-image:linear-gradient(141deg,#009e6c 0,#00d1b2 71%,#00e7eb 100%)}}.hero.is-link{background-color:#3273dc;color:#fff}.hero.is-link a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-link strong{color:inherit}.hero.is-link .title{color:#fff}.hero.is-link .subtitle{color:rgba(255,255,255,.9)}.hero.is-link .subtitle a:not(.button),.hero.is-link .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-link .navbar-menu{background-color:#3273dc}}.hero.is-link .navbar-item,.hero.is-link .navbar-link{color:rgba(255,255,255,.7)}.hero.is-link .navbar-link.is-active,.hero.is-link .navbar-link:hover,.hero.is-link a.navbar-item.is-active,.hero.is-link a.navbar-item:hover{background-color:#2366d1;color:#fff}.hero.is-link .tabs a{color:#fff;opacity:.9}.hero.is-link .tabs a:hover{opacity:1}.hero.is-link .tabs li.is-active a{opacity:1}.hero.is-link .tabs.is-boxed a,.hero.is-link .tabs.is-toggle a{color:#fff}.hero.is-link .tabs.is-boxed a:hover,.hero.is-link .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-link .tabs.is-boxed li.is-active a,.hero.is-link .tabs.is-boxed li.is-active a:hover,.hero.is-link .tabs.is-toggle li.is-active a,.hero.is-link .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#3273dc}.hero.is-link.is-bold{background-image:linear-gradient(141deg,#1577c6 0,#3273dc 71%,#4366e5 100%)}@media screen and (max-width:768px){.hero.is-link.is-bold .navbar-menu{background-image:linear-gradient(141deg,#1577c6 0,#3273dc 71%,#4366e5 100%)}}.hero.is-info{background-color:#3298dc;color:#fff}.hero.is-info a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-info strong{color:inherit}.hero.is-info .title{color:#fff}.hero.is-info .subtitle{color:rgba(255,255,255,.9)}.hero.is-info .subtitle a:not(.button),.hero.is-info .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-info .navbar-menu{background-color:#3298dc}}.hero.is-info .navbar-item,.hero.is-info .navbar-link{color:rgba(255,255,255,.7)}.hero.is-info .navbar-link.is-active,.hero.is-info .navbar-link:hover,.hero.is-info a.navbar-item.is-active,.hero.is-info 
a.navbar-item:hover{background-color:#238cd1;color:#fff}.hero.is-info .tabs a{color:#fff;opacity:.9}.hero.is-info .tabs a:hover{opacity:1}.hero.is-info .tabs li.is-active a{opacity:1}.hero.is-info .tabs.is-boxed a,.hero.is-info .tabs.is-toggle a{color:#fff}.hero.is-info .tabs.is-boxed a:hover,.hero.is-info .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-info .tabs.is-boxed li.is-active a,.hero.is-info .tabs.is-boxed li.is-active a:hover,.hero.is-info .tabs.is-toggle li.is-active a,.hero.is-info .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#3298dc}.hero.is-info.is-bold{background-image:linear-gradient(141deg,#159dc6 0,#3298dc 71%,#4389e5 100%)}@media screen and (max-width:768px){.hero.is-info.is-bold .navbar-menu{background-image:linear-gradient(141deg,#159dc6 0,#3298dc 71%,#4389e5 100%)}}.hero.is-success{background-color:#48c774;color:#fff}.hero.is-success a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-success strong{color:inherit}.hero.is-success .title{color:#fff}.hero.is-success .subtitle{color:rgba(255,255,255,.9)}.hero.is-success .subtitle a:not(.button),.hero.is-success .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-success .navbar-menu{background-color:#48c774}}.hero.is-success .navbar-item,.hero.is-success .navbar-link{color:rgba(255,255,255,.7)}.hero.is-success .navbar-link.is-active,.hero.is-success .navbar-link:hover,.hero.is-success a.navbar-item.is-active,.hero.is-success a.navbar-item:hover{background-color:#3abb67;color:#fff}.hero.is-success .tabs a{color:#fff;opacity:.9}.hero.is-success .tabs a:hover{opacity:1}.hero.is-success .tabs li.is-active a{opacity:1}.hero.is-success .tabs.is-boxed a,.hero.is-success .tabs.is-toggle a{color:#fff}.hero.is-success .tabs.is-boxed a:hover,.hero.is-success .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-success .tabs.is-boxed li.is-active a,.hero.is-success .tabs.is-boxed li.is-active a:hover,.hero.is-success .tabs.is-toggle li.is-active a,.hero.is-success .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#48c774}.hero.is-success.is-bold{background-image:linear-gradient(141deg,#29b342 0,#48c774 71%,#56d296 100%)}@media screen and (max-width:768px){.hero.is-success.is-bold .navbar-menu{background-image:linear-gradient(141deg,#29b342 0,#48c774 71%,#56d296 100%)}}.hero.is-warning{background-color:#ffdd57;color:rgba(0,0,0,.7)}.hero.is-warning a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-warning strong{color:inherit}.hero.is-warning .title{color:rgba(0,0,0,.7)}.hero.is-warning .subtitle{color:rgba(0,0,0,.9)}.hero.is-warning .subtitle a:not(.button),.hero.is-warning .subtitle strong{color:rgba(0,0,0,.7)}@media screen and (max-width:1023px){.hero.is-warning .navbar-menu{background-color:#ffdd57}}.hero.is-warning .navbar-item,.hero.is-warning .navbar-link{color:rgba(0,0,0,.7)}.hero.is-warning .navbar-link.is-active,.hero.is-warning .navbar-link:hover,.hero.is-warning a.navbar-item.is-active,.hero.is-warning a.navbar-item:hover{background-color:#ffd83d;color:rgba(0,0,0,.7)}.hero.is-warning .tabs a{color:rgba(0,0,0,.7);opacity:.9}.hero.is-warning .tabs a:hover{opacity:1}.hero.is-warning .tabs li.is-active a{opacity:1}.hero.is-warning .tabs.is-boxed a,.hero.is-warning .tabs.is-toggle a{color:rgba(0,0,0,.7)}.hero.is-warning .tabs.is-boxed a:hover,.hero.is-warning .tabs.is-toggle 
a:hover{background-color:rgba(10,10,10,.1)}.hero.is-warning .tabs.is-boxed li.is-active a,.hero.is-warning .tabs.is-boxed li.is-active a:hover,.hero.is-warning .tabs.is-toggle li.is-active a,.hero.is-warning .tabs.is-toggle li.is-active a:hover{background-color:rgba(0,0,0,.7);border-color:rgba(0,0,0,.7);color:#ffdd57}.hero.is-warning.is-bold{background-image:linear-gradient(141deg,#ffaf24 0,#ffdd57 71%,#fffa70 100%)}@media screen and (max-width:768px){.hero.is-warning.is-bold .navbar-menu{background-image:linear-gradient(141deg,#ffaf24 0,#ffdd57 71%,#fffa70 100%)}}.hero.is-danger{background-color:#f14668;color:#fff}.hero.is-danger a:not(.button):not(.dropdown-item):not(.tag):not(.pagination-link.is-current),.hero.is-danger strong{color:inherit}.hero.is-danger .title{color:#fff}.hero.is-danger .subtitle{color:rgba(255,255,255,.9)}.hero.is-danger .subtitle a:not(.button),.hero.is-danger .subtitle strong{color:#fff}@media screen and (max-width:1023px){.hero.is-danger .navbar-menu{background-color:#f14668}}.hero.is-danger .navbar-item,.hero.is-danger .navbar-link{color:rgba(255,255,255,.7)}.hero.is-danger .navbar-link.is-active,.hero.is-danger .navbar-link:hover,.hero.is-danger a.navbar-item.is-active,.hero.is-danger a.navbar-item:hover{background-color:#ef2e55;color:#fff}.hero.is-danger .tabs a{color:#fff;opacity:.9}.hero.is-danger .tabs a:hover{opacity:1}.hero.is-danger .tabs li.is-active a{opacity:1}.hero.is-danger .tabs.is-boxed a,.hero.is-danger .tabs.is-toggle a{color:#fff}.hero.is-danger .tabs.is-boxed a:hover,.hero.is-danger .tabs.is-toggle a:hover{background-color:rgba(10,10,10,.1)}.hero.is-danger .tabs.is-boxed li.is-active a,.hero.is-danger .tabs.is-boxed li.is-active a:hover,.hero.is-danger .tabs.is-toggle li.is-active a,.hero.is-danger .tabs.is-toggle li.is-active a:hover{background-color:#fff;border-color:#fff;color:#f14668}.hero.is-danger.is-bold{background-image:linear-gradient(141deg,#fa0a62 0,#f14668 71%,#f7595f 100%)}@media screen and (max-width:768px){.hero.is-danger.is-bold .navbar-menu{background-image:linear-gradient(141deg,#fa0a62 0,#f14668 71%,#f7595f 100%)}}.hero.is-small .hero-body{padding:1.5rem}@media screen and (min-width:769px),print{.hero.is-medium .hero-body{padding:9rem 1.5rem}}@media screen and (min-width:769px),print{.hero.is-large .hero-body{padding:18rem 1.5rem}}.hero.is-fullheight .hero-body,.hero.is-fullheight-with-navbar .hero-body,.hero.is-halfheight .hero-body{align-items:center;display:flex}.hero.is-fullheight .hero-body>.container,.hero.is-fullheight-with-navbar .hero-body>.container,.hero.is-halfheight .hero-body>.container{flex-grow:1;flex-shrink:1}.hero.is-halfheight{min-height:50vh}.hero.is-fullheight{min-height:100vh}.hero-video{overflow:hidden}.hero-video video{left:50%;min-height:100%;min-width:100%;position:absolute;top:50%;transform:translate3d(-50%,-50%,0)}.hero-video.is-transparent{opacity:.3}@media screen and (max-width:768px){.hero-video{display:none}}.hero-buttons{margin-top:1.5rem}@media screen and (max-width:768px){.hero-buttons .button{display:flex}.hero-buttons .button:not(:last-child){margin-bottom:.75rem}}@media screen and (min-width:769px),print{.hero-buttons{display:flex;justify-content:center}.hero-buttons .button:not(:last-child){margin-left:1.5rem}}.hero-foot,.hero-head{flex-grow:0;flex-shrink:0}.hero-body{flex-grow:1;flex-shrink:0;padding:3rem 1.5rem}.section{padding:3rem 1.5rem}@media screen and (min-width:1024px){.section.is-medium{padding:9rem 1.5rem}.section.is-large{padding:18rem 
1.5rem}}.footer{background-color:#fafafa;padding:3rem 1.5rem 6rem}
diff --git a/http-ui/public/filesize.min.js b/http-ui/public/filesize.min.js
deleted file mode 100644
index f84992b72..000000000
--- a/http-ui/public/filesize.min.js
+++ /dev/null
@@ -1,5 +0,0 @@
-/*
- 2020 Jason Mulligan
- @version 6.1.0
-*/
-"use strict";!function(e){
[The remainder of the minified filesize.js 6.1.0 bundle is cut off in this extraction. The deletion diffs for http-ui/public/jquery-3.4.1.min.js and http-ui/public/logo-black.svg that followed it lost their headers as well: the minified jQuery/Sizzle body survives only with every HTML string stripped, and the removed SVG markup survives only as bare "-" markers, so neither is reproduced here.]
diff --git a/http-ui/public/logo-white.svg b/http-ui/public/logo-white.svg
deleted file mode 100644
index 58bfd5738..000000000
--- a/http-ui/public/logo-white.svg
+++ /dev/null
@@ -1,6 +0,0 @@
[The six removed lines of SVG markup were stripped in extraction; only their "-" markers remain.]
diff --git a/http-ui/public/script.js b/http-ui/public/script.js
deleted file mode 100644
index e4de86672..000000000
--- a/http-ui/public/script.js
+++ /dev/null
@@ -1,154 +0,0 @@
-var request = null;
-var timeoutID = null;
-var display_facets = false;
-
-$('#query, #filters').on('input', function () {
-    var query = $('#query').val();
-    var filters = $('#filters').val();
-    var timeoutMs = 100;
-
-    if (timeoutID !== null) {
-        window.clearTimeout(timeoutID);
-    }
-
-    timeoutID = window.setTimeout(function () {
-        request = $.ajax({
-            type: "POST",
-            url: "query",
-            contentType: 'application/json',
-            data: JSON.stringify({
-                'query': query,
-                'filters': filters,
-                "facetDistribution": display_facets,
-            }),
-            contentType: 'application/json',
-            success: function (data, textStatus, request) {
-                results.innerHTML = '';
-                facets.innerHTML = '';
-
-                let timeSpent = request.getResponseHeader('Time-Ms');
-                let numberOfDocuments = data.documents.length;
-                count.innerHTML = data.numberOfCandidates.toLocaleString();
-                time.innerHTML = `${timeSpent}ms`;
-                time.classList.remove('fade-in-out');
-
-                for (facet_name in data.facets) {
-                    for (value in data.facets[facet_name]) {
-                        const elem = document.createElement('span');
-                        const count = data.facets[facet_name][value];
-                        elem.classList.add("tag");
-                        elem.setAttribute('data-name', facet_name);
-                        elem.setAttribute('data-value', value);
-                        elem.innerHTML = `${facet_name}:${value} (${count})`;
-                        facets.appendChild(elem);
-                    }
-                }
-
-                for (element of data.documents) {
-                    const elem = document.createElement('li');
-                    elem.classList.add("document");
-
-                    const ol = document.createElement('ol');
-
-                    for (const prop in element) {
-                        const field = document.createElement('li');
-                        field.classList.add("field");
-
-                        const attribute = document.createElement('div');
-                        attribute.classList.add("attribute");
-                        attribute.innerHTML = prop;
-
-                        const content = document.createElement('div');
-                        content.classList.add("content");
-
-                        // Stringify Objects and Arrays to avoid [Object object]
-                        if (typeof element[prop] === 'object' && element[prop] !== null) {
-                            content.innerHTML = JSON.stringify(element[prop]);
-                        } else {
-                            content.innerHTML = element[prop];
-                        }
-
-                        field.appendChild(attribute);
-                        field.appendChild(content);
-
-                        ol.appendChild(field);
-                    }
-
-                    elem.appendChild(ol);
-                    results.appendChild(elem);
-                }
-
-                // When we click on a tag we append the facet value
-                // at the end of the facet query.
-                $('#facets .tag').on('click', function () {
-                    let name = $(this).attr("data-name");
-                    let value = $(this).attr("data-value");
-
-                    let facet_query = $('#filters').val().trim();
-                    if (facet_query === "") {
-                        $('#filters').val(`${name} = "${value}"`).trigger('input');
-                    } else {
-                        $('#filters').val(`${facet_query} AND ${name} = "${value}"`).trigger('input');
-                    }
-                });
-            },
-            beforeSend: function () {
-                if (request !== null) {
-                    request.abort();
-                    time.classList.add('fade-in-out');
-                }
-            },
-        });
-    }, timeoutMs);
-});
-
-function diffArray(arr1, arr2) {
-    return arr1.concat(arr2).filter(function (val) {
-        if (!(arr1.includes(val) && arr2.includes(val)))
-            return val;
-    });
-}
-
-function selectedFacetsToArray(facets_obj) {
-    var array = [];
-    for (const facet_name in facets_obj) {
-        var subarray = [];
-        for (const facet_value of facets_obj[facet_name]) {
-            subarray.push(`${facet_name}:${facet_value}`);
-        }
-        array.push(subarray);
-    }
-    return array;
-}
-
-$('#display_facets').click(function() {
-    if (display_facets) {
-        display_facets = false;
-        $('#display_facets').html("Display facets")
-        $('#display_facets').removeClass("is-danger");
-        $('#display_facets').addClass("is-success");
-        $('#facets').hide();
-    } else {
-        display_facets = true;
-        $('#display_facets').html("Hide facets")
-        $('#display_facets').addClass("is-danger");
-        $('#display_facets').removeClass("is-success");
-        $('#facets').show();
-    }
-});
-
-// Make the number of documents a little bit prettier
-$('#docs-count').text(function(index, text) {
-    return parseInt(text).toLocaleString()
-});
-
-// Make the database size a little bit easier to read
-$('#db-size').text(function(index, text) {
-    return filesize(parseInt(text))
-});
-
-// We trigger the input when we load the script.
-$(window).on('load', function () {
-    // We execute a placeholder search when the input is empty.
-    $('#query').trigger('input');
-});
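
The click handler above grows the filter box by AND-ing `name = "value"` pairs and re-triggers the search input. Server-side, that string is handed to milli's filter parser (see `MilliFilter::from_str` in http-ui/src/main.rs further down). A minimal sketch of that parse step, assuming milli as a dependency; the `channel`/`category` field names are illustrative, not part of this codebase:

    use milli::Filter;

    fn main() {
        // The same shape of string script.js builds after two tag clicks:
        let condition = r#"channel = "ponce" AND category = "sports""#;
        // As in http-ui below, from_str yields Ok(None) for an empty
        // condition, hence the nested Option.
        let filter = Filter::from_str(condition).unwrap();
        assert!(filter.is_some());
    }
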
diff --git a/http-ui/public/style.css b/http-ui/public/style.css
deleted file mode 100644
index ef032e51e..000000000
--- a/http-ui/public/style.css
+++ /dev/null
@@ -1,144 +0,0 @@
-#results {
-    max-width: 900px;
-    margin: 20px auto 0 auto;
-    padding: 0;
-}
-
-#facets .tag {
-    margin-right: 1em;
-    margin-bottom: 1em;
-}
-
-#facets {
-    display: none;
-    max-width: 900px;
-    margin: 20px auto 0 auto;
-    padding: 0;
-    max-height: 16em;
-    overflow: scroll;
-}
-
-#display_facets {
-    margin: 20px auto 0 auto;
-    padding: 5px;
-    max-height: 16em;
-    overflow: scroll;
-}
-
-#facets .tag:hover {
-    cursor: pointer;
-}
-
-#logo-white {
-    display: none;
-}
-
-#logo-black {
-    display: inherit;
-}
-
-.notification {
-    display: flex;
-    justify-content: center;
-}
-
-.document {
-    padding: 20px 20px;
-    background-color: #f5f5f5;
-    border-radius: 4px;
-    margin-bottom: 20px;
-    display: flex;
-}
-
-.document ol {
-    flex: 0 0 75%;
-    max-width: 75%;
-    padding: 0;
-    margin: 0;
-}
-
-.document .image {
-    max-width: 25%;
-    flex: 0 0 25%;
-    padding-left: 30px;
-    box-sizing: border-box;
-}
-
-.document .image img {
-    width: 100%;
-}
-
-.field {
-    list-style-type: none;
-    display: flex;
-    flex-wrap: wrap;
-}
-
-.field:not(:last-child) {
-    margin-bottom: 7px;
-}
-
-.attribute {
-    flex: 0 0 35%;
-    max-width: 35%;
-    text-align: right;
-    padding-right: 10px;
-    box-sizing: border-box;
-    text-transform: uppercase;
-    opacity: 0.7;
-}
-
-.content {
-    max-width: 65%;
-    flex: 0 0 65%;
-    box-sizing: border-box;
-    padding-left: 10px;
-    color: rgba(0,0,0,.9);
-}
-
-.content mark {
-    background-color: hsl(204, 86%, 88%);
-    color: hsl(204, 86%, 25%);
-}
-
-@keyframes fadeInOut {
-    0% { opacity: 1; }
-    30% { opacity: 0.3; }
-    100% { opacity: 1; }
-}
-
-.fade-in-out {
-    animation: fadeInOut ease 1s infinite;
-}
-
-@media (prefers-color-scheme:dark) {
-    #logo-white {
-        display: inherit;
-    }
-
-    #logo-black {
-        display: none;
-    }
-
-    .hero.is-light {
-        background-color: #242424;
-        color: inherit;
-    }
-
-    .hero.is-light .title {
-        color: inherit;
-    }
-
-    .document {
-        background-color: #242424;
-    }
-
-    .content {
-        color: #dbdbdb;
-    }
-
-    .content mark {
-        background-color: hsl(0, 0%, 35%);
-        color: hsl(0,0%,90.2%);
-    }
-}
diff --git a/http-ui/public/updates-script.js b/http-ui/public/updates-script.js
deleted file mode 100644
index bb91de313..000000000
--- a/http-ui/public/updates-script.js
+++ /dev/null
@@ -1,102 +0,0 @@
-$(window).on('load', function () {
-    let wsProtcol = "ws";
-    if (window.location.protocol === 'https:') {
-        wsProtcol = 'wss';
-    }
-
-    let url = wsProtcol + '://' + window.location.hostname + ':' + window.location.port + '/updates/ws';
-    var socket = new WebSocket(url);
-
-    socket.onmessage = function (event) {
-        let status = JSON.parse(event.data);
-
-        if (status.type == 'Pending') {
-            const elem = document.createElement('li');
-            elem.classList.add("document");
-            elem.setAttribute("id", 'update-' + status.update_id);
-
-            const ol = document.createElement('ol');
-            const field = document.createElement('li');
-            field.classList.add("field");
-
-            const attributeUpdateId = document.createElement('div');
-            attributeUpdateId.classList.add("attribute");
-            attributeUpdateId.innerHTML = "update id";
-
-            const contentUpdateId = document.createElement('div');
-            contentUpdateId.classList.add("updateId");
-            contentUpdateId.classList.add("content");
-            contentUpdateId.innerHTML = status.update_id;
-
-            field.appendChild(attributeUpdateId);
-            field.appendChild(contentUpdateId);
-
-            const attributeUpdateStatus = document.createElement('div');
-            attributeUpdateStatus.classList.add("attribute");
-            attributeUpdateStatus.innerHTML = "update status";
-
-            const contentUpdateStatus = document.createElement('div');
-            contentUpdateStatus.classList.add("updateStatus");
-            contentUpdateStatus.classList.add("content");
-            contentUpdateStatus.innerHTML = 'pending';
-
-            field.appendChild(attributeUpdateStatus);
-            field.appendChild(contentUpdateStatus);
-
-            ol.appendChild(field);
-            elem.appendChild(ol);
-
-            prependChild(results, elem);
-        }
-
-        if (status.type == "Progressing") {
-            const id = 'update-' + status.update_id;
-            const content = $(`#${id} .updateStatus.content`);
-
-            let html;
-
-            let { type, step, total_steps, current, total } = status.meta;
-
-            if (type === 'DocumentsAddition') {
-                // If the total is null or undefined then the progress result is Infinity.
-                let progress = Math.round(current / total * 100);
-                // We must divide the progress by the total number of indexing steps.
-                progress = progress / total_steps;
-                // And mark the previous steps as processed.
-                progress = progress + (step * 100 / total_steps);
-                // Generate the appropriate html bulma progress bar.
-                html = `<progress class="progress" title="${progress}%" value="${progress}" max="100"></progress>`;
-            } else {
-                html = `<progress class="progress is-danger" max="100"></progress>`;
-            }
-
-            content.html(html);
-        }
-
-        if (status.type == "Processed") {
-            const id = 'update-' + status.update_id;
-            const content = $(`#${id} .updateStatus.content`);
-            content.html('processed ' + JSON.stringify(status.meta));
-        }
-
-        if (status.type == "Aborted") {
-            const id = 'update-' + status.update_id;
-            const content = $(`#${id} .updateStatus.content`);
-            content.html('aborted ' + JSON.stringify(status.meta));
-        }
-    }
-});
-
-function prependChild(parent, newFirstChild) {
-    parent.insertBefore(newFirstChild, parent.firstChild)
-}
-
-// Make the number of documents a little bit prettier
-$('#docs-count').text(function(index, text) {
-    return parseInt(text).toLocaleString()
-});
-
-// Make the database size a little bit easier to read
-$('#db-size').text(function(index, text) {
-    return filesize(parseInt(text))
-});
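
The `Progressing` branch above spreads per-step progress across a single bar: the current step's local percentage is scaled into its 1/total_steps share, and every previous step counts as fully done. A standalone Rust transcription of that arithmetic (the function name is ours, not part of the codebase):

    fn overall_progress(step: usize, total_steps: usize, current: usize, total: usize) -> f64 {
        // Local percentage within the current indexing step.
        let local = current as f64 / total as f64 * 100.0;
        // Scale it into this step's share of the bar, then add the
        // fully completed previous steps.
        local / total_steps as f64 + step as f64 * 100.0 / total_steps as f64
    }

    fn main() {
        // Halfway through step 2 (0-based) of 4: 12.5 + 50.0 = 62.5%.
        println!("{}", overall_progress(2, 4, 50, 100));
    }
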
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
deleted file mode 100644
index 3b14889cc..000000000
--- a/http-ui/src/main.rs
+++ /dev/null
@@ -1,1176 +0,0 @@
-mod update_store;
-
-use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
-use std::fmt::Display;
-use std::fs::{create_dir_all, File};
-use std::io::{BufReader, Cursor, Read};
-use std::net::SocketAddr;
-use std::num::{NonZeroU32, NonZeroUsize};
-use std::path::PathBuf;
-use std::str::FromStr;
-use std::sync::Arc;
-use std::time::Instant;
-use std::{io, mem};
-
-use askama_warp::Template;
-use byte_unit::Byte;
-use either::Either;
-use flate2::read::GzDecoder;
-use futures::{stream, FutureExt, StreamExt};
-use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-use milli::heed::EnvOpenOptions;
-use milli::tokenizer::TokenizerBuilder;
-use milli::update::UpdateIndexingStep::*;
-use milli::update::{
-    ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
-};
-use milli::{
-    obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index,
-    MatcherBuilder, Object, SearchResult, SortError,
-};
-use once_cell::sync::OnceCell;
-use serde::{Deserialize, Serialize};
-use serde_json::Value;
-use structopt::StructOpt;
-use tokio::fs::File as TFile;
-use tokio::io::AsyncWriteExt;
-use tokio::sync::broadcast;
-use tokio_stream::wrappers::BroadcastStream;
-use warp::filters::ws::Message;
-use warp::http::Response;
-use warp::Filter;
-
-use self::update_store::UpdateStore;
-
-#[global_allocator]
-static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-static GLOBAL_CONFIG: OnceCell<IndexerConfig> = OnceCell::new();
-
-#[derive(Debug, StructOpt)]
-/// The HTTP main server of the milli project.
-pub struct Opt {
-    /// The database path where the LMDB database is located.
-    /// It is created if it doesn't already exist.
-    #[structopt(long = "db", parse(from_os_str))]
-    database: PathBuf,
-
-    /// The maximum size the database can take on disk. It is recommended to specify
-    /// the whole disk space (value must be a multiple of a page size).
-    #[structopt(long = "db-size", default_value = "100 GiB")]
-    database_size: Byte,
-
-    /// The maximum size the database that stores the updates can take on disk. It is recommended
-    /// to specify the whole disk space (value must be a multiple of a page size).
-    #[structopt(long = "udb-size", default_value = "10 GiB")]
-    update_database_size: Byte,
-
-    /// Disable document highlighting on the dashboard.
-    #[structopt(long)]
-    disable_highlighting: bool,
-
-    /// Verbose mode (-v, -vv, -vvv, etc.)
-    #[structopt(short, long, parse(from_occurrences))]
-    verbose: usize,
-
-    /// The IP and port on which the database will listen for HTTP requests.
-    #[structopt(short = "l", long, default_value = "127.0.0.1:9700")]
-    http_listen_addr: String,
-
-    #[structopt(flatten)]
-    indexer: IndexerOpt,
-}
-
-#[derive(Debug, Clone, StructOpt)]
-pub struct IndexerOpt {
-    /// The number of documents to skip before printing
-    /// a log regarding the indexing advancement.
-    #[structopt(long, default_value = "100000")] // 100k
-    pub log_every_n: usize,
-
-    /// MTBL max number of chunks in bytes.
-    #[structopt(long)]
-    pub max_nb_chunks: Option<usize>,
-
-    /// The maximum amount of memory to use for the MTBL buffer. It is recommended
-    /// to use something like 80%-90% of the available memory.
-    ///
-    /// It is automatically split by the number of jobs e.g. if you use 7 jobs
-    /// and 7 GB of max memory, each thread will use a maximum of 1 GB.
-    #[structopt(long, default_value = "7 GiB")]
-    pub max_memory: Byte,
-
-    /// Size of the linked hash map cache when indexing.
-    /// The bigger it is, the faster the indexing is but the more memory it takes.
-    #[structopt(long, default_value = "500")]
-    pub linked_hash_map_size: usize,
-
-    /// The name of the compression algorithm to use when compressing intermediate
-    /// chunks during indexing documents.
-    ///
-    /// Choosing a fast algorithm will make the indexing faster but may consume more memory.
-    #[structopt(long, possible_values = &["snappy", "zlib", "lz4", "lz4hc", "zstd"])]
-    pub chunk_compression_type: Option<CompressionType>,
-
-    /// The level of compression of the chosen algorithm.
-    #[structopt(long, requires = "chunk-compression-type")]
-    pub chunk_compression_level: Option<u32>,
-
-    /// The number of bytes to remove from the beginning of the chunks while reading/sorting
-    /// or merging them.
-    ///
-    /// File fusing must only be enabled on file systems that support the `FALLOC_FL_COLLAPSE_RANGE`
-    /// flag (i.e. ext4 and XFS). File fusing will only work if `enable-chunk-fusing` is set.
-    #[structopt(long, default_value = "4 GiB")]
-    pub chunk_fusing_shrink_size: Byte,
-
-    /// Enable the chunk fusing or not, this reduces the amount of disk used by a factor of 2.
-    #[structopt(long)]
-    pub enable_chunk_fusing: bool,
-
-    /// Number of parallel jobs for indexing, defaults to # of CPUs.
-    #[structopt(long)]
-    pub indexing_jobs: Option<usize>,
-
-    /// Maximum relative position in an attribute for a word to be indexed.
-    /// Any value higher than 65535 will be clamped.
-    #[structopt(long)]
-    pub max_positions_per_attributes: Option<u32>,
-}
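
The `max_memory` doc-comment above describes an even split of the memory budget across indexing jobs. A small sketch of that division, assuming the byte-unit and num_cpus crates; the helper itself is illustrative, not milli API:

    use std::str::FromStr;

    use byte_unit::Byte;

    fn per_job_memory(max_memory: Byte, indexing_jobs: Option<usize>) -> u128 {
        // Fall back to one job per logical CPU when --indexing-jobs is unset.
        let jobs = indexing_jobs.unwrap_or_else(num_cpus::get).max(1);
        max_memory.get_bytes() / jobs as u128
    }

    fn main() {
        // 7 GiB across 7 jobs leaves each thread ~1 GiB, as the doc-comment says.
        let budget = Byte::from_str("7 GiB").unwrap();
        println!("{}", per_job_memory(budget, Some(7)));
    }
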
-
-struct Highlighter<'s, A> {
-    matcher_builder: MatcherBuilder<'s, A>,
-}
-
-impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
-    fn new(matcher_builder: MatcherBuilder<'s, A>) -> Self {
-        Self { matcher_builder }
-    }
-
-    fn highlight_value(&self, value: Value) -> Value {
-        match value {
-            Value::Null => Value::Null,
-            Value::Bool(boolean) => Value::Bool(boolean),
-            Value::Number(number) => Value::Number(number),
-            Value::String(old_string) => {
-                let mut matcher = self.matcher_builder.build(&old_string);
-
-                let format_options = FormatOptions { highlight: true, crop: Some(10) };
-
-                Value::String(matcher.format(format_options).to_string())
-            }
-            Value::Array(values) => {
-                Value::Array(values.into_iter().map(|v| self.highlight_value(v)).collect())
-            }
-            Value::Object(object) => Value::Object(
-                object.into_iter().map(|(k, v)| (k, self.highlight_value(v))).collect(),
-            ),
-        }
-    }
-
-    fn highlight_record(&self, object: &mut Object, attributes_to_highlight: &HashSet<String>) {
-        // TODO do we need to create a string for element that are not and needs to be highlight?
-        for (key, value) in object.iter_mut() {
-            if attributes_to_highlight.contains(key) {
-                let old_value = mem::take(value);
-                *value = self.highlight_value(old_value);
-            }
-        }
-    }
-}
-
-#[derive(Template)]
-#[template(path = "index.html")]
-struct IndexTemplate {
-    db_name: String,
-    db_size: usize,
-    docs_count: usize,
-}
-
-#[derive(Template)]
-#[template(path = "updates.html")]
-struct UpdatesTemplate<M: Serialize + Send, P: Serialize + Send, N: Serialize + Send> {
-    db_name: String,
-    db_size: usize,
-    docs_count: usize,
-    updates: Vec<UpdateStatus<M, P, N>>,
-}
-
-#[derive(Debug, Clone, Serialize)]
-#[serde(tag = "type")]
-enum UpdateStatus<M, P, N> {
-    Pending { update_id: u64, meta: M },
-    Progressing { update_id: u64, meta: P },
-    Processed { update_id: u64, meta: N },
-    Aborted { update_id: u64, meta: M },
-}
-
-impl<M, P, N> UpdateStatus<M, P, N> {
-    fn update_id(&self) -> u64 {
-        match self {
-            UpdateStatus::Pending { update_id, .. } => *update_id,
-            UpdateStatus::Progressing { update_id, .. } => *update_id,
-            UpdateStatus::Processed { update_id, .. } => *update_id,
-            UpdateStatus::Aborted { update_id, .. } => *update_id,
-        }
-    }
-}
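
`UpdateStatus` is internally tagged, so each variant serializes with a `"type"` field next to its data; that is the `status.type` the dashboard's updates-script.js (removed above) switches on. A self-contained illustration with a stripped-down enum, not the one above:

    use serde::Serialize;

    #[derive(Serialize)]
    #[serde(tag = "type")]
    enum Status {
        Pending { update_id: u64 },
    }

    fn main() {
        let status = Status::Pending { update_id: 0 };
        // Prints {"type":"Pending","update_id":0}.
        println!("{}", serde_json::to_string(&status).unwrap());
    }
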
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(tag = "type")]
-enum UpdateMeta {
-    DocumentsAddition { method: String, format: String, encoding: Option<String> },
-    ClearDocuments,
-    Settings(Settings),
-    Facets(Facets),
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(tag = "type")]
-enum UpdateMetaProgress {
-    DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option<usize> },
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
-#[serde(deny_unknown_fields)]
-#[serde(rename_all = "camelCase")]
-struct Settings {
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    displayed_attributes: Setting<Vec<String>>,
-
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    searchable_attributes: Setting<Vec<String>>,
-
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    filterable_attributes: Setting<HashSet<String>>,
-
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    sortable_attributes: Setting<HashSet<String>>,
-
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    criteria: Setting<Vec<String>>,
-
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    stop_words: Setting<BTreeSet<String>>,
-
-    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
-    synonyms: Setting<HashMap<String, Vec<String>>>,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(deny_unknown_fields)]
-#[serde(rename_all = "camelCase")]
-struct Facets {
-    level_group_size: Option<NonZeroUsize>,
-    min_level_size: Option<NonZeroUsize>,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(deny_unknown_fields)]
-#[serde(rename_all = "camelCase")]
-struct WordsPrefixes {
-    threshold: Option<f64>,
-    max_prefix_length: Option<usize>,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-#[serde(deny_unknown_fields)]
-#[serde(rename_all = "camelCase")]
-struct WordsLevelPositions {
-    level_group_size: Option<NonZeroU32>,
-    min_level_size: Option<NonZeroU32>,
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    let opt = Opt::from_args();
-
-    stderrlog::new()
-        .verbosity(opt.verbose)
-        .show_level(false)
-        .timestamp(stderrlog::Timestamp::Off)
-        .init()?;
-
-    create_dir_all(&opt.database)?;
-    let mut options = EnvOpenOptions::new();
-    options.map_size(opt.database_size.get_bytes() as usize);
-
-    // Setup the global thread pool
-    let jobs = opt.indexer.indexing_jobs.unwrap_or(0);
-    let pool = rayon::ThreadPoolBuilder::new().num_threads(jobs).build()?;
-
-    let config = IndexerConfig {
-        max_nb_chunks: opt.indexer.max_nb_chunks,
-        chunk_compression_level: opt.indexer.chunk_compression_level,
-        max_positions_per_attributes: opt.indexer.max_positions_per_attributes,
-        thread_pool: Some(pool),
-        log_every_n: Some(opt.indexer.log_every_n),
-        max_memory: Some(opt.indexer.max_memory.get_bytes() as usize),
-        chunk_compression_type: opt.indexer.chunk_compression_type.unwrap_or(CompressionType::None),
-        ..Default::default()
-    };
-
-    GLOBAL_CONFIG.set(config).unwrap();
-
-    // Open the LMDB database.
-    let index = Index::new(options, &opt.database)?;
-
-    // Setup the LMDB based update database.
-    let mut update_store_options = EnvOpenOptions::new();
-    update_store_options.map_size(opt.update_database_size.get_bytes() as usize);
-
-    let update_store_path = opt.database.join("updates.mdb");
-    create_dir_all(&update_store_path)?;
-
-    let (update_status_sender, _) = broadcast::channel(100);
-    let update_status_sender_cloned = update_status_sender.clone();
-    let index_cloned = index.clone();
-    let update_store = UpdateStore::open(
-        update_store_options,
-        update_store_path,
-        // the type hint is necessary: https://github.com/rust-lang/rust/issues/32600
-        move |update_id, meta, content: &_| {
-            // We prepare the update by using the update builder.
-
-            let before_update = Instant::now();
-            // we extract the update type and execute the update itself.
-            let result: anyhow::Result<()> = (|| match meta {
-                UpdateMeta::DocumentsAddition { method, format, encoding } => {
-                    // We must use the write transaction of the update here.
-                    let mut wtxn = index_cloned.write_txn()?;
-                    let update_method = match method.as_str() {
-                        "replace" => IndexDocumentsMethod::ReplaceDocuments,
-                        "update" => IndexDocumentsMethod::UpdateDocuments,
-                        otherwise => panic!("invalid indexing method {:?}", otherwise),
-                    };
-                    let indexing_config = IndexDocumentsConfig {
-                        update_method,
-                        autogenerate_docids: true,
-                        ..Default::default()
-                    };
-
-                    let indexing_callback = |indexing_step| {
-                        let (current, total) = match indexing_step {
-                            RemapDocumentAddition { documents_seen } => (documents_seen, None),
-                            ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
-                                (documents_seen, Some(total_documents))
-                            }
-                            IndexDocuments { documents_seen, total_documents } => {
-                                (documents_seen, Some(total_documents))
-                            }
-                            MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
-                                (databases_seen, Some(total_databases))
-                            }
-                        };
-                        let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
-                            update_id,
-                            meta: UpdateMetaProgress::DocumentsAddition {
-                                step: indexing_step.step(),
-                                total_steps: indexing_step.number_of_steps(),
-                                current,
-                                total,
-                            },
-                        });
-                    };
-
-                    let builder = milli::update::IndexDocuments::new(
-                        &mut wtxn,
-                        &index_cloned,
-                        GLOBAL_CONFIG.get().unwrap(),
-                        indexing_config,
-                        indexing_callback,
-                    )?;
-
-                    let reader = match encoding.as_deref() {
-                        Some("gzip") => Box::new(GzDecoder::new(content)),
-                        None => Box::new(content) as Box<dyn Read>,
-                        otherwise => panic!("invalid encoding format {:?}", otherwise),
-                    };
-
-                    let documents = match format.as_str() {
-                        "csv" => documents_from_csv(reader)?,
-                        "json" => documents_from_json(reader)?,
-                        "jsonl" => documents_from_jsonl(reader)?,
-                        otherwise => panic!("invalid update format {:?}", otherwise),
-                    };
-
-                    let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?;
-
-                    let (builder, user_error) = builder.add_documents(documents)?;
-                    let _count = user_error?;
-                    let result = builder.execute();
-
-                    match result {
-                        Ok(_) => wtxn.commit().map_err(Into::into),
-                        Err(e) => Err(e.into()),
-                    }
-                }
-                UpdateMeta::ClearDocuments => {
-                    // We must use the write transaction of the update here.
-                    let mut wtxn = index_cloned.write_txn()?;
-                    let builder = ClearDocuments::new(&mut wtxn, &index_cloned);
-
-                    match builder.execute() {
-                        Ok(_count) => wtxn.commit().map_err(Into::into),
-                        Err(e) => Err(e.into()),
-                    }
-                }
-                UpdateMeta::Settings(settings) => {
-                    // We must use the write transaction of the update here.
-                    let mut wtxn = index_cloned.write_txn()?;
-                    let mut builder = milli::update::Settings::new(
-                        &mut wtxn,
-                        &index_cloned,
-                        GLOBAL_CONFIG.get().unwrap(),
-                    );
-
-                    // We transpose the settings JSON struct into a real setting update.
-                    match settings.searchable_attributes {
-                        Setting::Set(searchable_attributes) => {
-                            builder.set_searchable_fields(searchable_attributes)
-                        }
-                        Setting::Reset => builder.reset_searchable_fields(),
-                        Setting::NotSet => (),
-                    }
-
-                    // We transpose the settings JSON struct into a real setting update.
-                    match settings.displayed_attributes {
-                        Setting::Set(displayed_attributes) => {
-                            builder.set_displayed_fields(displayed_attributes)
-                        }
-                        Setting::Reset => builder.reset_displayed_fields(),
-                        Setting::NotSet => (),
-                    }
-
-                    // We transpose the settings JSON struct into a real setting update.
-                    match settings.filterable_attributes {
-                        Setting::Set(filterable_attributes) => {
-                            builder.set_filterable_fields(filterable_attributes)
-                        }
-                        Setting::Reset => builder.reset_filterable_fields(),
-                        Setting::NotSet => (),
-                    }
-
-                    // We transpose the settings JSON struct into a real setting update.
-                    match settings.sortable_attributes {
-                        Setting::Set(sortable_attributes) => {
-                            builder.set_sortable_fields(sortable_attributes)
-                        }
-                        Setting::Reset => builder.reset_sortable_fields(),
-                        Setting::NotSet => (),
-                    }
-
-                    // We transpose the settings JSON struct into a real setting update.
-                    match settings.criteria {
-                        Setting::Set(criteria) => builder.set_criteria(criteria),
-                        Setting::Reset => builder.reset_criteria(),
-                        Setting::NotSet => (),
-                    }
-
-                    // We transpose the settings JSON struct into a real setting update.
-                    match settings.stop_words {
-                        Setting::Set(stop_words) => builder.set_stop_words(stop_words),
-                        Setting::Reset => builder.reset_stop_words(),
-                        Setting::NotSet => (),
-                    }
-
-                    // We transpose the settings JSON struct into a real setting update.
-                    match settings.synonyms {
-                        Setting::Set(synonyms) => builder.set_synonyms(synonyms),
-                        Setting::Reset => builder.reset_synonyms(),
-                        Setting::NotSet => (),
-                    }
-
-                    let result = builder.execute(|indexing_step| {
-                        let (current, total) = match indexing_step {
-                            RemapDocumentAddition { documents_seen } => (documents_seen, None),
-                            ComputeIdsAndMergeDocuments { documents_seen, total_documents } => {
-                                (documents_seen, Some(total_documents))
-                            }
-                            IndexDocuments { documents_seen, total_documents } => {
-                                (documents_seen, Some(total_documents))
-                            }
-                            MergeDataIntoFinalDatabase { databases_seen, total_databases } => {
-                                (databases_seen, Some(total_databases))
-                            }
-                        };
-                        let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
-                            update_id,
-                            meta: UpdateMetaProgress::DocumentsAddition {
-                                step: indexing_step.step(),
-                                total_steps: indexing_step.number_of_steps(),
-                                current,
-                                total,
-                            },
-                        });
-                    });
-
-                    match result {
-                        Ok(_count) => wtxn.commit().map_err(Into::into),
-                        Err(e) => Err(e.into()),
-                    }
-                }
-                UpdateMeta::Facets(levels) => {
-                    // We must use the write transaction of the update here.
-                    let mut wtxn = index_cloned.write_txn()?;
-                    let mut builder = milli::update::Facets::new(&mut wtxn, &index_cloned);
-                    if let Some(value) = levels.level_group_size {
-                        builder.level_group_size(value);
-                    }
-                    if let Some(value) = levels.min_level_size {
-                        builder.min_level_size(value);
-                    }
-                    match builder.execute() {
-                        Ok(()) => wtxn.commit().map_err(Into::into),
-                        Err(e) => Err(e.into()),
-                    }
-                }
-            })();
-
-            let meta = match result {
-                Ok(()) => {
-                    format!("valid update content processed in {:.02?}", before_update.elapsed())
-                }
-                Err(e) => format!("error while processing update content: {:?}", e),
-            };
-
-            let processed = UpdateStatus::Processed { update_id, meta: meta.clone() };
-            let _ = update_status_sender_cloned.send(processed);
-
-            Ok(meta)
-        },
-    )?;
-
-    // The database name will not change.
-    let db_name = opt.database.file_stem().and_then(|s| s.to_str()).unwrap_or("").to_string();
-    let lmdb_path = opt.database.join("data.mdb");
-
-    // We run and wait on the HTTP server
-
-    // Expose an HTML page to debug the search in a browser
-    let db_name_cloned = db_name.clone();
-    let lmdb_path_cloned = lmdb_path.clone();
-    let index_cloned = index.clone();
-    let dash_html_route =
-        warp::filters::method::get().and(warp::filters::path::end()).map(move || {
-            // We retrieve the database size.
-            let db_size =
-                File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len() as usize;
-
-            // And the number of documents in the database.
-            let rtxn = index_cloned.read_txn().unwrap();
-            let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize;
-
-            IndexTemplate { db_name: db_name_cloned.clone(), db_size, docs_count }
-        });
-
-    let update_store_cloned = update_store.clone();
-    let lmdb_path_cloned = lmdb_path.clone();
-    let index_cloned = index.clone();
-    let updates_list_or_html_route = warp::filters::method::get()
-        .and(warp::header("Accept"))
-        .and(warp::path!("updates"))
-        .map(move |header: String| {
-            let update_store = update_store_cloned.clone();
-            let mut updates = update_store
-                .iter_metas(|processed, aborted, pending| {
-                    let mut updates = Vec::<UpdateStatus<UpdateMeta, UpdateMetaProgress, String>>::new();
-                    for result in processed {
-                        let (uid, meta) = result?;
-                        updates.push(UpdateStatus::Processed { update_id: uid.get(), meta });
-                    }
-                    for result in aborted {
-                        let (uid, meta) = result?;
-                        updates.push(UpdateStatus::Aborted { update_id: uid.get(), meta });
-                    }
-                    for result in pending {
-                        let (uid, meta) = result?;
-                        updates.push(UpdateStatus::Pending { update_id: uid.get(), meta });
-                    }
-                    Ok(updates)
-                })
-                .unwrap();
-
-            updates.sort_unstable_by(|s1, s2| s1.update_id().cmp(&s2.update_id()).reverse());
-
-            if header.contains("text/html") {
-                // We retrieve the database size.
-                let db_size =
-                    File::open(lmdb_path_cloned.clone()).unwrap().metadata().unwrap().len()
-                        as usize;
-
-                // And the number of documents in the database.
-                let rtxn = index_cloned.read_txn().unwrap();
-                let docs_count = index_cloned.clone().number_of_documents(&rtxn).unwrap() as usize;
-
-                let template =
-                    UpdatesTemplate { db_name: db_name.clone(), db_size, docs_count, updates };
-                Box::new(template) as Box<dyn warp::Reply>
-            } else {
-                Box::new(warp::reply::json(&updates))
-            }
-        });
-
-    let dash_bulma_route =
-        warp::filters::method::get().and(warp::path!("bulma.min.css")).map(|| {
-            Response::builder()
-                .header("content-type", "text/css; charset=utf-8")
-                .body(include_str!("../public/bulma.min.css"))
-        });
-
-    let dash_bulma_dark_route =
-        warp::filters::method::get().and(warp::path!("bulma-prefers-dark.min.css")).map(|| {
-            Response::builder()
-                .header("content-type", "text/css; charset=utf-8")
-                .body(include_str!("../public/bulma-prefers-dark.min.css"))
-        });
-
-    let dash_style_route = warp::filters::method::get().and(warp::path!("style.css")).map(|| {
-        Response::builder()
-            .header("content-type", "text/css; charset=utf-8")
-            .body(include_str!("../public/style.css"))
-    });
-
-    let dash_jquery_route =
-        warp::filters::method::get().and(warp::path!("jquery-3.4.1.min.js")).map(|| {
-            Response::builder()
-                .header("content-type", "application/javascript; charset=utf-8")
-                .body(include_str!("../public/jquery-3.4.1.min.js"))
-        });
-
-    let dash_filesize_route =
-        warp::filters::method::get().and(warp::path!("filesize.min.js")).map(|| {
-            Response::builder()
-                .header("content-type", "application/javascript; charset=utf-8")
-                .body(include_str!("../public/filesize.min.js"))
-        });
-
-    let dash_script_route = warp::filters::method::get().and(warp::path!("script.js")).map(|| {
-        Response::builder()
-            .header("content-type", "application/javascript; charset=utf-8")
-            .body(include_str!("../public/script.js"))
-    });
-
-    let updates_script_route =
-        warp::filters::method::get().and(warp::path!("updates-script.js")).map(|| {
-            Response::builder()
-                .header("content-type", "application/javascript; charset=utf-8")
-                .body(include_str!("../public/updates-script.js"))
-        });
-
-    let dash_logo_white_route =
-        warp::filters::method::get().and(warp::path!("logo-white.svg")).map(|| {
-            Response::builder()
-                .header("content-type", "image/svg+xml")
-                .body(include_str!("../public/logo-white.svg"))
-        });
-
-    let dash_logo_black_route =
-        warp::filters::method::get().and(warp::path!("logo-black.svg")).map(|| {
-            Response::builder()
-                .header("content-type", "image/svg+xml")
-                .body(include_str!("../public/logo-black.svg"))
-        });
-
-    #[derive(Debug, Deserialize)]
-    #[serde(untagged)]
-    enum UntaggedEither<L, R> {
-        Left(L),
-        Right(R),
-    }
-
-    impl<L, R> From<UntaggedEither<L, R>> for Either<L, R> {
-        fn from(value: UntaggedEither<L, R>) -> Either<L, R> {
-            match value {
-                UntaggedEither::Left(left) => Either::Left(left),
-                UntaggedEither::Right(right) => Either::Right(right),
-            }
-        }
-    }
-
-    #[derive(Debug, Deserialize)]
-    #[serde(deny_unknown_fields)]
-    #[serde(rename_all = "camelCase")]
-    struct QueryBody {
-        query: Option<String>,
-        filters: Option<String>,
-        sort: Option<String>,
-        facet_filters: Option<Vec<UntaggedEither<Vec<String>, String>>>,
-        facet_distribution: Option<bool>,
-        limit: Option<usize>,
-    }
-
-    #[derive(Debug, Serialize)]
-    #[serde(rename_all = "camelCase")]
-    struct Answer {
-        documents: Vec<Object>,
-        number_of_candidates: u64,
-        facets: BTreeMap<String, BTreeMap<String, u64>>,
-    }
-
-    let disable_highlighting = opt.disable_highlighting;
-    let index_cloned = index.clone();
-    let query_route = warp::filters::method::post()
-        .and(warp::path!("query"))
-        .and(warp::body::json())
-        .map(move |query: QueryBody| {
-            let before_search = Instant::now();
-            let index = index_cloned.clone();
-            let rtxn = index.read_txn().unwrap();
-
-            let mut search = index.search(&rtxn);
-            if let Some(query) = query.query {
-                search.query(query);
-            }
-
-            let filters = match query.filters.as_ref() {
-                Some(condition) if !condition.trim().is_empty() => {
-                    MilliFilter::from_str(condition).unwrap()
-                }
-                _otherwise => None,
-            };
-
-            let facet_filters = match query.facet_filters.as_ref() {
-                Some(array) => {
-                    let eithers = array.iter().map(|either| match either {
-                        UntaggedEither::Left(l) => {
-                            Either::Left(l.iter().map(|s| s.as_str()).collect::<Vec<_>>())
-                        }
-                        UntaggedEither::Right(r) => Either::Right(r.as_str()),
-                    });
-                    MilliFilter::from_array(eithers).unwrap()
-                }
-                _otherwise => None,
-            };
-
-            let condition = match (filters, facet_filters) {
-                (Some(filters), Some(facet_filters)) => {
-                    Some(FilterCondition::And(vec![filters.into(), facet_filters.into()]))
-                }
-                (Some(condition), None) | (None, Some(condition)) => Some(condition.into()),
-                _otherwise => None,
-            };
-
-            if let Some(condition) = condition {
-                search.filter(condition.into());
-            }
-
-            if let Some(limit) = query.limit {
-                search.limit(limit);
-            }
-
-            if let Some(sort) = query.sort {
-                search.sort_criteria(vec![sort.parse().map_err(SortError::from).unwrap()]);
-            }
-
-            let SearchResult { matching_words, candidates, documents_ids } =
-                search.execute().unwrap();
-
-            let number_of_candidates = candidates.len();
-            let facets = if query.facet_distribution == Some(true) {
-                Some(index.facets_distribution(&rtxn).candidates(candidates).execute().unwrap())
-            } else {
-                None
-            };
-
-            let mut documents = Vec::new();
-            let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
-            let displayed_fields = match index.displayed_fields_ids(&rtxn).unwrap() {
-                Some(fields) => fields,
-                None => fields_ids_map.iter().map(|(id, _)| id).collect(),
-            };
-            let attributes_to_highlight = match index.searchable_fields(&rtxn).unwrap() {
-                Some(fields) => fields.into_iter().map(String::from).collect(),
-                None => fields_ids_map.iter().map(|(_, name)| name).map(String::from).collect(),
-            };
-
-            let mut matcher_builder = MatcherBuilder::new(
-                matching_words,
-                TokenizerBuilder::default().create_char_map(true).build(),
-            );
-            matcher_builder.highlight_prefix("<mark>".to_string());
-            matcher_builder.highlight_suffix("</mark>".to_string());
-            let highlighter = Highlighter::new(matcher_builder);
-            for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
-                let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
-                if !disable_highlighting {
-                    highlighter.highlight_record(&mut object, &attributes_to_highlight);
-                }
-
-                documents.push(object);
-            }
-
-            let answer =
-                Answer { documents, number_of_candidates, facets: facets.unwrap_or_default() };
-
-            Response::builder()
-                .header("Content-Type", "application/json")
-                .header("Time-Ms", before_search.elapsed().as_millis().to_string())
-                .body(serde_json::to_string(&answer).unwrap())
-        });
-
-    let index_cloned = index.clone();
-    let document_route = warp::filters::method::get().and(warp::path!("document" / String)).map(
-        move |id: String| {
-            let index = index_cloned.clone();
-            let rtxn = index.read_txn().unwrap();
-
-            let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
-            let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
-            let displayed_fields = match index.displayed_fields_ids(&rtxn).unwrap() {
-                Some(fields) => fields,
-                None => fields_ids_map.iter().map(|(id, _)| id).collect(),
-            };
-
-            match external_documents_ids.get(&id) {
-                Some(document_id) => {
-                    let document_id = document_id
as u32; - let (_, obkv) = - index.documents(&rtxn, Some(document_id)).unwrap().pop().unwrap(); - let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); - - Response::builder() - .header("Content-Type", "application/json") - .body(serde_json::to_string(&document).unwrap()) - } - None => Response::builder() - .status(404) - .body(format!("Document with id {:?} not found.", id)), - } - }, - ); - - async fn buf_stream( - update_store: Arc>, - update_status_sender: broadcast::Sender< - UpdateStatus, - >, - update_method: Option, - format: String, - encoding: Option, - mut stream: impl futures::Stream> + Unpin, - ) -> Result { - let file = tokio::task::block_in_place(tempfile::tempfile).unwrap(); - let mut file = TFile::from_std(file); - - while let Some(result) = stream.next().await { - let mut bytes = Vec::new(); - result.unwrap().reader().read_to_end(&mut bytes).unwrap(); - file.write_all(&bytes[..]).await.unwrap(); - } - - let file = file.into_std().await; - let mmap = unsafe { memmap2::Mmap::map(&file).expect("can't map file") }; - - let method = match update_method.as_deref() { - Some("replace") => String::from("replace"), - Some("update") => String::from("update"), - _ => String::from("replace"), - }; - - let meta = UpdateMeta::DocumentsAddition { method, format, encoding }; - let update_id = update_store.register_update(&meta, &mmap[..]).unwrap(); - let _ = update_status_sender.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - - Ok(warp::reply()) - } - - #[derive(Deserialize)] - struct QueryUpdate { - method: Option, - } - - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let indexing_route = warp::filters::method::post() - .and(warp::path!("documents")) - .and(warp::header::header("content-type")) - .and(warp::header::optional::("content-encoding")) - .and(warp::query::query()) - .and(warp::body::stream()) - .and_then(move |content_type: String, content_encoding, params: QueryUpdate, stream| { - let format = match content_type.as_str() { - "text/csv" => "csv", - "application/json" => "json", - "application/x-ndjson" => "jsonl", - otherwise => panic!("invalid update format: {}", otherwise), - }; - - buf_stream( - update_store_cloned.clone(), - update_status_sender_cloned.clone(), - params.method, - format.to_string(), - content_encoding, - stream, - ) - }); - - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let clearing_route = - warp::filters::method::post().and(warp::path!("clear-documents")).map(move || { - let meta = UpdateMeta::ClearDocuments; - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - Ok(warp::reply()) - }); - - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let change_settings_route = warp::filters::method::post() - .and(warp::path!("settings")) - .and(warp::body::json()) - .map(move |settings: Settings| { - let meta = UpdateMeta::Settings(settings); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - Ok(warp::reply()) - }); - - let update_store_cloned = update_store.clone(); - let 
update_status_sender_cloned = update_status_sender.clone(); - let change_facet_levels_route = warp::filters::method::post() - .and(warp::path!("facet-level-sizes")) - .and(warp::body::json()) - .map(move |levels: Facets| { - let meta = UpdateMeta::Facets(levels); - let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); - let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); - eprintln!("update {} registered", update_id); - warp::reply() - }); - - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let abort_update_id_route = warp::filters::method::delete() - .and(warp::path!("update" / u64)) - .map(move |update_id: u64| { - if let Some(meta) = update_store_cloned.abort_update(update_id).unwrap() { - let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta }); - eprintln!("update {} aborted", update_id); - } - warp::reply() - }); - - let update_store_cloned = update_store.clone(); - let update_status_sender_cloned = update_status_sender.clone(); - let abort_pending_updates_route = - warp::filters::method::delete().and(warp::path!("updates")).map(move || { - let updates = update_store_cloned.abort_pendings().unwrap(); - for (update_id, meta) in updates { - let _ = update_status_sender_cloned.send(UpdateStatus::Aborted { update_id, meta }); - eprintln!("update {} aborted", update_id); - } - warp::reply() - }); - - let update_ws_route = - warp::ws().and(warp::path!("updates" / "ws")).map(move |ws: warp::ws::Ws| { - // And then our closure will be called when it completes... - let update_status_receiver = update_status_sender.subscribe(); - ws.on_upgrade(|websocket| { - // Just echo all updates messages... - BroadcastStream::new(update_status_receiver) - .flat_map(|result| match result { - Ok(status) => { - let msg = serde_json::to_string(&status).unwrap(); - stream::iter(Some(Ok(Message::text(msg)))) - } - Err(e) => { - eprintln!("channel error: {:?}", e); - stream::iter(None) - } - }) - .forward(websocket) - .map(|result| { - if let Err(e) = result { - eprintln!("websocket error: {:?}", e); - } - }) - }) - }); - - let die_route = warp::filters::method::get().and(warp::path!("die")).map(move || { - eprintln!("Killed by an HTTP request received on the die route"); - std::process::exit(0); - #[allow(unreachable_code)] - warp::reply() - }); - - let routes = dash_html_route - .or(updates_list_or_html_route) - .or(dash_bulma_route) - .or(dash_bulma_dark_route) - .or(dash_style_route) - .or(dash_jquery_route) - .or(dash_filesize_route) - .or(dash_script_route) - .or(updates_script_route) - .or(dash_logo_white_route) - .or(dash_logo_black_route) - .or(query_route) - .or(document_route) - .or(indexing_route) - .or(abort_update_id_route) - .or(abort_pending_updates_route) - .or(clearing_route) - .or(change_settings_route) - .or(change_facet_levels_route) - .or(update_ws_route) - .or(die_route); - - let addr = SocketAddr::from_str(&opt.http_listen_addr)?; - warp::serve(routes).run(addr).await; - Ok(()) -} - -fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let reader = BufReader::new(reader); - - for result in serde_json::Deserializer::from_reader(reader).into_iter::() { - let object = result?; - documents.append_json_object(&object)?; - } - - documents.into_inner().map_err(Into::into) -} - -fn documents_from_json(reader: impl Read) -> anyhow::Result> { - let mut documents = 
DocumentsBatchBuilder::new(Vec::new()); - - documents.append_json_array(reader)?; - - documents.into_inner().map_err(Into::into) -} - -fn documents_from_csv(reader: impl Read) -> anyhow::Result> { - let csv = csv::Reader::from_reader(reader); - - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - documents.append_csv(csv)?; - - documents.into_inner().map_err(Into::into) -} - -#[cfg(test)] -mod tests { - use maplit::{btreeset, hashmap, hashset}; - use milli::update::Setting; - use serde_test::{assert_tokens, Token}; - - use crate::Settings; - - #[test] - fn serde_settings_set() { - let settings = Settings { - displayed_attributes: Setting::Set(vec!["name".to_string()]), - searchable_attributes: Setting::Set(vec!["age".to_string()]), - filterable_attributes: Setting::Set(hashset! { "age".to_string() }), - sortable_attributes: Setting::Set(hashset! { "age".to_string() }), - criteria: Setting::Set(vec!["age:asc".to_string()]), - stop_words: Setting::Set(btreeset! { "and".to_string() }), - synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }), - }; - - assert_tokens( - &settings, - &[ - Token::Struct { name: "Settings", len: 7 }, - Token::Str("displayedAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("name"), - Token::SeqEnd, - Token::Str("searchableAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("age"), - Token::SeqEnd, - Token::Str("filterableAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("age"), - Token::SeqEnd, - Token::Str("sortableAttributes"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("age"), - Token::SeqEnd, - Token::Str("criteria"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("age:asc"), - Token::SeqEnd, - Token::Str("stopWords"), - Token::Some, - Token::Seq { len: Some(1) }, - Token::Str("and"), - Token::SeqEnd, - Token::Str("synonyms"), - Token::Some, - Token::Map { len: Some(1) }, - Token::Str("alex"), - Token::Seq { len: Some(1) }, - Token::Str("alexey"), - Token::SeqEnd, - Token::MapEnd, - Token::StructEnd, - ], - ); - } - - #[test] - fn serde_settings_reset() { - let settings = Settings { - displayed_attributes: Setting::Reset, - searchable_attributes: Setting::Reset, - filterable_attributes: Setting::Reset, - sortable_attributes: Setting::Reset, - criteria: Setting::Reset, - stop_words: Setting::Reset, - synonyms: Setting::Reset, - }; - - assert_tokens( - &settings, - &[ - Token::Struct { name: "Settings", len: 7 }, - Token::Str("displayedAttributes"), - Token::None, - Token::Str("searchableAttributes"), - Token::None, - Token::Str("filterableAttributes"), - Token::None, - Token::Str("sortableAttributes"), - Token::None, - Token::Str("criteria"), - Token::None, - Token::Str("stopWords"), - Token::None, - Token::Str("synonyms"), - Token::None, - Token::StructEnd, - ], - ); - } - - #[test] - fn serde_settings_notset() { - let settings = Settings { - displayed_attributes: Setting::NotSet, - searchable_attributes: Setting::NotSet, - filterable_attributes: Setting::NotSet, - sortable_attributes: Setting::NotSet, - criteria: Setting::NotSet, - stop_words: Setting::NotSet, - synonyms: Setting::NotSet, - }; - - assert_tokens(&settings, &[Token::Struct { name: "Settings", len: 0 }, Token::StructEnd]); - } -} diff --git a/http-ui/src/update_store.rs b/http-ui/src/update_store.rs deleted file mode 100644 index bbbff25c8..000000000 --- a/http-ui/src/update_store.rs +++ /dev/null @@ -1,362 +0,0 @@ -#![allow(unused)] - -use std::path::Path; -use 
std::sync::Arc; - -use crossbeam_channel::Sender; -use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson}; -use heed::{Database, Env, EnvOpenOptions}; -use milli::heed; -use serde::{Deserialize, Serialize}; - -pub type BEU64 = heed::zerocopy::U64; - -#[derive(Clone)] -pub struct UpdateStore { - env: Env, - pending_meta: Database, SerdeJson>, - pending: Database, ByteSlice>, - processed_meta: Database, SerdeJson>, - aborted_meta: Database, SerdeJson>, - notification_sender: Sender<()>, -} - -pub trait UpdateHandler { - fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result; -} - -impl UpdateHandler for F -where - F: FnMut(u64, M, &[u8]) -> heed::Result + Send + 'static, -{ - fn handle_update(&mut self, update_id: u64, meta: M, content: &[u8]) -> heed::Result { - self(update_id, meta, content) - } -} - -impl UpdateStore { - pub fn open( - mut options: EnvOpenOptions, - path: P, - mut update_handler: U, - ) -> heed::Result>> - where - P: AsRef, - U: UpdateHandler + Send + 'static, - M: for<'a> Deserialize<'a>, - N: Serialize, - { - options.max_dbs(4); - let env = options.open(path)?; - let pending_meta = env.create_database(Some("pending-meta"))?; - let pending = env.create_database(Some("pending"))?; - let processed_meta = env.create_database(Some("processed-meta"))?; - let aborted_meta = env.create_database(Some("aborted-meta"))?; - - let (notification_sender, notification_receiver) = crossbeam_channel::bounded(1); - // Send a first notification to trigger the process. - let _ = notification_sender.send(()); - - let update_store = Arc::new(UpdateStore { - env, - pending, - pending_meta, - processed_meta, - aborted_meta, - notification_sender, - }); - - let update_store_cloned = update_store.clone(); - std::thread::spawn(move || { - // Block and wait for something to process. - for () in notification_receiver { - loop { - match update_store_cloned.process_pending_update(&mut update_handler) { - Ok(Some(_)) => (), - Ok(None) => break, - Err(e) => eprintln!("error while processing update: {}", e), - } - } - } - }); - - Ok(update_store) - } - - /// Returns the new biggest id to use to store the new update. - fn new_update_id(&self, txn: &heed::RoTxn) -> heed::Result { - let last_pending = - self.pending_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - - let last_processed = - self.processed_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - - let last_aborted = - self.aborted_meta.remap_data_type::().last(txn)?.map(|(k, _)| k.get()); - - let last_update_id = - [last_pending, last_processed, last_aborted].iter().copied().flatten().max(); - - match last_update_id { - Some(last_id) => Ok(last_id + 1), - None => Ok(0), - } - } - - /// Registers the update content in the pending store and the meta - /// into the pending-meta store. Returns the new unique update id. - pub fn register_update(&self, meta: &M, content: &[u8]) -> heed::Result - where - M: Serialize, - { - let mut wtxn = self.env.write_txn()?; - - // We ask the update store to give us a new update id, this is safe, - // no other update can have the same id because we use a write txn before - // asking for the id and registering it so other update registering - // will be forced to wait for a new write txn. 
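-        // (LMDB only ever allows a single live write transaction per
-        // environment, so the id computed below can neither be observed nor
-        // reused by a concurrent `register_update` call: that call would
-        // block on `self.env.write_txn()` above until we commit.)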
-        let update_id = self.new_update_id(&wtxn)?;
-        let update_key = BEU64::new(update_id);
-
-        self.pending_meta.put(&mut wtxn, &update_key, meta)?;
-        self.pending.put(&mut wtxn, &update_key, content)?;
-
-        wtxn.commit()?;
-
-        if let Err(e) = self.notification_sender.try_send(()) {
-            assert!(!e.is_disconnected(), "update notification channel is disconnected");
-        }
-
-        Ok(update_id)
-    }
-
-    /// Executes the user provided function on the next pending update (the one with the lowest id).
-    /// This is asynchronous as it lets the user process the update with a read-only txn and
-    /// only writes the resulting meta to the processed-meta store *after* it has been processed.
-    fn process_pending_update<U>(&self, handler: &mut U) -> heed::Result<Option<(u64, N)>>
-    where
-        U: UpdateHandler<M, N>,
-        M: for<'a> Deserialize<'a>,
-        N: Serialize,
-    {
-        // Create a read transaction to be able to retrieve the pending updates in order.
-        let rtxn = self.env.read_txn()?;
-        let first_meta = self.pending_meta.first(&rtxn)?;
-
-        // If there is a pending update, we process it while holding
-        // only a reader, not a writer.
-        match first_meta {
-            Some((first_id, first_meta)) => {
-                let first_content =
-                    self.pending.get(&rtxn, &first_id)?.expect("associated update content");
-
-                // Process the pending update using the provided user function.
-                let new_meta = handler.handle_update(first_id.get(), first_meta, first_content)?;
-                drop(rtxn);
-
-                // Once the pending update has been successfully processed
-                // we must remove its content from the pending stores and
-                // write the *new* meta to the processed-meta store and commit.
-                let mut wtxn = self.env.write_txn()?;
-                self.pending_meta.delete(&mut wtxn, &first_id)?;
-                self.pending.delete(&mut wtxn, &first_id)?;
-                self.processed_meta.put(&mut wtxn, &first_id, &new_meta)?;
-                wtxn.commit()?;
-
-                Ok(Some((first_id.get(), new_meta)))
-            }
-            None => Ok(None),
-        }
-    }
-
-    /// The id and metadata of the update that is currently being processed,
-    /// `None` if no update is being processed.
-    pub fn processing_update(&self) -> heed::Result<Option<(u64, M)>>
-    where
-        M: for<'a> Deserialize<'a>,
-    {
-        let rtxn = self.env.read_txn()?;
-        match self.pending_meta.first(&rtxn)? {
-            Some((key, meta)) => Ok(Some((key.get(), meta))),
-            None => Ok(None),
-        }
-    }
-
-    /// Executes the user defined function with the meta-store iterators: the first
-    /// iterator is over the *processed* metas, the second over the *aborted* metas,
-    /// and the last over the *pending* metas.
-    pub fn iter_metas<F, T>(&self, mut f: F) -> heed::Result<T>
-    where
-        M: for<'a> Deserialize<'a>,
-        N: for<'a> Deserialize<'a>,
-        F: for<'a> FnMut(
-            heed::RoIter<'a, OwnedType<BEU64>, SerdeJson<N>>,
-            heed::RoIter<'a, OwnedType<BEU64>, SerdeJson<M>>,
-            heed::RoIter<'a, OwnedType<BEU64>, SerdeJson<M>>,
-        ) -> heed::Result<T>,
-    {
-        let rtxn = self.env.read_txn()?;
-
-        // We get the processed, aborted and pending meta iterators.
-        let processed_iter = self.processed_meta.iter(&rtxn)?;
-        let aborted_iter = self.aborted_meta.iter(&rtxn)?;
-        let pending_iter = self.pending_meta.iter(&rtxn)?;
-
-        // We execute the user defined function with the three iterators.
-        (f)(processed_iter, aborted_iter, pending_iter)
-    }
-
-    /// Returns the meta associated with an update, or `None` if the update doesn't exist.
-    pub fn meta(&self, update_id: u64) -> heed::Result<Option<UpdateStatusMeta<M, N>>>
-    where
-        M: for<'a> Deserialize<'a>,
-        N: for<'a> Deserialize<'a>,
-    {
-        let rtxn = self.env.read_txn()?;
-        let key = BEU64::new(update_id);
-
-        if let Some(meta) = self.pending_meta.get(&rtxn, &key)? {
-            return Ok(Some(UpdateStatusMeta::Pending(meta)));
-        }
-
-        if let Some(meta) = self.processed_meta.get(&rtxn, &key)? {
-            return Ok(Some(UpdateStatusMeta::Processed(meta)));
-        }
-
-        if let Some(meta) = self.aborted_meta.get(&rtxn, &key)? {
-            return Ok(Some(UpdateStatusMeta::Aborted(meta)));
-        }
-
-        Ok(None)
-    }
-
-    /// Aborts an update: the aborted update's content is deleted and
-    /// its meta is moved into the aborted-updates database.
-    ///
-    /// Trying to abort an update that is currently being processed,
-    /// has already been processed, or doesn't actually exist will
-    /// return `None`.
-    pub fn abort_update(&self, update_id: u64) -> heed::Result<Option<M>>
-    where
-        M: Serialize + for<'a> Deserialize<'a>,
-    {
-        let mut wtxn = self.env.write_txn()?;
-        let key = BEU64::new(update_id);
-
-        // We cannot abort an update that is currently being processed.
-        if self.pending_meta.first(&wtxn)?.map(|(key, _)| key.get()) == Some(update_id) {
-            return Ok(None);
-        }
-
-        let meta = match self.pending_meta.get(&wtxn, &key)? {
-            Some(meta) => meta,
-            None => return Ok(None),
-        };
-
-        self.aborted_meta.put(&mut wtxn, &key, &meta)?;
-        self.pending_meta.delete(&mut wtxn, &key)?;
-        self.pending.delete(&mut wtxn, &key)?;
-
-        wtxn.commit()?;
-
-        Ok(Some(meta))
-    }
-
-    /// Aborts all the pending updates, except the one currently being processed.
-    /// Returns the update ids and metas that were successfully aborted.
-    pub fn abort_pendings(&self) -> heed::Result<Vec<(u64, M)>>
-    where
-        M: Serialize + for<'a> Deserialize<'a>,
-    {
-        let mut wtxn = self.env.write_txn()?;
-        let mut aborted_updates = Vec::new();
-
-        // We skip the first pending update as it is currently being processed.
-        for result in self.pending_meta.iter(&wtxn)?.skip(1) {
-            let (key, meta) = result?;
-            let id = key.get();
-            aborted_updates.push((id, meta));
-        }
-
-        for (id, meta) in &aborted_updates {
-            let key = BEU64::new(*id);
-            self.aborted_meta.put(&mut wtxn, &key, &meta)?;
-            self.pending_meta.delete(&mut wtxn, &key)?;
-            self.pending.delete(&mut wtxn, &key)?;
-        }
-
-        wtxn.commit()?;
-
-        Ok(aborted_updates)
-    }
-}
-
-#[derive(Debug, PartialEq, Eq, Hash)]
-pub enum UpdateStatusMeta<M, N> {
-    Pending(M),
-    Processed(N),
-    Aborted(M),
-}
-
-#[cfg(test)]
-mod tests {
-    use std::thread;
-    use std::time::{Duration, Instant};
-
-    use super::*;
-
-    #[test]
-    fn simple() {
-        let dir = tempfile::tempdir().unwrap();
-        let options = EnvOpenOptions::new();
-        let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| {
-            Ok(meta + " processed")
-        })
-        .unwrap();
-
-        let meta = String::from("kiki");
-        let update_id = update_store.register_update(&meta, &[]).unwrap();
-
-        thread::sleep(Duration::from_millis(100));
-
-        let meta = update_store.meta(update_id).unwrap().unwrap();
-        assert_eq!(meta, UpdateStatusMeta::Processed(format!("kiki processed")));
-    }
-
-    #[test]
-    #[ignore]
-    fn long_running_update() {
-        let dir = tempfile::tempdir().unwrap();
-        let options = EnvOpenOptions::new();
-        let update_store = UpdateStore::open(options, dir, |_id, meta: String, _content: &_| {
-            thread::sleep(Duration::from_millis(400));
-            Ok(meta + " processed")
-        })
-        .unwrap();
-
-        let before_register = Instant::now();
-
-        let meta = String::from("kiki");
-        let update_id_kiki = update_store.register_update(&meta, &[]).unwrap();
-        assert!(before_register.elapsed() < Duration::from_millis(200));
-
-        let meta = String::from("coco");
-        let update_id_coco = update_store.register_update(&meta, &[]).unwrap();
-        assert!(before_register.elapsed() <
Duration::from_millis(200)); - - let meta = String::from("cucu"); - let update_id_cucu = update_store.register_update(&meta, &[]).unwrap(); - assert!(before_register.elapsed() < Duration::from_millis(200)); - - thread::sleep(Duration::from_millis(400 * 3 + 100)); - - let meta = update_store.meta(update_id_kiki).unwrap().unwrap(); - assert_eq!(meta, UpdateStatusMeta::Processed(format!("kiki processed"))); - - let meta = update_store.meta(update_id_coco).unwrap().unwrap(); - assert_eq!(meta, UpdateStatusMeta::Processed(format!("coco processed"))); - - let meta = update_store.meta(update_id_cucu).unwrap().unwrap(); - assert_eq!(meta, UpdateStatusMeta::Processed(format!("cucu processed"))); - } -} diff --git a/http-ui/templates/index.html b/http-ui/templates/index.html deleted file mode 100644 index 49fb0eb2b..000000000 --- a/http-ui/templates/index.html +++ /dev/null @@ -1,102 +0,0 @@ - - - - - - - - - - - {{ db_name }} | The milli engine - - - -
-  [deleted file: http-ui/templates/index.html — the search dashboard page.
-   The HTML markup was lost in extraction; what survives is the page title
-   above ("{{ db_name }} | The milli engine"), the white and black milli
-   logo images, and a search box with a results area served alongside the
-   bundled CSS/JS assets exposed by the dashboard routes.]
diff --git a/http-ui/templates/updates.html b/http-ui/templates/updates.html
deleted file mode 100644
index 276bee40c..000000000
--- a/http-ui/templates/updates.html
+++ /dev/null
@@ -1,95 +0,0 @@
-  [deleted file: http-ui/templates/updates.html — the updates listing page,
-   titled "{{ db_name }} | Updates". The markup was lost in extraction; the
-   surviving Askama logic renders one entry per update:
-     {% for update in updates %}{% match update %}
-       {% when UpdateStatus::Pending with { update_id, meta } %}
-         update id {{ update_id }}, update status "pending"
-       {% when UpdateStatus::Processed with { update_id, meta } %}
-         update id {{ update_id }}, update status "{{ meta }}"
-       {% when UpdateStatus::Aborted with { update_id, meta } %}
-         update id {{ update_id }}, update status "aborted"
-     {% else %}{% endmatch %}{% endfor %}]
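Every route in the deleted `http-ui/src/main.rs` above follows the same shape: clone the shared `Arc`'d handles outside the closure, build a `warp` filter chain, and `map` into a reply. A minimal, self-contained sketch of that pattern — not code from this patch, assuming the `warp`, `tokio` and `serde_json` crates, with a hypothetical `hits` route and an atomic counter standing in for the real index and update-store handles:

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

use warp::Filter;

#[tokio::main]
async fn main() {
    // Hypothetical stand-in for the Arc'd Index/UpdateStore handles that
    // the real routes clone before moving them into their closures.
    let counter = Arc::new(AtomicU64::new(0));

    let counter_cloned = counter.clone();
    let hits_route = warp::filters::method::get()
        .and(warp::path!("hits"))
        .map(move || {
            // Each request observes the shared state through its own clone.
            let hits = counter_cloned.fetch_add(1, Ordering::Relaxed) + 1;
            warp::reply::json(&serde_json::json!({ "hits": hits }))
        });

    warp::serve(hits_route).run(([127, 0, 0, 1], 3030)).await;
}
```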
- - - - - - diff --git a/infos/Cargo.toml b/infos/Cargo.toml deleted file mode 100644 index 23d21f042..000000000 --- a/infos/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "infos" -version = "0.33.4" -authors = ["Clément Renault "] -edition = "2018" -publish = false - -[dependencies] -anyhow = "1.0.56" -byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } -csv = "1.1.6" -milli = { path = "../milli" } -mimalloc = { version = "0.1.29", default-features = false } -roaring = "0.9.0" -serde_json = "1.0.79" -stderrlog = "0.5.1" -structopt = { version = "0.3.26", default-features = false } diff --git a/infos/src/main.rs b/infos/src/main.rs deleted file mode 100644 index f5fdcf94a..000000000 --- a/infos/src/main.rs +++ /dev/null @@ -1,1221 +0,0 @@ -use std::fmt::Write as _; -use std::path::PathBuf; -use std::{fmt, io, str}; - -use anyhow::Context; -use byte_unit::Byte; -use heed::EnvOpenOptions; -use milli::facet::FacetType; -use milli::index::db_name::*; -use milli::{heed, FieldId, Index}; -use structopt::StructOpt; -use Command::*; - -#[global_allocator] -static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; - -const ALL_DATABASE_NAMES: &[&str] = &[ - MAIN, - WORD_DOCIDS, - WORD_PREFIX_DOCIDS, - DOCID_WORD_POSITIONS, - WORD_PAIR_PROXIMITY_DOCIDS, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS, - WORD_POSITION_DOCIDS, - WORD_PREFIX_POSITION_DOCIDS, - FIELD_ID_WORD_COUNT_DOCIDS, - FACET_ID_F64_DOCIDS, - FACET_ID_STRING_DOCIDS, - FIELD_ID_DOCID_FACET_F64S, - FIELD_ID_DOCID_FACET_STRINGS, - EXACT_WORD_DOCIDS, - EXACT_WORD_PREFIX_DOCIDS, - DOCUMENTS, -]; - -const POSTINGS_DATABASE_NAMES: &[&str] = &[ - WORD_DOCIDS, - WORD_PREFIX_DOCIDS, - DOCID_WORD_POSITIONS, - WORD_PAIR_PROXIMITY_DOCIDS, - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS, -]; - -#[derive(Debug, StructOpt)] -/// A stats fetcher for milli. -pub struct Opt { - /// The database path where the database is located. - /// It is created if it doesn't already exist. - #[structopt(long = "db", parse(from_os_str))] - database: PathBuf, - - /// The maximum size the database can take on disk. It is recommended to specify - /// the whole disk space (value must be a multiple of a page size). - #[structopt(long = "db-size", default_value = "100 GiB")] - database_size: Byte, - - /// Verbose mode (-v, -vv, -vvv, etc.) - #[structopt(short, long, parse(from_occurrences))] - verbose: usize, - - #[structopt(subcommand)] - command: Command, -} - -#[derive(Debug, StructOpt)] -enum Command { - /// Outputs a CSV of the most frequent words of this index. - /// - /// `word` are displayed and ordered by frequency. - /// `document_frequency` defines the number of documents which contains the word. - MostCommonWords { - /// The maximum number of frequencies to return. - #[structopt(default_value = "10")] - limit: usize, - }, - - /// Outputs a CSV with the biggest entries of the database. - BiggestValues { - /// The maximum number of sizes to return. - #[structopt(default_value = "10")] - limit: usize, - }, - - /// Outputs a CSV with the documents ids where the given words appears. - WordsDocids { - /// Display the whole documents ids in details. - #[structopt(long)] - full_display: bool, - - /// The words to display the documents ids of. - words: Vec, - }, - - /// Outputs a CSV with the documents ids where the given words prefixes appears. - WordsPrefixesDocids { - /// Display the whole documents ids in details. - #[structopt(long)] - full_display: bool, - - /// The prefixes to display the documents ids of. 
-        prefixes: Vec<String>,
-    },
-
-    /// Outputs a CSV with the documents ids along with the facet numbers where they appear.
-    FacetNumbersDocids {
-        /// Display the whole documents ids in details.
-        #[structopt(long)]
-        full_display: bool,
-
-        /// The field name in the document.
-        field_name: String,
-    },
-
-    /// Outputs a CSV with the documents ids along with the facet strings where they appear.
-    FacetStringsDocids {
-        /// Display the whole documents ids in details.
-        #[structopt(long)]
-        full_display: bool,
-
-        /// The field name in the document.
-        field_name: String,
-    },
-
-    /// Outputs a CSV with the documents ids along with the word level positions where they appear.
-    WordsLevelPositionsDocids {
-        /// Display the whole documents ids in details.
-        #[structopt(long)]
-        full_display: bool,
-
-        /// Words appearing in the documents.
-        words: Vec<String>,
-    },
-
-    /// Outputs a CSV with the documents ids along with
-    /// the word prefix level positions where they appear.
-    WordPrefixesLevelPositionsDocids {
-        /// Display the whole documents ids in details.
-        #[structopt(long)]
-        full_display: bool,
-
-        /// Prefixes of words appearing in the documents.
-        prefixes: Vec<String>,
-    },
-
-    /// Outputs a CSV with the documents ids along with
-    /// the field id and the word count where they appear.
-    FieldIdWordCountDocids {
-        /// Display the whole documents ids in details.
-        #[structopt(long)]
-        full_display: bool,
-
-        /// The field name in the document.
-        field_name: String,
-    },
-
-    /// Outputs a CSV with the documents ids, words and the positions where each word appears.
-    DocidsWordsPositions {
-        /// Display the whole positions in detail.
-        #[structopt(long)]
-        full_display: bool,
-
-        /// If defined, only retrieve the documents that correspond to these internal ids.
-        internal_documents_ids: Vec<u32>,
-    },
-
-    /// Outputs some facet number statistics for the given facet name.
-    FacetNumberStats {
-        /// The field name in the document.
-        field_name: String,
-    },
-
-    /// Outputs the average number of *different* words by document.
-    AverageNumberOfWordsByDoc,
-
-    /// Outputs the average number of positions for each word of the documents.
-    AverageNumberOfPositionsByWord,
-
-    /// Outputs some statistics about the given database (e.g. median, quartiles,
-    /// percentiles, minimum, maximum, average, key size, value size).
-    DatabaseStats {
-        #[structopt(possible_values = POSTINGS_DATABASE_NAMES)]
-        database: String,
-    },
-
-    /// Outputs the size in bytes of the specified database names.
-    SizeOfDatabase {
-        /// The name of the database to measure the size of, if not specified it's equivalent
-        /// to specifying all the database names.
-        #[structopt(possible_values = ALL_DATABASE_NAMES)]
-        databases: Vec<String>,
-    },
-
-    /// Outputs a CSV with the proximities for the two specified words and
-    /// the documents ids where these relations appear.
-    ///
-    /// `word1`, `word2` define the word pair specified *in this specific order*.
-    /// `proximity` defines the proximity between the two specified words.
-    /// `documents_ids` defines the documents ids where the relation appears.
-    WordPairProximitiesDocids {
-        /// Display the whole documents ids in details.
-        #[structopt(long)]
-        full_display: bool,
-
-        /// First word of the word pair.
-        word1: String,
-
-        /// Second word of the word pair.
-        word2: String,
-    },
-
-    /// Outputs a CSV with the proximities for the given word and prefix and
-    /// the documents ids where these relations appear.
-    ///
-    /// `word1`, `prefix` define the word pair specified *in this specific order*.
- /// `proximity` defines the proximity between the two specified words. - /// `documents_ids` defines the documents ids where the relation appears. - WordPrefixPairProximitiesDocids { - /// Display the whole documents ids in details. - #[structopt(long)] - full_display: bool, - - /// First word of the word pair. - word1: String, - - /// Second word of the word pair. - prefix: String, - }, - - /// Outputs the words FST to standard output. - /// - /// One can use the FST binary helper to dissect and analyze it, - /// you can install it using `cargo install fst-bin`. - ExportWordsFst, - - /// Outputs the words prefix FST to standard output. - /// - /// One can use the FST binary helper to dissect and analyze it, - /// you can install it using `cargo install fst-bin`. - ExportWordsPrefixFst, - - /// Outputs the documents as JSON lines to the standard output. - /// - /// All of the fields are extracted, not just the displayed ones. - ExportDocuments { - /// If defined, only retrieve the documents that corresponds to these internal ids. - internal_documents_ids: Vec, - }, -} - -fn main() -> anyhow::Result<()> { - let opt = Opt::from_args(); - - stderrlog::new() - .verbosity(opt.verbose) - .show_level(false) - .timestamp(stderrlog::Timestamp::Off) - .init()?; - - let mut options = EnvOpenOptions::new(); - options.map_size(opt.database_size.get_bytes() as usize); - - // Return an error if the database does not exist. - if !opt.database.exists() { - anyhow::bail!("The database ({}) does not exist.", opt.database.display()); - } - - // Open the LMDB database. - let index = Index::new(options, opt.database)?; - let rtxn = index.read_txn()?; - - match opt.command { - MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), - BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit), - WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), - WordsPrefixesDocids { full_display, prefixes } => { - words_prefixes_docids(&index, &rtxn, !full_display, prefixes) - } - FacetNumbersDocids { full_display, field_name } => { - facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name) - } - FacetStringsDocids { full_display, field_name } => { - facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) - } - WordsLevelPositionsDocids { full_display, words } => { - words_positions_docids(&index, &rtxn, !full_display, words) - } - WordPrefixesLevelPositionsDocids { full_display, prefixes } => { - word_prefixes_positions_docids(&index, &rtxn, !full_display, prefixes) - } - FieldIdWordCountDocids { full_display, field_name } => { - field_id_word_count_docids(&index, &rtxn, !full_display, field_name) - } - DocidsWordsPositions { full_display, internal_documents_ids } => { - docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) - } - FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name), - AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), - AverageNumberOfPositionsByWord => average_number_of_positions_by_word(&index, &rtxn), - SizeOfDatabase { databases } => size_of_databases(&index, &rtxn, databases), - DatabaseStats { database } => database_stats(&index, &rtxn, &database), - WordPairProximitiesDocids { full_display, word1, word2 } => { - word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) - } - WordPrefixPairProximitiesDocids { full_display, word1, prefix } => { - word_prefix_pair_proximities_docids(&index, &rtxn, 
!full_display, word1, prefix) - } - ExportWordsFst => export_words_fst(&index, &rtxn), - ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), - ExportDocuments { internal_documents_ids } => { - export_documents(&index, &rtxn, internal_documents_ids) - } - } -} - -fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { - use std::cmp::Reverse; - use std::collections::BinaryHeap; - - let mut heap = BinaryHeap::with_capacity(limit + 1); - for result in index.word_docids.iter(rtxn)? { - if limit == 0 { - break; - } - let (word, docids) = result?; - heap.push((Reverse(docids.len()), word)); - if heap.len() > limit { - heap.pop(); - } - } - - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["word", "document_frequency"])?; - - for (Reverse(document_frequency), word) in heap.into_sorted_vec() { - wtr.write_record(&[word, &document_frequency.to_string()])?; - } - - Ok(wtr.flush()?) -} - -/// Helper function that converts the facet value key to a unique type -/// that can be used for log or display purposes. -fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( - rtxn: &'txn heed::RoTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result> + 'txn>> -where - KC: heed::BytesDecode<'txn>, - DC: heed::BytesDecode<'txn>, -{ - let iter = db - .remap_key_type::() - .prefix_iter(&rtxn, &field_id.to_be_bytes())? - .remap_key_type::(); - - Ok(Box::new(iter)) -} - -fn facet_number_value_to_string(level: u8, left: T, right: T) -> (u8, String) { - if level == 0 { - (level, format!("{:?}", left)) - } else { - (level, format!("{:?} to {:?}", left, right)) - } -} - -fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { - use std::cmp::Reverse; - use std::collections::BinaryHeap; - - use heed::types::ByteSlice; - - let Index { - word_docids, - word_prefix_docids, - docid_word_positions, - word_pair_proximity_docids, - word_prefix_pair_proximity_docids, - word_position_docids, - word_prefix_position_docids, - field_id_word_count_docids, - facet_id_f64_docids, - facet_id_string_docids, - facet_id_exists_docids, - exact_word_docids, - exact_word_prefix_docids, - field_id_docid_facet_f64s: _, - field_id_docid_facet_strings: _, - .. 
- } = index; - - let main_name = "main"; - let word_docids_name = "word_docids"; - let word_prefix_docids_name = "word_prefix_docids"; - let docid_word_positions_name = "docid_word_positions"; - let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; - let word_pair_proximity_docids_name = "word_pair_proximity_docids"; - let word_position_docids_name = "word_position_docids"; - let word_prefix_position_docids_name = "word_prefix_position_docids"; - let field_id_word_count_docids_name = "field_id_word_count_docids"; - let facet_id_f64_docids_name = "facet_id_f64_docids"; - let facet_id_string_docids_name = "facet_id_string_docids"; - let facet_id_exists_docids_name = "facet_id_exists_docids"; - let documents_name = "documents"; - - let mut heap = BinaryHeap::with_capacity(limit + 1); - - if limit > 0 { - // Fetch the words FST - let words_fst = index.words_fst(rtxn)?; - let length = words_fst.as_fst().as_bytes().len(); - heap.push(Reverse((length, "words-fst".to_string(), main_name))); - if heap.len() > limit { - heap.pop(); - } - - // Fetch the word prefix FST - let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; - let length = words_prefixes_fst.as_fst().as_bytes().len(); - heap.push(Reverse((length, "words-prefixes-fst".to_string(), main_name))); - if heap.len() > limit { - heap.pop(); - } - - let documents_ids = index.documents_ids(rtxn)?; - heap.push(Reverse((documents_ids.len() as usize, "documents-ids".to_string(), main_name))); - if heap.len() > limit { - heap.pop(); - } - - for result in word_docids.remap_data_type::().iter(rtxn)? { - let (word, value) = result?; - heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in exact_word_docids.remap_data_type::().iter(rtxn)? { - let (word, value) = result?; - heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in word_prefix_docids.remap_data_type::().iter(rtxn)? { - let (word, value) = result?; - heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in exact_word_prefix_docids.remap_data_type::().iter(rtxn)? { - let (word, value) = result?; - heap.push(Reverse((value.len(), word.to_string(), word_prefix_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in docid_word_positions.remap_data_type::().iter(rtxn)? { - let ((docid, word), value) = result?; - let key = format!("{} {}", docid, word); - heap.push(Reverse((value.len(), key, docid_word_positions_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in word_pair_proximity_docids.remap_data_type::().iter(rtxn)? { - let ((word1, word2, prox), value) = result?; - let key = format!("{} {} {}", word1, word2, prox); - heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in word_prefix_pair_proximity_docids.remap_data_type::().iter(rtxn)? { - let ((word, prefix, prox), value) = result?; - let key = format!("{} {} {}", word, prefix, prox); - heap.push(Reverse((value.len(), key, word_prefix_pair_proximity_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in word_position_docids.remap_data_type::().iter(rtxn)? 
{ - let ((word, pos), value) = result?; - let key = format!("{} {}", word, pos); - heap.push(Reverse((value.len(), key, word_position_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in word_prefix_position_docids.remap_data_type::().iter(rtxn)? { - let ((word, pos), value) = result?; - let key = format!("{} {}", word, pos); - heap.push(Reverse((value.len(), key, word_prefix_position_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - for result in field_id_word_count_docids.remap_data_type::().iter(rtxn)? { - let ((field_id, word_count), docids) = result?; - let key = format!("{} {}", field_id, word_count); - heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - let faceted_fields = index.faceted_fields_ids(rtxn)?; - let fields_ids_map = index.fields_ids_map(rtxn)?; - - for facet_id in faceted_fields { - let facet_name = fields_ids_map.name(facet_id).unwrap(); - - // List the facet numbers of this facet id. - let db = facet_id_f64_docids.remap_data_type::(); - for result in facet_values_iter(rtxn, db, facet_id)? { - let ((_fid, level, left, right), value) = result?; - let mut output = facet_number_value_to_string(level, left, right).1; - write!(&mut output, " (level {})", level)?; - let key = format!("{} {}", facet_name, output); - heap.push(Reverse((value.len(), key, facet_id_f64_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - // List the facet strings of this facet id. - let db = facet_id_string_docids.remap_data_type::(); - for result in facet_values_iter(rtxn, db, facet_id)? { - let ((_fid, fvalue), value) = result?; - let key = format!("{} {}", facet_name, fvalue); - heap.push(Reverse((value.len(), key, facet_id_string_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - - // List the docids where the facet exists - let db = facet_id_exists_docids.remap_data_type::(); - for result in facet_values_iter(rtxn, db, facet_id)? { - let (_fid, value) = result?; - let key = facet_name.to_string(); - heap.push(Reverse((value.len(), key, facet_id_exists_docids_name))); - if heap.len() > limit { - heap.pop(); - } - } - } - - for result in index.all_documents(rtxn)? { - let (id, value) = result?; - let size = value.iter().map(|(k, v)| k.to_ne_bytes().len() + v.len()).sum(); - heap.push(Reverse((size, id.to_string(), documents_name))); - if heap.len() > limit { - heap.pop(); - } - } - } - - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["database_name", "key_name", "size"])?; - - for Reverse((size, key_name, database_name)) in heap.into_sorted_vec() { - wtr.write_record(&[database_name.to_string(), key_name, size.to_string()])?; - } - - Ok(wtr.flush()?) -} - -fn words_docids( - index: &Index, - rtxn: &heed::RoTxn, - debug: bool, - words: Vec, -) -> anyhow::Result<()> { - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["word", "documents_ids"])?; - - for word in words { - if let Some(docids) = index.word_docids.get(rtxn, &word)? { - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - wtr.write_record(&[word, docids])?; - } - } - - Ok(wtr.flush()?) 
-} - -fn words_prefixes_docids( - index: &Index, - rtxn: &heed::RoTxn, - debug: bool, - prefixes: Vec, -) -> anyhow::Result<()> { - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["prefix", "documents_ids"])?; - - if prefixes.is_empty() { - for result in index.word_prefix_docids.iter(rtxn)? { - let (prefix, docids) = result?; - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - wtr.write_record(&[prefix, &docids])?; - } - } else { - for prefix in prefixes { - if let Some(docids) = index.word_prefix_docids.get(rtxn, &prefix)? { - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - wtr.write_record(&[prefix, docids])?; - } - } - } - - Ok(wtr.flush()?) -} - -fn facet_values_docids( - index: &Index, - rtxn: &heed::RoTxn, - debug: bool, - facet_type: FacetType, - field_name: String, -) -> anyhow::Result<()> { - let fields_ids_map = index.fields_ids_map(&rtxn)?; - let faceted_fields = index.faceted_fields_ids(&rtxn)?; - - let field_id = fields_ids_map - .id(&field_name) - .with_context(|| format!("field {} not found", field_name))?; - - if !faceted_fields.contains(&field_id) { - anyhow::bail!("field {} is not faceted", field_name); - } - - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - - match facet_type { - FacetType::Number => { - wtr.write_record(&["facet_number", "facet_level", "documents_count", "documents_ids"])?; - for result in facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)? { - let ((_fid, level, left, right), docids) = result?; - let value = facet_number_value_to_string(level, left, right).1; - let count = docids.len(); - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; - } - } - FacetType::String => { - wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; - for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { - let ((_fid, normalized), (_original, docids)) = result?; - let count = docids.len(); - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - wtr.write_record(&[normalized.to_string(), count.to_string(), docids])?; - } - } - } - - Ok(wtr.flush()?) -} - -fn words_positions_docids( - index: &Index, - rtxn: &heed::RoTxn, - debug: bool, - words: Vec, -) -> anyhow::Result<()> { - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["word", "position", "documents_count", "documents_ids"])?; - - for word in words.iter().map(AsRef::as_ref) { - let range = { - let left = (word, u32::min_value()); - let right = (word, u32::max_value()); - left..=right - }; - for result in index.word_position_docids.range(rtxn, &range)? { - let ((w, pos), docids) = result?; - - let count = docids.len().to_string(); - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - let position = format!("{:?}", pos); - wtr.write_record(&[w, &position, &count, &docids])?; - } - } - - Ok(wtr.flush()?) 
-} - -fn word_prefixes_positions_docids( - index: &Index, - rtxn: &heed::RoTxn, - debug: bool, - prefixes: Vec, -) -> anyhow::Result<()> { - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["prefix", "position", "documents_count", "documents_ids"])?; - - for word in prefixes.iter().map(AsRef::as_ref) { - let range = { - let left = (word, u32::min_value()); - let right = (word, u32::max_value()); - left..=right - }; - for result in index.word_prefix_position_docids.range(rtxn, &range)? { - let ((w, pos), docids) = result?; - - let count = docids.len().to_string(); - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - let position = format!("{:?}", pos); - wtr.write_record(&[w, &position, &count, &docids])?; - } - } - - Ok(wtr.flush()?) -} - -fn field_id_word_count_docids( - index: &Index, - rtxn: &heed::RoTxn, - debug: bool, - field_name: String, -) -> anyhow::Result<()> { - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["field_name", "word_count", "docids"])?; - - let field_id = index - .fields_ids_map(rtxn)? - .id(&field_name) - .with_context(|| format!("unknown field name: {}", &field_name))?; - - let left = (field_id, 0); - let right = (field_id, u8::max_value()); - let iter = index.field_id_word_count_docids.range(rtxn, &(left..=right))?; - - for result in iter { - let ((_, word_count), docids) = result?; - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - wtr.write_record(&[&field_name, &format!("{}", word_count), &docids])?; - } - - Ok(wtr.flush()?) -} - -fn docids_words_positions( - index: &Index, - rtxn: &heed::RoTxn, - debug: bool, - internal_ids: Vec, -) -> anyhow::Result<()> { - let stdout = io::stdout(); - let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["document_id", "word", "positions"])?; - - let iter: Box> = if internal_ids.is_empty() { - Box::new(index.docid_word_positions.iter(rtxn)?) - } else { - let vec: heed::Result> = internal_ids - .into_iter() - .map(|id| index.docid_word_positions.prefix_iter(rtxn, &(id, ""))) - .collect(); - Box::new(vec?.into_iter().flatten()) - }; - - for result in iter { - let ((id, word), positions) = result?; - let positions = if debug { - format!("{:?}", positions) - } else { - format!("{:?}", positions.iter().collect::>()) - }; - wtr.write_record(&[&id.to_string(), word, &positions])?; - } - - Ok(wtr.flush()?) 
-} - -fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { - let fields_ids_map = index.fields_ids_map(&rtxn)?; - let faceted_fields = index.faceted_fields_ids(&rtxn)?; - - let field_id = fields_ids_map - .id(&field_name) - .with_context(|| format!("field {} not found", field_name))?; - - if !faceted_fields.contains(&field_id) { - anyhow::bail!("field {} is not faceted", field_name); - } - - let iter = facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)?; - println!("The database {:?} facet stats", field_name); - - let mut level_size = 0; - let mut current_level = None; - for result in iter { - let ((_fid, level, _left, _right), _) = result?; - if let Some(current) = current_level { - if current != level { - println!("\tnumber of groups at level {}: {}", current, level_size); - level_size = 0; - } - } - current_level = Some(level); - level_size += 1; - } - - if let Some(current) = current_level { - println!("\tnumber of groups at level {}: {}", current, level_size); - } - - Ok(()) -} - -fn export_words_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { - use std::io::Write as _; - - let mut stdout = io::stdout(); - let words_fst = index.words_fst(rtxn)?; - stdout.write_all(words_fst.as_fst().as_bytes())?; - - Ok(()) -} - -fn export_words_prefix_fst(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { - use std::io::Write as _; - - let mut stdout = io::stdout(); - let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; - stdout.write_all(words_prefixes_fst.as_fst().as_bytes())?; - - Ok(()) -} - -fn export_documents( - index: &Index, - rtxn: &heed::RoTxn, - internal_ids: Vec, -) -> anyhow::Result<()> { - use std::io::{BufWriter, Write as _}; - - use milli::obkv_to_json; - - let stdout = io::stdout(); - let mut out = BufWriter::new(stdout); - - let fields_ids_map = index.fields_ids_map(rtxn)?; - let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); - - let iter: Box> = if internal_ids.is_empty() { - Box::new(index.all_documents(rtxn)?.map(|result| result.map(|(_id, obkv)| obkv))) - } else { - Box::new( - index - .documents(rtxn, internal_ids.into_iter())? 
- .into_iter() - .map(|(_id, obkv)| Ok(obkv)), - ) - }; - - for result in iter { - let obkv = result?; - let document = obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?; - serde_json::to_writer(&mut out, &document)?; - writeln!(&mut out)?; - } - - out.into_inner()?; - - Ok(()) -} - -fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { - use heed::types::DecodeIgnore; - use milli::{BEU32StrCodec, DocumentId}; - - let mut words_counts = Vec::new(); - let mut count = 0; - let mut prev = None as Option<(DocumentId, u32)>; - - let iter = - index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; - for result in iter { - let ((docid, _word), ()) = result?; - - match prev.as_mut() { - Some((prev_docid, prev_count)) if docid == *prev_docid => { - *prev_count += 1; - } - Some((prev_docid, prev_count)) => { - words_counts.push(*prev_count); - *prev_docid = docid; - *prev_count = 0; - count += 1; - } - None => prev = Some((docid, 1)), - } - } - - if let Some((_, prev_count)) = prev.take() { - words_counts.push(prev_count); - count += 1; - } - - let words_count = words_counts.into_iter().map(|c| c as usize).sum::() as f64; - let count = count as f64; - - println!("average number of different words by document: {}", words_count / count); - - Ok(()) -} - -fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { - use heed::types::DecodeIgnore; - use milli::BoRoaringBitmapCodec; - - let mut values_length = Vec::new(); - let mut count = 0; - - let db = index.docid_word_positions.as_polymorph(); - for result in db.iter::<_, DecodeIgnore, BoRoaringBitmapCodec>(rtxn)? { - let ((), val) = result?; - values_length.push(val.len() as u32); - count += 1; - } - - let values_length_sum = values_length.into_iter().map(|c| c as usize).sum::() as f64; - let count = count as f64; - - println!("average number of positions by word: {}", values_length_sum / count); - - Ok(()) -} - -fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> anyhow::Result<()> { - use heed::types::ByteSlice; - - let Index { - word_docids, - word_prefix_docids, - docid_word_positions, - word_pair_proximity_docids, - word_prefix_pair_proximity_docids, - word_position_docids, - word_prefix_position_docids, - field_id_word_count_docids, - facet_id_f64_docids, - facet_id_string_docids, - field_id_docid_facet_f64s, - field_id_docid_facet_strings, - facet_id_exists_docids, - exact_word_prefix_docids, - exact_word_docids, - .. 
- } = index; - - let names = if names.is_empty() { - ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect() - } else { - names - }; - - for name in names { - let database = match name.as_str() { - WORD_PREFIX_DOCIDS => word_prefix_docids.as_polymorph(), - WORD_DOCIDS => word_docids.as_polymorph(), - DOCID_WORD_POSITIONS => docid_word_positions.as_polymorph(), - WORD_PAIR_PROXIMITY_DOCIDS => word_pair_proximity_docids.as_polymorph(), - WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => word_prefix_pair_proximity_docids.as_polymorph(), - WORD_POSITION_DOCIDS => word_position_docids.as_polymorph(), - WORD_PREFIX_POSITION_DOCIDS => word_prefix_position_docids.as_polymorph(), - FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(), - FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(), - FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), - FACET_ID_EXISTS_DOCIDS => facet_id_exists_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), - FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), - EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(), - EXACT_WORD_PREFIX_DOCIDS => exact_word_prefix_docids.as_polymorph(), - - unknown => anyhow::bail!("unknown database {:?}", unknown), - }; - - let mut key_size: u64 = 0; - let mut val_size: u64 = 0; - let mut number_entries: u64 = 0; - for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? { - let (k, v) = result?; - key_size += k.len() as u64; - val_size += v.len() as u64; - number_entries += 1; - } - - println!("The {} database weigh:", name); - println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); - println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); - println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); - println!("\tnumber of entries: {}", number_entries); - } - - Ok(()) -} - -fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> { - use heed::types::ByteSlice; - use heed::{BytesDecode, Error}; - use milli::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; - use roaring::RoaringBitmap; - - fn compute_stats<'a, DC: BytesDecode<'a, DItem = RoaringBitmap>>( - db: heed::PolyDatabase, - rtxn: &'a heed::RoTxn, - name: &str, - ) -> anyhow::Result<()> { - let mut key_size = 0u64; - let mut val_size = 0u64; - let mut values_length = Vec::new(); - - for result in db.iter::<_, ByteSlice, ByteSlice>(rtxn)? 
{
-            let (key, val) = result?;
-            key_size += key.len() as u64;
-            val_size += val.len() as u64;
-            let val = DC::bytes_decode(val).ok_or(Error::Decoding)?;
-            values_length.push(val.len() as u32);
-        }
-
-        values_length.sort_unstable();
-        let len = values_length.len();
-
-        let twenty_five_percentile = values_length.get(len / 4).unwrap_or(&0);
-        let fifty_percentile = values_length.get(len / 2).unwrap_or(&0);
-        let seventy_five_percentile = values_length.get(len * 3 / 4).unwrap_or(&0);
-        let ninety_percentile = values_length.get(len * 90 / 100).unwrap_or(&0);
-        let ninety_five_percentile = values_length.get(len * 95 / 100).unwrap_or(&0);
-        let ninety_nine_percentile = values_length.get(len * 99 / 100).unwrap_or(&0);
-        let minimum = values_length.first().unwrap_or(&0);
-        let maximum = values_length.last().unwrap_or(&0);
-        let count = values_length.len();
-        let sum = values_length.iter().map(|l| *l as u64).sum::<u64>();
-
-        println!("The {} database stats on the lengths", name);
-        println!("\tnumber of entries: {}", count);
-        println!("\t25th percentile (first quartile): {}", twenty_five_percentile);
-        println!("\t50th percentile (median): {}", fifty_percentile);
-        println!("\t75th percentile (third quartile): {}", seventy_five_percentile);
-        println!("\t90th percentile: {}", ninety_percentile);
-        println!("\t95th percentile: {}", ninety_five_percentile);
-        println!("\t99th percentile: {}", ninety_nine_percentile);
-        println!("\tminimum: {}", minimum);
-        println!("\tmaximum: {}", maximum);
-        println!("\taverage: {}", sum as f64 / count as f64);
-        println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true));
-        println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true));
-        println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true));
-
-        Ok(())
-    }
-
-    match name {
-        WORD_DOCIDS => {
-            let db = index.word_docids.as_polymorph();
-            compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
-        }
-        WORD_PREFIX_DOCIDS => {
-            let db = index.word_prefix_docids.as_polymorph();
-            compute_stats::<RoaringBitmapCodec>(*db, rtxn, name)
-        }
-        DOCID_WORD_POSITIONS => {
-            let db = index.docid_word_positions.as_polymorph();
-            compute_stats::<BoRoaringBitmapCodec>(*db, rtxn, name)
-        }
-        WORD_PAIR_PROXIMITY_DOCIDS => {
-            let db = index.word_pair_proximity_docids.as_polymorph();
-            compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
-        }
-        WORD_PREFIX_PAIR_PROXIMITY_DOCIDS => {
-            let db = index.word_prefix_pair_proximity_docids.as_polymorph();
-            compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
-        }
-        FIELD_ID_WORD_COUNT_DOCIDS => {
-            let db = index.field_id_word_count_docids.as_polymorph();
-            compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name)
-        }
-        unknown => anyhow::bail!("unknown database {:?}", unknown),
-    }
-}
-
-fn word_pair_proximities_docids(
-    index: &Index,
-    rtxn: &heed::RoTxn,
-    debug: bool,
-    word1: String,
-    word2: String,
-) -> anyhow::Result<()> {
-    use heed::types::ByteSlice;
-    use milli::RoaringBitmapCodec;
-
-    let stdout = io::stdout();
-    let mut wtr = csv::Writer::from_writer(stdout.lock());
-    wtr.write_record(&["word1", "word2", "proximity", "documents_ids"])?;
-
-    // Create the prefix key with only the pair of words.
-    let mut prefix = Vec::with_capacity(word1.len() + word2.len() + 1);
-    prefix.extend_from_slice(word1.as_bytes());
-    prefix.push(0);
-    prefix.extend_from_slice(word2.as_bytes());
-    prefix.push(0);
-
-    let db = index.word_pair_proximity_docids.as_polymorph();
-    let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?;
-    for result in iter {
-        let (key, docids) = result?;
-
-        // Skip keys that are longer than the requested one,
-        // a longer key means that the second word is a prefix of the request word.
-        if key.len() != prefix.len() + 1 {
-            continue;
-        }
-
-        let proximity = key.last().unwrap();
-        let docids = if debug {
-            format!("{:?}", docids)
-        } else {
-            format!("{:?}", docids.iter().collect::<Vec<_>>())
-        };
-        wtr.write_record(&[&word1, &word2, &proximity.to_string(), &docids])?;
-    }
-
-    Ok(wtr.flush()?)
-}
-
-fn word_prefix_pair_proximities_docids(
-    index: &Index,
-    rtxn: &heed::RoTxn,
-    debug: bool,
-    word1: String,
-    word_prefix: String,
-) -> anyhow::Result<()> {
-    use heed::types::ByteSlice;
-    use milli::RoaringBitmapCodec;
-
-    let stdout = io::stdout();
-    let mut wtr = csv::Writer::from_writer(stdout.lock());
-    wtr.write_record(&["word1", "word_prefix", "proximity", "documents_ids"])?;
-
-    // Create the prefix key with only the pair of words.
-    let mut prefix = Vec::with_capacity(word1.len() + word_prefix.len() + 1);
-    prefix.extend_from_slice(word1.as_bytes());
-    prefix.push(0);
-    prefix.extend_from_slice(word_prefix.as_bytes());
-
-    let db = index.word_prefix_pair_proximity_docids.as_polymorph();
-    let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?;
-    for result in iter {
-        let (key, docids) = result?;
-
-        // Skip keys that are longer than the requested one,
-        // a longer key means that the second word is a prefix of the request word.
-        if key.len() != prefix.len() + 1 {
-            continue;
-        }
-
-        let proximity = key.last().unwrap();
-        let docids = if debug {
-            format!("{:?}", docids)
-        } else {
-            format!("{:?}", docids.iter().collect::<Vec<_>>())
-        };
-        wtr.write_record(&[&word1, &word_prefix, &proximity.to_string(), &docids])?;
-    }
-
-    Ok(wtr.flush()?)
-}
diff --git a/milli/README.md b/milli/README.md
deleted file mode 100644
index 56db42a86..000000000
--- a/milli/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-# Milli
-
-## Fuzzing milli
-
-Currently you can only fuzz the indexation.
-To execute the fuzzer run:
-```
-cargo +nightly fuzz run indexing
-```
-
-To execute the fuzzer on multiple threads you can also run:
-```
-cargo +nightly fuzz run -j4 indexing
-```
-
-Since the fuzzer is going to create a lot of temporary files to let milli index its documents
-I would also recommend executing it on a ramdisk.
-Here is how to set up a ramdisk on Linux:
-```
-sudo mount -t tmpfs none path/to/your/ramdisk
-```
-And then set the [TMPDIR](https://doc.rust-lang.org/std/env/fn.temp_dir.html) environment variable
-to make the fuzzer create its files in it:
-```
-export TMPDIR=path/to/your/ramdisk
-```
diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore
deleted file mode 100644
index ebf2c9395..000000000
--- a/milli/fuzz/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-Cargo.lock
-target/
-
-/corpus/
-/artifacts/
diff --git a/milli/fuzz/Cargo.toml b/milli/fuzz/Cargo.toml
deleted file mode 100644
index 6bf7b2c6d..000000000
--- a/milli/fuzz/Cargo.toml
+++ /dev/null
@@ -1,34 +0,0 @@
-[package]
-name = "milli-fuzz"
-version = "0.0.0"
-authors = ["Automatically generated"]
-publish = false
-edition = "2018"
-
-[package.metadata]
-cargo-fuzz = true
-
-[dependencies]
-arbitrary = "1.0"
-libfuzzer-sys = "0.4"
-serde_json = { version = "1.0.62", features = ["preserve_order"] }
-anyhow = "1.0"
-tempfile = "3.3"
-arbitrary-json = "0.1.0"
-mimalloc = { version = "0.1.29", default-features = false }
-
-[dependencies.milli]
-path = ".."
-
-# Prevent this from interfering with workspaces
-[workspace]
-members = ["."]
-
-[profile.release]
-debug = true
-
-[[bin]]
-name = "indexing"
-path = "fuzz_targets/indexing.rs"
-test = false
-doc = false
diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs
deleted file mode 100644
index 8ce470718..000000000
--- a/milli/fuzz/fuzz_targets/indexing.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-#![no_main]
-
-use std::collections::HashSet;
-use std::io::{BufWriter, Cursor, Read, Seek, Write};
-
-use anyhow::{bail, Result};
-use arbitrary_json::ArbitraryValue;
-use libfuzzer_sys::fuzz_target;
-use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-use milli::heed::EnvOpenOptions;
-use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-use milli::{Index, Object};
-use serde_json::{Map, Value};
-
-#[global_allocator]
-static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-/// Reads JSON from input and writes an obkv batch to writer.
-pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
-    let writer = BufWriter::new(writer);
-    let mut builder = DocumentsBatchBuilder::new(writer);
-
-    let values: Vec<Object> = serde_json::from_reader(input)?;
-    if builder.documents_count() == 0 {
-        bail!("Empty payload");
-    }
-
-    for object in values {
-        builder.append_json_object(&object)?;
-    }
-
-    let count = builder.documents_count();
-    let vector = builder.into_inner()?;
-
-    Ok(count as usize)
-}
-
-fn index_documents(
-    index: &mut milli::Index,
-    documents: DocumentsBatchReader<Cursor<Vec<u8>>>,
-) -> Result<()> {
-    let config = IndexerConfig::default();
-    let mut wtxn = index.write_txn()?;
-
-    let indexing_config = IndexDocumentsConfig::default();
-    let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ())?;
-    builder.add_documents(documents)?;
-    builder.execute().unwrap();
-
-    wtxn.commit()?;
-    Ok(())
-}
-
-fn create_index() -> Result<milli::Index> {
-    let dir = tempfile::tempdir().unwrap();
-    let mut options = EnvOpenOptions::new();
-    options.map_size(10 * 1024 * 1024 * 1024); // 10 GB
-    options.max_readers(1);
-    let index = Index::new(options, dir.path())?;
-
-    let config = IndexerConfig::default();
-    let mut wtxn = index.write_txn().unwrap();
-
-    let mut builder = Settings::new(&mut wtxn, &index, &config);
-
-    let displayed_fields =
-        ["id", "title", "album", "artist", "genre", "country", "released", "duration"]
-            .iter()
-            .map(|s| s.to_string())
-            .collect();
-    builder.set_displayed_fields(displayed_fields);
-
-    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
-    builder.set_searchable_fields(searchable_fields);
-
-    let faceted_fields: HashSet<String> =
-        ["released-timestamp", "duration-float", "genre", "country", "artist"]
-            .iter()
-            .map(|s| s.to_string())
-            .collect();
-    builder.set_filterable_fields(faceted_fields.clone());
-    builder.set_sortable_fields(faceted_fields);
-
-    builder.set_distinct_field("same".to_string());
-
-    builder.execute(|_| ()).unwrap();
-    wtxn.commit().unwrap();
-
-    Ok(index)
-}
-
-fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
-    if let Ok(mut index) = create_index() {
-        for batch in batches {
-            let documents: Vec<Value> =
-                batch.into_iter().map(|value| serde_json::Value::from(value)).collect();
-            let json = Value::Array(documents);
-            let json = serde_json::to_string(&json).unwrap();
-
-            let mut documents = Cursor::new(Vec::new());
-
-            // We ignore all malformed documents
-            if let Ok(_) = read_json(json.as_bytes(), &mut documents) {
-                documents.rewind().unwrap();
-                let documents = DocumentsBatchReader::from_reader(documents).unwrap();
-                // A lot of errors can come out of milli and we don't know which ones are normal or not
-                // so we are only going to look for the unexpected panics.
-                let _ = index_documents(&mut index, documents);
-            }
-        }
-
-        index.prepare_for_closing().wait();
-    }
-});
From b6fe6838d38f8ffc447983bdab22aa453c19d15b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Wed, 21 Sep 2022 10:22:32 +0200
Subject: [PATCH 1634/1889] Remove `helpers` crate

---
 Cargo.toml          |  2 +-
 README.md           |  1 -
 helpers/Cargo.toml  | 15 --------
 helpers/src/main.rs | 84 ---------------------------------------------
 4 files changed, 1 insertion(+), 101 deletions(-)
 delete mode 100644 helpers/Cargo.toml
 delete mode 100644 helpers/src/main.rs

diff --git a/Cargo.toml b/Cargo.toml
index 6a618c381..98e17acd2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [workspace]
 resolver = "2"
-members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "benchmarks", "helpers", "cli"]
+members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", "benchmarks", "cli"]
 default-members = ["milli"]
 
 [profile.dev]
diff --git a/README.md b/README.md
index 93f4b2e6c..d69c43656 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,6 @@ This repository contains crates to quickly debug the engine:
  - The `cli` crate is a simple command-line interface that helps run [flamegraph] on top of it.
  - The `filter-parser` crate contains the parser for the Meilisearch filter syntax.
  - The `flatten-serde-json` crate contains the library that flattens serde-json `Value` objects like Elasticsearch does.
- - The `helpers` crate is only used to do operations on the database.
  - The `json-depth-checker` crate is used to indicate if a JSON must be flattened.
 
 ## How to use it?
diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
deleted file mode 100644
index b1034d092..000000000
--- a/helpers/Cargo.toml
+++ /dev/null
@@ -1,15 +0,0 @@
-[package]
-name = "helpers"
-version = "0.33.4"
-authors = ["Clément Renault "]
-edition = "2018"
-description = "A small tool to do operations on the database"
-publish = false
-
-[dependencies]
-anyhow = "1.0.56"
-byte-unit = { version = "4.0.14", default-features = false, features = ["std"] }
-milli = { path = "../milli" }
-mimalloc = { version = "0.1.29", default-features = false }
-stderrlog = "0.5.1"
-structopt = { version = "0.3.26", default-features = false }
diff --git a/helpers/src/main.rs b/helpers/src/main.rs
deleted file mode 100644
index d1050e937..000000000
--- a/helpers/src/main.rs
+++ /dev/null
@@ -1,84 +0,0 @@
-use std::path::PathBuf;
-
-use byte_unit::Byte;
-use milli::heed::{CompactionOption, Env, EnvOpenOptions};
-use structopt::StructOpt;
-use Command::*;
-
-#[global_allocator]
-static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-#[derive(Debug, StructOpt)]
-/// Some helper commands for milli.
-pub struct Opt {
-    /// The database path where the database is located.
-    /// It is created if it doesn't already exist.
-    #[structopt(long = "db", parse(from_os_str))]
-    database: PathBuf,
-
-    /// The maximum size the database can take on disk. It is recommended to specify
-    /// the whole disk space (value must be a multiple of a page size).
-    #[structopt(long = "db-size", default_value = "100 GiB")]
-    database_size: Byte,
-
-    /// Verbose mode (-v, -vv, -vvv, etc.)
-    #[structopt(short, long, parse(from_occurrences))]
-    verbose: usize,
-
-    #[structopt(subcommand)]
-    command: Command,
-}
-
-#[derive(Debug, StructOpt)]
-enum Command {
-    /// Outputs the main LMDB database to stdout.
-    CopyMainDatabase {
-        /// Whether to enable or not the compaction of the database.
- #[structopt(long, short = "c")] - enable_compaction: bool, - }, -} - -fn main() -> anyhow::Result<()> { - let opt = Opt::from_args(); - - stderrlog::new() - .verbosity(opt.verbose) - .show_level(false) - .timestamp(stderrlog::Timestamp::Off) - .init()?; - - let mut options = EnvOpenOptions::new(); - options.map_size(opt.database_size.get_bytes() as usize); - - // Return an error if the database does not exist. - if !opt.database.exists() { - anyhow::bail!("The database ({}) does not exist.", opt.database.display()); - } - - let env = options.open(opt.database)?; - - match opt.command { - CopyMainDatabase { enable_compaction } => { - use CompactionOption::*; - let compaction = if enable_compaction { Enabled } else { Disabled }; - copy_main_database_to_stdout(env, compaction) - } - } -} - -#[cfg(target_family = "unix")] -fn copy_main_database_to_stdout(env: Env, compaction: CompactionOption) -> anyhow::Result<()> { - use std::os::unix::io::AsRawFd; - - let stdout = std::io::stdout().as_raw_fd(); - unsafe { env.copy_to_fd(stdout, compaction).map_err(Into::into) } -} - -#[cfg(target_family = "windows")] -fn copy_main_database_to_stdout(env: Env, compaction: CompactionOption) -> anyhow::Result<()> { - use std::os::windows::io::AsRawHandle; - - let stdout = std::io::stdout().as_raw_handle(); - unsafe { env.copy_to_fd(stdout, compaction).map_err(Into::into) } -} From 513a38f07ba889c56feda45fdff84ca2a42728f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 10:44:33 +0200 Subject: [PATCH 1635/1889] Remove LTO in release profile Since we can't enable it in Meilisearch, there is no point in having it enabled in milli --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6a618c381..fd92eeb22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,6 @@ opt-level = 3 [profile.release] debug = true codegen-units = 1 -lto = "thin" # Make sure that the build scripts and proc-macros are compiled with # all the optimizations. It speeds up the zip crate that we use in the build.rs. From ed3d87f0614580198b0503340a1bd52cff5f3e98 Mon Sep 17 00:00:00 2001 From: meili-bot <74670311+meili-bot@users.noreply.github.com> Date: Thu, 22 Sep 2022 18:43:42 +0200 Subject: [PATCH 1636/1889] Update CONTRIBUTING.md --- CONTRIBUTING.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index daf2a8892..32f044c40 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,12 +5,23 @@ First, thank you for contributing to Meilisearch! The goal of this document is t Remember that there are many ways to contribute other than writing code: writing [tutorials or blog posts](https://github.com/meilisearch/awesome-meilisearch), improving [the documentation](https://github.com/meilisearch/documentation), submitting [bug reports](https://github.com/meilisearch/milli/issues/new) and [feature requests](https://github.com/meilisearch/product/discussions/categories/feedback-feature-proposal)... ## Table of Contents +- [Hacktoberfest](#hacktoberfest-2022) - [Assumptions](#assumptions) - [How to Contribute](#how-to-contribute) - [Development Workflow](#development-workflow) - [Git Guidelines](#git-guidelines) - [Release Process (for internal team only)](#release-process-for-internal-team-only) +## Hacktoberfest 2022 + +It's [Hacktoberfest month](https://hacktoberfest.com)! 🥳 + +Thanks so much for participating with Meilisearch this year! + +1. 
We will follow the quality standards set by the organizers of Hacktoberfest (see detail on their [website](https://hacktoberfest.digitalocean.com/resources/qualitystandards)). Our reviewers will not consider any PR that doesn’t match that standard. +2. PRs reviews will take place from Monday to Thursday, during usual working hours, CEST time. If you submit outside of these hours, there’s no need to panic; we will get around to your contribution. +3. There will be no issue assignment as we don’t want people to ask to be assigned specific issues and never return, discouraging the volunteer contributors from opening a PR to fix this issue. We take the liberty to choose the PR that best fixes the issue, so we encourage you to get to it as soon as possible and do your best! + ## Assumptions 1. **You're familiar with [GitHub](https://github.com) and the [Pull Requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)(PR) workflow.** From 26efdf4dd904c86761579ad0a3a27f390ec1995d Mon Sep 17 00:00:00 2001 From: meili-bot <74670311+meili-bot@users.noreply.github.com> Date: Thu, 29 Sep 2022 16:00:15 +0200 Subject: [PATCH 1637/1889] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 32f044c40..422076d6a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,6 +22,8 @@ Thanks so much for participating with Meilisearch this year! 2. PRs reviews will take place from Monday to Thursday, during usual working hours, CEST time. If you submit outside of these hours, there’s no need to panic; we will get around to your contribution. 3. There will be no issue assignment as we don’t want people to ask to be assigned specific issues and never return, discouraging the volunteer contributors from opening a PR to fix this issue. We take the liberty to choose the PR that best fixes the issue, so we encourage you to get to it as soon as possible and do your best! +You can check out the longer, more complete guideline documentation [here](https://github.com/meilisearch/.github/blob/main/Hacktoberfest_2022_contributors_guidelines.md). + ## Assumptions 1. **You're familiar with [GitHub](https://github.com) and the [Pull Requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)(PR) workflow.** From 00c02d00f344f86ef93d8fa264f1a60682ea7608 Mon Sep 17 00:00:00 2001 From: vishalsodani Date: Fri, 30 Sep 2022 22:17:06 +0530 Subject: [PATCH 1638/1889] Add missing logging timer to extractors --- milli/src/update/index_documents/extract/extract_geo_points.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 47085144a..5ea079823 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -12,6 +12,7 @@ use crate::{FieldId, InternalError, Result}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. 
 ///
 /// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
+#[logging_timer::time]
 pub fn extract_geo_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,

From bc502ee125b609639146828f2ab8878056900bca Mon Sep 17 00:00:00 2001
From: Anirudh Rowjee
Date: Mon, 3 Oct 2022 09:38:59 +0530
Subject: [PATCH 1639/1889] [docs] Fixed #652, changes spelling of author

---
 README.md | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index d69c43656..b76252622 100644
--- a/README.md
+++ b/README.md
@@ -14,11 +14,12 @@ it is the job of something else above and this is why it is only able
 to process one update at a time.
 
 This repository contains crates to quickly debug the engine:
- - There are benchmarks located in the `benchmarks` crate.
- - The `cli` crate is a simple command-line interface that helps run [flamegraph] on top of it.
- - The `filter-parser` crate contains the parser for the Meilisearch filter syntax.
- - The `flatten-serde-json` crate contains the library that flattens serde-json `Value` objects like Elasticsearch does.
- - The `json-depth-checker` crate is used to indicate if a JSON must be flattened.
+
+- There are benchmarks located in the `benchmarks` crate.
+- The `cli` crate is a simple command-line interface that helps run [flamegraph] on top of it.
+- The `filter-parser` crate contains the parser for the Meilisearch filter syntax.
+- The `flatten-serde-json` crate contains the library that flattens serde-json `Value` objects like Elasticsearch does.
+- The `json-depth-checker` crate is used to indicate if a JSON must be flattened.
 
 ## How to use it?
 
@@ -39,28 +40,28 @@ let content = documents!([
     {
         "id": 2,
         "title": "Prideand Prejudice",
-        "au{hor": "Jane Austin",
+        "author": "Jane Austin",
         "genre": "romance",
         "price$": "3.5$",
     },
     {
         "id": 456,
         "title": "Le Petit Prince",
-        "au{hor": "Antoine de Saint-Exupéry",
+        "author": "Antoine de Saint-Exupéry",
        "genre": "adventure",
         "price$": "10.0$",
     },
     {
         "id": 1,
         "title": "Wonderland",
-        "au{hor": "Lewis Carroll",
+        "author": "Lewis Carroll",
         "genre": "fantasy",
         "price$": "25.99$",
     },
     {
         "id": 4,
         "title": "Harry Potter ing fantasy\0lood Prince",
-        "au{hor": "J. K. Rowling",
+        "author": "J. K. Rowling",
         "genre": "fantasy\0",
     },
 ]);
@@ -91,5 +92,5 @@ We're glad you're thinking about contributing to this repository! Feel free to p
 
 Also, we recommend following the [CONTRIBUTING.md](/CONTRIBUTING.md) to create your PR.
 
-[Meilisearch]: https://github.com/meilisearch/meilisearch
+[meilisearch]: https://github.com/meilisearch/meilisearch
 [flamegraph]: https://github.com/flamegraph-rs/flamegraph

From 7d247353d003d7226cab970b537cfc5abb36d475 Mon Sep 17 00:00:00 2001
From: Anirudh Rowjee
Date: Mon, 3 Oct 2022 09:52:20 +0530
Subject: [PATCH 1640/1889] [docs] contd - fix #652, revert capitalization of
 'Meilisearch'

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b76252622..378751a3f 100644
--- a/README.md
+++ b/README.md
@@ -92,5 +92,5 @@ We're glad you're thinking about contributing to this repository! Feel free to p
 
 Also, we recommend following the [CONTRIBUTING.md](/CONTRIBUTING.md) to create your PR.
-[meilisearch]: https://github.com/meilisearch/meilisearch +[Meilisearch]: https://github.com/meilisearch/meilisearch [flamegraph]: https://github.com/flamegraph-rs/flamegraph From 4348c496569d16d25f8ecd4d1ddead1319a181cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Tue, 4 Oct 2022 11:33:19 +0200 Subject: [PATCH 1641/1889] fix: re-upload milli's logo The logo was deleted with this [commit](https://github.com/meilisearch/milli/commit/add96f921b4046aaec72e2e844eed51ad1daa37d). --- logo-black.svg | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 logo-black.svg diff --git a/logo-black.svg b/logo-black.svg new file mode 100644 index 000000000..2a3fb1d89 --- /dev/null +++ b/logo-black.svg @@ -0,0 +1,6 @@ + + + + + + From aec220ab638dde8278a63d00673760c55ce675c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9rome=20Eertmans?= Date: Tue, 4 Oct 2022 12:20:24 +0200 Subject: [PATCH 1642/1889] chore: move logo to (new) assets folder --- .gitignore | 1 + README.md | 2 +- logo-black.svg => assets/logo-black.svg | 0 3 files changed, 2 insertions(+), 1 deletion(-) rename logo-black.svg => assets/logo-black.svg (100%) diff --git a/.gitignore b/.gitignore index 02c4fcd79..cef7b7b4c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ *.csv *.mmdb *.svg +!*/logo-black.svg # Snapshots ## ... large diff --git a/README.md b/README.md index 378751a3f..948752ee9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

 <p align="center">
-  <img alt="the milli logo" src="logo-black.svg">
+  <img alt="the milli logo" src="assets/logo-black.svg">
 </p>
 
 <p align="center">a concurrent indexer combined with fast and relevant search algorithms</p>
diff --git a/logo-black.svg b/assets/logo-black.svg similarity index 100% rename from logo-black.svg rename to assets/logo-black.svg From 1764a33690d6c354cc1825f379c618e20816d672 Mon Sep 17 00:00:00 2001 From: meili-bot <74670311+meili-bot@users.noreply.github.com> Date: Wed, 5 Oct 2022 19:19:03 +0200 Subject: [PATCH 1643/1889] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 422076d6a..92dde01d1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ It's [Hacktoberfest month](https://hacktoberfest.com)! 🥳 Thanks so much for participating with Meilisearch this year! -1. We will follow the quality standards set by the organizers of Hacktoberfest (see detail on their [website](https://hacktoberfest.digitalocean.com/resources/qualitystandards)). Our reviewers will not consider any PR that doesn’t match that standard. +1. We will follow the quality standards set by the organizers of Hacktoberfest (see detail on their [website](https://hacktoberfest.com/participation/#spam)). Our reviewers will not consider any PR that doesn’t match that standard. 2. PRs reviews will take place from Monday to Thursday, during usual working hours, CEST time. If you submit outside of these hours, there’s no need to panic; we will get around to your contribution. 3. There will be no issue assignment as we don’t want people to ask to be assigned specific issues and never return, discouraging the volunteer contributors from opening a PR to fix this issue. We take the liberty to choose the PR that best fixes the issue, so we encourage you to get to it as soon as possible and do your best! From 762e320c3594c9936fd6dd97a70438e5b3d37ff9 Mon Sep 17 00:00:00 2001 From: msvaljek Date: Fri, 7 Oct 2022 12:59:12 +0200 Subject: [PATCH 1644/1889] Add proximity calculation for the same word --- .../extract/extract_word_pair_proximity_docids.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 6add9d980..9448f0e23 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -127,6 +127,17 @@ fn document_word_positions_into_sorter<'b>( // Advance the head and push it in the heap. 
if let Some(mut head) = ordered_peeked_word_positions.pop() { if let Some(next_position) = head.iter.next() { + let prox = positions_proximity(head.position, next_position); + + if prox > 0 && prox < MAX_DISTANCE { + word_pair_proximity + .entry((head.word.clone(), head.word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } + word_positions_heap.push(PeekedWordPosition { word: head.word, position: next_position, From 5cfb5df31e80dffa6afe4d2ba43e67d7f7730565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 10:23:00 +0200 Subject: [PATCH 1645/1889] Set opt-level to 0 for debug builds But speed up compile times by optimising build dependencies of lindera --- Cargo.toml | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 90756bc1f..3a8eedc72 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,19 +4,29 @@ members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", default-members = ["milli"] [profile.dev] -opt-level = 3 +opt-level = 0 [profile.release] debug = true codegen-units = 1 -# Make sure that the build scripts and proc-macros are compiled with -# all the optimizations. It speeds up the zip crate that we use in the build.rs. -[profile.dev.build-override] +[profile.dev.package.lindera-ipadic-builder] opt-level = 3 -[profile.release.build-override] +[profile.dev.package.encoding] opt-level = 3 -[profile.bench.build-override] +[profile.dev.package.yada] opt-level = 3 -[profile.test.build-override] + +[profile.release.package.lindera-ipadic-builder] +opt-level = 3 +[profile.release.package.encoding] +opt-level = 3 +[profile.release.package.yada] +opt-level = 3 + +[profile.bench.package.lindera-ipadic-builder] +opt-level = 3 +[profile.bench.package.encoding] +opt-level = 3 +[profile.bench.package.yada] opt-level = 3 From 98fc0938239754da99c7d2f9f1e3fec951afe04a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 13:54:06 +0200 Subject: [PATCH 1646/1889] Optimize a few performance sensitive dependencies on debug builds --- Cargo.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 3a8eedc72..34e32a914 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,12 +4,17 @@ members = ["milli", "filter-parser", "flatten-serde-json", "json-depth-checker", default-members = ["milli"] [profile.dev] -opt-level = 0 +opt-level = 0 [profile.release] debug = true codegen-units = 1 +[profile.dev.package.grenad] +opt-level = 3 +[profile.dev.package.roaring] +opt-level = 3 + [profile.dev.package.lindera-ipadic-builder] opt-level = 3 [profile.dev.package.encoding] From 6fbf5dac68d68ee4f013b0a19dea33b75339195a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 Aug 2022 09:32:03 +0200 Subject: [PATCH 1647/1889] Simplify documents! 
macro to reduce compile times

---
 milli/src/documents/mod.rs | 44 +++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index c1580309a..0bdf6600a 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -152,30 +152,33 @@ impl fmt::Display for Error {
 
 impl std::error::Error for Error {}
 
+#[cfg(test)]
+pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
+    let documents = match json {
+        object @ serde_json::Value::Object(_) => vec![object],
+        serde_json::Value::Array(objects) => objects,
+        invalid => {
+            panic!("an array of objects must be specified, {:#?} is not an array", invalid)
+        }
+    };
+    let mut objects = vec![];
+    for document in documents {
+        let object = match document {
+            serde_json::Value::Object(object) => object,
+            invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
+        };
+        objects.push(object);
+    }
+    objects
+}
+
 /// Macro used to generate documents, with the same syntax as `serde_json::json`
 #[cfg(test)]
 macro_rules! documents {
     ($data:tt) => {{
         let documents = serde_json::json!($data);
-        let documents = match documents {
-            object @ serde_json::Value::Object(_) => vec![object],
-            serde_json::Value::Array(objects) => objects,
-            invalid => {
-                panic!("an array of objects must be specified, {:#?} is not an array", invalid)
-            }
-        };
-
-        let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
-        for document in documents {
-            let object = match document {
-                serde_json::Value::Object(object) => object,
-                invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
-            };
-            builder.append_json_object(&object).unwrap();
-        }
-
-        let vector = builder.into_inner().unwrap();
-        crate::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
+        let documents = $crate::documents::objects_from_json_value(documents);
+        $crate::documents::documents_batch_reader_from_objects(documents)
     }};
 }
 
@@ -187,7 +190,8 @@ pub fn documents_batch_reader_from_objects(
     for object in objects {
         builder.append_json_object(&object).unwrap();
     }
-    DocumentsBatchReader::from_reader(std::io::Cursor::new(builder.into_inner().unwrap())).unwrap()
+    let vector = builder.into_inner().unwrap();
+    DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
 }
 
 #[cfg(test)]
From 6fbf5dac68d68ee4f013b0a19dea33b75339195a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 16 Aug 2022 20:02:46 +0200
Subject: [PATCH 1648/1889] Make milli's default features optional in other
 executable targets

---
 benchmarks/Cargo.toml | 5 ++++-
 cli/Cargo.toml        | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index 9c1e83663..87c567de9 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -7,7 +7,7 @@ publish = false
 [dependencies]
 anyhow = "1.0.56"
 csv = "1.1.6"
-milli = { path = "../milli" }
+milli = { path = "../milli", default-features = false }
 mimalloc = { version = "0.1.29", default-features = false }
 serde_json = { version = "1.0.79", features = ["preserve_order"] }
 
@@ -24,6 +24,9 @@ convert_case = "0.5.0"
 flate2 = "1.0.22"
 reqwest = { version = "0.11.9", features = ["blocking", "rustls-tls"], default-features = false }
 
+[features]
+default = ["milli/default"]
+
 [[bench]]
 name = "search_songs"
 harness = false
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index 3e5df29c5..f1f5a6beb 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -12,9 +12,12 @@ color-eyre = "0.6.1"
 csv = "1.1.6"
 eyre = "0.6.7"
 indicatif = "0.16.2"
-milli = { path = "../milli" }
+milli = { path = "../milli", default-features = false }
 mimalloc = { version = "0.1.29", default-features = false }
 serde = "1.0.136"
 serde_json = "1.0.79"
 stderrlog = "0.5.1"
 structopt = "0.3.26"
+
+[features]
+default = ["milli/default"]

From 7f9680f0a0e2c5113ec29500066af2d47c3275f8 Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni
Date: Wed, 12 Oct 2022 13:18:23 +0530
Subject: [PATCH 1649/1889] Enhance word splitting strategy

---
 milli/src/search/query_tree.rs | 79 +++++++++++++++++++++++++---------
 1 file changed, 59 insertions(+), 20 deletions(-)
 mode change 100644 => 100755 milli/src/search/query_tree.rs

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
old mode 100644
new mode 100755
index 1c60e41f7..b54fc5c9c
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::cmp::max;
-use std::{cmp, fmt, mem};
+use std::{fmt, mem};
 
 use charabia::classifier::ClassifiedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
@@ -10,7 +10,7 @@ use slice_group_by::GroupBy;
 
 use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
 use crate::search::TermsMatchingStrategy;
-use crate::{Index, MatchingWords, Result};
+use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result};
 
 type IsOptionalWord = bool;
 type IsPrefix = bool;
@@ -146,6 +146,7 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -156,6 +157,12 @@ trait Context {
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
+    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+        match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
+            Some(rb) => Ok(Some(rb.len())),
+            None => Ok(None),
+        }
+    }
 }
 
 /// The query tree builder is the interface to build a query tree.
@@ -173,6 +180,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
+    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
+        self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
+    }
+
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         self.index.words_synonyms(self.rtxn, words)
     }
@@ -190,6 +201,11 @@ impl<'a> Context for QueryTreeBuilder<'a> {
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
         self.exact_words.as_ref()
     }
+
+    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+        let key = (left_word, right_word, proximity);
+        self.index.word_pair_proximity_docids.remap_data_type::<CboRoaringBitmapLenCodec>().get(&self.rtxn, &key)
+    }
 }
 
 impl<'a> QueryTreeBuilder<'a> {
@@ -274,12 +290,10 @@ fn split_best_frequency<'a>(
     for (i, _) in chars {
         let (left, right) = word.split_at(i);
 
-        let left_freq = ctx.word_documents_count(left)?.unwrap_or(0);
-        let right_freq = ctx.word_documents_count(right)?.unwrap_or(0);
+        let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0);
 
-        let min_freq = cmp::min(left_freq, right_freq);
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
+        if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) {
+            best = Some((pair_freq, left, right));
         }
     }
 
@@ -824,6 +838,11 @@ mod test {
             Ok(self.postings.get(word).cloned())
         }
 
+        fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result<Option<RoaringBitmap>> {
+            let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
+            Ok(bitmap.cloned())
+        }
+
         fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())
@@ -881,19 +900,22 @@ mod test {
                 ],
             },
             postings: hashmap!
{ - String::from("hello") => random_postings(rng, 1500), - String::from("hi") => random_postings(rng, 4000), - String::from("word") => random_postings(rng, 2500), - String::from("split") => random_postings(rng, 400), - String::from("ngrams") => random_postings(rng, 1400), - String::from("world") => random_postings(rng, 15_000), - String::from("earth") => random_postings(rng, 8000), - String::from("2021") => random_postings(rng, 100), - String::from("2020") => random_postings(rng, 500), - String::from("is") => random_postings(rng, 50_000), - String::from("this") => random_postings(rng, 50_000), - String::from("good") => random_postings(rng, 1250), - String::from("morning") => random_postings(rng, 125), + String::from("hello") => random_postings(rng, 1500), + String::from("hi") => random_postings(rng, 4000), + String::from("word") => random_postings(rng, 2500), + String::from("split") => random_postings(rng, 400), + String::from("ngrams") => random_postings(rng, 1400), + String::from("world") => random_postings(rng, 15_000), + String::from("earth") => random_postings(rng, 8000), + String::from("2021") => random_postings(rng, 100), + String::from("2020") => random_postings(rng, 500), + String::from("is") => random_postings(rng, 50_000), + String::from("this") => random_postings(rng, 50_000), + String::from("good") => random_postings(rng, 1250), + String::from("morning") => random_postings(rng, 125), + String::from("word split") => random_postings(rng, 5000), + String::from("quick brownfox") => random_postings(rng, 7000), + String::from("quickbrown fox") => random_postings(rng, 8000), }, exact_words, } @@ -1041,6 +1063,23 @@ mod test { "###); } + #[test] + fn word_split_choose_pair_with_max_freq() { + let query = "quickbrownfox"; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + PHRASE ["quickbrown", "fox"] + PrefixTolerant { word: "quickbrownfox", max typo: 2 } + "###); + } + #[test] fn phrase() { let query = "\"hey friends\" \" \" \"wooop"; From 63e79a9039feaa1caa117d2aea3275bbbd722ed0 Mon Sep 17 00:00:00 2001 From: Akshay Kulkarni Date: Wed, 12 Oct 2022 13:36:48 +0530 Subject: [PATCH 1650/1889] update comment --- milli/src/search/query_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index b54fc5c9c..70227b8f9 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -279,7 +279,7 @@ impl<'a> QueryTreeBuilder<'a> { } } -/// Split the word depending on the frequency of subwords in the database documents. +/// Split the word depending on the frequency of pairs near together in the database documents. 
 fn split_best_frequency<'a>(
     ctx: &impl Context,
     word: &'a str,

From 8c9245149e32468e57183066536389567db318ca Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni
Date: Wed, 12 Oct 2022 15:27:56 +0530
Subject: [PATCH 1651/1889] format file

---
 milli/src/search/query_tree.rs | 40 +++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 70227b8f9..43d903d16 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -146,7 +146,12 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
+    fn word_pair_proximity_docids(
+        &self,
+        right_word: &str,
+        left_word: &str,
+        proximity: u8,
+    ) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
@@ -157,7 +162,12 @@ trait Context {
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
-    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+    fn word_pair_frequency(
+        &self,
+        left_word: &str,
+        right_word: &str,
+        proximity: u8,
+    ) -> heed::Result<Option<u64>> {
         match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
             Some(rb) => Ok(Some(rb.len())),
             None => Ok(None),
@@ -180,7 +190,12 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
-    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
+    fn word_pair_proximity_docids(
+        &self,
+        right_word: &str,
+        left_word: &str,
+        proximity: u8,
+    ) -> heed::Result<Option<RoaringBitmap>> {
         self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
     }
 
@@ -202,9 +217,17 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.exact_words.as_ref()
     }
 
-    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
+    fn word_pair_frequency(
+        &self,
+        left_word: &str,
+        right_word: &str,
+        proximity: u8,
+    ) -> heed::Result<Option<u64>> {
         let key = (left_word, right_word, proximity);
-        self.index.word_pair_proximity_docids.remap_data_type::<CboRoaringBitmapLenCodec>().get(&self.rtxn, &key)
+        self.index
+            .word_pair_proximity_docids
+            .remap_data_type::<CboRoaringBitmapLenCodec>()
+            .get(&self.rtxn, &key)
     }
 }
 
@@ -838,7 +861,12 @@ mod test {
             Ok(self.postings.get(word).cloned())
         }
 
-        fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result<Option<RoaringBitmap>> {
+        fn word_pair_proximity_docids(
+            &self,
+            right_word: &str,
+            left_word: &str,
+            _: u8,
+        ) -> heed::Result<Option<RoaringBitmap>> {
             let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
             Ok(bitmap.cloned())
         }
From 6cb8b46900492dd23b229e33604263839a32f06e Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni
Date: Thu, 13 Oct 2022 12:43:11 +0530
Subject: [PATCH 1652/1889] use word_pair_frequency and remove
 word_documents_count

---
 milli/src/search/query_tree.rs | 50 ++++++----------------------------
 1 file changed, 8 insertions(+), 42 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 43d903d16..4ed1e9fbd 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -146,19 +146,7 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
-    fn word_pair_proximity_docids(
-        &self,
-        right_word: &str,
-        left_word: &str,
-        proximity: u8,
-    ) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
-    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
-        match self.word_docids(word)? {
-            Some(rb) => Ok(Some(rb.len())),
-            None => Ok(None),
-        }
-    }
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
@@ -166,9 +154,9 @@ trait Context {
         &self,
         left_word: &str,
         right_word: &str,
-        proximity: u8,
+        _proximity: u8,
     ) -> heed::Result<Option<u64>> {
-        match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
+        match self.word_docids(&format!("{} {}", left_word, right_word))? {
             Some(rb) => Ok(Some(rb.len())),
             None => Ok(None),
@@ -176,23 +166,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.word_docids.get(self.rtxn, word)
     }
 
-    fn word_pair_proximity_docids(
-        &self,
-        right_word: &str,
-        left_word: &str,
-        proximity: u8,
-    ) -> heed::Result<Option<RoaringBitmap>> {
-        self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
-    }
-
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         self.index.words_synonyms(self.rtxn, words)
     }
 
-    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
-        self.index.word_documents_count(self.rtxn, word)
-    }
-
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
         let one = self.index.min_word_len_one_typo(&self.rtxn)?;
         let two = self.index.min_word_len_two_typos(&self.rtxn)?;
@@ -306,7 +281,7 @@ impl<'a> QueryTreeBuilder<'a> {
 fn split_best_frequency<'a>(
     ctx: &impl Context,
     word: &'a str,
-) -> heed::Result<Option<(&'a str, &'a str)>> {
+) -> heed::Result<Option<(u64, &'a str, &'a str)>> {
     let chars = word.char_indices().skip(1);
     let mut best = None;
 
@@ -320,7 +295,7 @@ fn split_best_frequency<'a>(
         }
     }
 
-    Ok(best.map(|(_, left, right)| (left, right)))
+    Ok(best)
 }
 
 #[derive(Clone)]
@@ -389,7 +364,7 @@ fn create_query_tree(
             // 4. wrap all in an OR operation
             PrimitiveQueryPart::Word(word, prefix) => {
                 let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
-                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
                     children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
                 }
                 let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
@@ -535,7 +510,8 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
+                        let (pair_freq, _, _) = split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
+                        pair_freq
                    }
                     _ => unreachable!(),
                 })
@@ -582,7 +558,7 @@ fn create_matching_words(
                     }
                 }
 
-                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
                     let left = MatchingWord::new(left.to_string(), 0, false);
                     let right = MatchingWord::new(right.to_string(), 0, false);
                     matching_words.push((vec![left, right], vec![id]));
@@ -861,16 +838,6 @@ mod test {
             Ok(self.postings.get(word).cloned())
        }
 
-        fn word_pair_proximity_docids(
-            &self,
-            right_word: &str,
-            left_word: &str,
-            _: u8,
-        ) -> heed::Result<Option<RoaringBitmap>> {
-            let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
-            Ok(bitmap.cloned())
-        }
-
         fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())
From ff8b2d4422f8b5363878dc065738935ac080ff42 Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni
Date: Thu, 13 Oct 2022 12:44:08 +0530
Subject: [PATCH 1653/1889] formatting

---
 milli/src/search/query_tree.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 4ed1e9fbd..723192d20 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -510,7 +510,8 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        let (pair_freq, _, _) = split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
+                        let (pair_freq, _, _) =
+                            split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
                         pair_freq
                     }
                     _ => unreachable!(),
                 })
From 32f825d442f3b1ade6c6fe70c1f161db267f3abb Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni
Date: Thu, 13 Oct 2022 12:57:50 +0530
Subject: [PATCH 1654/1889] move default implementation of word_pair_frequency
 to TestContext

---
 milli/src/search/query_tree.rs | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 723192d20..8ab5f81a4 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -155,12 +155,7 @@ trait Context {
         &self,
         left_word: &str,
         right_word: &str,
         _proximity: u8,
-    ) -> heed::Result<Option<u64>> {
-        match self.word_docids(&format!("{} {}", left_word, right_word))? {
-            Some(rb) => Ok(Some(rb.len())),
-            None => Ok(None),
-        }
-    }
+    ) -> heed::Result<Option<u64>>;
 }
 
 /// The query tree builder is the interface to build a query tree.
@@ -850,6 +845,18 @@ mod test {
         fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
             self.exact_words.as_ref()
         }
+
+        fn word_pair_frequency(
+            &self,
+            left_word: &str,
+            right_word: &str,
+            _proximity: u8,
+        ) -> heed::Result<Option<u64>> {
+            match self.word_docids(&format!("{} {}", left_word, right_word))? {
+                Some(rb) => Ok(Some(rb.len())),
+                None => Ok(None),
+            }
+        }
     }
 
     impl Default for TestContext {
From 8195fc6141fb8da0a63b6a12b92f2132ba1c8640 Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni
Date: Thu, 13 Oct 2022 13:14:27 +0530
Subject: [PATCH 1655/1889] revert removal of word_documents_count method

---
 milli/src/search/query_tree.rs | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 8ab5f81a4..94c28b3f9 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -147,6 +147,12 @@ impl fmt::Debug for Query {
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
+    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
+        match self.word_docids(word)? {
+            Some(rb) => Ok(Some(rb.len())),
+            None => Ok(None),
+        }
+    }
     /// Returns the minimum word len for 1 and 2 typos.
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
@@ -276,7 +282,7 @@ impl<'a> QueryTreeBuilder<'a> {
 fn split_best_frequency<'a>(
     ctx: &impl Context,
     word: &'a str,
-) -> heed::Result<Option<(u64, &'a str, &'a str)>> {
+) -> heed::Result<Option<(&'a str, &'a str)>> {
     let chars = word.char_indices().skip(1);
     let mut best = None;
 
@@ -290,7 +296,7 @@ fn split_best_frequency<'a>(
         }
     }
 
-    Ok(best)
+    Ok(best.map(|(_, left, right)| (left, right)))
 }
 
 #[derive(Clone)]
@@ -359,7 +365,7 @@ fn create_query_tree(
             // 4. wrap all in an OR operation
             PrimitiveQueryPart::Word(word, prefix) => {
                 let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
-                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
                     children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
                 }
                 let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
@@ -505,9 +511,7 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        let (pair_freq, _, _) =
-                            split_best_frequency(ctx, s).unwrap_or_default().unwrap_or_default();
+                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value());
                     }
                     _ => unreachable!(),
                 })
@@ -554,7 +558,7 @@ fn create_matching_words(
                     }
                 }
 
-                if let Some((_, left, right)) = split_best_frequency(ctx, &word)? {
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
                     let left = MatchingWord::new(left.to_string(), 0, false);
                     let right = MatchingWord::new(right.to_string(), 0, false);
                     matching_words.push((vec![left, right], vec![id]));
From 85f30283173968628d343ac6b69f120c343ae99b Mon Sep 17 00:00:00 2001
From: Akshay Kulkarni
Date: Thu, 13 Oct 2022 13:21:59 +0530
Subject: [PATCH 1656/1889] remove underscore and introduce back
 word_documents_count

---
 milli/src/search/query_tree.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 94c28b3f9..080f89080 100755
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -160,7 +160,7 @@ trait Context {
         &self,
         left_word: &str,
         right_word: &str,
-        _proximity: u8,
+        proximity: u8,
     ) -> heed::Result<Option<u64>>;
 }
 
@@ -183,6 +183,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
         self.index.words_synonyms(self.rtxn, words)
     }
 
+    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
+        self.index.word_documents_count(self.rtxn, word)
+    }
+
     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
         let one = self.index.min_word_len_one_typo(&self.rtxn)?;
         let two = self.index.min_word_len_two_typos(&self.rtxn)?;
@@ -511,7 +515,7 @@ fn create_query_tree(
                 .filter(|(_, part)| !part.is_phrase())
                 .max_by_key(|(_, part)| match part {
                     PrimitiveQueryPart::Word(s, _) => {
-                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value());
+                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
                     }
                     _ => unreachable!(),
                 })
From 59fe1e8efab153771ba3ca285f3f00fae83d60f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?=
Date: Thu, 13 Oct 2022 13:46:18 +0200
Subject: [PATCH 1657/1889] Update CONTRIBUTING.md

---
 CONTRIBUTING.md | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 92dde01d1..131b7ad3b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -44,14 +44,6 @@ You can check out the longer, more complete guideline documentation [here](https
 
 ## Development Workflow
 
-### Setup and run
-
-```bash
-cargo run --release
-```
-
-We recommend using the `--release` flag to test the full performance.
-
 ### Test
 
 ```bash
From beb987d3d1d9035b92d83a68a1c37f8bbc50f87b Mon Sep 17 00:00:00 2001
From: Ewan Higgs
Date: Thu, 13 Oct 2022 22:02:54 +0200
Subject: [PATCH 1658/1889] Fixing piles of clippy errors.

Most of these are calling clone when the struct supports Copy.

Many are using & and &mut on `self` when the function they are called
from already has an immutable or mutable borrow so this isn't needed.

I tried to stay away from actual changes or places where I'd have to
name fresh variables.
---
 milli/src/lib.rs                              |  2 +-
 milli/src/search/matches/mod.rs               |  6 +-
 milli/src/search/mod.rs                       | 57 ++++++++---------
 .../extract/extract_fid_word_count_docids.rs  |  2 +-
 .../extract/extract_geo_points.rs             |  2 +-
 .../extract/extract_word_docids.rs            |  2 +-
 .../extract_word_pair_proximity_docids.rs     |  4 +-
 .../extract/extract_word_position_docids.rs   |  2 +-
 .../src/update/index_documents/extract/mod.rs | 26 ++++----
 .../index_documents/helpers/grenad_helpers.rs | 11 ++--
 .../helpers/merge_functions.rs                |  2 +-
 milli/src/update/index_documents/mod.rs       | 41 +++++--------
 milli/src/update/index_documents/transform.rs | 49 ++++++++-------
 .../src/update/index_documents/typed_chunk.rs |  4 +-
 milli/src/update/settings.rs                  | 61 +++++++++----------
 milli/src/update/word_prefix_docids.rs        |  4 +-
 .../word_prefix_pair_proximity_docids.rs      | 17 +++---
 milli/src/update/words_prefixes_fst.rs        |  2 +-
 18 files changed, 137 insertions(+), 157 deletions(-)

diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index 517d28ccc..e73db1d55 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -195,7 +195,7 @@ pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] {
 /// Returns `true` if the field match one of the faceted fields.
 /// See the function [`is_faceted_by`] below to see what “matching” means.
 pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator<Item = impl AsRef<str>>) -> bool {
-    faceted_fields.into_iter().find(|facet| is_faceted_by(field, facet.as_ref())).is_some()
+    faceted_fields.into_iter().any(|facet| is_faceted_by(field, facet.as_ref()))
 }
 
 /// Returns `true` if the field match the facet.
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index 2697405be..53101a065 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -7,9 +7,9 @@ use serde::Serialize;
 
 pub mod matching_words;
 
-const DEFAULT_CROP_MARKER: &'static str = "…";
-const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>";
-const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>";
+const DEFAULT_CROP_MARKER: &str = "…";
+const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
+const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
 
 /// Structure used to build a Matcher allowing to customize formatting tags.
pub struct MatcherBuilder<'a, A> { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7145c1445..93bb41580 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -333,39 +333,36 @@ pub fn word_derivations<'c>( } else if fst.contains(word) { derived_words.push((word.to_string(), 0)); } + } else if max_typo == 1 { + let dfa = build_dfa(word, 1, is_prefix); + let starts = StartsWith(Str::new(get_first(word))); + let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + + while let Some((word, state)) = stream.next() { + let word = std::str::from_utf8(word)?; + let d = dfa.distance(state.1); + derived_words.push((word.to_string(), d.to_u8())); + } } else { - if max_typo == 1 { - let dfa = build_dfa(word, 1, is_prefix); - let starts = StartsWith(Str::new(get_first(word))); - let mut stream = - fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + let starts = StartsWith(Str::new(get_first(word))); + let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); + let second_dfa = build_dfa(word, 2, is_prefix); + let second = Intersection(&second_dfa, &starts); + let automaton = Union(first, &second); - while let Some((word, state)) = stream.next() { - let word = std::str::from_utf8(word)?; - let d = dfa.distance(state.1); - derived_words.push((word.to_string(), d.to_u8())); - } - } else { - let starts = StartsWith(Str::new(get_first(word))); - let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); - let second_dfa = build_dfa(word, 2, is_prefix); - let second = Intersection(&second_dfa, &starts); - let automaton = Union(first, &second); + let mut stream = fst.search_with_state(automaton).into_stream(); - let mut stream = fst.search_with_state(automaton).into_stream(); - - while let Some((found_word, state)) = stream.next() { - let found_word = std::str::from_utf8(found_word)?; - // in the case the typo is on the first letter, we know the number of typos - // is two - if get_first(found_word) != get_first(word) { - derived_words.push((found_word.to_string(), 2)); - } else { - // Else, we know that it is the second dfa that matched and compute the - // correct distance - let d = second_dfa.distance((state.1).0); - derived_words.push((found_word.to_string(), d.to_u8())); - } + while let Some((found_word, state)) = stream.next() { + let found_word = std::str::from_utf8(found_word)?; + // in the case the typo is on the first letter, we know the number of typos + // is two + if get_first(found_word) != get_first(word) { + derived_words.push((found_word.to_string(), 2)); + } else { + // Else, we know that it is the second dfa that matched and compute the + // correct distance + let d = second_dfa.distance((state.1).0); + derived_words.push((found_word.to_string(), d.to_u8())); } } } diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index d425e8d14..315ebdf0c 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -40,7 +40,7 @@ pub fn extract_fid_word_count_docids( let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ let (document_id_bytes, _word_bytes) = try_split_array_at(key) - .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let curr_document_id = *current_document_id.get_or_insert(document_id); diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 5ea079823..c75b60c60 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -60,5 +60,5 @@ pub fn extract_geo_points( } } - Ok(writer_into_reader(writer)?) + writer_into_reader(writer) } diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 4b965e9a8..da59f9dde 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -51,7 +51,7 @@ pub fn extract_word_docids( let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, positions)) = cursor.move_on_next()? { let (document_id_bytes, word_bytes) = try_split_array_at(key) - .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let bitmap = RoaringBitmap::from_iter(Some(document_id)); diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 9448f0e23..6c0919e14 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -39,7 +39,7 @@ pub fn extract_word_pair_proximity_docids( let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { let (document_id_bytes, word_bytes) = try_split_array_at(key) - .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); let word = str::from_utf8(word_bytes)?; @@ -81,7 +81,7 @@ pub fn extract_word_pair_proximity_docids( /// /// This list is used by the engine to calculate the documents containing words that are /// close to each other. -fn document_word_positions_into_sorter<'b>( +fn document_word_positions_into_sorter( document_id: DocumentId, mut word_positions_heap: BinaryHeap>>, word_pair_proximity_docids_sorter: &mut grenad::Sorter, diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index c1661072a..d4a3eda2c 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -33,7 +33,7 @@ pub fn extract_word_position_docids( let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ let (document_id_bytes, word_bytes) = try_split_array_at(key) - .ok_or_else(|| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = DocumentId::from_be_bytes(document_id_bytes); for position in read_u32_ne_bytes(value) { diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 157886e63..50cc04610 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -96,7 +96,7 @@ pub(crate) fn data_from_obkv_documents( spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), - indexer.clone(), + indexer, lmdb_writer_sx.clone(), extract_word_pair_proximity_docids, merge_cbo_roaring_bitmaps, @@ -106,7 +106,7 @@ pub(crate) fn data_from_obkv_documents( spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), - indexer.clone(), + indexer, lmdb_writer_sx.clone(), extract_fid_word_count_docids, merge_cbo_roaring_bitmaps, @@ -116,7 +116,7 @@ pub(crate) fn data_from_obkv_documents( spawn_extraction_task::<_, _, Vec<(grenad::Reader, grenad::Reader)>>( docid_word_positions_chunks.clone(), - indexer.clone(), + indexer, lmdb_writer_sx.clone(), move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), merge_roaring_bitmaps, @@ -128,8 +128,8 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - docid_word_positions_chunks.clone(), - indexer.clone(), + docid_word_positions_chunks, + indexer, lmdb_writer_sx.clone(), extract_word_position_docids, merge_cbo_roaring_bitmaps, @@ -138,8 +138,8 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - docid_fid_facet_strings_chunks.clone(), - indexer.clone(), + docid_fid_facet_strings_chunks, + indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, keep_first_prefix_value_merge_roaring_bitmaps, @@ -148,8 +148,8 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - docid_fid_facet_numbers_chunks.clone(), - indexer.clone(), + docid_fid_facet_numbers_chunks, + indexer, lmdb_writer_sx.clone(), extract_facet_number_docids, merge_cbo_roaring_bitmaps, @@ -183,12 +183,12 @@ fn spawn_extraction_task( { rayon::spawn(move || { let chunks: Result = - chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); + chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect(); rayon::spawn(move || match chunks { Ok(chunks) => { debug!("merge {} database", name); let reader = chunks.merge(merge_fn, &indexer); - let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))); + let _ = lmdb_writer_sx.send(reader.map(serialize_fn)); } Err(e) => { let _ = lmdb_writer_sx.send(Err(e)); @@ -255,7 +255,7 @@ fn send_and_extract_flattened_documents_data( || { let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( flattened_documents_chunk.clone(), - indexer.clone(), + indexer, searchable_fields, stop_words.as_ref(), max_positions_per_attributes, @@ -279,7 +279,7 @@ fn send_and_extract_flattened_documents_data( fid_facet_exists_docids_chunk, ) = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), - indexer.clone(), + indexer, faceted_fields, )?; diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 202e689f8..e18cb4e16 100644 --- 
a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -61,7 +61,7 @@ pub fn sorter_into_reader( ); sorter.write_into_stream_writer(&mut writer)?; - Ok(writer_into_reader(writer)?) + writer_into_reader(writer) } pub fn writer_into_reader(writer: grenad::Writer) -> Result> { @@ -134,7 +134,7 @@ impl MergerBuilder { ); merger.write_into_stream_writer(&mut writer)?; - Ok(writer_into_reader(writer)?) + writer_into_reader(writer) } } @@ -180,7 +180,6 @@ pub fn grenad_obkv_into_chunks( let mut continue_reading = true; let mut cursor = reader.into_cursor()?; - let indexer_clone = indexer.clone(); let mut transposer = move || { if !continue_reading { return Ok(None); @@ -188,8 +187,8 @@ pub fn grenad_obkv_into_chunks( let mut current_chunk_size = 0u64; let mut obkv_documents = create_writer( - indexer_clone.chunk_compression_type, - indexer_clone.chunk_compression_level, + indexer.chunk_compression_type, + indexer.chunk_compression_level, tempfile::tempfile()?, ); @@ -224,7 +223,7 @@ pub fn write_into_lmdb_database( match iter.next().transpose()? { Some((key, old_val)) if key == k => { let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; - let val = merge(k, &vals)?; + let val = merge(k, vals)?; // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(k, &val)? }; } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index c5385e347..dbe3c0344 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -88,7 +88,7 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { Ok(obkvs - .into_iter() + .iter() .cloned() .reduce(|acc, current| { let first = obkv::KvReader::new(&acc); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e0eefe07b..13429c0d6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -106,7 +106,7 @@ where ) -> Result> { let transform = Some(Transform::new( wtxn, - &index, + index, indexer_config, config.update_method, config.autogenerate_docids, @@ -291,18 +291,12 @@ where // Run extraction pipeline in parallel. 
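The `spawn_extraction_task` call sites cleaned up in the extract/mod.rs hunks above all share one fan-out shape: a rayon task maps an extraction function over document chunks in parallel, a second task merges the per-chunk results, and the outcome is sent over a channel to the single LMDB writer. A minimal sketch of that shape, with summing squares standing in for a real extraction function (the names and types are illustrative, not milli's):

```rust
use std::sync::mpsc;

use rayon::prelude::*;

// One "extraction" task: square every number of every chunk in parallel,
// then merge the partial results and forward them to the writer channel.
fn spawn_sum_of_squares_task(chunks: Vec<Vec<u64>>, sender: mpsc::Sender<Result<u64, String>>) {
    rayon::spawn(move || {
        let parts: Result<Vec<u64>, String> = chunks
            .into_par_iter()
            .map(|chunk| Ok(chunk.into_iter().map(|x| x * x).sum::<u64>()))
            .collect();
        // A second task merges the per-chunk results, mirroring how each
        // extraction funnels into the single LMDB writer channel.
        rayon::spawn(move || {
            let _ = match parts {
                Ok(parts) => sender.send(Ok(parts.into_iter().sum())),
                Err(e) => sender.send(Err(e)),
            };
        });
    });
}

fn main() {
    let (sender, receiver) = mpsc::channel();
    spawn_sum_of_squares_task(vec![vec![1, 2], vec![3]], sender);
    assert_eq!(receiver.recv().unwrap(), Ok(14));
}
```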
pool.install(|| { // split obkv file into several chunks - let original_chunk_iter = grenad_obkv_into_chunks( - original_documents, - pool_params.clone(), - documents_chunk_size, - ); + let original_chunk_iter = + grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); // split obkv file into several chunks - let flattened_chunk_iter = grenad_obkv_into_chunks( - flattened_documents, - pool_params.clone(), - documents_chunk_size, - ); + let flattened_chunk_iter = + grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); let result = original_chunk_iter.and_then(|original_chunk| { let flattened_chunk = flattened_chunk_iter?; @@ -341,7 +335,7 @@ where } let index_documents_ids = self.index.documents_ids(self.wtxn)?; - let index_is_empty = index_documents_ids.len() == 0; + let index_is_empty = index_documents_ids.is_empty(); let mut final_documents_ids = RoaringBitmap::new(); let mut word_pair_proximity_docids = None; let mut word_position_docids = None; @@ -378,7 +372,7 @@ where }; let (docids, is_merged_database) = - write_typed_chunk_into_index(typed_chunk, &self.index, self.wtxn, index_is_empty)?; + write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?; if !docids.is_empty() { final_documents_ids |= docids; let documents_seen_count = final_documents_ids.len(); @@ -475,7 +469,7 @@ where ); let common_prefix_fst_words: Vec<_> = common_prefix_fst_words .as_slice() - .linear_group_by_key(|x| x.chars().nth(0).unwrap()) + .linear_group_by_key(|x| x.chars().next().unwrap()) .collect(); // We retrieve the newly added words between the previous and new prefix word fst. @@ -498,9 +492,9 @@ where execute_word_prefix_docids( self.wtxn, word_docids, - self.index.word_docids.clone(), - self.index.word_prefix_docids.clone(), - &self.indexer_config, + self.index.word_docids, + self.index.word_prefix_docids, + self.indexer_config, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, @@ -511,9 +505,9 @@ where execute_word_prefix_docids( self.wtxn, exact_word_docids, - self.index.exact_word_docids.clone(), - self.index.exact_word_prefix_docids.clone(), - &self.indexer_config, + self.index.exact_word_docids, + self.index.exact_word_prefix_docids, + self.indexer_config, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, @@ -595,12 +589,7 @@ fn execute_word_prefix_docids( builder.chunk_compression_level = indexer_config.chunk_compression_level; builder.max_nb_chunks = indexer_config.max_nb_chunks; builder.max_memory = indexer_config.max_memory; - builder.execute( - cursor, - &new_prefix_fst_words, - &common_prefix_fst_words, - &del_prefix_fst_words, - )?; + builder.execute(cursor, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?; Ok(()) } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f52d5c7af..3786c5bcb 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -72,10 +72,10 @@ fn create_fields_mapping( // we sort by id here to ensure a deterministic mapping of the fields, that preserves // the original ordering. 
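The `&name` → `name` fixes just below are clippy's `needless_borrow` lint at work: the values are already references, and deref coercion turns a `&String` into a `&str` without an extra `&`. A small sketch with a hypothetical lookup function:

```rust
// A hypothetical lookup standing in for something like `FieldsIdsMap::id`.
fn field_id(name: &str) -> Option<u32> {
    if name == "title" {
        Some(7)
    } else {
        None
    }
}

fn main() {
    let name: &String = &String::from("title");
    // Passing `name` directly is enough: `&String` deref-coerces to `&str`.
    // `field_id(&name)` would build a `&&String` that the compiler has to
    // peel back off, which is exactly what `needless_borrow` complains about.
    assert_eq!(field_id(name), Some(7));
}
```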
.sorted_by_key(|(&id, _)| id) - .map(|(field, name)| match index_field_map.id(&name) { + .map(|(field, name)| match index_field_map.id(name) { Some(id) => Ok((*field, id)), None => index_field_map - .insert(&name) + .insert(name) .ok_or(Error::UserError(UserError::AttributeLimitReached)) .map(|id| (*field, id)), }) @@ -192,7 +192,7 @@ impl<'a, 'i> Transform<'a, 'i> { // Insertion in a obkv need to be done with keys ordered. For now they are ordered // according to the document addition key order, so we sort it according to the // fieldids map keys order. - field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2)); + field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); // Build the new obkv document. let mut writer = obkv::KvWriter::new(&mut obkv_buffer); @@ -202,24 +202,23 @@ impl<'a, 'i> Transform<'a, 'i> { let mut original_docid = None; - let docid = - match self.new_external_documents_ids_builder.entry(external_id.clone().into()) { - Entry::Occupied(entry) => *entry.get() as u32, - Entry::Vacant(entry) => { - // If the document was already in the db we mark it as a replaced document. - // It'll be deleted later. We keep its original docid to insert it in the grenad. - if let Some(docid) = external_documents_ids.get(entry.key()) { - self.replaced_documents_ids.insert(docid); - original_docid = Some(docid); - } - let docid = self - .available_documents_ids - .next() - .ok_or(UserError::DocumentLimitReached)?; - entry.insert(docid as u64); - docid + let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { + Entry::Occupied(entry) => *entry.get() as u32, + Entry::Vacant(entry) => { + // If the document was already in the db we mark it as a replaced document. + // It'll be deleted later. We keep its original docid to insert it in the grenad. + if let Some(docid) = external_documents_ids.get(entry.key()) { + self.replaced_documents_ids.insert(docid); + original_docid = Some(docid); } - }; + let docid = self + .available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?; + entry.insert(docid as u64); + docid + } + }; let mut skip_insertion = false; if let Some(original_docid) = original_docid { @@ -239,12 +238,12 @@ impl<'a, 'i> Transform<'a, 'i> { // we're not replacing anything self.replaced_documents_ids.remove(original_docid); // and we need to put back the original id as it was before - self.new_external_documents_ids_builder.remove(&*external_id); + self.new_external_documents_ids_builder.remove(external_id); skip_insertion = true; } else { // we associate the base document with the new key, everything will get merged later. self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; - match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { + match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { Some(buffer) => { self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)? } @@ -453,7 +452,7 @@ impl<'a, 'i> Transform<'a, 'i> { { let primary_key = self .index - .primary_key(&wtxn)? + .primary_key(wtxn)? .ok_or(Error::UserError(UserError::MissingPrimaryKey))? 
.to_string(); @@ -520,7 +519,7 @@ impl<'a, 'i> Transform<'a, 'i> { self.new_external_documents_ids_builder.into_iter().collect(); new_external_documents_ids_builder - .sort_unstable_by(|(left, _), (right, _)| left.cmp(&right)); + .sort_unstable_by(|(left, _), (right, _)| left.cmp(right)); let mut fst_new_external_documents_ids_builder = fst::MapBuilder::memory(); new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { fst_new_external_documents_ids_builder.insert(key, value) @@ -614,7 +613,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut flattened: Vec<_> = flattened.into_iter().collect(); // we reorder the field to get all the known field first flattened.sort_unstable_by_key(|(key, _)| { - new_fields_ids_map.id(&key).unwrap_or(FieldId::MAX) + new_fields_ids_map.id(key).unwrap_or(FieldId::MAX) }); for (key, value) in flattened { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 5b7b00c21..8464c98b6 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -175,7 +175,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut cursor = fid_docid_facet_number.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { - index_fid_docid_facet_numbers.put(wtxn, key, &value)?; + index_fid_docid_facet_numbers.put(wtxn, key, value)?; } } } @@ -185,7 +185,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut cursor = fid_docid_facet_string.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { - index_fid_docid_facet_strings.put(wtxn, key, &value)?; + index_fid_docid_facet_strings.put(wtxn, key, value)?; } } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 0f611572e..b3b1420f8 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -15,7 +15,7 @@ use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, Result}; -#[derive(Debug, Clone, PartialEq, Copy)] +#[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { Set(T), Reset, @@ -273,24 +273,21 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // if the settings are set before any document update, we don't need to do anything, and // will set the primary key during the first document addition. - if self.index.number_of_documents(&self.wtxn)? == 0 { + if self.index.number_of_documents(self.wtxn)? == 0 { return Ok(()); } let transform = Transform::new( self.wtxn, - &self.index, - &self.indexer_config, + self.index, + self.indexer_config, IndexDocumentsMethod::ReplaceDocuments, false, )?; // We remap the documents fields based on the new `FieldsIdsMap`. 
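The docid allocation in the transform.rs hunks earlier in this patch is the classic `HashMap` entry-API shape: probe the external id once, then either reuse the existing internal id or allocate and record a new one in the same slot. A minimal sketch of that pattern, with a bare counter standing in for the id allocator:

```rust
use std::collections::hash_map::{Entry, HashMap};

// Resolve an external document id to an internal docid, allocating on
// first sight; `next_id` stands in for `available_documents_ids`.
fn resolve_docid(ids: &mut HashMap<String, u64>, next_id: &mut u64, external: &str) -> u64 {
    match ids.entry(external.to_string()) {
        // Already known: reuse the internal id without a second lookup.
        Entry::Occupied(entry) => *entry.get(),
        // Never seen: allocate the next id and record it in the same slot.
        Entry::Vacant(entry) => {
            let docid = *next_id;
            *next_id += 1;
            *entry.insert(docid)
        }
    }
}

fn main() {
    let mut ids = HashMap::new();
    let mut next_id = 0;
    assert_eq!(resolve_docid(&mut ids, &mut next_id, "doc-a"), 0);
    assert_eq!(resolve_docid(&mut ids, &mut next_id, "doc-b"), 1);
    // A replayed external id maps back to the docid allocated for it.
    assert_eq!(resolve_docid(&mut ids, &mut next_id, "doc-a"), 0);
}
```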
- let output = transform.remap_index_documents( - self.wtxn, - old_fields_ids_map, - fields_ids_map.clone(), - )?; + let output = + transform.remap_index_documents(self.wtxn, old_fields_ids_map, fields_ids_map)?; let new_facets = output.compute_real_facets(self.wtxn, self.index)?; self.index.put_faceted_fields(self.wtxn, &new_facets)?; @@ -303,7 +300,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let indexing_builder = IndexDocuments::new( self.wtxn, self.index, - &self.indexer_config, + self.indexer_config, IndexDocumentsConfig::default(), &cb, )?; @@ -330,7 +327,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_distinct_field(&mut self) -> Result { match self.distinct_field { Setting::Set(ref attr) => { - self.index.put_distinct_field(self.wtxn, &attr)?; + self.index.put_distinct_field(self.wtxn, attr)?; } Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; @@ -356,11 +353,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // Add all the searchable attributes to the field map, and then add the // remaining fields from the old field map to the new one for name in names.iter() { - new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; + new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } for (_, name) in old_fields_ids_map.iter() { - new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; + new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; } self.index.put_all_searchable_fields_from_fields_ids_map( @@ -462,11 +459,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match self.exact_attributes { Setting::Set(ref attrs) => { let attrs = attrs.iter().map(String::as_str).collect::>(); - self.index.put_exact_attributes(&mut self.wtxn, &attrs)?; + self.index.put_exact_attributes(self.wtxn, &attrs)?; Ok(true) } Setting::Reset => { - self.index.delete_exact_attributes(&mut self.wtxn)?; + self.index.delete_exact_attributes(self.wtxn)?; Ok(true) } Setting::NotSet => Ok(false), @@ -528,7 +525,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_primary_key(&mut self) -> Result<()> { match self.primary_key { Setting::Set(ref primary_key) => { - if self.index.number_of_documents(&self.wtxn)? == 0 { + if self.index.number_of_documents(self.wtxn)? == 0 { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -540,7 +537,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } Setting::Reset => { - if self.index.number_of_documents(&self.wtxn)? == 0 { + if self.index.number_of_documents(self.wtxn)? 
== 0 { self.index.delete_primary_key(self.wtxn)?; Ok(()) } else { @@ -574,24 +571,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { if one > two { return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); } else { - self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?; - self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?; + self.index.put_min_word_len_one_typo(self.wtxn, one)?; + self.index.put_min_word_len_two_typos(self.wtxn, two)?; } } (Setting::Set(one), _) => { - let two = self.index.min_word_len_two_typos(&self.wtxn)?; + let two = self.index.min_word_len_two_typos(self.wtxn)?; if one > two { return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); } else { - self.index.put_min_word_len_one_typo(&mut self.wtxn, one)?; + self.index.put_min_word_len_one_typo(self.wtxn, one)?; } } (_, Setting::Set(two)) => { - let one = self.index.min_word_len_one_typo(&self.wtxn)?; + let one = self.index.min_word_len_one_typo(self.wtxn)?; if one > two { return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); } else { - self.index.put_min_word_len_two_typos(&mut self.wtxn, two)?; + self.index.put_min_word_len_two_typos(self.wtxn, two)?; } } _ => (), @@ -621,10 +618,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { words.sort_unstable(); let words = fst::Set::from_iter(words.iter())?; - self.index.put_exact_words(&mut self.wtxn, &words)?; + self.index.put_exact_words(self.wtxn, &words)?; } Setting::Reset => { - self.index.put_exact_words(&mut self.wtxn, &fst::Set::default())?; + self.index.put_exact_words(self.wtxn, &fst::Set::default())?; } Setting::NotSet => (), } @@ -635,10 +632,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_max_values_per_facet(&mut self) -> Result<()> { match self.max_values_per_facet { Setting::Set(max) => { - self.index.put_max_values_per_facet(&mut self.wtxn, max)?; + self.index.put_max_values_per_facet(self.wtxn, max)?; } Setting::Reset => { - self.index.delete_max_values_per_facet(&mut self.wtxn)?; + self.index.delete_max_values_per_facet(self.wtxn)?; } Setting::NotSet => (), } @@ -649,10 +646,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_pagination_max_total_hits(&mut self) -> Result<()> { match self.pagination_max_total_hits { Setting::Set(max) => { - self.index.put_pagination_max_total_hits(&mut self.wtxn, max)?; + self.index.put_pagination_max_total_hits(self.wtxn, max)?; } Setting::Reset => { - self.index.delete_pagination_max_total_hits(&mut self.wtxn)?; + self.index.delete_pagination_max_total_hits(self.wtxn)?; } Setting::NotSet => (), } @@ -666,8 +663,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; - let old_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?; - let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; + let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; + let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; self.update_displayed()?; self.update_filterable()?; @@ -684,7 +681,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, // an Asc/Desc criterion or a filtered attribute as be added or removed. 
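A recurring cleanup in the word-prefix diffs below replaces `map(..).flatten()` chains with a single `flat_map(..)`, as clippy's `map_flatten` lint suggests. A tiny equivalence check:

```rust
fn main() {
    let groups: Vec<Vec<String>> = vec![
        vec!["a".to_string(), "ab".to_string()],
        vec!["b".to_string()],
    ];

    // `iter().map(|g| g.iter()).flatten()` and `iter().flat_map(|g| g.iter())`
    // are equivalent; clippy prefers the single adapter for readability.
    let flat: Vec<&str> = groups.iter().flat_map(|g| g.iter()).map(|s| s.as_str()).collect();
    assert_eq!(flat, vec!["a", "ab", "b"]);
}
```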
- let new_faceted_fields = self.index.user_defined_faceted_fields(&self.wtxn)?; + let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; let faceted_updated = old_faceted_fields != new_faceted_fields; let stop_words_updated = self.update_stop_words()?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 976ff3dd0..b235c44a6 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -61,12 +61,12 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { let mut prefixes_cache = HashMap::new(); while let Some((word, data)) = new_word_docids_iter.move_on_next()? { current_prefixes = match current_prefixes.take() { - Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), + Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes), _otherwise => { write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; common_prefix_fst_words .iter() - .find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) + .find(|prefixes| word.starts_with(prefixes[0].as_bytes())) } }; diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 724858e4f..a851b1869 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -257,7 +257,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute<'a>( - mut self, + self, new_word_pair_proximity_docids: grenad::Reader, new_prefix_fst_words: &'a [String], common_prefix_fst_words: &[&'a [String]], @@ -268,9 +268,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length let prefixes = PrefixTrieNode::from_sorted_prefixes( common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() + .iter() + .flat_map(|s| s.iter()) .map(|s| s.as_str()) .filter(|s| s.len() <= self.max_prefix_length), ); @@ -298,7 +297,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) |key, value| { insert_into_database( - &mut self.wtxn, + self.wtxn, *self.index.word_prefix_pair_proximity_docids.as_polymorph(), key, value, @@ -311,7 +310,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { let prefixes = PrefixTrieNode::from_sorted_prefixes( new_prefix_fst_words - .into_iter() + .iter() .map(|s| s.as_str()) .filter(|s| s.len() <= self.max_prefix_length), ); @@ -445,7 +444,7 @@ fn execute_on_word_pairs_and_prefixes( let prefix_len = prefix_buffer.len(); prefix_buffer.push(0); prefix_buffer.push(proximity); - batch.insert(&prefix_buffer, data.to_vec()); + batch.insert(prefix_buffer, data.to_vec()); prefix_buffer.truncate(prefix_len); }, ); @@ -620,7 +619,7 @@ impl PrefixTrieNode { fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { let byte = word[0]; if self.children[search_start.0].1 == byte { - return true; + true } else { match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { Ok(position) => { @@ -638,7 +637,7 @@ impl PrefixTrieNode { fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { let mut node = PrefixTrieNode::default(); for prefix in prefixes { - 
node.insert_sorted_prefix(prefix.as_bytes().into_iter()); + node.insert_sorted_prefix(prefix.as_bytes().iter()); } node } diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 95c9f3b01..193956c7a 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -42,7 +42,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { #[logging_timer::time("WordsPrefixesFst::{}")] pub fn execute(self) -> Result<()> { - let words_fst = self.index.words_fst(&self.wtxn)?; + let words_fst = self.index.words_fst(self.wtxn)?; let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length]; let mut current_prefix_count = vec![0; self.max_prefix_length]; From 4c481a8947d5eadc3074d276333b64c971ebdf9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Oct 2022 11:29:39 +0200 Subject: [PATCH 1659/1889] Upgrade all dependencies --- benchmarks/Cargo.toml | 18 +++++++++--------- cli/Cargo.toml | 12 ++++++------ filter-parser/Cargo.toml | 4 ++-- flatten-serde-json/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 36 +++++++++++++++++------------------ 6 files changed, 37 insertions(+), 37 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 87c567de9..ee10a1169 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -5,24 +5,24 @@ edition = "2018" publish = false [dependencies] -anyhow = "1.0.56" +anyhow = "1.0.65" csv = "1.1.6" milli = { path = "../milli", default-features = false } mimalloc = { version = "0.1.29", default-features = false } -serde_json = { version = "1.0.79", features = ["preserve_order"] } +serde_json = { version = "1.0.85", features = ["preserve_order"] } [dev-dependencies] -criterion = { version = "0.3.5", features = ["html_reports"] } +criterion = { version = "0.4.0", features = ["html_reports"] } rand = "0.8.5" rand_chacha = "0.3.1" -roaring = "0.9.0" +roaring = "0.10.1" [build-dependencies] -anyhow = "1.0.56" -bytes = "1.1.0" -convert_case = "0.5.0" -flate2 = "1.0.22" -reqwest = { version = "0.11.9", features = ["blocking", "rustls-tls"], default-features = false } +anyhow = "1.0.65" +bytes = "1.2.1" +convert_case = "0.6.0" +flate2 = "1.0.24" +reqwest = { version = "0.11.12", features = ["blocking", "rustls-tls"], default-features = false } [features] default = ["milli/default"] diff --git a/cli/Cargo.toml b/cli/Cargo.toml index f1f5a6beb..62c4b6a73 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -8,15 +8,15 @@ publish = false [dependencies] bimap = "0.6.2" byte-unit = { version = "4.0.14", features = ["serde"] } -color-eyre = "0.6.1" +color-eyre = "0.6.2" csv = "1.1.6" -eyre = "0.6.7" -indicatif = "0.16.2" +eyre = "0.6.8" +indicatif = "0.17.1" milli = { path = "../milli", default-features = false } mimalloc = { version = "0.1.29", default-features = false } -serde = "1.0.136" -serde_json = "1.0.79" -stderrlog = "0.5.1" +serde = "1.0.145" +serde_json = "1.0.85" +stderrlog = "0.5.3" structopt = "0.3.26" [features] diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 684ef44f0..3cccd38a7 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -6,8 +6,8 @@ description = "The parser for the Meilisearch filter syntax" publish = false [dependencies] -nom = "7.1.0" +nom = "7.1.1" nom_locate = "4.0.0" [dev-dependencies] -insta = "1.18.2" +insta = "1.21.0" diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 27da77b78..cd22b4273 100644 --- 
a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -10,7 +10,7 @@ publish = false serde_json = "1.0" [dev-dependencies] -criterion = { version = "0.3", features = ["html_reports"] } +criterion = { version = "0.4.0", features = ["html_reports"] } [[bench]] name = "benchmarks" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 460f4a582..d09730332 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -9,7 +9,7 @@ publish = false serde_json = "1.0" [dev-dependencies] -criterion = "0.3" +criterion = "0.4.0" [[bench]] name = "depth" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 8224632ff..a023944e3 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -7,51 +7,51 @@ edition = "2018" [dependencies] bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" -bstr = "0.2.17" +bstr = "1.0.1" byteorder = "1.4.3" charabia = { version = "0.6.0", default-features = false } concat-arrays = "0.1.2" -crossbeam-channel = "0.5.2" -either = "1.6.1" +crossbeam-channel = "0.5.6" +either = "1.8.0" flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" -geoutils = "0.4.1" +geoutils = "0.5.1" grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } -memmap2 = "0.5.3" +memmap2 = "0.5.7" obkv = "0.2.0" -once_cell = "1.10.0" -ordered-float = "2.10.0" -rayon = "1.5.1" -roaring = "0.9.0" -rstar = { version = "0.9.2", features = ["serde"] } -serde = { version = "1.0.136", features = ["derive"] } -serde_json = { version = "1.0.79", features = ["preserve_order"] } +once_cell = "1.15.0" +ordered-float = "3.2.0" +rayon = "1.5.3" +roaring = "0.10.1" +rstar = { version = "0.9.3", features = ["serde"] } +serde = { version = "1.0.145", features = ["derive"] } +serde_json = { version = "1.0.85", features = ["preserve_order"] } slice-group-by = "0.3.0" smallstr = { version = "0.3.0", features = ["serde"] } -smallvec = "1.8.0" +smallvec = "1.10.0" smartstring = "1.0.1" tempfile = "3.3.0" -thiserror = "1.0.31" -time = { version = "0.3.7", features = ["serde-well-known", "formatting", "parsing", "macros"] } +thiserror = "1.0.37" +time = { version = "0.3.15", features = ["serde-well-known", "formatting", "parsing", "macros"] } uuid = { version = "1.1.2", features = ["v4"] } filter-parser = { path = "../filter-parser" } # documents words self-join -itertools = "0.10.3" +itertools = "0.10.5" # logging -log = "0.4.14" +log = "0.4.17" logging_timer = "1.1.0" csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" -insta = "1.18.1" +insta = "1.21.0" maplit = "1.0.2" md5 = "0.7.0" rand = "0.8.5" From c2ca259f481562103d638f63d58dab7452a3cf40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Oct 2022 13:00:39 +0200 Subject: [PATCH 1660/1889] Update cli to latest `indicatif` crate version --- cli/src/main.rs | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index e3bbced3e..a633e9fa7 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; use std::str::FromStr; -use std::time::Instant; +use std::time::{Duration, 
Instant}; use byte_unit::Byte; use eyre::Result; @@ -267,10 +267,6 @@ impl Performer for DocumentAddition { return Err(error.into()); } - std::thread::spawn(move || { - progesses.join().unwrap(); - }); - let result = addition.execute()?; txn.commit()?; @@ -287,18 +283,19 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa let prev = &bars[step_index - 1]; if !prev.is_finished() { prev.disable_steady_tick(); - prev.finish_at_current_pos(); + prev.finish(); } } let style = ProgressStyle::default_bar() + .progress_chars("##-") .template("[eta: {eta_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}") - .progress_chars("##-"); + .unwrap(); match step { RemapDocumentAddition { documents_seen } => { bar.set_style(ProgressStyle::default_spinner()); - bar.set_message(format!("remaped {} documents so far.", documents_seen)); + bar.set_message(format!("remapped {} documents so far.", documents_seen)); } ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { bar.set_style(style); @@ -319,7 +316,7 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa bar.set_position(databases_seen as u64); } } - bar.enable_steady_tick(200); + bar.enable_steady_tick(Duration::from_millis(200)); } fn documents_from_jsonl(reader: impl Read) -> Result> { @@ -520,10 +517,6 @@ impl Performer for SettingsUpdate { bars.push(bar); } - std::thread::spawn(move || { - progesses.join().unwrap(); - }); - update.execute(|step| indexing_callback(step, &bars))?; txn.commit()?; From a3968063436911975fb1fc47e1c1ed7085c13e0c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 12 Jul 2022 17:56:50 +0200 Subject: [PATCH 1661/1889] Add settings to force milli to exhaustively compute the total number of hits --- milli/src/search/criteria/initial.rs | 32 ++++++++++++++++++++++------ milli/src/search/criteria/mod.rs | 4 +++- milli/src/search/criteria/typo.rs | 9 ++++---- milli/src/search/mod.rs | 10 +++++++++ milli/tests/search/query_criteria.rs | 2 +- 5 files changed, 45 insertions(+), 12 deletions(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 514dbff96..2aabe9b13 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,17 +1,22 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::{resolve_query_tree, Context}; use crate::search::query_tree::Operation; use crate::Result; -pub struct Initial { +pub struct Initial<'t> { + ctx: &'t dyn Context<'t>, answer: Option, + exhaustive_number_hits: bool, } -impl Initial { +impl<'t> Initial<'t> { pub fn new( + ctx: &'t dyn Context<'t>, query_tree: Option, filtered_candidates: Option, + exhaustive_number_hits: bool, ) -> Initial { let answer = CriterionResult { query_tree, @@ -19,13 +24,28 @@ impl Initial { filtered_candidates, bucket_candidates: None, }; - Initial { answer: Some(answer) } + Initial { ctx, answer: Some(answer), exhaustive_number_hits } } } -impl Criterion for Initial { +impl Criterion for Initial<'_> { #[logging_timer::time("Initial::{}")] - fn next(&mut self, _: &mut CriterionParameters) -> Result> { - Ok(self.answer.take()) + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + self.answer + .take() + .map(|mut answer| { + if self.exhaustive_number_hits && answer.query_tree.is_some() { + let candidates = resolve_query_tree( + self.ctx, + answer.query_tree.as_ref().unwrap(), + &mut params.wdcache, + )?; + + answer.candidates = 
Some(candidates.clone()); + answer.bucket_candidates = Some(candidates); + } + Ok(answer) + }) + .transpose() } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index f48865ba5..6c4fa51d3 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -232,13 +232,15 @@ impl<'t> CriteriaBuilder<'t> { primitive_query: Option>, filtered_candidates: Option, sort_criteria: Option>, + exhaustive_number_hits: bool, ) -> Result> { use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); let mut criterion = - Box::new(Initial::new(query_tree, filtered_candidates)) as Box; + Box::new(Initial::new(self, query_tree, filtered_candidates, exhaustive_number_hits)) + as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { Name::Words => Box::new(Words::new(self, criterion)), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index e9e6fb2f5..f1537ed48 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -368,7 +368,7 @@ mod test { excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(query_tree, facet_candidates); + let parent = Initial::new(&context, query_tree, facet_candidates, false); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -405,7 +405,7 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(Some(query_tree), facet_candidates); + let parent = Initial::new(&context, Some(query_tree), facet_candidates, false); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -439,7 +439,7 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(query_tree, Some(facet_candidates.clone())); + let parent = Initial::new(&context, query_tree, Some(facet_candidates.clone()), false); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -476,7 +476,8 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); + let parent = + Initial::new(&context, Some(query_tree), Some(facet_candidates.clone()), false); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7145c1445..6f1e1b34c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -47,6 +47,7 @@ pub struct Search<'a> { terms_matching_strategy: TermsMatchingStrategy, authorize_typos: bool, words_limit: usize, + exhaustive_number_hits: bool, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -61,6 +62,7 @@ impl<'a> Search<'a> { sort_criteria: None, terms_matching_strategy: TermsMatchingStrategy::default(), authorize_typos: true, + exhaustive_number_hits: false, words_limit: 10, rtxn, index, @@ -107,6 +109,11 @@ impl<'a> Search<'a> { self } + pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> { + self.exhaustive_number_hits = exhaustive_number_hits; + self + } + fn is_typo_authorized(&self) -> Result { let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; // only authorize typos if both 
the index and the query allow it. @@ -189,6 +196,7 @@ impl<'a> Search<'a> { primitive_query, filtered_candidates, self.sort_criteria.clone(), + self.exhaustive_number_hits, )?; match self.index.distinct_field(self.rtxn)? { @@ -262,6 +270,7 @@ impl fmt::Debug for Search<'_> { terms_matching_strategy, authorize_typos, words_limit, + exhaustive_number_hits, rtxn: _, index: _, } = self; @@ -273,6 +282,7 @@ impl fmt::Debug for Search<'_> { .field("sort_criteria", sort_criteria) .field("terms_matching_strategy", terms_matching_strategy) .field("authorize_typos", authorize_typos) + .field("exhaustive_number_hits", exhaustive_number_hits) .field("words_limit", words_limit) .finish() } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 8b72c8420..f873f56f7 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -51,7 +51,7 @@ macro_rules! test_criterion { }; } -test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); +test_criterion!(none_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![], vec![]); test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words], vec![]); test_criterion!( From d71bc1e69fc68c8cf4b5a3abb503fda7c8afdf1b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Jul 2022 16:52:45 +0200 Subject: [PATCH 1662/1889] Compute an exact count when using distinct --- milli/src/search/criteria/initial.rs | 29 ++++++++++++++++----- milli/src/search/criteria/mod.rs | 15 +++++++---- milli/src/search/criteria/typo.rs | 24 +++++++++++++---- milli/src/search/distinct/facet_distinct.rs | 1 + milli/src/search/mod.rs | 28 ++++++++++++++------ 5 files changed, 72 insertions(+), 25 deletions(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 2aabe9b13..bae77fda0 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -3,32 +3,35 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, Context}; use crate::search::query_tree::Operation; +use crate::search::Distinct; use crate::Result; -pub struct Initial<'t> { +pub struct Initial<'t, D> { ctx: &'t dyn Context<'t>, answer: Option, exhaustive_number_hits: bool, + distinct: Option, } -impl<'t> Initial<'t> { +impl<'t, D> Initial<'t, D> { pub fn new( ctx: &'t dyn Context<'t>, query_tree: Option, filtered_candidates: Option, exhaustive_number_hits: bool, - ) -> Initial { + distinct: Option, + ) -> Initial { let answer = CriterionResult { query_tree, candidates: None, filtered_candidates, bucket_candidates: None, }; - Initial { ctx, answer: Some(answer), exhaustive_number_hits } + Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct } } } -impl Criterion for Initial<'_> { +impl Criterion for Initial<'_, D> { #[logging_timer::time("Initial::{}")] fn next(&mut self, params: &mut CriterionParameters) -> Result> { self.answer @@ -41,8 +44,20 @@ impl Criterion for Initial<'_> { &mut params.wdcache, )?; - answer.candidates = Some(candidates.clone()); - answer.bucket_candidates = Some(candidates); + let bucket_candidates = match &mut self.distinct { + // may be really time consuming + Some(distinct) => { + let mut bucket_candidates = RoaringBitmap::new(); + for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { + 
bucket_candidates.insert(c?); + } + bucket_candidates + } + None => candidates.clone(), + }; + + answer.candidates = Some(candidates); + answer.bucket_candidates = Some(bucket_candidates); } Ok(answer) }) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 6c4fa51d3..866eaefde 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -13,7 +13,7 @@ use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::search::criteria::geo::Geo; -use crate::search::{word_derivations, WordDerivationsCache}; +use crate::search::{word_derivations, Distinct, WordDerivationsCache}; use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; mod asc_desc; @@ -226,21 +226,26 @@ impl<'t> CriteriaBuilder<'t> { Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) } - pub fn build( + pub fn build( &'t self, query_tree: Option, primitive_query: Option>, filtered_candidates: Option, sort_criteria: Option>, exhaustive_number_hits: bool, + distinct: Option, ) -> Result> { use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); - let mut criterion = - Box::new(Initial::new(self, query_tree, filtered_candidates, exhaustive_number_hits)) - as Box; + let mut criterion = Box::new(Initial::new( + self, + query_tree, + filtered_candidates, + exhaustive_number_hits, + distinct, + )) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { Name::Words => Box::new(Words::new(self, criterion)), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index f1537ed48..605089fae 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -348,6 +348,7 @@ mod test { use super::super::initial::Initial; use super::super::test::TestContext; use super::*; + use crate::search::NoopDistinct; fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String { let mut result = String::new(); @@ -368,7 +369,8 @@ mod test { excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, query_tree, facet_candidates, false); + let parent = + Initial::::new(&context, query_tree, facet_candidates, false, None); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -405,7 +407,8 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, Some(query_tree), facet_candidates, false); + let parent = + Initial::::new(&context, Some(query_tree), facet_candidates, false, None); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -439,7 +442,13 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, query_tree, Some(facet_candidates.clone()), false); + let parent = Initial::::new( + &context, + query_tree, + Some(facet_candidates.clone()), + false, + None, + ); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -476,8 +485,13 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = - Initial::new(&context, Some(query_tree), Some(facet_candidates.clone()), false); + let parent = Initial::::new( + &context, + 
Some(query_tree), + Some(facet_candidates.clone()), + false, + None, + ); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 4436d4cda..33e7b4975 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -21,6 +21,7 @@ const DOCID_SIZE: usize = size_of::<DocumentId>(); /// care to keep the document we are currently on, and remove it from the excluded list. The next /// iterations will never contain any occurrence of a document with the same distinct value as a /// document from previous iterations. +#[derive(Clone)] pub struct FacetDistinct<'a> { distinct: FieldId, index: &'a Index, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 6f1e1b34c..270beb52a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -191,21 +191,33 @@ impl<'a> Search<'a> { } let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build( - query_tree, - primitive_query, - filtered_candidates, - self.sort_criteria.clone(), - self.exhaustive_number_hits, - )?; match self.index.distinct_field(self.rtxn)? { - None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria), + None => { + let criteria = criteria_builder.build::<NoopDistinct>( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + self.exhaustive_number_hits, + None, + )?; + self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria) + } Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; match field_ids_map.id(name) { Some(fid) => { let distinct = FacetDistinct::new(fid, self.index, self.rtxn); + + let criteria = criteria_builder.build( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + self.exhaustive_number_hits, + Some(distinct.clone()), + )?; self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) } None => Ok(SearchResult::default()), From cf203b7fde8ddbdf711cde6f53ef0298fc3af8d9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 20 Jul 2022 15:58:26 +0200 Subject: [PATCH 1663/1889] Take filters into account when computing the page candidates --- milli/src/search/criteria/initial.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index bae77fda0..9a9565182 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -38,12 +38,16 @@ impl Criterion for Initial<'_, D> { .take() .map(|mut answer| { if self.exhaustive_number_hits && answer.query_tree.is_some() { - let candidates = resolve_query_tree( + let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), &mut params.wdcache, )?; + if let Some(ref filtered_candidates) = answer.filtered_candidates { + candidates &= filtered_candidates; + } + let bucket_candidates = match &mut self.distinct { // may be really time consuming Some(distinct) => { From 6f55e7844ce7a72f2b4ee7e64be9d1e98cec7700 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 17 Oct 2022 14:41:57 +0200 Subject: [PATCH 1664/1889] Add some code comments --- milli/src/search/criteria/initial.rs | 9 +++++++-- milli/src/search/mod.rs | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/initial.rs 
b/milli/src/search/criteria/initial.rs index 9a9565182..14d368d4e 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -5,7 +5,9 @@ use crate::search::criteria::{resolve_query_tree, Context}; use crate::search::query_tree::Operation; use crate::search::Distinct; use crate::Result; - +/// Initial is a mandatory criterion: it is always the first +/// and is meant to initialize the CriterionResult used by the other criteria. +/// It behaves like a [Once Iterator](https://doc.rust-lang.org/std/iter/struct.Once.html) and will return Some(CriterionResult) only once. pub struct Initial<'t, D> { ctx: &'t dyn Context<'t>, answer: Option<CriterionResult>, @@ -38,18 +40,21 @@ impl Criterion for Initial<'_, D> { .take() .map(|mut answer| { if self.exhaustive_number_hits && answer.query_tree.is_some() { + // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), &mut params.wdcache, )?; + // Apply the filters on the documents retrieved with the query tree. if let Some(ref filtered_candidates) = answer.filtered_candidates { candidates &= filtered_candidates; } + // because the bucket_candidates should be an exhaustive count of the matching documents, + // we precompute the distinct attributes. let bucket_candidates = match &mut self.distinct { Some(distinct) => { let mut bucket_candidates = RoaringBitmap::new(); for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 270beb52a..20003c676 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -109,6 +109,8 @@ impl<'a> Search<'a> { self } + /// Force the search to exhaustively compute the number of candidates; + /// this will increase the search time but allows finite pagination. 
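As a caller-side sketch of the setter these comments document (illustrative, not milli's test code; error handling kept minimal):

```rust
use milli::{Index, Result, Search};

// Count the documents matching `query` exactly, trading speed for precision.
fn exact_hit_count(index: &Index, query: &str) -> Result<u64> {
    let rtxn = index.read_txn()?;
    let mut search = Search::new(&rtxn, index);
    search.query(query);
    // Opt in to the exhaustive count: the Initial criterion resolves the
    // whole query tree up front, so `candidates` is a true total rather
    // than whatever the bucket sort happened to visit.
    search.exhaustive_number_hits(true);
    let result = search.execute()?;
    Ok(result.candidates.len())
}
```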
pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> { self.exhaustive_number_hits = exhaustive_number_hits; self From 6603437cb177d6e8ee7e5a4d67f09d2d39c78b84 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 5 Oct 2022 17:41:07 +0200 Subject: [PATCH 1665/1889] Introduce an indexation abortion function when indexing documents --- benchmarks/benches/indexing.rs | 362 +++++++++++++----- benchmarks/benches/utils.rs | 5 +- cli/src/main.rs | 5 +- milli/src/error.rs | 2 + milli/src/index.rs | 3 +- milli/src/search/distinct/mod.rs | 5 +- milli/src/update/index_documents/mod.rs | 69 +++- milli/src/update/index_documents/transform.rs | 12 +- milli/src/update/settings.rs | 20 +- milli/tests/search/distinct.rs | 2 +- milli/tests/search/facet_distribution.rs | 5 +- milli/tests/search/mod.rs | 5 +- milli/tests/search/query_criteria.rs | 9 +- milli/tests/search/typo_tolerance.rs | 11 +- 14 files changed, 379 insertions(+), 136 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index a409e1343..d567b3da1 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -59,7 +59,7 @@ fn setup_settings<'t>( let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); builder.set_sortable_fields(sortable_fields); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); } fn setup_index_with_settings<'t>( @@ -131,9 +131,15 @@ fn indexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -169,9 +175,15 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -185,9 +197,15 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -225,9 +243,15 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = 
IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); @@ -282,9 +306,15 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); @@ -298,18 +328,30 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); @@ -345,9 +387,15 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); @@ -384,9 +432,15 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -423,9 +477,15 @@ fn indexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || 
false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -461,9 +521,15 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -478,9 +544,15 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -518,9 +590,15 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); @@ -576,9 +654,15 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -594,9 +678,15 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); @@ -606,9 +696,15 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = 
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); @@ -646,9 +742,15 @@ fn indexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -683,9 +785,15 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -699,9 +807,15 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -738,9 +852,15 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); @@ -794,9 +914,15 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { // as we don't care about the time it takes. 
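[Editor's sketch] All of these benchmark call sites stub the new abortion callback out with || false, a closure that never requests cancellation. A caller that actually wants to cancel an indexing run would typically share an atomic flag instead; a minimal sketch of that wiring (assumed, not taken from the patch) could look like:

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

fn main() {
    let stop = Arc::new(AtomicBool::new(false));
    let stop_for_indexer = Arc::clone(&stop);
    // The indexer polls this closure between steps and aborts when it returns true.
    let should_abort = move || stop_for_indexer.load(Ordering::Relaxed);
    assert!(!should_abort());
    // Elsewhere, e.g. from a thread handling a cancel request:
    stop.store(true, Ordering::Relaxed);
    assert!(should_abort());
}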
let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -811,9 +937,15 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -821,9 +953,15 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -883,9 +1021,15 @@ fn indexing_nested_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -945,9 +1089,15 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); @@ -1008,9 +1158,15 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -1046,9 +1202,15 @@ fn indexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let 
indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -1084,9 +1246,15 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -1101,9 +1269,15 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -1141,9 +1315,15 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) - .unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index a240ce299..511b3b8d5 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -86,7 +86,7 @@ pub fn base_setup(conf: &Conf) -> Index { (conf.configure)(&mut builder); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); let config = IndexerConfig::default(); @@ -96,7 +96,8 @@ pub fn base_setup(conf: &Conf) -> Index { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); let documents = documents_from(conf.dataset, conf.dataset_format); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); diff --git a/cli/src/main.rs b/cli/src/main.rs index a633e9fa7..dd5489ebc 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -239,7 +239,7 @@ impl Performer for DocumentAddition { if let Some(primary) = self.primary { let mut builder = update::Settings::new(&mut txn, &index, &config); builder.set_primary_key(primary); - builder.execute(|_| 
()).unwrap(); + builder.execute(|_| (), || false).unwrap(); } let indexing_config = IndexDocumentsConfig { @@ -260,6 +260,7 @@ impl Performer for DocumentAddition { &config, indexing_config, |step| indexing_callback(step, &bars), + || false, ) .unwrap(); let (addition, user_error) = addition.add_documents(reader)?; @@ -517,7 +518,7 @@ impl Performer for SettingsUpdate { bars.push(bar); } - update.execute(|step| indexing_callback(step, &bars))?; + update.execute(|step| indexing_callback(step, &bars), || false)?; txn.commit()?; Ok(()) diff --git a/milli/src/error.rs b/milli/src/error.rs index d3f0a179f..bd691ab1d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -56,6 +56,8 @@ pub enum InternalError { Store(#[from] MdbError), #[error(transparent)] Utf8(#[from] str::Utf8Error), + #[error("An indexation process was explicitly aborted.")] + AbortedIndexation, } #[derive(Error, Debug)] diff --git a/milli/src/index.rs b/milli/src/index.rs index 0dccabf03..ca95e78bc 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1245,6 +1245,7 @@ pub(crate) mod tests { &self.indexer_config, self.index_documents_config.clone(), |_| (), + || false, ) .unwrap(); let (builder, user_error) = builder.add_documents(documents).unwrap(); @@ -1281,7 +1282,7 @@ pub(crate) mod tests { ) -> Result<(), crate::error::Error> { let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config); update(&mut builder); - builder.execute(drop)?; + builder.execute(drop, || false)?; Ok(()) } } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 1a9c56cf3..b6ed26917 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -89,7 +89,7 @@ mod test { let config = IndexerConfig::default(); let mut update = Settings::new(&mut txn, &index, &config); update.set_distinct_field(distinct.to_string()); - update.execute(|_| ()).unwrap(); + update.execute(|_| (), || false).unwrap(); // add documents to the index let config = IndexerConfig::default(); @@ -98,7 +98,8 @@ mod test { ..Default::default() }; let addition = - IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); + IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false) + .unwrap(); let reader = crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e0eefe07b..31b0200f3 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -33,7 +33,7 @@ pub use self::helpers::{ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; -use crate::error::UserError; +use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, @@ -71,13 +71,14 @@ impl Default for IndexDocumentsMethod { } } -pub struct IndexDocuments<'t, 'u, 'i, 'a, F> { +pub struct IndexDocuments<'t, 'u, 'i, 'a, FP, FA> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, config: IndexDocumentsConfig, indexer_config: &'a IndexerConfig, transform: Option>, - progress: F, + progress: FP, + should_abort: FA, added_documents: u64, } @@ -93,17 +94,19 @@ pub struct IndexDocumentsConfig { pub autogenerate_docids: bool, } -impl<'t, 'u, 'i, 'a, F> 
IndexDocuments<'t, 'u, 'i, 'a, F> +impl<'t, 'u, 'i, 'a, FP, FA> IndexDocuments<'t, 'u, 'i, 'a, FP, FA> where - F: Fn(UpdateIndexingStep) + Sync, + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, indexer_config: &'a IndexerConfig, config: IndexDocumentsConfig, - progress: F, - ) -> Result> { + progress: FP, + should_abort: FA, + ) -> Result> { let transform = Some(Transform::new( wtxn, &index, @@ -117,6 +120,7 @@ where config, indexer_config, progress, + should_abort, wtxn, index, added_documents: 0, @@ -151,12 +155,13 @@ where Err(user_error) => return Ok((self, Err(user_error))), }; - let indexed_documents = self - .transform - .as_mut() - .expect("Invalid document addition state") - .read_documents(enriched_documents_reader, self.wtxn, &self.progress)? - as u64; + let indexed_documents = + self.transform.as_mut().expect("Invalid document addition state").read_documents( + enriched_documents_reader, + self.wtxn, + &self.progress, + &self.should_abort, + )? as u64; self.added_documents += indexed_documents; @@ -200,7 +205,8 @@ where #[logging_timer::time("IndexDocuments::{}")] pub fn execute_raw(self, output: TransformOutput) -> Result where - F: Fn(UpdateIndexingStep) + Sync, + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, { let TransformOutput { primary_key, @@ -355,6 +361,10 @@ where }); for result in lmdb_writer_rx { + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + let typed_chunk = match result? { TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; @@ -431,11 +441,16 @@ where word_position_docids: Option>, ) -> Result<()> where - F: Fn(UpdateIndexingStep) + Sync, + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, { // Merged databases are already been indexed, we start from this count; let mut databases_seen = MERGED_DATABASE_COUNT; + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + // Run the facets update operation. let mut builder = Facets::new(self.wtxn, self.index); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; @@ -454,6 +469,10 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + let previous_words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?; @@ -467,6 +486,10 @@ where } builder.execute()?; + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + let current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; // We retrieve the common words between the previous and new prefix word fst. 
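[Editor's sketch] The builder is now generic over two callbacks instead of one, and the abort check is polled between the expensive phases above. A condensed, self-contained sketch of that control flow (simplified step type; the error value stands in for InternalError::AbortedIndexation):

fn run_steps<FP, FA>(progress: FP, should_abort: FA) -> Result<(), &'static str>
where
    FP: Fn(usize) + Sync,    // reports the current step, like `UpdateIndexingStep`
    FA: Fn() -> bool + Sync, // polled between steps; returning true aborts
{
    for step in 0..4 {
        if should_abort() {
            return Err("aborted indexation");
        }
        progress(step);
    }
    Ok(())
}

fn main() {
    assert!(run_steps(|_| (), || false).is_ok());
    assert_eq!(run_steps(|_| (), || true), Err("aborted indexation"));
}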
@@ -494,6 +517,10 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + if let Some(word_docids) = word_docids { execute_word_prefix_docids( self.wtxn, @@ -520,6 +547,10 @@ where )?; } + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen, @@ -541,6 +572,10 @@ where )?; } + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen, @@ -568,6 +603,10 @@ where )?; } + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f52d5c7af..781b2cbc6 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -138,15 +138,17 @@ impl<'a, 'i> Transform<'a, 'i> { }) } - pub fn read_documents( + pub fn read_documents( &mut self, reader: EnrichedDocumentsBatchReader, wtxn: &mut heed::RwTxn, - progress_callback: F, + progress_callback: FP, + should_abort: FA, ) -> Result where R: Read + Seek, - F: Fn(UpdateIndexingStep) + Sync, + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, { let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); @@ -165,6 +167,10 @@ impl<'a, 'i> Transform<'a, 'i> { while let Some(enriched_document) = cursor.next_enriched_document()? 
{ let EnrichedDocument { document, document_id } = enriched_document; + if should_abort() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + // drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer // does not keep references from the cursor between loop iterations let mut field_buffer_cache = drop_and_reuse(field_buffer); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 0f611572e..5aed2aeb3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -266,9 +266,15 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.pagination_max_total_hits = Setting::Reset; } - fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> + fn reindex( + &mut self, + progress_callback: &FP, + should_abort: &FA, + old_fields_ids_map: FieldsIdsMap, + ) -> Result<()> where - F: Fn(UpdateIndexingStep) + Sync, + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // if the settings are set before any document update, we don't need to do anything, and @@ -305,7 +311,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index, &self.indexer_config, IndexDocumentsConfig::default(), - &cb, + &progress_callback, + &should_abort, )?; indexing_builder.execute_raw(output)?; @@ -660,9 +667,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } - pub fn execute(mut self, progress_callback: F) -> Result<()> + pub fn execute(mut self, progress_callback: FP, should_abort: FA) -> Result<()> where - F: Fn(UpdateIndexingStep) + Sync, + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; @@ -698,7 +706,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { || searchable_updated || exact_attributes_updated { - self.reindex(&progress_callback, old_fields_ids_map)?; + self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?; } Ok(()) diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index 64dd16f09..c2b7e2c1e 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -19,7 +19,7 @@ macro_rules! 
test_distinct { let config = milli::update::IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_distinct_field(S(stringify!($distinct))); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 83d692d7f..e2f89f2db 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -23,13 +23,14 @@ fn test_facet_distribution_with_no_facet_values() { S("genres"), S("tags"), }); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); // index documents let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( r#"{ diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 4ec1aeb83..c8b01648c 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -57,13 +57,14 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("america") => vec![S("the united states")], }); builder.set_searchable_fields(vec![S("title"), S("description")]); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); // index documents let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 8b72c8420..74d771214 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -345,7 +345,7 @@ fn criteria_mixup() { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_criteria(criteria.iter().map(ToString::to_string).collect()); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); let mut rtxn = index.read_txn().unwrap(); @@ -385,12 +385,13 @@ fn criteria_ascdesc() { S("name"), S("age"), }); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); // index documents let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); @@ -436,7 +437,7 @@ fn criteria_ascdesc() { let mut wtxn = index.write_txn().unwrap(); let mut builder = 
Settings::new(&mut wtxn, &index, &config); builder.set_criteria(vec![criterion.to_string()]); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); let mut rtxn = index.read_txn().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 7dc6b0c4f..c939186e5 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -40,7 +40,7 @@ fn test_typo_tolerance_one_typo() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); builder.set_min_word_len_one_typo(4); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); @@ -86,7 +86,7 @@ fn test_typo_tolerance_two_typo() { let config = IndexerConfig::default(); let mut builder = Settings::new(&mut txn, &index, &config); builder.set_min_word_len_two_typos(7); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); // typo is now supported for 4 letters words let mut search = Search::new(&txn, &index); @@ -127,7 +127,8 @@ fn test_typo_disabled_on_word() { let mut txn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = + IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false).unwrap(); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); @@ -156,7 +157,7 @@ fn test_typo_disabled_on_word() { // `zealand` doesn't allow typos anymore exact_words.insert("zealand".to_string()); builder.set_exact_words(exact_words); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); let mut search = Search::new(&txn, &index); search.query("zealand"); @@ -194,7 +195,7 @@ fn test_disable_typo_on_attribute() { let mut builder = Settings::new(&mut txn, &index, &config); // disable typos on `description` builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); let mut search = Search::new(&txn, &index); search.query("antebelum"); From fc03e536153d61da3224698f34fb8c6ee2312c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 30 Aug 2022 17:17:50 +0200 Subject: [PATCH 1666/1889] Add a test to check that we can abort an indexation --- milli/src/index.rs | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index ca95e78bc..5c0233b9a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1196,6 +1196,7 @@ pub(crate) mod tests { use tempfile::TempDir; use crate::documents::DocumentsBatchReader; + use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use crate::{db_snap, Index}; @@ -1287,6 +1288,40 @@ pub(crate) mod tests { } } + #[test] + fn aborting_indexation() { + use std::sync::atomic::AtomicBool; + use std::sync::atomic::Ordering::Relaxed; + + let index = TempIndex::new(); + let mut wtxn = index.inner.write_txn().unwrap(); + + let should_abort = AtomicBool::new(false); + let builder = IndexDocuments::new( + &mut 
wtxn, + &index.inner, + &index.indexer_config, + index.index_documents_config.clone(), + |_| (), + || should_abort.load(Relaxed), + ) + .unwrap(); + + let (builder, user_error) = builder + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 }, + ])) + .unwrap(); + user_error.unwrap(); + + should_abort.store(true, Relaxed); + let err = builder.execute().unwrap_err(); + + assert!(matches!(err, Error::InternalError(InternalError::AbortedIndexation))); + } + #[test] fn initial_field_distribution() { let index = TempIndex::new(); From 516e838eb4cadd27df43d84a288cd11d1d2e3bb0 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 17 Oct 2022 18:23:15 +0200 Subject: [PATCH 1667/1889] Update milli/src/search/criteria/initial.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/initial.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 14d368d4e..195c926be 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -40,7 +40,7 @@ impl Criterion for Initial<'_, D> { .take() .map(|mut answer| { if self.exhaustive_number_hits && answer.query_tree.is_some() { - // resolve the whole query tree to retrieve an exhastive list of documents matching the query. + // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), From 81919a35a27f588b75893fd3b18644feaa6c325a Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 17 Oct 2022 18:23:20 +0200 Subject: [PATCH 1668/1889] Update milli/src/search/criteria/initial.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/initial.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 195c926be..ac61adfe2 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -52,7 +52,7 @@ impl Criterion for Initial<'_, D> { candidates &= filtered_candidates; } - // because the bucket_candidates should be an exhastive count of the matching documents, + // because the bucket_candidates should be an exhaustive count of the matching documents, // we precompute the distinct attributes. 
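[Editor's sketch] To make the precomputation described in this comment concrete: when a distinct attribute is set, the exhaustive count must only retain one document per distinct value. A simplified illustration of that idea (hypothetical flat data, not the real Distinct trait):

use std::collections::HashSet;

// Keep only the first document carrying each distinct value, so the
// exhaustive count matches what pagination will actually return.
fn distinct_candidates(candidates: &[(u32, &str)]) -> Vec<u32> {
    let mut seen = HashSet::new();
    let mut kept = Vec::new();
    for &(docid, distinct_value) in candidates {
        if seen.insert(distinct_value) {
            kept.push(docid);
        }
    }
    kept
}

fn main() {
    let candidates = [(0, "red"), (1, "red"), (2, "blue")];
    assert_eq!(distinct_candidates(&candidates), vec![0, 2]);
}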
let bucket_candidates = match &mut self.distinct { Some(distinct) => { From bdeb47305e52dfbeccc5cabc10ffccdd94054759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 13:54:12 +0200 Subject: [PATCH 1669/1889] Change encoding of word_pair_proximity DB to (proximity, word1, word2) Same for word_prefix_pair_proximity --- milli/src/heed_codec/str_str_u8_codec.rs | 20 +- milli/src/snapshot_tests.rs | 2 +- .../extract_word_pair_proximity_docids.rs | 3 +- .../word_prefix_pair_proximity_docids.snap | 84 ++++---- ...ord_prefix_pair_proximity_docids.hash.snap | 2 +- .../word_prefix_pair_proximity_docids.rs | 198 +++++++----------- 6 files changed, 130 insertions(+), 179 deletions(-) diff --git a/milli/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs index 888e08752..6cfff3ecf 100644 --- a/milli/src/heed_codec/str_str_u8_codec.rs +++ b/milli/src/heed_codec/str_str_u8_codec.rs @@ -7,12 +7,11 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { type DItem = (&'a str, &'a str, u8); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (n, bytes) = bytes.split_last()?; + let (n, bytes) = bytes.split_first()?; let s1_end = bytes.iter().position(|b| *b == 0)?; let (s1_bytes, rest) = bytes.split_at(s1_end); - let rest = &rest[1..]; + let s2_bytes = &rest[1..]; let s1 = str::from_utf8(s1_bytes).ok()?; - let (_, s2_bytes) = rest.split_last()?; let s2 = str::from_utf8(s2_bytes).ok()?; Some((s1, s2, *n)) } @@ -22,12 +21,11 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { type EItem = (&'a str, &'a str, u8); fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); + bytes.push(*n); bytes.extend_from_slice(s1.as_bytes()); bytes.push(0); bytes.extend_from_slice(s2.as_bytes()); - bytes.push(0); - bytes.push(*n); Some(Cow::Owned(bytes)) } } @@ -37,11 +35,10 @@ impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec { type DItem = (&'a [u8], &'a [u8], u8); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (n, bytes) = bytes.split_last()?; + let (n, bytes) = bytes.split_first()?; let s1_end = bytes.iter().position(|b| *b == 0)?; let (s1_bytes, rest) = bytes.split_at(s1_end); - let rest = &rest[1..]; - let (_, s2_bytes) = rest.split_last()?; + let s2_bytes = &rest[1..]; Some((s1_bytes, s2_bytes, *n)) } } @@ -50,12 +47,11 @@ impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec { type EItem = (&'a [u8], &'a [u8], u8); fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); + bytes.push(*n); bytes.extend_from_slice(s1); bytes.push(0); bytes.extend_from_slice(s2); - bytes.push(0); - bytes.push(*n); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index eac3340fd..17f490758 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -194,7 +194,7 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { (word1, prefix, proximity), b, )| { - &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) + &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) }); snap } diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 9448f0e23..3837c1bbe 100644 --- 
a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -151,11 +151,10 @@ fn document_word_positions_into_sorter<'b>( let mut key_buffer = Vec::new(); for ((w1, w2), prox) in word_pair_proximity { key_buffer.clear(); + key_buffer.push(prox as u8); key_buffer.extend_from_slice(w1.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); - key_buffer.push(0); - key_buffer.push(prox as u8); word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; } diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap index 0a61cf4e8..47a6df343 100644 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap @@ -1,46 +1,46 @@ --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- -5 a 1 [101, ] -5 a 2 [101, ] -5 b 4 [101, ] -5 be 4 [101, ] -am a 3 [101, ] -amazing a 1 [100, ] -amazing a 2 [100, ] -amazing a 3 [100, ] -amazing b 2 [100, ] -amazing be 2 [100, ] -an a 1 [100, ] -an a 2 [100, ] -an b 3 [100, ] -an be 3 [100, ] -and a 2 [100, ] -and a 3 [100, ] -and a 4 [100, ] -and b 1 [100, ] -and be 1 [100, ] -at a 1 [100, ] -at a 2 [100, 101, ] -at a 3 [100, ] -at b 3 [101, ] -at b 4 [100, ] -at be 3 [101, ] -at be 4 [100, ] -beautiful a 2 [100, ] -beautiful a 3 [100, ] -beautiful a 4 [100, ] -bell a 2 [101, ] -bell a 4 [101, ] -house a 3 [100, ] -house a 4 [100, ] -house b 2 [100, ] -house be 2 [100, ] -rings a 1 [101, ] -rings a 3 [101, ] -rings b 2 [101, ] -rings be 2 [101, ] -the a 3 [101, ] -the b 1 [101, ] -the be 1 [101, ] +1 5 a [101, ] +1 amazing a [100, ] +1 an a [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 5 a [101, ] +2 amazing a [100, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 and a [100, ] +2 at a [100, 101, ] +2 beautiful a [100, ] +2 bell a [101, ] +2 house b [100, ] +2 house be [100, ] +2 rings b [101, ] +2 rings be [101, ] +3 am a [101, ] +3 amazing a [100, ] +3 an b [100, ] +3 an be [100, ] +3 and a [100, ] +3 at a [100, ] +3 at b [101, ] +3 at be [101, ] +3 beautiful a [100, ] +3 house a [100, ] +3 rings a [101, ] +3 the a [101, ] +4 5 b [101, ] +4 5 be [101, ] +4 and a [100, ] +4 at b [100, ] +4 at be [100, ] +4 beautiful a [100, ] +4 bell a [101, ] +4 house a [100, ] diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap index a39ee07b5..bb2cc3b84 100644 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- -5ed4bf83317b10962a55ade353427bdd +fb88e49fd666886731b62baef8f44995 diff --git 
a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 724858e4f..f919aecc7 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,7 +1,7 @@ /*! ## What is WordPrefixPairProximityDocids? The word-prefix-pair-proximity-docids database is a database whose keys are of -the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of +the form `(proximity, word, prefix)` and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. @@ -23,127 +23,100 @@ dog Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. -* a sorted list of word pairs and the distance between them (i.e. proximity), -* associated with a roaring bitmap, such as: +* a sorted list of proximities and word pairs (the proximity is the distance between the two words), +associated with a roaring bitmap, such as: ```text -good dog 3 -> docids1: [2, 5, 6] -good doggo 1 -> docids2: [8] -good dogma 1 -> docids3: [7, 19, 20] -good ghost 2 -> docids4: [1] -horror cathedral 4 -> docids5: [1, 2] +1 good doggo -> docids1: [8] +1 good door -> docids2: [7, 19, 20] +1 good ghost -> docids3: [1] +2 good dog -> docids4: [2, 5, 6] +2 horror cathedral -> docids5: [1, 2] ``` I illustrate a simplified version of the algorithm to create the word-prefix pair-proximity database below: -1. **Outer loop:** First, we iterate over each word pair and its proximity: +1. **Outer loop:** First, we iterate over each proximity and word pair: ```text +proximity: 1 word1 : good -word2 : dog -proximity: 3 +word2 : doggo ``` 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are -in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) +in the list of sorted prefixes. And we insert the key `prefix` and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: ```text Outer loop 1: ------------------------------ +proximity: 1 word1 : good -word2 : dog -proximity: 3 +word2 : doggo docids : docids1 prefixes: [d, do, dog] batch: [ - (d, 3) -> [docids1] - (do, 3) -> [docids1] - (dog, 3) -> [docids1] + d, -> [docids1] + do -> [docids1] + dog -> [docids1] ] ``` 3. For illustration purpose, let's run through a second iteration of the outer loop: ```text Outer loop 2: ------------------------------ -word1 : good -word2 : doggo proximity: 1 +word1 : good +word2 : door docids : docids2 -prefixes: [d, do, dog] +prefixes: [d, do, doo] batch: [ - (d, 1) -> [docids2] - (d, 3) -> [docids1] - (do, 1) -> [docids2] - (do, 3) -> [docids1] - (dog, 1) -> [docids2] - (dog, 3) -> [docids1] -] -``` -Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some -of the elements inserted in the second iteration of the outer loop appear -*before* elements from the first iteration. - -4. 
And a third: -```text -Outer loop 3: ------------------------------- -word1 : good -word2 : dogma -proximity: 1 -docids : docids3 - -prefixes: [d, do, dog] - -batch: [ - (d, 1) -> [docids2, docids3] - (d, 3) -> [docids1] - (do, 1) -> [docids2, docids3] - (do, 3) -> [docids1] - (dog, 1) -> [docids2, docids3] - (dog, 3) -> [docids1] + d -> [docids1, docids2] + do -> [docids1, docids2] + dog -> [docids1] + doo -> [docids2] ] ``` Notice that there were some conflicts which were resolved by merging the -conflicting values together. +conflicting values together. Also, an additional prefix was added at the +end of the batch. -5. On the fourth iteration of the outer loop, we have: +4. On the third iteration of the outer loop, we have: ```text Outer loop 4: ------------------------------ +proximity: 1 word1 : good word2 : ghost -proximity: 2 ``` Because `word2` begins with a different letter than the previous `word2`, -we know that: - -1. All the prefixes of `word2` are greater than the prefixes of the previous word2 -2. And therefore, every instance of (`word2`, `prefix`) will be greater than -any element in the batch. +we know that all the prefixes of `word2` are greater than the prefixes of the previous word2 Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called -“flushing the batch”. Flushing the batch should also be done whenever `word1` -is different than the previous `word1`. +“flushing the batch”. Flushing the batch should also be done whenever: +* `proximity` is different than the previous `proximity`. +* `word1` is different than the previous `word1`. +* `word2` starts with a different letter than the previous word2 -6. **Flushing the batch:** to flush the batch, we look at the `word1` and -iterate over the elements of the batch in sorted order: +6. **Flushing the batch:** to flush the batch, we iterate over its elements: ```text Flushing Batch loop 1: ------------------------------ -word1 : good -word2 : d -proximity: 1 +proximity : 1 +word1 : good +prefix : d + docids : [docids2, docids3] ``` We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. -Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` +Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` into the database. 7. That's it! ... except... @@ -184,8 +157,8 @@ Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on -(`new_prefixes`, `word_pairs_db`), we insert the computed -((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad +`(new_prefixes, word_pairs_db)`, we insert the computed +`((proximity, word, prefix), docids)` elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. @@ -406,7 +379,7 @@ fn execute_on_word_pairs_and_prefixes( while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? 
{ // skip this iteration if the proximity is over the threshold if proximity > max_proximity { - continue; + break; }; let word2_start_different_than_prev = word2[0] != prev_word2_start; // if there were no potential prefixes for the previous word2 based on its first letter, @@ -416,16 +389,21 @@ fn execute_on_word_pairs_and_prefixes( continue; } - // if word1 is different than the previous word1 OR if the start of word2 is different - // than the previous start of word2, then we'll need to flush the batch + // if the proximity is different to the previous one, OR + // if word1 is different than the previous word1, OR + // if the start of word2 is different than the previous start of word2, + // THEN we'll need to flush the batch + let prox_different_than_prev = proximity != batch.proximity; let word1_different_than_prev = word1 != batch.word1; - if word1_different_than_prev || word2_start_different_than_prev { + if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev + { batch.flush(&mut merge_buffer, &mut insert)?; // don't forget to reset the value of batch.word1 and prev_word2_start if word1_different_than_prev { prefix_search_start.0 = 0; batch.word1.clear(); batch.word1.extend_from_slice(word1); + batch.proximity = proximity; } if word2_start_different_than_prev { // word2_start_different_than_prev == true @@ -437,74 +415,70 @@ fn execute_on_word_pairs_and_prefixes( if !empty_prefixes { // All conditions are satisfied, we can now insert each new prefix of word2 into the batch + prefix_buffer.clear(); prefixes.for_each_prefix_of( word2, &mut prefix_buffer, &prefix_search_start, |prefix_buffer| { - let prefix_len = prefix_buffer.len(); - prefix_buffer.push(0); - prefix_buffer.push(proximity); batch.insert(&prefix_buffer, data.to_vec()); - prefix_buffer.truncate(prefix_len); }, ); - prefix_buffer.clear(); } } batch.flush(&mut merge_buffer, &mut insert)?; Ok(()) } /** -A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). +A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. -It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. +It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. -The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content +The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content can be inserted into the database in sorted order. 
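[Editor's sketch] The keys flushed here use the new (proximity, word1, prefix) layout from the codec change above, with the proximity byte first so entries sort by proximity before words. A standalone sketch of that encoding, mirroring the U8StrStrCodec bytes but without the heed traits:

fn encode(proximity: u8, word1: &str, word2: &str) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(1 + word1.len() + 1 + word2.len());
    bytes.push(proximity);                     // proximity first: keys sort by it
    bytes.extend_from_slice(word1.as_bytes());
    bytes.push(0);                             // NUL separator between the two words
    bytes.extend_from_slice(word2.as_bytes());
    bytes
}

fn decode(bytes: &[u8]) -> Option<(u8, &str, &str)> {
    let (proximity, rest) = bytes.split_first()?;
    let sep = rest.iter().position(|b| *b == 0)?;
    let word1 = std::str::from_utf8(&rest[..sep]).ok()?;
    let word2 = std::str::from_utf8(&rest[sep + 1..]).ok()?;
    Some((*proximity, word1, word2))
}

fn main() {
    let key = encode(2, "good", "dog");
    assert_eq!(decode(&key), Some((2, "good", "dog")));
}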
When it is flushed, it calls a user-provided closure with the following arguments:
-- key   : (word1, prefix, proximity) as bytes
-- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes
+- key   : (proximity, word1, prefix) as bytes
+- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
 */
 #[derive(Default)]
 struct PrefixAndProximityBatch {
+    proximity: u8,
     word1: Vec<u8>,
     batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
 }
 
 impl PrefixAndProximityBatch {
     /// Insert the new key and value into the batch
+    ///
+    /// The key must either exist in the batch or be greater than all existing keys
     fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
-        match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
-            Ok(position) => {
-                self.batch[position].1.push(Cow::Owned(new_value));
-            }
-            Err(position) => {
-                self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)]));
-            }
+        match self.batch.iter_mut().find(|el| el.0 == new_key) {
+            Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
+            None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
         }
     }
 
     /// Empties the batch, calling `insert` on each element.
     ///
-    /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap.
+    /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
     fn flush(
         &mut self,
         merge_buffer: &mut Vec<u8>,
         insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
     ) -> Result<()> {
-        let PrefixAndProximityBatch { word1, batch } = self;
+        let PrefixAndProximityBatch { proximity, word1, batch } = self;
         if batch.is_empty() {
             return Ok(());
         }
         merge_buffer.clear();
 
-        let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1);
+        let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
+        buffer.push(*proximity);
         buffer.extend_from_slice(word1);
         buffer.push(0);
 
         for (key, mergeable_data) in batch.drain(..)
{ - buffer.truncate(word1.len() + 1); + buffer.truncate(1 + word1.len() + 1); buffer.extend_from_slice(key.as_slice()); let data = if mergeable_data.len() > 1 { @@ -884,51 +858,33 @@ mod tests { CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); let word_pairs = [ - // 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) - (("healthy", "arbre", 2), &serialised_bitmap123), - // not inserted because 3 > max_proximity - (("healthy", "arbre", 3), &serialised_bitmap456), - // 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123) (("healthy", "arbres", 1), &serialised_bitmap123), - // 1, 3: - (("healthy", "arbres", 2), &serialised_bitmap456), - // not be inserted because 3 > max_proximity - (("healthy", "arbres", 3), &serialised_bitmap789), - // not inserted because no prefixes for boat (("healthy", "boat", 1), &serialised_bitmap123), - // not inserted because no prefixes for ca (("healthy", "ca", 1), &serialised_bitmap123), - // 4: (healthy cat 1) with (bitmap456 + bitmap123) (("healthy", "cats", 1), &serialised_bitmap456), - // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) - (("healthy", "cats", 2), &serialised_bitmap789), - // 4 + 6: (healthy catto 1) with (bitmap123) (("healthy", "cattos", 1), &serialised_bitmap123), - // 5 + 7: (healthy catto 2) with (bitmap_ranges) - (("healthy", "cattos", 2), &serialised_bitmap_ranges), - // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) (("jittery", "cat", 1), &serialised_bitmap123), - // 8: (("jittery", "cata", 1), &serialised_bitmap456), - // 8: (("jittery", "catb", 1), &serialised_bitmap789), - // 8: (("jittery", "catc", 1), &serialised_bitmap_ranges), + (("healthy", "arbre", 2), &serialised_bitmap123), + (("healthy", "arbres", 2), &serialised_bitmap456), + (("healthy", "cats", 2), &serialised_bitmap789), + (("healthy", "cattos", 2), &serialised_bitmap_ranges), + (("healthy", "arbre", 3), &serialised_bitmap456), + (("healthy", "arbres", 3), &serialised_bitmap789), ]; let expected_result = [ - // first batch: (("healthy", "arb", 1), bitmap123.clone()), - (("healthy", "arb", 2), &bitmap123 | &bitmap456), (("healthy", "arbre", 1), bitmap123.clone()), - (("healthy", "arbre", 2), &bitmap123 | &bitmap456), - // second batch: (("healthy", "cat", 1), &bitmap456 | &bitmap123), - (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), (("healthy", "catto", 1), bitmap123.clone()), - (("healthy", "catto", 2), bitmap_ranges.clone()), - // third batch (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + (("healthy", "arb", 2), &bitmap123 | &bitmap456), + (("healthy", "arbre", 2), &bitmap123 | &bitmap456), + (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), + (("healthy", "catto", 2), bitmap_ranges.clone()), ]; let mut result = vec![]; From 1dbbd8694feb66c07cb2eef2144ff785fba16604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 14:01:53 +0200 Subject: [PATCH 1670/1889] Rename StrStrU8Codec to U8StrStrCodec and reorder its fields --- milli/src/heed_codec/mod.rs | 2 +- milli/src/heed_codec/str_str_u8_codec.rs | 28 +++---- milli/src/index.rs | 6 +- milli/src/lib.rs | 2 +- milli/src/search/criteria/mod.rs | 4 +- milli/src/snapshot_tests.rs | 6 +- .../word_prefix_pair_proximity_docids.rs | 74 +++++++++---------- 7 files changed, 61 insertions(+), 61 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index f3691b7d8..e07e47c79 100644 --- a/milli/src/heed_codec/mod.rs +++ 
b/milli/src/heed_codec/mod.rs
@@ -15,4 +15,4 @@ pub use self::roaring_bitmap_length::{
     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
 };
 pub use self::str_beu32_codec::StrBEU32Codec;
-pub use self::str_str_u8_codec::{StrStrU8Codec, UncheckedStrStrU8Codec};
+pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
diff --git a/milli/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs
index 6cfff3ecf..60be8ddc7 100644
--- a/milli/src/heed_codec/str_str_u8_codec.rs
+++ b/milli/src/heed_codec/str_str_u8_codec.rs
@@ -1,10 +1,10 @@
 use std::borrow::Cow;
 use std::str;
 
-pub struct StrStrU8Codec;
+pub struct U8StrStrCodec;
 
-impl<'a> heed::BytesDecode<'a> for StrStrU8Codec {
-    type DItem = (&'a str, &'a str, u8);
+impl<'a> heed::BytesDecode<'a> for U8StrStrCodec {
+    type DItem = (u8, &'a str, &'a str);
 
     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
         let (n, bytes) = bytes.split_first()?;
@@ -13,14 +13,14 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec {
         let s2_bytes = &rest[1..];
         let s1 = str::from_utf8(s1_bytes).ok()?;
         let s2 = str::from_utf8(s2_bytes).ok()?;
-        Some((s1, s2, *n))
+        Some((*n, s1, s2))
     }
 }
 
-impl<'a> heed::BytesEncode<'a> for StrStrU8Codec {
-    type EItem = (&'a str, &'a str, u8);
+impl<'a> heed::BytesEncode<'a> for U8StrStrCodec {
+    type EItem = (u8, &'a str, &'a str);
 
-    fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
+    fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
         bytes.push(*n);
         bytes.extend_from_slice(s1.as_bytes());
@@ -29,24 +29,24 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec {
         Some(Cow::Owned(bytes))
     }
 }
-pub struct UncheckedStrStrU8Codec;
+pub struct UncheckedU8StrStrCodec;
 
-impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec {
-    type DItem = (&'a [u8], &'a [u8], u8);
+impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec {
+    type DItem = (u8, &'a [u8], &'a [u8]);
 
     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
         let (n, bytes) = bytes.split_first()?;
         let s1_end = bytes.iter().position(|b| *b == 0)?;
         let (s1_bytes, rest) = bytes.split_at(s1_end);
         let s2_bytes = &rest[1..];
-        Some((s1_bytes, s2_bytes, *n))
+        Some((*n, s1_bytes, s2_bytes))
     }
 }
 
-impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec {
-    type EItem = (&'a [u8], &'a [u8], u8);
+impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec {
+    type EItem = (u8, &'a [u8], &'a [u8]);
 
-    fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
+    fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
         bytes.push(*n);
         bytes.extend_from_slice(s1);
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 0dccabf03..f1bc2fa10 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -21,7 +21,7 @@ use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
     DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
     FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
-    Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32,
+    Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32,
 };
 
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -106,9 +106,9 @@ pub struct Index {
     pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
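+    /// As the codec change above shows, these keys are laid out as one
+    /// proximity byte, then the first word, a 0 byte, and the second word.
+    /// Since LMDB compares keys lexicographically byte by byte, entries are
+    /// therefore grouped by proximity first (assuming, as the codec does,
+    /// that words never contain a 0 byte).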
- pub word_pair_proximity_docids: Database, + pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. - pub word_prefix_pair_proximity_docids: Database, + pub word_prefix_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. pub word_position_docids: Database, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 517d28ccc..b5671b33b 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,7 @@ pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, - RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, UncheckedStrStrU8Codec, + RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec, }; pub use self::index::Index; pub use self::search::{ diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 866eaefde..86cec1ddc 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -138,7 +138,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (left, right, proximity); + let key = (proximity, left, right); self.index.word_pair_proximity_docids.get(self.rtxn, &key) } @@ -148,7 +148,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (left, right, proximity); + let key = (proximity, left, right); self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 17f490758..b4eee7dfe 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -182,16 +182,16 @@ pub fn snap_docid_word_positions(index: &Index) -> String { } pub fn snap_word_pair_proximity_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_pair_proximity_docids, |( - (word1, word2, proximity), + (proximity, word1, word2), b, )| { - &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) + &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) }); snap } pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( - (word1, prefix, proximity), + (proximity, word1, prefix), b, )| { &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index f919aecc7..77294296f 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -177,7 +177,7 @@ use log::debug; use crate::update::index_documents::{ create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, }; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -259,9 +259,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut cursor, |cursor| { if let Some((key, value)) = cursor.move_on_next()? 
{
-                    let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key)
+                    let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
                         .ok_or(heed::Error::Decoding)?;
-                    Ok(Some(((word1, word2, proximity), value)))
+                    Ok(Some(((proximity, word1, word2), value)))
                 } else {
                     Ok(None)
                 }
@@ -293,7 +293,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
         let mut db_iter = self
             .index
             .word_pair_proximity_docids
-            .remap_key_type::<UncheckedStrStrU8Codec>()
+            .remap_key_type::<UncheckedU8StrStrCodec>()
             .remap_data_type::<ByteSlice>()
             .iter(self.wtxn)?;
@@ -358,7 +358,7 @@ fn execute_on_word_pairs_and_prefixes(
     mut next_word_pair_proximity: impl for<'a> FnMut(
         &'a mut I,
     ) -> Result<
-        Option<((&'a [u8], &'a [u8], u8), &'a [u8])>,
+        Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>,
     >,
     prefixes: &PrefixTrieNode,
     max_proximity: u8,
@@ -376,14 +376,14 @@ fn execute_on_word_pairs_and_prefixes(
     let mut prefix_buffer = Vec::with_capacity(8);
     let mut merge_buffer = Vec::with_capacity(65_536);
 
-    while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? {
+    while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
         // skip this iteration if the proximity is over the threshold
         if proximity > max_proximity {
             break;
         };
         let word2_start_different_than_prev = word2[0] != prev_word2_start;
         // if there were no potential prefixes for the previous word2 based on its first letter,
         // and if the current word2 starts with the same letter, then there is also no potential
         // prefixes for the current word2, and we can skip to the next iteration
         if empty_prefixes && !word2_start_different_than_prev {
             continue;
@@ -683,7 +683,7 @@ mod tests {
     use super::*;
     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
     use crate::index::tests::TempIndex;
-    use crate::{db_snap, CboRoaringBitmapCodec, StrStrU8Codec};
+    use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec};
 
     fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
         let mut documents = Vec::new();
@@ -858,40 +858,40 @@ mod tests {
         CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
 
         let word_pairs = [
-            (("healthy", "arbres", 1), &serialised_bitmap123),
-            (("healthy", "boat", 1), &serialised_bitmap123),
-            (("healthy", "ca", 1), &serialised_bitmap123),
-            (("healthy", "cats", 1), &serialised_bitmap456),
-            (("healthy", "cattos", 1), &serialised_bitmap123),
-            (("jittery", "cat", 1), &serialised_bitmap123),
-            (("jittery", "cata", 1), &serialised_bitmap456),
-            (("jittery", "catb", 1), &serialised_bitmap789),
-            (("jittery", "catc", 1), &serialised_bitmap_ranges),
-            (("healthy", "arbre", 2), &serialised_bitmap123),
-            (("healthy", "arbres", 2), &serialised_bitmap456),
-            (("healthy", "cats", 2), &serialised_bitmap789),
-            (("healthy", "cattos", 2), &serialised_bitmap_ranges),
-            (("healthy", "arbre", 3), &serialised_bitmap456),
-            (("healthy", "arbres", 3), &serialised_bitmap789),
+            ((1, "healthy", "arbres"), &serialised_bitmap123),
+            ((1, "healthy", "boat"), &serialised_bitmap123),
+            ((1, "healthy", "ca"), &serialised_bitmap123),
+            ((1, "healthy", "cats"), &serialised_bitmap456),
+            ((1, "healthy", "cattos"), &serialised_bitmap123),
+            ((1, "jittery", "cat"), &serialised_bitmap123),
+            ((1, "jittery", "cata"), &serialised_bitmap456),
+            ((1, "jittery", "catb"), &serialised_bitmap789),
+            ((1, "jittery", "catc"), &serialised_bitmap_ranges),
+            ((2, "healthy", "arbre"), &serialised_bitmap123),
+            ((2,
"healthy", "arbres"), &serialised_bitmap456), + ((2, "healthy", "cats"), &serialised_bitmap789), + ((2, "healthy", "cattos"), &serialised_bitmap_ranges), + ((3, "healthy", "arbre"), &serialised_bitmap456), + ((3, "healthy", "arbres"), &serialised_bitmap789), ]; let expected_result = [ - (("healthy", "arb", 1), bitmap123.clone()), - (("healthy", "arbre", 1), bitmap123.clone()), - (("healthy", "cat", 1), &bitmap456 | &bitmap123), - (("healthy", "catto", 1), bitmap123.clone()), - (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), - (("healthy", "arb", 2), &bitmap123 | &bitmap456), - (("healthy", "arbre", 2), &bitmap123 | &bitmap456), - (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), - (("healthy", "catto", 2), bitmap_ranges.clone()), + ((1, "healthy", "arb"), bitmap123.clone()), + ((1, "healthy", "arbre"), bitmap123.clone()), + ((1, "healthy", "cat"), &bitmap456 | &bitmap123), + ((1, "healthy", "catto"), bitmap123.clone()), + ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + ((2, "healthy", "arb"), &bitmap123 | &bitmap456), + ((2, "healthy", "arbre"), &bitmap123 | &bitmap456), + ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges), + ((2, "healthy", "catto"), bitmap_ranges.clone()), ]; let mut result = vec![]; let mut iter = - IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { - ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) + IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| { + ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice()) }); execute_on_word_pairs_and_prefixes( &mut iter, @@ -899,7 +899,7 @@ mod tests { &prefixes, 2, |k, v| { - let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); + let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap(); let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); Ok(()) @@ -908,8 +908,8 @@ mod tests { .unwrap(); for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { - let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; - let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; + let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x; + let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y; assert_eq!(actual_word1, expected_word1); assert_eq!(actual_prefix, expected_prefix); From 264a04922dfb16b54903ce16d0dd1c846060fbd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 15:33:13 +0200 Subject: [PATCH 1671/1889] Add prefix_word_pair_proximity database Similar to the word_prefix_pair_proximity one but instead the keys are: (proximity, prefix, word2) --- milli/src/index.rs | 8 +- milli/src/snapshot_tests.rs | 12 + milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 35 +- milli/src/update/index_documents/mod.rs | 11 +- milli/src/update/mod.rs | 4 +- milli/src/update/prefix_word_pairs/mod.rs | 216 +++++++++ .../update/prefix_word_pairs/prefix_word.rs | 178 ++++++++ .../word_prefix_pair_proximity_docids.snap | 46 ++ ...refix_word_pair_proximity_docids.hash.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 41 ++ .../word_pair_proximity_docids.hash.snap | 4 + ...ord_prefix_pair_proximity_docids.hash.snap | 4 + .../word_prefix.rs} | 427 +++++------------- 14 files changed, 653 insertions(+), 339 
deletions(-) create mode 100644 milli/src/update/prefix_word_pairs/mod.rs create mode 100644 milli/src/update/prefix_word_pairs/prefix_word.rs create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap rename milli/src/update/{word_prefix_pair_proximity_docids.rs => prefix_word_pairs/word_prefix.rs} (67%) diff --git a/milli/src/index.rs b/milli/src/index.rs index f1bc2fa10..3bb668b43 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -71,6 +71,7 @@ pub mod db_name { pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; + pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; @@ -109,6 +110,8 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. + pub prefix_word_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. 
pub word_position_docids: Database, @@ -138,7 +141,7 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(17); + options.max_dbs(18); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -151,6 +154,8 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let prefix_word_pair_proximity_docids = + env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; @@ -175,6 +180,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, field_id_word_count_docids, diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index b4eee7dfe..e9c92a949 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -198,6 +198,15 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { }); snap } +pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( + (proximity, prefix, word2), + b, + )| { + &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) + }); + snap +} pub fn snap_word_position_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) @@ -427,6 +436,9 @@ macro_rules! 
full_snap_of_db { ($index:ident, word_prefix_pair_proximity_docids) => {{ $crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index) }}; + ($index:ident, prefix_word_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_prefix_word_pair_proximity_docids(&$index) + }}; ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 5b7dbc57c..ba59c14cf 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, field_id_word_count_docids, word_prefix_position_docids, @@ -66,6 +67,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; + prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index eae473f51..54328b50d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -183,6 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_pair_proximity_docids, field_id_word_count_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, facet_id_f64_docids, @@ -327,26 +328,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; } - // We delete the documents ids from the word prefix pair proximity database docids - // and remove the empty pairs too. - let db = word_prefix_pair_proximity_docids.remap_key_type::(); - let mut iter = db.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (key, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; + for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { + // We delete the documents ids from the word prefix pair proximity database docids + // and remove the empty pairs too. + let db = db.remap_key_type::(); + let mut iter = db.iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (key, mut docids) = result?; + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } } } - drop(iter); - // We delete the documents ids that are under the pairs of words, // it is faster and use no memory to iterate over all the words pairs than // to compute the cartesian product of every words of the deleted documents. 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e0eefe07b..897f2f8f8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -36,8 +36,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, - WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, + self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, + WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -528,12 +528,7 @@ where if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { // Run the word prefix pair proximity docids update operation. - let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - builder.execute( + PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute( word_pair_proximity_docids, &new_prefix_fst_words, &common_prefix_fst_words, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 1bf27a5f0..3ddc01cef 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,10 +6,10 @@ pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; +pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; -pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; pub use self::words_prefix_position_docids::WordPrefixPositionDocids; pub use self::words_prefixes_fst::WordsPrefixesFst; @@ -19,9 +19,9 @@ mod delete_documents; mod facets; mod index_documents; mod indexer_config; +mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; -mod word_prefix_pair_proximity_docids; mod words_prefix_position_docids; mod words_prefixes_fst; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs new file mode 100644 index 000000000..63286f8da --- /dev/null +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -0,0 +1,216 @@ +use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; +use crate::{Index, Result}; +use heed::types::ByteSlice; +use std::{borrow::Cow, collections::HashSet, io::BufReader}; + +mod prefix_word; +mod word_prefix; + +pub use prefix_word::index_prefix_word_database; +pub use word_prefix::index_word_prefix_database; + +pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + max_proximity: u8, + max_prefix_length: usize, +} +impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self { + Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 } + } + /// Set the maximum proximity required to make a prefix be part of the words prefixes + /// database. 
If two words are further apart than this threshold, the associated documents will
+    /// not be part of the prefix database.
+    ///
+    /// Default value is 4. This value must be lower than or equal to 7 and will be clamped
+    /// to this bound otherwise.
+    pub fn max_proximity(&mut self, value: u8) -> &mut Self {
+        self.max_proximity = value.min(7);
+        self
+    }
+    /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
+    /// prefixes database. If the prefix length is higher than the threshold, the associated documents
+    /// will not be part of the prefix database.
+    ///
+    /// Default value is 2.
+    pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
+        self.max_prefix_length = value;
+        self
+    }
+    #[logging_timer::time("PrefixWordPairsProximityDocids::{}")]
+    pub fn execute<'a>(
+        self,
+        new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
+        new_prefix_fst_words: &'a [String],
+        common_prefix_fst_words: &[&'a [String]],
+        del_prefix_fst_words: &HashSet<Vec<u8>>,
+    ) -> Result<()> {
+        index_word_prefix_database(
+            self.wtxn,
+            self.index.word_pair_proximity_docids,
+            self.index.word_prefix_pair_proximity_docids,
+            self.max_proximity,
+            self.max_prefix_length,
+            new_word_pair_proximity_docids.clone(),
+            new_prefix_fst_words,
+            common_prefix_fst_words,
+            del_prefix_fst_words,
+        )?;
+
+        index_prefix_word_database(
+            self.wtxn,
+            self.index.word_pair_proximity_docids,
+            self.index.prefix_word_pair_proximity_docids,
+            self.max_proximity,
+            self.max_prefix_length,
+            new_word_pair_proximity_docids,
+            new_prefix_fst_words,
+            common_prefix_fst_words,
+            del_prefix_fst_words,
+        )?;
+
+        Ok(())
+    }
+}
+
+// This is adapted from `sorter_into_lmdb_database`
+pub fn insert_into_database(
+    wtxn: &mut heed::RwTxn,
+    database: heed::PolyDatabase,
+    new_key: &[u8],
+    new_value: &[u8],
+) -> Result<()> {
+    let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
+    match iter.next().transpose()? {
+        Some((key, old_val)) if new_key == key => {
+            let val =
+                merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
+                    .map_err(|_| {
+                        // TODO just wrap this error?
+                        crate::error::InternalError::IndexingMergingKeys {
+                            process: "get-put-merge",
+                        }
+                    })?;
+            // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
+            unsafe { iter.put_current(new_key, &val)? };
+        }
+        _ => {
+            drop(iter);
+            database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
+        }
+    }
+    Ok(())
+}
+
+// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
+// but it uses `append` if the database is empty, and it assumes that the values in the
+// writer don't conflict with values in the database.
+pub fn write_into_lmdb_database_without_merging(
+    wtxn: &mut heed::RwTxn,
+    database: heed::PolyDatabase,
+    writer: grenad::Writer<std::fs::File>,
+) -> Result<()> {
+    let file = writer.into_inner()?;
+    let reader = grenad::Reader::new(BufReader::new(file))?;
+    if database.is_empty(wtxn)? {
+        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
+        let mut cursor = reader.into_cursor()?;
+        while let Some((k, v)) = cursor.move_on_next()? {
+            // safety: the key comes from the grenad reader, not the database
+            unsafe { out_iter.append(k, v)? };
+        }
+    } else {
+        let mut cursor = reader.into_cursor()?;
+        while let Some((k, v)) = cursor.move_on_next()?
{ + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::db_snap; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use std::io::Cursor; + + fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { + let mut documents = Vec::new(); + for prefix in prefixes { + for i in 0..50 { + documents.push( + serde_json::json!({ + "text": format!("{prefix}{i:x}"), + }) + .as_object() + .unwrap() + .clone(), + ) + } + } + documents + } + + #[test] + fn test_update() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + + let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); + documents.push( + serde_json::json!({ + "text": "At an extraordinary house" + }) + .as_object() + .unwrap() + .clone(), + ); + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_pair_proximity_docids, "update"); + db_snap!(index, word_prefix_pair_proximity_docids, "update"); + db_snap!(index, prefix_word_pair_proximity_docids, "update"); + } +} diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs new file mode 100644 index 000000000..cbc9ac0b2 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -0,0 +1,178 @@ +use crate::update::index_documents::{create_writer, CursorClonableMmap}; +use crate::update::prefix_word_pairs::{ + insert_into_database, write_into_lmdb_database_without_merging, +}; +use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; +use std::borrow::Cow; +use std::collections::{BTreeMap, HashSet}; + +#[logging_timer::time] +pub fn index_prefix_word_database( + wtxn: &mut heed::RwTxn, + word_pair_proximity_docids: heed::Database, + prefix_word_pair_proximity_docids: heed::Database, + max_proximity: u8, + max_prefix_length: usize, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + let max_proximity = max_proximity - 1; + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + 
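+    // Overview of the two passes below: for each proximity and each common
+    // prefix that is short enough, scan the new word pairs whose word1 starts
+    // with that prefix and merge their docids under a (proximity + 1, prefix,
+    // word2) key; then do the same for the new prefixes against the word
+    // pairs already in the database, going through an intermediary grenad.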
+
+    let common_prefixes: Vec<_> = common_prefix_fst_words
+        .into_iter()
+        .map(|s| s.into_iter())
+        .flatten()
+        .map(|s| s.as_str())
+        .filter(|s| s.len() <= max_prefix_length)
+        .collect();
+
+    // We can iterate over all the new word pairs to look for new
+    // (proximity, prefix, word2) elements to insert in the DB
+    for proximity in 1..=max_proximity - 1 {
+        for prefix in common_prefixes.iter() {
+            let mut prefix_key = vec![];
+            prefix_key.push(proximity);
+            prefix_key.extend_from_slice(prefix.as_bytes());
+            let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
+            // This is the core of the algorithm
+            execute_on_word_pairs_and_prefixes(
+                proximity + 1,
+                prefix.as_bytes(),
+                // the next two arguments tell how to iterate over the new word pairs
+                &mut cursor,
+                |cursor| {
+                    if let Some((key, value)) = cursor.next()? {
+                        let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
+                            .ok_or(heed::Error::Decoding)?;
+                        Ok(Some((word2, value)))
+                    } else {
+                        Ok(None)
+                    }
+                },
+                // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
+                |key, value| {
+                    insert_into_database(
+                        wtxn,
+                        *prefix_word_pair_proximity_docids.as_polymorph(),
+                        key,
+                        value,
+                    )
+                },
+            )?;
+        }
+    }
+
+    // Now we do the same thing with the new prefixes and all word pairs in the DB
+    let new_prefixes: Vec<_> = new_prefix_fst_words
+        .into_iter()
+        .map(|s| s.as_str())
+        .filter(|s| s.len() <= max_prefix_length)
+        .collect();
+
+    // Since we read the DB, we can't write to it directly, so we add each new
+    // (proximity, prefix, word2) element in an intermediary grenad
+    let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
+
+    for proximity in 1..=max_proximity - 1 {
+        for prefix in new_prefixes.iter() {
+            let mut prefix_key = vec![];
+            prefix_key.push(proximity);
+            prefix_key.extend_from_slice(prefix.as_bytes());
+            let mut db_iter = word_pair_proximity_docids
+                .as_polymorph()
+                .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
+                .remap_key_type::<UncheckedU8StrStrCodec>();
+            execute_on_word_pairs_and_prefixes(
+                proximity + 1,
+                prefix.as_bytes(),
+                &mut db_iter,
+                |db_iter| {
+                    db_iter
+                        .next()
+                        .transpose()
+                        .map(|x| x.map(|((_, _, word2), value)| (word2, value)))
+                        .map_err(|e| e.into())
+                },
+                |key, value| writer.insert(key, value).map_err(|e| e.into()),
+            )?;
+            drop(db_iter);
+        }
+    }
+
+    // and then we write the grenad into the DB
+    // Since the grenad contains only new prefixes, we know in advance that none
+    // of its elements already exist in the DB, thus there is no need to specify
+    // how to merge conflicting elements
+    write_into_lmdb_database_without_merging(
+        wtxn,
+        *prefix_word_pair_proximity_docids.as_polymorph(),
+        writer,
+    )?;
+
+    // All of the prefix word pairs in the database whose prefix is contained
+    // in the `del_prefix_fst_words` set must be removed as well.
+    if !del_prefix_fst_words.is_empty() {
+        let mut iter =
+            prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
+        while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
+            if del_prefix_fst_words.contains(prefix.as_bytes()) {
+                // Delete this entry as its prefix is no longer in the words prefix fst.
+                unsafe { iter.del_current()? };
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
+///
+/// Its main arguments are:
+/// 1.
a sorted prefix iterator over ((word1, word2, proximity), docids) elements +/// 2. a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// +/// For more information about what this function does, read the module documentation. +fn execute_on_word_pairs_and_prefixes( + proximity: u8, + prefix: &[u8], + iter: &mut I, + mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch: BTreeMap, Vec>> = <_>::default(); + + while let Some((word2, data)) = next_word2_and_docids(iter)? { + let entry = batch.entry(word2.to_owned()).or_default(); + entry.push(Cow::Owned(data.to_owned())); + } + + let mut key_buffer = Vec::with_capacity(8); + key_buffer.push(proximity); + key_buffer.extend_from_slice(prefix); + key_buffer.push(0); + + let mut value_buffer = Vec::with_capacity(65_536); + + for (key, values) in batch { + key_buffer.truncate(prefix.len() + 2); + value_buffer.clear(); + + key_buffer.extend_from_slice(&key); + let data = if values.len() > 1 { + CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?; + value_buffer.as_slice() + } else { + &values[0] + }; + insert(key_buffer.as_slice(), data)?; + } + Ok(()) +} diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..9a6ffaec9 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,46 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [101, ] +1 amazing a [100, ] +1 an a [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 5 a [101, ] +2 amazing a [100, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 and a [100, ] +2 at a [100, 101, ] +2 beautiful a [100, ] +2 bell a [101, ] +2 house b [100, ] +2 house be [100, ] +2 rings b [101, ] +2 rings be [101, ] +3 am a [101, ] +3 amazing a [100, ] +3 an b [100, ] +3 an be [100, ] +3 and a [100, ] +3 at a [100, ] +3 at b [101, ] +3 at be [101, ] +3 beautiful a [100, ] +3 house a [100, ] +3 rings a [101, ] +3 the a [101, ] +4 5 b [101, ] +4 5 be [101, ] +4 and a [100, ] +4 at b [100, ] +4 at be [100, ] +4 beautiful a [100, ] +4 bell a [101, ] +4 house a [100, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..e460be400 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b94c5d52e --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,41 @@ +--- +source: 
milli/src/update/prefix_word_pairs/mod.rs +--- +2 a 5 [101, ] +2 a amazing [100, ] +2 a an [100, 202, ] +2 a and [100, ] +2 a beautiful [100, ] +2 a extraordinary [202, ] +2 am and [100, ] +2 an amazing [100, ] +2 an beautiful [100, ] +2 an extraordinary [202, ] +2 b house [100, ] +2 b rings [101, ] +2 be house [100, ] +2 be rings [101, ] +3 a 5 [101, ] +3 a am [101, ] +3 a amazing [100, ] +3 a an [100, ] +3 a and [100, ] +3 a at [100, 202, ] +3 a beautiful [100, ] +3 a extraordinary [202, ] +3 a house [100, 202, ] +3 a rings [101, ] +3 am 5 [101, ] +3 am an [100, ] +3 am beautiful [100, ] +3 an amazing [100, ] +3 an and [100, ] +3 an at [100, 202, ] +3 an house [100, 202, ] +3 b and [100, ] +3 b at [101, ] +3 b the [101, ] +3 be and [100, ] +3 be at [101, ] +3 be the [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..015ef8c14 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +6965ecd1bf821f1cf921c2ab751b36cf diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..e460be400 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs similarity index 67% rename from milli/src/update/word_prefix_pair_proximity_docids.rs rename to milli/src/update/prefix_word_pairs/word_prefix.rs index 77294296f..bd1bea2a3 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -1,5 +1,5 @@ /*! - ## What is WordPrefixPairProximityDocids? + ## What is WordPrefix? The word-prefix-pair-proximity-docids database is a database whose keys are of the form `(proximity, word, prefix)` and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with @@ -139,7 +139,7 @@ inputs described above, which come from different places: 2. `word_pairs_db`, which is the list of word pairs from the database. This list includes all elements in `new_word_pairs` since `new_word_pairs` - was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` + was added to the database prior to calling the `WordPrefix::execute` function. To update the prefix database correctly, we call the algorithm described earlier first @@ -161,196 +161,137 @@ reader and writer). Therefore, when calling the algorithm on `((proximity, word, prefix), docids)` elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. 
- - - */ -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::BufReader; +use crate::update::index_documents::{create_writer, CursorClonableMmap}; +use crate::update::prefix_word_pairs::{ + insert_into_database, write_into_lmdb_database_without_merging, +}; +use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; use grenad::CompressionType; use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; +use std::borrow::Cow; +use std::collections::HashSet; -use crate::update::index_documents::{ - create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, -}; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec}; - -pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, +#[logging_timer::time] +pub fn index_word_prefix_database( + wtxn: &mut heed::RwTxn, + word_pair_proximity_docids: heed::Database, + word_prefix_pair_proximity_docids: heed::Database, max_proximity: u8, max_prefix_length: usize, -} + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); -impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { - WordPrefixPairProximityDocids { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - max_nb_chunks: None, - max_memory: None, - max_proximity: 4, - max_prefix_length: 2, - } - } + // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length + let prefixes = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length), + ); - /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are too far from the threshold the associated documents will - /// not be part of the prefix database. - /// - /// Default value is 4. This value must be lower or equal than 7 and will be clamped - /// to this bound otherwise. - pub fn max_proximity(&mut self, value: u8) -> &mut Self { - self.max_proximity = value.max(7); - self - } - - /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words - /// prefixes database. If the prefix length is higher than the threshold, the associated documents - /// will not be part of the prefix database. - /// - /// Default value is 2. 
- pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value; - self - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - mut self, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet>, - ) -> Result<()> { - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length - let prefixes = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (word1, common_prefix, proximity) elements - // to insert in the DB - if !prefixes.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - // the first two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.move_on_next()? { - let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some(((proximity, word1, word2), value))) - } else { - Ok(None) - } - }, - &prefixes, - self.max_proximity, - // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) - |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - - let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words - .into_iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - if !prefixes.is_empty() { - let mut db_iter = self - .index - .word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(self.wtxn)?; - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); - - execute_on_word_pairs_and_prefixes( - &mut db_iter, - |db_iter| db_iter.next().transpose().map_err(|e| e.into()), - &prefixes, - self.max_proximity, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. - if !del_prefix_fst_words.is_empty() { - let mut iter = self - .index - .word_prefix_pair_proximity_docids - .remap_data_type::() - .iter_mut(self.wtxn)?; - while let Some(((_, w2, _), _)) = iter.next().transpose()? 
{ - if del_prefix_fst_words.contains(w2.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (proximity, word1, common_prefix) elements + // to insert in the DB + if !prefixes.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + // the first two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.move_on_next()? { + let (proximity, word1, word2) = + UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + Ok(Some(((proximity, word1, word2), value))) + } else { + Ok(None) } + }, + &prefixes, + max_proximity, + // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) + |key, value| { + insert_into_database( + wtxn, + *word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + + // Now we do the same thing with the new prefixes and all word pairs in the DB + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length), + ); + + if !prefixes.is_empty() { + let mut db_iter = word_pair_proximity_docids + .remap_key_type::() + .remap_data_type::() + .iter(wtxn)?; + + // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) + // element in an intermediary grenad + let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + + execute_on_word_pairs_and_prefixes( + &mut db_iter, + |db_iter| db_iter.next().transpose().map_err(|e| e.into()), + &prefixes, + max_proximity, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + wtxn, + *word_prefix_pair_proximity_docids.as_polymorph(), + writer, + )?; + } + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + if !del_prefix_fst_words.is_empty() { + let mut iter = + word_prefix_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; + while let Some(((_, _, prefix), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; } } - - Ok(()) } + + Ok(()) } /// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. /// /// Its main arguments are: -/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements +/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements /// 2. a prefix trie -/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements /// /// For more information about what this function does, read the module documentation. 
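+/// As a small example: given the sorted pairs ((1, "good", "dog"), docids1)
+/// and ((1, "good", "doggo"), docids2) with the prefix set {"d", "do"}, the
+/// insertion closure receives the keys (1, "good", "d") and (1, "good", "do"),
+/// each associated with the merged docids1 and docids2 bitmaps.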
fn execute_on_word_pairs_and_prefixes( @@ -495,61 +436,6 @@ impl PrefixAndProximityBatch { } } -// This is adapted from `sorter_into_lmdb_database` -fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? - crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour - unsafe { iter.put_current(new_key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer, -) -> Result<()> { - let file = writer.into_inner()?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - // safety: the key comes from the grenad reader, not the database - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -} - /** A prefix trie. Used to iterate quickly over the prefixes of a word that are within a set. 
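As a rough illustration of the structure described just above (a hedged sketch, not this module's actual `PrefixTrieNode`, and assuming ASCII input), a byte-keyed trie that reports every prefix of a word contained in a set can be written as:

use std::collections::BTreeMap;

#[derive(Default)]
struct TrieSketch {
    children: BTreeMap<u8, TrieSketch>,
    is_prefix: bool,
}

impl TrieSketch {
    fn insert(&mut self, prefix: &str) {
        let mut node = self;
        for byte in prefix.bytes() {
            node = node.children.entry(byte).or_default();
        }
        node.is_prefix = true;
    }

    // Walk the bytes of `word`, collecting every prefix of it that belongs
    // to the set; the walk stops at the first missing child, which is what
    // makes the lookup fast.
    fn prefixes_of<'a>(&self, word: &'a str) -> Vec<&'a str> {
        let mut results = Vec::new();
        let mut node = self;
        for (i, byte) in word.bytes().enumerate() {
            match node.children.get(&byte) {
                Some(child) => {
                    node = child;
                    if node.is_prefix {
                        results.push(&word[..=i]);
                    }
                }
                None => break,
            }
        }
        results
    }
}

With the prefixes `a`, `am` and `be` inserted, `prefixes_of("amazing")` returns `["a", "am"]`, while `prefixes_of("house")` returns nothing.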
@@ -676,90 +562,9 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { - use std::io::Cursor; - - use roaring::RoaringBitmap; - use super::*; - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec}; - - fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { - let mut documents = Vec::new(); - for prefix in prefixes { - for i in 0..50 { - documents.push( - serde_json::json!({ - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ) - } - } - documents - } - - #[test] - fn test_update() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - - let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); - documents.push( - serde_json::json!({ - "text": "At an extraordinary house" - }) - .as_object() - .unwrap() - .clone(), - ); - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "update"); - } + use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; + use roaring::RoaringBitmap; fn check_prefixes( trie: &PrefixTrieNode, @@ -899,9 +704,9 @@ mod tests { &prefixes, 2, |k, v| { - let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap(); + let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap(); let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); - result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); + result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap)); Ok(()) }, ) From a7de4f5b854715198c1c537f8bb7010d7f614b75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 15:35:15 +0200 Subject: [PATCH 1672/1889] Don't add swapped word pairs to the word_pair_proximity_docids db --- .../extract/extract_word_pair_proximity_docids.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 3837c1bbe..25117c706 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -106,17 
+106,6 @@ fn document_word_positions_into_sorter<'b>( *p = cmp::min(*p, prox); }) .or_insert(prox); - - // We also compute the inverse proximity. - let prox = prox + 1; - if prox < MAX_DISTANCE { - word_pair_proximity - .entry((word.clone(), head.word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); - } } } From 6c3a5d69e1ff8465c9e2e2a78a6d880c0f730250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 15:43:10 +0200 Subject: [PATCH 1673/1889] Update snapshots --- .../word_prefix_pair_proximity_docids.snap | 20 -------- ...refix_word_pair_proximity_docids.hash.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 12 ----- .../word_pair_proximity_docids.hash.snap | 4 -- .../update/word_pair_proximity_docids.snap | 39 ++++++++++++++++ ...ord_prefix_pair_proximity_docids.hash.snap | 4 -- .../word_prefix_pair_proximity_docids.snap | 35 ++++++++++++++ .../word_prefix_pair_proximity_docids.snap | 46 ------------------- ...ord_prefix_pair_proximity_docids.hash.snap | 4 -- 9 files changed, 74 insertions(+), 94 deletions(-) delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap index 9a6ffaec9..c760ae440 100644 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap @@ -10,37 +10,17 @@ source: milli/src/update/prefix_word_pairs/mod.rs 1 rings a [101, ] 1 the b [101, ] 1 the be [101, ] -2 5 a [101, ] -2 amazing a [100, ] 2 amazing b [100, ] 2 amazing be [100, ] 2 an a [100, ] -2 and a [100, ] 2 at a [100, 101, ] -2 beautiful a [100, ] 2 bell a [101, ] -2 house b [100, ] -2 house be [100, ] -2 rings b [101, ] -2 rings be [101, ] -3 am a [101, ] -3 amazing a [100, ] 3 an b [100, ] 3 an be [100, ] -3 and a [100, ] 3 at a [100, ] -3 at b [101, ] -3 at be [101, ] -3 beautiful a [100, ] -3 house a [100, ] 3 rings a [101, ] 3 the a [101, ] -4 5 b [101, ] -4 5 be [101, ] -4 and a [100, ] 4 at b [100, ] 4 at be [100, ] -4 beautiful a [100, ] 4 bell a [101, ] -4 house a [100, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap deleted file mode 100644 index e460be400..000000000 
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap index b94c5d52e..c5f45a9eb 100644 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap @@ -15,27 +15,15 @@ source: milli/src/update/prefix_word_pairs/mod.rs 2 b rings [101, ] 2 be house [100, ] 2 be rings [101, ] -3 a 5 [101, ] 3 a am [101, ] 3 a amazing [100, ] -3 a an [100, ] 3 a and [100, ] -3 a at [100, 202, ] 3 a beautiful [100, ] 3 a extraordinary [202, ] 3 a house [100, 202, ] -3 a rings [101, ] -3 am 5 [101, ] -3 am an [100, ] 3 am beautiful [100, ] -3 an amazing [100, ] 3 an and [100, ] -3 an at [100, 202, ] 3 an house [100, 202, ] -3 b and [100, ] 3 b at [101, ] -3 b the [101, ] -3 be and [100, ] 3 be at [101, ] -3 be the [101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap deleted file mode 100644 index 015ef8c14..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -6965ecd1bf821f1cf921c2ab751b36cf diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap new file mode 100644 index 000000000..4fcd0fbd2 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap @@ -0,0 +1,39 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 am [101, ] +1 amazing and [100, ] +1 an amazing [100, ] +1 an extraordinary [202, ] +1 and beautiful [100, ] +1 at 5 [101, ] +1 at an [100, 202, ] +1 beautiful house [100, ] +1 bell rings [101, ] +1 extraordinary house [202, ] +1 rings at [101, ] +1 the bell [101, ] +2 amazing beautiful [100, ] +2 an and [100, ] +2 an house [202, ] +2 and house [100, ] +2 at am [101, ] +2 at amazing [100, ] +2 at extraordinary [202, ] +2 bell at [101, ] +2 rings 5 [101, ] +2 the rings [101, ] +3 amazing house [100, ] +3 an beautiful [100, ] +3 at and [100, ] +3 at house [202, ] +3 bell 5 [101, ] +3 rings am [101, ] +3 the at [101, ] +4 an house [100, ] +4 at beautiful [100, ] +4 bell am [101, ] +4 the 5 [101, ] +5 at house [100, ] +5 the am [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap deleted file mode 100644 index e460be400..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs 
---- -fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..0f2e458a8 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,35 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [101, ] +1 5 am [101, ] +1 amazing a [100, ] +1 amazing an [100, ] +1 an a [100, ] +1 an am [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, 202, ] +1 at an [100, 202, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 an an [100, ] +2 at a [100, 101, ] +2 at am [100, 101, ] +2 bell a [101, ] +3 an b [100, ] +3 an be [100, ] +3 at a [100, ] +3 at an [100, ] +3 rings a [101, ] +3 rings am [101, ] +3 the a [101, ] +4 at b [100, ] +4 at be [100, ] +4 bell a [101, ] +4 bell am [101, ] + diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 47a6df343..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,46 +0,0 @@ ---- -source: milli/src/update/word_prefix_pair_proximity_docids.rs ---- -1 5 a [101, ] -1 amazing a [100, ] -1 an a [100, ] -1 and b [100, ] -1 and be [100, ] -1 at a [100, ] -1 rings a [101, ] -1 the b [101, ] -1 the be [101, ] -2 5 a [101, ] -2 amazing a [100, ] -2 amazing b [100, ] -2 amazing be [100, ] -2 an a [100, ] -2 and a [100, ] -2 at a [100, 101, ] -2 beautiful a [100, ] -2 bell a [101, ] -2 house b [100, ] -2 house be [100, ] -2 rings b [101, ] -2 rings be [101, ] -3 am a [101, ] -3 amazing a [100, ] -3 an b [100, ] -3 an be [100, ] -3 and a [100, ] -3 at a [100, ] -3 at b [101, ] -3 at be [101, ] -3 beautiful a [100, ] -3 house a [100, ] -3 rings a [101, ] -3 the a [101, ] -4 5 b [101, ] -4 5 be [101, ] -4 and a [100, ] -4 at b [100, ] -4 at be [100, ] -4 beautiful a [100, ] -4 bell a [101, ] -4 house a [100, ] - diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap deleted file mode 100644 index bb2cc3b84..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/word_prefix_pair_proximity_docids.rs ---- -fb88e49fd666886731b62baef8f44995 From 072b57651407a4c0d1e81f3d385a07776b9bb7a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 16:00:56 +0200 Subject: [PATCH 1674/1889] Fix proximity value in keys of prefix_word_pair_proximity_docids --- .../update/prefix_word_pairs/prefix_word.rs | 4 +- .../prefix_word_pair_proximity_docids.snap | 42 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 
cbc9ac0b2..18f5bdc5a 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -44,7 +44,7 @@ pub fn index_prefix_word_database( let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; // This is the core of the algorithm execute_on_word_pairs_and_prefixes( - proximity + 1, + proximity, prefix.as_bytes(), // the next two arguments tell how to iterate over the new word pairs &mut cursor, @@ -91,7 +91,7 @@ pub fn index_prefix_word_database( .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? .remap_key_type::(); execute_on_word_pairs_and_prefixes( - proximity + 1, + proximity, prefix.as_bytes(), &mut db_iter, |db_iter| { diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap index c5f45a9eb..7644c433d 100644 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap @@ -1,29 +1,29 @@ --- source: milli/src/update/prefix_word_pairs/mod.rs --- -2 a 5 [101, ] +1 a 5 [101, ] +1 a amazing [100, ] +1 a an [100, 202, ] +1 a and [100, ] +1 a beautiful [100, ] +1 a extraordinary [202, ] +1 am and [100, ] +1 an amazing [100, ] +1 an beautiful [100, ] +1 an extraordinary [202, ] +1 b house [100, ] +1 b rings [101, ] +1 be house [100, ] +1 be rings [101, ] +2 a am [101, ] 2 a amazing [100, ] -2 a an [100, 202, ] 2 a and [100, ] 2 a beautiful [100, ] 2 a extraordinary [202, ] -2 am and [100, ] -2 an amazing [100, ] -2 an beautiful [100, ] -2 an extraordinary [202, ] -2 b house [100, ] -2 b rings [101, ] -2 be house [100, ] -2 be rings [101, ] -3 a am [101, ] -3 a amazing [100, ] -3 a and [100, ] -3 a beautiful [100, ] -3 a extraordinary [202, ] -3 a house [100, 202, ] -3 am beautiful [100, ] -3 an and [100, ] -3 an house [100, 202, ] -3 b at [101, ] -3 be at [101, ] +2 a house [100, 202, ] +2 am beautiful [100, ] +2 an and [100, ] +2 an house [100, 202, ] +2 b at [101, ] +2 be at [101, ] From 18d578dfc439db736892d58ccda6c60d323dca26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 15 Sep 2022 09:34:35 +0200 Subject: [PATCH 1675/1889] Adjust some algorithms using DBs of word pair proximities --- milli/src/search/criteria/exactness.rs | 1 + milli/src/search/criteria/mod.rs | 183 ++++++++++++++++++++----- 2 files changed, 153 insertions(+), 31 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index e7775423c..5327f13e4 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -226,6 +226,7 @@ fn resolve_state( } // compute intersection on pair of words with a proximity of 0. 
Phrase(phrase) => { + // TODO: use resolve_phrase here let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); for words in phrase.windows(2) { if let [left, right] = words { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 86cec1ddc..cefc071ee 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -71,6 +71,7 @@ pub trait Context<'c> { fn exact_word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn exact_word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; + fn word_pair_proximity_docids( &self, left: &str, @@ -83,6 +84,12 @@ pub trait Context<'c> { right: &str, proximity: u8, ) -> heed::Result<Option<RoaringBitmap>>; + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + right: &str, + proximity: u8, + ) -> heed::Result<Option<RoaringBitmap>>; fn words_fst<'t>(&self) -> &'t fst::Set<Cow<'t, [u8]>>; fn in_prefix_cache(&self, word: &str) -> bool; fn docid_words_positions( @@ -111,6 +118,68 @@ pub struct CriteriaBuilder<'t> { words_prefixes_fst: fst::Set<Cow<'t, [u8]>>, } +/// Return the docids for the following word pairs and proximities using [`Context::word_pair_proximity_docids`]. +/// * `left, right, prox` (rightward proximity) +/// * `right, left, prox-1` (leftward proximity) +/// +/// ## Example +/// For a document with the text `the good fox eats the apple`, we have: +/// * `rightward_proximity(the, eats) = 3` +/// * `leftward_proximity(eats, the) = 1` +/// +/// So both the expressions `word_pair_overall_proximity_docids(ctx, the, eats, 3)` +/// and `word_pair_overall_proximity_docids(ctx, the, eats, 2)` would return a bitmap containing +/// the id of this document. +fn word_pair_overall_proximity_docids( + ctx: &dyn Context, + left: &str, + right: &str, + prox: u8, +) -> heed::Result<Option<RoaringBitmap>> { + let rightward = ctx.word_pair_proximity_docids(left, right, prox)?; + let leftward = + if prox > 1 { ctx.word_pair_proximity_docids(right, left, prox - 1)? } else { None }; + if let Some(mut all) = rightward { + if let Some(leftward) = leftward { + all |= leftward; + } + Ok(Some(all)) + } else { + Ok(leftward) + } +} + +/// This function works identically to [`word_pair_overall_proximity_docids`] except that the +/// right word is replaced by a prefix string. +/// +/// It will return None if no documents were found or if the prefix does not exist in the +/// `word_prefix_pair_proximity_docids` database. +fn word_prefix_pair_overall_proximity_docids( + ctx: &dyn Context, + left: &str, + prefix: &str, + proximity: u8, +) -> heed::Result<Option<RoaringBitmap>> { + // We retrieve the docids for the original and swapped word pairs: + // A: word1 prefix2 proximity + // B: prefix2 word1 proximity-1 + let rightward = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?; + + let leftward = if proximity > 1 { + ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)?
+ } else { + None + }; + if let Some(mut all) = rightward { + if let Some(leftward) = leftward { + all |= leftward; + } + Ok(Some(all)) + } else { + Ok(leftward) + } +} + impl<'c> Context<'c> for CriteriaBuilder<'c> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) @@ -138,18 +207,24 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (proximity, left, right); - self.index.word_pair_proximity_docids.get(self.rtxn, &key) + self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right)) } fn word_prefix_pair_proximity_docids( &self, left: &str, + prefix: &str, + proximity: u8, + ) -> heed::Result> { + self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix)) + } + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, right: &str, proximity: u8, ) -> heed::Result> { - let key = (proximity, left, right); - self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) + self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right)) } fn words_fst<'t>(&self) -> &'t fst::Set> { @@ -353,17 +428,34 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), + if s1 == s2 { + continue; + } + if dist == 0 { + match ctx.word_pair_proximity_docids(s1, s2, 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this pair, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } else { + let mut bitmap = RoaringBitmap::new(); + for dist in 0..=dist { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + Some(m) => bitmap |= m, + None => {} + } + } + if bitmap.is_empty() { + return Ok(bitmap); + } else { + bitmaps.push(bitmap); + } } } } @@ -387,7 +479,7 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result, U: AsRef>( +fn all_word_pair_overall_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], right_words: &[(U, u8)], @@ -396,9 +488,9 @@ fn all_word_pair_proximity_docids, U: AsRef>( let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { - let current_docids = ctx - .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? - .unwrap_or_default(); + let current_docids = + word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)? + .unwrap_or_default(); docids |= current_docids; } } @@ -472,7 +564,8 @@ fn query_pair_proximity_docids( match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. 
}) => { if prefix { - match ctx.word_prefix_pair_proximity_docids( + match word_prefix_pair_overall_proximity_docids( + ctx, left.as_str(), right.as_str(), proximity, @@ -480,7 +573,12 @@ fn query_pair_proximity_docids( Some(docids) => Ok(docids), None => { let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids( + ctx, + &[(left, 0)], + &r_words, + proximity, + ) } } } else { @@ -495,7 +593,8 @@ fn query_pair_proximity_docids( if prefix { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let current_docids = match ctx.word_prefix_pair_proximity_docids( + let current_docids = match word_prefix_pair_overall_proximity_docids( + ctx, left.as_str(), right.as_str(), proximity, @@ -504,19 +603,24 @@ fn query_pair_proximity_docids( None => { let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids( + ctx, + &[(left, 0)], + &r_words, + proximity, + ) } }?; docids |= current_docids; } Ok(docids) } else { - all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) + all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } } (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } ( QueryKind::Tolerant { typo: l_typo, word: left }, @@ -525,7 +629,7 @@ fn query_pair_proximity_docids( let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + all_word_pair_overall_proximity_docids(ctx, &l_words, &r_words, proximity) } } } @@ -552,6 +656,7 @@ pub mod test { exact_word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, docid_words: HashMap>, } @@ -588,13 +693,22 @@ pub mod test { fn word_prefix_pair_proximity_docids( &self, - left: &str, - right: &str, + word: &str, + prefix: &str, proximity: u8, ) -> heed::Result> { - let key = (left.to_string(), right.to_string(), proximity.into()); + let key = (word.to_string(), prefix.to_string(), proximity.into()); Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) } + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + word: &str, + proximity: u8, + ) -> heed::Result> { + let key = (prefix.to_string(), word.to_string(), proximity.into()); + Ok(self.prefix_word_pair_proximity_docids.get(&key).cloned()) + } fn words_fst<'t>(&self) -> &'t fst::Set> { &self.words_fst @@ -708,6 +822,8 @@ pub mod test { let mut word_pair_proximity_docids = HashMap::new(); let mut word_prefix_pair_proximity_docids = HashMap::new(); + let mut prefix_word_pair_proximity_docids = HashMap::new(); + for (lword, lcandidates) in &word_docids { for (rword, rcandidates) in &word_docids { if lword == rword { @@ -740,15 +856,19 @@ pub mod test { let lposition = docid_words.iter().position(|w| w == 
lword).unwrap(); let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); - let key = if lposition < rposition { - (s(lword), s(pword), (rposition - lposition) as i32) + if lposition < rposition { + let key = (s(lword), s(pword), (rposition - lposition) as i32); + let docids = word_prefix_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); } else { - (s(lword), s(pword), (lposition - rposition + 1) as i32) + let key = (s(lword), s(pword), (lposition - rposition) as i32); + let docids = prefix_word_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); }; - let docids = word_prefix_pair_proximity_docids - .entry(key) - .or_insert(RoaringBitmap::new()); - docids.push(candidate); } } } @@ -766,6 +886,7 @@ pub mod test { exact_word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, docid_words, } } From 830a7c0c7ab8a39f3444d767d802fdac86ea2c6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 15 Sep 2022 13:34:52 +0200 Subject: [PATCH 1676/1889] Use `resolve_phrase` function for exactness criteria as well --- milli/src/search/criteria/exactness.rs | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 5327f13e4..d5b2ff0ee 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -7,7 +7,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::criteria::{ - resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, + resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; use crate::{absolute_from_relative_position, FieldId, Result}; @@ -226,20 +226,7 @@ fn resolve_state( } // compute intersection on pair of words with a proximity of 0. Phrase(phrase) => { - // TODO: use resolve_phrase here - let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); - for words in phrase.windows(2) { - if let [left, right] = words { - match ctx.word_pair_proximity_docids(left, right, 0)? 
{ - Some(docids) => bitmaps.push(docids), - None => { - bitmaps.clear(); - break; - } - } - } - } - candidates |= intersection_of(bitmaps.iter().collect()); + candidates |= resolve_phrase(ctx, phrase)?; } } parts_candidates_array.push(candidates); From 178d00f93aebe3a688b32d548006d8e0e0d34393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 19 Sep 2022 11:03:52 +0200 Subject: [PATCH 1677/1889] Cargo fmt --- milli/src/update/prefix_word_pairs/mod.rs | 11 ++++++++--- .../src/update/prefix_word_pairs/prefix_word.rs | 14 ++++++++------ .../src/update/prefix_word_pairs/word_prefix.rs | 17 ++++++++++------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 63286f8da..1549acf40 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -1,7 +1,11 @@ +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; + +use heed::types::ByteSlice; + use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; use crate::{Index, Result}; -use heed::types::ByteSlice; -use std::{borrow::Cow, collections::HashSet, io::BufReader}; mod prefix_word; mod word_prefix; @@ -131,10 +135,11 @@ pub fn write_into_lmdb_database_without_merging( #[cfg(test)] mod tests { + use std::io::Cursor; + use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; - use std::io::Cursor; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 18f5bdc5a..0cd55c929 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -1,14 +1,16 @@ +use std::borrow::Cow; +use std::collections::{BTreeMap, HashSet}; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; + use crate::update::index_documents::{create_writer, CursorClonableMmap}; use crate::update::prefix_word_pairs::{ insert_into_database, write_into_lmdb_database_without_merging, }; use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; -use std::borrow::Cow; -use std::collections::{BTreeMap, HashSet}; #[logging_timer::time] pub fn index_prefix_word_database( diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index bd1bea2a3..1c7a4fffe 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -163,17 +163,19 @@ Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. 
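To make that read-then-write pattern concrete, here is a hedged, self-contained sketch of it, with a `BTreeMap` standing in for the LMDB database and a plain `Vec` standing in for the grenad writer; all names here are illustrative:

use std::collections::BTreeMap;

// New entries cannot be inserted into the map while it is being iterated,
// so they are staged in a separate buffer first (the role played by the
// grenad writer above) and only written back once the iteration is over.
fn stage_then_write(db: &mut BTreeMap<String, u64>) {
    let mut staged: Vec<(String, u64)> = Vec::new();
    for (key, value) in db.iter() {
        staged.push((format!("{key}-derived"), *value));
    }
    for (key, value) in staged {
        db.insert(key, value);
    }
}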
*/ +use std::borrow::Cow; +use std::collections::HashSet; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; + use crate::update::index_documents::{create_writer, CursorClonableMmap}; use crate::update::prefix_word_pairs::{ insert_into_database, write_into_lmdb_database_without_merging, }; use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; -use std::borrow::Cow; -use std::collections::HashSet; #[logging_timer::time] pub fn index_word_prefix_database( @@ -562,9 +564,10 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { + use roaring::RoaringBitmap; + use super::*; use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; - use roaring::RoaringBitmap; fn check_prefixes( trie: &PrefixTrieNode, From e6e76fbefecd0a727c0a1c32492604a65d6dc0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 19 Sep 2022 15:59:05 +0200 Subject: [PATCH 1678/1889] Improve performance of resolve_phrase at the cost of some relevancy --- milli/src/search/criteria/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index cefc071ee..234252ff2 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -427,12 +427,14 @@ pub fn resolve_query_tree( pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { let mut candidates = RoaringBitmap::new(); let mut first_iter = true; - let winsize = phrase.len().min(7); + let winsize = phrase.len().min(3); for win in phrase.windows(winsize) { // Get all the documents with the matching distance for each word pairs. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); for (offset, s1) in win.iter().enumerate() { for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + // TODO: add proximity between identical words to the word + // pair proximity database if s1 == s2 { continue; } From ab2f6f3aa4f489d4ea69f065a417a91b3efc6796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 19 Sep 2022 16:22:07 +0200 Subject: [PATCH 1679/1889] Refine some details in word_prefix_pair_proximity indexing code --- .../update/prefix_word_pairs/prefix_word.rs | 33 +++++++++---------- .../update/prefix_word_pairs/word_prefix.rs | 3 +- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 0cd55c929..8883cc451 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -35,9 +35,6 @@ pub fn index_prefix_word_database( .filter(|s| s.len() <= max_prefix_length) .collect(); - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (word1, common_prefix, proximity) elements - // to insert in the DB for proximity in 1..=max_proximity - 1 { for prefix in common_prefixes.iter() { let mut prefix_key = vec![]; @@ -135,13 +132,11 @@ pub fn index_prefix_word_database( Ok(()) } -/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. /// -/// Its main arguments are: -/// 1. a sorted prefix iterator over ((word1, word2, proximity), docids) elements -/// 2. 
a closure to describe how to handle the new computed (word1, prefix, proximity) elements -/// -/// For more information about what this function does, read the module documentation. +/// Its arguments are: +/// - an iterator over the words following the given `prefix` with the given `proximity` +/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements fn execute_on_word_pairs_and_prefixes( proximity: u8, prefix: &[u8], @@ -151,28 +146,32 @@ fn execute_on_word_pairs_and_prefixes( ) -> Result<()> { let mut batch: BTreeMap, Vec>> = <_>::default(); - while let Some((word2, data)) = next_word2_and_docids(iter)? { + // Memory usage check: + // The content of the loop will be called for each `word2` that follows a word beginning + // with `prefix` with the given proximity. + // In practice, I don't think the batch can ever get too big. + while let Some((word2, docids)) = next_word2_and_docids(iter)? { let entry = batch.entry(word2.to_owned()).or_default(); - entry.push(Cow::Owned(data.to_owned())); + entry.push(Cow::Owned(docids.to_owned())); } - let mut key_buffer = Vec::with_capacity(8); + let mut key_buffer = Vec::with_capacity(512); key_buffer.push(proximity); key_buffer.extend_from_slice(prefix); key_buffer.push(0); let mut value_buffer = Vec::with_capacity(65_536); - for (key, values) in batch { + for (word2, docids) in batch { key_buffer.truncate(prefix.len() + 2); value_buffer.clear(); - key_buffer.extend_from_slice(&key); - let data = if values.len() > 1 { - CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?; + key_buffer.extend_from_slice(&word2); + let data = if docids.len() > 1 { + CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; value_buffer.as_slice() } else { - &values[0] + &docids[0] }; insert(key_buffer.as_slice(), data)?; } diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 1c7a4fffe..eb0b05d89 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -1,5 +1,4 @@ /*! - ## What is WordPrefix? The word-prefix-pair-proximity-docids database is a database whose keys are of the form `(proximity, word, prefix)` and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with @@ -320,7 +319,7 @@ fn execute_on_word_pairs_and_prefixes( let mut merge_buffer = Vec::with_capacity(65_536); while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? 
{ - // skip this iteration if the proximity is over the threshold + // stop indexing if the proximity is over the threshold if proximity > max_proximity { break; }; From 176ffd23f554ea1535454b6392387b130652409a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 18 Oct 2022 10:40:26 +0200 Subject: [PATCH 1680/1889] Fix compile error after rebasing wppd-refactor --- milli/src/search/query_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 080f89080..034b9123b 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -203,7 +203,7 @@ impl<'a> Context for QueryTreeBuilder<'a> { right_word: &str, proximity: u8, ) -> heed::Result> { - let key = (left_word, right_word, proximity); + let key = (proximity, left_word, right_word); self.index .word_pair_proximity_docids .remap_data_type::() From a983129613d1f9340484e648536fcce4c4f4303d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 20 Oct 2022 09:49:37 +0200 Subject: [PATCH 1681/1889] Apply suggestions from code review --- milli/src/search/criteria/mod.rs | 5 ++-- milli/src/update/index_documents/mod.rs | 8 ++++++- milli/src/update/prefix_word_pairs/mod.rs | 24 +++++++++++++++++-- .../update/prefix_word_pairs/prefix_word.rs | 9 ++++--- .../update/prefix_word_pairs/word_prefix.rs | 7 ++++-- 5 files changed, 42 insertions(+), 11 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 234252ff2..4069306b3 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -448,9 +448,8 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result bitmap |= m, - None => {} + if let Some(m) = ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + bitmap |= m } } if bitmap.is_empty() { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 897f2f8f8..5550c8725 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -528,7 +528,13 @@ where if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { // Run the word prefix pair proximity docids update operation. 
- PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute( + PrefixWordPairsProximityDocids::new( + self.wtxn, + self.index, + self.indexer_config.chunk_compression_type, + self.indexer_config.chunk_compression_level, + ) + .execute( word_pair_proximity_docids, &new_prefix_fst_words, &common_prefix_fst_words, diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 1549acf40..03abdbb6e 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::io::BufReader; +use grenad::CompressionType; use heed::types::ByteSlice; use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; @@ -18,10 +19,24 @@ pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { index: &'i Index, max_proximity: u8, max_prefix_length: usize, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, } impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self { - Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 } + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, + ) -> Self { + Self { + wtxn, + index, + max_proximity: 4, + max_prefix_length: 2, + chunk_compression_type, + chunk_compression_level, + } } /// Set the maximum proximity required to make a prefix be part of the words prefixes /// database. If two words are too far from the threshold the associated documents will @@ -42,6 +57,7 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { self.max_prefix_length = value; self } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute<'a>( self, @@ -60,6 +76,8 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words, + self.chunk_compression_type, + self.chunk_compression_level, )?; index_prefix_word_database( @@ -72,6 +90,8 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words, + self.chunk_compression_type, + self.chunk_compression_level, )?; Ok(()) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 8883cc451..9bc184825 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -23,6 +23,8 @@ pub fn index_prefix_word_database( new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, ) -> Result<()> { let max_proximity = max_proximity - 1; debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -35,7 +37,7 @@ pub fn index_prefix_word_database( .filter(|s| s.len() <= max_prefix_length) .collect(); - for proximity in 1..=max_proximity - 1 { + for proximity in 1..max_proximity { for prefix in common_prefixes.iter() { let mut prefix_key = vec![]; prefix_key.push(proximity); @@ -78,7 +80,8 @@ pub fn index_prefix_word_database( // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) // element in an intermediary grenad - let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + let 
mut writer = + create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); for proximity in 1..=max_proximity - 1 { for prefix in new_prefixes.iter() { @@ -144,7 +147,7 @@ fn execute_on_word_pairs_and_prefixes( mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, ) -> Result<()> { - let mut batch: BTreeMap, Vec>> = <_>::default(); + let mut batch: BTreeMap, Vec>> = BTreeMap::default(); // Memory usage check: // The content of the loop will be called for each `word2` that follows a word beginning diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index eb0b05d89..5895cdc46 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -187,6 +187,8 @@ pub fn index_word_prefix_database( new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -249,7 +251,8 @@ pub fn index_word_prefix_database( // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) // element in an intermediary grenad - let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + let mut writer = + create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); execute_on_word_pairs_and_prefixes( &mut db_iter, @@ -325,7 +328,7 @@ fn execute_on_word_pairs_and_prefixes( }; let word2_start_different_than_prev = word2[0] != prev_word2_start; // if there were no potential prefixes for the previous word2 based on its first letter, - // and if the current word2 starts with the s`ame letter, then there is also no potential + // and if the current word2 starts with the same letter, then there is also no potential // prefixes for the current word2, and we can skip to the next iteration if empty_prefixes && !word2_start_different_than_prev { continue; From f3874d58b993cfb4aecdc014821967b103fc7fab Mon Sep 17 00:00:00 2001 From: curquiza Date: Mon, 24 Oct 2022 10:13:25 +0000 Subject: [PATCH 1682/1889] Update version for the next release (v0.34.0) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index ee10a1169..ee6bae7c0 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.33.4" +version = "0.34.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 62c4b6a73..97931a371 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.33.4" +version = "0.34.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 3cccd38a7..1245b097b 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.33.4" +version = "0.34.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff 
--git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index cd22b4273..3953ad0f6 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.33.4" +version = "0.34.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index d09730332..2b71d1d18 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.33.4" +version = "0.34.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index a023944e3..835425714 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.33.4" +version = "0.34.0" authors = ["Kerollmops "] edition = "2018" From be302fd25038ab92ed8c8e62ad59f3462962b76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Oct 2022 15:27:06 +0200 Subject: [PATCH 1683/1889] Remove outdated workaround for duplicate words in phrase search --- milli/src/search/criteria/mod.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 4069306b3..3159afb9e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -433,11 +433,6 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result bitmaps.push(m), From 9a569d73d18427b65d009c7314f8234ba90bafc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Oct 2022 15:30:43 +0200 Subject: [PATCH 1684/1889] Minor code style change --- milli/src/update/prefix_word_pairs/prefix_word.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 9bc184825..26fe0105e 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -83,7 +83,7 @@ pub fn index_prefix_word_database( let mut writer = create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - for proximity in 1..=max_proximity - 1 { + for proximity in 1..max_proximity { for prefix in new_prefixes.iter() { let mut prefix_key = vec![]; prefix_key.push(proximity); From 36bd66281dd12b446ee5339bd6a71c3bc98f9715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 25 Oct 2022 14:37:56 +0200 Subject: [PATCH 1685/1889] Add method to create a new Index with specific creation dates --- milli/src/index.rs | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 3bb668b43..94e2f538d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -138,7 +138,12 @@ pub struct Index { } impl Index { - pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { + pub fn new_with_creation_dates>( + mut options: heed::EnvOpenOptions, + path: P, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, + ) -> Result { use db_name::*; options.max_dbs(18); @@ -168,7 +173,7 @@ impl Index { env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; let documents = env.create_database(Some(DOCUMENTS))?; - Index::initialize_creation_dates(&env, main)?; + Index::set_creation_dates(&env, main, 
created_at, updated_at)?; Ok(Index { env, @@ -193,21 +198,30 @@ impl Index { }) } - fn initialize_creation_dates(env: &heed::Env, main: PolyDatabase) -> heed::Result<()> { + pub fn new>(options: heed::EnvOpenOptions, path: P) -> Result { + let now = OffsetDateTime::now_utc(); + Self::new_with_creation_dates(options, path, now.clone(), now) + } + + fn set_creation_dates( + env: &heed::Env, + main: PolyDatabase, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, + ) -> heed::Result<()> { let mut txn = env.write_txn()?; // The db was just created, we update its metadata with the relevant information. if main.get::<_, Str, SerdeJson>(&txn, main_key::CREATED_AT_KEY)?.is_none() { - let now = OffsetDateTime::now_utc(); main.put::<_, Str, SerdeJson>( &mut txn, main_key::UPDATED_AT_KEY, - &now, + &updated_at, )?; main.put::<_, Str, SerdeJson>( &mut txn, main_key::CREATED_AT_KEY, - &now, + &created_at, )?; txn.commit()?; } From 6b2fe94192fb8731c6ba542eda5f3a90cdc86f9f Mon Sep 17 00:00:00 2001 From: Ewan Higgs Date: Mon, 24 Oct 2022 21:34:13 +0200 Subject: [PATCH 1686/1889] Fixes for clippy bringing us down to 18 remaining issues. This brings us a step closer to enforcing clippy on each build. --- milli/src/asc_desc.rs | 2 +- milli/src/documents/builder.rs | 2 +- milli/src/documents/mod.rs | 6 +- milli/src/fields_ids_map.rs | 2 +- .../facet_string_level_zero_value_codec.rs | 2 +- .../facet_string_zero_bounds_value_codec.rs | 4 +- milli/src/index.rs | 6 +- milli/src/search/criteria/asc_desc.rs | 2 +- milli/src/search/criteria/attribute.rs | 17 +++-- milli/src/search/criteria/geo.rs | 2 +- milli/src/search/criteria/initial.rs | 2 +- milli/src/search/criteria/mod.rs | 68 +++++++++---------- milli/src/search/criteria/proximity.rs | 8 +-- milli/src/search/criteria/typo.rs | 6 +- milli/src/search/criteria/words.rs | 5 +- milli/src/search/facet/facet_distribution.rs | 4 +- milli/src/search/facet/filter.rs | 34 +++++----- milli/src/search/matches/matching_words.rs | 2 +- milli/src/search/matches/mod.rs | 10 +-- milli/src/search/query_tree.rs | 15 ++-- milli/src/update/delete_documents.rs | 21 +++--- milli/src/update/facets.rs | 10 +-- milli/src/update/index_documents/enrich.rs | 8 +-- .../extract/extract_docid_word_positions.rs | 2 +- .../extract/extract_fid_docid_facet_values.rs | 6 +- .../src/update/index_documents/extract/mod.rs | 2 +- 26 files changed, 117 insertions(+), 131 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index 88023b3cf..21065da36 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -70,7 +70,7 @@ impl FromStr for Member { type Err = AscDescError; fn from_str(text: &str) -> Result { - match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(")")) { + match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(')')) { Some(point) => { let (lat, lng) = point .split_once(',') diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 9fda31cf0..1fa59168e 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -60,7 +60,7 @@ impl DocumentsBatchBuilder { /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly. pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> { // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
- let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); + let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(k)).collect(); fields_ids.sort_unstable(); self.obkv_buffer.clear(); diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 0bdf6600a..da3a07942 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -25,9 +25,9 @@ const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); pub fn obkv_to_object(obkv: &KvReader, index: &DocumentsBatchIndex) -> Result { obkv.iter() .map(|(field_id, value)| { - let field_name = index.name(field_id).ok_or_else(|| { - FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" } - })?; + let field_name = index + .name(field_id) + .ok_or(FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" })?; let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; Ok((field_name.to_string(), value)) }) diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index b0a084c3c..810ff755b 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -65,7 +65,7 @@ impl FieldsIdsMap { } /// Iterate over the ids in the order of the ids. - pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + pub fn ids(&'_ self) -> impl Iterator + '_ { self.ids_names.keys().copied() } diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs index 22031c474..d1605e6ef 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -34,7 +34,7 @@ where type EItem = (&'a str, C::EItem); fn bytes_encode((string, value): &'a Self::EItem) -> Option> { - let value_bytes = C::bytes_encode(&value)?; + let value_bytes = C::bytes_encode(value)?; let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); encode_prefix_string(string, &mut bytes).ok()?; diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs index 337433c2b..90ba09ae2 100644 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -66,14 +66,14 @@ where bytes.extend_from_slice(left.as_bytes()); bytes.extend_from_slice(right.as_bytes()); - let value_bytes = C::bytes_encode(&value)?; + let value_bytes = C::bytes_encode(value)?; bytes.extend_from_slice(&value_bytes[..]); Some(Cow::Owned(bytes)) } None => { bytes.push(0); - let value_bytes = C::bytes_encode(&value)?; + let value_bytes = C::bytes_encode(value)?; bytes.extend_from_slice(&value_bytes[..]); Some(Cow::Owned(bytes)) } diff --git a/milli/src/index.rs b/milli/src/index.rs index 94e2f538d..0601ae7b7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -320,7 +320,7 @@ impl Index { /// Writes the documents primary key, this is the field name that is used to store the id. pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { self.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key) + self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, primary_key) } /// Deletes the primary key of the documents, this can be done to reset indexes settings. 
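A side note on the `ok_or` changes in this commit: clippy's `unnecessary_lazy_evaluations` lint fires when the error value passed to `ok_or_else` is cheap to construct, in which case the closure buys nothing. A hypothetical example, not milli code:

use std::collections::HashMap;

// The error here is a constant `&'static str`, so building it eagerly with
// `ok_or` costs nothing; `ok_or_else` only pays off when constructing the
// error value is expensive.
fn lookup(map: &HashMap<u32, u32>, id: u32) -> Result<u32, &'static str> {
    map.get(&id).copied().ok_or("unknown id")
}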
@@ -1013,7 +1013,7 @@ impl Index { let kv = self .documents .get(rtxn, &BEU32::new(id))? - .ok_or_else(|| UserError::UnknownInternalDocumentId { document_id: id })?; + .ok_or(UserError::UnknownInternalDocumentId { document_id: id })?; documents.push((id, kv)); } @@ -1072,7 +1072,7 @@ impl Index { wtxn: &mut RwTxn, time: &OffsetDateTime, ) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson>(wtxn, main_key::UPDATED_AT_KEY, &time) + self.main.put::<_, Str, SerdeJson>(wtxn, main_key::UPDATED_AT_KEY, time) } pub fn authorize_typos(&self, txn: &RoTxn) -> heed::Result { diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6d50c1bb5..bf015c5fc 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -115,7 +115,7 @@ impl<'t> Criterion for AscDesc<'t> { let mut candidates = match (&self.query_tree, candidates) { (_, Some(candidates)) => candidates, (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let context = CriteriaBuilder::new(self.rtxn, self.index)?; resolve_query_tree(&context, qt, params.wdcache)? } (None, None) => self.index.documents_ids(self.rtxn)?, diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index d8feeeee9..7e55a1038 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -89,7 +89,7 @@ impl<'t> Criterion for Attribute<'t> { } } } else { - let mut set_buckets = match self.set_buckets.as_mut() { + let set_buckets = match self.set_buckets.as_mut() { Some(set_buckets) => set_buckets, None => { let new_buckets = initialize_set_buckets( @@ -102,7 +102,7 @@ impl<'t> Criterion for Attribute<'t> { } }; - match set_compute_candidates(&mut set_buckets, &allowed_candidates)? { + match set_compute_candidates(set_buckets, &allowed_candidates)? { Some((_score, candidates)) => candidates, None => { return Ok(Some(CriterionResult { @@ -199,18 +199,18 @@ impl<'t> QueryPositionIterator<'t> { let iter = ctx.word_position_iterator(word, in_prefix_cache)?; inner.push(iter.peekable()); } else { - for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? + for (word, _) in word_derivations(word, true, 0, ctx.words_fst(), wdcache)? { - let iter = ctx.word_position_iterator(&word, in_prefix_cache)?; + let iter = ctx.word_position_iterator(word, in_prefix_cache)?; inner.push(iter.peekable()); } } } QueryKind::Tolerant { typo, word } => { for (word, _) in - word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? + word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)? 
{ - let iter = ctx.word_position_iterator(&word, in_prefix_cache)?; + let iter = ctx.word_position_iterator(word, in_prefix_cache)?; inner.push(iter.peekable()); } } @@ -476,8 +476,7 @@ fn initialize_linear_buckets( } else { words_positions .get(word) - .map(|positions| positions.iter().next()) - .flatten() + .and_then(|positions| positions.iter().next()) } } QueryKind::Tolerant { typo, word } => { @@ -574,7 +573,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { if ops.iter().all(|op| op.query().is_some()) { vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] } else { - ops.iter().map(recurse).flatten().collect() + ops.iter().flat_map(recurse).collect() } } Phrase(words) => { diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index e3bda51de..1b08cfac8 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -90,7 +90,7 @@ impl Criterion for Geo<'_> { let mut candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, (Some(qt), None) => { - let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let context = CriteriaBuilder::new(self.rtxn, self.index)?; resolve_query_tree(&context, qt, params.wdcache)? } (None, None) => self.index.documents_ids(self.rtxn)?, diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index ac61adfe2..85daa813b 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -44,7 +44,7 @@ impl Criterion for Initial<'_, D> { let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), - &mut params.wdcache, + params.wdcache, )?; // Apply the filters on the documents retrieved with the query tree. diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 3159afb9e..7d59bb3c0 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -186,19 +186,19 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { } fn word_docids(&self, word: &str) -> heed::Result> { - self.index.word_docids.get(self.rtxn, &word) + self.index.word_docids.get(self.rtxn, word) } fn exact_word_docids(&self, word: &str) -> heed::Result> { - self.index.exact_word_docids.get(self.rtxn, &word) + self.index.exact_word_docids.get(self.rtxn, word) } fn word_prefix_docids(&self, word: &str) -> heed::Result> { - self.index.word_prefix_docids.get(self.rtxn, &word) + self.index.word_prefix_docids.get(self.rtxn, word) } fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { - self.index.exact_word_prefix_docids.get(self.rtxn, &word) + self.index.exact_word_prefix_docids.get(self.rtxn, word) } fn word_pair_proximity_docids( @@ -321,7 +321,7 @@ impl<'t> CriteriaBuilder<'t> { exhaustive_number_hits, distinct, )) as Box; - for name in self.index.criteria(&self.rtxn)? { + for name in self.index.criteria(self.rtxn)? 
{ criterion = match name { Name::Words => Box::new(Words::new(self, criterion)), Name::Typo => Box::new(Typo::new(self, criterion)), @@ -330,29 +330,23 @@ impl<'t> CriteriaBuilder<'t> { for asc_desc in sort_criteria { criterion = match asc_desc { AscDescName::Asc(Member::Field(field)) => Box::new(AscDesc::asc( - &self.index, - &self.rtxn, + self.index, + self.rtxn, criterion, field.to_string(), )?), AscDescName::Desc(Member::Field(field)) => Box::new(AscDesc::desc( - &self.index, - &self.rtxn, + self.index, + self.rtxn, criterion, field.to_string(), )?), - AscDescName::Asc(Member::Geo(point)) => Box::new(Geo::asc( - &self.index, - &self.rtxn, - criterion, - point.clone(), - )?), - AscDescName::Desc(Member::Geo(point)) => Box::new(Geo::desc( - &self.index, - &self.rtxn, - criterion, - point.clone(), - )?), + AscDescName::Asc(Member::Geo(point)) => { + Box::new(Geo::asc(self.index, self.rtxn, criterion, *point)?) + } + AscDescName::Desc(Member::Geo(point)) => { + Box::new(Geo::desc(self.index, self.rtxn, criterion, *point)?) + } }; } criterion @@ -363,10 +357,10 @@ impl<'t> CriteriaBuilder<'t> { Name::Attribute => Box::new(Attribute::new(self, criterion)), Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => { - Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?) + Box::new(AscDesc::asc(self.index, self.rtxn, criterion, field)?) } Name::Desc(field) => { - Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?) + Box::new(AscDesc::desc(self.index, self.rtxn, criterion, field)?) } }; } @@ -408,7 +402,7 @@ pub fn resolve_query_tree( } Ok(candidates) } - Phrase(words) => resolve_phrase(ctx, &words), + Phrase(words) => resolve_phrase(ctx, words), Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { @@ -457,7 +451,7 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result Result { match &query.kind { QueryKind::Exact { word, original_typo } => { - if query.prefix && ctx.in_prefix_cache(&word) { - let mut docids = ctx.word_prefix_docids(&word)?.unwrap_or_default(); + if query.prefix && ctx.in_prefix_cache(word) { + let mut docids = ctx.word_prefix_docids(word)?.unwrap_or_default(); // only add the exact docids if the word hasn't been derived if *original_typo == 0 { - docids |= ctx.exact_word_prefix_docids(&word)?.unwrap_or_default(); + docids |= ctx.exact_word_prefix_docids(word)?.unwrap_or_default(); } Ok(docids) } else if query.prefix { - let words = word_derivations(&word, true, 0, ctx.words_fst(), wdcache)?; + let words = word_derivations(word, true, 0, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, _typo) in words { - docids |= ctx.word_docids(&word)?.unwrap_or_default(); + docids |= ctx.word_docids(word)?.unwrap_or_default(); // only add the exact docids if the word hasn't been derived if *original_typo == 0 { - docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); + docids |= ctx.exact_word_docids(word)?.unwrap_or_default(); } } Ok(docids) } else { - let mut docids = ctx.word_docids(&word)?.unwrap_or_default(); + let mut docids = ctx.word_docids(word)?.unwrap_or_default(); // only add the exact docids if the word hasn't been derived if *original_typo == 0 { - docids |= ctx.exact_word_docids(&word)?.unwrap_or_default(); + docids |= ctx.exact_word_docids(word)?.unwrap_or_default(); } Ok(docids) } } QueryKind::Tolerant { typo, word } => { - let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)?; + let words = 
word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)?; let mut docids = RoaringBitmap::new(); for (word, typo) in words { - let mut current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + let mut current_docids = ctx.word_docids(word)?.unwrap_or_default(); if *typo == 0 { - current_docids |= ctx.exact_word_docids(&word)?.unwrap_or_default() + current_docids |= ctx.exact_word_docids(word)?.unwrap_or_default() } docids |= current_docids; } @@ -585,7 +579,7 @@ fn query_pair_proximity_docids( } (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { let l_words = - word_derivations(&left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); + word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index e942a7bef..b7c10a2e0 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -99,7 +99,7 @@ impl<'t> Criterion for Proximity<'t> { // use set theory based algorithm resolve_candidates( self.ctx, - &query_tree, + query_tree, self.proximity, &mut self.candidates_cache, params.wdcache, @@ -194,7 +194,7 @@ fn resolve_candidates<'t>( .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); match (most_left, most_right) { - (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, &words)?)], + (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, words)?)], _otherwise => Default::default(), } } else { @@ -496,7 +496,7 @@ fn resolve_plane_sweep_candidates( match kind { QueryKind::Exact { word, .. } => { if *prefix { - let iter = word_derivations(word, true, 0, &words_positions) + let iter = word_derivations(word, true, 0, words_positions) .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } else if let Some(positions) = words_positions.get(word) { @@ -504,7 +504,7 @@ fn resolve_plane_sweep_candidates( } } QueryKind::Tolerant { typo, word } => { - let iter = word_derivations(word, *prefix, *typo, &words_positions) + let iter = word_derivations(word, *prefix, *typo, words_positions) .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 605089fae..76bd04d20 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -69,7 +69,7 @@ impl<'t> Criterion for Typo<'t> { let fst = self.ctx.words_fst(); let new_query_tree = match self.typos { typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree( - &fst, + fst, query_tree.clone(), self.typos, params.wdcache, @@ -78,7 +78,7 @@ impl<'t> Criterion for Typo<'t> { // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, // we keep the altered query tree *query_tree = alterate_query_tree( - &fst, + fst, query_tree.clone(), self.typos, params.wdcache, @@ -199,7 +199,7 @@ fn alterate_query_tree( ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) } // Because Phrases don't allow typos, no alteration can be done. 
- Phrase(_words) => return Ok(()), + Phrase(_words) => Ok(()), Operation::Query(q) => { if let QueryKind::Tolerant { typo, word } = &q.kind { // if no typo is allowed we don't call word_derivations function, diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index ccc6c0617..b67b7f6b4 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -53,10 +53,7 @@ impl<'t> Criterion for Words<'t> { None => None, }; - let bucket_candidates = match self.bucket_candidates.as_mut() { - Some(bucket_candidates) => Some(take(bucket_candidates)), - None => None, - }; + let bucket_candidates = self.bucket_candidates.as_mut().map(take); return Ok(Some(CriterionResult { query_tree: Some(query_tree), diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index b2718a490..47e4088fe 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -66,7 +66,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { match facet_type { FacetType::Number => { - let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; @@ -91,7 +91,7 @@ impl<'a> FacetDistribution<'a> { } FacetType::String => { let mut normalized_distribution = BTreeMap::new(); - let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); let db = self.index.field_id_docid_facet_strings; for docid in candidates.into_iter() { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7241dab2b..1d8fcd389 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -96,7 +96,7 @@ impl<'a> Filter<'a> { Either::Left(array) => { let mut ors = vec![]; for rule in array { - if let Some(filter) = Self::from_str(rule.as_ref())? { + if let Some(filter) = Self::from_str(rule)? { ors.push(filter.condition); } } @@ -108,7 +108,7 @@ impl<'a> Filter<'a> { } } Either::Right(rule) => { - if let Some(filter) = Self::from_str(rule.as_ref())? { + if let Some(filter) = Self::from_str(rule)? { ands.push(filter.condition); } } @@ -358,7 +358,7 @@ impl<'a> Filter<'a> { index, filterable_fields, )?; - return Ok(all_ids - selected); + Ok(all_ids - selected) } FilterCondition::In { fid, els } => { if crate::is_faceted(fid.value(), filterable_fields) { @@ -377,38 +377,36 @@ impl<'a> Filter<'a> { Ok(RoaringBitmap::new()) } } else { - return Err(fid.as_external_error(FilterError::AttributeNotFilterable { + Err(fid.as_external_error(FilterError::AttributeNotFilterable { attribute: fid.value(), filterable_fields: filterable_fields.clone(), - }))?; + }))? } } FilterCondition::Condition { fid, op } => { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, &op) + Self::evaluate_operator(rtxn, index, fid, op) } else { - return Ok(RoaringBitmap::new()); + Ok(RoaringBitmap::new()) } } else { match fid.lexeme() { attribute @ "_geo" => { - return Err(fid.as_external_error(FilterError::BadGeo(attribute)))?; + Err(fid.as_external_error(FilterError::BadGeo(attribute)))? 
} attribute if attribute.starts_with("_geoPoint(") => { - return Err(fid.as_external_error(FilterError::BadGeo("_geoPoint")))?; + Err(fid.as_external_error(FilterError::BadGeo("_geoPoint")))? } attribute @ "_geoDistance" => { - return Err(fid.as_external_error(FilterError::Reserved(attribute)))?; + Err(fid.as_external_error(FilterError::Reserved(attribute)))? } attribute => { - return Err(fid.as_external_error( - FilterError::AttributeNotFilterable { - attribute, - filterable_fields: filterable_fields.clone(), - }, - ))?; + Err(fid.as_external_error(FilterError::AttributeNotFilterable { + attribute, + filterable_fields: filterable_fields.clone(), + }))? } } } @@ -477,10 +475,10 @@ impl<'a> Filter<'a> { Ok(result) } else { - return Err(point[0].as_external_error(FilterError::AttributeNotFilterable { + Err(point[0].as_external_error(FilterError::AttributeNotFilterable { attribute: "_geo", filterable_fields: filterable_fields.clone(), - }))?; + }))? } } } diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 71fbfd794..1f6ead8a9 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -44,7 +44,7 @@ impl<'a> Iterator for MatchesIter<'a, '_> { fn next(&mut self) -> Option<Self::Item> { match self.inner.next() { - Some((matching_words, ids)) => match matching_words[0].match_token(&self.token) { + Some((matching_words, ids)) => match matching_words[0].match_token(self.token) { Some(char_len) => { if matching_words.len() > 1 { Some(MatchType::Partial(PartialMatch { diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 53101a065..b76ddef99 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -49,16 +49,16 @@ impl<'a, A> MatcherBuilder<'a, A> { pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { let crop_marker = match &self.crop_marker { Some(marker) => marker.as_str(), - None => &DEFAULT_CROP_MARKER, + None => DEFAULT_CROP_MARKER, }; let highlight_prefix = match &self.highlight_prefix { Some(marker) => marker.as_str(), - None => &DEFAULT_HIGHLIGHT_PREFIX, + None => DEFAULT_HIGHLIGHT_PREFIX, }; let highlight_suffix = match &self.highlight_suffix { Some(marker) => marker.as_str(), - None => &DEFAULT_HIGHLIGHT_SUFFIX, + None => DEFAULT_HIGHLIGHT_SUFFIX, }; Matcher { text, @@ -95,7 +95,7 @@ pub struct Match { token_position: usize, } -#[derive(Serialize, Debug, Clone, PartialEq)] +#[derive(Serialize, Debug, Clone, PartialEq, Eq)] pub struct MatchBounds { pub start: usize, pub length: usize, @@ -131,7 +131,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { potential_matches.push((token_position, word_position, partial.char_len())); for (token_position, word_position, word) in words_positions { - partial = match partial.match_token(&word) { + partial = match partial.match_token(word) { // token matches the partial match, but the match is not full, // we temporarily save the current token then we try to match the next one.
Some(MatchType::Partial(partial)) => { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 034b9123b..6e908f25d 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -188,8 +188,8 @@ impl<'a> Context for QueryTreeBuilder<'a> { } fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { - let one = self.index.min_word_len_one_typo(&self.rtxn)?; - let two = self.index.min_word_len_two_typos(&self.rtxn)?; + let one = self.index.min_word_len_one_typo(self.rtxn)?; + let two = self.index.min_word_len_two_typos(self.rtxn)?; Ok((one, two)) } @@ -207,7 +207,7 @@ impl<'a> Context for QueryTreeBuilder<'a> { self.index .word_pair_proximity_docids .remap_data_type::() - .get(&self.rtxn, &key) + .get(self.rtxn, &key) } } @@ -313,7 +313,7 @@ pub struct TypoConfig<'a> { /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. -fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind { +fn typos(word: String, authorize_typos: bool, config: TypoConfig<'_>) -> QueryKind { if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { @@ -556,7 +556,7 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| MatchingWord::new(syn.to_string(), 0, false)) + .map(|syn| MatchingWord::new(syn, 0, false)) .collect(); matching_words.push((synonym, vec![id])); } @@ -583,8 +583,7 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = - words.into_iter().map(|w| MatchingWord::new(w.to_string(), 0, false)).collect(); + let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect(); matching_words.push((words, ids)); } } @@ -639,7 +638,7 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| MatchingWord::new(syn.to_string(), 0, false)) + .map(|syn| MatchingWord::new(syn, 0, false)) .collect(); matching_words.push((synonym, ids.clone())); } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 54328b50d..26340b9dd 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -127,7 +127,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // the `soft_deleted_documents_ids` bitmap and early exit. let size_used = self.index.used_size()?; let map_size = self.index.env.map_size()? as u64; - let nb_documents = self.index.number_of_documents(&self.wtxn)?; + let nb_documents = self.index.number_of_documents(self.wtxn)?; let nb_soft_deleted = soft_deleted_docids.len(); let percentage_available = 100 - (size_used * 100 / map_size); @@ -158,12 +158,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // and we can reset the soft deleted bitmap self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; - let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| { - InternalError::DatabaseMissingEntry { + let primary_key = + self.index.primary_key(self.wtxn)?.ok_or(InternalError::DatabaseMissingEntry { db_name: db_name::MAIN, key: Some(main_key::PRIMARY_KEY_KEY), - } - })?; + })?; // Since we already checked if the DB was empty, if we can't find the primary key, then // something is wrong, and we must return an error. 
@@ -433,7 +432,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { .map(|point| (point, point.data.0)) .unzip(); points_to_remove.iter().for_each(|point| { - rtree.remove(&point); + rtree.remove(point); }); geo_faceted_doc_ids -= docids_to_remove; @@ -534,7 +533,7 @@ fn remove_from_word_docids( // We create an iterator to be able to get the content and delete the word docids. // It's faster to acquire a cursor to get and delete or put, as we avoid traversing // the LMDB B-Tree two times but only once. - let mut iter = db.prefix_iter_mut(txn, &word)?; + let mut iter = db.prefix_iter_mut(txn, word)?; if let Some((key, mut docids)) = iter.next().transpose()? { if key == word { let previous_len = docids.len(); @@ -597,7 +596,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( // level key. We must then parse the value using the appropriate codec. let (group, mut docids) = FacetStringZeroBoundsValueCodec::::bytes_decode(val) - .ok_or_else(|| SerializationError::Decoding { db_name })?; + .ok_or(SerializationError::Decoding { db_name })?; let previous_len = docids.len(); docids -= to_remove; @@ -609,7 +608,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( let val = &(group, docids); let value_bytes = FacetStringZeroBoundsValueCodec::::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + .ok_or(SerializationError::Encoding { db_name })?; // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(&key, &value_bytes)? }; @@ -619,7 +618,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( // The key corresponds to a level zero facet string. let (original_value, mut docids) = FacetStringLevelZeroValueCodec::bytes_decode(val) - .ok_or_else(|| SerializationError::Decoding { db_name })?; + .ok_or(SerializationError::Decoding { db_name })?; let previous_len = docids.len(); docids -= to_remove; @@ -630,7 +629,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( let key = key.to_owned(); let val = &(original_value, docids); let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + .ok_or(SerializationError::Encoding { db_name })?; // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(&key, &value_bytes)? }; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 108acae4f..0d7dcbc50 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -262,8 +262,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { /// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` /// that must be inserted into the database. /// 2. 
a roaring bitmap of all the document ids present in the database -fn compute_facet_number_levels<'t>( - rtxn: &'t heed::RoTxn, +fn compute_facet_number_levels( + rtxn: &'_ heed::RoTxn, db: heed::Database, compression_type: CompressionType, compression_level: Option, @@ -496,7 +496,7 @@ where bitmaps.clear(); } // level 0 is already stored in the DB - return Ok(vec![]); + Ok(vec![]) } else { // level >= 1 // we compute each element of this level based on the elements of the level below it @@ -562,7 +562,7 @@ where } sub_writers.push(writer_into_reader(cur_writer)?); - return Ok(sub_writers); + Ok(sub_writers) } } @@ -598,7 +598,7 @@ fn write_number_entry( ) -> Result<()> { let key = (field_id, level, left, right); let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(ids).ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 15fbe9319..7eda5dca4 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -140,7 +140,7 @@ fn fetch_or_generate_document_id( } None => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), - document: obkv_to_object(&document, &documents_batch_index)?, + document: obkv_to_object(document, documents_batch_index)?, })), } } @@ -156,7 +156,7 @@ fn fetch_or_generate_document_id( if matching_documents_ids.len() >= 2 { return Ok(Err(UserError::TooManyDocumentIds { primary_key: nested.name().to_string(), - document: obkv_to_object(&document, &documents_batch_index)?, + document: obkv_to_object(document, documents_batch_index)?, })); } } @@ -170,7 +170,7 @@ fn fetch_or_generate_document_id( }, None => Ok(Err(UserError::MissingDocumentId { primary_key: nested.name().to_string(), - document: obkv_to_object(&document, &documents_batch_index)?, + document: obkv_to_object(document, documents_batch_index)?, })), } } @@ -313,7 +313,7 @@ pub fn validate_document_id_value(document_id: Value) -> Result Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), }, Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), - content => Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), + content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), } } diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index e067623e2..f1d595039 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -132,7 +132,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st } if let Value::String(string) = value { - Some(&string) + Some(string) } else if inner(value, buffer) { Some(buffer) } else { diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index f9d1443d5..44afcde6c 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -67,7 +67,7 @@ pub fn extract_fid_docid_facet_values( 
facet_exists_docids.entry(field_id).or_default().insert(document); // For the other extraction tasks, prefix the key with the field_id and the document_id - key_buffer.extend_from_slice(&docid_bytes); + key_buffer.extend_from_slice(docid_bytes); let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; @@ -107,8 +107,8 @@ pub fn extract_fid_docid_facet_values( let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; Ok(( - sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, - sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, + sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, + sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, facet_exists_docids_reader, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 50cc04610..8e0e61175 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -150,7 +150,7 @@ pub(crate) fn data_from_obkv_documents( spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_numbers_chunks, indexer, - lmdb_writer_sx.clone(), + lmdb_writer_sx, extract_facet_number_docids, merge_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetNumberDocids, From 17f7922bfc9d6b68527c0fc148dc09966c6b55bd Mon Sep 17 00:00:00 2001 From: Ewan Higgs Date: Tue, 25 Oct 2022 12:42:38 +0200 Subject: [PATCH 1687/1889] Remove unneeded lifetimes. --- milli/src/search/query_tree.rs | 2 +- milli/src/update/facets.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 6e908f25d..9b4b38f76 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -313,7 +313,7 @@ pub struct TypoConfig<'a> { /// Return the `QueryKind` of a word depending on `authorize_typos` /// and the provided word length. -fn typos(word: String, authorize_typos: bool, config: TypoConfig<'_>) -> QueryKind { +fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) { let count = word.chars().count().min(u8::MAX as usize) as u8; if count < config.word_len_one_typo { diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 0d7dcbc50..ae2a6d7fd 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -263,7 +263,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { /// that must be inserted into the database. /// 2. a roaring bitmap of all the document ids present in the database fn compute_facet_number_levels( - rtxn: &'_ heed::RoTxn, + rtxn: &heed::RoTxn, db: heed::Database, compression_type: CompressionType, compression_level: Option, From 2ce025a9069243a74a0a704aafa803aa875e1ad8 Mon Sep 17 00:00:00 2001 From: Ewan Higgs Date: Tue, 25 Oct 2022 20:58:31 +0200 Subject: [PATCH 1688/1889] Fixes after rebase to fix new issues. 
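Most of the new issues are `clippy::needless_borrow` (passing `&word` where the callee already takes a `&str`) and `clippy::map_flatten` (`.map(..).flatten()` where `.flat_map(..)` does the same in one step). A minimal sketch of both patterns, using hypothetical names rather than code from this crate:

    // Hypothetical illustration of the lints fixed below, not milli code.
    // Both lines deliberately trigger the clippy warnings in question.
    fn first_char(word: &str) -> Option<char> {
        word.chars().next()
    }

    fn demo(word: &String, prefixes: Vec<Vec<String>>) {
        // clippy::needless_borrow: `word` is already a reference, so the
        // extra `&` creates a `&&String` that the compiler immediately
        // dereferences again.
        let _ = first_char(&word); // clippy suggests `first_char(word)`

        // clippy::map_flatten: mapping each group to an iterator and then
        // flattening is exactly what `flat_map` expresses directly.
        let _: Vec<&String> = prefixes
            .iter()
            .map(|group| group.iter())
            .flatten() // clippy suggests `.flat_map(|group| group.iter())`
            .collect();
    }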
--- milli/src/index.rs | 2 +- milli/src/search/criteria/mod.rs | 18 +++++++++--------- milli/src/update/facets.rs | 4 ++-- .../update/prefix_word_pairs/prefix_word.rs | 7 +++---- .../update/prefix_word_pairs/word_prefix.rs | 16 ++++++---------- 5 files changed, 21 insertions(+), 26 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 0601ae7b7..4144728f1 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -200,7 +200,7 @@ impl Index { pub fn new>(options: heed::EnvOpenOptions, path: P) -> Result { let now = OffsetDateTime::now_utc(); - Self::new_with_creation_dates(options, path, now.clone(), now) + Self::new_with_creation_dates(options, path, now, now) } fn set_creation_dates( diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 7d59bb3c0..1b46c8441 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -562,11 +562,11 @@ fn query_pair_proximity_docids( )? { Some(docids) => Ok(docids), None => { - let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; + let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_overall_proximity_docids( ctx, &[(left, 0)], - &r_words, + r_words, proximity, ) } @@ -592,11 +592,11 @@ fn query_pair_proximity_docids( Some(docids) => Ok(docids), None => { let r_words = - word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; + word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_overall_proximity_docids( ctx, &[(left, 0)], - &r_words, + r_words, proximity, ) } @@ -609,17 +609,17 @@ fn query_pair_proximity_docids( } } (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + let r_words = word_derivations(right, prefix, *typo, ctx.words_fst(), wdcache)?; + all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], r_words, proximity) } ( QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }, ) => { let l_words = - word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); - let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids(ctx, &l_words, &r_words, proximity) + word_derivations(left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); + let r_words = word_derivations(right, prefix, *r_typo, ctx.words_fst(), wdcache)?; + all_word_pair_overall_proximity_docids(ctx, &l_words, r_words, proximity) } } } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index ae2a6d7fd..50b34a714 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -332,8 +332,8 @@ fn compute_facet_number_levels( /// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` /// that must be inserted into the database. /// 2. 
a roaring bitmap of all the document ids present in the database -fn compute_facet_strings_levels<'t>( - rtxn: &'t heed::RoTxn, +fn compute_facet_strings_levels( + rtxn: &heed::RoTxn, db: heed::Database, compression_type: CompressionType, compression_level: Option, diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 26fe0105e..952e02558 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -30,9 +30,8 @@ pub fn index_prefix_word_database( debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); let common_prefixes: Vec<_> = common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() + .iter() + .flat_map(|s| s.iter()) .map(|s| s.as_str()) .filter(|s| s.len() <= max_prefix_length) .collect(); @@ -73,7 +72,7 @@ pub fn index_prefix_word_database( // Now we do the same thing with the new prefixes and all word pairs in the DB let new_prefixes: Vec<_> = new_prefix_fst_words - .into_iter() + .iter() .map(|s| s.as_str()) .filter(|s| s.len() <= max_prefix_length) .collect(); diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 5895cdc46..53e421fac 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -195,9 +195,8 @@ pub fn index_word_prefix_database( // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length let prefixes = PrefixTrieNode::from_sorted_prefixes( common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() + .iter() + .flat_map(|s| s.iter()) .map(|s| s.as_str()) .filter(|s| s.len() <= max_prefix_length), ); @@ -237,10 +236,7 @@ pub fn index_word_prefix_database( // Now we do the same thing with the new prefixes and all word pairs in the DB let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words - .into_iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length), + new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length), ); if !prefixes.is_empty() { @@ -366,7 +362,7 @@ fn execute_on_word_pairs_and_prefixes( &mut prefix_buffer, &prefix_search_start, |prefix_buffer| { - batch.insert(&prefix_buffer, data.to_vec()); + batch.insert(prefix_buffer, data.to_vec()); }, ); } @@ -484,7 +480,7 @@ impl PrefixTrieNode { fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { let byte = word[0]; if self.children[search_start.0].1 == byte { - return true; + true } else { match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { Ok(position) => { @@ -502,7 +498,7 @@ impl PrefixTrieNode { fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { let mut node = PrefixTrieNode::default(); for prefix in prefixes { - node.insert_sorted_prefix(prefix.as_bytes().into_iter()); + node.insert_sorted_prefix(prefix.as_bytes().iter()); } node } From 42cdc38c7bcf3fff7127775ebd5ce3343ecce41d Mon Sep 17 00:00:00 2001 From: Ewan Higgs Date: Tue, 25 Oct 2022 21:12:59 +0200 Subject: [PATCH 1689/1889] Allow weird ranges like 1..=0 to pass clippy. Everything else is just a warning and exit code will be 0. 
--- milli/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ffbe8f38f..05e80f9ba 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,3 +1,4 @@ +#![allow(clippy::reversed_empty_ranges)] #[macro_use] pub mod documents; From 9d27ac8a2e222511465202e7f1764828cc3a4a91 Mon Sep 17 00:00:00 2001 From: Ewan Higgs Date: Tue, 25 Oct 2022 21:22:53 +0200 Subject: [PATCH 1690/1889] Ignore too many arguments to functions. --- milli/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 05e80f9ba..6fb83922a 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,4 +1,5 @@ #![allow(clippy::reversed_empty_ranges)] +#![allow(clippy::too_many_arguments)] #[macro_use] pub mod documents; From e883bccc7684800387bcec4c9651759d03746a0e Mon Sep 17 00:00:00 2001 From: curquiza Date: Wed, 26 Oct 2022 11:43:54 +0000 Subject: [PATCH 1691/1889] Update version for the next release (v0.35.0) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index ee6bae7c0..b5fee6640 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.34.0" +version = "0.35.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 97931a371..30fab7851 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.34.0" +version = "0.35.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 1245b097b..b22fdaad5 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.34.0" +version = "0.35.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 3953ad0f6..aa0787eed 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.34.0" +version = "0.35.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 2b71d1d18..db6132fe8 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.34.0" +version = "0.35.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 835425714..f19d3781e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.34.0" +version = "0.35.0" authors = ["Kerollmops "] edition = "2018" From c3f49f766d0a3a181e3c515438d413620e7b36b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 29 Aug 2022 16:01:54 +0200 Subject: [PATCH 1692/1889] Prepare refactor of facets database Prepare refactor of facets database --- infos/src/main.rs | 1 + .../facet/facet_level_value_f64_codec.rs | 89 -- .../facet/facet_level_value_u32_codec.rs | 53 - 
.../facet/facet_string_level_zero_codec.rs | 50 - .../facet_string_level_zero_value_codec.rs | 90 -- milli/src/heed_codec/facet/mod.rs | 22 +- milli/src/heed_codec/facet/new/mod.rs | 148 ++ .../heed_codec/facet/new/ordered_f64_codec.rs | 36 + milli/src/heed_codec/facet/new/str_ref.rs | 20 + milli/src/index.rs | 17 +- milli/src/search/criteria/asc_desc.rs | 33 +- milli/src/search/distinct/facet_distinct.rs | 10 +- milli/src/search/facet/facet_distribution.rs | 120 +- milli/src/search/facet/facet_number.rs | 539 +++++--- milli/src/search/facet/facet_string.rs | 1217 ++++++++--------- milli/src/search/facet/filter.rs | 268 ++-- milli/src/search/facet/mod.rs | 4 +- milli/src/search/mod.rs | 2 +- milli/src/snapshot_tests.rs | 81 +- milli/src/update/delete_documents.rs | 108 +- milli/src/update/facets.rs | 261 ++-- .../extract/extract_facet_number_docids.rs | 13 +- .../extract/extract_facet_string_docids.rs | 24 +- .../src/update/index_documents/extract/mod.rs | 6 +- .../helpers/merge_functions.rs | 52 +- .../src/update/index_documents/helpers/mod.rs | 6 +- .../src/update/index_documents/typed_chunk.rs | 16 +- 27 files changed, 1662 insertions(+), 1624 deletions(-) create mode 100644 infos/src/main.rs delete mode 100644 milli/src/heed_codec/facet/facet_level_value_f64_codec.rs delete mode 100644 milli/src/heed_codec/facet/facet_level_value_u32_codec.rs delete mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_codec.rs delete mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs create mode 100644 milli/src/heed_codec/facet/new/mod.rs create mode 100644 milli/src/heed_codec/facet/new/ordered_f64_codec.rs create mode 100644 milli/src/heed_codec/facet/new/str_ref.rs diff --git a/infos/src/main.rs b/infos/src/main.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/infos/src/main.rs @@ -0,0 +1 @@ + diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs deleted file mode 100644 index 1e66427ca..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ /dev/null @@ -1,89 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; -use crate::{try_split_array_at, FieldId}; - -// TODO do not de/serialize right bound when level = 0 -pub struct FacetLevelValueF64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { - type DItem = (FieldId, u8, f64, f64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - - let (left, right) = if *level != 0 { - let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; - let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; - (left, right) - } else { - let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; - (left, left) - }; - - Some((field_id, *level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { - type EItem = (FieldId, u8, f64, f64); - - fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { - let mut buffer = [0u8; 32]; - - let len = if *level != 0 { - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - - let bytes = f64_into_bytes(*right)?; - buffer[8..16].copy_from_slice(&bytes[..]); - - // Then the f64 values just to be able to read them back. 
- let bytes = left.to_be_bytes(); - buffer[16..24].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[24..].copy_from_slice(&bytes[..]); - - 32 // length - } else { - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - - // Then the f64 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[8..16].copy_from_slice(&bytes[..]); - - 16 // length - }; - - let mut bytes = Vec::with_capacity(len + 3); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.push(*level); - bytes.extend_from_slice(&buffer[..len]); - Some(Cow::Owned(bytes)) - } -} - -#[cfg(test)] -mod tests { - use heed::{BytesDecode, BytesEncode}; - - use super::*; - - #[test] - fn globally_ordered_f64() { - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap(); - let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0)); - - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap(); - let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0)); - } -} diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs deleted file mode 100644 index 597335b6e..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::num::NonZeroU8; - -use crate::{try_split_array_at, FieldId}; - -/// A codec that stores the field id, level 1 and higher and the groups ids. -/// -/// It can only be used to encode the facet string of the level 1 or higher. -pub struct FacetLevelValueU32Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { - type DItem = (FieldId, NonZeroU8, u32, u32); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - let level = NonZeroU8::new(*level)?; - let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?; - let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?; - Some((field_id, level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { - type EItem = (FieldId, NonZeroU8, u32, u32); - - fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { - let mut buffer = [0u8; 16]; - - // Write the big-endian integers. - let bytes = left.to_be_bytes(); - buffer[..4].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[4..8].copy_from_slice(&bytes[..]); - - // Then the u32 values just to be able to read them back. 
- let bytes = left.to_be_bytes(); - buffer[8..12].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[12..].copy_from_slice(&bytes[..]); - - let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.push(level.get()); - bytes.extend_from_slice(&buffer); - - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs deleted file mode 100644 index 009c6454a..000000000 --- a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, FieldId}; - -/// A codec that stores the field id, level 0, and facet string. -/// -/// It can only be used to encode the facet string of the level 0, -/// as it hardcodes the level. -/// -/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys, -/// and make sure that the levels are not mixed-up. The level 0 is special, the key -/// are strings, other levels represent groups and keys are simply two integers. -pub struct FacetStringLevelZeroCodec; - -impl FacetStringLevelZeroCodec { - pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 2); - out.extend_from_slice(&field_id.to_be_bytes()); - out.push(0); // the level zero (for LMDB ordering only) - out.extend_from_slice(value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { - type DItem = (FieldId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - - if *level != 0 { - return None; - } - - let value = str::from_utf8(bytes).ok()?; - Some((field_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec { - type EItem = (FieldId, &'a str); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs deleted file mode 100644 index 22031c474..000000000 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ /dev/null @@ -1,90 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::{marker, str}; - -use crate::error::SerializationError; -use crate::heed_codec::RoaringBitmapCodec; -use crate::{try_split_array_at, try_split_at, Result}; - -pub type FacetStringLevelZeroValueCodec = StringValueCodec; - -/// A codec that encodes a string in front of a value. -/// -/// The usecase is for the facet string levels algorithm where we must know the -/// original string of a normalized facet value, the original values are stored -/// in the value to not break the lexicographical ordering of the LMDB keys. 
-pub struct StringValueCodec(marker::PhantomData); - -impl<'a, C> heed::BytesDecode<'a> for StringValueCodec -where - C: heed::BytesDecode<'a>, -{ - type DItem = (&'a str, C::DItem); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (string, bytes) = decode_prefix_string(bytes)?; - C::bytes_decode(bytes).map(|item| (string, item)) - } -} - -impl<'a, C> heed::BytesEncode<'a> for StringValueCodec -where - C: heed::BytesEncode<'a>, -{ - type EItem = (&'a str, C::EItem); - - fn bytes_encode((string, value): &'a Self::EItem) -> Option> { - let value_bytes = C::bytes_encode(&value)?; - - let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); - encode_prefix_string(string, &mut bytes).ok()?; - bytes.extend_from_slice(&value_bytes[..]); - - Some(Cow::Owned(bytes)) - } -} - -pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { - let (original_length_bytes, bytes) = try_split_array_at(value)?; - let original_length = u16::from_be_bytes(original_length_bytes) as usize; - let (string, bytes) = try_split_at(bytes, original_length)?; - let string = str::from_utf8(string).ok()?; - Some((string, bytes)) -} - -pub fn encode_prefix_string(string: &str, buffer: &mut Vec) -> Result<()> { - let string_len: u16 = - string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?; - buffer.extend_from_slice(&string_len.to_be_bytes()); - buffer.extend_from_slice(string.as_bytes()); - Ok(()) -} - -#[cfg(test)] -mod tests { - use heed::types::Unit; - use heed::{BytesDecode, BytesEncode}; - use roaring::RoaringBitmap; - - use super::*; - - #[test] - fn deserialize_roaring_bitmaps() { - let string = "abc"; - let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); - let key = (string, docids.clone()); - let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_docids) = - StringValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_string, out_docids), (string, docids)); - } - - #[test] - fn deserialize_unit() { - let string = "def"; - let key = (string, ()); - let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_unit) = StringValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_string, out_unit), (string, ())); - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 0b2d9186f..d23ab391e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,19 +1,21 @@ -mod facet_level_value_f64_codec; -mod facet_level_value_u32_codec; -mod facet_string_level_zero_codec; -mod facet_string_level_zero_value_codec; +// mod facet_level_value_f64_codec; +// mod facet_level_value_u32_codec; +// mod facet_string_level_zero_codec; +// mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +pub mod new; + use heed::types::OwnedType; -pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; -pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; -pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; -pub use self::facet_string_level_zero_value_codec::{ - decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, -}; +// pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; +// pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; +// pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +// pub use 
self::facet_string_level_zero_value_codec::{ +// decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, +// }; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs new file mode 100644 index 000000000..5ed6a61f6 --- /dev/null +++ b/milli/src/heed_codec/facet/new/mod.rs @@ -0,0 +1,148 @@ +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; +use std::{borrow::Cow, convert::TryFrom, marker::PhantomData}; + +pub mod ordered_f64_codec; +pub mod str_ref; +// TODO: these codecs were quickly written and not fast/resilient enough + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct FacetKey { + pub field_id: u16, + pub level: u8, + pub left_bound: T, +} +impl<'a> FacetKey<&'a [u8]> { + pub fn into_owned(self) -> FacetKey> { + FacetKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetKey> { + pub fn as_ref(&self) -> FacetKey<&[u8]> { + FacetKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + +pub struct FacetGroupValue { + pub size: u8, + pub bitmap: RoaringBitmap, +} + +pub struct FacetKeyCodec { + _phantom: PhantomData, +} + +impl<'a, T> heed::BytesEncode<'a> for FacetKeyCodec +where + T: BytesEncode<'a>, + T::EItem: Sized, +{ + type EItem = FacetKey; + + fn bytes_encode(value: &'a Self::EItem) -> Option> { + let mut v = vec![]; + v.extend_from_slice(&value.field_id.to_be_bytes()); + v.extend_from_slice(&[value.level]); + + let bound = T::bytes_encode(&value.left_bound).unwrap(); + v.extend_from_slice(&bound); + + Some(Cow::Owned(v)) + } +} +impl<'a, T> heed::BytesDecode<'a> for FacetKeyCodec +where + T: BytesDecode<'a>, +{ + type DItem = FacetKey; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).unwrap()); + let level = bytes[2]; + let bound = T::bytes_decode(&bytes[3..]).unwrap(); + Some(FacetKey { field_id: fid, level, left_bound: bound }) + } +} + +pub struct FacetGroupValueCodec; +impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { + type EItem = FacetGroupValue; + + fn bytes_encode(value: &'a Self::EItem) -> Option> { + let mut v = vec![]; + v.push(value.size); + value.bitmap.serialize_into(&mut v).unwrap(); + Some(Cow::Owned(v)) + } +} +impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { + type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Option { + let size = bytes[0]; + let bitmap = RoaringBitmap::deserialize_from(&bytes[1..]).unwrap(); + Some(FacetGroupValue { size, bitmap }) + } +} + +// TODO: get rid of this codec as it is named confusingly + should really be part of heed +// or even replace the current ByteSlice codec +pub struct MyByteSlice; + +impl<'a> BytesEncode<'a> for MyByteSlice { + type EItem = &'a [u8]; + + fn bytes_encode(item: &'a Self::EItem) -> Option> { + Some(Cow::Borrowed(item)) + } +} + +impl<'a> BytesDecode<'a> for MyByteSlice { + type DItem = &'a [u8]; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + Some(bytes) + } +} + +// I won't need these ones anymore +// pub struct U16Codec; +// impl<'a> BytesEncode<'a> for U16Codec { +// type EItem = u16; + +// fn bytes_encode(item: &'a Self::EItem) -> Option> { +// 
+// I won't need these ones anymore
+// pub struct U16Codec;
+// impl<'a> BytesEncode<'a> for U16Codec {
+//     type EItem = u16;
+
+//     fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+//         Some(Cow::Owned(item.to_be_bytes().to_vec()))
+//     }
+// }
+// impl<'a> BytesDecode<'a> for U16Codec {
+//     type DItem = u16;
+
+//     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+//         Some(u16::from_be_bytes(bytes[0..=1].try_into().unwrap()))
+//     }
+// }
+
+// pub struct StrCodec;
+// impl<'a> BytesEncode<'a> for StrCodec {
+//     type EItem = &'a str;
+
+//     fn bytes_encode(item: &'a &'a str) -> Option<Cow<'a, [u8]>> {
+//         Some(Cow::Borrowed(item.as_bytes()))
+//     }
+// }
+// impl<'a> BytesDecode<'a> for StrCodec {
+//     type DItem = &'a str;
+
+//     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+//         let s = std::str::from_utf8(bytes).unwrap();
+//         Some(s)
+//     }
+// }
diff --git a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs
new file mode 100644
index 000000000..856a9c0d1
--- /dev/null
+++ b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs
@@ -0,0 +1,36 @@
+use std::{borrow::Cow, convert::TryInto};
+
+use heed::BytesDecode;
+
+use crate::facet::value_encoding::f64_into_bytes;
+
+pub struct OrderedF64Codec;
+
+impl<'a> BytesDecode<'a> for OrderedF64Codec {
+    type DItem = f64;
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        if bytes.len() < 16 {
+            return None;
+        }
+        let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?;
+        Some(f)
+    }
+}
+
+impl heed::BytesEncode<'_> for OrderedF64Codec {
+    type EItem = f64;
+
+    fn bytes_encode(f: &Self::EItem) -> Option<Cow<[u8]>> {
+        let mut buffer = [0u8; 16];
+
+        // write the globally ordered float
+        let bytes = f64_into_bytes(*f)?;
+        buffer[..8].copy_from_slice(&bytes[..]);
+        // Then the f64 value just to be able to read it back
+        let bytes = f.to_be_bytes();
+        buffer[8..16].copy_from_slice(&bytes[..]);
+
+        Some(Cow::Owned(buffer.to_vec()))
+    }
+}
diff --git a/milli/src/heed_codec/facet/new/str_ref.rs b/milli/src/heed_codec/facet/new/str_ref.rs
new file mode 100644
index 000000000..80a51c803
--- /dev/null
+++ b/milli/src/heed_codec/facet/new/str_ref.rs
@@ -0,0 +1,20 @@
+use std::borrow::Cow;
+
+use heed::{BytesDecode, BytesEncode};
+
+pub struct StrRefCodec;
+impl<'a> BytesEncode<'a> for StrRefCodec {
+    type EItem = &'a str;
+
+    fn bytes_encode(item: &'a &'a str) -> Option<Cow<'a, [u8]>> {
+        Some(Cow::Borrowed(item.as_bytes()))
+    }
+}
+impl<'a> BytesDecode<'a> for StrRefCodec {
+    type DItem = &'a str;
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let s = std::str::from_utf8(bytes).unwrap();
+        Some(s)
+    }
+}
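OrderedF64Codec delegates the order-preserving half of its 16-byte output to f64_into_bytes, which lives elsewhere in the crate (crate::facet::value_encoding) and is not shown in this patch. A plausible stand-alone sketch of the classic transform such a helper performs (flip every bit of a negative float, set the sign bit of a non-negative one), shown here only to make the layout concrete; milli's actual implementation may differ in detail:

    // After this mapping, big-endian byte order matches numeric order.
    fn f64_to_ordered_bytes(f: f64) -> Option<[u8; 8]> {
        if !f.is_finite() {
            return None; // mirrors the Option returned by f64_into_bytes
        }
        let bits = f.to_bits();
        // negatives: flip all bits; non-negatives: set the sign bit
        let ordered = if f < 0.0 { !bits } else { bits | (1u64 << 63) };
        Some(ordered.to_be_bytes())
    }

    fn main() {
        let values = [-250.0f64, -10.5, 0.0, 1.0, 250.0];
        let encoded: Vec<[u8; 8]> =
            values.iter().map(|f| f64_to_ordered_bytes(*f).unwrap()).collect();
        // lexicographic byte order agrees with numeric order
        assert!(encoded.windows(2).all(|pair| pair[0] < pair[1]));
    }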
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 94e2f538d..0561a77ac 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -13,9 +13,14 @@ use time::OffsetDateTime;
 
 use crate::error::{InternalError, UserError};
 use crate::fields_ids_map::FieldsIdsMap;
+use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
+use crate::heed_codec::facet::new::str_ref::StrRefCodec;
+use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec};
 use crate::heed_codec::facet::{
-    FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
-    FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec,
+    // FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
+    FieldDocIdFacetF64Codec,
+    FieldDocIdFacetStringCodec,
+    FieldIdCodec,
 };
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
@@ -123,10 +128,10 @@ pub struct Index {
     /// Maps the facet field id and the docids for which this field exists
     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
 
-    /// Maps the facet field id, level and the number with the docids that correspond to it.
-    pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
-    /// Maps the facet field id and the string with the original string and docids that correspond to it.
-    pub facet_id_string_docids: Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
+    /// Maps the facet field id and ranges of numbers with the docids that correspond to them.
+    pub facet_id_f64_docids: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
+    /// Maps the facet field id and ranges of strings with the docids that correspond to them.
+    pub facet_id_string_docids: Database<FacetKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
 
     /// Maps the document id, the facet field id and the numbers.
     pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs
index 6d50c1bb5..bd08c54a5 100644
--- a/milli/src/search/criteria/asc_desc.rs
+++ b/milli/src/search/criteria/asc_desc.rs
@@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
 
 use super::{Criterion, CriterionParameters, CriterionResult};
 use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
-use crate::search::facet::{FacetNumberIter, FacetStringIter};
+// use crate::search::facet::FacetStringIter;
 use crate::search::query_tree::Operation;
 use crate::{FieldId, Index, Result};
 
@@ -186,23 +186,24 @@ fn facet_ordered<'t>(
         iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?;
         Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box<dyn Iterator<Item = _> + 't>)
     } else {
-        let facet_number_fn = if is_ascending {
-            FacetNumberIter::new_reducing
-        } else {
-            FacetNumberIter::new_reverse_reducing
-        };
-        let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())?
-            .map(|res| res.map(|(_, docids)| docids));
+        todo!()
+        // let facet_number_fn = if is_ascending {
+        //     FacetNumberIter::new_reducing
+        // } else {
+        //     FacetNumberIter::new_reverse_reducing
+        // };
+        // let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())?
+        //     .map(|res| res.map(|(_, docids)| docids));
 
-        let facet_string_fn = if is_ascending {
-            FacetStringIter::new_reducing
-        } else {
-            FacetStringIter::new_reverse_reducing
-        };
-        let string_iter = facet_string_fn(rtxn, index, field_id, candidates)?
-            .map(|res| res.map(|(_, _, docids)| docids));
+        // let facet_string_fn = if is_ascending {
+        //     FacetStringIter::new_reducing
+        // } else {
+        //     FacetStringIter::new_reverse_reducing
+        // };
+        // let string_iter = facet_string_fn(rtxn, index, field_id, candidates)?
+        //     .map(|res| res.map(|(_, _, docids)| docids));
 
-        Ok(Box::new(number_iter.chain(string_iter)))
+        // Ok(Box::new(number_iter.chain(string_iter)))
     }
 }
diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs
index 33e7b4975..4a4815775 100644
--- a/milli/src/search/distinct/facet_distinct.rs
+++ b/milli/src/search/distinct/facet_distinct.rs
@@ -6,6 +6,7 @@ use roaring::RoaringBitmap;
 
 use super::{Distinct, DocIter};
 use crate::error::InternalError;
+use crate::heed_codec::facet::new::FacetKey;
 use crate::heed_codec::facet::*;
 use crate::index::db_name;
 use crate::{DocumentId, FieldId, Index, Result};
@@ -47,13 +48,16 @@ impl<'a> FacetDistinctIter<'a> {
     fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
         self.index
             .facet_id_string_docids
-            .get(self.txn, &(self.distinct, key))
-            .map(|result| result.map(|(_original, docids)| docids))
+            .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key })
+            .map(|opt| opt.map(|v| v.bitmap))
     }
 
     fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
         // get facet docids on level 0
-        self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key))
+        self.index
+            .facet_id_f64_docids
+            .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key })
+            .map(|opt| opt.map(|v| v.bitmap))
     }
 
     fn distinct_string(&mut self, id: DocumentId) -> Result<()> {
diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs
index b2718a490..fddf93d4b 100644
--- a/milli/src/search/facet/facet_distribution.rs
+++ b/milli/src/search/facet/facet_distribution.rs
@@ -7,10 +7,8 @@ use roaring::RoaringBitmap;
 
 use crate::error::UserError;
 use crate::facet::FacetType;
-use crate::heed_codec::facet::{
-    FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
-};
-use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter};
+use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec};
+// use crate::search::facet::FacetStringIter;
 use crate::{FieldId, Index, Result};
 
 /// The default number of values by facets that will
@@ -133,21 +131,22 @@ impl<'a> FacetDistribution<'a> {
         candidates: &RoaringBitmap,
         distribution: &mut BTreeMap<String, u64>,
     ) -> heed::Result<()> {
-        let iter =
-            FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
+        todo!()
+        // let iter =
+        //     FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
 
-        for result in iter {
-            let (value, mut docids) = result?;
-            docids &= candidates;
-            if !docids.is_empty() {
-                distribution.insert(value.to_string(), docids.len());
-            }
-            if distribution.len() == self.max_values_per_facet {
-                break;
-            }
-        }
+        // for result in iter {
+        //     let (value, mut docids) = result?;
+        //     docids &= candidates;
+        //     if !docids.is_empty() {
+        //         distribution.insert(value.to_string(), docids.len());
+        //     }
+        //     if distribution.len() == self.max_values_per_facet {
+        //         break;
+        //     }
+        // }
 
-        Ok(())
+        // Ok(())
     }
 
     fn facet_strings_distribution_from_facet_levels(
         &self,
         field_id: FieldId,
         candidates: &RoaringBitmap,
         distribution: &mut BTreeMap<String, u64>,
     ) -> heed::Result<()> {
-        let iter =
-            FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
+        todo!()
+        // let iter =
+        //     FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
 
-        for result in iter {
-            let (_normalized, original, mut docids) =
result?; - docids &= candidates; - if !docids.is_empty() { - distribution.insert(original.to_string(), docids.len()); - } - if distribution.len() == self.max_values_per_facet { - break; - } - } + // for result in iter { + // let (_normalized, original, mut docids) = result?; + // docids &= candidates; + // if !docids.is_empty() { + // distribution.insert(original.to_string(), docids.len()); + // } + // if distribution.len() == self.max_values_per_facet { + // break; + // } + // } - Ok(()) + // Ok(()) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -179,41 +179,43 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, ) -> heed::Result> { - let mut distribution = BTreeMap::new(); + todo!() + // let mut distribution = BTreeMap::new(); - let db = self.index.facet_id_f64_docids; - let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + // let db = self.index.facet_id_f64_docids; + // let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; - for result in range { - let ((_, _, value, _), docids) = result?; - distribution.insert(value.to_string(), docids.len()); - if distribution.len() == self.max_values_per_facet { - break; - } - } + // for result in range { + // let ((_, _, value, _), docids) = result?; + // distribution.insert(value.to_string(), docids.len()); + // if distribution.len() == self.max_values_per_facet { + // break; + // } + // } - let iter = self - .index - .facet_id_string_docids - .remap_key_type::() - .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + // let iter = self + // .index + // .facet_id_string_docids + // .remap_key_type::() + // .prefix_iter(self.rtxn, &field_id.to_be_bytes())? + // .remap_key_type::(); - let mut normalized_distribution = BTreeMap::new(); - for result in iter { - let ((_, normalized_value), (original_value, docids)) = result?; - normalized_distribution.insert(normalized_value, (original_value, docids.len())); - if normalized_distribution.len() == self.max_values_per_facet { - break; - } - } + // let mut normalized_distribution = BTreeMap::new(); + // for result in iter { + // let ((_, normalized_value), group_value) = result?; + // normalized_distribution + // .insert(normalized_value, (normalized_value, group_value.bitmap.len())); + // if normalized_distribution.len() == self.max_values_per_facet { + // break; + // } + // } - let iter = normalized_distribution - .into_iter() - .map(|(_normalized, (original, count))| (original.to_string(), count)); - distribution.extend(iter); + // let iter = normalized_distribution + // .into_iter() + // .map(|(_normalized, (original, count))| (original.to_string(), count)); + // distribution.extend(iter); - Ok(distribution) + // Ok(distribution) } fn facet_values(&self, field_id: FieldId) -> heed::Result> { diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs index 02390aac1..5f7bd5325 100644 --- a/milli/src/search/facet/facet_number.rs +++ b/milli/src/search/facet/facet_number.rs @@ -1,248 +1,335 @@ -use std::ops::Bound::{self, Excluded, Included, Unbounded}; +// use std::ops::Bound::{self, Excluded, Included, Unbounded}; -use either::Either::{self, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; +// use either::Either::{self, Left, Right}; +// use heed::types::{ByteSlice, DecodeIgnore}; +// use heed::{BytesDecode, BytesEncode, Database, Lazy, 
LazyDecode, RoRange, RoRevRange}; +// use obkv::Key; +// use roaring::RoaringBitmap; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; +// use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +// use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; +// use crate::heed_codec::CboRoaringBitmapCodec; +// use crate::{FieldId, Index}; -pub struct FacetNumberRange<'t> { - iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} +// pub struct FacetNumberRange<'t, 'e> { +// rtxn: &'t heed::RoTxn<'e>, +// db: Database, FacetGroupValueCodec>, +// iter: RoRange<'t, FacetKeyCodec, LazyDecode>, +// max_bound: f64, +// previous: Option<(FacetKey, Lazy<'t, FacetGroupValueCodec>)>, +// field_id: FieldId, +// end: Bound, +// } -impl<'t> FacetNumberRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetNumberRange { iter, end: right }) - } -} +// impl<'t, 'e> FacetNumberRange<'t, 'e> { +// pub fn new( +// rtxn: &'t heed::RoTxn<'e>, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level: u8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let left_bound = match left { +// Included(left_bound) => Included(FacetKey { field_id, level, left_bound }), +// Excluded(left_bound) => Excluded(FacetKey { field_id, level, left_bound }), +// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), +// }; -impl<'t> Iterator for FacetNumberRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; +// let mut iter = db.lazily_decode_data().range(rtxn, &(left_bound, Unbounded))?; +// let mut previous = iter.next().transpose()?; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => Some(Ok(((fid, level, left, right), docids))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// // Compute the maximum end bound by looking at the key of the last element in level 0 +// let mut prefix_level_0 = vec![]; +// prefix_level_0.extend_from_slice(&field_id.to_be_bytes()); +// prefix_level_0.push(level); -pub struct FacetNumberRevRange<'t> { - iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} +// let mut rev_iter = +// db.as_polymorph().rev_prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, &prefix_level_0)?; -impl<'t> FacetNumberRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, 
f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetNumberRevRange { iter, end: right }) - } -} +// let rev_iter_first = rev_iter.next().transpose()?; +// let max_bound = if let Some((max_bound_key, _)) = rev_iter_first { +// let max_bound_key = +// FacetKeyCodec::::bytes_decode(max_bound_key).unwrap(); +// max_bound_key.left_bound +// } else { +// // I can't imagine when that would happen, but let's handle it correctly anyway +// // by making the iterator empty +// previous = None; +// 0.0 // doesn't matter since previous = None so the iterator will always early exit +// // and return None itself +// }; -impl<'t> Iterator for FacetNumberRevRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; +// Ok(FacetNumberRange { rtxn, db, iter, field_id, previous, max_bound, end: right }) +// } +// } - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} +// impl<'t, 'e> Iterator for FacetNumberRange<'t, 'e> { +// type Item = heed::Result<(FacetKey, RoaringBitmap)>; -pub struct FacetNumberIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, - must_reduce: bool, -} +// fn next(&mut self) -> Option { +// // The idea here is to return the **previous** element only if the left +// // bound of the current key fits within the range given to the iter +// // if it doesn't, then there is still a chance that it must be returned, +// // but we need to check the actual right bound of the group by looking for +// // the key preceding the first key of the next group in level 0 -impl<'t> FacetNumberIter<'t> { - /// Create a `FacetNumberIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } +// let (prev_key, prev_value) = self.previous?; - /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. 
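// Editorial sketch, not part of the patch: the new FacetKey stores only a
// group's *left* bound, so the commented-out FacetNumberRange above derives a
// right bound from the next group's left bound, falling back to the greatest
// level-0 key (`max_bound`) for the final group. Over a sorted slice of left
// bounds the idea reduces to:

fn group_bounds(left_bounds: &[f64], i: usize, max_bound: f64) -> (f64, f64) {
    let left = left_bounds[i];
    // right bound = successor's left bound, or max_bound for the last group
    let right = left_bounds.get(i + 1).copied().unwrap_or(max_bound);
    (left, right)
}

fn main() {
    // two level-1 groups starting at 0.0 and 3.0; greatest level-0 key is 5.0
    assert_eq!(group_bounds(&[0.0, 3.0], 0, 5.0), (0.0, 3.0));
    assert_eq!(group_bounds(&[0.0, 3.0], 1, 5.0), (3.0, 5.0));
}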
- pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids; - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Right(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } +// let (next_left_bound, next_previous) = if let Some(next) = self.iter.next() { +// let (key, group_value) = match next { +// Ok(n) => n, +// Err(e) => return Some(Err(e)), +// }; +// (key.left_bound, Some((key, group_value))) +// } else { +// // we're at the end of the level iter, so we need to fetch the max bound instead +// (self.max_bound, None) +// }; +// let must_be_returned = match self.end { +// Included(end) => next_left_bound <= end, +// Excluded(end) => next_left_bound < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match prev_value.decode() { +// Ok(group_value) => { +// self.previous = next_previous; +// Some(Ok((prev_key, group_value.bitmap))) +// } +// Err(e) => Some(Err(e)), +// } +// } else { +// // it still possible that we want to return the value (one last time) +// // but to do so, we need to fetch the right bound of the current group +// // this is done by getting the first element at level 0 of the next group +// // then iterating in reverse from it +// // once we have the right bound, we can compare it, and then return or not +// // then we still set self.previous to None so that no other element can return +// // from it? +// let mut level_0_key_prefix = vec![]; +// level_0_key_prefix.extend_from_slice(&self.field_id.to_be_bytes()); +// level_0_key_prefix.push(0); +// let key = +// FacetKey:: { field_id: self.field_id, level: 0, left_bound: next_left_bound }; +// let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); +// level_0_key_prefix.extend_from_slice(&key_bytes); - /// Create a `FacetNumberIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will not reduce the given documents ids - /// while iterating on the different facet levels, possibly returning multiple times - /// a document id associated with multiple facet values. 
- pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) - } +// let mut rev_iter_next_group_level_0 = self +// .db +// .as_polymorph() +// .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&self.rtxn, &level_0_key_prefix) +// .unwrap(); +// let (key_for_right_bound, _) = rev_iter_next_group_level_0.next().unwrap().unwrap(); +// let key_for_right_bound = +// FacetKeyCodec::::bytes_decode(key_for_right_bound).unwrap(); +// let right_bound = key_for_right_bound.left_bound; +// let must_be_returned = match self.end { +// Included(end) => right_bound <= end, +// Excluded(end) => right_bound < end, +// Unbounded => unreachable!(), +// }; +// self.previous = None; +// if must_be_returned { +// match prev_value.decode() { +// Ok(group_value) => Some(Ok((prev_key, group_value.bitmap))), +// Err(e) => Some(Err(e)), +// } +// } else { +// None +// } +// } +// } +// } - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - let level = db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? - .remap_key_type::() - .last() - .transpose()? - .map(|((_, level, _, _), _)| level); - Ok(level) - } -} +// pub struct FacetNumberRevRange<'t> { +// iter: RoRevRange<'t, FacetKeyCodec, LazyDecode>, +// end: Bound, +// } -impl<'t> Iterator for FacetNumberIter<'t> { - type Item = heed::Result<(f64, RoaringBitmap)>; +// impl<'t> FacetNumberRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level: u8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let left_bound = match left { +// Included(left) => Included(FacetKey { field_id, level, left_bound: left }), +// Excluded(left) => Excluded(FacetKey { field_id, level, left_bound: left }), +// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), +// }; +// let right_bound = Included(FacetKey { field_id, level, left_bound: f64::MAX }); +// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetNumberRevRange { iter, end: right }) +// } +// } - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); - for result in last { - // If the last iterator must find an empty set of documents it means - // that we found all the documents in the sub level iterations already, - // we can pop this level iterator. 
- if documents_ids.is_empty() { - break; - } +// impl<'t> Iterator for FacetNumberRevRange<'t> { +// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - match result { - Ok(((_fid, level, left, right), mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } +// fn next(&mut self) -> Option { +// loop { +// match self.iter.next() { +// Some(Ok((FacetKey { field_id, level, left_bound }, docids))) => { +// let must_be_returned = match self.end { +// Included(end) => todo!(), //right <= end, +// Excluded(end) => todo!(), //right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok(docids) => { +// return Some(Ok(( +// FacetKey { field_id, level, left_bound }, +// docids.bitmap, +// ))) +// } +// Err(e) => return Some(Err(e)), +// } +// } +// continue; +// } +// Some(Err(e)) => return Some(Err(e)), +// None => return None, +// } +// } +// } +// } - if level == 0 { - return Some(Ok((left, docids))); - } +// pub struct FacetNumberIter<'t, 'e> { +// rtxn: &'t heed::RoTxn<'t>, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, +// must_reduce: bool, +// } - let rtxn = self.rtxn; - let db = self.db; - let fid = self.field_id; - let left = Included(left); - let right = Included(right); +// impl<'t, 'e> FacetNumberIter<'t, 'e> { +// /// Create a `FacetNumberIter` that will iterate on the different facet entries +// /// (facet value + documents ids) and that will reduce the given documents ids +// /// while iterating on the different facet levels. +// pub fn new_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Left(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) +// } - let result = if is_ascending { - FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) - .map(Left) - } else { - FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) - .map(Right) - }; +// /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse +// /// (facet value + documents ids) and that will reduce the given documents ids +// /// while iterating on the different facet levels. 
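// Editorial sketch, not part of the patch: the "reducing" flavour of these
// constructors strips every yielded docid from the remaining candidates, so a
// document is only returned for its first facet value. The core step, as in
// the `next()` bodies above and below:

use roaring::RoaringBitmap;

fn reduce_step(candidates: &mut RoaringBitmap, group_docids: &RoaringBitmap) -> RoaringBitmap {
    let mut docids = group_docids.clone();
    docids &= &*candidates; // keep only candidates that appear in this group
    *candidates -= &docids; // reducing: these documents will not be yielded again
    docids
}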
+// pub fn new_reverse_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Right(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) +// } - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - self.level_iters.pop(); - } - } -} +// /// Create a `FacetNumberIter` that will iterate on the different facet entries +// /// (facet value + documents ids) and that will not reduce the given documents ids +// /// while iterating on the different facet levels, possibly returning multiple times +// /// a document id associated with multiple facet values. +// pub fn new_non_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Left(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) +// } + +// fn highest_level( +// rtxn: &'t heed::RoTxn, +// db: Database, X>, +// fid: FieldId, +// ) -> heed::Result> { +// let level = db +// .remap_types::() +// .prefix_iter(rtxn, &fid.to_be_bytes())? +// .remap_key_type::>() +// .last() +// .transpose()? +// .map(|(key, _)| key.level); +// Ok(level) +// } +// } + +// impl<'t, 'e> Iterator for FacetNumberIter<'t, 'e> { +// type Item = heed::Result<(f64, RoaringBitmap)>; + +// fn next(&mut self) -> Option { +// 'outer: loop { +// let (documents_ids, last) = self.level_iters.last_mut()?; +// let is_ascending = last.is_left(); +// for result in last { +// // If the last iterator must find an empty set of documents it means +// // that we found all the documents in the sub level iterations already, +// // we can pop this level iterator. 
+// if documents_ids.is_empty() { +// break; +// } + +// match result { +// Ok((key, mut docids)) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } + +// if level == 0 { +// return Some(Ok((left, docids))); +// } + +// let rtxn = self.rtxn; +// let db = self.db; +// let fid = self.field_id; +// let left = Included(left); +// let right = Included(right); + +// let result = if is_ascending { +// FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) +// .map(Left) +// } else { +// FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) +// .map(Right) +// }; + +// match result { +// Ok(iter) => { +// self.level_iters.push((docids, iter)); +// continue 'outer; +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// self.level_iters.pop(); +// } +// } +// } diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index c55430cf1..b01359503 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -1,652 +1,649 @@ -//! This module contains helpers iterators for facet strings. -//! -//! The purpose is to help iterate over the quite complex system of facets strings. A simple -//! description of the system would be that every facet string value is stored into an LMDB database -//! and that every value is associated with the document ids which are associated with this facet -//! string value. -//! -//! In reality it is a little bit more complex as we have to create aggregations of runs of facet -//! string values, those aggregations helps in choosing the right groups of facets to follow. -//! -//! ## A typical algorithm run -//! -//! If a group of aggregated facets values contains one of the documents ids, we must continue -//! iterating over the sub-groups. -//! -//! If this group is the lowest level and contain at least one document id we yield the associated -//! facet documents ids. -//! -//! If the group doesn't contain one of our documents ids, we continue to the next group at this -//! same level. -//! -//! ## The complexity comes from the strings -//! -//! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create -//! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the -//! two numbers bounds, the left and the right bound of the group, both inclusive. -//! -//! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and -//! puting two numbers big-endian encoded one after the other gives us ordered groups. The values -//! are simple unions of the documents ids coming from the groups below. -//! -//! ### Example of what a facet number LMDB database contain -//! -//! | level | left-bound | right-bound | documents ids | -//! |-------|------------|-------------|------------------| -//! | 0 | 0 | _skipped_ | 1, 2 | -//! | 0 | 1 | _skipped_ | 6, 7 | -//! | 0 | 3 | _skipped_ | 4, 7 | -//! | 0 | 5 | _skipped_ | 2, 3, 4 | -//! | 1 | 0 | 1 | 1, 2, 6, 7 | -//! | 1 | 3 | 5 | 2, 3, 4, 7 | -//! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | -//! -//! As you can see the level 0 have two equal bounds, therefore we skip serializing the second -//! bound, that's the base level where you can directly fetch the documents ids associated with an -//! exact number. -//! -//! The next levels have two different bounds and the associated documents ids are simply the result -//! 
of an union of all the documents ids associated with the aggregated groups above. -//! -//! ## The complexity of defining groups for facet strings -//! -//! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in -//! lexicographical order, it means that whatever the key represent the bytes are read in their raw -//! form and a simple `strcmp` will define the order in which keys will be read from the store. -//! -//! That's easy for types with a known size, like floats or integers, they are 64 bytes long and -//! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the -//! first number then by the second if the the first number is equal on two keys. -//! -//! For strings it is a lot more complex as those types are unsized, it means that the size of facet -//! strings is different for each facet value. -//! -//! ### Basic approach: padding the keys -//! -//! A first approach would be to simply define the maximum size of a facet string and pad the keys -//! with zeroes. The big problem of this approach is that it: -//! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the -//! other. -//! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB -//! performances. -//! -//! ### Better approach: number the facet groups -//! -//! A better approach would be to number the groups, this way we don't have the downsides of the -//! previously described approach but we need to be able to describe the groups by using a number. -//! -//! #### Example of facet strings with numbered groups -//! -//! | level | left-bound | right-bound | left-string | right-string | documents ids | -//! |-------|------------|-------------|-------------|--------------|------------------| -//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | -//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | -//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | -//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | -//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | -//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | -//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | -//! -//! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not -//! need to store the facet string value two times. -//! -//! The number in the left-bound and right-bound columns are incremental numbers representing the -//! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering -//! of the LMDB keys. -//! -//! In the value, not in the key, you can see that we added two new values: the left-string and the -//! right-string, which defines the original facet strings associated with the given group. -//! -//! We put those two strings inside of the value, this way we do not limit the maximum size of the -//! facet string values, and the impact on performances is not important as, IIRC, LMDB put big -//! values on another page, this helps in iterating over keys fast enough and only fetch the page -//! with the values when required. -//! -//! The other little advantage with this solution is that there is no a big overhead, compared with -//! the facet number levels, we only duplicate the facet strings once for the level 1. -//! -//! #### A typical algorithm run -//! -//! Note that the algorithm is always moving from the highest level to the lowest one, one level -//! 
by one level, this is why it is ok to only store the facets string on the level 1. -//! -//! If a group of aggregated facets values, a group with numbers contains one of the documents ids, -//! we must continue iterating over the sub-groups. To do so: -//! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds -//! and iterate over the facet groups defined by these numbers over the current level - 1. -//! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the -//! value and just do the same as with the facet numbers but with strings: iterate over the -//! current level - 1 with both keys. -//! -//! If this group is the lowest level (level 0) and contain at least one document id we yield the -//! associated facet documents ids. -//! -//! If the group doesn't contain one of our documents ids, we continue to the next group at this -//! same level. -//! +// //! This module contains helpers iterators for facet strings. +// //! +// //! The purpose is to help iterate over the quite complex system of facets strings. A simple +// //! description of the system would be that every facet string value is stored into an LMDB database +// //! and that every value is associated with the document ids which are associated with this facet +// //! string value. +// //! +// //! In reality it is a little bit more complex as we have to create aggregations of runs of facet +// //! string values, those aggregations helps in choosing the right groups of facets to follow. +// //! +// //! ## A typical algorithm run +// //! +// //! If a group of aggregated facets values contains one of the documents ids, we must continue +// //! iterating over the sub-groups. +// //! +// //! If this group is the lowest level and contain at least one document id we yield the associated +// //! facet documents ids. +// //! +// //! If the group doesn't contain one of our documents ids, we continue to the next group at this +// //! same level. +// //! +// //! ## The complexity comes from the strings +// //! +// //! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create +// //! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the +// //! two numbers bounds, the left and the right bound of the group, both inclusive. +// //! +// //! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and +// //! puting two numbers big-endian encoded one after the other gives us ordered groups. The values +// //! are simple unions of the documents ids coming from the groups below. +// //! +// //! ### Example of what a facet number LMDB database contain +// //! +// //! | level | left-bound | right-bound | documents ids | +// //! |-------|------------|-------------|------------------| +// //! | 0 | 0 | _skipped_ | 1, 2 | +// //! | 0 | 1 | _skipped_ | 6, 7 | +// //! | 0 | 3 | _skipped_ | 4, 7 | +// //! | 0 | 5 | _skipped_ | 2, 3, 4 | +// //! | 1 | 0 | 1 | 1, 2, 6, 7 | +// //! | 1 | 3 | 5 | 2, 3, 4, 7 | +// //! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | +// //! +// //! As you can see the level 0 have two equal bounds, therefore we skip serializing the second +// //! bound, that's the base level where you can directly fetch the documents ids associated with an +// //! exact number. +// //! +// //! The next levels have two different bounds and the associated documents ids are simply the result +// //! of an union of all the documents ids associated with the aggregated groups above. +// //! 
+// //! ## The complexity of defining groups for facet strings +// //! +// //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in +// //! lexicographical order, it means that whatever the key represent the bytes are read in their raw +// //! form and a simple `strcmp` will define the order in which keys will be read from the store. +// //! +// //! That's easy for types with a known size, like floats or integers, they are 64 bytes long and +// //! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the +// //! first number then by the second if the the first number is equal on two keys. +// //! +// //! For strings it is a lot more complex as those types are unsized, it means that the size of facet +// //! strings is different for each facet value. +// //! +// //! ### Basic approach: padding the keys +// //! +// //! A first approach would be to simply define the maximum size of a facet string and pad the keys +// //! with zeroes. The big problem of this approach is that it: +// //! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the +// //! other. +// //! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB +// //! performances. +// //! +// //! ### Better approach: number the facet groups +// //! +// //! A better approach would be to number the groups, this way we don't have the downsides of the +// //! previously described approach but we need to be able to describe the groups by using a number. +// //! +// //! #### Example of facet strings with numbered groups +// //! +// //! | level | left-bound | right-bound | left-string | right-string | documents ids | +// //! |-------|------------|-------------|-------------|--------------|------------------| +// //! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | +// //! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | +// //! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | +// //! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | +// //! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | +// //! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | +// //! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | +// //! +// //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not +// //! need to store the facet string value two times. +// //! +// //! The number in the left-bound and right-bound columns are incremental numbers representing the +// //! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering +// //! of the LMDB keys. +// //! +// //! In the value, not in the key, you can see that we added two new values: the left-string and the +// //! right-string, which defines the original facet strings associated with the given group. +// //! +// //! We put those two strings inside of the value, this way we do not limit the maximum size of the +// //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big +// //! values on another page, this helps in iterating over keys fast enough and only fetch the page +// //! with the values when required. +// //! +// //! The other little advantage with this solution is that there is no a big overhead, compared with +// //! the facet number levels, we only duplicate the facet strings once for the level 1. +// //! +// //! #### A typical algorithm run +// //! +// //! 
Note that the algorithm is always moving from the highest level to the lowest one, one level +// //! by one level, this is why it is ok to only store the facets string on the level 1. +// //! +// //! If a group of aggregated facets values, a group with numbers contains one of the documents ids, +// //! we must continue iterating over the sub-groups. To do so: +// //! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds +// //! and iterate over the facet groups defined by these numbers over the current level - 1. +// //! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the +// //! value and just do the same as with the facet numbers but with strings: iterate over the +// //! current level - 1 with both keys. +// //! +// //! If this group is the lowest level (level 0) and contain at least one document id we yield the +// //! associated facet documents ids. +// //! +// //! If the group doesn't contain one of our documents ids, we continue to the next group at this +// //! same level. +// //! -use std::num::NonZeroU8; -use std::ops::Bound; -use std::ops::Bound::{Excluded, Included, Unbounded}; +// use std::num::NonZeroU8; +// use std::ops::Bound; +// use std::ops::Bound::{Excluded, Included, Unbounded}; -use either::{Either, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; +// use either::{Either, Left, Right}; +// use heed::types::{ByteSlice, DecodeIgnore}; +// use heed::{Database, LazyDecode, RoRange, RoRevRange}; +// use roaring::RoaringBitmap; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FacetStringZeroBoundsValueCodec, -}; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; +// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +// use crate::heed_codec::CboRoaringBitmapCodec; +// use crate::{FieldId, Index}; -/// An iterator that is used to explore the facets level strings -/// from the level 1 to infinity. -/// -/// It yields the level, group id that an entry covers, the optional group strings -/// that it covers of the level 0 only if it is an entry from the level 1 and -/// the roaring bitmap associated. -pub struct FacetStringGroupRange<'t> { - iter: RoRange< - 't, - FacetLevelValueU32Codec, - LazyDecode>, - >, - end: Bound, -} +// /// An iterator that is used to explore the facets level strings +// /// from the level 1 to infinity. +// /// +// /// It yields the level, group id that an entry covers, the optional group strings +// /// that it covers of the level 0 only if it is an entry from the level 1 and +// /// the roaring bitmap associated. 
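// Editorial sketch, not part of the patch: a toy run of the descent described
// in the module documentation above, on its example table (alpha, beta, gamma,
// omega). Level 1 holds unions over ranges of level-0 entries; a group is only
// explored when it shares documents with the remaining candidates.

use roaring::RoaringBitmap;

fn bm(ids: &[u32]) -> RoaringBitmap {
    ids.iter().copied().collect()
}

fn main() {
    let level0 = [
        ("alpha", bm(&[1, 2])),
        ("beta", bm(&[6, 7])),
        ("gamma", bm(&[4, 7])),
        ("omega", bm(&[2, 3, 4])),
    ];
    // (range of level-0 entries covered, union of their docids)
    let level1 = [(0..2, bm(&[1, 2, 6, 7])), (2..4, bm(&[2, 3, 4, 7]))];

    let mut candidates = bm(&[4, 6]);
    for (range, group_docids) in level1 {
        if candidates.is_empty() {
            break;
        }
        if candidates.is_disjoint(&group_docids) {
            continue; // the whole group can be skipped
        }
        for (value, docids) in &level0[range] {
            let hits = docids & &candidates;
            if !hits.is_empty() {
                candidates -= &hits; // reducing
                println!("{value}: {hits:?}"); // beta for doc 6, then gamma for doc 4
            }
        }
    }
}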
+// pub struct FacetStringGroupRange<'t> { +// iter: RoRange< +// 't, +// FacetLevelValueU32Codec, +// LazyDecode>, +// >, +// end: Bound, +// } -impl<'t> FacetStringGroupRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRange { iter, end: right }) - } -} +// impl<'t> FacetStringGroupRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// level: NonZeroU8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let db = db.remap_types::< +// FacetLevelValueU32Codec, +// FacetStringZeroBoundsValueCodec, +// >(); +// let left_bound = match left { +// Included(left) => Included((field_id, level, left, u32::MIN)), +// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), +// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), +// }; +// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); +// let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetStringGroupRange { iter, end: right }) +// } +// } -impl<'t> Iterator for FacetStringGroupRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; +// impl<'t> Iterator for FacetStringGroupRange<'t> { +// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, level, left, right), docids))) => { +// let must_be_returned = match self.end { +// Included(end) => right <= end, +// Excluded(end) => right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), +// Err(e) => Some(Err(e)), +// } +// } else { +// None +// } +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -pub struct FacetStringGroupRevRange<'t> { - iter: RoRevRange< - 't, - FacetLevelValueU32Codec, - LazyDecode>, - >, - end: Bound, -} +// pub struct FacetStringGroupRevRange<'t> { +// iter: RoRevRange< +// 't, +// FacetLevelValueU32Codec, +// LazyDecode>, +// >, +// end: Bound, +// } -impl<'t> FacetStringGroupRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = 
match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRevRange { iter, end: right }) - } -} +// impl<'t> FacetStringGroupRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// level: NonZeroU8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let db = db.remap_types::< +// FacetLevelValueU32Codec, +// FacetStringZeroBoundsValueCodec, +// >(); +// let left_bound = match left { +// Included(left) => Included((field_id, level, left, u32::MIN)), +// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), +// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), +// }; +// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); +// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetStringGroupRevRange { iter, end: right }) +// } +// } -impl<'t> Iterator for FacetStringGroupRevRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; +// impl<'t> Iterator for FacetStringGroupRevRange<'t> { +// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => { - return Some(Ok(((level, left, right), (bounds, docids)))) - } - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} +// fn next(&mut self) -> Option { +// loop { +// match self.iter.next() { +// Some(Ok(((_fid, level, left, right), docids))) => { +// let must_be_returned = match self.end { +// Included(end) => right <= end, +// Excluded(end) => right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok((bounds, docids)) => { +// return Some(Ok(((level, left, right), (bounds, docids)))) +// } +// Err(e) => return Some(Err(e)), +// } +// } +// continue; +// } +// Some(Err(e)) => return Some(Err(e)), +// None => return None, +// } +// } +// } +// } -/// An iterator that is used to explore the level 0 of the facets string database. -/// -/// It yields the facet string and the roaring bitmap associated with it. -pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -} +// /// An iterator that is used to explore the level 0 of the facets string database. +// /// +// /// It yields the facet string and the roaring bitmap associated with it. 
+// pub struct FacetStringLevelZeroRange<'t> { +// iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, +// } -impl<'t> FacetStringLevelZeroRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } +// impl<'t> FacetStringLevelZeroRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// left: Bound<&str>, +// right: Bound<&str>, +// ) -> heed::Result> { +// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { +// buffer.extend_from_slice(&field_id.to_be_bytes()); +// buffer.push(0); +// buffer.extend_from_slice(value.as_bytes()); +// &buffer[..] +// } - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; +// let mut left_buffer = Vec::new(); +// let left_bound = match left { +// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), +// Unbounded => { +// left_buffer.extend_from_slice(&field_id.to_be_bytes()); +// left_buffer.push(0); +// Included(&left_buffer[..]) +// } +// }; - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; +// let mut right_buffer = Vec::new(); +// let right_bound = match right { +// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), +// Unbounded => { +// right_buffer.extend_from_slice(&field_id.to_be_bytes()); +// right_buffer.push(1); // we must only get the level 0 +// Excluded(&right_buffer[..]) +// } +// }; - let iter = db - .remap_key_type::() - .range(rtxn, &(left_bound, right_bound))? - .remap_types::(); +// let iter = db +// .remap_key_type::() +// .range(rtxn, &(left_bound, right_bound))? 
+// .remap_types::(); - Ok(FacetStringLevelZeroRange { iter }) - } -} +// Ok(FacetStringLevelZeroRange { iter }) +// } +// } -impl<'t> Iterator for FacetStringLevelZeroRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringLevelZeroRange<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, normalized), (original, docids)))) => { +// Some(Ok((normalized, original, docids))) +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -pub struct FacetStringLevelZeroRevRange<'t> { - iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -} +// pub struct FacetStringLevelZeroRevRange<'t> { +// iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, +// } -impl<'t> FacetStringLevelZeroRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } +// impl<'t> FacetStringLevelZeroRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// left: Bound<&str>, +// right: Bound<&str>, +// ) -> heed::Result> { +// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { +// buffer.extend_from_slice(&field_id.to_be_bytes()); +// buffer.push(0); +// buffer.extend_from_slice(value.as_bytes()); +// &buffer[..] 
+// } - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; +// let mut left_buffer = Vec::new(); +// let left_bound = match left { +// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), +// Unbounded => { +// left_buffer.extend_from_slice(&field_id.to_be_bytes()); +// left_buffer.push(0); +// Included(&left_buffer[..]) +// } +// }; - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; +// let mut right_buffer = Vec::new(); +// let right_bound = match right { +// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), +// Unbounded => { +// right_buffer.extend_from_slice(&field_id.to_be_bytes()); +// right_buffer.push(1); // we must only get the level 0 +// Excluded(&right_buffer[..]) +// } +// }; - let iter = db - .remap_key_type::() - .rev_range(rtxn, &(left_bound, right_bound))? - .remap_types::(); +// let iter = db +// .remap_key_type::() +// .rev_range(rtxn, &(left_bound, right_bound))? +// .remap_types::(); - Ok(FacetStringLevelZeroRevRange { iter }) - } -} +// Ok(FacetStringLevelZeroRevRange { iter }) +// } +// } -impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, normalized), (original, docids)))) => { +// Some(Ok((normalized, original, docids))) +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; -type EitherStringRevRange<'t> = - Either, FacetStringLevelZeroRevRange<'t>>; +// type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; +// type EitherStringRevRange<'t> = +// Either, FacetStringLevelZeroRevRange<'t>>; -/// An iterator that is used to explore the facet strings level by level, -/// it will only return facets strings that are associated with the -/// candidates documents ids given. -pub struct FacetStringIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, - must_reduce: bool, -} +// /// An iterator that is used to explore the facet strings level by level, +// /// it will only return facets strings that are associated with the +// /// candidates documents ids given. 
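A minimal sketch of the key encoding behind the commented-out `encode_value` above, with a `BTreeMap` standing in for LMDB since both iterate keys in lexicographic byte order. `level_zero_key` is a hypothetical stand-in, not the crate's API; note the layout is byte-wise: two bytes of field id, then one level byte, then the value.

use std::collections::BTreeMap;
use std::ops::Bound::{Excluded, Included};

// A level-0 string key is `field_id (2 bytes BE) | level byte (0) | utf-8 value`.
fn level_zero_key(field_id: u16, value: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(3 + value.len());
    key.extend_from_slice(&field_id.to_be_bytes());
    key.push(0); // level 0
    key.extend_from_slice(value.as_bytes());
    key
}

fn main() {
    let mut db: BTreeMap<Vec<u8>, &str> = BTreeMap::new();
    for s in ["apple", "banana", "cherry"] {
        db.insert(level_zero_key(5, s), s);
    }
    // An unbounded right bound is encoded as `Excluded(field_id | 1)`, i.e.
    // the smallest possible level-1 key, so the scan never leaves level 0.
    let left = Included(level_zero_key(5, "b"));
    let mut end_of_level_zero = 5u16.to_be_bytes().to_vec();
    end_of_level_zero.push(1);
    let right = Excluded(end_of_level_zero);
    let found: Vec<&str> = db.range((left, right)).map(|(_, s)| *s).collect();
    assert_eq!(found, ["banana", "cherry"]);
}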
+// pub struct FacetStringIter<'t> { +// rtxn: &'t heed::RoTxn<'t>, +// db: Database, +// field_id: FieldId, +// level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, +// must_reduce: bool, +// } -impl<'t> FacetStringIter<'t> { - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: true, - }) - } +// impl<'t> FacetStringIter<'t> { +// pub fn new_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Left(highest_iter))], +// must_reduce: true, +// }) +// } - pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Right(highest_reverse_iter))], - must_reduce: true, - }) - } +// pub fn new_reverse_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Right(highest_reverse_iter))], +// must_reduce: true, +// }) +// } - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: false, - }) - } +// pub fn new_non_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Left(highest_iter))], +// must_reduce: false, +// }) +// } - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - Ok(db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits - .last() - .transpose()? - .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit - } +// fn highest_level( +// rtxn: &'t heed::RoTxn, +// db: Database, +// fid: FieldId, +// ) -> heed::Result> { +// Ok(db +// .remap_types::() +// .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits +// .last() +// .transpose()? 
+// .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit +// } - fn highest_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } +// fn highest_iter( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// db: Database, +// field_id: FieldId, +// ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// match NonZeroU8::new(highest_level) { +// Some(highest_level) => FacetStringGroupRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// highest_level, +// Unbounded, +// Unbounded, +// ) +// .map(Left), +// None => FacetStringLevelZeroRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// Unbounded, +// Unbounded, +// ) +// .map(Right), +// } +// } - fn highest_reverse_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } -} +// fn highest_reverse_iter( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// db: Database, +// field_id: FieldId, +// ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// match NonZeroU8::new(highest_level) { +// Some(highest_level) => FacetStringGroupRevRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// highest_level, +// Unbounded, +// Unbounded, +// ) +// .map(Left), +// None => FacetStringLevelZeroRevRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// Unbounded, +// Unbounded, +// ) +// .map(Right), +// } +// } +// } -impl<'t> Iterator for FacetStringIter<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringIter<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); +// fn next(&mut self) -> Option { +// 'outer: loop { +// let (documents_ids, last) = self.level_iters.last_mut()?; +// let is_ascending = last.is_left(); - // We remap the different iterator types to make - // the algorithm less complex to understand. 
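Isolated from the surrounding state machine, the remap that comment describes looks like the sketch below, assuming the `either` crate; the four unit structs are hypothetical stand-ins for the real iterator types.

use either::Either::{self, Left, Right};

// Hypothetical stand-ins for the four iterator types involved.
struct GroupAsc;
struct LeafAsc;
struct GroupDesc;
struct LeafDesc;

// Before: the outer Either encodes the direction, the inner one the depth.
// After: the outer Either encodes the depth (group vs. level zero), so the
// rest of `next()` can handle both directions with the same two branches.
fn remap(
    last: Either<Either<GroupAsc, LeafAsc>, Either<GroupDesc, LeafDesc>>,
) -> Either<Either<GroupAsc, GroupDesc>, Either<LeafAsc, LeafDesc>> {
    match last {
        Left(ascending) => match ascending {
            Left(group) => Left(Left(group)),
            Right(zero_level) => Right(Left(zero_level)),
        },
        Right(descending) => match descending {
            Left(group) => Left(Right(group)),
            Right(zero_level) => Right(Right(zero_level)),
        },
    }
}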
- let last = match last { - Left(ascending) => match ascending { - Left(group) => Left(Left(group)), - Right(zero_level) => Right(Left(zero_level)), - }, - Right(descending) => match descending { - Left(group) => Left(Right(group)), - Right(zero_level) => Right(Right(zero_level)), - }, - }; +// // We remap the different iterator types to make +// // the algorithm less complex to understand. +// let last = match last { +// Left(ascending) => match ascending { +// Left(group) => Left(Left(group)), +// Right(zero_level) => Right(Left(zero_level)), +// }, +// Right(descending) => match descending { +// Left(group) => Left(Right(group)), +// Right(zero_level) => Right(Right(zero_level)), +// }, +// }; - match last { - Left(group) => { - for result in group { - match result { - Ok(((level, left, right), (string_bounds, mut docids))) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } +// match last { +// Left(group) => { +// for result in group { +// match result { +// Ok(((level, left, right), (string_bounds, mut docids))) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } - let result = if is_ascending { - match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Left) - } else { - match string_bounds { - Some((left, right)) => { - FacetStringLevelZeroRevRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right) - } - None => FacetStringGroupRevRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Right) - }; +// let result = if is_ascending { +// match string_bounds { +// Some((left, right)) => FacetStringLevelZeroRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// Included(left), +// Included(right), +// ) +// .map(Right), +// None => FacetStringGroupRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// NonZeroU8::new(level.get() - 1).unwrap(), +// Included(left), +// Included(right), +// ) +// .map(Left), +// } +// .map(Left) +// } else { +// match string_bounds { +// Some((left, right)) => { +// FacetStringLevelZeroRevRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// Included(left), +// Included(right), +// ) +// .map(Right) +// } +// None => FacetStringGroupRevRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// NonZeroU8::new(level.get() - 1).unwrap(), +// Included(left), +// Included(right), +// ) +// .map(Left), +// } +// .map(Right) +// }; - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - } - Right(zero_level) => { - // level zero only - for result in zero_level { - match result { - Ok((normalized, original, mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } - return Some(Ok((normalized, original, docids))); - } - } - Err(e) => return Some(Err(e)), - } - } - } - } +// match result { +// Ok(iter) => { +// self.level_iters.push((docids, 
iter)); +// continue 'outer; +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Right(zero_level) => { +// // level zero only +// for result in zero_level { +// match result { +// Ok((normalized, original, mut docids)) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } +// return Some(Ok((normalized, original, docids))); +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// } - self.level_iters.pop(); - } - } -} +// self.level_iters.pop(); +// } +// } +// } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7241dab2b..e911dfb15 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,16 +1,20 @@ use std::collections::HashSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; +use std::ops::RangeBounds; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; +use heed::LazyDecode; use log::debug; use roaring::RoaringBitmap; -use super::FacetNumberRange; +// use super::FacetNumberRange; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; +// use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{ distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, }; @@ -144,18 +148,29 @@ impl<'a> Filter<'a> { } } +fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: FieldId, +) { +} + impl<'a> Filter<'a> { /// Aggregates the documents ids that are part of the specified range automatically /// going deeper through the levels. fn explore_facet_number_levels( rtxn: &heed::RoTxn, - db: heed::Database, + db: heed::Database, CboRoaringBitmapCodec>, field_id: FieldId, level: u8, left: Bound, right: Bound, output: &mut RoaringBitmap, ) -> Result<()> { + // level must be > 0, I'll create a separate function for level 0 + // if level == 0 { + // call that function + //} match (left, right) { // If the request is an exact value we must go directly to the deepest level. (Included(l), Included(r)) if l == r && level > 0 => { @@ -170,87 +185,121 @@ impl<'a> Filter<'a> { (Excluded(l), Included(r)) if l >= r => return Ok(()), (_, _) => (), } - - let mut left_found = None; - let mut right_found = None; - - // We must create a custom iterator to be able to iterate over the - // requested range as the range iterator cannot express some conditions. - let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; - - debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - for (i, result) in iter.enumerate() { - let ((_fid, level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - *output |= docids; - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { - left_found = Some(l); - } - right_found = Some(r); - } - - // Can we go deeper? 
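Taken on their own, the bound sanity checks kept at the top of this function reduce to a small planning step; a sketch under a hypothetical name, returning the level to explore or `None` for a provably empty range:

use std::ops::Bound::{self, Excluded, Included};

// An exact value can skip the intermediate levels entirely; inverted or
// empty ranges yield nothing at all.
fn plan_exploration(level: u8, left: Bound<f64>, right: Bound<f64>) -> Option<u8> {
    match (left, right) {
        (Included(l), Included(r)) if l == r && level > 0 => Some(0),
        (Included(l), Excluded(r)) if l >= r => None,
        (Excluded(l), Excluded(r)) if l >= r => None,
        (Excluded(l), Included(r)) if l >= r => None,
        _ => Some(level),
    }
}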
- let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), + let range_start_key = FacetKey { + field_id, + level, + left_bound: match left { + Included(l) => l, + Excluded(l) => l, + Bound::Unbounded => f64::MIN, + }, }; + let mut range_iter = db + .remap_data_type::>() + .range(rtxn, &(range_start_key..))?; - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - match left_found.zip(right_found) { - Some((left_found, right_found)) => { - // If the bound is satisfied we avoid calling this function again. - if !matches!(left, Included(l) if l == left_found) { - let sub_right = Excluded(left_found); - debug!( - "calling left with {:?} to {:?} (level {})", - left, sub_right, deeper_level - ); - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - left, - sub_right, - output, - )?; - } - if !matches!(right, Included(r) if r == right_found) { - let sub_left = Excluded(right_found); - debug!( - "calling right with {:?} to {:?} (level {})", - sub_left, right, deeper_level - ); - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - sub_left, - right, - output, - )?; - } - } - None => { - // If we found nothing at this level it means that we must find - // the same bounds but at a deeper, more precise level. - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - left, - right, - output, - )?; - } + let (mut previous_facet_key, mut previous_value) = range_iter.next().unwrap()?; + while let Some(el) = range_iter.next() { + let (facet_key, value) = el?; + let range = (Included(previous_facet_key.left_bound), Excluded(facet_key.left_bound)); + // if the current range intersects with the query range, then go deeper + // what does it mean for two ranges to intersect? + let gte_left = match left { + Included(l) => previous_facet_key.left_bound >= l, + Excluded(l) => previous_facet_key.left_bound > l, // TODO: not true? + Bound::Unbounded => true, + }; + let lte_right = match right { + Included(r) => facet_key.left_bound <= r, + Excluded(r) => facet_key.left_bound < r, + Bound::Unbounded => true, + }; } + // at this point, previous_facet_key and previous_value are the last groups in the level + // we must also check whether we should visit this group - Ok(()) + todo!(); + + // let mut left_found = None; + // let mut right_found = None; + + // // We must create a custom iterator to be able to iterate over the + // // requested range as the range iterator cannot express some conditions. + // let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; + + // debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + // for (i, result) in iter.enumerate() { + // let ((_fid, level, l, r), docids) = result?; + // debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); + // *output |= docids; + // // We save the leftest and rightest bounds we actually found at this level. + // if i == 0 { + // left_found = Some(l); + // } + // right_found = Some(r); + // } + + // // Can we go deeper? + // let deeper_level = match level.checked_sub(1) { + // Some(level) => level, + // None => return Ok(()), + // }; + + // // We must refine the left and right bounds of this range by retrieving the + // // missing part in a deeper level. + // match left_found.zip(right_found) { + // Some((left_found, right_found)) => { + // // If the bound is satisfied we avoid calling this function again. 
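The new loop above asks "what does it mean for two ranges to intersect?"; a hedged sketch of the usual answer for the half-open group ranges this level structure produces, with a hypothetical function name and plain f64 bounds:

use std::ops::Bound::{self, Excluded, Included, Unbounded};

/// Does a group covering the half-open range [a, b) intersect the query
/// range? `b` is the left bound of the *next* group at the same level; the
/// last group of a level has no `b` and must be checked separately, which
/// is what the note after the loop above is about.
fn intersects(a: f64, b: f64, left: Bound<f64>, right: Bound<f64>) -> bool {
    // The query must start before the group ends...
    let starts_before_group_ends = match left {
        Included(l) | Excluded(l) => b > l,
        Unbounded => true,
    };
    // ...and end at or after the group starts.
    let ends_at_or_after_group_start = match right {
        Included(r) => a <= r,
        Excluded(r) => a < r,
        Unbounded => true,
    };
    starts_before_group_ends && ends_at_or_after_group_start
}

Comparing only the two left bounds, as `gte_left` does above, misses a group that straddles the query's left edge, which is the subtlety the `TODO: not true?` flags.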
+ // if !matches!(left, Included(l) if l == left_found) { + // let sub_right = Excluded(left_found); + // debug!( + // "calling left with {:?} to {:?} (level {})", + // left, sub_right, deeper_level + // ); + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // left, + // sub_right, + // output, + // )?; + // } + // if !matches!(right, Included(r) if r == right_found) { + // let sub_left = Excluded(right_found); + // debug!( + // "calling right with {:?} to {:?} (level {})", + // sub_left, right, deeper_level + // ); + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // sub_left, + // right, + // output, + // )?; + // } + // } + // None => { + // // If we found nothing at this level it means that we must find + // // the same bounds but at a deeper, more precise level. + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // left, + // right, + // output, + // )?; + // } + // } + + // Ok(()) } fn evaluate_operator( @@ -277,23 +326,27 @@ impl<'a> Filter<'a> { return Ok(exist); } Condition::Equal(val) => { - let (_original_value, string_docids) = strings_db - .get(rtxn, &(field_id, &val.value().to_lowercase()))? + let string_docids = strings_db + .get( + rtxn, + &FacetKey { field_id, level: 0, left_bound: &val.value().to_lowercase() }, + )? + .map(|v| v.bitmap) .unwrap_or_default(); let number = val.parse::().ok(); let number_docids = match number { Some(n) => { let n = Included(n); let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels( - rtxn, - numbers_db, - field_id, - 0, - n, - n, - &mut output, - )?; + // Self::explore_facet_number_levels( + // rtxn, + // numbers_db, + // field_id, + // 0, + // n, + // n, + // &mut output, + // )?; output } None => RoaringBitmap::new(), @@ -312,21 +365,32 @@ impl<'a> Filter<'a> { // that's fine if it don't, the value just before will be returned instead. let biggest_level = numbers_db .remap_data_type::() - .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))? - .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); + .get_lower_than_or_equal_to( + rtxn, + &FacetKey { field_id, level: u8::MAX, left_bound: f64::MAX }, + )? + .and_then( + |(FacetKey { field_id: id, level, .. 
}, _)| { + if id == field_id { + Some(level) + } else { + None + } + }, + ); match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels( - rtxn, - numbers_db, - field_id, - level, - left, - right, - &mut output, - )?; + // Self::explore_facet_number_levels( + // rtxn, + // numbers_db, + // field_id, + // level, + // left, + // right, + // &mut output, + // )?; Ok(output) } None => Ok(RoaringBitmap::new()), diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index e3ac95882..13b00d2de 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,6 +1,6 @@ pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; -pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; -pub use self::facet_string::FacetStringIter; +// pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; +// pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; mod facet_distribution; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 1b62a67c7..d05e807df 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, FacetNumberIter, Filter, DEFAULT_VALUES_PER_FACET}; +pub use self::facet::{FacetDistribution, /* FacetNumberIter,*/ Filter, DEFAULT_VALUES_PER_FACET,}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index e9c92a949..4031c9b06 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -6,10 +6,7 @@ use heed::types::ByteSlice; use heed::BytesDecode; use roaring::RoaringBitmap; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FacetStringZeroBoundsValueCodec, -}; +use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; #[track_caller] @@ -232,46 +229,48 @@ pub fn snap_word_prefix_position_docids(index: &Index) -> String { snap } pub fn snap_facet_id_f64_docids(index: &Index) -> String { - let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( - (facet_id, level, left, right), - b, - )| { - &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) - }); - snap + todo!() + // let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( + // (facet_id, level, left, right), + // b, + // )| { + // &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) + // }); + // snap } pub fn snap_facet_id_string_docids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let bytes_db = index.facet_id_string_docids.remap_types::(); - let iter = bytes_db.iter(&rtxn).unwrap(); - let mut snap = String::new(); + todo!() + // let rtxn = index.read_txn().unwrap(); + // let bytes_db = index.facet_id_string_docids.remap_types::(); + // let iter = bytes_db.iter(&rtxn).unwrap(); + // let mut snap = String::new(); - for x in iter { - let (key, value) = x.unwrap(); - if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { - let (orig_string, docids) = - 
FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); - snap.push_str(&format!( - "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", - display_bitmap(&docids) - )); - } else if let Some((field_id, level, left, right)) = - FacetLevelValueU32Codec::bytes_decode(key) - { - snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); - let (bounds, docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(value) - .unwrap(); - if let Some((left, right)) = bounds { - snap.push_str(&format!("{left:<8} {right:<8} ")); - } - snap.push_str(&display_bitmap(&docids)); - snap.push('\n'); - } else { - panic!(); - } - } - snap + // for x in iter { + // let (key, value) = x.unwrap(); + // if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { + // let (orig_string, docids) = + // FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); + // snap.push_str(&format!( + // "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", + // display_bitmap(&docids) + // )); + // } else if let Some((field_id, level, left, right)) = + // FacetLevelValueU32Codec::bytes_decode(key) + // { + // snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); + // let (bounds, docids) = + // FacetStringZeroBoundsValueCodec::::bytes_decode(value) + // .unwrap(); + // if let Some((left, right)) = bounds { + // snap.push_str(&format!("{left:<8} {right:<8} ")); + // } + // snap.push_str(&display_bitmap(&docids)); + // snap.push('\n'); + // } else { + // panic!(); + // } + // } + // snap } pub fn snap_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 54328b50d..bb30f24c9 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -10,9 +10,7 @@ use time::OffsetDateTime; use super::ClearDocuments; use crate::error::{InternalError, SerializationError, UserError}; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, -}; +use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ @@ -442,11 +440,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_docids( - self.wtxn, - facet_id_f64_docids, - &self.to_delete_docids, - )?; + // TODO: remove_docids_from_facet_field_id_docids( + // self.wtxn, + // facet_id_f64_docids, + // &self.to_delete_docids, + // )?; // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_field_id_docids( self.wtxn, @@ -587,57 +585,57 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( db: &heed::Database, to_remove: &RoaringBitmap, ) -> crate::Result<()> { - let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); - let mut iter = db.remap_types::().iter_mut(wtxn)?; - while let Some(result) = iter.next() { - let (key, val) = result?; - match FacetLevelValueU32Codec::bytes_decode(key) { - Some(_) => { - // If we are able to parse this key it means it is a facet string group - // level key. We must then parse the value using the appropriate codec. 
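What the code being disabled here does, reduced to its core: subtract the deleted ids from every facet entry and drop entries that become empty. A sketch with a `BTreeMap` standing in for the LMDB database and the `roaring` crate assumed:

use std::collections::BTreeMap;

use roaring::RoaringBitmap;

// Subtract `to_delete` from every entry; entries left empty are removed so
// the database never keeps keys that point to no document.
fn remove_docids(db: &mut BTreeMap<Vec<u8>, RoaringBitmap>, to_delete: &RoaringBitmap) {
    db.retain(|_key, docids| {
        *docids -= to_delete;
        !docids.is_empty()
    });
}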
- let (group, mut docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(val) - .ok_or_else(|| SerializationError::Decoding { db_name })?; + // let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); + // let mut iter = db.remap_types::().iter_mut(wtxn)?; + // while let Some(result) = iter.next() { + // let (key, val) = result?; + // match FacetLevelValueU32Codec::bytes_decode(key) { + // Some(_) => { + // // If we are able to parse this key it means it is a facet string group + // // level key. We must then parse the value using the appropriate codec. + // let (group, mut docids) = + // FacetStringZeroBoundsValueCodec::::bytes_decode(val) + // .ok_or_else(|| SerializationError::Decoding { db_name })?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - let val = &(group, docids); - let value_bytes = - FacetStringZeroBoundsValueCodec::::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + // let previous_len = docids.len(); + // docids -= to_remove; + // if docids.is_empty() { + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.del_current()? }; + // } else if docids.len() != previous_len { + // let key = key.to_owned(); + // let val = &(group, docids); + // let value_bytes = + // FacetStringZeroBoundsValueCodec::::bytes_encode(val) + // .ok_or_else(|| SerializationError::Encoding { db_name })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &value_bytes)? }; - } - } - None => { - // The key corresponds to a level zero facet string. - let (original_value, mut docids) = - FacetStringLevelZeroValueCodec::bytes_decode(val) - .ok_or_else(|| SerializationError::Decoding { db_name })?; + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.put_current(&key, &value_bytes)? }; + // } + // } + // None => { + // // The key corresponds to a level zero facet string. + // let (original_value, mut docids) = + // FacetStringLevelZeroValueCodec::bytes_decode(val) + // .ok_or_else(|| SerializationError::Decoding { db_name })?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - let val = &(original_value, docids); - let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + // let previous_len = docids.len(); + // docids -= to_remove; + // if docids.is_empty() { + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.del_current()? }; + // } else if docids.len() != previous_len { + // let key = key.to_owned(); + // let val = &(original_value, docids); + // let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) + // .ok_or_else(|| SerializationError::Encoding { db_name })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &value_bytes)? }; - } - } - } - } + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.put_current(&key, &value_bytes)? 
}; + // } + // } + // } + // } Ok(()) } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 108acae4f..0926b63f4 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -136,11 +136,12 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; use crate::error::InternalError; -use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, }; -use crate::heed_codec::CboRoaringBitmapCodec; +// use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; use crate::{FieldId, Index, Result}; @@ -187,16 +188,18 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + let mut nested_wtxn = self.index.env.nested_write_txn(self.wtxn).unwrap(); + for field_id in faceted_fields { // Clear the facet string levels. - clear_field_string_levels( - self.wtxn, - self.index.facet_id_string_docids.remap_types::(), - field_id, - )?; + // clear_field_string_levels( + // &mut nested_wtxn, + // self.index.facet_id_string_docids.remap_types::(), + // field_id, + // )?; let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( - self.wtxn, + &mut nested_wtxn, self.index.facet_id_string_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -206,13 +209,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; self.index.put_string_faceted_documents_ids( - self.wtxn, + &mut nested_wtxn, field_id, &string_documents_ids, )?; for facet_strings_level in facet_string_levels { write_into_lmdb_database( - self.wtxn, + &mut nested_wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_strings_level, |_, _| { @@ -221,11 +224,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; } - // Clear the facet number levels. - clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; + // // Clear the facet number levels. + // clear_field_number_levels(&mut nested_wtxn, self.index.facet_id_f64_docids, field_id)?; let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( - self.wtxn, + &mut nested_wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -235,14 +238,14 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; self.index.put_number_faceted_documents_ids( - self.wtxn, + &mut nested_wtxn, field_id, &number_documents_ids, )?; for facet_number_level in facet_number_levels { write_into_lmdb_database( - self.wtxn, + &mut nested_wtxn, *self.index.facet_id_f64_docids.as_polymorph(), facet_number_level, |_, _| { @@ -263,8 +266,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { /// that must be inserted into the database. /// 2. 
a roaring bitmap of all the document ids present in the database fn compute_facet_number_levels<'t>( - rtxn: &'t heed::RoTxn, - db: heed::Database, + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, level_group_size: NonZeroUsize, @@ -277,7 +280,7 @@ fn compute_facet_number_levels<'t>( .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_start = (field_id, 0, f64::MIN, f64::MIN); + let level_0_start = FacetKey { field_id, level: 0, left_bound: f64::MIN }; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. @@ -289,37 +292,31 @@ fn compute_facet_number_levels<'t>( let mut number_document_ids = RoaringBitmap::new(); if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = - recursive_compute_levels::( - rtxn, - db, - compression_type, - compression_level, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } - Ok(()) - }, - &|_i, (_field_id, _level, left, _right)| *left, - &|bitmap| bitmap, - &|writer, level, left, right, docids| { - write_number_entry(writer, field_id, level.get(), left, right, &docids)?; - Ok(()) - }, - )?; + let subwriters = recursive_compute_levels::( + rtxn, + db, + compression_type, + compression_level, + field_id, + *top_level, + level_0_start, + &(level_0_start..), + first_level_size, + level_group_size, + &mut |bitmaps, _| { + for bitmap in bitmaps { + number_document_ids |= bitmap; + } + Ok(()) + }, + )?; Ok((subwriters, number_document_ids)) } else { let mut documents_ids = RoaringBitmap::new(); for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, docids) = result?; - documents_ids |= docids; + let (_key, group_value) = result?; + documents_ids |= group_value.bitmap; } Ok((vec![], documents_ids)) @@ -333,8 +330,8 @@ fn compute_facet_number_levels<'t>( /// that must be inserted into the database. /// 2. a roaring bitmap of all the document ids present in the database fn compute_facet_strings_levels<'t>( - rtxn: &'t heed::RoTxn, - db: heed::Database, + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, level_group_size: NonZeroUsize, @@ -347,7 +344,7 @@ fn compute_facet_strings_levels<'t>( .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_start = (field_id, ""); + let level_0_start = FacetKey { field_id, level: 0, left_bound: "" }; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. 
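A hypothetical illustration of the sizing rule stated in the comment above: a group at level n spans level_group_size^n level-0 entries, and a level is only built while it would still contain at least min_level_size groups.

// Returns, for each level, the number of level-0 entries each group spans.
fn level_sizes(
    level_0_size: usize,
    level_group_size: usize,
    min_level_size: usize,
) -> Vec<(u8, usize)> {
    let mut sizes = Vec::new();
    let mut level = 1u8;
    let mut group_size = level_group_size;
    while level_0_size / group_size >= min_level_size {
        sizes.push((level, group_size));
        level += 1;
        group_size *= level_group_size;
    }
    sizes
}

fn main() {
    // 1000 level-0 entries, groups of 4, at least 5 groups per level:
    // level 3, whose groups each span 64 entries, is the last one built.
    assert_eq!(level_sizes(1000, 4, 5), vec![(1, 4), (2, 16), (3, 64)]);
}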
@@ -359,40 +356,31 @@ fn compute_facet_strings_levels<'t>( let mut strings_document_ids = RoaringBitmap::new(); if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - (u32, &str), - >( + let subwriters = recursive_compute_levels::( rtxn, db, compression_type, compression_level, + field_id, *top_level, level_0_start, &(level_0_start..), first_level_size, level_group_size, - &mut |bitmaps, _, _| { + &mut |bitmaps, _| { for bitmap in bitmaps { strings_document_ids |= bitmap; } Ok(()) }, - &|i, (_field_id, value)| (i as u32, *value), - &|value| value.1, - &|writer, level, start_bound, end_bound, docids| { - write_string_entry(writer, field_id, level, start_bound, end_bound, docids)?; - Ok(()) - }, )?; Ok((subwriters, strings_document_ids)) } else { let mut documents_ids = RoaringBitmap::new(); for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, (_original_value, docids)) = result?; - documents_ids |= docids; + let (_key, group_value) = result?; + documents_ids |= group_value.bitmap; } Ok((vec![], documents_ids)) @@ -436,29 +424,26 @@ from the level below were read/created. Its arguments are: A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` that must be inserted into the database. */ -fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( - rtxn: &'t heed::RoTxn, - db: heed::Database, +fn recursive_compute_levels<'t, BoundCodec>( + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, + field_id: FieldId, level: u8, - level_0_start: >::DItem, - level_0_range: &'t RangeFrom<>::DItem>, + level_0_start: FacetKey<>::EItem>, + level_0_range: &'t RangeFrom>::EItem>>, level_0_size: usize, level_group_size: NonZeroUsize, - computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], Bound, Bound) -> Result<()>, - bound_from_db_key: &dyn for<'a> Fn(usize, &'a >::DItem) -> Bound, - bitmap_from_db_value: &dyn Fn(>::DItem) -> RoaringBitmap, - write_entry: &dyn Fn(&mut Writer, NonZeroU8, Bound, Bound, RoaringBitmap) -> Result<()>, + computed_group_bitmap: &mut dyn FnMut( + &[RoaringBitmap], + >::EItem, + ) -> Result<()>, ) -> Result>> where - KeyCodec: for<'a> BytesEncode<'a> - + for<'a> BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Sized, - ValueCodec: for<'a> BytesEncode<'a> - + for<'a> BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Sized, - Bound: Copy, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + for<'a> >::EItem: Copy + Sized, { if level == 0 { // base case for the recursion @@ -468,31 +453,32 @@ where // 2. 
fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read let mut bitmaps = vec![]; - let mut start_bound = bound_from_db_key(0, &level_0_start); - let mut end_bound = bound_from_db_key(0, &level_0_start); + let mut start_bound = level_0_start.left_bound; + // let mut end_bound = level_0_start.bound; + let mut first_iteration_for_new_group = true; for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { let (key, value) = db_result_item?; - let bound = bound_from_db_key(i, &key); - let docids = bitmap_from_db_value(value); + let bound = key.left_bound; + let docids = value.bitmap; if first_iteration_for_new_group { start_bound = bound; first_iteration_for_new_group = false; } - end_bound = bound; + // end_bound = bound; bitmaps.push(docids); if bitmaps.len() == level_group_size.get() { - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; + computed_group_bitmap(&bitmaps, start_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); } } // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; + computed_group_bitmap(&bitmaps, start_bound)?; bitmaps.clear(); } // level 0 is already stored in the DB @@ -516,48 +502,52 @@ where db, compression_type, compression_level, + field_id, level - 1, level_0_start, level_0_range, level_0_size, level_group_size, - &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { + &mut |sub_bitmaps: &[RoaringBitmap], + start_range: >::EItem| { let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; } - range_for_bitmaps.push((start_range, end_range)); + range_for_bitmaps.push(start_range); bitmaps.push(combined_bitmap); if bitmaps.len() == level_group_size.get() { - let start_bound = range_for_bitmaps.first().unwrap().0; - let end_bound = range_for_bitmaps.last().unwrap().1; - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; - for (bitmap, (start_bound, end_bound)) in - bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) + let start_bound = range_for_bitmaps.first().unwrap(); + computed_group_bitmap(&bitmaps, *start_bound)?; + for (bitmap, start_bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry( + write_entry::( &mut cur_writer, + field_id, NonZeroU8::new(level).unwrap(), start_bound, - end_bound, bitmap, )?; } } Ok(()) }, - bound_from_db_key, - bitmap_from_db_value, - write_entry, )?; + // don't forget to insert the leftover elements into the writer as well if !bitmaps.is_empty() { - let start_range = range_for_bitmaps.first().unwrap().0; - let end_range = range_for_bitmaps.last().unwrap().1; - computed_group_bitmap(&bitmaps, start_range, end_range)?; - for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry(&mut cur_writer, NonZeroU8::new(level).unwrap(), left, right, bitmap)?; + let start_range = range_for_bitmaps.first().unwrap(); + let end_range = range_for_bitmaps.last().unwrap(); + computed_group_bitmap(&bitmaps, *start_range)?; + for (bitmap, bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { + write_entry( + &mut cur_writer, + field_id, + NonZeroU8::new(level).unwrap(), + bound, + bitmap, + )?; } } @@ -566,60 +556,25 @@ where } } -fn clear_field_number_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, 1, f64::MIN, f64::MIN); - let right = (field_id, u8::MAX, f64::MAX, 
f64::MAX); - let range = left..=right; - db.delete_range(wtxn, &range).map(drop) -} - -fn clear_field_string_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); - let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); - let range = left..=right; - db.remap_key_type::().delete_range(wtxn, &range).map(drop) -} - -fn write_number_entry( - writer: &mut Writer, - field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) -} -fn write_string_entry( +fn write_entry( writer: &mut Writer, field_id: FieldId, level: NonZeroU8, - (left_id, left_value): (u32, &str), - (right_id, right_value): (u32, &str), + bound: >::EItem, docids: RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left_id, right_id); - let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = match level.get() { - 1 => (Some((left_value, right_value)), docids), - _ => (None, docids), - }; - let data = FacetStringZeroBoundsValueCodec::::bytes_encode(&data) - .ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) +) -> Result<()> +where + for<'a> BoundCodec: BytesEncode<'a>, + for<'a> >::EItem: Copy + Sized, +{ + todo!() + // let key = FacetKey { field_id, level: level.get(), left_bound: bound }; + // let key_bytes = FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + // let value_bytes = + // FacetGroupValueCodec::bytes_encode(&FacetGroupValue { size: 4, bitmap: docids }) + // .ok_or(Error::Encoding)?; + // writer.insert(&key_bytes, &value_bytes)?; + // Ok(()) } #[cfg(test)] diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 61157fa35..c5424a346 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,7 +6,7 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. @@ -31,13 +31,14 @@ pub fn extract_facet_number_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ - let (field_id, document_id, number) = - FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + todo!() + // let (field_id, document_id, number) = + // FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - let key = (field_id, 0, number, number); - let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + // let key = (field_id, 0, number, number); + // // let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); - facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + // facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_number_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index f7aa3730c..4e655329e 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,11 +4,9 @@ use std::{io, str}; use roaring::RoaringBitmap; -use super::helpers::{ - create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader, - try_split_array_at, GrenadParameters, -}; -use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; +use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; +// use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -24,7 +22,7 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - keep_first_prefix_value_merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, // TODO: check indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -42,14 +40,16 @@ pub fn extract_facet_string_docids( let original_value = str::from_utf8(original_value_bytes)?; key_buffer.clear(); - FacetStringLevelZeroCodec::serialize_into( - field_id, - str::from_utf8(normalized_value_bytes)?, - &mut key_buffer, - ); + // TODO + // FacetStringLevelZeroCodec::serialize_into( + // field_id, + // str::from_utf8(normalized_value_bytes)?, + // &mut key_buffer, + // ); value_buffer.clear(); - encode_prefix_string(original_value, &mut value_buffer)?; + // TODO + // encode_prefix_string(original_value, &mut value_buffer)?; let bitmap = RoaringBitmap::from_iter(Some(document_id)); bitmap.serialize_into(&mut value_buffer)?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 50cc04610..1e414458f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -25,8 +25,8 @@ use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, - merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, + as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, + GrenadParameters, MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ 
-142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - keep_first_prefix_value_merge_roaring_bitmaps, + merge_roaring_bitmaps, // TODO: check (cbo?) TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index dbe3c0344..cef27ab30 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -5,7 +5,7 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use super::read_u32_ne_bytes; -use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; +// use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -49,32 +49,32 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul } } -pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( - _key: &[u8], - values: &[Cow<'a, [u8]>], -) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let original = decode_prefix_string(&values[0]).unwrap().0; - let merged_bitmaps = values - .iter() - .map(AsRef::as_ref) - .map(decode_prefix_string) - .map(Option::unwrap) - .map(|(_, bitmap_bytes)| bitmap_bytes) - .map(RoaringBitmap::deserialize_from) - .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); +// pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( +// _key: &[u8], +// values: &[Cow<'a, [u8]>], +// ) -> Result> { +// if values.len() == 1 { +// Ok(values[0].clone()) +// } else { +// let original = decode_prefix_string(&values[0]).unwrap().0; +// let merged_bitmaps = values +// .iter() +// .map(AsRef::as_ref) +// .map(decode_prefix_string) +// .map(Option::unwrap) +// .map(|(_, bitmap_bytes)| bitmap_bytes) +// .map(RoaringBitmap::deserialize_from) +// .map(StdResult::unwrap) +// .reduce(|a, b| a | b) +// .unwrap(); - let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); - let mut buffer = Vec::with_capacity(cap); - encode_prefix_string(original, &mut buffer)?; - merged_bitmaps.serialize_into(&mut buffer)?; - Ok(Cow::Owned(buffer)) - } -} +// let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); +// let mut buffer = Vec::with_capacity(cap); +// encode_prefix_string(original, &mut buffer)?; +// merged_bitmaps.serialize_into(&mut buffer)?; +// Ok(Cow::Owned(buffer)) +// } +// } pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { Ok(values[0].clone()) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 6466a636b..7e2ebd2d3 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -13,9 +13,9 @@ pub use grenad_helpers::{ writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ - concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, - merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs, - roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, + concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, + merge_roaring_bitmaps, merge_two_obkvs, roaring_bitmap_from_u32s_array, + serialize_roaring_bitmap, MergeFn, }; /// The maximum length a word 
can be diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 8464c98b6..7a9787bdb 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -13,7 +13,6 @@ use super::helpers::{ valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; -use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, @@ -197,13 +196,14 @@ pub(crate) fn write_typed_chunk_into_index( index_is_empty, |value, _buffer| Ok(value), |new_values, db_values, buffer| { - let (_, new_values) = decode_prefix_string(new_values).unwrap(); - let new_values = RoaringBitmap::deserialize_from(new_values)?; - let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); - let db_values = RoaringBitmap::deserialize_from(db_values)?; - let values = new_values | db_values; - encode_prefix_string(db_original, buffer)?; - Ok(values.serialize_into(buffer)?) + todo!() + // let (_, new_values) = decode_prefix_string(new_values).unwrap(); + // let new_values = RoaringBitmap::deserialize_from(new_values)?; + // let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); + // let db_values = RoaringBitmap::deserialize_from(db_values)?; + // let values = new_values | db_values; + // encode_prefix_string(db_original, buffer)?; + // Ok(values.serialize_into(buffer)?) }, )?; is_merged_database = true; From 7913d6365ca3dbb759c1e95dd47ac107aa0d7648 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 30 Aug 2022 14:03:18 +0200 Subject: [PATCH 1693/1889] Update Facets indexing to be compatible with new database structure --- milli/src/update/facets.rs | 631 ++++++------------------ milli/src/update/index_documents/mod.rs | 24 +- 2 files changed, 178 insertions(+), 477 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 0926b63f4..aaaa445da 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,168 +1,43 @@ -/*! -This module initialises the databases that are used to quickly get the list -of documents with a faceted field value falling within a certain range. For -example, they can be used to implement filters such as `x >= 3`. - -These databases are `facet_id_string_docids` and `facet_id_f64_docids`. - -## Example with numbers - -In the case of numbers, we start with a sorted list whose keys are -`(field_id, number_value)` and whose value is a roaring bitmap of the document ids -which contain the value `number_value` for the faceted field `field_id`. - -From this list, we want to compute two things: - -1. the bitmap of all documents that contain **any** number for each faceted field -2. a structure that allows us to use a (sort of) binary search to find all documents -containing numbers inside a certain range for a faceted field - -To achieve goal (2), we recursively split the list into chunks. Every time we split it, we -create a new "level" that is several times smaller than the level below it. The base level, -level 0, is the starting list. Level 1 is composed of chunks of up to N elements. Each element -contains a range and a bitmap of docids. Level 2 is composed of chunks up to N^2 elements, etc. - -For example, let's say we have 26 documents which we identify through the letters a-z. 
-We will focus on a single faceted field. When there are multiple faceted fields, the structure -described below is simply repeated for each field. - -What we want to obtain is the following structure for each faceted field: -```text -┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ -│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ -└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ - ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ -┌───────┐ │ 1.2 – 2 │ 3.4 – 100 │ 102 – 104 │ -│Level 2│ │ │ │ │ -└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ - ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ -┌───────┐ │ 1.2 – 1.3 │ 1.6 – 2 │ 3.4 – 12 │ 12.3 – 100 │ 102 – 104 │ -│Level 1│ │ │ │ │ │ │ -└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ - ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ -┌───────┐ │ 1.2 │ 1.3 │ 1.6 │ 2 │ 3.4 │ 12 │ 12.3 │ 100 │ 102 │ 104 │ -│Level 0│ │ │ │ │ │ │ │ │ │ │ │ -└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ - └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ -``` - -You can read more about this structure (for strings) in `[crate::search::facet::facet_strings]`. - -To create the levels, we use a recursive algorithm which makes sure that we only need to iterate -over the elements of level 0 once. It is implemented by [`recursive_compute_levels`]. - -## Encoding - -### Numbers -For numbers we use the same encoding for level 0 and the other levels. - -The key is given by `FacetLevelValueF64Codec`. It consists of: -1. The field id : u16 -2. The height of the level : u8 -3. The start bound : f64 -4. The end bound : f64 -Note that at level 0, we have start bound == end bound. - -The value is a serialised `RoaringBitmap`. - -### Strings - -For strings, we use a different encoding for level 0 and the other levels. - -At level 0, the key is given by `FacetStringLevelZeroCodec`. It consists of: -1. The field id : u16 -2. The height of the level : u8 <-- always == 0 -3. The normalised string value : &str - -And the value is given by `FacetStringLevelZeroValueCodec`. It consists of: -1. The original string -2. A serialised `RoaringBitmap` - -At level 1, the key is given by `FacetLevelValueU32Codec`. It consists of: -1. The field id : u16 -2. The height of the level : u8 <-- always >= 1 -3. The start bound : u32 -4. The end bound : u32 -where the bounds are indices inside level 0. - -The value is given by `FacetStringZeroBoundsValueCodec`. -If the level is 1, then it consists of: -1. The normalised string of the start bound -2. The normalised string of the end bound -3. A serialised `RoaringBitmap` - -If the level is higher, then it consists only of the serialised roaring bitmap. - -The distinction between the value encoding of level 1 and the levels above it -is to allow us to retrieve the value in level 0 quickly by reading the key of -level 1 (we obtain the string value of the bound and execute a prefix search -in the database). 
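To make the two encodings above concrete, here is a minimal schematic in plain Rust of the key/value shapes just described. This is illustrative only: the crate's real types are the byte-level codecs named above (`FacetStringLevelZeroCodec`, `FacetStringLevelZeroValueCodec`, `FacetLevelValueU32Codec`, `FacetStringZeroBoundsValueCodec`), and the struct and field names here are assumptions, not milli API.

```rust
use roaring::RoaringBitmap;

// Level 0: one entry per distinct normalised string value.
struct StringLevel0Key<'a> {
    field_id: u16,
    level: u8,           // always 0 at this level
    normalized: &'a str, // the normalised facet string
}
struct StringLevel0Value<'a> {
    original: &'a str,     // original string, kept so it can be displayed
    docids: RoaringBitmap, // documents containing exactly this value
}

// Level >= 1: one entry per group of entries of the level below.
struct StringLevelNKey {
    field_id: u16,
    level: u8,  // always >= 1 at this level
    start: u32, // index of the first level-0 entry covered by the group
    end: u32,   // index of the last level-0 entry covered by the group
}
// Only level 1 carries the bound strings, so that the level-0 entries of
// a group can be located again with a prefix search in the database.
struct StringLevel1Value<'a> {
    start_normalized: &'a str,
    end_normalized: &'a str,
    docids: RoaringBitmap,
}
```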
- -Therefore, for strings, the structure for a single faceted field looks more like this: -```text -┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ -│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ -└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ - - ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ -┌───────┐ │ 0 – 3 │ 4 – 7 │ 8 – 9 │ -│Level 2│ │ │ │ │ -└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ - ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ -┌───────┐ │ 0 – 1 │ 2 – 3 │ 4 – 5 │ 6 – 7 │ 8 – 9 │ -│Level 1│ │ "ab" – "ac" │ "ba" – "bac" │ "gaf" – "gal" │"form" – "wow" │ "woz" – "zz" │ -└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ - ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ -┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ -│Level 0│ │ "AB" │ " Ac" │ "ba " │ "Bac" │ " GAF"│ "gal" │ "Form"│ " wow"│ "woz" │ "ZZ" │ -└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ - └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ - -The first line in a cell is its key (without the field id and level height) and the last two -lines are its values. -``` -*/ - use std::cmp; use std::fs::File; -use std::num::{NonZeroU8, NonZeroUsize}; -use std::ops::RangeFrom; +use std::num::NonZeroUsize; -use grenad::{CompressionType, Reader, Writer}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesDecode, BytesEncode, Error}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn}; use log::debug; use roaring::RoaringBitmap; use time::OffsetDateTime; use crate::error::InternalError; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::str_ref::StrRefCodec; use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; // use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; use crate::{FieldId, Index, Result}; -pub struct Facets<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, +pub struct Facets<'i> { index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, + level_group_size: usize, + min_level_size: usize, } -impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { +impl<'i> Facets<'i> { + pub fn new( + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + ) -> Facets<'i> { Facets { - wtxn, index, + database, chunk_compression_type: CompressionType::None, chunk_compression_level: None, - level_group_size: NonZeroUsize::new(4).unwrap(), - min_level_size: NonZeroUsize::new(5).unwrap(), + level_group_size: 4, + min_level_size: 5, } } @@ -170,413 +45,233 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { /// /// This setting is always greater than or equal to 2. 
pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); + self.level_group_size = cmp::max(value.get(), 2); self } /// The minimum number of elements that a level is allowed to have. pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.min_level_size = value; + self.min_level_size = value.get(); self } + fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { + let left = FacetKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let range = left..=right; + self.database.delete_range(wtxn, &range).map(drop)?; + Ok(()) + } + #[logging_timer::time("Facets::{}")] - pub fn execute(self) -> Result<()> { - self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // We get the faceted fields to be able to create the facet levels. - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let mut nested_wtxn = self.index.env.nested_write_txn(self.wtxn).unwrap(); + for &field_id in faceted_fields.iter() { + self.clear_levels(wtxn, field_id)?; + } - for field_id in faceted_fields { - // Clear the facet string levels. - // clear_field_string_levels( - // &mut nested_wtxn, - // self.index.facet_id_string_docids.remap_types::(), - // field_id, - // )?; + let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?; - let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( - &mut nested_wtxn, - self.index.facet_id_string_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - field_id, - )?; + for &field_id in faceted_fields.iter() { + let (level_readers, all_docids) = + self.compute_levels_for_field_id(field_id, &nested_wtxn)?; - self.index.put_string_faceted_documents_ids( - &mut nested_wtxn, - field_id, - &string_documents_ids, - )?; - for facet_strings_level in facet_string_levels { + // TODO: this will need to be an argument to Facets as well + self.index.put_string_faceted_documents_ids(&mut nested_wtxn, field_id, &all_docids)?; + + for level_reader in level_readers { + // TODO: append instead of write with merge write_into_lmdb_database( &mut nested_wtxn, - *self.index.facet_id_string_docids.as_polymorph(), - facet_strings_level, + *self.database.as_polymorph(), + level_reader, |_, _| { Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? }, )?; } - - // // Clear the facet number levels. 
- // clear_field_number_levels(&mut nested_wtxn, self.index.facet_id_f64_docids, field_id)?; - - let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( - &mut nested_wtxn, - self.index.facet_id_f64_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - field_id, - )?; - - self.index.put_number_faceted_documents_ids( - &mut nested_wtxn, - field_id, - &number_documents_ids, - )?; - - for facet_number_level in facet_number_levels { - write_into_lmdb_database( - &mut nested_wtxn, - *self.index.facet_id_f64_docids.as_polymorph(), - facet_number_level, - |_, _| { - Err(InternalError::IndexingMergingKeys { process: "facet number levels" })? - }, - )?; - } } Ok(()) } -} -/// Compute the content of the database levels from its level 0 for the given field id. -/// -/// ## Returns: -/// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -/// that must be inserted into the database. -/// 2. a roaring bitmap of all the document ids present in the database -fn compute_facet_number_levels<'t>( - rtxn: &'t mut heed::RwTxn, - db: heed::Database, FacetGroupValueCodec>, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, - field_id: FieldId, -) -> Result<(Vec>, RoaringBitmap)> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? - .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - let level_0_start = FacetKey { field_id, level: 0, left_bound: f64::MIN }; - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) - .collect::>(); - - let mut number_document_ids = RoaringBitmap::new(); - - if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels::( - rtxn, - db, - compression_type, - compression_level, + fn compute_levels_for_field_id( + &self, + field_id: FieldId, + txn: &RoTxn, + ) -> Result<(Vec>, RoaringBitmap)> { + let algo = CreateFacetsAlgo { + rtxn: txn, + db: &self.database, field_id, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } - Ok(()) - }, - )?; + level_group_size: self.level_group_size, + min_level_size: self.min_level_size, + chunk_compression_type: self.chunk_compression_type, + chunk_compression_level: self.chunk_compression_level, + }; + // TODO: first check whether there is anything in level 0 - Ok((subwriters, number_document_ids)) - } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, group_value) = result?; - documents_ids |= group_value.bitmap; - } + let mut all_docids = RoaringBitmap::new(); + let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| { + for bitmap in bitmaps { + all_docids |= bitmap; + } + Ok(()) + })?; + drop(algo); - Ok((vec![], documents_ids)) + Ok((subwriters, all_docids)) } } -/// Compute the content of the database levels from its level 0 for the given field id. -/// -/// ## Returns: -/// 1. 
a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -/// that must be inserted into the database. -/// 2. a roaring bitmap of all the document ids present in the database -fn compute_facet_strings_levels<'t>( - rtxn: &'t mut heed::RwTxn, - db: heed::Database, FacetGroupValueCodec>, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, - field_id: FieldId, -) -> Result<(Vec>, RoaringBitmap)> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? - .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - let level_0_start = FacetKey { field_id, level: 0, left_bound: "" }; - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) - .collect::>(); - - let mut strings_document_ids = RoaringBitmap::new(); - - if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels::( - rtxn, - db, - compression_type, - compression_level, - field_id, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _| { - for bitmap in bitmaps { - strings_document_ids |= bitmap; - } - Ok(()) - }, - )?; - - Ok((subwriters, strings_document_ids)) - } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, group_value) = result?; - documents_ids |= group_value.bitmap; - } - - Ok((vec![], documents_ids)) - } +pub struct CreateFacetsAlgo<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, + field_id: u16, + level_group_size: usize, + min_level_size: usize, } - -/** -Compute a level from the levels below it, with the elements of level 0 already existing in the given `db`. - -This function is generic to work with both numbers and strings. The generic type parameters are: -* `KeyCodec`/`ValueCodec`: the codecs used to read the elements of the database. -* `Bound`: part of the range in the levels structure. For example, for numbers, the `Bound` is `f64` -because each chunk in a level contains a range such as (1.2 ..= 4.5). - -## Arguments -* `rtxn` : LMDB read transaction -* `db`: a database which already contains a `level 0` -* `compression_type`/`compression_level`: parameters used to create the `grenad::Writer` that -will contain the new levels -* `level` : the height of the level to create, or `0` to read elements from level 0. -* `level_0_start` : a key in the database that points to the beginning of its level 0 -* `level_0_range` : equivalent to `level_0_start..` -* `level_0_size` : the number of elements in level 0 -* `level_group_size` : the number of elements from the level below that are represented by a -single element of the new level -* `computed_group_bitmap` : a callback that is called whenever at most `level_group_size` elements -from the level below were read/created. Its arguments are: - 0. the list of bitmaps from each read/created element of the level below - 1. the start bound corresponding to the first element - 2. 
the end bound corresponding to the last element -* `bound_from_db_key` : finds the `Bound` from a key in the database -* `bitmap_from_db_value` : finds the `RoaringBitmap` from a value in the database -* `write_entry` : writes an element of a level into the writer. The arguments are: - 0. the writer - 1. the height of the level - 2. the start bound - 3. the end bound - 4. the docids of all elements between the start and end bound - -## Return -A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -that must be inserted into the database. -*/ -fn recursive_compute_levels<'t, BoundCodec>( - rtxn: &'t mut heed::RwTxn, - db: heed::Database, FacetGroupValueCodec>, - compression_type: CompressionType, - compression_level: Option, - field_id: FieldId, - level: u8, - level_0_start: FacetKey<>::EItem>, - level_0_range: &'t RangeFrom>::EItem>>, - level_0_size: usize, - level_group_size: NonZeroUsize, - computed_group_bitmap: &mut dyn FnMut( - &[RoaringBitmap], - >::EItem, - ) -> Result<()>, -) -> Result>> -where - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Copy + Sized, -{ - if level == 0 { - // base case for the recursion - +impl<'t> CreateFacetsAlgo<'t> { + fn read_level_0( + &self, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + ) -> Result<()> { // we read the elements one by one and - // 1. keep track of the start and end bounds + // 1. keep track of the left bound // 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read let mut bitmaps = vec![]; - let mut start_bound = level_0_start.left_bound; - // let mut end_bound = level_0_start.bound; + let mut level_0_prefix = vec![]; + level_0_prefix.extend_from_slice(&self.field_id.to_be_bytes()); + level_0_prefix.push(0); + let level_0_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, level_0_prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); + + let mut left_bound: &[u8] = &[]; let mut first_iteration_for_new_group = true; - for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { - let (key, value) = db_result_item?; - + for el in level_0_iter { + let (key, value) = el?; let bound = key.left_bound; let docids = value.bitmap; if first_iteration_for_new_group { - start_bound = bound; + left_bound = bound; first_iteration_for_new_group = false; } - // end_bound = bound; bitmaps.push(docids); - if bitmaps.len() == level_group_size.get() { - computed_group_bitmap(&bitmaps, start_bound)?; + if bitmaps.len() == self.level_group_size { + handle_group(&bitmaps, left_bound); first_iteration_for_new_group = true; bitmaps.clear(); } } // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { - computed_group_bitmap(&bitmaps, start_bound)?; + handle_group(&bitmaps, left_bound); bitmaps.clear(); } - // level 0 is already stored in the DB - return Ok(vec![]); - } else { + Ok(()) + } + + /// Compute the content of the database levels from its level 0 for the given field id. + /// + /// ## Returns: + /// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` + /// that must be inserted into the database. + /// 2. 
a roaring bitmap of all the document ids present in the database + fn compute_higher_levels( + &self, + level: u8, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + ) -> Result>> { + if level == 0 { + self.read_level_0(handle_group); + // Level 0 is already in the database + return Ok(vec![]); + } // level >= 1 // we compute each element of this level based on the elements of the level below it - // once we have computed `level_group_size` elements, we give the start and end bounds + // once we have computed `level_group_size` elements, we give the left bound // of those elements, and their bitmaps, to the level above - let mut cur_writer = - create_writer(compression_type, compression_level, tempfile::tempfile()?); + let mut cur_writer = create_writer( + self.chunk_compression_type, + self.chunk_compression_level, + tempfile::tempfile()?, + ); + let mut cur_writer_len = 0; - let mut range_for_bitmaps = vec![]; + let mut group_sizes = vec![]; + let mut left_bounds = vec![]; let mut bitmaps = vec![]; // compute the levels below // in the callback, we fill `cur_writer` with the correct elements for this level - let mut sub_writers = recursive_compute_levels( - rtxn, - db, - compression_type, - compression_level, - field_id, - level - 1, - level_0_start, - level_0_range, - level_0_size, - level_group_size, - &mut |sub_bitmaps: &[RoaringBitmap], - start_range: >::EItem| { + let mut sub_writers = + self.compute_higher_levels(level - 1, &mut |sub_bitmaps, left_bound| { let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; } - range_for_bitmaps.push(start_range); + group_sizes.push(sub_bitmaps.len() as u8); + left_bounds.push(left_bound); bitmaps.push(combined_bitmap); - if bitmaps.len() == level_group_size.get() { - let start_bound = range_for_bitmaps.first().unwrap(); - computed_group_bitmap(&bitmaps, *start_bound)?; - for (bitmap, start_bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) - { - write_entry::( - &mut cur_writer, - field_id, - NonZeroU8::new(level).unwrap(), - start_bound, - bitmap, - )?; - } + if bitmaps.len() != self.level_group_size { + return Ok(()); + } + let left_bound = left_bounds.first().unwrap(); + handle_group(&bitmaps, left_bound)?; + + for ((bitmap, left_bound), group_size) in + bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) + { + let key = FacetKey { field_id: self.field_id, level, left_bound }; + let key = + FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + let value = FacetGroupValue { size: group_size, bitmap }; + let value = + FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + cur_writer.insert(key, value)?; + cur_writer_len += 1; } Ok(()) - }, - )?; - + })?; // don't forget to insert the leftover elements into the writer as well - if !bitmaps.is_empty() { - let start_range = range_for_bitmaps.first().unwrap(); - let end_range = range_for_bitmaps.last().unwrap(); - computed_group_bitmap(&bitmaps, *start_range)?; - for (bitmap, bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry( - &mut cur_writer, - field_id, - NonZeroU8::new(level).unwrap(), - bound, - bitmap, - )?; + if !bitmaps.is_empty() && cur_writer_len >= self.level_group_size * self.min_level_size { + let left_bound = left_bounds.first().unwrap(); + handle_group(&bitmaps, left_bound)?; + for ((bitmap, left_bound), group_size) in + bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) + { + let key = FacetKey { field_id: 
self.field_id, level, left_bound }; + let key = + FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + let value = FacetGroupValue { size: group_size, bitmap }; + let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + cur_writer.insert(key, value)?; + cur_writer_len += 1; } } - - sub_writers.push(writer_into_reader(cur_writer)?); + if cur_writer_len > self.level_group_size * self.min_level_size { + sub_writers.push(writer_into_reader(cur_writer)?); + } return Ok(sub_writers); } } -fn write_entry( - writer: &mut Writer, - field_id: FieldId, - level: NonZeroU8, - bound: >::EItem, - docids: RoaringBitmap, -) -> Result<()> -where - for<'a> BoundCodec: BytesEncode<'a>, - for<'a> >::EItem: Copy + Sized, -{ - todo!() - // let key = FacetKey { field_id, level: level.get(), left_bound: bound }; - // let key_bytes = FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; - // let value_bytes = - // FacetGroupValueCodec::bytes_encode(&FacetGroupValue { size: 4, bitmap: docids }) - // .ok_or(Error::Encoding)?; - // writer.insert(&key_bytes, &value_bytes)?; - // Ok(()) -} - #[cfg(test)] mod tests { use std::num::NonZeroUsize; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f13ac13a8..5a9066eba 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -34,6 +34,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; +use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, @@ -431,16 +432,21 @@ where let mut databases_seen = MERGED_DATABASE_COUNT; // Run the facets update operation. 
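The hunk below replaces the single `Facets` builder with one run per facet database. Restated as a free function, the new call pattern looks roughly like this (a sketch only, assuming milli's `Index`, `IndexerConfig`, `Facets`, `FacetKeyCodec` and `MyByteSlice` are in scope; `run_facet_levels` is a hypothetical name and error handling uses the crate's `Result` alias):

```rust
// One level-building pass per facet database: the database is now given to
// `Facets::new`, and the write transaction only at execution time.
fn run_facet_levels(
    index: &Index,
    wtxn: &mut heed::RwTxn,
    config: &IndexerConfig,
) -> Result<()> {
    for db in [
        index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
        index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
    ] {
        let mut builder = Facets::new(index, db);
        builder.chunk_compression_type = config.chunk_compression_type;
        builder.chunk_compression_level = config.chunk_compression_level;
        builder.execute(wtxn)?;
    }
    Ok(())
}
```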
- let mut builder = Facets::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - if let Some(value) = self.config.facet_level_group_size { - builder.level_group_size(value); + for facet_db in [ + (&self.index.facet_id_string_docids).remap_key_type::>(), + (&self.index.facet_id_f64_docids).remap_key_type::>(), + ] { + let mut builder = Facets::new(self.index, facet_db); + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + if let Some(value) = self.config.facet_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.config.facet_min_level_size { + builder.min_level_size(value); + } + builder.execute(self.wtxn)?; } - if let Some(value) = self.config.facet_min_level_size { - builder.min_level_size(value); - } - builder.execute()?; databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { From 63ef0aba181387a76283edaab126e2210987d284 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 30 Aug 2022 14:17:40 +0200 Subject: [PATCH 1694/1889] Start porting facet distribution and sort to new database structure --- .../search/facet/facet_distribution_iter.rs | 199 ++++++ milli/src/search/facet/facet_number.rs | 335 --------- .../src/search/facet/facet_sort_ascending.rs | 147 ++++ .../src/search/facet/facet_sort_descending.rs | 172 +++++ milli/src/search/facet/facet_string.rs | 649 ------------------ milli/src/search/facet/mod.rs | 74 +- milli/src/update/facets.rs | 8 +- 7 files changed, 594 insertions(+), 990 deletions(-) create mode 100644 milli/src/search/facet/facet_distribution_iter.rs delete mode 100644 milli/src/search/facet/facet_number.rs create mode 100644 milli/src/search/facet/facet_sort_ascending.rs create mode 100644 milli/src/search/facet/facet_sort_descending.rs delete mode 100644 milli/src/search/facet/facet_string.rs diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs new file mode 100644 index 000000000..2dfe3580f --- /dev/null +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -0,0 +1,199 @@ +use roaring::RoaringBitmap; +use std::ops::ControlFlow; + +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; + +use super::{get_first_facet_value, get_highest_level}; + +pub fn iterate_over_facet_distribution<'t, CB>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: &RoaringBitmap, + callback: CB, +) where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + let mut fd = FacetDistribution { rtxn, db, field_id, callback }; + let highest_level = + get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { + fd.iterate(candidates, highest_level, first_bound, usize::MAX); + return; + } else { + return; + } +} + +struct FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + callback: CB, +} + +impl<'t, CB> FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + fn iterate_level_0( + &mut self, + candidates: &RoaringBitmap, + starting_bound: &'t [u8], + group_size: usize, + ) 
-> ControlFlow<()> { + let starting_key = + FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; + let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size); + for el in iter { + let (key, value) = el.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return ControlFlow::Break(()); + } + let docids_in_common = value.bitmap.intersection_len(candidates); + if docids_in_common > 0 { + match (self.callback)(key.left_bound, docids_in_common) { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return ControlFlow::Break(()), + } + } + } + return ControlFlow::Continue(()); + } + fn iterate( + &mut self, + candidates: &RoaringBitmap, + level: u8, + starting_bound: &'t [u8], + group_size: usize, + ) -> ControlFlow<()> { + if level == 0 { + return self.iterate_level_0(candidates, starting_bound, group_size); + } + let starting_key = FacetKey { field_id: self.field_id, level, left_bound: starting_bound }; + let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size); + + for el in iter { + let (key, value) = el.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return ControlFlow::Break(()); + } + let docids_in_common = value.bitmap & candidates; + if docids_in_common.len() > 0 { + let cf = + self.iterate(&docids_in_common, level - 1, key.left_bound, value.size as usize); + match cf { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return ControlFlow::Break(()), + } + } + } + + return ControlFlow::Continue(()); + } +} + +#[cfg(test)] +mod tests { + use crate::{codec::U16Codec, Index}; + use heed::BytesDecode; + use roaring::RoaringBitmap; + use std::ops::ControlFlow; + + use super::iterate_over_facet_distribution; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_distribution_all() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + iterate_over_facet_distribution( + &txn, + &index.db.content, + 0, + &candidates, + |facet, count| { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + ControlFlow::Continue(()) + }, + ); + 
insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_distribution_all_stop_early() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + let mut nbr_facets = 0; + iterate_over_facet_distribution( + &txn, + &index.db.content, + 0, + &candidates, + |facet, count| { + let facet = U16Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return ControlFlow::Break(()); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); + + ControlFlow::Continue(()) + } + }, + ); + insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs deleted file mode 100644 index 5f7bd5325..000000000 --- a/milli/src/search/facet/facet_number.rs +++ /dev/null @@ -1,335 +0,0 @@ -// use std::ops::Bound::{self, Excluded, Included, Unbounded}; - -// use either::Either::{self, Left, Right}; -// use heed::types::{ByteSlice, DecodeIgnore}; -// use heed::{BytesDecode, BytesEncode, Database, Lazy, LazyDecode, RoRange, RoRevRange}; -// use obkv::Key; -// use roaring::RoaringBitmap; - -// use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -// use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; -// use crate::heed_codec::CboRoaringBitmapCodec; -// use crate::{FieldId, Index}; - -// pub struct FacetNumberRange<'t, 'e> { -// rtxn: &'t heed::RoTxn<'e>, -// db: Database, FacetGroupValueCodec>, -// iter: RoRange<'t, FacetKeyCodec, LazyDecode>, -// max_bound: f64, -// previous: Option<(FacetKey, Lazy<'t, FacetGroupValueCodec>)>, -// field_id: FieldId, -// end: Bound, -// } - -// impl<'t, 'e> FacetNumberRange<'t, 'e> { -// pub fn new( -// rtxn: &'t heed::RoTxn<'e>, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level: u8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let left_bound = match left { -// Included(left_bound) => Included(FacetKey { field_id, level, left_bound }), -// Excluded(left_bound) => Excluded(FacetKey { field_id, level, left_bound }), -// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), -// }; - -// let mut iter = db.lazily_decode_data().range(rtxn, &(left_bound, Unbounded))?; -// let mut previous = iter.next().transpose()?; - -// // Compute the maximum end bound by looking at the key of the last element in level 0 -// let mut prefix_level_0 = vec![]; -// prefix_level_0.extend_from_slice(&field_id.to_be_bytes()); -// prefix_level_0.push(level); - -// let mut rev_iter = -// db.as_polymorph().rev_prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, &prefix_level_0)?; - -// let rev_iter_first = rev_iter.next().transpose()?; -// let max_bound = if let Some((max_bound_key, _)) = rev_iter_first { -// let max_bound_key = -// FacetKeyCodec::::bytes_decode(max_bound_key).unwrap(); -// max_bound_key.left_bound -// } else { -// // I can't imagine when that would happen, but let's handle it correctly anyway -// // by making the iterator empty -// previous = None; -// 0.0 // doesn't matter since previous = None so the iterator will always early exit -// // and return None itself -// }; - -// Ok(FacetNumberRange { rtxn, db, iter, field_id, previous, max_bound, 
end: right }) -// } -// } - -// impl<'t, 'e> Iterator for FacetNumberRange<'t, 'e> { -// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// // The idea here is to return the **previous** element only if the left -// // bound of the current key fits within the range given to the iter -// // if it doesn't, then there is still a chance that it must be returned, -// // but we need to check the actual right bound of the group by looking for -// // the key preceding the first key of the next group in level 0 - -// let (prev_key, prev_value) = self.previous?; - -// let (next_left_bound, next_previous) = if let Some(next) = self.iter.next() { -// let (key, group_value) = match next { -// Ok(n) => n, -// Err(e) => return Some(Err(e)), -// }; -// (key.left_bound, Some((key, group_value))) -// } else { -// // we're at the end of the level iter, so we need to fetch the max bound instead -// (self.max_bound, None) -// }; -// let must_be_returned = match self.end { -// Included(end) => next_left_bound <= end, -// Excluded(end) => next_left_bound < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match prev_value.decode() { -// Ok(group_value) => { -// self.previous = next_previous; -// Some(Ok((prev_key, group_value.bitmap))) -// } -// Err(e) => Some(Err(e)), -// } -// } else { -// // it still possible that we want to return the value (one last time) -// // but to do so, we need to fetch the right bound of the current group -// // this is done by getting the first element at level 0 of the next group -// // then iterating in reverse from it -// // once we have the right bound, we can compare it, and then return or not -// // then we still set self.previous to None so that no other element can return -// // from it? 
-// let mut level_0_key_prefix = vec![]; -// level_0_key_prefix.extend_from_slice(&self.field_id.to_be_bytes()); -// level_0_key_prefix.push(0); -// let key = -// FacetKey:: { field_id: self.field_id, level: 0, left_bound: next_left_bound }; -// let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); -// level_0_key_prefix.extend_from_slice(&key_bytes); - -// let mut rev_iter_next_group_level_0 = self -// .db -// .as_polymorph() -// .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&self.rtxn, &level_0_key_prefix) -// .unwrap(); -// let (key_for_right_bound, _) = rev_iter_next_group_level_0.next().unwrap().unwrap(); -// let key_for_right_bound = -// FacetKeyCodec::::bytes_decode(key_for_right_bound).unwrap(); -// let right_bound = key_for_right_bound.left_bound; -// let must_be_returned = match self.end { -// Included(end) => right_bound <= end, -// Excluded(end) => right_bound < end, -// Unbounded => unreachable!(), -// }; -// self.previous = None; -// if must_be_returned { -// match prev_value.decode() { -// Ok(group_value) => Some(Ok((prev_key, group_value.bitmap))), -// Err(e) => Some(Err(e)), -// } -// } else { -// None -// } -// } -// } -// } - -// pub struct FacetNumberRevRange<'t> { -// iter: RoRevRange<'t, FacetKeyCodec, LazyDecode>, -// end: Bound, -// } - -// impl<'t> FacetNumberRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level: u8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let left_bound = match left { -// Included(left) => Included(FacetKey { field_id, level, left_bound: left }), -// Excluded(left) => Excluded(FacetKey { field_id, level, left_bound: left }), -// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), -// }; -// let right_bound = Included(FacetKey { field_id, level, left_bound: f64::MAX }); -// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetNumberRevRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetNumberRevRange<'t> { -// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// loop { -// match self.iter.next() { -// Some(Ok((FacetKey { field_id, level, left_bound }, docids))) => { -// let must_be_returned = match self.end { -// Included(end) => todo!(), //right <= end, -// Excluded(end) => todo!(), //right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok(docids) => { -// return Some(Ok(( -// FacetKey { field_id, level, left_bound }, -// docids.bitmap, -// ))) -// } -// Err(e) => return Some(Err(e)), -// } -// } -// continue; -// } -// Some(Err(e)) => return Some(Err(e)), -// None => return None, -// } -// } -// } -// } - -// pub struct FacetNumberIter<'t, 'e> { -// rtxn: &'t heed::RoTxn<'t>, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, -// must_reduce: bool, -// } - -// impl<'t, 'e> FacetNumberIter<'t, 'e> { -// /// Create a `FacetNumberIter` that will iterate on the different facet entries -// /// (facet value + documents ids) and that will reduce the given documents ids -// /// while iterating on the different facet levels. 
-// pub fn new_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Left(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) -// } - -// /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse -// /// (facet value + documents ids) and that will reduce the given documents ids -// /// while iterating on the different facet levels. -// pub fn new_reverse_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Right(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) -// } - -// /// Create a `FacetNumberIter` that will iterate on the different facet entries -// /// (facet value + documents ids) and that will not reduce the given documents ids -// /// while iterating on the different facet levels, possibly returning multiple times -// /// a document id associated with multiple facet values. -// pub fn new_non_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Left(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) -// } - -// fn highest_level( -// rtxn: &'t heed::RoTxn, -// db: Database, X>, -// fid: FieldId, -// ) -> heed::Result> { -// let level = db -// .remap_types::() -// .prefix_iter(rtxn, &fid.to_be_bytes())? -// .remap_key_type::>() -// .last() -// .transpose()? -// .map(|(key, _)| key.level); -// Ok(level) -// } -// } - -// impl<'t, 'e> Iterator for FacetNumberIter<'t, 'e> { -// type Item = heed::Result<(f64, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// 'outer: loop { -// let (documents_ids, last) = self.level_iters.last_mut()?; -// let is_ascending = last.is_left(); -// for result in last { -// // If the last iterator must find an empty set of documents it means -// // that we found all the documents in the sub level iterations already, -// // we can pop this level iterator. 
-// if documents_ids.is_empty() { -// break; -// } - -// match result { -// Ok((key, mut docids)) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } - -// if level == 0 { -// return Some(Ok((left, docids))); -// } - -// let rtxn = self.rtxn; -// let db = self.db; -// let fid = self.field_id; -// let left = Included(left); -// let right = Included(right); - -// let result = if is_ascending { -// FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) -// .map(Left) -// } else { -// FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) -// .map(Right) -// }; - -// match result { -// Ok(iter) => { -// self.level_iters.push((docids, iter)); -// continue 'outer; -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// self.level_iters.pop(); -// } -// } -// } diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs new file mode 100644 index 000000000..c9abd9556 --- /dev/null +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -0,0 +1,147 @@ +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; + +use super::{get_first_facet_value, get_highest_level}; + +pub fn ascending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Box + 't> { + let highest_level = + get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + if let Some(first_bound) = get_first_facet_value::( + rtxn, + &db.remap_key_type::>(), + field_id, + ) { + let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; + let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); + + Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] }) + } else { + return Box::new(std::iter::empty()); + } +} + +struct AscendingFacetSort<'t, 'e> { + rtxn: &'t heed::RoTxn<'e>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + stack: Vec<( + RoaringBitmap, + std::iter::Take, FacetGroupValueCodec>>, + )>, +} + +impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { + type Item = (&'t [u8], RoaringBitmap); + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter) = self.stack.last_mut()?; + for result in deepest_iter { + let ( + FacetKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. 
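+                // Note: `documents_ids` holds the candidates not yet yielded at this
+                // stack level; the `*documents_ids -= &bitmap` below shrinks it each
+                // time a group is yielded or descended into, so an empty set means
+                // this level is exhausted.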
+ if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some((left_bound, bitmap)); + } + let starting_key_below = + FacetKey { field_id: self.field_id, level: level - 1, left_bound }; + let iter = self + .db + .range(&self.rtxn, &(starting_key_below..)) + .unwrap() + .take(group_size as usize); + + self.stack.push((bitmap, iter)); + continue 'outer; + } + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ + ascending_facet_sort::ascending_facet_sort, codec::U16Codec, display_bitmap, Index, + }; + use heed::BytesDecode; + use roaring::RoaringBitmap; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_sort() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates); + for (facet, docids) in iter { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs new file mode 100644 index 000000000..d3c9d54f8 --- /dev/null +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -0,0 +1,172 @@ +use std::ops::Bound; + +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; + +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; + +fn descending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Box + 't> { + let highest_level = get_highest_level(rtxn, db, field_id); + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { + let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; + let last_bound = get_last_facet_value::(rtxn, db, field_id).unwrap(); + let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound }; + let iter = db.rev_range(rtxn, &(first_key..=last_key)).unwrap().take(usize::MAX); + Box::new(DescendingFacetSort { + rtxn, + db, + field_id, + stack: vec![(candidates, iter, Bound::Included(last_bound))], + }) + } else { + return Box::new(std::iter::empty()); + 
} +} + +struct DescendingFacetSort<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + stack: Vec<( + RoaringBitmap, + std::iter::Take, FacetGroupValueCodec>>, + Bound<&'t [u8]>, + )>, +} + +impl<'t> Iterator for DescendingFacetSort<'t> { + type Item = (&'t [u8], RoaringBitmap); + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?; + while let Some(result) = deepest_iter.next() { + let ( + FacetKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some((left_bound, bitmap)); + } + let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; + + let end_key_kelow = match *right_bound { + Bound::Included(right) => Bound::Included(FacetKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Excluded(right) => Bound::Excluded(FacetKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Unbounded => Bound::Unbounded, + }; + let prev_right_bound = *right_bound; + *right_bound = Bound::Excluded(left_bound); + let iter = self + .db + .rev_range( + &self.rtxn, + &(Bound::Included(starting_key_below), end_key_kelow), + ) + .unwrap() + .take(group_size as usize); + + self.stack.push((bitmap, iter, prev_right_bound)); + continue 'outer; + } + *right_bound = Bound::Excluded(left_bound); + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ + codec::{MyByteSlice, U16Codec}, + descending_facet_sort::descending_facet_sort, + display_bitmap, FacetKeyCodec, Index, + }; + use heed::BytesDecode; + use roaring::RoaringBitmap; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_sort_descending() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let db = index.db.content.remap_key_type::>(); + let iter = descending_facet_sort(&txn, &db, 0, 
candidates); + for (facet, docids) in iter { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs deleted file mode 100644 index b01359503..000000000 --- a/milli/src/search/facet/facet_string.rs +++ /dev/null @@ -1,649 +0,0 @@ -// //! This module contains helpers iterators for facet strings. -// //! -// //! The purpose is to help iterate over the quite complex system of facets strings. A simple -// //! description of the system would be that every facet string value is stored into an LMDB database -// //! and that every value is associated with the document ids which are associated with this facet -// //! string value. -// //! -// //! In reality it is a little bit more complex as we have to create aggregations of runs of facet -// //! string values, those aggregations helps in choosing the right groups of facets to follow. -// //! -// //! ## A typical algorithm run -// //! -// //! If a group of aggregated facets values contains one of the documents ids, we must continue -// //! iterating over the sub-groups. -// //! -// //! If this group is the lowest level and contain at least one document id we yield the associated -// //! facet documents ids. -// //! -// //! If the group doesn't contain one of our documents ids, we continue to the next group at this -// //! same level. -// //! -// //! ## The complexity comes from the strings -// //! -// //! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create -// //! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the -// //! two numbers bounds, the left and the right bound of the group, both inclusive. -// //! -// //! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and -// //! puting two numbers big-endian encoded one after the other gives us ordered groups. The values -// //! are simple unions of the documents ids coming from the groups below. -// //! -// //! ### Example of what a facet number LMDB database contain -// //! -// //! | level | left-bound | right-bound | documents ids | -// //! |-------|------------|-------------|------------------| -// //! | 0 | 0 | _skipped_ | 1, 2 | -// //! | 0 | 1 | _skipped_ | 6, 7 | -// //! | 0 | 3 | _skipped_ | 4, 7 | -// //! | 0 | 5 | _skipped_ | 2, 3, 4 | -// //! | 1 | 0 | 1 | 1, 2, 6, 7 | -// //! | 1 | 3 | 5 | 2, 3, 4, 7 | -// //! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | -// //! -// //! As you can see the level 0 have two equal bounds, therefore we skip serializing the second -// //! bound, that's the base level where you can directly fetch the documents ids associated with an -// //! exact number. -// //! -// //! The next levels have two different bounds and the associated documents ids are simply the result -// //! of an union of all the documents ids associated with the aggregated groups above. -// //! -// //! ## The complexity of defining groups for facet strings -// //! -// //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in -// //! lexicographical order, it means that whatever the key represent the bytes are read in their raw -// //! form and a simple `strcmp` will define the order in which keys will be read from the store. -// //! -// //! 
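A tiny standalone check of that ordering property (illustrative code, not part of the crate): encoding the numeric key parts big-endian is what keeps LMDB's byte-wise `strcmp` order consistent with numeric order.

```rust
// A key shaped like the facet keys above: field id, level, then a numeric
// bound, all big-endian so byte-wise comparison matches numeric order.
fn key(field_id: u16, level: u8, bound: u32) -> Vec<u8> {
    let mut k = Vec::with_capacity(7);
    k.extend_from_slice(&field_id.to_be_bytes());
    k.push(level);
    k.extend_from_slice(&bound.to_be_bytes());
    k
}

fn main() {
    // Lexicographic byte order agrees with numeric order of the bound...
    assert!(key(0, 1, 1) < key(0, 1, 256));
    // ...whereas little-endian would put 256 ([0, 1, 0, 0]) before 1
    // ([1, 0, 0, 0]) and scramble the groups.
    assert!(key(0, 1, 255) < key(0, 2, 0)); // and levels group before bounds
}
```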
-// //! That's easy for types with a known size, like floats or integers: they are 64 bits long, and
-// //! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the
-// //! first number, then by the second if the first number is equal on two keys.
-// //!
-// //! For strings it is a lot more complex, as those types are unsized: the size of a facet
-// //! string is different for each facet value.
-// //!
-// //! ### Basic approach: padding the keys
-// //!
-// //! A first approach would be to simply define the maximum size of a facet string and pad the keys
-// //! with zeroes. The big problem of this approach is that it:
-// //! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the
-// //!    other.
-// //! 2. makes the keys of facet strings very big (approximately 250 bytes), significantly impacting
-// //!    LMDB performance.
-// //!
-// //! ### Better approach: number the facet groups
-// //!
-// //! A better approach would be to number the groups; this way we don't have the downsides of the
-// //! previously described approach, but we need to be able to describe the groups by using a number.
-// //!
-// //! #### Example of facet strings with numbered groups
-// //!
-// //! | level | left-bound | right-bound | left-string | right-string | documents ids    |
-// //! |-------|------------|-------------|-------------|--------------|------------------|
-// //! | 0     | alpha      | _skipped_   | _skipped_   | _skipped_    | 1, 2             |
-// //! | 0     | beta       | _skipped_   | _skipped_   | _skipped_    | 6, 7             |
-// //! | 0     | gamma      | _skipped_   | _skipped_   | _skipped_    | 4, 7             |
-// //! | 0     | omega      | _skipped_   | _skipped_   | _skipped_    | 2, 3, 4          |
-// //! | 1     | 0          | 1           | alpha       | beta         | 1, 2, 6, 7       |
-// //! | 1     | 2          | 3           | gamma       | omega        | 2, 3, 4, 7       |
-// //! | 2     | 0          | 3           | _skipped_   | _skipped_    | 1, 2, 3, 4, 6, 7 |
-// //!
-// //! As you can see, level 0 doesn't actually change much: we skip nearly everything and do not
-// //! need to store the facet string value twice.
-// //!
-// //! The numbers in the left-bound and right-bound columns are incremental numbers representing the
-// //! level 0 strings, i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering
-// //! of the LMDB keys.
-// //!
-// //! In the value, not in the key, you can see that we added two new values: the left-string and the
-// //! right-string, which define the original facet strings associated with the given group.
-// //!
-// //! We put those two strings inside of the value; this way we do not limit the maximum size of the
-// //! facet string values, and the impact on performance is not important as, IIRC, LMDB puts big
-// //! values on another page, which helps in iterating over keys fast enough and only fetching the page
-// //! with the values when required.
-// //!
-// //! The other little advantage of this solution is that there is not a big overhead compared with
-// //! the facet number levels: we only duplicate the facet strings once, for level 1.
-// //!
-// //! #### A typical algorithm run
-// //!
-// //! Note that the algorithm always moves from the highest level to the lowest one, one level
-// //! at a time; this is why it is ok to only store the facet strings on level 1.
-// //!
-// //! If a group of aggregated facet values (a group with numbers) contains one of the documents ids,
-// //! we must continue iterating over the sub-groups. To do so:
-// //! 
- If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds -// //! and iterate over the facet groups defined by these numbers over the current level - 1. -// //! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the -// //! value and just do the same as with the facet numbers but with strings: iterate over the -// //! current level - 1 with both keys. -// //! -// //! If this group is the lowest level (level 0) and contain at least one document id we yield the -// //! associated facet documents ids. -// //! -// //! If the group doesn't contain one of our documents ids, we continue to the next group at this -// //! same level. -// //! - -// use std::num::NonZeroU8; -// use std::ops::Bound; -// use std::ops::Bound::{Excluded, Included, Unbounded}; - -// use either::{Either, Left, Right}; -// use heed::types::{ByteSlice, DecodeIgnore}; -// use heed::{Database, LazyDecode, RoRange, RoRevRange}; -// use roaring::RoaringBitmap; - -// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; -// use crate::heed_codec::CboRoaringBitmapCodec; -// use crate::{FieldId, Index}; - -// /// An iterator that is used to explore the facets level strings -// /// from the level 1 to infinity. -// /// -// /// It yields the level, group id that an entry covers, the optional group strings -// /// that it covers of the level 0 only if it is an entry from the level 1 and -// /// the roaring bitmap associated. -// pub struct FacetStringGroupRange<'t> { -// iter: RoRange< -// 't, -// FacetLevelValueU32Codec, -// LazyDecode>, -// >, -// end: Bound, -// } - -// impl<'t> FacetStringGroupRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// level: NonZeroU8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let db = db.remap_types::< -// FacetLevelValueU32Codec, -// FacetStringZeroBoundsValueCodec, -// >(); -// let left_bound = match left { -// Included(left) => Included((field_id, level, left, u32::MIN)), -// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), -// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), -// }; -// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); -// let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetStringGroupRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetStringGroupRange<'t> { -// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, level, left, right), docids))) => { -// let must_be_returned = match self.end { -// Included(end) => right <= end, -// Excluded(end) => right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), -// Err(e) => Some(Err(e)), -// } -// } else { -// None -// } -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// pub struct FacetStringGroupRevRange<'t> { -// iter: RoRevRange< -// 't, -// FacetLevelValueU32Codec, -// LazyDecode>, -// >, -// end: Bound, -// } - -// impl<'t> FacetStringGroupRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// level: NonZeroU8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let db = db.remap_types::< -// FacetLevelValueU32Codec, -// FacetStringZeroBoundsValueCodec, 
-// >(); -// let left_bound = match left { -// Included(left) => Included((field_id, level, left, u32::MIN)), -// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), -// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), -// }; -// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); -// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetStringGroupRevRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetStringGroupRevRange<'t> { -// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - -// fn next(&mut self) -> Option { -// loop { -// match self.iter.next() { -// Some(Ok(((_fid, level, left, right), docids))) => { -// let must_be_returned = match self.end { -// Included(end) => right <= end, -// Excluded(end) => right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok((bounds, docids)) => { -// return Some(Ok(((level, left, right), (bounds, docids)))) -// } -// Err(e) => return Some(Err(e)), -// } -// } -// continue; -// } -// Some(Err(e)) => return Some(Err(e)), -// None => return None, -// } -// } -// } -// } - -// /// An iterator that is used to explore the level 0 of the facets string database. -// /// -// /// It yields the facet string and the roaring bitmap associated with it. -// pub struct FacetStringLevelZeroRange<'t> { -// iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -// } - -// impl<'t> FacetStringLevelZeroRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// left: Bound<&str>, -// right: Bound<&str>, -// ) -> heed::Result> { -// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { -// buffer.extend_from_slice(&field_id.to_be_bytes()); -// buffer.push(0); -// buffer.extend_from_slice(value.as_bytes()); -// &buffer[..] -// } - -// let mut left_buffer = Vec::new(); -// let left_bound = match left { -// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), -// Unbounded => { -// left_buffer.extend_from_slice(&field_id.to_be_bytes()); -// left_buffer.push(0); -// Included(&left_buffer[..]) -// } -// }; - -// let mut right_buffer = Vec::new(); -// let right_bound = match right { -// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), -// Unbounded => { -// right_buffer.extend_from_slice(&field_id.to_be_bytes()); -// right_buffer.push(1); // we must only get the level 0 -// Excluded(&right_buffer[..]) -// } -// }; - -// let iter = db -// .remap_key_type::() -// .range(rtxn, &(left_bound, right_bound))? 
-// .remap_types::(); - -// Ok(FacetStringLevelZeroRange { iter }) -// } -// } - -// impl<'t> Iterator for FacetStringLevelZeroRange<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, normalized), (original, docids)))) => { -// Some(Ok((normalized, original, docids))) -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// pub struct FacetStringLevelZeroRevRange<'t> { -// iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -// } - -// impl<'t> FacetStringLevelZeroRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// left: Bound<&str>, -// right: Bound<&str>, -// ) -> heed::Result> { -// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { -// buffer.extend_from_slice(&field_id.to_be_bytes()); -// buffer.push(0); -// buffer.extend_from_slice(value.as_bytes()); -// &buffer[..] -// } - -// let mut left_buffer = Vec::new(); -// let left_bound = match left { -// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), -// Unbounded => { -// left_buffer.extend_from_slice(&field_id.to_be_bytes()); -// left_buffer.push(0); -// Included(&left_buffer[..]) -// } -// }; - -// let mut right_buffer = Vec::new(); -// let right_bound = match right { -// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), -// Unbounded => { -// right_buffer.extend_from_slice(&field_id.to_be_bytes()); -// right_buffer.push(1); // we must only get the level 0 -// Excluded(&right_buffer[..]) -// } -// }; - -// let iter = db -// .remap_key_type::() -// .rev_range(rtxn, &(left_bound, right_bound))? -// .remap_types::(); - -// Ok(FacetStringLevelZeroRevRange { iter }) -// } -// } - -// impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, normalized), (original, docids)))) => { -// Some(Ok((normalized, original, docids))) -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; -// type EitherStringRevRange<'t> = -// Either, FacetStringLevelZeroRevRange<'t>>; - -// /// An iterator that is used to explore the facet strings level by level, -// /// it will only return facets strings that are associated with the -// /// candidates documents ids given. 
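The `must_reduce` flag on the iterator below is the interesting design point: in reducing mode, every bitmap that is yielded gets subtracted from the parent candidate set, so sibling groups can never return the same document twice. A hypothetical miniature of that single step (the function name is ours, not milli's):

```rust
use roaring::RoaringBitmap;

// One "reducing" step: intersect a group's bitmap with the remaining
// candidates and remove the hits from the candidates, so that later
// sibling groups cannot yield the same documents again.
fn reduce_step(candidates: &mut RoaringBitmap, group: &RoaringBitmap) -> Option<RoaringBitmap> {
    let hits = group & &*candidates;
    if hits.is_empty() {
        None
    } else {
        *candidates -= &hits;
        Some(hits)
    }
}

fn main() {
    let mut candidates: RoaringBitmap = (0u32..10).collect();
    let group_a: RoaringBitmap = (0u32..5).collect();
    let group_b: RoaringBitmap = (3u32..8).collect(); // overlaps group_a on 3 and 4
    let a = reduce_step(&mut candidates, &group_a).unwrap();
    let b = reduce_step(&mut candidates, &group_b).unwrap();
    assert!(a.is_disjoint(&b)); // 3 and 4 are only yielded once, by group_a
}
```

The non-reducing constructor keeps the candidate set intact instead, which matters when the same document legitimately appears under several facet values.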
-// pub struct FacetStringIter<'t> { -// rtxn: &'t heed::RoTxn<'t>, -// db: Database, -// field_id: FieldId, -// level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, -// must_reduce: bool, -// } - -// impl<'t> FacetStringIter<'t> { -// pub fn new_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Left(highest_iter))], -// must_reduce: true, -// }) -// } - -// pub fn new_reverse_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Right(highest_reverse_iter))], -// must_reduce: true, -// }) -// } - -// pub fn new_non_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Left(highest_iter))], -// must_reduce: false, -// }) -// } - -// fn highest_level( -// rtxn: &'t heed::RoTxn, -// db: Database, -// fid: FieldId, -// ) -> heed::Result> { -// Ok(db -// .remap_types::() -// .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits -// .last() -// .transpose()? 
-// .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit -// } - -// fn highest_iter( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// db: Database, -// field_id: FieldId, -// ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// match NonZeroU8::new(highest_level) { -// Some(highest_level) => FacetStringGroupRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// highest_level, -// Unbounded, -// Unbounded, -// ) -// .map(Left), -// None => FacetStringLevelZeroRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// Unbounded, -// Unbounded, -// ) -// .map(Right), -// } -// } - -// fn highest_reverse_iter( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// db: Database, -// field_id: FieldId, -// ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// match NonZeroU8::new(highest_level) { -// Some(highest_level) => FacetStringGroupRevRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// highest_level, -// Unbounded, -// Unbounded, -// ) -// .map(Left), -// None => FacetStringLevelZeroRevRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// Unbounded, -// Unbounded, -// ) -// .map(Right), -// } -// } -// } - -// impl<'t> Iterator for FacetStringIter<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// 'outer: loop { -// let (documents_ids, last) = self.level_iters.last_mut()?; -// let is_ascending = last.is_left(); - -// // We remap the different iterator types to make -// // the algorithm less complex to understand. -// let last = match last { -// Left(ascending) => match ascending { -// Left(group) => Left(Left(group)), -// Right(zero_level) => Right(Left(zero_level)), -// }, -// Right(descending) => match descending { -// Left(group) => Left(Right(group)), -// Right(zero_level) => Right(Right(zero_level)), -// }, -// }; - -// match last { -// Left(group) => { -// for result in group { -// match result { -// Ok(((level, left, right), (string_bounds, mut docids))) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } - -// let result = if is_ascending { -// match string_bounds { -// Some((left, right)) => FacetStringLevelZeroRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// Included(left), -// Included(right), -// ) -// .map(Right), -// None => FacetStringGroupRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// NonZeroU8::new(level.get() - 1).unwrap(), -// Included(left), -// Included(right), -// ) -// .map(Left), -// } -// .map(Left) -// } else { -// match string_bounds { -// Some((left, right)) => { -// FacetStringLevelZeroRevRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// Included(left), -// Included(right), -// ) -// .map(Right) -// } -// None => FacetStringGroupRevRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// NonZeroU8::new(level.get() - 1).unwrap(), -// Included(left), -// Included(right), -// ) -// .map(Left), -// } -// .map(Right) -// }; - -// match result { -// Ok(iter) => { -// self.level_iters.push((docids, iter)); -// continue 'outer; -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Right(zero_level) => { -// // level zero only -// for result in zero_level { -// match 
result { -// Ok((normalized, original, mut docids)) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } -// return Some(Ok((normalized, original, docids))); -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// } - -// self.level_iters.pop(); -// } -// } -// } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 13b00d2de..ceedff1e0 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,9 +1,79 @@ +use heed::types::ByteSlice; +use heed::{BytesDecode, RoTxn}; + +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; + pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; // pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; // pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; mod facet_distribution; -mod facet_number; -mod facet_string; +mod facet_distribution_iter; +mod facet_sort_ascending; +mod facet_sort_descending; mod filter; + +fn get_first_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> Option +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_forward = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) + .unwrap(); + if let Some(first) = level0_iter_forward.next() { + let (first_key, _) = first.unwrap(); + let first_key = FacetKeyCodec::::bytes_decode(first_key).unwrap(); + Some(first_key.left_bound) + } else { + None + } +} +fn get_last_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> Option +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_backward = db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) + .unwrap(); + if let Some(last) = level0_iter_backward.next() { + let (last_key, _) = last.unwrap(); + let last_key = FacetKeyCodec::::bytes_decode(last_key).unwrap(); + Some(last_key.left_bound) + } else { + None + } +} +fn get_highest_level<'t>( + txn: &'t RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> u8 { + let field_id_prefix = &field_id.to_be_bytes(); + db.as_polymorph() + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix) + .unwrap() + .next() + .map(|el| { + let (key, _) = el.unwrap(); + let key = FacetKeyCodec::::bytes_decode(key).unwrap(); + key.level + }) + .unwrap_or(0) +} diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index aaaa445da..fe8c2855e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -64,7 +64,7 @@ impl<'i> Facets<'i> { } #[logging_timer::time("Facets::{}")] - pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // We get the faceted fields to be able to create the facet levels. 
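         // Levels above 0 are built by batching `level_group_size` bitmaps
         // from the level below into a single `handle_group` call; leftover
         // bitmaps are flushed at the end so no documents are dropped.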
let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); @@ -172,14 +172,14 @@ impl<'t> CreateFacetsAlgo<'t> { bitmaps.push(docids); if bitmaps.len() == self.level_group_size { - handle_group(&bitmaps, left_bound); + handle_group(&bitmaps, left_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); } } // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { - handle_group(&bitmaps, left_bound); + handle_group(&bitmaps, left_bound)?; bitmaps.clear(); } Ok(()) @@ -197,7 +197,7 @@ impl<'t> CreateFacetsAlgo<'t> { handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result>> { if level == 0 { - self.read_level_0(handle_group); + self.read_level_0(handle_group)?; // Level 0 is already in the database return Ok(vec![]); } From b8a1caad5e8d9a55ba7c7807805a4ee2fbb6b980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 30 Aug 2022 15:22:39 +0200 Subject: [PATCH 1695/1889] Add range search and incremental indexing algorithm --- milli/Cargo.toml | 2 +- .../search/facet/facet_distribution_iter.rs | 70 +-- milli/src/search/facet/facet_range_search.rs | 451 +++++++++++++++++ .../src/search/facet/facet_sort_ascending.rs | 56 ++- .../src/search/facet/facet_sort_descending.rs | 73 +-- milli/src/search/facet/filter.rs | 1 - milli/src/search/facet/incremental_update.rs | 459 ++++++++++++++++++ milli/src/search/facet/mod.rs | 148 +++++- 8 files changed, 1145 insertions(+), 115 deletions(-) create mode 100644 milli/src/search/facet/facet_range_search.rs create mode 100644 milli/src/search/facet/incremental_update.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 835425714..658ef0d24 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -54,7 +54,7 @@ big_s = "1.0.2" insta = "1.21.0" maplit = "1.0.2" md5 = "0.7.0" -rand = "0.8.5" +rand = {version = "0.8.5", features = ["small_rng"] } [features] default = [ "charabia/default" ] diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 2dfe3580f..83079028c 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,8 +1,8 @@ +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; +use crate::Result; use roaring::RoaringBitmap; use std::ops::ControlFlow; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; - use super::{get_first_facet_value, get_highest_level}; pub fn iterate_over_facet_distribution<'t, CB>( @@ -11,18 +11,19 @@ pub fn iterate_over_facet_distribution<'t, CB>( field_id: u16, candidates: &RoaringBitmap, callback: CB, -) where +) -> Result<()> +where CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; let highest_level = - get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ fd.iterate(candidates, highest_level, first_bound, usize::MAX); - return; + return Ok(()); } else { - return; + return Ok(()); } } @@ -45,26 +46,26 @@ where candidates: &RoaringBitmap, starting_bound: &'t [u8], group_size: usize, - ) -> ControlFlow<()> { + ) -> Result> { let starting_key = FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; - let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size); + let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size); for el in iter { - let (key, value) = el.unwrap(); + let (key, value) = el?; // The range is unbounded on the right and the group size for the highest level is MAX, // so we need to check that we are not iterating over the next field id if key.field_id != self.field_id { - return ControlFlow::Break(()); + return Ok(ControlFlow::Break(())); } let docids_in_common = value.bitmap.intersection_len(candidates); if docids_in_common > 0 { match (self.callback)(key.left_bound, docids_in_common) { ControlFlow::Continue(_) => {} - ControlFlow::Break(_) => return ControlFlow::Break(()), + ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } } - return ControlFlow::Continue(()); + return Ok(ControlFlow::Continue(())); } fn iterate( &mut self, @@ -72,7 +73,7 @@ where level: u8, starting_bound: &'t [u8], group_size: usize, - ) -> ControlFlow<()> { + ) -> Result> { if level == 0 { return self.iterate_level_0(candidates, starting_bound, group_size); } @@ -84,34 +85,42 @@ where // The range is unbounded on the right and the group size for the highest level is MAX, // so we need to check that we are not iterating over the next field id if key.field_id != self.field_id { - return ControlFlow::Break(()); + return Ok(ControlFlow::Break(())); } let docids_in_common = value.bitmap & candidates; if docids_in_common.len() > 0 { - let cf = - self.iterate(&docids_in_common, level - 1, key.left_bound, value.size as usize); + let cf = self.iterate( + &docids_in_common, + level - 1, + key.left_bound, + value.size as usize, + )?; match cf { ControlFlow::Continue(_) => {} - ControlFlow::Break(_) => return ControlFlow::Break(()), + ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } } - return ControlFlow::Continue(()); + return Ok(ControlFlow::Continue(())); } } #[cfg(test)] mod tests { - use crate::{codec::U16Codec, Index}; use heed::BytesDecode; + use rand::{rngs::SmallRng, Rng, SeedableRng}; use roaring::RoaringBitmap; use std::ops::ControlFlow; + use crate::{ + heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex, + }; + use super::iterate_over_facet_distribution; - fn get_simple_index() -> Index { - let index = Index::::new(4, 8); + fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -121,18 +130,19 @@ mod tests { txn.commit().unwrap(); index } - fn get_random_looking_index() -> Index { - let index = Index::::new(4, 8); + fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = fastrand::Rng::with_seed(0); - let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); 
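             // the fixed RNG seed above keeps this "random looking" index
             // deterministic, so the insta snapshots taken on it stay stable;
             // each facet value maps to the two documents `key` and `key + 100`,
             // so bitmaps overlap across facet values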
bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as u16), &bitmap); + bitmap.insert(key + 100.); + index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); index @@ -156,7 +166,7 @@ mod tests { 0, &candidates, |facet, count| { - let facet = U16Codec::bytes_decode(facet).unwrap(); + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); results.push_str(&format!("{facet}: {count}\n")); ControlFlow::Continue(()) }, @@ -180,7 +190,7 @@ mod tests { 0, &candidates, |facet, count| { - let facet = U16Codec::bytes_decode(facet).unwrap(); + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); if nbr_facets == 100 { return ControlFlow::Break(()); } else { diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs new file mode 100644 index 000000000..c01346b25 --- /dev/null +++ b/milli/src/search/facet/facet_range_search.rs @@ -0,0 +1,451 @@ +use heed::BytesEncode; +use roaring::RoaringBitmap; +use std::ops::Bound; +use std::ops::RangeBounds; + +use crate::heed_codec::facet::new::FacetGroupValueCodec; +use crate::heed_codec::facet::new::FacetKey; +use crate::heed_codec::facet::new::FacetKeyCodec; +use crate::heed_codec::facet::new::MyByteSlice; +use crate::Result; + +use super::get_first_facet_value; +use super::get_highest_level; +use super::get_last_facet_value; + +pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + left: &'t Bound<>::EItem>, + right: &'t Bound<>::EItem>, +) -> Result +where + BoundCodec: for<'a> BytesEncode<'a>, + for<'a> >::EItem: Sized, +{ + let inner; + let left = match left { + Bound::Included(left) => { + inner = BoundCodec::bytes_encode(left).unwrap(); + Bound::Included(inner.as_ref()) + } + Bound::Excluded(left) => { + inner = BoundCodec::bytes_encode(left).unwrap(); + Bound::Excluded(inner.as_ref()) + } + Bound::Unbounded => Bound::Unbounded, + }; + let inner; + let right = match right { + Bound::Included(right) => { + inner = BoundCodec::bytes_encode(right).unwrap(); + Bound::Included(inner.as_ref()) + } + Bound::Excluded(right) => { + inner = BoundCodec::bytes_encode(right).unwrap(); + Bound::Excluded(inner.as_ref()) + } + Bound::Unbounded => Bound::Unbounded, + }; + + let mut docids = RoaringBitmap::new(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; + let highest_level = get_highest_level(rtxn, db, field_id)?; + + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; + Ok(docids) + } else { + return Ok(RoaringBitmap::new()); + } +} + +/// Fetch the document ids that have a facet with a value between the two given bounds +struct FacetRangeSearch<'t, 'b, 'bitmap> { + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + left: Bound<&'b [u8]>, + right: Bound<&'b [u8]>, + docids: &'bitmap mut RoaringBitmap, +} +impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { + fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { + let left_key = + FacetKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; + let iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + for el in iter { + let (key, value) = el?; + // the right side of the iter range is unbounded, so we need to make sure that we are not iterating + // on the next field id + if key.field_id != self.field_id { + return Ok(()); + } + let should_skip = { + match self.left { + Bound::Included(left) => left > key.left_bound, + Bound::Excluded(left) => left >= key.left_bound, + Bound::Unbounded => false, + } + }; + if should_skip { + continue; + } + let should_stop = { + match self.right { + Bound::Included(right) => right < key.left_bound, + Bound::Excluded(right) => right <= key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + break; + } + + if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { + *self.docids |= value.bitmap; + } + } + Ok(()) + } + + /// Recursive part of the algorithm for level > 0 + fn run( + &mut self, + level: u8, + starting_left_bound: &'t [u8], + rightmost_bound: Bound<&'t [u8]>, + group_size: usize, + ) -> Result<()> { + if level == 0 { + return self.run_level_0(starting_left_bound, group_size); + } + + let left_key = FacetKey { field_id: self.field_id, level, left_bound: starting_left_bound }; + let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + + let (mut previous_key, mut previous_value) = iter.next().unwrap()?; + for el in iter { + let (next_key, next_value) = el?; + // the right of the iter range is unbounded, so we need to make sure that we are not iterating + // on the next field id + if next_key.field_id != self.field_id { + return Ok(()); + } + // now, do we skip, stop, or visit? + let should_skip = { + match self.left { + Bound::Included(left) => left >= next_key.left_bound, + Bound::Excluded(left) => left >= next_key.left_bound, // TODO: use > instead? + Bound::Unbounded => false, + } + }; + if should_skip { + previous_key = next_key; + previous_value = next_value; + continue; + } + + // should we stop? + let should_stop = { + match self.right { + Bound::Included(right) => right < previous_key.left_bound, + Bound::Excluded(right) => right <= previous_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + return Ok(()); + } + // should we take the whole thing, without recursing down? 
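+            // a group spanning [previous_key.left_bound, next_key.left_bound)
+            // is absorbed wholesale only when it lies entirely inside the
+            // query range; a group that merely straddles one of the bounds is
+            // recursed into at level - 1 instead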
+ let should_take_whole_group = { + let left_condition = match self.left { + Bound::Included(left) => previous_key.left_bound >= left, + Bound::Excluded(left) => previous_key.left_bound > left, + Bound::Unbounded => true, + }; + let right_condition = match self.right { + Bound::Included(right) => next_key.left_bound <= right, + Bound::Excluded(right) => next_key.left_bound <= right, + Bound::Unbounded => true, + }; + left_condition && right_condition + }; + if should_take_whole_group { + *self.docids |= &previous_value.bitmap; + previous_key = next_key; + previous_value = next_value; + continue; + } + + let level = level - 1; + let starting_left_bound = previous_key.left_bound; + let rightmost_bound = Bound::Excluded(next_key.left_bound); + let group_size = previous_value.size as usize; + + self.run(level, starting_left_bound, rightmost_bound, group_size)?; + + previous_key = next_key; + previous_value = next_value; + } + // previous_key/previous_value are the last element + + // now, do we skip, stop, or visit? + let should_skip = { + match (self.left, rightmost_bound) { + (Bound::Included(left), Bound::Included(right)) => left > right, + (Bound::Included(left), Bound::Excluded(right)) => left >= right, + (Bound::Excluded(left), Bound::Included(right) | Bound::Excluded(right)) => { + left >= right + } + (Bound::Unbounded, _) => false, + (_, Bound::Unbounded) => false, // should never run? + } + }; + if should_skip { + return Ok(()); + } + + // should we stop? + let should_stop = { + match self.right { + Bound::Included(right) => right <= previous_key.left_bound, + Bound::Excluded(right) => right < previous_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + return Ok(()); + } + // should we take the whole thing, without recursing down? + let should_take_whole_group = { + let left_condition = match self.left { + Bound::Included(left) => previous_key.left_bound >= left, + Bound::Excluded(left) => previous_key.left_bound > left, + Bound::Unbounded => true, + }; + let right_condition = match (self.right, rightmost_bound) { + (Bound::Included(right), Bound::Included(rightmost)) => rightmost <= right, + (Bound::Included(right), Bound::Excluded(rightmost)) => rightmost < right, + // e.g. x < 8 and rightmost is <= y + // condition met if rightmost < 8 + (Bound::Excluded(right), Bound::Included(rightmost)) => rightmost < right, + // e.g. x < 8 and rightmost is < y + // condition met only if y <= 8? + (Bound::Excluded(right), Bound::Excluded(rightmost)) => rightmost <= right, + // e.g. x < inf. , so yes we take the whole thing + (Bound::Unbounded, _) => true, + // e.g. x < 7 , righmost is inf + (_, Bound::Unbounded) => false, // panic? 
+ }; + left_condition && right_condition + }; + if should_take_whole_group { + *self.docids |= &previous_value.bitmap; + } else { + let level = level - 1; + let starting_left_bound = previous_key.left_bound; + let group_size = previous_value.size as usize; + + self.run(level, starting_left_bound, rightmost_bound, group_size)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::{ + heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, + search::facet::test::FacetIndex, snapshot_tests::display_bitmap, + }; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + use std::ops::Bound; + + use super::find_docids_of_facet_within_bounds; + + fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_range_increasing() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Included(0.); + let end = Bound::Included(i); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!( + format!("filter_range_{i}_increasing_included_bounds"), + results + ); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Excluded(0.); + let end = Bound::Excluded(i); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!( + format!("filter_range_{i}_increasing_excluded_bounds"), + results + ); + txn.commit().unwrap(); + } + } + #[test] + fn filter_range_decreasing() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + + let mut results = String::new(); + + for i in (0..=255).into_iter().rev() { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(255.); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + insta::assert_snapshot!( + format!("filter_range_{i}_decreasing_included_bounds"), + results + ); + + let mut results = String::new(); + + for i in (0..=255).into_iter().rev() { + let i = i as f64; + let start = Bound::Excluded(i); + let end = 
Bound::Excluded(255.); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + insta::assert_snapshot!( + format!("filter_range_{i}_decreasing_excluded_bounds"), + results + ); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_range_pinch() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + + let mut results = String::new(); + + for i in (0..=128).into_iter().rev() { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(255. - i); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + insta::assert_snapshot!(format!("filter_range_{i}_pinch_included_bounds"), results); + + let mut results = String::new(); + + for i in (0..=128).into_iter().rev() { + let i = i as f64; + let start = Bound::Excluded(i); + let end = Bound::Excluded(255. - i); + let docids = find_docids_of_facet_within_bounds::( + &txn, + &index.db.content, + 0, + &start, + &end, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + insta::assert_snapshot!(format!("filter_range_{i}_pinch_excluded_bounds"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index c9abd9556..73491d4ae 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -1,8 +1,8 @@ -use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; +use crate::Result; +use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; @@ -11,20 +11,20 @@ pub fn ascending_facet_sort<'t>( db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, -) -> Box + 't> { +) -> Result> + 't>> { let highest_level = - get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; if let Some(first_bound) = get_first_facet_value::( rtxn, &db.remap_key_type::>(), field_id, - ) { + )? 
{ let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); - Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] }) + Ok(Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] })) } else { - return Box::new(std::iter::empty()); + Ok(Box::new(std::iter::empty())) } } @@ -39,7 +39,7 @@ struct AscendingFacetSort<'t, 'e> { } impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { - type Item = (&'t [u8], RoaringBitmap); + type Item = Result<(&'t [u8], RoaringBitmap)>; fn next(&mut self) -> Option { 'outer: loop { @@ -67,15 +67,15 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { *documents_ids -= &bitmap; if level == 0 { - return Some((left_bound, bitmap)); + return Some(Ok((left_bound, bitmap))); } let starting_key_below = FacetKey { field_id: self.field_id, level: level - 1, left_bound }; - let iter = self - .db - .range(&self.rtxn, &(starting_key_below..)) - .unwrap() - .take(group_size as usize); + let iter = match self.db.range(&self.rtxn, &(starting_key_below..)) { + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); self.stack.push((bitmap, iter)); continue 'outer; @@ -88,14 +88,19 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use crate::{ - ascending_facet_sort::ascending_facet_sort, codec::U16Codec, display_bitmap, Index, - }; use heed::BytesDecode; + use rand::Rng; + use rand::SeedableRng; use roaring::RoaringBitmap; - fn get_simple_index() -> Index { - let index = Index::::new(4, 8); + use crate::{ + heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, + search::facet::{facet_sort_ascending::ascending_facet_sort, test::FacetIndex}, + snapshot_tests::display_bitmap, + }; + + fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -105,18 +110,19 @@ mod tests { txn.commit().unwrap(); index } - fn get_random_looking_index() -> Index { - let index = Index::::new(4, 8); + fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = fastrand::Rng::with_seed(0); - let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as u16), &bitmap); + index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); index @@ -136,7 +142,7 @@ mod tests { let mut results = String::new(); let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates); for (facet, docids) in iter { - let facet = U16Codec::bytes_decode(facet).unwrap(); + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); } insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index d3c9d54f8..81b0eb09d 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -1,10 +1,10 @@ use 
std::ops::Bound; -use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; +use crate::Result; +use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; @@ -13,21 +13,21 @@ fn descending_facet_sort<'t>( db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, -) -> Box + 't> { - let highest_level = get_highest_level(rtxn, db, field_id); - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { +) -> Result> + 't>> { + let highest_level = get_highest_level(rtxn, db, field_id)?; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id).unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound }; - let iter = db.rev_range(rtxn, &(first_key..=last_key)).unwrap().take(usize::MAX); - Box::new(DescendingFacetSort { + let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); + Ok(Box::new(DescendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter, Bound::Included(last_bound))], - }) + })) } else { - return Box::new(std::iter::empty()); + Ok(Box::new(std::iter::empty())) } } @@ -43,7 +43,7 @@ struct DescendingFacetSort<'t> { } impl<'t> Iterator for DescendingFacetSort<'t> { - type Item = (&'t [u8], RoaringBitmap); + type Item = Result<(&'t [u8], RoaringBitmap)>; fn next(&mut self) -> Option { 'outer: loop { @@ -70,7 +70,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { *documents_ids -= &bitmap; if level == 0 { - return Some((left_bound, bitmap)); + return Some(Ok((left_bound, bitmap))); } let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; @@ -89,14 +89,14 @@ impl<'t> Iterator for DescendingFacetSort<'t> { }; let prev_right_bound = *right_bound; *right_bound = Bound::Excluded(left_bound); - let iter = self - .db - .rev_range( - &self.rtxn, - &(Bound::Included(starting_key_below), end_key_kelow), - ) - .unwrap() - .take(group_size as usize); + let iter = match self.db.rev_range( + &self.rtxn, + &(Bound::Included(starting_key_below), end_key_kelow), + ) { + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); self.stack.push((bitmap, iter, prev_right_bound)); continue 'outer; @@ -110,16 +110,20 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - use crate::{ - codec::{MyByteSlice, U16Codec}, - descending_facet_sort::descending_facet_sort, - display_bitmap, FacetKeyCodec, Index, - }; + use heed::BytesDecode; + use rand::Rng; + use rand::SeedableRng; use roaring::RoaringBitmap; - fn get_simple_index() -> Index { - let index = Index::::new(4, 8); + use crate::{ + heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec, MyByteSlice}, + search::facet::{facet_sort_descending::descending_facet_sort, test::FacetIndex}, + snapshot_tests::display_bitmap, + }; + + fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -129,18 +133,19 @@ mod tests { txn.commit().unwrap(); index } - fn get_random_looking_index() -> Index { - let index = Index::::new(4, 8); + fn get_random_looking_index() -> 
FacetIndex { + let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = fastrand::Rng::with_seed(0); - let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as u16), &bitmap); + bitmap.insert(key + 100.); + index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); index @@ -161,7 +166,7 @@ mod tests { let db = index.db.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, &db, 0, candidates); for (facet, docids) in iter { - let facet = U16Codec::bytes_decode(facet).unwrap(); + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); } insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index e911dfb15..dd34abe6d 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -7,7 +7,6 @@ use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; use heed::LazyDecode; -use log::debug; use roaring::RoaringBitmap; // use super::FacetNumberRange; diff --git a/milli/src/search/facet/incremental_update.rs b/milli/src/search/facet/incremental_update.rs new file mode 100644 index 000000000..a437efb2d --- /dev/null +++ b/milli/src/search/facet/incremental_update.rs @@ -0,0 +1,459 @@ +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; +use crate::Result; +use heed::Error; +use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + +use super::get_highest_level; + +enum InsertionResult { + InPlace, + Insert, +} +enum DeletionResult { + InPlace, + Reduce { prev: Option>, next: Option> }, + Remove { prev: Option>, next: Option> }, +} + +struct IncrementalFacetUpdate<'i> { + db: &'i heed::Database, FacetGroupValueCodec>, + group_size: usize, + min_level_size: usize, + max_group_size: usize, +} +impl<'i> IncrementalFacetUpdate<'i> { + fn find_insertion_key_value<'a>( + &self, + field_id: u16, + level: u8, + search_key: &[u8], + txn: &RoTxn, + ) -> Result<(FacetKey>, FacetGroupValue)> { + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); + prefix.extend_from_slice(search_key); + + let mut prefix_iter = self + .db + .as_polymorph() + .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>(txn, &prefix.as_slice())?; + if let Some(e) = prefix_iter.next() { + let (key_bytes, value) = e?; + let key = FacetKeyCodec::::bytes_decode(&key_bytes) + .ok_or(heed::Error::Encoding)?; + Ok(( + FacetKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)? + .into_owned(), + value, + )) + } else { + let key = FacetKey { field_id, level, left_bound: search_key }; + match self.db.get_lower_than(txn, &key)? 
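+            // no group starts with `search_key` itself, so the enclosing
+            // group is the one whose key sorts immediately below it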
{ + Some((key, value)) => { + if key.level != level || key.field_id != field_id { + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); + + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>( + txn, + &prefix.as_slice(), + )?; + let (key_bytes, value) = iter.next().unwrap()?; + Ok(( + FacetKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)? + .into_owned(), + value, + )) + } else { + Ok((key.into_owned(), value)) + } + } + None => panic!(), + } + } + } + + fn insert_in_level_0<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + new_key: &[u8], + new_values: &RoaringBitmap, + ) -> Result { + let key = FacetKey { field_id, level: 0, left_bound: new_key }; + let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 }; + + let mut level0_prefix = vec![]; + level0_prefix.extend_from_slice(&field_id.to_be_bytes()); + level0_prefix.push(0); + + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level0_prefix)?; + + if iter.next().is_none() { + drop(iter); + self.db.put(txn, &key, &value)?; + return Ok(InsertionResult::Insert); + } else { + drop(iter); + let old_value = self.db.get(&txn, &key)?; + match old_value { + Some(mut updated_value) => { + // now merge the two + updated_value.bitmap |= value.bitmap; + self.db.put(txn, &key, &updated_value)?; + Ok(InsertionResult::InPlace) + } + None => { + self.db.put(txn, &key, &value)?; + Ok(InsertionResult::Insert) + } + } + } + } + fn insert_in_level<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + level: u8, + new_key: &[u8], + new_values: &RoaringBitmap, + ) -> Result { + if level == 0 { + return self.insert_in_level_0(txn, field_id, new_key, new_values); + } + + let max_group_size = self.max_group_size; + + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, new_key, txn)?; + + let result = self.insert_in_level(txn, field_id, level - 1, new_key.clone(), new_values)?; + // level below inserted an element + + let insertion_key = { + let mut new_insertion_key = insertion_key.clone(); + let mut modified = false; + + if new_key < insertion_key.left_bound.as_slice() { + new_insertion_key.left_bound = new_key.to_vec(); + modified = true; + } + if modified { + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; + } + new_insertion_key + }; + + match result { + // TODO: this could go above the block recomputing insertion key + // because we know that if we inserted in place, the key is not a new one + // thus it doesn't extend a group + InsertionResult::InPlace => { + let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + updated_value.bitmap |= new_values; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + + return Ok(InsertionResult::InPlace); + } + InsertionResult::Insert => {} + } + let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + + updated_value.size += 1; + if updated_value.size as usize == max_group_size { + // need to split it + // recompute left element and right element + // replace current group by left element + // add one more group to the right + + let size_left = max_group_size / 2; + let size_right = max_group_size - size_left; + + let level_below = level - 1; + + let (start_key, _) = self + .db + .get_greater_than_or_equal_to( + &txn, + &FacetKey { + field_id, + 
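+                        // first entry, at the level below, of the group being
+                        // split; the next `max_group_size` entries are re-read
+                        // and re-grouped into a left and a right half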
level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }, + )? + .unwrap(); + + let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size); + + let group_left = { + let mut values_left = RoaringBitmap::new(); + + let mut i = 0; + while let Some(next) = iter.next() { + let (_key, value) = next?; + i += 1; + values_left |= &value.bitmap; + if i == size_left { + break; + } + } + + let key = + FacetKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; + (key, value) + }; + + let group_right = { + let mut values_right = RoaringBitmap::new(); + let mut right_start_key = None; + + while let Some(next) = iter.next() { + let (key, value) = next?; + if right_start_key.is_none() { + right_start_key = Some(key.left_bound); + } + values_right |= &value.bitmap; + } + + let key = + FacetKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() }; + let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; + (key, value) + }; + drop(iter); + + let _ = self.db.delete(txn, &insertion_key.as_ref())?; + + self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; + self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; + + Ok(InsertionResult::Insert) + } else { + let mut value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + value.bitmap |= new_values; + value.size += 1; + self.db.put(txn, &insertion_key.as_ref(), &value).unwrap(); + + Ok(InsertionResult::InPlace) + } + } + + pub fn insert<'a, 't>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + new_key: &[u8], + new_values: &RoaringBitmap, + ) -> Result<()> { + if new_values.is_empty() { + return Ok(()); + } + let group_size = self.group_size; + + let highest_level = get_highest_level(&txn, &self.db, field_id)?; + + let result = + self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; + match result { + InsertionResult::InPlace => return Ok(()), + InsertionResult::Insert => {} + } + + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + let size_highest_level = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? 
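+            // number of groups currently stored in the highest level; a new
+            // level is only added on top once this reaches `min_level_size`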
+ .count(); + + if size_highest_level < self.min_level_size { + return Ok(()); + } + + let mut groups_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?; + + let mut to_add = vec![]; + for _ in 0..group_size { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..group_size { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetKey { + field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + let value = FacetGroupValue { size: group_size as u8, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + drop(groups_iter); + for (key, value) in to_add { + self.db.put(txn, &key.as_ref(), &value)?; + } + Ok(()) + } + + fn delete_in_level<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + level: u8, + key: &[u8], + value: u32, + ) -> Result { + if level == 0 { + return self.delete_in_level_0(txn, field_id, key, value); + } + let (deletion_key, mut bitmap) = + self.find_insertion_key_value(field_id, level, key, txn)?; + + let result = self.delete_in_level(txn, field_id, level - 1, key.clone(), value)?; + + let mut decrease_size = false; + let (prev_key, next_key) = match result { + DeletionResult::InPlace => { + bitmap.bitmap.remove(value); + self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; + return Ok(DeletionResult::InPlace); + } + DeletionResult::Reduce { prev, next } => (prev, next), + DeletionResult::Remove { prev, next } => { + decrease_size = true; + (prev, next) + } + }; + + let mut updated_value = bitmap; + if decrease_size { + updated_value.size -= 1; + } + + if updated_value.size == 0 { + self.db.delete(txn, &deletion_key.as_ref())?; + Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + } else { + let mut updated_deletion_key = deletion_key.clone(); + if key == deletion_key.left_bound { + updated_deletion_key.left_bound = next_key.clone().unwrap(); + } + updated_value.bitmap.remove(value); + let _ = self.db.delete(txn, &deletion_key.as_ref())?; + self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; + + Ok(DeletionResult::Reduce { prev: prev_key, next: next_key }) + } + } + + fn delete_in_level_0<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + key: &[u8], + value: u32, + ) -> Result { + let key = FacetKey { field_id, level: 0, left_bound: key }; + let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; + bitmap.remove(value); + + if bitmap.is_empty() { + let mut prev_key = None; + let mut next_key = None; + + if let Some(prev) = self.db.get_lower_than(&txn, &key)? { + prev_key = Some(prev.0.left_bound.to_vec()); + } + if let Some(next) = self.db.get_greater_than(&txn, &key)? 
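The insertion and deletion walks above communicate with their callers through small result enums. Their definitions are not shown in this hunk; the sketches below are reconstructed from how the variants are used here:

enum InsertionResult {
    InPlace, // an existing key's bitmap was updated
    Insert,  // a new key was added, so the parent group grew by one
}

enum DeletionResult {
    // the bitmap of an existing key shrank, nothing else changes
    InPlace,
    // the group's left bound moved; prev/next carry the neighbouring bounds
    Reduce { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
    // the key disappeared entirely, so the parent group shrank by one
    Remove { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
}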
{ + if next.0.level == 0 { + next_key = Some(next.0.left_bound.to_vec()); + } + } + self.db.delete(txn, &key)?; + Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + } else { + self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; + Ok(DeletionResult::InPlace) + } + } + + pub fn delete<'a, 't>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + key: &[u8], + value: u32, + ) -> Result<()> { + if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() { + return Ok(()); + } + let highest_level = get_highest_level(&txn, &self.db, field_id)?; + + // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + + let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?; + match result { + DeletionResult::InPlace => return Ok(()), + DeletionResult::Reduce { .. } => {} + DeletionResult::Remove { .. } => {} + } + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + if highest_level == 0 + || self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? + .count() + >= self.group_size + { + return Ok(()); + } + let mut to_delete = vec![]; + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)?; + while let Some(el) = iter.next() { + let (k, _) = el?; + to_delete.push( + FacetKeyCodec::::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(), + ); + } + drop(iter); + for k in to_delete { + self.db.delete(txn, &k.as_ref())?; + } + Ok(()) + } +} diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ceedff1e0..d27206af2 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -10,38 +10,39 @@ pub use self::filter::Filter; mod facet_distribution; mod facet_distribution_iter; +mod facet_range_search; mod facet_sort_ascending; mod facet_sort_descending; mod filter; +mod incremental_update; -fn get_first_facet_value<'t, BoundCodec>( +pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> Option +) -> crate::Result> where BoundCodec: BytesDecode<'t>, { let mut level0prefix = vec![]; level0prefix.extend_from_slice(&field_id.to_be_bytes()); level0prefix.push(0); - let mut level0_iter_forward = db - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) - .unwrap(); + let mut level0_iter_forward = + db.as_polymorph().prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(first) = level0_iter_forward.next() { - let (first_key, _) = first.unwrap(); - let first_key = FacetKeyCodec::::bytes_decode(first_key).unwrap(); - Some(first_key.left_bound) + let (first_key, _) = first?; + let first_key = + FacetKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; + Ok(Some(first_key.left_bound)) } else { - None + Ok(None) } } -fn get_last_facet_value<'t, BoundCodec>( +pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> Option +) -> crate::Result> where BoundCodec: BytesDecode<'t>, { @@ -50,30 +51,129 @@ where level0prefix.push(0); let mut level0_iter_backward = db .as_polymorph() - .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) - .unwrap(); + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(last) = 
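In delete above, once the highest level (other than level 0) keeps fewer than group_size groups, the whole level is dropped and the level below becomes the new top of the tree. The same cleanup over a plain ordered map standing in for LMDB (illustrative only):

use std::collections::BTreeMap;

fn drop_level(db: &mut BTreeMap<Vec<u8>, Vec<u8>>, field_id: u16, level: u8) {
    let mut prefix = field_id.to_be_bytes().to_vec();
    prefix.push(level);
    // remove every key of this field at this level; lower levels are untouched
    db.retain(|key, _| !key.starts_with(&prefix));
}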
level0_iter_backward.next() { - let (last_key, _) = last.unwrap(); - let last_key = FacetKeyCodec::::bytes_decode(last_key).unwrap(); - Some(last_key.left_bound) + let (last_key, _) = last?; + let last_key = + FacetKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; + Ok(Some(last_key.left_bound)) } else { - None + Ok(None) } } -fn get_highest_level<'t>( +pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> u8 { +) -> crate::Result { let field_id_prefix = &field_id.to_be_bytes(); - db.as_polymorph() - .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix) - .unwrap() + Ok(db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix)? .next() .map(|el| { let (key, _) = el.unwrap(); let key = FacetKeyCodec::::bytes_decode(key).unwrap(); key.level }) - .unwrap_or(0) + .unwrap_or(0)) +} + +#[cfg(test)] +mod test { + use std::{fmt::Display, marker::PhantomData, rc::Rc}; + + use heed::{BytesDecode, BytesEncode, Env}; + use tempfile::TempDir; + + use crate::{ + heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, + }, + snapshot_tests::display_bitmap, + }; + + pub struct FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub env: Env, + pub db: Database, + _phantom: PhantomData, + } + + pub struct Database { + pub content: heed::Database, FacetGroupValueCodec>, + pub group_size: usize, + pub max_group_size: usize, + _tempdir: Rc, + } + + impl FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub fn open_from_tempdir( + tempdir: Rc, + group_size: u8, + max_group_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; + let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 10 * 100); + unsafe { + options.flag(heed::flags::Flags::MdbAlwaysFreePages); + } + let env = options.open(tempdir.path()).unwrap(); + let content = env.open_database(None).unwrap().unwrap(); + + FacetIndex { + db: Database { content, group_size, max_group_size, _tempdir: tempdir }, + env, + _phantom: PhantomData, + } + } + pub fn new(group_size: u8, max_group_size: u8) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; + let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 100); + let tempdir = tempfile::TempDir::new_in("databases/").unwrap(); + let env = options.open(tempdir.path()).unwrap(); + let content = env.create_database(None).unwrap(); + + FacetIndex { + db: Database { content, group_size, max_group_size, _tempdir: Rc::new(tempdir) }, + env, + _phantom: PhantomData, + } + } + } + + impl Display for FacetIndex + where + for<'a> >::EItem: Sized + Display, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let txn = self.env.read_txn().unwrap(); + let mut iter = self.db.content.iter(&txn).unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let FacetKey { field_id, level, left_bound: bound } = key; + let bound = BoundCodec::bytes_decode(bound).unwrap(); + let FacetGroupValue { size, bitmap } = 
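Both test-harness constructors above clamp their parameters identically before opening the environment. As a pure function (illustrative):

fn clamp_sizes(group_size: u8, max_group_size: u8) -> (usize, usize) {
    let group_size = group_size.clamp(2, 127) as usize;
    let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize);
    (group_size, max_group_size)
}

So group sizes always lie in 2..=127, and a group may always grow to at least twice its nominal size before it is split.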
value; + writeln!( + f, + "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", + values = display_bitmap(&bitmap) + )?; + } + Ok(()) + } + } } From 5a904cf29d77ccb7bbeda88373f9017f9c0e388c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 07:50:18 +0200 Subject: [PATCH 1696/1889] Reintroduce facet distribution functionality --- milli/src/search/facet/facet_distribution.rs | 132 +++++++++--------- .../search/facet/facet_distribution_iter.rs | 4 +- milli/src/search/facet/mod.rs | 6 +- 3 files changed, 72 insertions(+), 70 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index fddf93d4b..670719a9b 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,13 +1,18 @@ use std::collections::{BTreeMap, HashSet}; -use std::ops::Bound::Unbounded; +use std::ops::ControlFlow; use std::{fmt, mem}; use heed::types::ByteSlice; +use heed::BytesDecode; use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::search::facet::facet_distribution_iter; // use crate::search::facet::FacetStringIter; use crate::{FieldId, Index, Result}; @@ -131,22 +136,21 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - todo!() - // let iter = - // FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - - // for result in iter { - // let (value, mut docids) = result?; - // docids &= candidates; - // if !docids.is_empty() { - // distribution.insert(value.to_string(), docids.len()); - // } - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } - - // Ok(()) + facet_distribution_iter::iterate_over_facet_distribution( + self.rtxn, + &self.index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates, + |facet_key, nbr_docids| { + let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap(); + distribution.insert(facet_key.to_string(), nbr_docids); + if distribution.len() == self.max_values_per_facet { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(()) + } + }, + ) } fn facet_strings_distribution_from_facet_levels( @@ -155,22 +159,21 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - todo!() - // let iter = - // FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - - // for result in iter { - // let (_normalized, original, mut docids) = result?; - // docids &= candidates; - // if !docids.is_empty() { - // distribution.insert(original.to_string(), docids.len()); - // } - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } - - // Ok(()) + facet_distribution_iter::iterate_over_facet_distribution( + self.rtxn, + &self.index.facet_id_string_docids.remap_key_type::>(), + field_id, + candidates, + |facet_key, nbr_docids| { + let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); + distribution.insert(facet_key.to_string(), nbr_docids); + if distribution.len() == self.max_values_per_facet { + ControlFlow::Break(()) + } else { + 
ControlFlow::Continue(()) + } + }, + ) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -179,43 +182,42 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, ) -> heed::Result> { - todo!() - // let mut distribution = BTreeMap::new(); + let mut distribution = BTreeMap::new(); - // let db = self.index.facet_id_f64_docids; - // let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + let db = self.index.facet_id_f64_docids; + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(0); + let iter = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); - // for result in range { - // let ((_, _, value, _), docids) = result?; - // distribution.insert(value.to_string(), docids.len()); - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } + for result in iter { + let (key, value) = result?; + distribution.insert(key.left_bound.to_string(), value.bitmap.len()); + if distribution.len() == self.max_values_per_facet { + break; + } + } - // let iter = self - // .index - // .facet_id_string_docids - // .remap_key_type::() - // .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - // .remap_key_type::(); + let iter = self + .index + .facet_id_string_docids + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); - // let mut normalized_distribution = BTreeMap::new(); - // for result in iter { - // let ((_, normalized_value), group_value) = result?; - // normalized_distribution - // .insert(normalized_value, (normalized_value, group_value.bitmap.len())); - // if normalized_distribution.len() == self.max_values_per_facet { - // break; - // } - // } + // TODO: get the original value of the facet somewhere (in the documents DB?) + for result in iter { + let (key, value) = result?; + distribution.insert(key.left_bound.to_owned(), value.bitmap.len()); + if distribution.len() == self.max_values_per_facet { + break; + } + } - // let iter = normalized_distribution - // .into_iter() - // .map(|(_normalized, (original, count))| (original.to_string(), count)); - // distribution.extend(iter); - - // Ok(distribution) + Ok(distribution) } fn facet_values(&self, field_id: FieldId) -> heed::Result> { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 83079028c..9f1031a85 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,5 +1,5 @@ use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; -use crate::Result; +use heed::Result; use roaring::RoaringBitmap; use std::ops::ControlFlow; @@ -20,7 +20,7 @@ where get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
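Both distribution callbacks above return ControlFlow so the traversal stops as soon as max_values_per_facet entries have been collected, replacing the break inside the old commented-out loop. A self-contained sketch of that pattern (the driver and data here are illustrative):

use std::collections::BTreeMap;
use std::ops::ControlFlow;

fn for_each_facet_value(
    values: &[(&str, u64)],
    mut callback: impl FnMut(&str, u64) -> ControlFlow<()>,
) {
    for &(value, count) in values {
        if callback(value, count).is_break() {
            break;
        }
    }
}

fn capped_distribution(values: &[(&str, u64)], max_values: usize) -> BTreeMap<String, u64> {
    let mut distribution = BTreeMap::new();
    for_each_facet_value(values, |value, count| {
        distribution.insert(value.to_string(), count);
        if distribution.len() == max_values {
            ControlFlow::Break(())
        } else {
            ControlFlow::Continue(())
        }
    });
    distribution
}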
{ - fd.iterate(candidates, highest_level, first_bound, usize::MAX); + fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; return Ok(()); } else { return Ok(()); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index d27206af2..023d433ad 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -20,7 +20,7 @@ pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result> +) -> heed::Result> where BoundCodec: BytesDecode<'t>, { @@ -42,7 +42,7 @@ pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result> +) -> heed::Result> where BoundCodec: BytesDecode<'t>, { @@ -65,7 +65,7 @@ pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result { +) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); Ok(db .as_polymorph() From 6cc91824c1d831950187cfa6b4ca047cf0b89683 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 07:51:11 +0200 Subject: [PATCH 1697/1889] Remove unused heed codec files --- .../facet_string_zero_bounds_value_codec.rs | 114 ------------------ .../facet/facet_value_string_codec.rs | 35 ------ milli/src/heed_codec/facet/mod.rs | 4 +- milli/src/update/delete_documents.rs | 2 +- 4 files changed, 3 insertions(+), 152 deletions(-) delete mode 100644 milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs delete mode 100644 milli/src/heed_codec/facet/facet_value_string_codec.rs diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs deleted file mode 100644 index 337433c2b..000000000 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ /dev/null @@ -1,114 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::{marker, str}; - -use super::try_split_at; - -/// A codec that optionally encodes two strings in front of the value. -/// -/// The usecase is for the facet string levels algorithm where we must -/// know the origin of a group, the group left and right bounds are stored -/// in the value to not break the lexicographical ordering of the LMDB keys. 
-pub struct FacetStringZeroBoundsValueCodec(marker::PhantomData); - -impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec -where - C: heed::BytesDecode<'a>, -{ - type DItem = (Option<(&'a str, &'a str)>, C::DItem); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (contains_bounds, bytes) = bytes.split_first()?; - - if *contains_bounds != 0 { - let (left_len, bytes) = try_split_at(bytes, 2)?; - let (right_len, bytes) = try_split_at(bytes, 2)?; - - let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?; - let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?; - - let (left, bytes) = try_split_at(bytes, left_len as usize)?; - let (right, bytes) = try_split_at(bytes, right_len as usize)?; - - let left = str::from_utf8(left).ok()?; - let right = str::from_utf8(right).ok()?; - - C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) - } else { - C::bytes_decode(bytes).map(|item| (None, item)) - } - } -} - -impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec -where - C: heed::BytesEncode<'a>, -{ - type EItem = (Option<(&'a str, &'a str)>, C::EItem); - - fn bytes_encode((bounds, value): &'a Self::EItem) -> Option> { - let mut bytes = Vec::new(); - - match bounds { - Some((left, right)) => { - bytes.push(u8::max_value()); - - if left.is_empty() || right.is_empty() { - return None; - } - - let left_len: u16 = left.len().try_into().ok()?; - let right_len: u16 = right.len().try_into().ok()?; - - bytes.extend_from_slice(&left_len.to_be_bytes()); - bytes.extend_from_slice(&right_len.to_be_bytes()); - - bytes.extend_from_slice(left.as_bytes()); - bytes.extend_from_slice(right.as_bytes()); - - let value_bytes = C::bytes_encode(&value)?; - bytes.extend_from_slice(&value_bytes[..]); - - Some(Cow::Owned(bytes)) - } - None => { - bytes.push(0); - let value_bytes = C::bytes_encode(&value)?; - bytes.extend_from_slice(&value_bytes[..]); - Some(Cow::Owned(bytes)) - } - } - } -} - -#[cfg(test)] -mod tests { - use heed::types::Unit; - use heed::{BytesDecode, BytesEncode}; - use roaring::RoaringBitmap; - - use super::*; - use crate::CboRoaringBitmapCodec; - - #[test] - fn deserialize_roaring_bitmaps() { - let bounds = Some(("abc", "def")); - let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); - let key = (bounds, docids.clone()); - let bytes = - FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); - let (out_bounds, out_docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_bounds, out_docids), (bounds, docids)); - } - - #[test] - fn deserialize_unit() { - let bounds = Some(("abc", "def")); - let key = (bounds, ()); - let bytes = FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); - let (out_bounds, out_unit) = - FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_bounds, out_unit), (bounds, ())); - } -} diff --git a/milli/src/heed_codec/facet/facet_value_string_codec.rs b/milli/src/heed_codec/facet/facet_value_string_codec.rs deleted file mode 100644 index 54abb7886..000000000 --- a/milli/src/heed_codec/facet/facet_value_string_codec.rs +++ /dev/null @@ -1,35 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, FieldId}; - -pub struct FacetValueStringCodec; - -impl FacetValueStringCodec { - pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 2); - out.extend_from_slice(&field_id.to_be_bytes()); - out.extend_from_slice(value.as_bytes()); - } -} - -impl<'a> 
heed::BytesDecode<'a> for FacetValueStringCodec { - type DItem = (FieldId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let value = str::from_utf8(bytes).ok()?; - Some((field_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { - type EItem = (FieldId, &'a str); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FacetValueStringCodec::serialize_into(*field_id, value, &mut bytes); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index d23ab391e..e145e311e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -2,7 +2,7 @@ // mod facet_level_value_u32_codec; // mod facet_string_level_zero_codec; // mod facet_string_level_zero_value_codec; -mod facet_string_zero_bounds_value_codec; +// mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; @@ -16,7 +16,7 @@ use heed::types::OwnedType; // pub use self::facet_string_level_zero_value_codec::{ // decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, // }; -pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; +// pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; use crate::BEU16; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index bb30f24c9..5eebff913 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -10,7 +10,7 @@ use time::OffsetDateTime; use super::ClearDocuments; use crate::error::{InternalError, SerializationError, UserError}; -use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ From 22d80eeaf9262f9f97135d147bf45258240d9a3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 08:10:45 +0200 Subject: [PATCH 1698/1889] Reintroduce facet deletion functionality --- milli/src/update/delete_documents.rs | 139 ++++++++++++--------------- 1 file changed, 62 insertions(+), 77 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 5eebff913..32b2ac986 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,19 +3,21 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; use heed::{BytesDecode, BytesEncode, Database}; +use obkv::Key; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; -use super::ClearDocuments; +use super::{ClearDocuments, Facets}; use crate::error::{InternalError, SerializationError, UserError}; // use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, - RoaringBitmapCodec, 
SmallString32, BEU32, + fields_ids_map, DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, + FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -62,6 +64,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { pub fn execute(mut self) -> Result { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; @@ -439,25 +442,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; } + remove_docids_from_facet_id_docids( + self.wtxn, + self.index, + facet_id_f64_docids.remap_key_type::>(), + &self.to_delete_docids, + fields_ids_map.clone(), + )?; + remove_docids_from_facet_id_docids( + self.wtxn, + self.index, + facet_id_string_docids.remap_key_type::>(), + &self.to_delete_docids, + fields_ids_map.clone(), + )?; // We delete the documents ids that are under the facet field id values. - // TODO: remove_docids_from_facet_field_id_docids( - // self.wtxn, - // facet_id_f64_docids, - // &self.to_delete_docids, - // )?; - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_docids( + remove_docids_from_facet_id_exists_docids( self.wtxn, facet_id_exists_docids, &self.to_delete_docids, )?; - remove_docids_from_facet_field_id_string_docids( - self.wtxn, - facet_id_string_docids, - &self.to_delete_docids, - )?; - // Remove the documents ids from the faceted documents ids. for field_id in self.index.faceted_fields_ids(self.wtxn)? { // Remove docids from the number faceted documents ids @@ -580,67 +585,7 @@ where Ok(()) } -fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( - wtxn: &'a mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, -) -> crate::Result<()> { - // let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); - // let mut iter = db.remap_types::().iter_mut(wtxn)?; - // while let Some(result) = iter.next() { - // let (key, val) = result?; - // match FacetLevelValueU32Codec::bytes_decode(key) { - // Some(_) => { - // // If we are able to parse this key it means it is a facet string group - // // level key. We must then parse the value using the appropriate codec. - // let (group, mut docids) = - // FacetStringZeroBoundsValueCodec::::bytes_decode(val) - // .ok_or_else(|| SerializationError::Decoding { db_name })?; - - // let previous_len = docids.len(); - // docids -= to_remove; - // if docids.is_empty() { - // // safety: we don't keep references from inside the LMDB database. - // unsafe { iter.del_current()? }; - // } else if docids.len() != previous_len { - // let key = key.to_owned(); - // let val = &(group, docids); - // let value_bytes = - // FacetStringZeroBoundsValueCodec::::bytes_encode(val) - // .ok_or_else(|| SerializationError::Encoding { db_name })?; - - // // safety: we don't keep references from inside the LMDB database. - // unsafe { iter.put_current(&key, &value_bytes)? }; - // } - // } - // None => { - // // The key corresponds to a level zero facet string. 
- // let (original_value, mut docids) = - // FacetStringLevelZeroValueCodec::bytes_decode(val) - // .ok_or_else(|| SerializationError::Decoding { db_name })?; - - // let previous_len = docids.len(); - // docids -= to_remove; - // if docids.is_empty() { - // // safety: we don't keep references from inside the LMDB database. - // unsafe { iter.del_current()? }; - // } else if docids.len() != previous_len { - // let key = key.to_owned(); - // let val = &(original_value, docids); - // let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) - // .ok_or_else(|| SerializationError::Encoding { db_name })?; - - // // safety: we don't keep references from inside the LMDB database. - // unsafe { iter.put_current(&key, &value_bytes)? }; - // } - // } - // } - // } - - Ok(()) -} - -fn remove_docids_from_facet_field_id_docids<'a, C>( +fn remove_docids_from_facet_id_exists_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, @@ -665,6 +610,46 @@ where Ok(()) } +fn remove_docids_from_facet_id_docids<'a>( + wtxn: &'a mut heed::RwTxn, + index: &Index, + db: heed::Database, FacetGroupValueCodec>, + to_remove: &RoaringBitmap, + fields_ids_map: FieldsIdsMap, +) -> Result<()> { + let mut modified = false; + for field_id in fields_ids_map.ids() { + let mut level0_prefix = vec![]; + level0_prefix.extend_from_slice(&field_id.to_be_bytes()); + level0_prefix.push(0); + let mut iter = db + .as_polymorph() + .prefix_iter_mut::<_, ByteSlice, FacetGroupValueCodec>(wtxn, &level0_prefix)?; + + while let Some(result) = iter.next() { + let (bytes, mut value) = result?; + let previous_len = value.bitmap.len(); + value.bitmap -= to_remove; + if value.bitmap.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + modified = true; + } else if value.bitmap.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &value)? 
}; + modified = true; + } + } + } + if !modified { + return Ok(()); + } + let builder = Facets::new(index, db); + builder.execute(wtxn)?; + + Ok(()) +} #[cfg(test)] mod tests { From 39a4a0a362f4803016072b54a0cbcf88ccb3a55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 08:27:16 +0200 Subject: [PATCH 1699/1889] Reintroduce filter range search and facet extractors --- milli/src/search/facet/facet_range_search.rs | 12 +- milli/src/search/facet/filter.rs | 248 +++++------------- milli/src/update/delete_documents.rs | 10 +- .../extract/extract_facet_number_docids.rs | 13 +- .../extract/extract_facet_string_docids.rs | 40 +-- 5 files changed, 92 insertions(+), 231 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index c01346b25..75db9fda2 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -15,7 +15,7 @@ use super::get_last_facet_value; pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, @@ -48,13 +48,13 @@ where } Bound::Unbounded => Bound::Unbounded, }; - + let db = db.remap_key_type::>(); let mut docids = RoaringBitmap::new(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; - let highest_level = get_highest_level(rtxn, db, field_id)?; + let mut f = FacetRangeSearch { rtxn, db: &db, field_id, left, right, docids: &mut docids }; + let highest_level = get_highest_level(rtxn, &db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + if let Some(first_bound) = get_first_facet_value::(rtxn, &db, field_id)? { + let last_bound = get_last_facet_value::(rtxn, &db, field_id)?.unwrap(); f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; Ok(docids) } else { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index dd34abe6d..79d7f5e0f 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,22 +1,17 @@ -use std::collections::HashSet; -use std::fmt::{Debug, Display}; -use std::ops::Bound::{self, Excluded, Included}; -use std::ops::RangeBounds; - use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; -use heed::LazyDecode; use roaring::RoaringBitmap; +use std::collections::HashSet; +use std::fmt::{Debug, Display}; +use std::ops::Bound::{self, Excluded, Included}; -// use super::FacetNumberRange; use crate::error::{Error, UserError}; use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; -// use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::{ - distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, -}; +use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; + +use super::facet_range_search; /// The maximum number of filters the filter AST can process. 
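The deletion pass above subtracts the removed documents from every bitmap under the (field id, level 0) prefix, deletes entries whose bitmaps become empty, rewrites the ones that shrank, and rebuilds the upper levels only when something actually changed. The same pattern over a plain map (a sketch; Value mirrors the size and bitmap pair of FacetGroupValue):

use std::collections::BTreeMap;
use roaring::RoaringBitmap;

struct Value {
    size: u8, // always 1 for a level-0 entry
    bitmap: RoaringBitmap,
}

fn remove_docids(db: &mut BTreeMap<Vec<u8>, Value>, to_remove: &RoaringBitmap) -> bool {
    let mut modified = false;
    db.retain(|_, value| {
        let previous_len = value.bitmap.len();
        value.bitmap -= to_remove;
        if value.bitmap.len() != previous_len {
            modified = true;
        }
        // drop the entry once its bitmap is empty
        !value.bitmap.is_empty()
    });
    modified
}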
const MAX_FILTER_DEPTH: usize = 2000; @@ -147,158 +142,15 @@ impl<'a> Filter<'a> { } } -fn explore_facet_number_levels( - rtxn: &heed::RoTxn, - db: heed::Database, FacetGroupValueCodec>, - field_id: FieldId, -) { -} - impl<'a> Filter<'a> { - /// Aggregates the documents ids that are part of the specified range automatically - /// going deeper through the levels. - fn explore_facet_number_levels( - rtxn: &heed::RoTxn, - db: heed::Database, CboRoaringBitmapCodec>, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - output: &mut RoaringBitmap, - ) -> Result<()> { - // level must be > 0, I'll create a separate function for level 0 - // if level == 0 { - // call that function - //} - match (left, right) { - // If the request is an exact value we must go directly to the deepest level. - (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_number_levels( - rtxn, db, field_id, 0, left, right, output, - ); - } - // lower TO upper when lower > upper must return no result - (Included(l), Included(r)) if l > r => return Ok(()), - (Included(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Included(r)) if l >= r => return Ok(()), - (_, _) => (), - } - let range_start_key = FacetKey { - field_id, - level, - left_bound: match left { - Included(l) => l, - Excluded(l) => l, - Bound::Unbounded => f64::MIN, - }, - }; - let mut range_iter = db - .remap_data_type::>() - .range(rtxn, &(range_start_key..))?; + pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { + // to avoid doing this for each recursive call we're going to do it ONCE ahead of time + let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; - let (mut previous_facet_key, mut previous_value) = range_iter.next().unwrap()?; - while let Some(el) = range_iter.next() { - let (facet_key, value) = el?; - let range = (Included(previous_facet_key.left_bound), Excluded(facet_key.left_bound)); - // if the current range intersects with the query range, then go deeper - // what does it mean for two ranges to intersect? - let gte_left = match left { - Included(l) => previous_facet_key.left_bound >= l, - Excluded(l) => previous_facet_key.left_bound > l, // TODO: not true? - Bound::Unbounded => true, - }; - let lte_right = match right { - Included(r) => facet_key.left_bound <= r, - Excluded(r) => facet_key.left_bound < r, - Bound::Unbounded => true, - }; - } - // at this point, previous_facet_key and previous_value are the last groups in the level - // we must also check whether we should visit this group - - todo!(); - - // let mut left_found = None; - // let mut right_found = None; - - // // We must create a custom iterator to be able to iterate over the - // // requested range as the range iterator cannot express some conditions. - // let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; - - // debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - // for (i, result) in iter.enumerate() { - // let ((_fid, level, l, r), docids) = result?; - // debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - // *output |= docids; - // // We save the leftest and rightest bounds we actually found at this level. - // if i == 0 { - // left_found = Some(l); - // } - // right_found = Some(r); - // } - - // // Can we go deeper? 
- // let deeper_level = match level.checked_sub(1) { - // Some(level) => level, - // None => return Ok(()), - // }; - - // // We must refine the left and right bounds of this range by retrieving the - // // missing part in a deeper level. - // match left_found.zip(right_found) { - // Some((left_found, right_found)) => { - // // If the bound is satisfied we avoid calling this function again. - // if !matches!(left, Included(l) if l == left_found) { - // let sub_right = Excluded(left_found); - // debug!( - // "calling left with {:?} to {:?} (level {})", - // left, sub_right, deeper_level - // ); - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // left, - // sub_right, - // output, - // )?; - // } - // if !matches!(right, Included(r) if r == right_found) { - // let sub_left = Excluded(right_found); - // debug!( - // "calling right with {:?} to {:?} (level {})", - // sub_left, right, deeper_level - // ); - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // sub_left, - // right, - // output, - // )?; - // } - // } - // None => { - // // If we found nothing at this level it means that we must find - // // the same bounds but at a deeper, more precise level. - // Self::explore_facet_number_levels( - // rtxn, - // db, - // field_id, - // deeper_level, - // left, - // right, - // output, - // )?; - // } - // } - - // Ok(()) + // and finally we delete all the soft_deleted_documents, again, only once at the very end + self.inner_evaluate(rtxn, index, &filterable_fields) + .map(|result| result - soft_deleted_documents) } fn evaluate_operator( @@ -337,15 +189,15 @@ impl<'a> Filter<'a> { Some(n) => { let n = Included(n); let mut output = RoaringBitmap::new(); - // Self::explore_facet_number_levels( - // rtxn, - // numbers_db, - // field_id, - // 0, - // n, - // n, - // &mut output, - // )?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + 0, + n, + n, + &mut output, + )?; output } None => RoaringBitmap::new(), @@ -381,29 +233,53 @@ impl<'a> Filter<'a> { match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - // Self::explore_facet_number_levels( - // rtxn, - // numbers_db, - // field_id, - // level, - // left, - // right, - // &mut output, - // )?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + level, + left, + right, + &mut output, + )?; Ok(output) } None => Ok(RoaringBitmap::new()), } } - pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { - // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; - let filterable_fields = index.filterable_fields(rtxn)?; + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> Result<()> { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. 
+ (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_number_levels( + rtxn, db, field_id, 0, left, right, output, + ); + } + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + let x = facet_range_search::find_docids_of_facet_within_bounds::( + rtxn, &db, field_id, &left, &right, + )?; + // TODO: the facet range search should take a mutable roaring bitmap as argument + *output = x; - // and finally we delete all the soft_deleted_documents, again, only once at the very end - self.inner_evaluate(rtxn, index, &filterable_fields) - .map(|result| result - soft_deleted_documents) + Ok(()) } fn inner_evaluate( diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 32b2ac986..e16d98e74 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,22 +2,20 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{BytesDecode, BytesEncode, Database}; -use obkv::Key; +use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; use super::{ClearDocuments, Facets}; -use crate::error::{InternalError, SerializationError, UserError}; -// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +use crate::error::{InternalError, UserError}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - fields_ids_map, DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, - FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, + DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, + RoaringBitmapCodec, SmallString32, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index c5424a346..eece08ee3 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,6 +6,8 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::Result; @@ -31,14 +33,13 @@ pub fn extract_facet_number_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? 
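The early returns above enumerate the degenerate numeric ranges that cannot match any document. Restated as a standalone predicate (illustrative):

use std::ops::Bound::{self, Excluded, Included};

fn range_is_empty(left: Bound<f64>, right: Bound<f64>) -> bool {
    match (left, right) {
        (Included(l), Included(r)) => l > r,
        (Included(l), Excluded(r))
        | (Excluded(l), Included(r))
        | (Excluded(l), Excluded(r)) => l >= r,
        (Bound::Unbounded, _) | (_, Bound::Unbounded) => false,
    }
}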
{ - todo!() - // let (field_id, document_id, number) = - // FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + let (field_id, document_id, number) = + FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - // let key = (field_id, 0, number, number); - // // let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + let key = FacetKey { field_id, level: 0, left_bound: number }; + let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - // facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_number_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 4e655329e..51d2df923 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,13 +1,11 @@ -use std::fs::File; -use std::iter::FromIterator; -use std::{io, str}; - -use roaring::RoaringBitmap; - use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; -// use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; use crate::{FieldId, Result}; +use heed::BytesEncode; +use std::fs::File; +use std::io; /// Extracts the facet string and the documents ids where this facet string appear. /// @@ -22,38 +20,26 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - merge_cbo_roaring_bitmaps, // TODO: check + merge_cbo_roaring_bitmaps, // TODO: check that it is correct indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut key_buffer = Vec::new(); - let mut value_buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; - while let Some((key, original_value_bytes)) = cursor.move_on_next()? { + while let Some((key, _original_value_bytes)) = cursor.move_on_next()? 
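Both extractors reshape a (field_id, document_id, facet value) entry into a level-0 facet key whose left bound is the value itself, with the document id as the payload for the sorter. A sketch with the codecs elided (note that the number extractor above writes the document id in native-endian bytes while the string extractor reuses the big-endian bytes from its input key; big endian is shown here):

fn to_level0_entry(field_id: u16, document_id: u32, value_bytes: &[u8]) -> (Vec<u8>, [u8; 4]) {
    let mut key = Vec::with_capacity(3 + value_bytes.len());
    key.extend_from_slice(&field_id.to_be_bytes()); // field id, big endian
    key.push(0); // level 0
    key.extend_from_slice(value_bytes); // the left bound is the value itself
    (key, document_id.to_be_bytes())
}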
{ let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap(); - let document_id = u32::from_be_bytes(document_id_bytes); - let original_value = str::from_utf8(original_value_bytes)?; - key_buffer.clear(); - // TODO - // FacetStringLevelZeroCodec::serialize_into( - // field_id, - // str::from_utf8(normalized_value_bytes)?, - // &mut key_buffer, - // ); + let (document_id_bytes, normalized_value_bytes) = + try_split_array_at::<_, 4>(bytes).unwrap(); - value_buffer.clear(); - // TODO - // encode_prefix_string(original_value, &mut value_buffer)?; - let bitmap = RoaringBitmap::from_iter(Some(document_id)); - bitmap.serialize_into(&mut value_buffer)?; + let normalised_value = std::str::from_utf8(normalized_value_bytes)?; + let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; + let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?; + facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?; } sorter_into_reader(facet_string_docids_sorter, indexer) From bd2c0e1ab6393550d7cdd8439c9b605ff2dd7fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 08:39:01 +0200 Subject: [PATCH 1700/1889] Remove unused code --- milli/src/search/facet/incremental_update.rs | 2 -- .../src/update/index_documents/typed_chunk.rs | 25 ++++++------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/milli/src/search/facet/incremental_update.rs b/milli/src/search/facet/incremental_update.rs index a437efb2d..f01b19dab 100644 --- a/milli/src/search/facet/incremental_update.rs +++ b/milli/src/search/facet/incremental_update.rs @@ -43,8 +43,6 @@ impl<'i> IncrementalFacetUpdate<'i> { .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>(txn, &prefix.as_slice())?; if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; - let key = FacetKeyCodec::::bytes_decode(&key_bytes) - .ok_or(heed::Error::Encoding)?; Ok(( FacetKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 7a9787bdb..3c7a78d95 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -189,23 +189,14 @@ pub(crate) fn write_typed_chunk_into_index( } } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { - append_entries_into_database( - facet_id_string_docids, - &index.facet_id_string_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - |new_values, db_values, buffer| { - todo!() - // let (_, new_values) = decode_prefix_string(new_values).unwrap(); - // let new_values = RoaringBitmap::deserialize_from(new_values)?; - // let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); - // let db_values = RoaringBitmap::deserialize_from(db_values)?; - // let values = new_values | db_values; - // encode_prefix_string(db_original, buffer)?; - // Ok(values.serialize_into(buffer)?) - }, - )?; + // facet_id_string_docids contains the thing that the extractor put into it, + // so: (FacetKey { field id, level: 0, left_bound } , docids: RoaringBitmap ) + // now we need to either: + // 1. incrementally add the keys/docids pairs into the DB + // 2. 
add the keys/docids into level 0 and then call Facets::execute + // the choice of solution should be determined by their performance + // characteristics + is_merged_database = true; } TypedChunk::GeoPoints(geo_points) => { From e570c23153f4b4ce91dfd0fe80ed03802a396563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 09:36:19 +0200 Subject: [PATCH 1701/1889] Reintroduce asc/desc functionality --- milli/src/search/criteria/asc_desc.rs | 33 ++++++++++--------- milli/src/search/facet/facet_distribution.rs | 4 +-- .../search/facet/facet_distribution_iter.rs | 6 ++-- milli/src/search/facet/facet_range_search.rs | 12 +++---- .../src/search/facet/facet_sort_ascending.rs | 21 +++++------- .../src/search/facet/facet_sort_descending.rs | 31 ++++++++--------- milli/src/search/facet/filter.rs | 2 +- milli/src/search/facet/incremental_update.rs | 4 +-- milli/src/search/facet/mod.rs | 10 +++--- 9 files changed, 60 insertions(+), 63 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index bd08c54a5..a5ea9b058 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -6,7 +6,10 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::search::facet::facet_sort_ascending::ascending_facet_sort; +use crate::search::facet::facet_sort_descending::descending_facet_sort; // use crate::search::facet::FacetStringIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -186,24 +189,22 @@ fn facet_ordered<'t>( iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) } else { - todo!() - // let facet_number_fn = if is_ascending { - // FacetNumberIter::new_reducing - // } else { - // FacetNumberIter::new_reverse_reducing - // }; - // let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? - // .map(|res| res.map(|(_, docids)| docids)); + let make_iter = if is_ascending { ascending_facet_sort } else { descending_facet_sort }; - // let facet_string_fn = if is_ascending { - // FacetStringIter::new_reducing - // } else { - // FacetStringIter::new_reverse_reducing - // }; - // let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? 
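In the asc/desc rewrite below, make_iter selects the facet sort as a plain function value so both directions share the rest of the code path. The shape of that trick, reduced to a standalone example (illustrative):

fn ascending(xs: &[u32]) -> Vec<u32> {
    let mut xs = xs.to_vec();
    xs.sort_unstable();
    xs
}

fn descending(xs: &[u32]) -> Vec<u32> {
    let mut xs = xs.to_vec();
    xs.sort_unstable_by(|a, b| b.cmp(a));
    xs
}

fn sorted(xs: &[u32], is_ascending: bool) -> Vec<u32> {
    // both items coerce to the same function pointer type
    let make_sorted: fn(&[u32]) -> Vec<u32> = if is_ascending { ascending } else { descending };
    make_sorted(xs)
}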
- // .map(|res| res.map(|(_, _, docids)| docids)); + let number_iter = make_iter( + rtxn, + index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates.clone(), + )?; + let string_iter = make_iter( + rtxn, + index.facet_id_string_docids.remap_key_type::>(), + field_id, + candidates, + )?; - - // Ok(Box::new(number_iter.chain(string_iter))) + Ok(Box::new(number_iter.chain(string_iter))) } } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 670719a9b..c7619c609 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -138,7 +138,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - &self.index.facet_id_f64_docids.remap_key_type::>(), + self.index.facet_id_f64_docids.remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids| { @@ -161,7 +161,7 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - &self.index.facet_id_string_docids.remap_key_type::>(), + self.index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids| { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 9f1031a85..f347b9d7e 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -7,7 +7,7 @@ use super::{get_first_facet_value, get_highest_level}; pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: &RoaringBitmap, callback: CB, @@ -17,7 +17,7 @@ where { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; let highest_level = - get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; + get_highest_level(rtxn, db.remap_key_type::>(), field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; @@ -32,7 +32,7 @@ where CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, { rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, callback: CB, } diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 75db9fda2..b05a3c275 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -15,7 +15,7 @@ use super::get_last_facet_value; pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, @@ -50,11 +50,11 @@ where }; let db = db.remap_key_type::>(); let mut docids = RoaringBitmap::new(); - let mut f = FacetRangeSearch { rtxn, db: &db, field_id, left, right, docids: &mut docids }; - let highest_level = get_highest_level(rtxn, &db, field_id)?; + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; + let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, &db, field_id)?
{ - let last_bound = get_last_facet_value::(rtxn, &db, field_id)?.unwrap(); + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; Ok(docids) } else { @@ -65,7 +65,7 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 73491d4ae..e4b77c691 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -1,24 +1,19 @@ use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; -use crate::Result; +use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, -) -> Result> + 't>> { - let highest_level = - get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; - if let Some(first_bound) = get_first_facet_value::( - rtxn, - &db.remap_key_type::>(), - field_id, - )? { +) -> Result> + 't>> { + let highest_level = get_highest_level(rtxn, db, field_id)?; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); @@ -30,7 +25,7 @@ pub fn ascending_facet_sort<'t>( struct AscendingFacetSort<'t, 'e> { rtxn: &'t heed::RoTxn<'e>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, @@ -39,7 +34,7 @@ struct AscendingFacetSort<'t, 'e> { } impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { - type Item = Result<(&'t [u8], RoaringBitmap)>; + type Item = Result; fn next(&mut self) -> Option { 'outer: loop { @@ -67,7 +62,7 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { *documents_ids -= &bitmap; if level == 0 { - return Some(Ok((left_bound, bitmap))); + return Some(Ok(bitmap)); } let starting_key_below = FacetKey { field_id: self.field_id, level: level - 1, left_bound }; diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 81b0eb09d..fc62b894f 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -3,17 +3,17 @@ use std::ops::Bound; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; -use crate::Result; +use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -fn descending_facet_sort<'t>( +pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, -) -> Result> + 't>> { +) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; if let 
Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; @@ -33,7 +33,7 @@ fn descending_facet_sort<'t>( struct DescendingFacetSort<'t> { rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, @@ -43,7 +43,7 @@ struct DescendingFacetSort<'t> { } impl<'t> Iterator for DescendingFacetSort<'t> { - type Item = Result<(&'t [u8], RoaringBitmap)>; + type Item = Result; fn next(&mut self) -> Option { 'outer: loop { @@ -70,7 +70,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { *documents_ids -= &bitmap; if level == 0 { - return Some(Ok((left_bound, bitmap))); + return Some(Ok(bitmap)); } let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; @@ -89,14 +89,15 @@ impl<'t> Iterator for DescendingFacetSort<'t> { }; let prev_right_bound = *right_bound; *right_bound = Bound::Excluded(left_bound); - let iter = match self.db.rev_range( - &self.rtxn, - &(Bound::Included(starting_key_below), end_key_kelow), - ) { - Ok(iter) => iter, - Err(e) => return Some(Err(e.into())), - } - .take(group_size as usize); + let iter = + match self.db.remap_key_type::>().rev_range( + &self.rtxn, + &(Bound::Included(starting_key_below), end_key_kelow), + ) { + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); self.stack.push((bitmap, iter, prev_right_bound)); continue 'outer; diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 79d7f5e0f..6ec626a5c 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -274,7 +274,7 @@ impl<'a> Filter<'a> { (_, _) => (), } let x = facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, &db, field_id, &left, &right, + rtxn, db, field_id, &left, &right, )?; // TODO: the facet range search should take a mutable roaring bitmap as argument *output = x; diff --git a/milli/src/search/facet/incremental_update.rs b/milli/src/search/facet/incremental_update.rs index f01b19dab..fd4e1eeb5 100644 --- a/milli/src/search/facet/incremental_update.rs +++ b/milli/src/search/facet/incremental_update.rs @@ -264,7 +264,7 @@ impl<'i> IncrementalFacetUpdate<'i> { } let group_size = self.group_size; - let highest_level = get_highest_level(&txn, &self.db, field_id)?; + let highest_level = get_highest_level(&txn, *self.db, field_id)?; let result = self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; @@ -413,7 +413,7 @@ impl<'i> IncrementalFacetUpdate<'i> { if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() { return Ok(()); } - let highest_level = get_highest_level(&txn, &self.db, field_id)?; + let highest_level = get_highest_level(&txn, *self.db, field_id)?; // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 023d433ad..8405c0141 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -11,14 +11,14 @@ pub use self::filter::Filter; mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; -mod facet_sort_ascending; -mod facet_sort_descending; +pub mod facet_sort_ascending; +pub mod facet_sort_descending; mod filter; mod incremental_update; pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, 
FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -40,7 +40,7 @@ where } pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -63,7 +63,7 @@ where } pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); From fb8d23deb3690e412217a59b43e16c34b4bfb938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 12:53:53 +0200 Subject: [PATCH 1702/1889] Reintroduce db_snap! for facet databases --- milli/src/snapshot_tests.rs | 55 ++++++++++--------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 4031c9b06..17ee3f392 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -6,6 +6,7 @@ use heed::types::ByteSlice; use heed::BytesDecode; use roaring::RoaringBitmap; +use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; @@ -229,48 +230,22 @@ pub fn snap_word_prefix_position_docids(index: &Index) -> String { snap } pub fn snap_facet_id_f64_docids(index: &Index) -> String { - todo!() - // let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( - // (facet_id, level, left, right), - // b, - // )| { - // &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) - // }); - // snap + let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( + FacetKey { field_id, level, left_bound }, + FacetGroupValue { size, bitmap }, + )| { + &format!("{field_id:<3} {level:<2} {left_bound:<6} {size:<2} {}", display_bitmap(&bitmap)) + }); + snap } pub fn snap_facet_id_string_docids(index: &Index) -> String { - todo!() - // let rtxn = index.read_txn().unwrap(); - // let bytes_db = index.facet_id_string_docids.remap_types::(); - // let iter = bytes_db.iter(&rtxn).unwrap(); - // let mut snap = String::new(); - - // for x in iter { - // let (key, value) = x.unwrap(); - // if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { - // let (orig_string, docids) = - // FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); - // snap.push_str(&format!( - // "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", - // display_bitmap(&docids) - // )); - // } else if let Some((field_id, level, left, right)) = - // FacetLevelValueU32Codec::bytes_decode(key) - // { - // snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); - // let (bounds, docids) = - // FacetStringZeroBoundsValueCodec::::bytes_decode(value) - // .unwrap(); - // if let Some((left, right)) = bounds { - // snap.push_str(&format!("{left:<8} {right:<8} ")); - // } - // snap.push_str(&display_bitmap(&docids)); - // snap.push('\n'); - // } else { - // panic!(); - // } - // } - // snap + let snap = make_db_snap_from_iter!(index, facet_id_string_docids, |( + FacetKey { field_id, level, left_bound }, + FacetGroupValue { size, bitmap }, + )| { + &format!("{field_id:<3} {level:<2} {left_bound:<12} {size:<2} {}", display_bitmap(&bitmap)) + }); + snap } pub fn snap_documents_ids(index: &Index) -> String { let rtxn = 
index.read_txn().unwrap(); From e8a156d68287db90841109328f8dd3ba70f10433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 13:03:36 +0200 Subject: [PATCH 1703/1889] Reorganise facets database indexing code --- http-ui/src/main.rs | 1 + milli/src/search/facet/mod.rs | 3 -- milli/src/search/mod.rs | 2 +- milli/src/update/delete_documents.rs | 4 +- milli/src/update/{facets.rs => facet/bulk.rs} | 37 +++++++++---------- .../facet/incremental.rs} | 11 +++--- milli/src/update/facet/mod.rs | 2 + milli/src/update/index_documents/mod.rs | 6 +-- milli/src/update/mod.rs | 4 +- 9 files changed, 33 insertions(+), 37 deletions(-) create mode 100644 http-ui/src/main.rs rename milli/src/update/{facets.rs => facet/bulk.rs} (97%) rename milli/src/{search/facet/incremental_update.rs => update/facet/incremental.rs} (98%) create mode 100644 milli/src/update/facet/mod.rs diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/http-ui/src/main.rs @@ -0,0 +1 @@ + diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 8405c0141..12074cc12 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -4,8 +4,6 @@ use heed::{BytesDecode, RoTxn}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; -// pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; -// pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; mod facet_distribution; @@ -14,7 +12,6 @@ mod facet_range_search; pub mod facet_sort_ascending; pub mod facet_sort_descending; mod filter; -mod incremental_update; pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index d05e807df..e6651737c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -32,7 +32,7 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); mod criteria; mod distinct; -mod facet; +pub mod facet; mod fst_utils; mod matches; mod query_tree; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e16d98e74..1d1745d82 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; -use super::{ClearDocuments, Facets}; +use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; @@ -643,7 +643,7 @@ fn remove_docids_from_facet_id_docids<'a>( if !modified { return Ok(()); } - let builder = Facets::new(index, db); + let builder = FacetsUpdateBulk::new(index, db); builder.execute(wtxn)?; Ok(()) diff --git a/milli/src/update/facets.rs b/milli/src/update/facet/bulk.rs similarity index 97% rename from milli/src/update/facets.rs rename to milli/src/update/facet/bulk.rs index fe8c2855e..587dc95ab 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,23 +1,20 @@ -use std::cmp; -use std::fs::File; -use std::num::NonZeroUsize; - +use crate::error::InternalError; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; +use 
crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; +use crate::{FieldId, Index, Result}; use grenad::CompressionType; use heed::types::ByteSlice; use heed::{BytesEncode, Error, RoTxn}; use log::debug; use roaring::RoaringBitmap; +use std::cmp; +use std::fs::File; +use std::num::NonZeroUsize; use time::OffsetDateTime; -use crate::error::InternalError; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, -}; -// use crate::heed_codec::CboRoaringBitmapCodec; -use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; -use crate::{FieldId, Index, Result}; - -pub struct Facets<'i> { +pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, pub(crate) chunk_compression_type: CompressionType, @@ -26,12 +23,12 @@ pub struct Facets<'i> { min_level_size: usize, } -impl<'i> Facets<'i> { +impl<'i> FacetsUpdateBulk<'i> { pub fn new( index: &'i Index, database: heed::Database, FacetGroupValueCodec>, - ) -> Facets<'i> { - Facets { + ) -> FacetsUpdateBulk<'i> { + FacetsUpdateBulk { index, database, chunk_compression_type: CompressionType::None, @@ -63,7 +60,7 @@ impl<'i> Facets<'i> { Ok(()) } - #[logging_timer::time("Facets::{}")] + #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // We get the faceted fields to be able to create the facet levels. @@ -105,7 +102,7 @@ impl<'i> Facets<'i> { field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { - let algo = CreateFacetsAlgo { + let algo = FacetsUpdateBulkAlgorithm { rtxn: txn, db: &self.database, field_id, @@ -129,7 +126,7 @@ impl<'i> Facets<'i> { } } -pub struct CreateFacetsAlgo<'t> { +pub struct FacetsUpdateBulkAlgorithm<'t> { rtxn: &'t heed::RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, chunk_compression_type: CompressionType, @@ -138,7 +135,7 @@ pub struct CreateFacetsAlgo<'t> { level_group_size: usize, min_level_size: usize, } -impl<'t> CreateFacetsAlgo<'t> { +impl<'t> FacetsUpdateBulkAlgorithm<'t> { fn read_level_0( &self, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, diff --git a/milli/src/search/facet/incremental_update.rs b/milli/src/update/facet/incremental.rs similarity index 98% rename from milli/src/search/facet/incremental_update.rs rename to milli/src/update/facet/incremental.rs index fd4e1eeb5..d2fb3755f 100644 --- a/milli/src/search/facet/incremental_update.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,13 +1,12 @@ use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; +use crate::search::facet::get_highest_level; use crate::Result; use heed::Error; use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use super::get_highest_level; - enum InsertionResult { InPlace, Insert, @@ -18,14 +17,14 @@ enum DeletionResult { Remove { prev: Option>, next: Option> }, } -struct IncrementalFacetUpdate<'i> { - db: &'i heed::Database, FacetGroupValueCodec>, +struct FacetUpdateIncremental { + db: heed::Database, FacetGroupValueCodec>, group_size: usize, min_level_size: usize, max_group_size: usize, } -impl<'i> IncrementalFacetUpdate<'i> { - fn find_insertion_key_value<'a>( +impl FacetUpdateIncremental { + fn find_insertion_key_value( &self, field_id: u16, level: u8, diff --git 
a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs new file mode 100644 index 000000000..ecde3a248 --- /dev/null +++ b/milli/src/update/facet/mod.rs @@ -0,0 +1,2 @@ +pub mod bulk; +pub mod incremental; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5a9066eba..be9b1e3c5 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -37,8 +37,8 @@ use crate::error::UserError; use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, - WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, + self, FacetsUpdateBulk, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, + PrefixWordPairsProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -436,7 +436,7 @@ where (&self.index.facet_id_string_docids).remap_key_type::>(), (&self.index.facet_id_f64_docids).remap_key_type::>(), ] { - let mut builder = Facets::new(self.index, facet_db); + let mut builder = FacetsUpdateBulk::new(self.index, facet_db); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; builder.chunk_compression_level = self.indexer_config.chunk_compression_level; if let Some(value) = self.config.facet_level_group_size { diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 3ddc01cef..cd96d3e88 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,7 +1,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; -pub use self::facets::Facets; +pub use self::facet::bulk::FacetsUpdateBulk; pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; @@ -16,7 +16,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; mod delete_documents; -mod facets; +mod facet; mod index_documents; mod indexer_config; mod prefix_word_pairs; From d30c89e3451dd22b9b507e877cbea8b7473ff145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 14:19:52 +0200 Subject: [PATCH 1704/1889] Fix compile error+warnings in new tests --- .../search/facet/facet_distribution_iter.rs | 22 ++++++++-------- milli/src/search/facet/facet_range_search.rs | 25 ++++++++++--------- .../src/search/facet/facet_sort_ascending.rs | 15 ++++++----- .../src/search/facet/facet_sort_descending.rs | 18 ++++++------- milli/src/search/facet/mod.rs | 16 ++++++++++-- milli/src/snapshot_tests.rs | 11 +++----- milli/src/update/facet/incremental.rs | 13 +++++++--- milli/src/update/mod.rs | 1 + 8 files changed, 67 insertions(+), 54 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index f347b9d7e..16b83c2db 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -109,7 +109,7 @@ where #[cfg(test)] mod tests { use heed::BytesDecode; - use rand::{rngs::SmallRng, Rng, SeedableRng}; + use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; use std::ops::ControlFlow; @@ -125,7 +125,7 @@ mod tests { for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i 
as u32); - index.insert(&mut txn, 0, &i, &bitmap); + index.insert(&mut txn, 0, &(i as f64), &bitmap); } txn.commit().unwrap(); index @@ -134,14 +134,14 @@ mod tests { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let keys = std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); - bitmap.insert(key + 100.); + bitmap.insert(key + 100); index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); @@ -156,13 +156,13 @@ mod tests { #[test] fn filter_distribution_all() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); iterate_over_facet_distribution( &txn, - &index.db.content, + index.db.content, 0, &candidates, |facet, count| { @@ -170,7 +170,8 @@ mod tests { results.push_str(&format!("{facet}: {count}\n")); ControlFlow::Continue(()) }, - ); + ) + .unwrap(); insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results); txn.commit().unwrap(); @@ -179,14 +180,14 @@ mod tests { #[test] fn filter_distribution_all_stop_early() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); let mut nbr_facets = 0; iterate_over_facet_distribution( &txn, - &index.db.content, + index.db.content, 0, &candidates, |facet, count| { @@ -200,7 +201,8 @@ mod tests { ControlFlow::Continue(()) } }, - ); + ) + .unwrap(); insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results); txn.commit().unwrap(); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index b05a3c275..7e7c5e713 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -259,8 +259,9 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { use crate::{ - heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, - search::facet::test::FacetIndex, snapshot_tests::display_bitmap, + heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec}, + search::facet::test::FacetIndex, + snapshot_tests::display_bitmap, }; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; @@ -283,7 +284,7 @@ mod tests { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let keys = std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); @@ -305,7 +306,7 @@ mod tests { #[test] fn filter_range_increasing() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let mut results = String::new(); for i in 0..=255 { @@ -314,7 +315,7 @@ mod tests { let end = Bound::Included(i); let docids = 
find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -333,7 +334,7 @@ mod tests { let end = Bound::Excluded(i); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -351,7 +352,7 @@ mod tests { #[test] fn filter_range_decreasing() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let mut results = String::new(); @@ -362,7 +363,7 @@ mod tests { let end = Bound::Included(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -384,7 +385,7 @@ mod tests { let end = Bound::Excluded(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -404,7 +405,7 @@ mod tests { #[test] fn filter_range_pinch() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let mut results = String::new(); @@ -415,7 +416,7 @@ mod tests { let end = Bound::Included(255. - i); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, @@ -434,7 +435,7 @@ mod tests { let end = Bound::Excluded(255. - i); let docids = find_docids_of_facet_within_bounds::( &txn, - &index.db.content, + index.db.content.remap_key_type::>(), 0, &start, &end, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index e4b77c691..8af191089 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -83,7 +83,6 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use heed::BytesDecode; use rand::Rng; use rand::SeedableRng; use roaring::RoaringBitmap; @@ -100,7 +99,7 @@ mod tests { for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); - index.insert(&mut txn, 0, &i, &bitmap); + index.insert(&mut txn, 0, &(i as f64), &bitmap); } txn.commit().unwrap(); index @@ -109,7 +108,7 @@ mod tests { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let keys = std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); @@ -131,14 +130,14 @@ mod tests { #[test] fn filter_sort() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates); - for (facet, docids) in iter { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + let iter = ascending_facet_sort(&txn, index.db.content, 0, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); } 
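// [Editor's note] A minimal sketch of the data model every test in this
// series walks, pieced together from the FacetKey { field_id, level,
// left_bound } and FacetGroupValue { size, bitmap } destructurings visible
// in snapshot_tests.rs above. The combined struct below is illustrative
// only, not a type from the crate: level 0 holds one entry per facet
// value, and each higher level groups consecutive entries of the level
// below under their smallest bound, storing the union of their docids, so
// sorts and range searches can take or skip a whole group at once.
struct SketchFacetEntry<'a> {
    field_id: u16,                  // which faceted field the entry belongs to
    level: u8,                      // 0 = leaf values, >0 = grouping levels
    left_bound: &'a [u8],           // smallest encoded bound covered here
    size: u8,                       // number of level-below entries grouped
    bitmap: roaring::RoaringBitmap, // union of the grouped document ids
}
// Read this way, a snapshot line such as `3 0 first 1 [1, ]` says:
// field 3, level 0, bound "first", a group of 1, containing document 1.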
insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index fc62b894f..5ce55ec6d 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -111,8 +111,6 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - - use heed::BytesDecode; use rand::Rng; use rand::SeedableRng; use roaring::RoaringBitmap; @@ -129,7 +127,7 @@ mod tests { for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); - index.insert(&mut txn, 0, &i, &bitmap); + index.insert(&mut txn, 0, &(i as f64), &bitmap); } txn.commit().unwrap(); index @@ -138,14 +136,14 @@ mod tests { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); - let rng = rand::rngs::SmallRng::from_seed([0; 32]); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); let keys = std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); - bitmap.insert(key + 100.); + bitmap.insert(key + 100); index.insert(&mut txn, 0, &(key as f64), &bitmap); } txn.commit().unwrap(); @@ -160,15 +158,15 @@ mod tests { #[test] fn filter_sort_descending() { let indexes = [get_simple_index(), get_random_looking_index()]; - for (i, index) in indexes.into_iter().enumerate() { + for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); let db = index.db.content.remap_key_type::>(); - let iter = descending_facet_sort(&txn, &db, 0, candidates); - for (facet, docids) in iter { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); } insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 12074cc12..2ca6c0689 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -78,9 +78,10 @@ pub(crate) fn get_highest_level<'t>( #[cfg(test)] mod test { + use crate::update::FacetsUpdateIncremental; + use heed::{BytesDecode, BytesEncode, Env, RwTxn}; + use roaring::RoaringBitmap; use std::{fmt::Display, marker::PhantomData, rc::Rc}; - - use heed::{BytesDecode, BytesEncode, Env}; use tempfile::TempDir; use crate::{ @@ -148,6 +149,17 @@ mod test { _phantom: PhantomData, } } + pub fn insert<'a>( + &self, + rwtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncremental::new(self.db.content); + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); + } } impl Display for FacetIndex diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 17ee3f392..c6b83eeb6 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -1,15 +1,10 @@ +use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; +use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; +use roaring::RoaringBitmap; use std::borrow::Cow; use std::fmt::Write; use std::path::Path; -use 
heed::types::ByteSlice; -use heed::BytesDecode; -use roaring::RoaringBitmap; - -use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; -use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; -use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; - #[track_caller] pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings { let mut settings = insta::Settings::clone_current(); diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index d2fb3755f..df0b93839 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -17,13 +17,18 @@ enum DeletionResult { Remove { prev: Option>, next: Option> }, } -struct FacetUpdateIncremental { +pub struct FacetsUpdateIncremental { db: heed::Database, FacetGroupValueCodec>, group_size: usize, min_level_size: usize, max_group_size: usize, } -impl FacetUpdateIncremental { +impl FacetsUpdateIncremental { + pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { + Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } + } +} +impl FacetsUpdateIncremental { fn find_insertion_key_value( &self, field_id: u16, @@ -263,7 +268,7 @@ impl FacetUpdateIncremental { } let group_size = self.group_size; - let highest_level = get_highest_level(&txn, *self.db, field_id)?; + let highest_level = get_highest_level(&txn, self.db, field_id)?; let result = self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; @@ -412,7 +417,7 @@ impl FacetUpdateIncremental { if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() { return Ok(()); } - let highest_level = get_highest_level(&txn, *self.db, field_id)?; + let highest_level = get_highest_level(&txn, self.db, field_id)?; // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index cd96d3e88..8fba16d3d 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,6 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; +pub use self::facet::incremental::FacetsUpdateIncremental; pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; From 85824ee203a3f6c99a0335c9a11c275cb6dc37f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 08:17:27 +0200 Subject: [PATCH 1705/1889] Try to make facet indexing incremental --- milli/src/search/facet/facet_range_search.rs | 2 +- milli/src/update/delete_documents.rs | 7 +++- milli/src/update/facet/bulk.rs | 18 ++++++-- .../extract/extract_facet_string_docids.rs | 4 ++ milli/src/update/index_documents/mod.rs | 18 -------- .../src/update/index_documents/typed_chunk.rs | 41 ++++++++++++++----- 6 files changed, 55 insertions(+), 35 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 7e7c5e713..523b3853c 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -138,7 +138,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let should_skip = { match self.left { Bound::Included(left) => left >= next_key.left_bound, - Bound::Excluded(left) => left >= next_key.left_bound, // TODO: use > instead? 
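// [Editor's note] The TODO above asks whether `>` should replace `>=`; the
// line re-added just below keeps `>=` and drops the comment. A
// self-contained sketch of why `>=` is safe, under the assumption (implied
// by the traversal) that a group covers [its own left bound, the next
// group's left bound), so every value in it is strictly smaller than
// `next_key.left_bound`. Names below are illustrative, not from the crate.
use std::ops::Bound;

fn group_entirely_before_left(left: Bound<u8>, next_group_left: u8) -> bool {
    match left {
        // group values are < next_group_left <= l, hence < l:
        // all of them fall outside [l, ..] and outside (l, ..]
        Bound::Included(l) => l >= next_group_left,
        Bound::Excluded(l) => l >= next_group_left,
        Bound::Unbounded => false,
    }
}

fn main() {
    // An excluded left bound equal to the next group's start already rules
    // out the whole group, which is exactly the case `>` would fail to skip.
    assert!(group_entirely_before_left(Bound::Excluded(5), 5));
    assert!(!group_entirely_before_left(Bound::Included(4), 5));
}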
+ Bound::Excluded(left) => left >= next_key.left_bound, Bound::Unbounded => false, } }; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 1d1745d82..bb18ed80f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::Database; +use heed::{Database, RwTxn}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -446,6 +446,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { facet_id_f64_docids.remap_key_type::>(), &self.to_delete_docids, fields_ids_map.clone(), + Index::put_number_faceted_documents_ids, )?; remove_docids_from_facet_id_docids( self.wtxn, @@ -453,6 +454,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { facet_id_string_docids.remap_key_type::>(), &self.to_delete_docids, fields_ids_map.clone(), + Index::put_string_faceted_documents_ids, )?; // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_id_exists_docids( @@ -614,6 +616,7 @@ fn remove_docids_from_facet_id_docids<'a>( db: heed::Database, FacetGroupValueCodec>, to_remove: &RoaringBitmap, fields_ids_map: FieldsIdsMap, + put_faceted_docids_in_main: fn(&Index, &mut RwTxn, FieldId, &RoaringBitmap) -> heed::Result<()>, ) -> Result<()> { let mut modified = false; for field_id in fields_ids_map.ids() { @@ -643,7 +646,7 @@ fn remove_docids_from_facet_id_docids<'a>( if !modified { return Ok(()); } - let builder = FacetsUpdateBulk::new(index, db); + let builder = FacetsUpdateBulk::new(index, db, put_faceted_docids_in_main); builder.execute(wtxn)?; Ok(()) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 587dc95ab..b3e932dc2 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -6,7 +6,7 @@ use crate::update::index_documents::{create_writer, write_into_lmdb_database, wr use crate::{FieldId, Index, Result}; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn}; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; use log::debug; use roaring::RoaringBitmap; use std::cmp; @@ -21,12 +21,19 @@ pub struct FacetsUpdateBulk<'i> { pub(crate) chunk_compression_level: Option, level_group_size: usize, min_level_size: usize, + put_faceted_docids_in_main: fn(&Index, &mut RwTxn, FieldId, &RoaringBitmap) -> heed::Result<()>, } impl<'i> FacetsUpdateBulk<'i> { pub fn new( index: &'i Index, database: heed::Database, FacetGroupValueCodec>, + put_faceted_docids_in_main: fn( + &Index, + &mut RwTxn, + FieldId, + &RoaringBitmap, + ) -> heed::Result<()>, ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, @@ -35,6 +42,7 @@ impl<'i> FacetsUpdateBulk<'i> { chunk_compression_level: None, level_group_size: 4, min_level_size: 5, + put_faceted_docids_in_main, } } @@ -78,8 +86,12 @@ impl<'i> FacetsUpdateBulk<'i> { let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &nested_wtxn)?; - // TODO: this will need to be an argument to Facets as well - self.index.put_string_faceted_documents_ids(&mut nested_wtxn, field_id, &all_docids)?; + (self.put_faceted_docids_in_main)( + &self.index, + &mut nested_wtxn, + field_id, + &all_docids, + )?; for level_reader in level_readers { // TODO: append instead of write with merge diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs 
b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 51d2df923..0bb83c29a 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -32,6 +32,10 @@ pub fn extract_facet_string_docids( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + // document_id_bytes is a big-endian u32 + // merge_cbo_roaring_bitmap works with native endian u32s + // that is a problem, I think + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index be9b1e3c5..1ab1bd38d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -34,7 +34,6 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; -use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, FacetsUpdateBulk, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, @@ -431,23 +430,6 @@ where // Merged databases are already been indexed, we start from this count; let mut databases_seen = MERGED_DATABASE_COUNT; - // Run the facets update operation. - for facet_db in [ - (&self.index.facet_id_string_docids).remap_key_type::>(), - (&self.index.facet_id_f64_docids).remap_key_type::>(), - ] { - let mut builder = FacetsUpdateBulk::new(self.index, facet_db); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - if let Some(value) = self.config.facet_level_group_size { - builder.level_group_size(value); - } - if let Some(value) = self.config.facet_min_level_size { - builder.min_level_size(value); - } - builder.execute(self.wtxn)?; - } - databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 3c7a78d95..7aa306183 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -13,7 +13,9 @@ use super::helpers::{ valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; +use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::update::index_documents::helpers::as_cloneable_grenad; +use crate::update::FacetsUpdateIncremental; use crate::{ lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, @@ -146,6 +148,34 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { + // merge cbo roaring bitmaps is not the correct merger because the data in the DB + // is FacetGroupValue and not RoaringBitmap + // so I need to create my own merging function + + // facet_id_string_docids is encoded as: + // key: FacetKeyCodec + // value: CboRoaringBitmapCodec + // basically + + // TODO: a condition saying "if I have more than 1/50th of the DB to add, + // then I do it in bulk, otherwise I do it incrementally". 
But instead of 1/50, + // it is a ratio I determine empirically + + // for now I only do it incrementally, to see if things work + let builder = FacetsUpdateIncremental::new( + index.facet_id_string_docids.remap_key_type::>(), + ); + let mut cursor = facet_id_string_docids.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let key = + FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let value = + CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + builder.insert(wtxn, key.field_id, key.left_bound, &value)?; + } + is_merged_database = true; + } TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { append_entries_into_database( facet_id_exists_docids, @@ -188,17 +218,6 @@ pub(crate) fn write_typed_chunk_into_index( } } } - TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { - // facet_id_string_docids contains the thing that the extractor put into it, - // so: (FacetKey { field id, level: 0, left_bound } , docids: RoaringBitmap ) - // now we need to either: - // 1. incrementally add the keys/docids pairs into the DB - // 2. add the keys/docids into level 0 and then call Facets::execute - // the choice of solution should be determined by their performance - // characteristics - - is_merged_database = true; - } TypedChunk::GeoPoints(geo_points) => { let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; From 68cbcdf08b860ce42458ffc0868a00086696b10b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 08:34:26 +0200 Subject: [PATCH 1706/1889] Fix compile errors/warnings in http-ui and infos --- milli/src/search/facet/mod.rs | 4 +-- milli/src/update/delete_documents.rs | 46 +++++++++++++++------------- milli/src/update/facet/bulk.rs | 38 +++++++++++------------ 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 2ca6c0689..b03302ca1 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -82,7 +82,6 @@ mod test { use heed::{BytesDecode, BytesEncode, Env, RwTxn}; use roaring::RoaringBitmap; use std::{fmt::Display, marker::PhantomData, rc::Rc}; - use tempfile::TempDir; use crate::{ heed_codec::facet::new::{ @@ -113,8 +112,9 @@ mod test { for<'a> BoundCodec: BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, { + #[cfg(all(test, fuzzing))] pub fn open_from_tempdir( - tempdir: Rc, + tempdir: Rc, group_size: u8, max_group_size: u8, ) -> FacetIndex { diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index bb18ed80f..531fd2b74 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; use heed::types::{ByteSlice, Str}; -use heed::{Database, RwTxn}; +use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -10,6 +10,7 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; +use crate::facet::FacetType; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; @@ -185,9 +186,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, - 
facet_id_f64_docids, + facet_id_f64_docids: _, facet_id_exists_docids, - facet_id_string_docids, + facet_id_string_docids: _, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -440,22 +441,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; } - remove_docids_from_facet_id_docids( - self.wtxn, - self.index, - facet_id_f64_docids.remap_key_type::>(), - &self.to_delete_docids, - fields_ids_map.clone(), - Index::put_number_faceted_documents_ids, - )?; - remove_docids_from_facet_id_docids( - self.wtxn, - self.index, - facet_id_string_docids.remap_key_type::>(), - &self.to_delete_docids, - fields_ids_map.clone(), - Index::put_string_faceted_documents_ids, - )?; + for facet_type in [FacetType::Number, FacetType::String] { + remove_docids_from_facet_id_docids( + self.wtxn, + self.index, + &self.to_delete_docids, + fields_ids_map.clone(), + facet_type, + )?; + } + // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_id_exists_docids( self.wtxn, @@ -613,11 +608,18 @@ where fn remove_docids_from_facet_id_docids<'a>( wtxn: &'a mut heed::RwTxn, index: &Index, - db: heed::Database, FacetGroupValueCodec>, to_remove: &RoaringBitmap, fields_ids_map: FieldsIdsMap, - put_faceted_docids_in_main: fn(&Index, &mut RwTxn, FieldId, &RoaringBitmap) -> heed::Result<()>, + facet_type: FacetType, ) -> Result<()> { + let db = match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; let mut modified = false; for field_id in fields_ids_map.ids() { let mut level0_prefix = vec![]; @@ -646,7 +648,7 @@ fn remove_docids_from_facet_id_docids<'a>( if !modified { return Ok(()); } - let builder = FacetsUpdateBulk::new(index, db, put_faceted_docids_in_main); + let builder = FacetsUpdateBulk::new(index, facet_type); builder.execute(wtxn)?; Ok(()) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index b3e932dc2..b8acffbaf 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,4 +1,5 @@ use crate::error::InternalError; +use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; @@ -6,7 +7,7 @@ use crate::update::index_documents::{create_writer, write_into_lmdb_database, wr use crate::{FieldId, Index, Result}; use grenad::CompressionType; use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use heed::{BytesEncode, Error, RoTxn}; use log::debug; use roaring::RoaringBitmap; use std::cmp; @@ -21,28 +22,26 @@ pub struct FacetsUpdateBulk<'i> { pub(crate) chunk_compression_level: Option, level_group_size: usize, min_level_size: usize, - put_faceted_docids_in_main: fn(&Index, &mut RwTxn, FieldId, &RoaringBitmap) -> heed::Result<()>, + facet_type: FacetType, } impl<'i> FacetsUpdateBulk<'i> { - pub fn new( - index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, - put_faceted_docids_in_main: fn( - &Index, - &mut RwTxn, - FieldId, - &RoaringBitmap, - ) -> heed::Result<()>, - ) -> FacetsUpdateBulk<'i> { + pub fn new(index: &'i Index, facet_type: FacetType) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, - database, + database: match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } 
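// [Editor's note] Both arms above can collapse to one byte-slice-keyed
// view because `remap_key_type` only swaps the codec type parameter; the
// bytes stored in LMDB are untouched. A sketch of the shared physical key
// layout, an assumption consistent with the `field_id.to_be_bytes()`
// prefixes built elsewhere in this series (the helper is illustrative):
fn sketch_facet_key_bytes(field_id: u16, level: u8, bound: &[u8]) -> Vec<u8> {
    let mut key = Vec::with_capacity(2 + 1 + bound.len());
    key.extend_from_slice(&field_id.to_be_bytes()); // group by field first
    key.push(level);                                // then by tree level
    key.extend_from_slice(bound);                   // then by encoded bound
    key
}
// The same observation is what lets these patches pass `heed::Database` by
// value everywhere: the handle is a small Copy type, so copying the view
// costs nothing and the `&'t` lifetimes removed above were never needed.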
+ }, chunk_compression_type: CompressionType::None, chunk_compression_level: None, level_group_size: 4, min_level_size: 5, - put_faceted_docids_in_main, + facet_type, } } @@ -86,12 +85,11 @@ impl<'i> FacetsUpdateBulk<'i> { let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &nested_wtxn)?; - (self.put_faceted_docids_in_main)( - &self.index, - &mut nested_wtxn, - field_id, - &all_docids, - )?; + let put_docids_fn = match self.facet_type { + FacetType::Number => Index::put_number_faceted_documents_ids, + FacetType::String => Index::put_string_faceted_documents_ids, + }; + put_docids_fn(&self.index, &mut nested_wtxn, field_id, &all_docids)?; for level_reader in level_readers { // TODO: append instead of write with merge From 61252248fb991557cce1e0de25e4dfe13ff00388 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 09:51:43 +0200 Subject: [PATCH 1707/1889] Fix some facet indexing bugs --- milli/src/search/facet/mod.rs | 2 +- milli/src/snapshot_tests.rs | 12 ++++ milli/src/update/facet/incremental.rs | 5 +- .../extract/extract_facet_string_docids.rs | 3 +- milli/src/update/index_documents/mod.rs | 49 ++++++++++++++- .../src/update/index_documents/typed_chunk.rs | 62 +++++++++++++++---- 6 files changed, 115 insertions(+), 18 deletions(-) diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index b03302ca1..0ed80dd92 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -139,7 +139,7 @@ mod test { let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 100); - let tempdir = tempfile::TempDir::new_in("databases/").unwrap(); + let tempdir = tempfile::TempDir::new().unwrap(); let env = options.open(tempdir.path()).unwrap(); let content = env.create_database(None).unwrap(); diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index c6b83eeb6..933f68837 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -242,6 +242,15 @@ pub fn snap_facet_id_string_docids(index: &Index) -> String { }); snap } +pub fn snap_field_id_docid_facet_strings(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |( + (field_id, doc_id, string), + other_string, + )| { + &format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}") + }); + snap +} pub fn snap_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let documents_ids = index.documents_ids(&rtxn).unwrap(); @@ -423,6 +432,9 @@ macro_rules! 
full_snap_of_db { ($index:ident, facet_id_string_docids) => {{ $crate::snapshot_tests::snap_facet_id_string_docids(&$index) }}; + ($index:ident, field_id_docid_facet_strings) => {{ + $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) + }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) }}; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index df0b93839..a0d426d7a 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,8 +1,9 @@ +use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; use crate::search::facet::get_highest_level; -use crate::Result; +use crate::{Index, Result}; use heed::Error; use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; use roaring::RoaringBitmap; @@ -287,7 +288,7 @@ impl FacetsUpdateIncremental { .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? .count(); - if size_highest_level < self.min_level_size { + if size_highest_level < self.group_size * self.min_level_size { return Ok(()); } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 0bb83c29a..fe42801e7 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -38,12 +38,13 @@ pub fn extract_facet_string_docids( let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); let normalised_value = std::str::from_utf8(normalized_value_bytes)?; let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); - facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?; + facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; } sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 1ab1bd38d..2a2511362 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -592,7 +592,7 @@ mod tests { use crate::index::tests::TempIndex; use crate::search::TermsMatchingStrategy; use crate::update::DeleteDocuments; - use crate::BEU16; + use crate::{db_snap, BEU16}; #[test] fn simple_document_replacement() { @@ -1379,6 +1379,25 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let hidden = index.faceted_fields(&rtxn).unwrap(); @@ -1399,6 +1418,15 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @""); + db_snap!(index, field_id_docid_facet_strings, @""); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let facets = index.faceted_fields(&rtxn).unwrap(); @@ -1412,6 +1440,25 @@ mod tests { }) .unwrap(); + db_snap!(index, 
facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let facets = index.faceted_fields(&rtxn).unwrap(); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 7aa306183..df98724da 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io; @@ -17,8 +18,8 @@ use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::update::FacetsUpdateIncremental; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, + Index, Result, }; pub(crate) enum TypedChunk { @@ -138,14 +139,41 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { - append_entries_into_database( - facet_id_f64_docids_iter, - &index.facet_id_f64_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, - )?; + // merge cbo roaring bitmaps is not the correct merger because the data in the DB + // is FacetGroupValue and not RoaringBitmap + // so I need to create my own merging function + + // facet_id_string_docids is encoded as: + // key: FacetKeyCodec + // value: CboRoaringBitmapCodec + // basically + + // TODO: a condition saying "if I have more than 1/50th of the DB to add, + // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50, + // it is a ratio I determine empirically + + // for now I only do it incrementally, to see if things work + let indexer = FacetsUpdateIncremental::new( + index.facet_id_f64_docids.remap_key_type::>(), + ); + + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = facet_id_f64_docids_iter.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let key = + FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let docids = + CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + for (field_id, new_docids) in new_faceted_docids { + let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?; + docids |= new_docids; + index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?; + } + is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { @@ -163,16 +191,24 @@ pub(crate) fn write_typed_chunk_into_index( // it is a ratio I determine empirically // for now I only do it incrementally, to see if things work - let builder = FacetsUpdateIncremental::new( + let indexer = FacetsUpdateIncremental::new( index.facet_id_string_docids.remap_key_type::>(), ); + let mut new_faceted_docids = HashMap::::default(); + let mut cursor = facet_id_string_docids.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { let key = FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; - let value = + let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - builder.insert(wtxn, key.field_id, key.left_bound, &value)?; + indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + for (field_id, new_docids) in new_faceted_docids { + let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?; + docids |= new_docids; + index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?; } is_merged_database = true; } From 07ff92c663014d61ddc67b0726c6e7051a0a5efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 11:09:01 +0200 Subject: [PATCH 1708/1889] Add more snapshots from facet tests --- .../search/facet/facet_distribution_iter.rs | 14 +- milli/src/search/facet/facet_range_search.rs | 27 +-- .../src/search/facet/facet_sort_ascending.rs | 13 +- .../src/search/facet/facet_sort_descending.rs | 13 +- .../filter_distribution_all/0.snap | 228 ++++++++++++++++++ .../filter_distribution_all/1.snap | 100 ++++++++ .../filter_distribution_all_stop_early/0.snap | 104 ++++++++ .../filter_distribution_all_stop_early/1.snap | 100 ++++++++ .../random_looking_index_snap.hash.snap | 4 + .../filter_range_decreasing/0.hash.snap | 4 + .../filter_range_decreasing/1.hash.snap | 4 + .../filter_range_increasing/0.hash.snap | 4 + .../filter_range_increasing/1.hash.snap | 4 + .../filter_range_pinch/0.hash.snap | 4 + .../filter_range_pinch/1.hash.snap | 4 + .../random_looking_index_snap.hash.snap | 4 + .../filter_sort/0.snap | 28 +++ .../filter_sort/1.snap | 53 ++++ .../random_looking_index_snap.hash.snap | 4 + .../filter_sort_descending/0.snap | 16 ++ .../filter_sort_descending/1.snap | 49 ++++ .../random_looking_index_snap.hash.snap | 4 + milli/src/snapshot_tests.rs | 69 +++++- milli/src/update/facet/incremental.rs | 3 +- .../default/facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../default/facet_id_string_docids.hash.snap | 4 + .../facet_id_string_docids.hash.snap | 4 + .../default/facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - 
.../facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../default/facet_id_string_docids.hash.snap | 4 - .../facet_id_string_docids.hash.snap | 4 - 40 files changed, 840 insertions(+), 81 deletions(-) create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap create mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap delete mode 100644 
milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 16b83c2db..9e251103c 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -108,15 +108,15 @@ where #[cfg(test)] mod tests { + use crate::milli_snap; + use crate::{ + heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex, + }; use heed::BytesDecode; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; use std::ops::ControlFlow; - use crate::{ - heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex, - }; - use super::iterate_over_facet_distribution; fn get_simple_index() -> FacetIndex { @@ -151,7 +151,7 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - insta::assert_display_snapshot!(index) + milli_snap!(format!("{index}")); } #[test] fn filter_distribution_all() { @@ -172,7 +172,7 @@ mod tests { }, ) .unwrap(); - insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results); + milli_snap!(results, i); txn.commit().unwrap(); } @@ -203,7 +203,7 @@ mod tests { }, ) .unwrap(); - insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results); + milli_snap!(results, i); txn.commit().unwrap(); } diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 523b3853c..38c6acdec 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -258,6 +258,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { + use crate::milli_snap; use crate::{ heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec}, search::facet::test::FacetIndex, @@ -301,7 +302,7 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - insta::assert_display_snapshot!(index) + milli_snap!(format!("{index}")); } #[test] fn filter_range_increasing() { @@ -323,10 +324,7 @@ mod tests { .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!( - format!("filter_range_{i}_increasing_included_bounds"), - results - ); + milli_snap!(results, i); let mut results = String::new(); for i in 0..=255 { let i = i as f64; @@ -342,10 +340,7 @@ mod tests { .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!( - format!("filter_range_{i}_increasing_excluded_bounds"), - results - ); + milli_snap!(results, i); txn.commit().unwrap(); } } @@ -372,10 +367,7 @@ mod tests 
{ results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!( - format!("filter_range_{i}_decreasing_included_bounds"), - results - ); + milli_snap!(results, i); let mut results = String::new(); @@ -394,10 +386,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!( - format!("filter_range_{i}_decreasing_excluded_bounds"), - results - ); + milli_snap!(results, i); txn.commit().unwrap(); } @@ -425,7 +414,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!(format!("filter_range_{i}_pinch_included_bounds"), results); + milli_snap!(results, i); let mut results = String::new(); @@ -444,7 +433,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - insta::assert_snapshot!(format!("filter_range_{i}_pinch_excluded_bounds"), results); + milli_snap!(results, i); txn.commit().unwrap(); } diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 8af191089..e8618c302 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -83,15 +83,15 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use rand::Rng; - use rand::SeedableRng; - use roaring::RoaringBitmap; - + use crate::milli_snap; use crate::{ heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::{facet_sort_ascending::ascending_facet_sort, test::FacetIndex}, snapshot_tests::display_bitmap, }; + use rand::Rng; + use rand::SeedableRng; + use roaring::RoaringBitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); @@ -125,7 +125,7 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - insta::assert_display_snapshot!(index) + milli_snap!(format!("{index}")); } #[test] fn filter_sort() { @@ -138,8 +138,9 @@ mod tests { for el in iter { let docids = el.unwrap(); results.push_str(&display_bitmap(&docids)); + results.push('\n'); } - insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); + milli_snap!(results, i); txn.commit().unwrap(); } diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 5ce55ec6d..b8bae2f9d 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -111,15 +111,15 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - use rand::Rng; - use rand::SeedableRng; - use roaring::RoaringBitmap; - + use crate::milli_snap; use crate::{ heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec, MyByteSlice}, search::facet::{facet_sort_descending::descending_facet_sort, test::FacetIndex}, snapshot_tests::display_bitmap, }; + use rand::Rng; + use rand::SeedableRng; + use roaring::RoaringBitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); @@ -153,7 +153,7 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - insta::assert_display_snapshot!(index) + milli_snap!(format!("{index}")); } #[test] fn filter_sort_descending() { @@ -167,8 +167,9 @@ mod tests { for el in iter { let docids = el.unwrap(); results.push_str(&display_bitmap(&docids)); + results.push('\n'); } - insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); + milli_snap!(results, i); txn.commit().unwrap(); } diff --git 
a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap new file mode 100644 index 000000000..fe5f69d7d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap @@ -0,0 +1,228 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +0: 1 +1: 1 +2: 1 +3: 1 +4: 1 +5: 1 +6: 1 +7: 1 +8: 1 +9: 1 +10: 1 +11: 1 +12: 1 +13: 1 +14: 1 +15: 1 +16: 1 +17: 1 +18: 1 +19: 1 +20: 1 +21: 1 +22: 1 +23: 1 +24: 1 +25: 1 +26: 1 +27: 1 +28: 1 +29: 1 +30: 1 +31: 1 +32: 1 +33: 1 +34: 1 +35: 1 +36: 1 +37: 1 +38: 1 +39: 1 +40: 1 +41: 1 +42: 1 +43: 1 +44: 1 +45: 1 +46: 1 +47: 1 +48: 1 +49: 1 +50: 1 +51: 1 +52: 1 +53: 1 +54: 1 +55: 1 +56: 1 +57: 1 +58: 1 +59: 1 +60: 1 +61: 1 +62: 1 +63: 1 +64: 1 +65: 1 +66: 1 +67: 1 +68: 1 +69: 1 +70: 1 +71: 1 +72: 1 +73: 1 +74: 1 +75: 1 +76: 1 +77: 1 +78: 1 +79: 1 +80: 1 +81: 1 +82: 1 +83: 1 +84: 1 +85: 1 +86: 1 +87: 1 +88: 1 +89: 1 +90: 1 +91: 1 +92: 1 +93: 1 +94: 1 +95: 1 +96: 1 +97: 1 +98: 1 +99: 1 +100: 1 +101: 1 +102: 1 +103: 1 +104: 1 +105: 1 +106: 1 +107: 1 +108: 1 +109: 1 +110: 1 +111: 1 +112: 1 +113: 1 +114: 1 +115: 1 +116: 1 +117: 1 +118: 1 +119: 1 +120: 1 +121: 1 +122: 1 +123: 1 +124: 1 +125: 1 +126: 1 +127: 1 +128: 1 +129: 1 +130: 1 +131: 1 +132: 1 +133: 1 +134: 1 +135: 1 +136: 1 +137: 1 +138: 1 +139: 1 +140: 1 +141: 1 +142: 1 +143: 1 +144: 1 +145: 1 +146: 1 +147: 1 +148: 1 +149: 1 +150: 1 +151: 1 +152: 1 +153: 1 +154: 1 +155: 1 +156: 1 +157: 1 +158: 1 +159: 1 +160: 1 +161: 1 +162: 1 +163: 1 +164: 1 +165: 1 +166: 1 +167: 1 +168: 1 +169: 1 +170: 1 +171: 1 +172: 1 +173: 1 +174: 1 +175: 1 +176: 1 +177: 1 +178: 1 +179: 1 +180: 1 +181: 1 +182: 1 +183: 1 +184: 1 +185: 1 +186: 1 +187: 1 +188: 1 +189: 1 +190: 1 +191: 1 +192: 1 +193: 1 +194: 1 +195: 1 +196: 1 +197: 1 +198: 1 +199: 1 +200: 1 +201: 1 +202: 1 +203: 1 +204: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +211: 1 +212: 1 +213: 1 +214: 1 +215: 1 +216: 1 +217: 1 +218: 1 +219: 1 +220: 1 +221: 1 +222: 1 +223: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap new file mode 100644 index 000000000..dd5e761ea --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap @@ -0,0 +1,100 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +3: 2 +5: 2 +6: 2 +9: 2 +10: 2 +11: 2 +14: 2 +18: 2 +19: 2 +24: 2 +26: 2 +28: 2 +29: 2 +32: 2 +33: 2 +35: 2 +36: 2 +37: 2 +38: 2 +39: 2 +41: 2 +46: 2 +47: 2 +49: 2 +52: 2 +53: 2 +55: 2 +59: 2 +61: 2 +64: 2 +68: 2 +71: 2 +74: 2 +75: 2 +76: 2 +81: 2 +83: 2 +85: 2 +86: 2 +88: 2 +90: 2 +91: 2 +92: 2 +98: 2 +99: 2 +101: 2 +102: 2 +103: 2 +107: 2 +111: 2 +115: 2 +119: 2 +123: 2 +124: 2 +130: 2 +131: 2 +133: 2 +135: 2 +136: 2 +137: 2 +139: 2 +141: 2 +143: 2 +144: 2 +147: 2 +150: 2 +156: 1 +158: 1 +160: 1 +162: 1 +163: 1 +164: 1 +167: 1 +169: 1 +173: 1 +177: 1 +178: 1 +179: 1 +181: 1 +182: 1 +186: 1 +189: 1 +192: 1 +193: 1 +195: 1 +197: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +216: 1 +219: 1 +220: 1 +226: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap new file mode 100644 index 000000000..7170dab89 --- 
/dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap @@ -0,0 +1,104 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +0: 1 +1: 1 +2: 1 +3: 1 +4: 1 +5: 1 +6: 1 +7: 1 +8: 1 +9: 1 +10: 1 +11: 1 +12: 1 +13: 1 +14: 1 +15: 1 +16: 1 +17: 1 +18: 1 +19: 1 +20: 1 +21: 1 +22: 1 +23: 1 +24: 1 +25: 1 +26: 1 +27: 1 +28: 1 +29: 1 +30: 1 +31: 1 +32: 1 +33: 1 +34: 1 +35: 1 +36: 1 +37: 1 +38: 1 +39: 1 +40: 1 +41: 1 +42: 1 +43: 1 +44: 1 +45: 1 +46: 1 +47: 1 +48: 1 +49: 1 +50: 1 +51: 1 +52: 1 +53: 1 +54: 1 +55: 1 +56: 1 +57: 1 +58: 1 +59: 1 +60: 1 +61: 1 +62: 1 +63: 1 +64: 1 +65: 1 +66: 1 +67: 1 +68: 1 +69: 1 +70: 1 +71: 1 +72: 1 +73: 1 +74: 1 +75: 1 +76: 1 +77: 1 +78: 1 +79: 1 +80: 1 +81: 1 +82: 1 +83: 1 +84: 1 +85: 1 +86: 1 +87: 1 +88: 1 +89: 1 +90: 1 +91: 1 +92: 1 +93: 1 +94: 1 +95: 1 +96: 1 +97: 1 +98: 1 +99: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap new file mode 100644 index 000000000..dd5e761ea --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap @@ -0,0 +1,100 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +3: 2 +5: 2 +6: 2 +9: 2 +10: 2 +11: 2 +14: 2 +18: 2 +19: 2 +24: 2 +26: 2 +28: 2 +29: 2 +32: 2 +33: 2 +35: 2 +36: 2 +37: 2 +38: 2 +39: 2 +41: 2 +46: 2 +47: 2 +49: 2 +52: 2 +53: 2 +55: 2 +59: 2 +61: 2 +64: 2 +68: 2 +71: 2 +74: 2 +75: 2 +76: 2 +81: 2 +83: 2 +85: 2 +86: 2 +88: 2 +90: 2 +91: 2 +92: 2 +98: 2 +99: 2 +101: 2 +102: 2 +103: 2 +107: 2 +111: 2 +115: 2 +119: 2 +123: 2 +124: 2 +130: 2 +131: 2 +133: 2 +135: 2 +136: 2 +137: 2 +139: 2 +141: 2 +143: 2 +144: 2 +147: 2 +150: 2 +156: 1 +158: 1 +160: 1 +162: 1 +163: 1 +164: 1 +167: 1 +169: 1 +173: 1 +177: 1 +178: 1 +179: 1 +181: 1 +182: 1 +186: 1 +189: 1 +192: 1 +193: 1 +195: 1 +197: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +216: 1 +219: 1 +220: 1 +226: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..da2b49adc --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +ea4022977d09c7854c833146276348de diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap new file mode 100644 index 000000000..e835d8934 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +52d0b31f312572c10959418434e36581 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap new file mode 100644 index 000000000..150f00f7b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap @@ -0,0 +1,4 @@ +--- +source: 
milli/src/search/facet/facet_range_search.rs +--- +2cb9e819529823d488e141edb4307f97 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap new file mode 100644 index 000000000..4f05823f4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +38a4352c48905f5b121d1217734862da diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap new file mode 100644 index 000000000..d2c8a3559 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +aefc1ec120fa884cc8396a68bd7de42f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap new file mode 100644 index 000000000..3fb0c94b0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9e360d7bcd29ac2c23bc241df941fd23 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap new file mode 100644 index 000000000..44fa88004 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +f0606b9af67de9ede9d469514ea1741f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..cf4b29ba3 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ea4022977d09c7854c833146276348de diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap new file mode 100644 index 000000000..9dcd92ed7 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap @@ -0,0 +1,28 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[200, ] +[201, ] +[202, ] +[203, ] +[204, ] +[205, ] +[206, ] +[207, ] +[208, ] +[209, ] +[210, ] +[211, ] +[212, ] +[213, ] +[214, ] +[215, ] +[216, ] +[217, ] +[218, ] +[219, ] +[220, ] +[221, ] +[222, ] +[223, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap new file mode 100644 index 000000000..a81e7377b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap @@ -0,0 +1,53 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, ] +[202, ] +[203, ] +[207, ] +[211, ] +[215, ] +[219, ] 
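Each line of these sort snapshots is one facet group's set of document ids, a RoaringBitmap rendered by the display_bitmap helper from milli/src/snapshot_tests.rs that the tests above call. A minimal sketch of such a renderer, assuming only the formatting visible in the snapshots themselves (one "id, " pair per element between square brackets); the crate's actual implementation may differ:

    use roaring::RoaringBitmap;

    // Renders a bitmap in the "[200, 201, ]" form seen in the .snap files here.
    fn display_bitmap(b: &RoaringBitmap) -> String {
        let mut s = String::from("[");
        for x in b {
            s.push_str(&format!("{x}, "));
        }
        s.push(']');
        s
    }

    fn main() {
        let mut b = RoaringBitmap::new();
        b.extend([200u32, 201, 202]);
        assert_eq!(display_bitmap(&b), "[200, 201, 202, ]");
    }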
+[223, ] +[224, ] +[230, ] +[231, ] +[233, ] +[235, ] +[236, ] +[237, ] +[239, ] +[241, ] +[243, ] +[244, ] +[247, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[263, ] +[264, ] +[267, ] +[269, ] +[273, ] +[277, ] +[278, ] +[279, ] +[281, ] +[282, ] +[286, ] +[289, ] +[292, ] +[293, ] +[295, ] +[297, ] +[205, ] +[206, ] +[208, ] +[209, ] +[210, ] +[216, ] +[220, ] +[226, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..785ff325c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +ea4022977d09c7854c833146276348de diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap new file mode 100644 index 000000000..05a18f000 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[247, ] +[246, ] +[245, ] +[244, ] +[207, ] +[206, ] +[205, ] +[204, ] +[203, ] +[202, ] +[201, ] +[200, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap new file mode 100644 index 000000000..9890c1aab --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap @@ -0,0 +1,49 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[243, ] +[235, ] +[226, ] +[209, ] +[208, ] +[207, ] +[206, ] +[205, ] +[297, ] +[295, ] +[293, ] +[292, ] +[289, ] +[286, ] +[282, ] +[281, ] +[279, ] +[278, ] +[277, ] +[273, ] +[269, ] +[267, ] +[264, ] +[263, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[247, ] +[244, ] +[241, ] +[239, ] +[237, ] +[236, ] +[233, ] +[231, ] +[230, ] +[224, ] +[223, ] +[215, ] +[211, ] +[203, ] +[202, ] +[201, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..b68843376 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +ea4022977d09c7854c833146276348de diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 933f68837..f35bda2e7 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -6,7 +6,7 @@ use std::fmt::Write; use std::path::Path; #[track_caller] -pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings { +pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { let mut settings = insta::Settings::clone_current(); settings.set_prepend_module_to_snapshot(false); let path = Path::new(std::panic::Location::caller().file()); @@ -16,12 +16,63 @@ pub fn default_db_snapshot_settings_for_test(name: Option<&str>) 
-> insta::Setti if let Some(name) = name { settings - .set_snapshot_path(Path::new("snapshots").join(filename).join(test_name).join(name)); + .set_snapshot_path(Path::new("snapshots").join(filename).join(&test_name).join(name)); } else { - settings.set_snapshot_path(Path::new("snapshots").join(filename).join(test_name)); + settings.set_snapshot_path(Path::new("snapshots").join(filename).join(&test_name)); } - settings + (settings, test_name) +} +#[macro_export] +macro_rules! milli_snap { + ($value:expr, $name:expr) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", $name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($value:expr) => { + let (settings, test_name) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", test_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($value:expr, @$inline:literal) => { + let (settings, test_name) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", test_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; + ($value:expr, $name:expr, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", $name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; } /** @@ -92,7 +143,7 @@ db_snap!(index, word_docids, "some_identifier", @""); #[macro_export] macro_rules! db_snap { ($index:ident, $db_name:ident, $name:expr) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( &format!("{}", $name), )); settings.bind(|| { @@ -104,7 +155,7 @@ macro_rules! db_snap { }); }; ($index:ident, $db_name:ident) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); @@ -114,7 +165,7 @@ macro_rules! db_snap { }); }; ($index:ident, $db_name:ident, @$inline:literal) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); @@ -127,8 +178,8 @@ macro_rules! 
db_snap { } }); }; - ($index:ident, $db_name:ident, $name:literal, @$inline:literal) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); + ($index:ident, $db_name:ident, $name:expr, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index a0d426d7a..6dd1f7ac5 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,9 +1,8 @@ -use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; use crate::search::facet::get_highest_level; -use crate::{Index, Result}; +use crate::Result; use heed::Error; use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; use roaring::RoaringBitmap; diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ 
b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..b165e1619 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +8bc439472ccda008dc5c28aa789f433d diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..bc0668408 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +834f27a924de1acbd3cd94c0d7f10315 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..bc0668408 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +834f27a924de1acbd3cd94c0d7f10315 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap deleted file mode 100644 index 373455db6..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -587899707db2848da3f18399e14ed4d0 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index c3415c320..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -02bbf2ca1663cccea0e4c06d5ad06a45 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 78dad29f1..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -e68ea591e1af3e53e544dff9a1648e88 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 
61a5908f4..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -12a4bb0f5b95d7629c2b9a915150c0cf diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 961346de5..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -6438e94bc7fada13022e0efccdf294e0 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 2b7c1ef9c..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -5348bbc46b5384455b6a900666d2a502 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap deleted file mode 100644 index 901b86255..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -faddef9eae5f2efacfec51f20f2e8cd6 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index aa6c85461..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -ddb8fc987c5dc892337682595043858e From 36296bbb20e9c545d131117d85b1d3718d985378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 11:33:50 +0200 Subject: [PATCH 1709/1889] Add facet incremental indexing snapshot tests + fix bug --- milli/src/search/facet/mod.rs | 13 +- .../filter_distribution_all/0.snap | 32 + .../filter_distribution_all/1.snap | 5 + .../filter_distribution_all_stop_early/1.snap | 4 + .../random_looking_index_snap.hash.snap | 2 +- .../filter_range_decreasing/0.hash.snap | 2 +- .../filter_range_decreasing/1.hash.snap | 2 +- .../filter_range_increasing/0.hash.snap | 2 +- .../filter_range_increasing/1.hash.snap | 2 +- .../filter_range_pinch/0.hash.snap | 2 +- .../filter_range_pinch/1.hash.snap | 2 +- .../random_looking_index_snap.hash.snap | 2 +- .../filter_sort/0.snap | 32 + .../filter_sort/1.snap | 1 + .../random_looking_index_snap.hash.snap | 2 +- .../filter_sort_descending/0.snap | 44 ++ .../filter_sort_descending/1.snap | 9 +- .../random_looking_index_snap.hash.snap | 2 +- milli/src/update/facet/incremental.rs | 679 +++++++++++++++++- .../default/facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- 
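A note on the many .hash.snap files created and updated by this patch: the milli_snap! macro from snapshot_tests.rs above passes every snapshot through convert_snap_to_hash_if_needed, so a snapshot too large to store verbatim is saved as a single hash line instead of its full contents, which is what these four-line files hold. A sketch of how the incremental tests below use the macro; get_test_index is a hypothetical stand-in for the FacetIndex setup done inline in the real tests:

    #[test]
    fn delete_from_end() {
        let index = get_test_index(); // hypothetical setup helper
        // Unnamed form: the snapshot is named after the test function.
        milli_snap!(format!("{index}"));
        // Named form: stored as delete_from_end/200.snap,
        // or delete_from_end/200.hash.snap when the content is hashed.
        milli_snap!(format!("{index}"), 200);
    }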
.../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../default/facet_id_string_docids.hash.snap | 2 +- .../facet_id_string_docids.hash.snap | 2 +- .../incremental.rs/append/append.hash.snap | 4 + .../incremental.rs/delete_from_end/0.snap | 4 + .../delete_from_end/100.hash.snap | 4 + .../incremental.rs/delete_from_end/15.snap | 23 + .../delete_from_end/150.hash.snap | 4 + .../incremental.rs/delete_from_end/17.snap | 26 + .../delete_from_end/200.hash.snap | 4 + .../delete_from_start/127.hash.snap | 4 + .../incremental.rs/delete_from_start/215.snap | 54 ++ .../incremental.rs/delete_from_start/255.snap | 4 + .../delete_shuffled/127.hash.snap | 4 + .../delete_shuffled/215.hash.snap | 4 + .../incremental.rs/delete_shuffled/255.snap | 4 + .../in_place_level0_delete.hash.snap | 4 + .../in_place_level0_insert.snap | 20 + .../many_field_ids_append.hash.snap | 4 + .../many_field_ids_prepend.hash.snap | 4 + .../merge_values/merge_values.hash.snap | 4 + .../incremental.rs/prepend/prepend.hash.snap | 4 + .../shuffle_merge_string/1.hash.snap | 4 + .../shuffle_merge_string/2.hash.snap | 4 + .../shuffled/shuffled.hash.snap | 4 + 49 files changed, 1028 insertions(+), 22 deletions(-) create mode 100644 milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap create mode 100644 milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 0ed80dd92..42c0f065a 100644 --- 
a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -77,7 +77,7 @@ pub(crate) fn get_highest_level<'t>( } #[cfg(test)] -mod test { +pub mod test { use crate::update::FacetsUpdateIncremental; use heed::{BytesDecode, BytesEncode, Env, RwTxn}; use roaring::RoaringBitmap; @@ -160,6 +160,17 @@ mod test { let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); } + pub fn delete<'a>( + &self, + rwtxn: &'a mut RwTxn, + field_id: u16, + key: &'a <BoundCodec as BytesEncode<'a>>::EItem, + value: u32, + ) { + let update = FacetsUpdateIncremental::new(self.db.content); + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(rwtxn, field_id, &key_bytes, value).unwrap(); + } } impl<BoundCodec> Display for FacetIndex<BoundCodec> diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap index fe5f69d7d..2b6123289 100644 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap @@ -225,4 +225,36 @@ source: milli/src/search/facet/facet_distribution_iter.rs 221: 1 222: 1 223: 1 +224: 1 +225: 1 +226: 1 +227: 1 +228: 1 +229: 1 +230: 1 +231: 1 +232: 1 +233: 1 +234: 1 +235: 1 +236: 1 +237: 1 +238: 1 +239: 1 +240: 1 +241: 1 +242: 1 +243: 1 +244: 1 +245: 1 +246: 1 +247: 1 +248: 1 +249: 1 +250: 1 +251: 1 +252: 1 +253: 1 +254: 1 +255: 1 diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap index dd5e761ea..d0c0dd98d 100644 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap @@ -96,5 +96,10 @@ source: milli/src/search/facet/facet_distribution_iter.rs 216: 1 219: 1 220: 1 +223: 1 226: 1 +235: 1 +236: 1 +238: 1 +243: 1 diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap index dd5e761ea..95c719bb0 100644 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap @@ -96,5 +96,9 @@ source: milli/src/search/facet/facet_distribution_iter.rs 216: 1 219: 1 220: 1 +223: 1 226: 1 +235: 1 +236: 1 +238: 1 diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap index da2b49adc..661e1a35b 100644 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_distribution_iter.rs --- -ea4022977d09c7854c833146276348de +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap index e835d8934..7bf13e05c 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -52d0b31f312572c10959418434e36581 +fcedc563a82c1c61f50174a5f3f982b6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap index 150f00f7b..100b928d7 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -2cb9e819529823d488e141edb4307f97 +6cc26e77fc6bd9145deedf14cf422b03 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap index 4f05823f4..db11ce952 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -38a4352c48905f5b121d1217734862da +c1c7a0bb91d53d33724583b6d4a99f16 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap index d2c8a3559..f5a81c121 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -aefc1ec120fa884cc8396a68bd7de42f +12213d3f1047a0c3d08e4670a7d688e7 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap index 3fb0c94b0..07664807e 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -9e360d7bcd29ac2c23bc241df941fd23 +3456db9a1bb94c33c1e9f656184ee711 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap index 44fa88004..ef530faa1 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -f0606b9af67de9ede9d469514ea1741f +2127cd818b457e0611e0c8e1a871602a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap index cf4b29ba3..67a2f6bd9 
100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -ea4022977d09c7854c833146276348de +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap index 9dcd92ed7..2d0f6e213 100644 --- a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap @@ -25,4 +25,36 @@ source: milli/src/search/facet/facet_sort_ascending.rs [221, ] [222, ] [223, ] +[224, ] +[225, ] +[226, ] +[227, ] +[228, ] +[229, ] +[230, ] +[231, ] +[232, ] +[233, ] +[234, ] +[235, ] +[236, ] +[237, ] +[238, ] +[239, ] +[240, ] +[241, ] +[242, ] +[243, ] +[244, ] +[245, ] +[246, ] +[247, ] +[248, ] +[249, ] +[250, ] +[251, ] +[252, ] +[253, ] +[254, ] +[255, ] diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap index a81e7377b..20d666494 100644 --- a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap @@ -50,4 +50,5 @@ source: milli/src/search/facet/facet_sort_ascending.rs [216, ] [220, ] [226, ] +[238, ] diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap index 785ff325c..64ff762db 100644 --- a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_sort_ascending.rs --- -ea4022977d09c7854c833146276348de +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap index 05a18f000..032763c74 100644 --- a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap @@ -1,10 +1,54 @@ --- source: milli/src/search/facet/facet_sort_descending.rs --- +[255, ] +[254, ] +[253, ] +[252, ] +[251, ] +[250, ] +[249, ] +[248, ] [247, ] [246, ] [245, ] [244, ] +[243, ] +[242, ] +[241, ] +[240, ] +[239, ] +[238, ] +[237, ] +[236, ] +[235, ] +[234, ] +[233, ] +[232, ] +[231, ] +[230, ] +[229, ] +[228, ] +[227, ] +[226, ] +[225, ] +[224, ] +[223, ] +[222, ] +[221, ] +[220, ] +[219, ] +[218, ] +[217, ] +[216, ] +[215, ] +[214, ] +[213, ] +[212, ] +[211, ] +[210, ] +[209, ] +[208, ] [207, ] [206, ] [205, ] diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap index 9890c1aab..4c62cfee4 100644 --- 
a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap @@ -2,8 +2,15 @@ source: milli/src/search/facet/facet_sort_descending.rs --- [243, ] +[238, ] +[236, ] [235, ] [226, ] +[223, ] +[220, ] +[219, ] +[216, ] +[210, ] [209, ] [208, ] [207, ] @@ -35,12 +42,10 @@ source: milli/src/search/facet/facet_sort_descending.rs [241, ] [239, ] [237, ] -[236, ] [233, ] [231, ] [230, ] [224, ] -[223, ] [215, ] [211, ] [203, ] diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap index b68843376..0649e3c5d 100644 --- a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_sort_descending.rs --- -ea4022977d09c7854c833146276348de +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 6dd1f7ac5..712d7271c 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -297,7 +297,7 @@ impl FacetsUpdateIncremental { .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?; let mut to_add = vec![]; - for _ in 0..group_size { + for _ in 0..self.min_level_size { let mut first_key = None; let mut values = RoaringBitmap::new(); for _ in 0..group_size { @@ -459,3 +459,680 @@ impl FacetsUpdateIncremental { Ok(()) } } + +#[cfg(test)] +mod tests { + use crate::milli_snap; + use crate::{ + heed_codec::facet::new::{ + ordered_f64_codec::OrderedF64Codec, str_ref::StrRefCodec, FacetGroupValueCodec, + FacetKeyCodec, MyByteSlice, + }, + search::facet::{get_highest_level, test::FacetIndex}, + }; + use heed::{types::ByteSlice, BytesDecode, BytesEncode}; + use rand::Rng; + use rand::{seq::SliceRandom, SeedableRng}; + use roaring::RoaringBitmap; + + pub fn verify_structure_validity(index: &FacetIndex, field_id: u16) + where + for<'a> C: BytesDecode<'a> + BytesEncode<'a, EItem = >::DItem>, + { + let FacetIndex { env, db, .. 
} = index; + + let txn = env.write_txn().unwrap(); + let mut field_id_prefix = vec![]; + field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); + + let highest_level = get_highest_level(&txn, index.db.content, field_id).unwrap(); + txn.commit().unwrap(); + + let txn = env.read_txn().unwrap(); + for level_no in (1..=highest_level).rev() { + let mut level_no_prefix = vec![]; + level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_no_prefix.push(level_no); + + let mut iter = db + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level_no_prefix) + .unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let key = FacetKeyCodec::::bytes_decode(&key).unwrap(); + + let mut prefix_start_below = vec![]; + prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); + prefix_start_below.push(level_no - 1); + prefix_start_below.extend_from_slice(&key.left_bound); + + let start_below = { + let mut start_below_iter = db + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + &txn, + &prefix_start_below, + ) + .unwrap(); + let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); + FacetKeyCodec::::bytes_decode(&key_bytes).unwrap() + }; + + assert!(value.size > 0 && (value.size as usize) < db.max_group_size); + + let mut actual_size = 0; + let mut values_below = RoaringBitmap::new(); + let mut iter_below = + db.content.range(&txn, &(start_below..)).unwrap().take(value.size as usize); + while let Some(el) = iter_below.next() { + let (_, value) = el.unwrap(); + actual_size += 1; + values_below |= value.bitmap; + } + assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); + + assert_eq!(value.bitmap, values_below); + } + } + } + #[test] + fn append() { + let index = FacetIndex::::new(4, 8); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + milli_snap!(format!("{index}")); + } + #[test] + fn many_field_ids_append() { + let index = FacetIndex::::new(4, 8); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 2, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 1, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + verify_structure_validity(&index, 0); + verify_structure_validity(&index, 1); + verify_structure_validity(&index, 2); + milli_snap!(format!("{index}")); + } + #[test] + fn many_field_ids_prepend() { + let index = FacetIndex::::new(4, 8); + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 2, &(i as f64), 
+    #[test]
+    fn append() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+        for i in 0..256u16 {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i as u32);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"));
+    }
+    #[test]
+    fn many_field_ids_append() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+        for i in 0..256u16 {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i as u32);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+        for i in 0..256u16 {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i as u32);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 2, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+        for i in 0..256u16 {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i as u32);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 1, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        verify_structure_validity(&index, 1);
+        verify_structure_validity(&index, 2);
+        milli_snap!(format!("{index}"));
+    }
+    #[test]
+    fn many_field_ids_prepend() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+        for i in (0..256).into_iter().rev() {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i as u32);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+        for i in (0..256).into_iter().rev() {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i as u32);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 2, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+        for i in (0..256).into_iter().rev() {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i as u32);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 1, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        verify_structure_validity(&index, 1);
+        verify_structure_validity(&index, 2);
+        milli_snap!(format!("{index}"));
+    }
+
+    #[test]
+    fn prepend() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+        let mut txn = index.env.write_txn().unwrap();
+
+        for i in (0..256).into_iter().rev() {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i);
+            index.insert(&mut txn, 0, &(i as f64), &bitmap);
+        }
+        txn.commit().unwrap();
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"));
+    }
+
+    #[test]
+    fn shuffled() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+        let mut txn = index.env.write_txn().unwrap();
+
+        let mut keys = (0..256).into_iter().collect::<Vec<_>>();
+        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
+        keys.shuffle(&mut rng);
+
+        for (_i, key) in keys.into_iter().enumerate() {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(key);
+            index.insert(&mut txn, 0, &(key as f64), &bitmap);
+        }
+        txn.commit().unwrap();
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"));
+    }
+
+    #[test]
+    fn merge_values() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+
+        let mut keys = (0..256).into_iter().collect::<Vec<_>>();
+        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
+        keys.shuffle(&mut rng);
+        for (_i, key) in keys.into_iter().enumerate() {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(key);
+            bitmap.insert(rng.gen_range(256..512));
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &(key as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"));
+    }
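These insertion tests vary only the order of operations, and the snapshot hashes recorded later in this series confirm the result is order independent (for instance, `append` and `prepend` both record the hash 5dbfa134cc44abeb3ab6242fc182e48e). Reproducibility of the shuffled variants rests on a fixed-seed RNG, as in this minimal sketch of the pattern used above (rand API exactly as used in these tests):

```rust
use rand::seq::SliceRandom;
use rand::SeedableRng;

// A fixed seed makes the shuffle, and therefore the snapshot hash,
// identical on every run.
fn shuffled_keys() -> Vec<u32> {
    let mut keys = (0..256).collect::<Vec<u32>>();
    let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
    keys.shuffle(&mut rng);
    keys
}
```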
+    #[test]
+    fn delete_from_end() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+        for i in 0..256 {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i);
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &(&(i as f64)), &bitmap);
+            txn.commit().unwrap();
+        }
+
+        for i in (200..256).into_iter().rev() {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 200);
+
+        for i in (150..200).into_iter().rev() {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 150);
+
+        for i in (100..150).into_iter().rev() {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 100);
+
+        for i in (17..100).into_iter().rev() {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 17);
+
+        let mut txn = index.env.write_txn().unwrap();
+        for i in (15..17).into_iter().rev() {
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+        }
+        txn.commit().unwrap();
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 15);
+        for i in (0..15).into_iter().rev() {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 0);
+    }
+
+    #[test]
+    fn delete_from_start() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+
+        for i in 0..256 {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i);
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+
+        for i in 0..128 {
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 127);
+        for i in 128..216 {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 215);
+        for i in 216..256 {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(i as f64), i as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 255);
+    }
+
+    #[test]
+    fn delete_shuffled() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+
+        for i in 0..256 {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(i);
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &(i as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+
+        let mut keys = (0..256).into_iter().collect::<Vec<_>>();
+        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
+        keys.shuffle(&mut rng);
+
+        for i in 0..128 {
+            let key = keys[i];
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(key as f64), key as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 127);
+        for i in 128..216 {
+            let key = keys[i];
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(key as f64), key as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 215);
+        for i in 216..256 {
+            let key = keys[i];
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(key as f64), key as u32);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 255);
+    }
+
+    #[test]
+    fn in_place_level0_insert() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+        let mut keys = (0..16).into_iter().collect::<Vec<_>>();
+        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
+        keys.shuffle(&mut rng);
+        for i in 0..4 {
+            for &key in keys.iter() {
+                let mut bitmap = RoaringBitmap::new();
+                bitmap.insert(rng.gen_range(i * 256..(i + 1) * 256));
+                verify_structure_validity(&index, 0);
+                let mut txn = index.env.write_txn().unwrap();
+                index.insert(&mut txn, 0, &(key as f64), &bitmap);
+                txn.commit().unwrap();
+            }
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"));
+    }
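The `in_place_level0_insert` test above and `in_place_level0_delete` below exercise updates that should leave group boundaries untouched: inserting an existing key only ORs new docids into its stored bitmap, and deleting a docid only shrinks it; the key disappears, and upper levels need fixing, only once its bitmap is empty. A sketch of those level-0 semantics over a plain map, mirroring the commented `TrivialDatabase` model further down (hypothetical helper names):

```rust
use std::collections::BTreeMap;

use roaring::RoaringBitmap;

// Insert on an existing key merges bitmaps in place; no restructuring.
fn insert_level0(level0: &mut BTreeMap<u64, RoaringBitmap>, key: u64, docids: &RoaringBitmap) {
    *level0.entry(key).or_default() |= docids;
}

// Delete removes one docid; only an emptied bitmap removes the key,
// which is the one case that would force the levels above to change.
fn delete_level0(level0: &mut BTreeMap<u64, RoaringBitmap>, key: u64, docid: u32) {
    if let Some(bitmap) = level0.get_mut(&key) {
        bitmap.remove(docid);
        if bitmap.is_empty() {
            level0.remove(&key);
        }
    }
}
```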
+
+    #[test]
+    fn in_place_level0_delete() {
+        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
+
+        let mut keys = (0..64).into_iter().collect::<Vec<_>>();
+        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
+        keys.shuffle(&mut rng);
+
+        for &key in keys.iter() {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(key);
+            bitmap.insert(key + 100);
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &(key as f64), &bitmap);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"));
+
+        for &key in keys.iter() {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &(key as f64), key + 100);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"));
+    }
+
+    #[test]
+    fn shuffle_merge_string() {
+        let index = FacetIndex::<StrRefCodec>::new(4, 8);
+
+        let mut keys = (1000..1064).into_iter().collect::<Vec<_>>();
+        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
+        keys.shuffle(&mut rng);
+
+        for &key in keys.iter() {
+            let mut bitmap = RoaringBitmap::new();
+            bitmap.insert(key);
+            bitmap.insert(key + 100);
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.insert(&mut txn, 0, &format!("{key:x}").as_str(), &bitmap);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 1);
+
+        for &key in keys.iter() {
+            verify_structure_validity(&index, 0);
+            let mut txn = index.env.write_txn().unwrap();
+            index.delete(&mut txn, 0, &format!("{key:x}").as_str(), key + 100);
+            txn.commit().unwrap();
+        }
+        verify_structure_validity(&index, 0);
+        milli_snap!(format!("{index}"), 2);
+    }
+
+    // fuzz tests
+}
+// #[cfg(all(test, fuzzing))]
+// mod fuzz {
+//     use crate::codec::U16Codec;
+
+//     use super::tests::verify_structure_validity;
+//     use super::*;
+//     use fuzzcheck::mutators::integer_within_range::U16WithinRangeMutator;
+//     use fuzzcheck::DefaultMutator;
+//     use roaring::RoaringBitmap;
+//     use std::collections::BTreeMap;
+//     use std::collections::HashMap;
+
+//     #[derive(Default)]
+//     pub struct TrivialDatabase<T> {
+//         pub elements: BTreeMap<u16, BTreeMap<T, RoaringBitmap>>,
+//     }
+//     impl<T> TrivialDatabase<T>
+//     where
+//         T: Ord + Clone + Copy + Eq + std::fmt::Debug,
+//     {
+//         pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) {
+//             if new_values.is_empty() {
+//                 return;
+//             }
+//             let values_field_id = self.elements.entry(field_id).or_default();
+//             let values = values_field_id.entry(new_key).or_default();
+//             *values |= new_values;
+//         }
+//         pub fn delete(&mut self, field_id: u16, key: T, value: u32) {
+//             if let Some(values_field_id) = self.elements.get_mut(&field_id) {
+//                 if let Some(values) = values_field_id.get_mut(&key) {
+//                     values.remove(value);
+//                     if values.is_empty() {
+//                         values_field_id.remove(&key);
+//                     }
+//                 }
+//                 if values_field_id.is_empty() {
+//                     self.elements.remove(&field_id);
+//                 }
+//             }
+//         }
+//     }
+//     #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)]
+//     struct Operation<Key> {
+//         key: Key,
+//         #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })]
+//         field_id: u16,
+//         kind: OperationKind,
+//     }
+//     #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)]
+//     enum OperationKind {
+//         Insert(Vec<u8>),
+//         Delete(u8),
+//     }
+
+//     fn compare_with_trivial_database(
+//         tempdir: Rc<TempDir>,
+//         group_size: u8,
+//         max_group_size: u8,
+//         operations: &[Operation<u16>],
+//     ) {
+//         let index =
FacetIndex::::open_from_tempdir(tempdir, group_size, max_group_size); +// let mut trivial_db = TrivialDatabase::::default(); +// let mut value_to_keys = HashMap::>::new(); +// let mut txn = index.env.write_txn().unwrap(); +// for Operation { key, field_id, kind } in operations { +// match kind { +// OperationKind::Insert(values) => { +// let mut bitmap = RoaringBitmap::new(); +// for value in values { +// bitmap.insert(*value as u32); +// value_to_keys.entry(*value).or_default().push(*key); +// } +// index.insert(&mut txn, *field_id, key, &bitmap); +// trivial_db.insert(*field_id, *key, &bitmap); +// } +// OperationKind::Delete(value) => { +// if let Some(keys) = value_to_keys.get(value) { +// for key in keys { +// index.delete(&mut txn, *field_id, key, *value as u32); +// trivial_db.delete(*field_id, *key, *value as u32); +// } +// } +// } +// } +// } +// for (field_id, values_field_id) in trivial_db.elements.iter() { +// let level0iter = index +// .db +// .content +// .as_polymorph() +// .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( +// &mut txn, +// &field_id.to_be_bytes(), +// ) +// .unwrap(); + +// for ((key, values), group) in values_field_id.iter().zip(level0iter) { +// let (group_key, group_values) = group.unwrap(); +// let group_key = FacetKeyCodec::::bytes_decode(group_key).unwrap(); +// assert_eq!(key, &group_key.left_bound); +// assert_eq!(values, &group_values.bitmap); +// } +// } + +// txn.commit().unwrap(); +// let mut txn = index.env.write_txn().unwrap(); +// for (field_id, values_field_id) in trivial_db.elements.iter() { +// let level0iter = index +// .db +// .content +// .as_polymorph() +// .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) +// .unwrap(); + +// for ((key, values), group) in values_field_id.iter().zip(level0iter) { +// let (group_key, group_values) = group.unwrap(); +// let group_key = FacetKeyCodec::::bytes_decode(group_key).unwrap(); +// assert_eq!(key, &group_key.left_bound); +// assert_eq!(values, &group_values.bitmap); +// } +// verify_structure_validity(&index, *field_id); +// } + +// index.db.content.clear(&mut txn).unwrap(); +// txn.commit().unwrap(); +// } + +// #[test] +// fn fuzz() { +// let tempdir = Rc::new(TempDir::new().unwrap()); +// let tempdir_cloned = tempdir.clone(); +// let result = fuzzcheck::fuzz_test(move |x: &(u8, u8, Vec>)| { +// compare_with_trivial_database(tempdir_cloned.clone(), x.0, x.1, &x.2) +// }) +// .default_mutator() +// .serde_serializer() +// .default_sensor_and_pool_with_custom_filter(|file, function| { +// if file.is_relative() +// && !function.contains("serde") +// && !function.contains("tests::") +// && !function.contains("fuzz::") +// && !function.contains("display_bitmap") +// { +// true +// } else { +// false +// } +// }) +// .arguments_from_cargo_fuzzcheck() +// .launch(); +// assert!(!result.found_test_failure); +// } + +// #[test] +// fn reproduce_bug() { +// let operations = r#" +// [ +// {"key":0, "field_id": 0, "kind":{"Insert":[109]}}, +// {"key":143, "field_id": 0, "kind":{"Insert":[243]}}, +// {"key":90, "field_id": 0, "kind":{"Insert":[217]}}, +// {"key":172, "field_id": 0, "kind":{"Insert":[94]}}, +// {"key":27, "field_id": 0, "kind":{"Insert":[4]}}, +// {"key":124, "field_id": 0, "kind":{"Insert":[0]}}, +// {"key":123, "field_id": 0, "kind":{"Insert":[0]}}, +// {"key":67, "field_id": 0, "kind":{"Insert":[109]}}, +// {"key":13, "field_id": 0, "kind":{"Insert":[0]}}, +// {"key":162, "field_id": 0, "kind":{"Insert":[213]}}, +// {"key":235, "field_id": 0, 
"kind":{"Insert":[67]}}, +// {"key":251, "field_id": 0, "kind":{"Insert":[50]}}, +// {"key":218, "field_id": 0, "kind":{"Insert":[164]}}, +// {"key":166, "field_id": 0, "kind":{"Insert":[67]}}, +// {"key":64, "field_id": 0, "kind":{"Insert":[61]}}, +// {"key":183, "field_id": 0, "kind":{"Insert":[210]}}, +// {"key":250, "field_id": 0, "kind":{"Delete":50}} +// ] +// "#; +// let operations: Vec> = serde_json::from_str(operations).unwrap(); +// let tempdir = TempDir::new().unwrap(); +// compare_with_trivial_database(Rc::new(tempdir), 4, 8, &operations); +// } + +// #[test] +// fn reproduce_bug2() { +// let operations = r#" +// [ +// {"key":102, "field_id": 0, "kind":{"Insert":[122]}}, +// {"key":73, "field_id": 0, "kind":{"Insert":[132]}}, +// {"key":20, "field_id": 0, "kind":{"Insert":[215]}}, +// {"key":39, "field_id": 0, "kind":{"Insert":[152]}}, +// {"key":151, "field_id": 0, "kind":{"Insert":[226]}}, +// {"key":17, "field_id": 0, "kind":{"Insert":[101]}}, +// {"key":74, "field_id": 0, "kind":{"Insert":[210]}}, +// {"key":2, "field_id": 0, "kind":{"Insert":[130]}}, +// {"key":64, "field_id": 0, "kind":{"Insert":[180]}}, +// {"key":83, "field_id": 0, "kind":{"Insert":[250]}}, +// {"key":80, "field_id": 0, "kind":{"Insert":[210]}}, +// {"key":113, "field_id": 0, "kind":{"Insert":[63]}}, +// {"key":201, "field_id": 0, "kind":{"Insert":[210]}}, +// {"key":200, "field_id": 0, "kind":{"Insert":[5]}}, +// {"key":93, "field_id": 0, "kind":{"Insert":[98]}}, +// {"key":162, "field_id": 0, "kind":{"Insert":[5]}}, +// {"key":80, "field_id": 0, "kind":{"Delete":210}} +// ] +// "#; +// let operations: Vec> = serde_json::from_str(operations).unwrap(); +// let tempdir = TempDir::new().unwrap(); +// compare_with_trivial_database(Rc::new(tempdir), 4, 8, &operations); +// } +// #[test] +// fn reproduce_bug3() { +// let operations = r#" +// [ +// {"key":27488, "field_id": 0, "kind":{"Insert":[206]}}, +// {"key":64716, "field_id": 0, "kind":{"Insert":[216]}}, +// {"key":60886, "field_id": 0, "kind":{"Insert":[206]}}, +// {"key":59509, "field_id": 0, "kind":{"Insert":[187,231]}}, +// {"key":55057, "field_id": 0, "kind":{"Insert":[37]}}, +// {"key":45200, "field_id": 0, "kind":{"Insert":[206]}}, +// {"key":55056, "field_id": 0, "kind":{"Insert":[37]}}, +// {"key":63679, "field_id": 0, "kind":{"Insert":[206]}}, +// {"key":52155, "field_id": 0, "kind":{"Insert":[74]}}, +// {"key":20648, "field_id": 0, "kind":{"Insert":[47,138,157]}} +// ] +// "#; +// let operations: Vec> = serde_json::from_str(operations).unwrap(); +// let tempdir = TempDir::new().unwrap(); +// compare_with_trivial_database(Rc::new(tempdir), 0, 7, &operations); +// } + +// #[test] +// fn reproduce_bug4() { +// let operations = r#" +// [{"key":63499, "field_id": 0, "kind":{"Insert":[87]}},{"key":25374, "field_id": 0, "kind":{"Insert":[14]}},{"key":64481, "field_id": 0, "kind":{"Delete":87}},{"key":23038, "field_id": 0, "kind":{"Insert":[173]}},{"key":14862, "field_id": 0, "kind":{"Insert":[8]}},{"key":13145, "field_id": 0, "kind":{"Insert":[5,64]}},{"key":23446, "field_id": 0, "kind":{"Insert":[86,59]}},{"key":17972, "field_id": 0, "kind":{"Insert":[58,137]}},{"key":21273, "field_id": 0, "kind":{"Insert":[121,132,81,147]}},{"key":28264, "field_id": 0, "kind":{"Insert":[36]}},{"key":46659, "field_id": 0, "kind":{"Insert":[]}}] +// "#; +// let operations: Vec> = serde_json::from_str(operations).unwrap(); +// let tempdir = TempDir::new().unwrap(); +// compare_with_trivial_database(Rc::new(tempdir), 2, 1, &operations); +// } +// } diff --git 
a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap index b165e1619..b990c31c7 100644 --- 
a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -8bc439472ccda008dc5c28aa789f433d +947949d1a5c9c4e895c89fba46cbba68 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap index bc0668408..7ed43424a 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -834f27a924de1acbd3cd94c0d7f10315 +5ce8009d3eb023e4b9c0a6e7fa4e6262 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap index bc0668408..7ed43424a 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -834f27a924de1acbd3cd94c0d7f10315 +5ce8009d3eb023e4b9c0a6e7fa4e6262 diff --git a/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap new file mode 100644 index 000000000..919f3fe7c --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +5dbfa134cc44abeb3ab6242fc182e48e diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap new file mode 100644 index 000000000..bdeeefc13 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +6ed7bf5d440599b3b10b37549a271fdf diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap new file mode 100644 index 000000000..08534cbd4 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap @@ -0,0 +1,23 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[0, ]" +0 0 k1 1 "[1, ]" +0 0 k2 1 "[2, ]" +0 0 k3 1 "[3, ]" +0 0 k4 1 "[4, ]" +0 0 k5 1 "[5, ]" +0 0 k6 1 "[6, ]" +0 0 k7 1 "[7, ]" +0 0 k8 1 "[8, ]" +0 0 k9 1 "[9, ]" +0 0 k10 1 "[10, ]" +0 0 k11 1 "[11, ]" +0 0 k12 1 "[12, ]" +0 0 k13 1 "[13, ]" +0 0 k14 1 "[14, ]" +0 1 k0 4 "[0, 1, 2, 3, ]" +0 1 k4 4 "[4, 5, 6, 7, ]" +0 1 k8 4 "[8, 9, 10, 
11, ]" +0 1 k12 3 "[12, 13, 14, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap new file mode 100644 index 000000000..e9ccc990f --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b5203f0df0036ebaa133dd77d63a00eb diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap new file mode 100644 index 000000000..a98803604 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap @@ -0,0 +1,26 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[0, ]" +0 0 k1 1 "[1, ]" +0 0 k2 1 "[2, ]" +0 0 k3 1 "[3, ]" +0 0 k4 1 "[4, ]" +0 0 k5 1 "[5, ]" +0 0 k6 1 "[6, ]" +0 0 k7 1 "[7, ]" +0 0 k8 1 "[8, ]" +0 0 k9 1 "[9, ]" +0 0 k10 1 "[10, ]" +0 0 k11 1 "[11, ]" +0 0 k12 1 "[12, ]" +0 0 k13 1 "[13, ]" +0 0 k14 1 "[14, ]" +0 0 k15 1 "[15, ]" +0 0 k16 1 "[16, ]" +0 1 k0 4 "[0, 1, 2, 3, ]" +0 1 k4 4 "[4, 5, 6, 7, ]" +0 1 k8 4 "[8, 9, 10, 11, ]" +0 1 k12 4 "[12, 13, 14, 15, ]" +0 1 k16 1 "[16, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap new file mode 100644 index 000000000..bb07123a9 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +95497d8579740868ee0bfc655b0bf782 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap new file mode 100644 index 000000000..8714af061 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +d565c2f7bbd9e13e12de40cfbbfba6bb diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap new file mode 100644 index 000000000..1bba99454 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k216 1 "[216, ]" +0 0 k217 1 "[217, ]" +0 0 k218 1 "[218, ]" +0 0 k219 1 "[219, ]" +0 0 k220 1 "[220, ]" +0 0 k221 1 "[221, ]" +0 0 k222 1 "[222, ]" +0 0 k223 1 "[223, ]" +0 0 k224 1 "[224, ]" +0 0 k225 1 "[225, ]" +0 0 k226 1 "[226, ]" +0 0 k227 1 "[227, ]" +0 0 k228 1 "[228, ]" +0 0 k229 1 "[229, ]" +0 0 k230 1 "[230, ]" +0 0 k231 1 "[231, ]" +0 0 k232 1 "[232, ]" +0 0 k233 1 "[233, ]" +0 0 k234 1 "[234, ]" +0 0 k235 1 "[235, ]" +0 0 k236 1 "[236, ]" +0 0 k237 1 "[237, ]" +0 0 k238 1 "[238, ]" +0 0 k239 1 "[239, ]" +0 0 k240 1 "[240, ]" +0 0 k241 1 "[241, ]" +0 0 k242 1 "[242, ]" +0 0 k243 1 "[243, ]" +0 0 k244 1 "[244, ]" +0 0 k245 1 "[245, ]" +0 0 k246 1 "[246, ]" +0 0 k247 1 "[247, ]" +0 0 k248 1 "[248, ]" +0 0 k249 1 "[249, ]" +0 0 k250 1 "[250, ]" +0 0 k251 1 "[251, ]" +0 0 k252 1 "[252, ]" +0 0 k253 1 "[253, ]" +0 0 k254 1 "[254, ]" +0 0 k255 1 "[255, ]" +0 1 k216 4 "[216, 217, 218, 219, ]" +0 1 k220 4 "[220, 221, 222, 223, ]" +0 1 k224 4 "[224, 225, 226, 227, ]" +0 1 k228 4 "[228, 229, 230, 231, ]" 
+0 1 k232 4 "[232, 233, 234, 235, ]" +0 1 k236 4 "[236, 237, 238, 239, ]" +0 1 k240 4 "[240, 241, 242, 243, ]" +0 1 k244 4 "[244, 245, 246, 247, ]" +0 1 k248 4 "[248, 249, 250, 251, ]" +0 1 k252 4 "[252, 253, 254, 255, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap new file mode 100644 index 000000000..6815ee609 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7cb503827ba17e9670296cc9531a1380 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap new file mode 100644 index 000000000..6860385ee --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b061f43e379e16f0617c05d3313d0078 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap new file mode 100644 index 000000000..f96b42b27 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +81fc9489d6b163935b97433477dea63b diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap new file mode 100644 index 000000000..82a7ce716 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap @@ -0,0 +1,20 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[3, 435, 583, 849, ]" +0 0 k1 1 "[35, 494, 693, 796, ]" +0 0 k2 1 "[76, 420, 526, 909, ]" +0 0 k3 1 "[133, 451, 653, 806, ]" +0 0 k4 1 "[131, 464, 656, 853, ]" +0 0 k5 1 "[61, 308, 701, 903, ]" +0 0 k6 1 "[144, 449, 674, 794, ]" +0 0 k7 1 "[182, 451, 735, 941, ]" +0 0 k8 1 "[6, 359, 679, 1003, ]" +0 0 k9 1 "[197, 418, 659, 904, ]" +0 0 k10 1 "[88, 297, 567, 800, ]" +0 0 k11 1 "[150, 309, 530, 946, ]" +0 0 k12 1 "[156, 466, 567, 892, ]" +0 0 k13 1 "[46, 425, 610, 807, ]" +0 0 k14 1 "[236, 433, 549, 891, ]" +0 0 k15 1 "[207, 472, 603, 974, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap 
b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap new file mode 100644 index 000000000..fd4beeca8 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7f8aa18d2b3a6422d55c03bede0563db diff --git a/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap new file mode 100644 index 000000000..fd4beeca8 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7f8aa18d2b3a6422d55c03bede0563db diff --git a/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap new file mode 100644 index 000000000..d055892f5 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b3e2de9020d9e0f3941bc3a179c795ba diff --git a/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap new file mode 100644 index 000000000..919f3fe7c --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +5dbfa134cc44abeb3ab6242fc182e48e diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap new file mode 100644 index 000000000..2b6805676 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +4fc800f49201a336295af0542fdf01ab diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap new file mode 100644 index 000000000..1802eb952 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +9343355bf535ed4a0c956df2b229d5e6 diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap new file mode 100644 index 000000000..5ef88bfb4 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +fd65ce7d96a07aafb0ef6cfb5bf016b8 From a7201ece04e1acc47801be7f7dd1c93388751718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 11:40:29 +0200 Subject: [PATCH 1710/1889] cargo fmt --- milli/src/heed_codec/facet/new/mod.rs | 5 +++- .../heed_codec/facet/new/ordered_f64_codec.rs | 3 +- .../search/facet/facet_distribution_iter.rs | 17 ++++++----- milli/src/search/facet/facet_range_search.rs | 30 ++++++++----------- .../src/search/facet/facet_sort_ascending.rs | 21 +++++++------ .../src/search/facet/facet_sort_descending.rs | 
22 +++++++------- milli/src/search/facet/filter.rs | 12 ++++---- milli/src/search/facet/mod.rs | 18 +++++------ milli/src/snapshot_tests.rs | 8 +++-- milli/src/update/facet/bulk.rs | 20 +++++++------ milli/src/update/facet/incremental.rs | 29 +++++++++--------- .../extract/extract_facet_string_docids.rs | 8 +++-- 12 files changed, 99 insertions(+), 94 deletions(-) diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs index 5ed6a61f6..04a545564 100644 --- a/milli/src/heed_codec/facet/new/mod.rs +++ b/milli/src/heed_codec/facet/new/mod.rs @@ -1,6 +1,9 @@ +use std::borrow::Cow; +use std::convert::TryFrom; +use std::marker::PhantomData; + use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; -use std::{borrow::Cow, convert::TryFrom, marker::PhantomData}; pub mod ordered_f64_codec; pub mod str_ref; diff --git a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs index 856a9c0d1..5ac9ffcfc 100644 --- a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs +++ b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs @@ -1,4 +1,5 @@ -use std::{borrow::Cow, convert::TryInto}; +use std::borrow::Cow; +use std::convert::TryInto; use heed::BytesDecode; diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 9e251103c..13ba28019 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,9 +1,10 @@ -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; -use heed::Result; -use roaring::RoaringBitmap; use std::ops::ControlFlow; +use heed::Result; +use roaring::RoaringBitmap; + use super::{get_first_facet_value, get_highest_level}; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, @@ -108,16 +109,16 @@ where #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex, - }; + use std::ops::ControlFlow; + use heed::BytesDecode; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use std::ops::ControlFlow; use super::iterate_over_facet_distribution; + use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::milli_snap; + use crate::search::facet::test::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 38c6acdec..20ad23a37 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -1,18 +1,12 @@ +use std::ops::{Bound, RangeBounds}; + use heed::BytesEncode; use roaring::RoaringBitmap; -use std::ops::Bound; -use std::ops::RangeBounds; -use crate::heed_codec::facet::new::FacetGroupValueCodec; -use crate::heed_codec::facet::new::FacetKey; -use crate::heed_codec::facet::new::FacetKeyCodec; -use crate::heed_codec::facet::new::MyByteSlice; +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; use crate::Result; -use super::get_first_facet_value; -use super::get_highest_level; -use super::get_last_facet_value; - pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t 
heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, @@ -258,17 +252,17 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec}, - search::facet::test::FacetIndex, - snapshot_tests::display_bitmap, - }; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; use std::ops::Bound; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + use super::find_docids_of_facet_within_bounds; + use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::new::FacetKeyCodec; + use crate::milli_snap; + use crate::search::facet::test::FacetIndex; + use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index e8618c302..b3cae5d28 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -1,10 +1,10 @@ -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, -}; use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, @@ -83,16 +83,15 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, - search::facet::{facet_sort_ascending::ascending_facet_sort, test::FacetIndex}, - snapshot_tests::display_bitmap, - }; - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; + use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::milli_snap; + use crate::search::facet::facet_sort_ascending::ascending_facet_sort; + use crate::search::facet::test::FacetIndex; + use crate::snapshot_tests::display_bitmap; + fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index b8bae2f9d..d68c9bdad 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -1,12 +1,12 @@ use std::ops::Bound; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, -}; use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, @@ -111,16 +111,16 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec, MyByteSlice}, - search::facet::{facet_sort_descending::descending_facet_sort, test::FacetIndex}, - snapshot_tests::display_bitmap, - }; - use rand::Rng; - use rand::SeedableRng; + use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; + use 
crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; + use crate::milli_snap; + use crate::search::facet::facet_sort_descending::descending_facet_sort; + use crate::search::facet::test::FacetIndex; + use crate::snapshot_tests::display_bitmap; + fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8); let mut txn = index.env.write_txn().unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 6ec626a5c..6a10b7097 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,18 +1,18 @@ -use either::Either; -pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; -use heed::types::DecodeIgnore; -use roaring::RoaringBitmap; use std::collections::HashSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; +use either::Either; +pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; +use heed::types::DecodeIgnore; +use roaring::RoaringBitmap; + +use super::facet_range_search; use crate::error::{Error, UserError}; use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; -use super::facet_range_search; - /// The maximum number of filters the filter AST can process. const MAX_FILTER_DEPTH: usize = 2000; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 42c0f065a..78cd8fd4b 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,10 +1,9 @@ use heed::types::ByteSlice; use heed::{BytesDecode, RoTxn}; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; - pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; mod facet_distribution; mod facet_distribution_iter; @@ -78,17 +77,18 @@ pub(crate) fn get_highest_level<'t>( #[cfg(test)] pub mod test { - use crate::update::FacetsUpdateIncremental; + use std::fmt::Display; + use std::marker::PhantomData; + use std::rc::Rc; + use heed::{BytesDecode, BytesEncode, Env, RwTxn}; use roaring::RoaringBitmap; - use std::{fmt::Display, marker::PhantomData, rc::Rc}; - use crate::{ - heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, - }, - snapshot_tests::display_bitmap, + use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; + use crate::snapshot_tests::display_bitmap; + use crate::update::FacetsUpdateIncremental; pub struct FacetIndex where diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f35bda2e7..d054e63b5 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -1,10 +1,12 @@ -use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; -use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; -use roaring::RoaringBitmap; use std::borrow::Cow; use std::fmt::Write; use std::path::Path; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; +use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; + #[track_caller] pub fn default_db_snapshot_settings_for_test(name: Option<&str>) 
-> (insta::Settings, String) { let mut settings = insta::Settings::clone_current(); diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index b8acffbaf..f93ee735e 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,3 +1,14 @@ +use std::cmp; +use std::fs::File; +use std::num::NonZeroUsize; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn}; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + use crate::error::InternalError; use crate::facet::FacetType; use crate::heed_codec::facet::new::{ @@ -5,15 +16,6 @@ use crate::heed_codec::facet::new::{ }; use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; use crate::{FieldId, Index, Result}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn}; -use log::debug; -use roaring::RoaringBitmap; -use std::cmp; -use std::fs::File; -use std::num::NonZeroUsize; -use time::OffsetDateTime; pub struct FacetsUpdateBulk<'i> { index: &'i Index, diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 712d7271c..3493db0f7 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,11 +1,12 @@ +use heed::types::ByteSlice; +use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; use crate::search::facet::get_highest_level; use crate::Result; -use heed::Error; -use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; -use roaring::RoaringBitmap; enum InsertionResult { InPlace, @@ -462,19 +463,19 @@ impl FacetsUpdateIncremental { #[cfg(test)] mod tests { - use crate::milli_snap; - use crate::{ - heed_codec::facet::new::{ - ordered_f64_codec::OrderedF64Codec, str_ref::StrRefCodec, FacetGroupValueCodec, - FacetKeyCodec, MyByteSlice, - }, - search::facet::{get_highest_level, test::FacetIndex}, - }; - use heed::{types::ByteSlice, BytesDecode, BytesEncode}; - use rand::Rng; - use rand::{seq::SliceRandom, SeedableRng}; + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode}; + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; + use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::new::str_ref::StrRefCodec; + use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; + use crate::milli_snap; + use crate::search::facet::get_highest_level; + use crate::search::facet::test::FacetIndex; + pub fn verify_structure_validity(index: &FacetIndex, field_id: u16) where for<'a> C: BytesDecode<'a> + BytesEncode<'a, EItem = >::DItem>, diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index fe42801e7..591f44c74 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,11 +1,13 @@ +use std::fs::File; +use std::io; + +use heed::BytesEncode; + use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::new::str_ref::StrRefCodec; use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; 
use crate::{FieldId, Result}; -use heed::BytesEncode; -use std::fs::File; -use std::io; /// Extracts the facet string and the documents ids where this facet string appear. /// From afdf87f6f75cb5977c2b1ecec91406951e3d3256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 12:51:54 +0200 Subject: [PATCH 1711/1889] Fix bugs in asc/desc criterion and facet indexing --- milli/src/search/criteria/asc_desc.rs | 3 ++- milli/src/update/index_documents/extract/mod.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index a5ea9b058..23dd860e1 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -197,9 +197,10 @@ fn facet_ordered<'t>( field_id, candidates.clone(), )?; + let string_iter = make_iter( rtxn, - index.facet_id_f64_docids.remap_key_type::>(), + index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, )?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 1e414458f..208dfc74d 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - merge_roaring_bitmaps, // TODO: check (cbo?) + merge_cbo_roaring_bitmaps, // TODO: check (cbo?) TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); From 079ed4a992df4db94d7c4b555b164cf89ab4bf1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 1 Sep 2022 12:57:29 +0200 Subject: [PATCH 1712/1889] Add more snapshots --- milli/src/search/facet/facet_range_search.rs | 12 ++++++------ .../{0.hash.snap => excluded_0.hash.snap} | 0 .../{1.hash.snap => excluded_1.hash.snap} | 0 .../filter_range_decreasing/included_0.hash.snap | 4 ++++ .../filter_range_decreasing/included_1.hash.snap | 4 ++++ .../{0.hash.snap => excluded_0.hash.snap} | 0 .../{1.hash.snap => excluded_1.hash.snap} | 0 .../filter_range_increasing/included_0.hash.snap | 4 ++++ .../filter_range_increasing/included_1.hash.snap | 4 ++++ .../{0.hash.snap => excluded_0.hash.snap} | 0 .../{1.hash.snap => excluded_1.hash.snap} | 0 .../filter_range_pinch/included_0.hash.snap | 4 ++++ .../filter_range_pinch/included_1.hash.snap | 4 ++++ milli/src/update/facet/incremental.rs | 10 +++++----- ...evel0_delete.hash.snap => after_delete.hash.snap} | 0 .../in_place_level0_delete/before_delete.hash.snap | 4 ++++ .../after_delete.hash.snap} | 0 .../before_delete.hash.snap} | 0 18 files changed, 39 insertions(+), 11 deletions(-) rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/{0.hash.snap => excluded_0.hash.snap} (100%) rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/{1.hash.snap => excluded_1.hash.snap} (100%) create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/{0.hash.snap => excluded_0.hash.snap} (100%) rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/{1.hash.snap => excluded_1.hash.snap} (100%) create mode 100644 
milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/{0.hash.snap => excluded_0.hash.snap} (100%) rename milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/{1.hash.snap => excluded_1.hash.snap} (100%) create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap rename milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/{in_place_level0_delete.hash.snap => after_delete.hash.snap} (100%) create mode 100644 milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap rename milli/src/update/facet/snapshots/incremental.rs/{shuffle_merge_string/2.hash.snap => shuffle_merge_string_and_delete/after_delete.hash.snap} (100%) rename milli/src/update/facet/snapshots/incremental.rs/{shuffle_merge_string/1.hash.snap => shuffle_merge_string_and_delete/before_delete.hash.snap} (100%) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 20ad23a37..039cd5c8d 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -318,7 +318,7 @@ mod tests { .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("included_{i}")); let mut results = String::new(); for i in 0..=255 { let i = i as f64; @@ -334,7 +334,7 @@ mod tests { .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("excluded_{i}")); txn.commit().unwrap(); } } @@ -361,7 +361,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("included_{i}")); let mut results = String::new(); @@ -380,7 +380,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("excluded_{i}")); txn.commit().unwrap(); } @@ -408,7 +408,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("included_{i}")); let mut results = String::new(); @@ -427,7 +427,7 @@ mod tests { results.push_str(&format!("{}\n", display_bitmap(&docids))); } - milli_snap!(results, i); + milli_snap!(results, format!("excluded_{i}")); txn.commit().unwrap(); } diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/0.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/1.hash.snap rename to 
milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap new file mode 100644 index 000000000..be0b06ded --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +57d35cfa419a19a1a1f8d7c8ef096e0f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap new file mode 100644 index 000000000..93fe17b0c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3dbe0547b42759795e9b16989df72cee diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/0.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/1.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap new file mode 100644 index 000000000..fa7242056 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ca59f20e043a4d52c49e15b10adf96bb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap new file mode 100644 index 000000000..a7611d8c1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +cb69e0fe10fb299bafe77514204379cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/0.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/1.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap new file mode 100644 index 000000000..db8a314b0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +b976551ceff412bfb2ec9bfbda320bbb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap new file mode 100644 index 000000000..2b82e07e8 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7620ca1a96882c7147d3fd996570f9b3 diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 3493db0f7..e32a6baf1 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -853,7 +853,7 @@ mod tests { txn.commit().unwrap(); } verify_structure_validity(&index, 0); - milli_snap!(format!("{index}")); + milli_snap!(format!("{index}"), "before_delete"); for &key in keys.iter() { verify_structure_validity(&index, 0); @@ -862,11 +862,11 @@ mod tests { txn.commit().unwrap(); } verify_structure_validity(&index, 0); - milli_snap!(format!("{index}")); + milli_snap!(format!("{index}"), "after_delete"); } #[test] - fn shuffle_merge_string() { + fn shuffle_merge_string_and_delete() { let index = FacetIndex::::new(4, 8); let mut keys = (1000..1064).into_iter().collect::>(); @@ -883,7 +883,7 @@ mod tests { txn.commit().unwrap(); } verify_structure_validity(&index, 0); - milli_snap!(format!("{index}"), 1); + milli_snap!(format!("{index}"), "before_delete"); for &key in keys.iter() { verify_structure_validity(&index, 0); @@ -892,7 +892,7 @@ mod tests { txn.commit().unwrap(); } verify_structure_validity(&index, 0); - milli_snap!(format!("{index}"), 2); + milli_snap!(format!("{index}"), "after_delete"); } // fuzz tests diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap similarity index 100% rename from milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/in_place_level0_delete.hash.snap rename to milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap new file mode 100644 index 000000000..c57ca72eb --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b17b2c4ec87a778aae07854c96c08b48 diff --git 
a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap similarity index 100% rename from milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/2.hash.snap rename to milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap similarity index 100% rename from milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string/1.hash.snap rename to milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap From 982efab88ff0f1e34346c55f7a113299b93a46e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 12:50:31 +0200 Subject: [PATCH 1713/1889] Fix encoding bugs in facet databases --- milli/src/heed_codec/facet/new/mod.rs | 47 +++++---------------------- 1 file changed, 8 insertions(+), 39 deletions(-) diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs index 04a545564..bcb2957fc 100644 --- a/milli/src/heed_codec/facet/new/mod.rs +++ b/milli/src/heed_codec/facet/new/mod.rs @@ -5,6 +5,8 @@ use std::marker::PhantomData; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; +use crate::CboRoaringBitmapCodec; + pub mod ordered_f64_codec; pub mod str_ref; // TODO: these codecs were quickly written and not fast/resilient enough @@ -35,6 +37,7 @@ impl<'a> FacetKey<Vec<u8>> { } } +#[derive(Debug)] pub struct FacetGroupValue { pub size: u8, pub bitmap: RoaringBitmap, @@ -56,7 +59,7 @@ where v.extend_from_slice(&value.field_id.to_be_bytes()); v.extend_from_slice(&[value.level]); - let bound = T::bytes_encode(&value.left_bound).unwrap(); + let bound = T::bytes_encode(&value.left_bound)?; v.extend_from_slice(&bound); Some(Cow::Owned(v)) @@ -69,9 +72,9 @@ where type DItem = FacetKey<T::DItem>; fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { - let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).unwrap()); + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); let level = bytes[2]; - let bound = T::bytes_decode(&bytes[3..]).unwrap(); + let bound = T::bytes_decode(&bytes[3..])?; Some(FacetKey { field_id: fid, level, left_bound: bound }) } } @@ -83,7 +86,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { let mut v = vec![]; v.push(value.size); - value.bitmap.serialize_into(&mut v).unwrap(); + CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); Some(Cow::Owned(v)) } } @@ -91,7 +94,7 @@ impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { type DItem = FacetGroupValue; fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { let size = bytes[0]; - let bitmap = RoaringBitmap::deserialize_from(&bytes[1..]).unwrap(); + let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; Some(FacetGroupValue { size, bitmap }) } } @@ -115,37 +118,3 @@ impl<'a> BytesDecode<'a> for MyByteSlice { Some(bytes) } } - -// I won't need these ones anymore -// pub struct U16Codec; -// impl<'a> BytesEncode<'a> for U16Codec { -// type EItem = u16; - -// fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { -// Some(Cow::Owned(item.to_be_bytes().to_vec())) -// } -// } -// impl<'a> BytesDecode<'a> for U16Codec { -// type DItem = u16; - -// 
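A note on the pattern adopted by PATCH 1713 above: every .unwrap() inside bytes_encode/bytes_decode becomes ? or .ok()?, so a malformed key or value now makes the codec return None and lets heed report a decoding error instead of panicking the indexer. A minimal standalone sketch of that pattern follows; the Key type and decode_key function are illustrative stand-ins, not milli's API.

    use std::convert::TryFrom;

    struct Key {
        field_id: u16,
        level: u8,
        left_bound: Vec<u8>,
    }

    // Fallible decoding: any short or malformed input yields None where the
    // previous .unwrap() calls would have panicked.
    fn decode_key(bytes: &[u8]) -> Option<Key> {
        let field_id = u16::from_be_bytes(<[u8; 2]>::try_from(bytes.get(0..2)?).ok()?);
        let level = *bytes.get(2)?;
        let left_bound = bytes.get(3..)?.to_vec();
        Some(Key { field_id, level, left_bound })
    }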
fn bytes_decode(bytes: &'a [u8]) -> Option { -// Some(u16::from_be_bytes(bytes[0..=1].try_into().unwrap())) -// } -// } - -// pub struct StrCodec; -// impl<'a> BytesEncode<'a> for StrCodec { -// type EItem = &'a str; - -// fn bytes_encode(item: &'a &'a str) -> Option> { -// Some(Cow::Borrowed(item.as_bytes())) -// } -// } -// impl<'a> BytesDecode<'a> for StrCodec { -// type DItem = &'a str; - -// fn bytes_decode(bytes: &'a [u8]) -> Option { -// let s = std::str::from_utf8(bytes).unwrap(); -// Some(s) -// } -// } From 3d145d7f48b35739c02e3cf3a44c624cf94ce8d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 12:51:40 +0200 Subject: [PATCH 1714/1889] Merge the two _faceted_documents_ids methods into one --- milli/src/index.rs | 71 ++++++++------------------- milli/src/search/criteria/asc_desc.rs | 7 ++- milli/src/snapshot_tests.rs | 5 +- milli/src/update/clear_documents.rs | 16 ++++-- milli/src/update/delete_documents.rs | 22 +++++++-- 5 files changed, 59 insertions(+), 62 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 0561a77ac..40e78bf10 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -12,6 +12,7 @@ use rstar::RTree; use time::OffsetDateTime; use crate::error::{InternalError, UserError}; +use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::new::str_ref::StrRefCodec; @@ -780,68 +781,38 @@ impl Index { /* faceted documents ids */ - /// Writes the documents ids that are faceted with numbers under this field id. - pub(crate) fn put_number_faceted_documents_ids( + /// Writes the documents ids that are faceted under this field id for the given facet type. + pub fn put_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, + facet_type: FacetType, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = - [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); + let key = match facet_type { + FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, + FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, + }; + let mut buffer = vec![0u8; key.len() + size_of::()]; + buffer[..key.len()].copy_from_slice(key.as_bytes()); + buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } - /// Retrieve all the documents ids that faceted with numbers under this field id. - pub fn number_faceted_documents_ids( + /// Retrieve all the documents ids that are faceted under this field id for the given facet type. + pub fn faceted_documents_ids( &self, rtxn: &RoTxn, field_id: FieldId, + facet_type: FacetType, ) -> heed::Result { - let mut buffer = - [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); - match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? 
{ - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()), - } - } - - /// Writes the documents ids that are faceted with strings under this field id. - pub(crate) fn put_string_faceted_documents_ids( - &self, - wtxn: &mut RwTxn, - field_id: FieldId, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - let mut buffer = - [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); - self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) - } - - /// Retrieve all the documents ids that faceted with strings under this field id. - pub fn string_faceted_documents_ids( - &self, - rtxn: &RoTxn, - field_id: FieldId, - ) -> heed::Result { - let mut buffer = - [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); + let key = match facet_type { + FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, + FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, + }; + let mut buffer = vec![0u8; key.len() + size_of::()]; + buffer[..key.len()].copy_from_slice(key.as_bytes()); + buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 23dd860e1..ccf66889e 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -6,6 +6,7 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::facet::FacetType; use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; @@ -62,8 +63,10 @@ impl<'t> AscDesc<'t> { let field_id = fields_ids_map.id(&field_name); let faceted_candidates = match field_id { Some(field_id) => { - let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; - let string_faceted = index.string_faceted_documents_ids(rtxn, field_id)?; + let number_faceted = + index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; + let string_faceted = + index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; number_faceted | string_faceted } None => RoaringBitmap::default(), diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index d054e63b5..57fd2e5fe 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -4,6 +4,7 @@ use std::path::Path; use roaring::RoaringBitmap; +use crate::facet::FacetType; use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; @@ -370,7 +371,7 @@ pub fn snap_number_faceted_documents_ids(index: &Index) -> String { let mut snap = String::new(); for field_id in fields_ids_map.ids() { let number_faceted_documents_ids = - index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); + 
index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) .unwrap(); } @@ -383,7 +384,7 @@ pub fn snap_string_faceted_documents_ids(index: &Index) -> String { let mut snap = String::new(); for field_id in fields_ids_map.ids() { let string_faceted_documents_ids = - index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); + index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) .unwrap(); } diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ba59c14cf..7d89ca89a 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,7 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; +use crate::{facet::FacetType, ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -55,8 +55,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean all the faceted documents ids. for field_id in faceted_fields { - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::Number, + &empty_roaring, + )?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::String, + &empty_roaring, + )?; } // Clear the other databases. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 531fd2b74..ffa63f0a7 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -461,9 +461,15 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // Remove the documents ids from the faceted documents ids. for field_id in self.index.faceted_fields_ids(self.wtxn)? 
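Both sides of PATCH 1714 above, the getter and the setter, now build one and the same main-database key: the prefix string selected by the facet type, followed by the field id in big-endian bytes. A small sketch of that layout; the prefix parameter stands in for the main_key constants used in the patch.

    // Key layout for the per-field faceted-documents-ids entries:
    // [prefix bytes][field_id as big-endian u16].
    fn faceted_documents_ids_key(prefix: &str, field_id: u16) -> Vec<u8> {
        let mut key = Vec::with_capacity(prefix.len() + 2);
        key.extend_from_slice(prefix.as_bytes());
        key.extend_from_slice(&field_id.to_be_bytes());
        key
    }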
{ // Remove docids from the number faceted documents ids - let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::Number)?; docids -= &self.to_delete_docids; - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::Number, + &docids, + )?; remove_docids_from_field_id_docid_facet_value( self.wtxn, @@ -474,9 +480,15 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { )?; // Remove docids from the string faceted documents ids - let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::String)?; docids -= &self.to_delete_docids; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::String, + &docids, + )?; remove_docids_from_field_id_docid_facet_value( self.wtxn, @@ -648,7 +660,7 @@ fn remove_docids_from_facet_id_docids<'a>( if !modified { return Ok(()); } - let builder = FacetsUpdateBulk::new(index, facet_type); + let builder = FacetsUpdateBulk::new_not_updating_level_0(index, facet_type); builder.execute(wtxn)?; Ok(()) From 9b55e582cd70c4f64e3739323255e91fb433be44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 12:52:05 +0200 Subject: [PATCH 1715/1889] Add FacetsUpdate type that wraps incremental and bulk indexing methods --- milli/src/update/facet/bulk.rs | 156 ++++++++++++------ milli/src/update/facet/mod.rs | 88 ++++++++++ .../default/facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../facet_id_f64_docids.hash.snap | 2 +- .../default/facet_id_string_docids.hash.snap | 2 +- .../facet_id_string_docids.hash.snap | 2 +- .../src/update/index_documents/typed_chunk.rs | 85 ++-------- 11 files changed, 216 insertions(+), 129 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index f93ee735e..0a4b7db45 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,10 +1,11 @@ +use std::borrow::Cow; use std::cmp; use std::fs::File; use std::num::NonZeroUsize; use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn}; use log::debug; use roaring::RoaringBitmap; use time::OffsetDateTime; @@ -14,21 +15,27 @@ use crate::facet::FacetType; use crate::heed_codec::facet::new::{ FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, }; -use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; -use crate::{FieldId, Index, Result}; +use crate::update::index_documents::{ + create_writer, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, +}; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, level_group_size: usize, min_level_size: usize, facet_type: FacetType, + // None if level 0 does not need to be updated + new_data: 
Option>, } impl<'i> FacetsUpdateBulk<'i> { - pub fn new(index: &'i Index, facet_type: FacetType) -> FacetsUpdateBulk<'i> { + pub fn new( + index: &'i Index, + facet_type: FacetType, + new_data: grenad::Reader, + ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, database: match facet_type { @@ -39,11 +46,31 @@ impl<'i> FacetsUpdateBulk<'i> { index.facet_id_f64_docids.remap_key_type::>() } }, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, level_group_size: 4, min_level_size: 5, facet_type, + new_data: Some(new_data), + } + } + + pub fn new_not_updating_level_0( + index: &'i Index, + facet_type: FacetType, + ) -> FacetsUpdateBulk<'i> { + FacetsUpdateBulk { + index, + database: match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }, + level_group_size: 4, + min_level_size: 5, + facet_type, + new_data: None, } } @@ -70,39 +97,84 @@ impl<'i> FacetsUpdateBulk<'i> { } #[logging_timer::time("FacetsUpdateBulk::{}")] - pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + // We get the faceted fields to be able to create the facet levels. let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - for &field_id in faceted_fields.iter() { self.clear_levels(wtxn, field_id)?; } + self.update_level0(wtxn)?; - let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?; + // let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?; for &field_id in faceted_fields.iter() { - let (level_readers, all_docids) = - self.compute_levels_for_field_id(field_id, &nested_wtxn)?; + let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; - let put_docids_fn = match self.facet_type { - FacetType::Number => Index::put_number_faceted_documents_ids, - FacetType::String => Index::put_string_faceted_documents_ids, - }; - put_docids_fn(&self.index, &mut nested_wtxn, field_id, &all_docids)?; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &all_docids)?; for level_reader in level_readers { - // TODO: append instead of write with merge - write_into_lmdb_database( - &mut nested_wtxn, - *self.database.as_polymorph(), - level_reader, - |_, _| { - Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? - }, - )?; + let mut cursor = level_reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + let key = FacetKeyCodec::::bytes_decode(k).unwrap(); + let value = FacetGroupValueCodec::bytes_decode(v).unwrap(); + println!("inserting {key:?} {value:?}"); + + self.database.remap_types::().put(wtxn, k, v)?; + } + } + } + + Ok(()) + } + + fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { + let new_data = match self.new_data.take() { + Some(x) => x, + None => return Ok(()), + }; + if self.database.is_empty(wtxn)? { + let mut buffer = Vec::new(); + let mut database = self.database.iter_mut(wtxn)?.remap_types::(); + let mut cursor = new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + if valid_lmdb_key(key) { + buffer.clear(); + // the group size for level 0 + buffer.push(1); + // then we extend the buffer with the docids bitmap + buffer.extend_from_slice(value); + unsafe { database.append(key, &buffer)? }; + } + } + } else { + let mut buffer = Vec::new(); + let database = self.database.remap_types::(); + + let mut cursor = new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + if valid_lmdb_key(key) { + buffer.clear(); + // the group size for level 0 + buffer.push(1); + // then we extend the buffer with the docids bitmap + match database.get(wtxn, key)? { + Some(prev_value) => { + let old_bitmap = &prev_value[1..]; + CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], + &mut buffer, + )?; + } + None => { + buffer.extend_from_slice(value); + } + }; + database.put(wtxn, key, &buffer)?; + } } } @@ -114,16 +186,14 @@ impl<'i> FacetsUpdateBulk<'i> { field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { - let algo = FacetsUpdateBulkAlgorithm { + // TODO: first check whether there is anything in level 0 + let algo = ComputeHigherLevels { rtxn: txn, db: &self.database, field_id, level_group_size: self.level_group_size, min_level_size: self.min_level_size, - chunk_compression_type: self.chunk_compression_type, - chunk_compression_level: self.chunk_compression_level, }; - // TODO: first check whether there is anything in level 0 let mut all_docids = RoaringBitmap::new(); let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| { @@ -138,16 +208,14 @@ impl<'i> FacetsUpdateBulk<'i> { } } -pub struct FacetsUpdateBulkAlgorithm<'t> { +struct ComputeHigherLevels<'t> { rtxn: &'t heed::RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, field_id: u16, level_group_size: usize, min_level_size: usize, } -impl<'t> FacetsUpdateBulkAlgorithm<'t> { +impl<'t> ComputeHigherLevels<'t> { fn read_level_0( &self, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, @@ -215,11 +283,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> { // once we have computed `level_group_size` elements, we give the left bound // of those elements, and their bitmaps, to the level above - let mut cur_writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); + let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); let mut cur_writer_len = 0; let mut group_sizes = vec![]; @@ -259,7 +323,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> { Ok(()) })?; // don't forget to insert the leftover elements into the writer as well - if !bitmaps.is_empty() && cur_writer_len >= self.level_group_size * self.min_level_size { + if !bitmaps.is_empty() && cur_writer_len >= self.min_level_size { let left_bound = left_bounds.first().unwrap(); handle_group(&bitmaps, left_bound)?; for ((bitmap, left_bound), group_size) in @@ -274,7 +338,7 @@ impl<'t> FacetsUpdateBulkAlgorithm<'t> { cur_writer_len += 1; } } - if cur_writer_len > self.level_group_size * self.min_level_size { + if cur_writer_len > self.min_level_size { sub_writers.push(writer_into_reader(cur_writer)?); } return Ok(sub_writers); @@ -315,9 +379,9 @@ mod tests { documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); } let documents = documents_batch_reader_from_objects(documents); - + dbg!(); index.add_documents(documents).unwrap(); - + dbg!(); db_snap!(index, facet_id_f64_docids, 
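The update_level0 function above writes values with a fixed shape: one group-size byte, which is always 1 at level 0, followed by the serialized docids bitmap; and the empty-database branch can use LMDB's append mode because grenad readers yield keys in sorted order. A sketch of the value layout, taking already-serialized bitmap bytes as input.

    // Level-0 value layout: [group size = 1][CBO-serialized RoaringBitmap].
    fn level0_value(serialized_bitmap: &[u8]) -> Vec<u8> {
        let mut buffer = Vec::with_capacity(1 + serialized_bitmap.len());
        buffer.push(1); // the group size is always 1 at level 0
        buffer.extend_from_slice(serialized_bitmap);
        buffer
    }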
name); }; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index ecde3a248..00964a406 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,2 +1,90 @@ +use std::{collections::HashMap, fs::File}; + +use grenad::CompressionType; +use heed::BytesDecode; +use roaring::RoaringBitmap; + +use crate::{ + facet::FacetType, + heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}, + CboRoaringBitmapCodec, FieldId, Index, Result, +}; + +use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; + pub mod bulk; pub mod incremental; + +pub struct FacetsUpdate<'i> { + index: &'i Index, + database: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>, + level_group_size: u8, + max_level_group_size: u8, + min_level_size: u8, + facet_type: FacetType, + new_data: grenad::Reader<File>, +} +impl<'i> FacetsUpdate<'i> { + pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self { + let database = match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>() + } + }; + Self { + index, + database, + level_group_size: 4, + max_level_group_size: 8, + min_level_size: 5, + facet_type, + new_data, + } + } + + // /// The number of elements from the level below that are represented by a single element in the level above + // /// + // /// This setting is always greater than or equal to 2. + // pub fn level_group_size(&mut self, value: u8) -> &mut Self { + // self.level_group_size = std::cmp::max(value, 2); + // self + // } + + // /// The minimum number of elements that a level is allowed to have. + // pub fn min_level_size(&mut self, value: u8) -> &mut Self { + // self.min_level_size = std::cmp::max(value, 1); + // self + // } + + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + if self.database.is_empty(wtxn)? { + let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data); + bulk_update.execute(wtxn)?; + } else { + let indexer = FacetsUpdateIncremental::new(self.database); + + let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default(); + + let mut cursor = self.new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let key = + FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let docids = + CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + + for (field_id, new_docids) in new_faceted_docids { + let mut docids = + self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; + docids |= new_docids; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; + } + } + Ok(()) + } +} diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ 
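The incremental branch of FacetsUpdate::execute above does two things per key: it inserts the docids into the facet tree and it accumulates them per field id, so that the faceted-documents-ids bitmaps are written once at the end rather than once per key. The accumulation step, isolated as a runnable sketch against the same roaring crate milli uses:

    use std::collections::HashMap;

    use roaring::RoaringBitmap;

    // Union the docids of one new entry into the per-field accumulator,
    // creating an empty bitmap the first time a field id is seen.
    fn accumulate(acc: &mut HashMap<u16, RoaringBitmap>, field_id: u16, docids: &RoaringBitmap) {
        *acc.entry(field_id).or_default() |= docids;
    }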
b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap index b990c31c7..960843592 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -947949d1a5c9c4e895c89fba46cbba68 +07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap index 7ed43424a..574a3c393 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -5ce8009d3eb023e4b9c0a6e7fa4e6262 +3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap index 7ed43424a..574a3c393 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -5ce8009d3eb023e4b9c0a6e7fa4e6262 +3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index df98724da..16784bd92 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,5 +1,4 @@ use std::borrow::Cow; -use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io; @@ -14,12 +13,12 @@ use super::helpers::{ valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; -use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; +use crate::facet::FacetType; +use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::as_cloneable_grenad; -use crate::update::FacetsUpdateIncremental; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, - Index, Result, + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, + Result, }; pub(crate) enum TypedChunk { @@ -138,78 +137,14 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { - // merge cbo roaring bitmaps is not the correct merger because the data in the DB - // is FacetGroupValue and not RoaringBitmap - // so I need to create my own merging function - - // facet_id_string_docids is encoded as: - 
// key: FacetKeyCodec - // value: CboRoaringBitmapCodec - // basically - - // TODO: a condition saying "if I have more than 1/50th of the DB to add, - // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50, - // it is a ratio I determine empirically - - // for now I only do it incrementally, to see if things work - let indexer = FacetsUpdateIncremental::new( - index.facet_id_f64_docids.remap_key_type::>(), - ); - - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = facet_id_f64_docids_iter.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let key = - FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; - let docids = - CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; - } - for (field_id, new_docids) in new_faceted_docids { - let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?; - docids |= new_docids; - index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?; - } - + TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { + let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); + indexer.execute(wtxn)?; is_merged_database = true; } - TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { - // merge cbo roaring bitmaps is not the correct merger because the data in the DB - // is FacetGroupValue and not RoaringBitmap - // so I need to create my own merging function - - // facet_id_string_docids is encoded as: - // key: FacetKeyCodec - // value: CboRoaringBitmapCodec - // basically - - // TODO: a condition saying "if I have more than 1/50th of the DB to add, - // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50, - // it is a ratio I determine empirically - - // for now I only do it incrementally, to see if things work - let indexer = FacetsUpdateIncremental::new( - index.facet_id_string_docids.remap_key_type::>(), - ); - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = facet_id_string_docids.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? 
{ - let key = - FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; - let docids = - CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; - } - for (field_id, new_docids) in new_faceted_docids { - let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?; - docids |= new_docids; - index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?; - } + TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { + let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); + indexer.execute(wtxn)?; is_merged_database = true; } TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { From 485a72306d6e599f5d602887a0fa02822087527d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 13:01:36 +0200 Subject: [PATCH 1716/1889] Refactor facet-related codecs --- milli/src/heed_codec/facet/mod.rs | 134 +++++++++++++++--- milli/src/heed_codec/facet/new/mod.rs | 120 ---------------- .../facet/{new => }/ordered_f64_codec.rs | 0 .../src/heed_codec/facet/{new => }/str_ref.rs | 0 milli/src/index.rs | 17 +-- milli/src/search/criteria/asc_desc.rs | 6 +- milli/src/search/distinct/facet_distinct.rs | 6 +- milli/src/search/facet/facet_distribution.rs | 15 +- .../search/facet/facet_distribution_iter.rs | 16 +-- milli/src/search/facet/facet_range_search.rs | 32 ++--- .../src/search/facet/facet_sort_ascending.rs | 20 +-- .../src/search/facet/facet_sort_descending.rs | 34 ++--- milli/src/search/facet/filter.rs | 16 ++- milli/src/search/facet/mod.rs | 22 +-- milli/src/snapshot_tests.rs | 6 +- milli/src/update/delete_documents.rs | 6 +- milli/src/update/facet/bulk.rs | 32 ++--- milli/src/update/facet/incremental.rs | 52 +++---- milli/src/update/facet/mod.rs | 27 ++-- .../extract/extract_facet_number_docids.rs | 8 +- .../extract/extract_facet_string_docids.rs | 8 +- .../word_pair_proximity_docids.hash.snap | 4 + 22 files changed, 280 insertions(+), 301 deletions(-) delete mode 100644 milli/src/heed_codec/facet/new/mod.rs rename milli/src/heed_codec/facet/{new => }/ordered_f64_codec.rs (100%) rename milli/src/heed_codec/facet/{new => }/str_ref.rs (100%) create mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index e145e311e..299aeceb4 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,25 +1,19 @@ -// mod facet_level_value_f64_codec; -// mod facet_level_value_u32_codec; -// mod facet_string_level_zero_codec; -// mod facet_string_level_zero_value_codec; -// mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +mod ordered_f64_codec; +mod str_ref; -pub mod new; - -use heed::types::OwnedType; - -// pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; -// pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; -// pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; -// pub use self::facet_string_level_zero_value_codec::{ -// decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, -// }; -// pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; 
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; -use crate::BEU16; +pub use self::ordered_f64_codec::OrderedF64Codec; +pub use self::str_ref::StrRefCodec; +use crate::{CboRoaringBitmapCodec, BEU16}; +use heed::types::OwnedType; +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; +use std::borrow::Cow; +use std::convert::TryFrom; +use std::marker::PhantomData; pub type FieldIdCodec = OwnedType<BEU16>; @@ -32,3 +26,109 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct FacetGroupKey<T> { + pub field_id: u16, + pub level: u8, + pub left_bound: T, +} +impl<'a> FacetGroupKey<&'a [u8]> { + pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetGroupKey<Vec<u8>> { + pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + +#[derive(Debug)] +pub struct FacetGroupValue { + pub size: u8, + pub bitmap: RoaringBitmap, +} + +pub struct FacetGroupKeyCodec<T> { + _phantom: PhantomData<T>, +} + +impl<'a, T> heed::BytesEncode<'a> for FacetGroupKeyCodec<T> +where + T: BytesEncode<'a>, + T::EItem: Sized, +{ + type EItem = FacetGroupKey<T::EItem>; + + fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { + let mut v = vec![]; + v.extend_from_slice(&value.field_id.to_be_bytes()); + v.extend_from_slice(&[value.level]); + + let bound = T::bytes_encode(&value.left_bound)?; + v.extend_from_slice(&bound); + + Some(Cow::Owned(v)) + } +} +impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T> +where + T: BytesDecode<'a>, +{ + type DItem = FacetGroupKey<T::DItem>; + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); + let level = bytes[2]; + let bound = T::bytes_decode(&bytes[3..])?; + Some(FacetGroupKey { field_id: fid, level, left_bound: bound }) + } +} + +pub struct FacetGroupValueCodec; +impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { + type EItem = FacetGroupValue; + + fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { + let mut v = vec![]; + v.push(value.size); + CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); + Some(Cow::Owned(v)) + } +} +impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { + type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let size = bytes[0]; + let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; + Some(FacetGroupValue { size, bitmap }) + } +} + +pub struct ByteSliceRef; + +impl<'a> BytesEncode<'a> for ByteSliceRef { + type EItem = &'a [u8]; + + fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { + Some(Cow::Borrowed(item)) + } +} + +impl<'a> BytesDecode<'a> for ByteSliceRef { + type DItem = &'a [u8]; + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + Some(bytes) + } +} diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs deleted file mode 100644 index bcb2957fc..000000000 --- a/milli/src/heed_codec/facet/new/mod.rs +++ /dev/null @@ -1,120 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryFrom; -use std::marker::PhantomData; - -use heed::{BytesDecode, BytesEncode}; -use roaring::RoaringBitmap; - -use crate::CboRoaringBitmapCodec; - -pub mod ordered_f64_codec; -pub mod str_ref; -// TODO: these codecs were quickly written and not fast/resilient enough - -#[derive(Debug, 
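One property of the FacetGroupKeyCodec layout defined above is worth spelling out: LMDB orders keys by raw bytes, so encoding the field id first, big-endian, then the level, keeps the database grouped by field, then by level, then by left bound, which is what the range-search and sort iterators rely on. A self-contained sketch of the encoding and the ordering it preserves:

    // [field_id as big-endian u16][level][left bound bytes]
    fn encode(field_id: u16, level: u8, bound: &[u8]) -> Vec<u8> {
        let mut v = Vec::with_capacity(3 + bound.len());
        v.extend_from_slice(&field_id.to_be_bytes());
        v.push(level);
        v.extend_from_slice(bound);
        v
    }

    fn main() {
        // Big-endian field ids compare like their numeric values; with
        // little-endian bytes, field 256 would sort before field 1.
        assert!(encode(256, 0, b"") > encode(255, 1, b"x"));
        assert!(encode(3, 0, b"a") < encode(3, 0, b"b"));
    }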
Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub struct FacetKey { - pub field_id: u16, - pub level: u8, - pub left_bound: T, -} -impl<'a> FacetKey<&'a [u8]> { - pub fn into_owned(self) -> FacetKey> { - FacetKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.to_vec(), - } - } -} - -impl<'a> FacetKey> { - pub fn as_ref(&self) -> FacetKey<&[u8]> { - FacetKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.as_slice(), - } - } -} - -#[derive(Debug)] -pub struct FacetGroupValue { - pub size: u8, - pub bitmap: RoaringBitmap, -} - -pub struct FacetKeyCodec { - _phantom: PhantomData, -} - -impl<'a, T> heed::BytesEncode<'a> for FacetKeyCodec -where - T: BytesEncode<'a>, - T::EItem: Sized, -{ - type EItem = FacetKey; - - fn bytes_encode(value: &'a Self::EItem) -> Option> { - let mut v = vec![]; - v.extend_from_slice(&value.field_id.to_be_bytes()); - v.extend_from_slice(&[value.level]); - - let bound = T::bytes_encode(&value.left_bound)?; - v.extend_from_slice(&bound); - - Some(Cow::Owned(v)) - } -} -impl<'a, T> heed::BytesDecode<'a> for FacetKeyCodec -where - T: BytesDecode<'a>, -{ - type DItem = FacetKey; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); - let level = bytes[2]; - let bound = T::bytes_decode(&bytes[3..])?; - Some(FacetKey { field_id: fid, level, left_bound: bound }) - } -} - -pub struct FacetGroupValueCodec; -impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { - type EItem = FacetGroupValue; - - fn bytes_encode(value: &'a Self::EItem) -> Option> { - let mut v = vec![]; - v.push(value.size); - CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); - Some(Cow::Owned(v)) - } -} -impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { - type DItem = FacetGroupValue; - fn bytes_decode(bytes: &'a [u8]) -> Option { - let size = bytes[0]; - let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; - Some(FacetGroupValue { size, bitmap }) - } -} - -// TODO: get rid of this codec as it is named confusingly + should really be part of heed -// or even replace the current ByteSlice codec -pub struct MyByteSlice; - -impl<'a> BytesEncode<'a> for MyByteSlice { - type EItem = &'a [u8]; - - fn bytes_encode(item: &'a Self::EItem) -> Option> { - Some(Cow::Borrowed(item)) - } -} - -impl<'a> BytesDecode<'a> for MyByteSlice { - type DItem = &'a [u8]; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - Some(bytes) - } -} diff --git a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs b/milli/src/heed_codec/facet/ordered_f64_codec.rs similarity index 100% rename from milli/src/heed_codec/facet/new/ordered_f64_codec.rs rename to milli/src/heed_codec/facet/ordered_f64_codec.rs diff --git a/milli/src/heed_codec/facet/new/str_ref.rs b/milli/src/heed_codec/facet/str_ref.rs similarity index 100% rename from milli/src/heed_codec/facet/new/str_ref.rs rename to milli/src/heed_codec/facet/str_ref.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index 40e78bf10..66a53d98c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -14,15 +14,10 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::str_ref::StrRefCodec; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec}; -use crate::heed_codec::facet::{ - // 
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, - FieldIdCodec, -}; +use crate::heed_codec::facet::OrderedF64Codec; +use crate::heed_codec::facet::StrRefCodec; +use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec}; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, @@ -130,9 +125,9 @@ pub struct Index { pub facet_id_exists_docids: Database, /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. - pub facet_id_f64_docids: Database, FacetGroupValueCodec>, + pub facet_id_f64_docids: Database, FacetGroupValueCodec>, /// Maps the facet field id and ranges of strings with the docids that corresponds to them. - pub facet_id_string_docids: Database, FacetGroupValueCodec>, + pub facet_id_string_docids: Database, FacetGroupValueCodec>, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index ccf66889e..2908f0e78 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; -use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::facet_sort_descending::descending_facet_sort; @@ -196,14 +196,14 @@ fn facet_ordered<'t>( let number_iter = make_iter( rtxn, - index.facet_id_f64_docids.remap_key_type::>(), + index.facet_id_f64_docids.remap_key_type::>(), field_id, candidates.clone(), )?; let string_iter = make_iter( rtxn, - index.facet_id_string_docids.remap_key_type::>(), + index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, )?; diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 4a4815775..b9d584eb6 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::error::InternalError; -use crate::heed_codec::facet::new::FacetKey; +use crate::heed_codec::facet::FacetGroupKey; use crate::heed_codec::facet::*; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; @@ -48,7 +48,7 @@ impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { self.index .facet_id_string_docids - .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key }) + .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) .map(|opt| opt.map(|v| v.bitmap)) } @@ -56,7 +56,7 @@ impl<'a> FacetDistinctIter<'a> { // get facet docids on level 0 self.index .facet_id_f64_docids - .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key }) + .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) .map(|opt| opt.map(|v| v.bitmap)) } diff --git 
a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs
index c7619c609..10b995d97 100644
--- a/milli/src/search/facet/facet_distribution.rs
+++ b/milli/src/search/facet/facet_distribution.rs
@@ -8,12 +8,11 @@ use roaring::RoaringBitmap;
 
 use crate::error::UserError;
 use crate::facet::FacetType;
-use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
-use crate::heed_codec::facet::new::str_ref::StrRefCodec;
-use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice};
+use crate::heed_codec::facet::OrderedF64Codec;
+use crate::heed_codec::facet::StrRefCodec;
+use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec};
 use crate::search::facet::facet_distribution_iter;
-// use crate::search::facet::FacetStringIter;
 use crate::{FieldId, Index, Result};
 
 /// The default number of values by facets that will
@@ -138,7 +137,7 @@ impl<'a> FacetDistribution<'a> {
     ) -> heed::Result<()> {
         facet_distribution_iter::iterate_over_facet_distribution(
             self.rtxn,
-            self.index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
+            self.index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
             field_id,
             candidates,
             |facet_key, nbr_docids| {
@@ -161,7 +160,7 @@ impl<'a> FacetDistribution<'a> {
     ) -> heed::Result<()> {
         facet_distribution_iter::iterate_over_facet_distribution(
             self.rtxn,
-            self.index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
+            self.index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
             field_id,
             candidates,
             |facet_key, nbr_docids| {
@@ -191,7 +190,7 @@ impl<'a> FacetDistribution<'a> {
         let iter = db
             .as_polymorph()
             .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
-            .remap_types::<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>();
+            .remap_types::<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>();
 
         for result in iter {
             let (key, value) = result?;
@@ -206,7 +205,7 @@ impl<'a> FacetDistribution<'a> {
             .facet_id_string_docids
             .as_polymorph()
             .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
-            .remap_types::<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>();
+            .remap_types::<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>();
 
         // TODO: get the original value of the facet somewhere (in the documents DB?)
         for result in iter {
diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs
index 13ba28019..151304029 100644
--- a/milli/src/search/facet/facet_distribution_iter.rs
+++ b/milli/src/search/facet/facet_distribution_iter.rs
@@ -4,11 +4,11 @@ use heed::Result;
 use roaring::RoaringBitmap;
 
 use super::{get_first_facet_value, get_highest_level};
-use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice};
+use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupValueCodec, FacetGroupKeyCodec};
 
 pub fn iterate_over_facet_distribution<'t, CB>(
     rtxn: &'t heed::RoTxn<'t>,
-    db: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
+    db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
     field_id: u16,
     candidates: &RoaringBitmap,
     callback: CB,
@@ -18,9 +18,9 @@ where
 {
     let mut fd = FacetDistribution { rtxn, db, field_id, callback };
     let highest_level =
-        get_highest_level(rtxn, db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), field_id)?;
+        get_highest_level(rtxn, db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(), field_id)?;
 
-    if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id)? {
+    if let Some(first_bound) = get_first_facet_value::<ByteSliceRef>(rtxn, db, field_id)? {
         fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
         return Ok(());
     } else {
@@ -33,7 +33,7 @@ where
     CB: FnMut(&'t [u8], u64) -> ControlFlow<()>,
 {
     rtxn: &'t heed::RoTxn<'t>,
-    db: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
+    db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
     field_id: u16,
     callback: CB,
 }
@@ -49,7 +49,7 @@ where
         group_size: usize,
     ) -> Result<ControlFlow<()>> {
         let starting_key =
-            FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
+            FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
         let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size);
         for el in iter {
             let (key, value) = el?;
@@ -78,7 +78,7 @@ where
         if level == 0 {
             return self.iterate_level_0(candidates, starting_bound, group_size);
         }
-        let starting_key = FacetKey { field_id: self.field_id, level, left_bound: starting_bound };
+        let starting_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound };
         let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size);
 
         for el in iter {
@@ -116,7 +116,7 @@ mod tests {
     use roaring::RoaringBitmap;
 
     use super::iterate_over_facet_distribution;
-    use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
+    use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec;
     use crate::milli_snap;
     use crate::search::facet::test::FacetIndex;
 
diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs
index 039cd5c8d..a0e6d8e03 100644
--- a/milli/src/search/facet/facet_range_search.rs
+++ b/milli/src/search/facet/facet_range_search.rs
@@ -4,12 +4,12 @@ use heed::BytesEncode;
 use roaring::RoaringBitmap;
 
 use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
-use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice};
+use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef};
 use crate::Result;
 
 pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
     rtxn: &'t heed::RoTxn<'t>,
-    db: heed::Database<FacetKeyCodec<BoundCodec>, FacetGroupValueCodec>,
+    db: heed::Database<FacetGroupKeyCodec<BoundCodec>, FacetGroupValueCodec>,
     field_id: u16,
     left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
     right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
@@ -42,13 +42,13 @@ where
         }
         Bound::Unbounded => Bound::Unbounded,
     };
-    let db = db.remap_key_type::<FacetKeyCodec<MyByteSlice>>();
+    let db = db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>();
     let mut docids = RoaringBitmap::new();
     let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids };
     let highest_level = get_highest_level(rtxn, db, field_id)?;
 
-    if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id)? {
-        let last_bound = get_last_facet_value::<MyByteSlice>(rtxn, db, field_id)?.unwrap();
+    if let Some(first_bound) = get_first_facet_value::<ByteSliceRef>(rtxn, db, field_id)? {
+        let last_bound = get_last_facet_value::<ByteSliceRef>(rtxn, db, field_id)?.unwrap();
         f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?;
         Ok(docids)
     } else {
@@ -59,7 +59,7 @@ where
 /// Fetch the document ids that have a facet with a value between the two given bounds
 struct FacetRangeSearch<'t, 'b, 'bitmap> {
     rtxn: &'t heed::RoTxn<'t>,
-    db: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
+    db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
     field_id: u16,
     left: Bound<&'b [u8]>,
     right: Bound<&'b [u8]>,
@@ -68,7 +68,7 @@ struct FacetRangeSearch<'t, 'b, 'bitmap> {
 impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
     fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> {
         let left_key =
-            FacetKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound };
+            FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound };
         let iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size);
         for el in iter {
             let (key, value) = el?;
@@ -117,7 +117,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
             return self.run_level_0(starting_left_bound, group_size);
         }
 
-        let left_key = FacetKey { field_id: self.field_id, level, left_bound: starting_left_bound };
+        let left_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound };
         let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size);
 
         let (mut previous_key, mut previous_value) = iter.next().unwrap()?;
@@ -258,8 +258,8 @@ mod tests {
     use roaring::RoaringBitmap;
 
     use super::find_docids_of_facet_within_bounds;
-    use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
-    use crate::heed_codec::facet::new::FacetKeyCodec;
+    use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec;
+    use crate::heed_codec::facet::FacetGroupKeyCodec;
    use crate::milli_snap;
     use crate::search::facet::test::FacetIndex;
     use crate::snapshot_tests::display_bitmap;
@@ -310,7 +310,7 @@ mod tests {
             let end = Bound::Included(i);
             let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                 &txn,
-                index.db.content.remap_key_type::<FacetKeyCodec<OrderedF64Codec>>(),
+                index.db.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                 0,
                 &start,
                 &end,
@@ -326,7 +326,7 @@ mod tests {
             let end = Bound::Excluded(i);
             let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                 &txn,
-                index.db.content.remap_key_type::<FacetKeyCodec<OrderedF64Codec>>(),
+                index.db.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                 0,
                 &start,
                 &end,
@@ -352,7 +352,7 @@ mod tests {
             let end = Bound::Included(255.);
             let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                 &txn,
-                index.db.content.remap_key_type::<FacetKeyCodec<OrderedF64Codec>>(),
+                index.db.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                 0,
                 &start,
                 &end,
@@ -371,7 +371,7 @@ mod tests {
             let end = Bound::Excluded(255.);
             let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                 &txn,
-                index.db.content.remap_key_type::<FacetKeyCodec<OrderedF64Codec>>(),
+                index.db.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                 0,
                 &start,
                 &end,
@@ -399,7 +399,7 @@ mod tests {
             let end = Bound::Included(255. - i);
             let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                 &txn,
-                index.db.content.remap_key_type::<FacetKeyCodec<OrderedF64Codec>>(),
+                index.db.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                 0,
                 &start,
                 &end,
@@ -418,7 +418,7 @@ mod tests {
             let end = Bound::Excluded(255.
- i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.db.content.remap_key_type::>(), 0, &start, &end, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index b3cae5d28..b601242e8 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -2,19 +2,19 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); Ok(Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] })) @@ -25,11 +25,11 @@ pub fn ascending_facet_sort<'t>( struct AscendingFacetSort<'t, 'e> { rtxn: &'t heed::RoTxn<'e>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, - std::iter::Take, FacetGroupValueCodec>>, + std::iter::Take, FacetGroupValueCodec>>, )>, } @@ -41,7 +41,7 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { let (documents_ids, deepest_iter) = self.stack.last_mut()?; for result in deepest_iter { let ( - FacetKey { level, left_bound, field_id }, + FacetGroupKey { level, left_bound, field_id }, FacetGroupValue { size: group_size, mut bitmap }, ) = result.unwrap(); // The range is unbounded on the right and the group size for the highest level is MAX, @@ -65,7 +65,7 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { return Some(Ok(bitmap)); } let starting_key_below = - FacetKey { field_id: self.field_id, level: level - 1, left_bound }; + FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound }; let iter = match self.db.range(&self.rtxn, &(starting_key_below..)) { Ok(iter) => iter, Err(e) => return Some(Err(e.into())), @@ -86,7 +86,7 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index d68c9bdad..088f8d2fa 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -4,21 +4,21 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, 
FacetKeyCodec, MyByteSlice, +use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); - let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound }; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(Box::new(DescendingFacetSort { rtxn, @@ -33,11 +33,11 @@ pub fn descending_facet_sort<'t>( struct DescendingFacetSort<'t> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, - std::iter::Take, FacetGroupValueCodec>>, + std::iter::Take, FacetGroupValueCodec>>, Bound<&'t [u8]>, )>, } @@ -50,7 +50,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?; while let Some(result) = deepest_iter.next() { let ( - FacetKey { level, left_bound, field_id }, + FacetGroupKey { level, left_bound, field_id }, FacetGroupValue { size: group_size, mut bitmap }, ) = result.unwrap(); // The range is unbounded on the right and the group size for the highest level is MAX, @@ -72,15 +72,15 @@ impl<'t> Iterator for DescendingFacetSort<'t> { if level == 0 { return Some(Ok(bitmap)); } - let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; + let starting_key_below = FacetGroupKey { field_id, level: level - 1, left_bound }; let end_key_kelow = match *right_bound { - Bound::Included(right) => Bound::Included(FacetKey { + Bound::Included(right) => Bound::Included(FacetGroupKey { field_id, level: level - 1, left_bound: right, }), - Bound::Excluded(right) => Bound::Excluded(FacetKey { + Bound::Excluded(right) => Bound::Excluded(FacetGroupKey { field_id, level: level - 1, left_bound: right, @@ -90,7 +90,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { let prev_right_bound = *right_bound; *right_bound = Bound::Excluded(left_bound); let iter = - match self.db.remap_key_type::>().rev_range( + match self.db.remap_key_type::>().rev_range( &self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow), ) { @@ -114,8 +114,8 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; + use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::test::FacetIndex; @@ -162,7 +162,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let 
candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let db = index.db.content.remap_key_type::>(); + let db = index.db.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 6a10b7097..1b40f6db1 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -9,8 +9,8 @@ use roaring::RoaringBitmap; use super::facet_range_search; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; +use crate::heed_codec::facet::OrderedF64Codec; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; /// The maximum number of filters the filter AST can process. @@ -180,7 +180,11 @@ impl<'a> Filter<'a> { let string_docids = strings_db .get( rtxn, - &FacetKey { field_id, level: 0, left_bound: &val.value().to_lowercase() }, + &FacetGroupKey { + field_id, + level: 0, + left_bound: &val.value().to_lowercase(), + }, )? .map(|v| v.bitmap) .unwrap_or_default(); @@ -218,10 +222,10 @@ impl<'a> Filter<'a> { .remap_data_type::() .get_lower_than_or_equal_to( rtxn, - &FacetKey { field_id, level: u8::MAX, left_bound: f64::MAX }, + &FacetGroupKey { field_id, level: u8::MAX, left_bound: f64::MAX }, )? .and_then( - |(FacetKey { field_id: id, level, .. }, _)| { + |(FacetGroupKey { field_id: id, level, .. }, _)| { if id == field_id { Some(level) } else { @@ -252,7 +256,7 @@ impl<'a> Filter<'a> { /// going deeper through the levels. 
fn explore_facet_number_levels( rtxn: &heed::RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: FieldId, level: u8, left: Bound, diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 78cd8fd4b..ec5caa2a8 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -3,7 +3,7 @@ use heed::{BytesDecode, RoTxn}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; mod facet_distribution; mod facet_distribution_iter; @@ -14,7 +14,7 @@ mod filter; pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -28,7 +28,7 @@ where if let Some(first) = level0_iter_forward.next() { let (first_key, _) = first?; let first_key = - FacetKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; + FacetGroupKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; Ok(Some(first_key.left_bound)) } else { Ok(None) @@ -36,7 +36,7 @@ where } pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -51,7 +51,7 @@ where if let Some(last) = level0_iter_backward.next() { let (last_key, _) = last?; let last_key = - FacetKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; + FacetGroupKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; Ok(Some(last_key.left_bound)) } else { Ok(None) @@ -59,7 +59,7 @@ where } pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); @@ -69,7 +69,7 @@ pub(crate) fn get_highest_level<'t>( .next() .map(|el| { let (key, _) = el.unwrap(); - let key = FacetKeyCodec::::bytes_decode(key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); key.level }) .unwrap_or(0)) @@ -84,8 +84,8 @@ pub mod test { use heed::{BytesDecode, BytesEncode, Env, RwTxn}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, + use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; use crate::snapshot_tests::display_bitmap; use crate::update::FacetsUpdateIncremental; @@ -101,7 +101,7 @@ pub mod test { } pub struct Database { - pub content: heed::Database, FacetGroupValueCodec>, + pub content: heed::Database, FacetGroupValueCodec>, pub group_size: usize, pub max_group_size: usize, _tempdir: Rc, @@ -184,7 +184,7 @@ pub mod test { let mut iter = self.db.content.iter(&txn).unwrap(); while let Some(el) = iter.next() { let (key, value) = el.unwrap(); - let FacetKey { field_id, level, left_bound: bound } = key; + let FacetGroupKey { field_id, level, left_bound: bound } = key; let bound = BoundCodec::bytes_decode(bound).unwrap(); let FacetGroupValue { size, bitmap } = value; writeln!( diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 57fd2e5fe..ab9dddaf2 100644 --- a/milli/src/snapshot_tests.rs 
+++ b/milli/src/snapshot_tests.rs @@ -5,7 +5,7 @@ use std::path::Path; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::new::{FacetGroupValue, FacetKey}; +use crate::heed_codec::facet::{FacetGroupValue, FacetGroupKey}; use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; #[track_caller] @@ -280,7 +280,7 @@ pub fn snap_word_prefix_position_docids(index: &Index) -> String { } pub fn snap_facet_id_f64_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( - FacetKey { field_id, level, left_bound }, + FacetGroupKey { field_id, level, left_bound }, FacetGroupValue { size, bitmap }, )| { &format!("{field_id:<3} {level:<2} {left_bound:<6} {size:<2} {}", display_bitmap(&bitmap)) @@ -289,7 +289,7 @@ pub fn snap_facet_id_f64_docids(index: &Index) -> String { } pub fn snap_facet_id_string_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, facet_id_string_docids, |( - FacetKey { field_id, level, left_bound }, + FacetGroupKey { field_id, level, left_bound }, FacetGroupValue { size, bitmap }, )| { &format!("{field_id:<3} {level:<2} {left_bound:<12} {size:<2} {}", display_bitmap(&bitmap)) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index ffa63f0a7..5b9e99d77 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -11,7 +11,7 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; +use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ @@ -626,10 +626,10 @@ fn remove_docids_from_facet_id_docids<'a>( ) -> Result<()> { let db = match facet_type { FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() + index.facet_id_string_docids.remap_key_type::>() } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; let mut modified = false; diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 0a4b7db45..38017a83d 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -12,8 +12,8 @@ use time::OffsetDateTime; use crate::error::InternalError; use crate::facet::FacetType; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; use crate::update::index_documents::{ create_writer, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, @@ -22,7 +22,7 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; pub struct FacetsUpdateBulk<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, level_group_size: usize, min_level_size: usize, facet_type: FacetType, @@ -40,10 +40,10 @@ impl<'i> FacetsUpdateBulk<'i> { index, database: match facet_type { FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() + index.facet_id_string_docids.remap_key_type::>() } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }, 
level_group_size: 4, @@ -61,10 +61,10 @@ impl<'i> FacetsUpdateBulk<'i> { index, database: match facet_type { FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() + index.facet_id_string_docids.remap_key_type::>() } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }, level_group_size: 4, @@ -89,8 +89,8 @@ impl<'i> FacetsUpdateBulk<'i> { } fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { - let left = FacetKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; - let right = FacetKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; let range = left..=right; self.database.delete_range(wtxn, &range).map(drop)?; Ok(()) @@ -119,7 +119,7 @@ impl<'i> FacetsUpdateBulk<'i> { for level_reader in level_readers { let mut cursor = level_reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? { - let key = FacetKeyCodec::::bytes_decode(k).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(k).unwrap(); let value = FacetGroupValueCodec::bytes_decode(v).unwrap(); println!("inserting {key:?} {value:?}"); @@ -210,7 +210,7 @@ impl<'i> FacetsUpdateBulk<'i> { struct ComputeHigherLevels<'t> { rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, + db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, level_group_size: usize, min_level_size: usize, @@ -233,7 +233,7 @@ impl<'t> ComputeHigherLevels<'t> { .db .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, level_0_prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); + .remap_types::, FacetGroupValueCodec>(); let mut left_bound: &[u8] = &[]; let mut first_iteration_for_new_group = true; @@ -311,9 +311,9 @@ impl<'t> ComputeHigherLevels<'t> { for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; let key = - FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + FacetGroupKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; @@ -329,9 +329,9 @@ impl<'t> ComputeHigherLevels<'t> { for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; let key = - FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + FacetGroupKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; cur_writer.insert(key, value)?; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index e32a6baf1..e86aa4402 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -2,8 +2,8 @@ use heed::types::ByteSlice; use heed::{BytesDecode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use crate::heed_codec::facet::new::{ - FacetGroupValue, FacetGroupValueCodec, FacetKey, 
FacetKeyCodec, MyByteSlice, +use crate::heed_codec::facet::{ + FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, }; use crate::search::facet::get_highest_level; use crate::Result; @@ -19,13 +19,13 @@ enum DeletionResult { } pub struct FacetsUpdateIncremental { - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, group_size: usize, min_level_size: usize, max_group_size: usize, } impl FacetsUpdateIncremental { - pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { + pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } } } @@ -36,7 +36,7 @@ impl FacetsUpdateIncremental { level: u8, search_key: &[u8], txn: &RoTxn, - ) -> Result<(FacetKey>, FacetGroupValue)> { + ) -> Result<(FacetGroupKey>, FacetGroupValue)> { let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); @@ -45,17 +45,17 @@ impl FacetsUpdateIncremental { let mut prefix_iter = self .db .as_polymorph() - .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>(txn, &prefix.as_slice())?; + .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(txn, &prefix.as_slice())?; if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; Ok(( - FacetKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? .into_owned(), value, )) } else { - let key = FacetKey { field_id, level, left_bound: search_key }; + let key = FacetGroupKey { field_id, level, left_bound: search_key }; match self.db.get_lower_than(txn, &key)? { Some((key, value)) => { if key.level != level || key.field_id != field_id { @@ -66,13 +66,13 @@ impl FacetsUpdateIncremental { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>( + .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>( txn, &prefix.as_slice(), )?; let (key_bytes, value) = iter.next().unwrap()?; Ok(( - FacetKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? 
.into_owned(), value, @@ -93,7 +93,7 @@ impl FacetsUpdateIncremental { new_key: &[u8], new_values: &RoaringBitmap, ) -> Result { - let key = FacetKey { field_id, level: 0, left_bound: new_key }; + let key = FacetGroupKey { field_id, level: 0, left_bound: new_key }; let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 }; let mut level0_prefix = vec![]; @@ -193,7 +193,7 @@ impl FacetsUpdateIncremental { .db .get_greater_than_or_equal_to( &txn, - &FacetKey { + &FacetGroupKey { field_id, level: level_below, left_bound: insertion_key.left_bound.as_slice(), @@ -217,7 +217,7 @@ impl FacetsUpdateIncremental { } let key = - FacetKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; (key, value) }; @@ -235,7 +235,7 @@ impl FacetsUpdateIncremental { } let key = - FacetKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() }; + FacetGroupKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() }; let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; (key, value) }; @@ -303,7 +303,7 @@ impl FacetsUpdateIncremental { let mut values = RoaringBitmap::new(); for _ in 0..group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetKeyCodec::::bytes_decode(&key_bytes) + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)?; if first_key.is_none() { @@ -311,7 +311,7 @@ impl FacetsUpdateIncremental { } values |= value_i.bitmap; } - let key = FacetKey { + let key = FacetGroupKey { field_id, level: highest_level + 1, left_bound: first_key.unwrap().left_bound, @@ -384,7 +384,7 @@ impl FacetsUpdateIncremental { key: &[u8], value: u32, ) -> Result { - let key = FacetKey { field_id, level: 0, left_bound: key }; + let key = FacetGroupKey { field_id, level: 0, left_bound: key }; let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; bitmap.remove(value); @@ -415,7 +415,7 @@ impl FacetsUpdateIncremental { key: &[u8], value: u32, ) -> Result<()> { - if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() { + if self.db.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: key })?.is_none() { return Ok(()); } let highest_level = get_highest_level(&txn, self.db, field_id)?; @@ -450,7 +450,7 @@ impl FacetsUpdateIncremental { while let Some(el) = iter.next() { let (k, _) = el?; to_delete.push( - FacetKeyCodec::::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(), + FacetGroupKeyCodec::::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(), ); } drop(iter); @@ -469,9 +469,9 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::new::str_ref::StrRefCodec; - use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; + use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::str_ref::StrRefCodec; + use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; use crate::milli_snap; use crate::search::facet::get_highest_level; use crate::search::facet::test::FacetIndex; @@ -502,7 +502,7 @@ mod tests { .unwrap(); while let Some(el) = iter.next() { let (key, value) = el.unwrap(); - let key = FacetKeyCodec::::bytes_decode(&key).unwrap(); + let key = 
FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); let mut prefix_start_below = vec![]; prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); @@ -519,7 +519,7 @@ mod tests { ) .unwrap(); let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); - FacetKeyCodec::::bytes_decode(&key_bytes).unwrap() + FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() }; assert!(value.size > 0 && (value.size as usize) < db.max_group_size); @@ -996,7 +996,7 @@ mod tests { // for ((key, values), group) in values_field_id.iter().zip(level0iter) { // let (group_key, group_values) = group.unwrap(); -// let group_key = FacetKeyCodec::::bytes_decode(group_key).unwrap(); +// let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); // assert_eq!(key, &group_key.left_bound); // assert_eq!(values, &group_values.bitmap); // } @@ -1014,7 +1014,7 @@ mod tests { // for ((key, values), group) in values_field_id.iter().zip(level0iter) { // let (group_key, group_values) = group.unwrap(); -// let group_key = FacetKeyCodec::::bytes_decode(group_key).unwrap(); +// let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); // assert_eq!(key, &group_key.left_bound); // assert_eq!(values, &group_values.bitmap); // } diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 00964a406..77b42f355 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,23 +1,20 @@ -use std::{collections::HashMap, fs::File}; - +use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; +use crate::{ + facet::FacetType, + heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}, + CboRoaringBitmapCodec, FieldId, Index, Result, +}; use grenad::CompressionType; use heed::BytesDecode; use roaring::RoaringBitmap; - -use crate::{ - facet::FacetType, - heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}, - CboRoaringBitmapCodec, FieldId, Index, Result, -}; - -use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; +use std::{collections::HashMap, fs::File}; pub mod bulk; pub mod incremental; pub struct FacetsUpdate<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, level_group_size: u8, max_level_group_size: u8, min_level_size: u8, @@ -28,10 +25,10 @@ impl<'i> FacetsUpdate<'i> { pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { let database = match facet_type { FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() + index.facet_id_string_docids.remap_key_type::>() } FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; Self { @@ -70,8 +67,8 @@ impl<'i> FacetsUpdate<'i> { let mut cursor = self.new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ - let key = - FacetKeyCodec::::bytes_decode(key).ok_or(heed::Error::Encoding)?; + let key = FacetGroupKeyCodec::::bytes_decode(key) + .ok_or(heed::Error::Encoding)?; let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index eece08ee3..9a89691b1 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,9 +6,9 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; use crate::heed_codec::facet::FieldDocIdFacetF64Codec; +use crate::heed_codec::facet::OrderedF64Codec; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. @@ -36,8 +36,8 @@ pub fn extract_facet_number_docids( let (field_id, document_id, number) = FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - let key = FacetKey { field_id, level: 0, left_bound: number }; - let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); + let key = FacetGroupKey { field_id, level: 0, left_bound: number }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 591f44c74..078a82335 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,8 +4,8 @@ use std::io; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; -use crate::heed_codec::facet::new::str_ref::StrRefCodec; -use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec}; +use crate::heed_codec::facet::StrRefCodec; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; @@ -43,8 +43,8 @@ pub fn extract_facet_string_docids( let document_id = u32::from_be_bytes(document_id_bytes); let normalised_value = std::str::from_utf8(normalized_value_bytes)?; - let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; - let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); + let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; } diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..e50e50347 --- /dev/null +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap @@ -0,0 
+1,4 @@
+---
+source: milli/src/update/word_prefix_pair_proximity_docids.rs
+---
+6873ff1f78d08f2b1a13bb9e37349c01

From 330c9eb1b28ad84cb7f710f58682b254fae1d06d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Mon, 5 Sep 2022 13:49:52 +0200
Subject: [PATCH 1717/1889] Rename facet codecs and refine FacetsUpdate API

---
 .../search/facet/facet_distribution_iter.rs   |  19 +--
 milli/src/search/facet/facet_range_search.rs  |   9 +-
 .../src/search/facet/facet_sort_ascending.rs  |   4 +-
 .../src/search/facet/facet_sort_descending.rs |  27 ++--
 milli/src/update/facet/bulk.rs                | 134 ++++++++----
 milli/src/update/facet/incremental.rs         |  50 ++++---
 milli/src/update/facet/mod.rs                 |  24 +---
 .../incremental.rs/delete_from_end/15.snap    |   4 -
 8 files changed, 133 insertions(+), 138 deletions(-)

diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs
index 151304029..6eec64b25 100644
--- a/milli/src/search/facet/facet_distribution_iter.rs
+++ b/milli/src/search/facet/facet_distribution_iter.rs
@@ -4,7 +4,9 @@ use heed::Result;
 use roaring::RoaringBitmap;
 
 use super::{get_first_facet_value, get_highest_level};
-use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupValueCodec, FacetGroupKeyCodec};
+use crate::heed_codec::facet::{
+    ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec,
+};
 
 pub fn iterate_over_facet_distribution<'t, CB>(
     rtxn: &'t heed::RoTxn<'t>,
@@ -78,7 +80,8 @@ where
         if level == 0 {
             return self.iterate_level_0(candidates, starting_bound, group_size);
         }
-        let starting_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound };
+        let starting_key =
+            FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound };
         let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size);
 
         for el in iter {
@@ -109,16 +112,14 @@ where
 
 #[cfg(test)]
 mod tests {
-    use std::ops::ControlFlow;
-
+    use super::iterate_over_facet_distribution;
+    use crate::heed_codec::facet::OrderedF64Codec;
+    use crate::milli_snap;
+    use crate::search::facet::test::FacetIndex;
     use heed::BytesDecode;
     use rand::{Rng, SeedableRng};
     use roaring::RoaringBitmap;
-
-    use super::iterate_over_facet_distribution;
-    use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec;
-    use crate::milli_snap;
-    use crate::search::facet::test::FacetIndex;
+    use std::ops::ControlFlow;
 
     fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
         let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs
index a0e6d8e03..d9a6c5fd4 100644
--- a/milli/src/search/facet/facet_range_search.rs
+++ b/milli/src/search/facet/facet_range_search.rs
@@ -4,7 +4,9 @@ use heed::BytesEncode;
 use roaring::RoaringBitmap;
 
 use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
-use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef};
+use crate::heed_codec::facet::{
+    ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec,
+};
 use crate::Result;
 
 pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
@@ -117,7 +119,8 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
             return self.run_level_0(starting_left_bound, group_size);
         }
 
-        let left_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound };
+        let left_key =
+            FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound };
         let mut iter =
self.db.range(&self.rtxn, &(left_key..))?.take(group_size); let (mut previous_key, mut previous_value) = iter.next().unwrap()?; @@ -258,8 +261,8 @@ mod tests { use roaring::RoaringBitmap; use super::find_docids_of_facet_within_bounds; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::FacetGroupKeyCodec; + use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index b601242e8..e620f9f1d 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -3,7 +3,7 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; pub fn ascending_facet_sort<'t>( @@ -86,7 +86,7 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; + use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 088f8d2fa..5425a5051 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; pub fn descending_facet_sort<'t>( @@ -37,7 +37,9 @@ struct DescendingFacetSort<'t> { field_id: u16, stack: Vec<( RoaringBitmap, - std::iter::Take, FacetGroupValueCodec>>, + std::iter::Take< + heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + >, Bound<&'t [u8]>, )>, } @@ -72,7 +74,8 @@ impl<'t> Iterator for DescendingFacetSort<'t> { if level == 0 { return Some(Ok(bitmap)); } - let starting_key_below = FacetGroupKey { field_id, level: level - 1, left_bound }; + let starting_key_below = + FacetGroupKey { field_id, level: level - 1, left_bound }; let end_key_kelow = match *right_bound { Bound::Included(right) => Bound::Included(FacetGroupKey { @@ -89,15 +92,17 @@ impl<'t> Iterator for DescendingFacetSort<'t> { }; let prev_right_bound = *right_bound; *right_bound = Bound::Excluded(left_bound); - let iter = - match self.db.remap_key_type::>().rev_range( + let iter = match self + .db + .remap_key_type::>() + .rev_range( &self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow), ) { - Ok(iter) => iter, - Err(e) => return Some(Err(e.into())), - } - .take(group_size as usize); + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); self.stack.push((bitmap, iter, prev_right_bound)); continue 'outer; @@ -114,8 +119,8 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; + use 
crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 38017a83d..70392b7db 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,30 +1,24 @@ +use crate::facet::FacetType; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; use std::borrow::Cow; use std::cmp; use std::fs::File; -use std::num::NonZeroUsize; - -use grenad::CompressionType; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::error::InternalError; -use crate::facet::FacetType; -use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, -}; -use crate::update::index_documents::{ - create_writer, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, -}; -use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; - pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, - level_group_size: usize, - min_level_size: usize, + level_group_size: u8, + min_level_size: u8, facet_type: FacetType, // None if level 0 does not need to be updated new_data: Option>, @@ -39,9 +33,9 @@ impl<'i> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, database: match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { index.facet_id_f64_docids.remap_key_type::>() } @@ -60,9 +54,9 @@ impl<'i> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, database: match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { index.facet_id_f64_docids.remap_key_type::>() } @@ -77,14 +71,14 @@ impl<'i> FacetsUpdateBulk<'i> { /// The number of elements from the level below that are represented by a single element in the level above /// /// This setting is always greater than or equal to 2. - pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = cmp::max(value.get(), 2); + pub fn level_group_size(mut self, value: u8) -> Self { + self.level_group_size = cmp::max(value, 2); self } /// The minimum number of elements that a level is allowed to have. 
- pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.min_level_size = value.get(); + pub fn min_level_size(mut self, value: u8) -> Self { + self.min_level_size = cmp::max(value, 1); self } @@ -109,8 +103,6 @@ impl<'i> FacetsUpdateBulk<'i> { } self.update_level0(wtxn)?; - // let mut nested_wtxn = self.index.env.nested_write_txn(wtxn)?; - for &field_id in faceted_fields.iter() { let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; @@ -119,10 +111,6 @@ impl<'i> FacetsUpdateBulk<'i> { for level_reader in level_readers { let mut cursor = level_reader.into_cursor()?; while let Some((k, v)) = cursor.move_on_next()? { - let key = FacetGroupKeyCodec::::bytes_decode(k).unwrap(); - let value = FacetGroupValueCodec::bytes_decode(v).unwrap(); - println!("inserting {key:?} {value:?}"); - self.database.remap_types::().put(wtxn, k, v)?; } } @@ -141,14 +129,12 @@ impl<'i> FacetsUpdateBulk<'i> { let mut database = self.database.iter_mut(wtxn)?.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - if valid_lmdb_key(key) { - buffer.clear(); - // the group size for level 0 - buffer.push(1); - // then we extend the buffer with the docids bitmap - buffer.extend_from_slice(value); - unsafe { database.append(key, &buffer)? }; - } + buffer.clear(); + // the group size for level 0 + buffer.push(1); + // then we extend the buffer with the docids bitmap + buffer.extend_from_slice(value); + unsafe { database.append(key, &buffer)? }; } } else { let mut buffer = Vec::new(); @@ -156,25 +142,24 @@ impl<'i> FacetsUpdateBulk<'i> { let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { - if valid_lmdb_key(key) { - buffer.clear(); - // the group size for level 0 - buffer.push(1); - // then we extend the buffer with the docids bitmap - match database.get(wtxn, key)? { - Some(prev_value) => { - let old_bitmap = &prev_value[1..]; - CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], - &mut buffer, - )?; - } - None => { - buffer.extend_from_slice(value); - } - }; - database.put(wtxn, key, &buffer)?; - } + // the value is a CboRoaringBitmap, but I still need to prepend the + // group size for level 0 (= 1) to it + buffer.clear(); + buffer.push(1); + // then we extend the buffer with the docids bitmap + match database.get(wtxn, key)? { + Some(prev_value) => { + let old_bitmap = &prev_value[1..]; + CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], + &mut buffer, + )?; + } + None => { + buffer.extend_from_slice(value); + } + }; + database.put(wtxn, key, &buffer)?; } } @@ -186,7 +171,7 @@ impl<'i> FacetsUpdateBulk<'i> { field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { - // TODO: first check whether there is anything in level 0 + // TODO: first check whether there is anything in level 0? 
let algo = ComputeHigherLevels { rtxn: txn, db: &self.database, @@ -212,8 +197,8 @@ struct ComputeHigherLevels<'t> { rtxn: &'t heed::RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, - level_group_size: usize, - min_level_size: usize, + level_group_size: u8, + min_level_size: u8, } impl<'t> ComputeHigherLevels<'t> { fn read_level_0( @@ -248,7 +233,7 @@ impl<'t> ComputeHigherLevels<'t> { } bitmaps.push(docids); - if bitmaps.len() == self.level_group_size { + if bitmaps.len() == self.level_group_size as usize { handle_group(&bitmaps, left_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); @@ -265,9 +250,8 @@ impl<'t> ComputeHigherLevels<'t> { /// Compute the content of the database levels from its level 0 for the given field id. /// /// ## Returns: - /// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` + /// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` /// that must be inserted into the database. - /// 2. a roaring bitmap of all the document ids present in the database fn compute_higher_levels( &self, level: u8, @@ -302,7 +286,7 @@ impl<'t> ComputeHigherLevels<'t> { left_bounds.push(left_bound); bitmaps.push(combined_bitmap); - if bitmaps.len() != self.level_group_size { + if bitmaps.len() != self.level_group_size as usize { return Ok(()); } let left_bound = left_bounds.first().unwrap(); @@ -312,8 +296,8 @@ impl<'t> ComputeHigherLevels<'t> { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; - let key = - FacetGroupKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; @@ -330,8 +314,8 @@ impl<'t> ComputeHigherLevels<'t> { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; - let key = - FacetGroupKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; cur_writer.insert(key, value)?; @@ -340,6 +324,10 @@ impl<'t> ComputeHigherLevels<'t> { } if cur_writer_len > self.min_level_size { sub_writers.push(writer_into_reader(cur_writer)?); + } else { + if !bitmaps.is_empty() { + handle_group(&bitmaps, left_bounds.first().unwrap())?; + } } return Ok(sub_writers); } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index e86aa4402..bcde3bc53 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -3,7 +3,7 @@ use heed::{BytesDecode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; use crate::Result; @@ -20,14 +20,26 @@ enum DeletionResult { pub struct FacetsUpdateIncremental { db: heed::Database, FacetGroupValueCodec>, - group_size: usize, - min_level_size: usize, - max_group_size: usize, + group_size: u8, + 
min_level_size: u8, + max_group_size: u8, } impl FacetsUpdateIncremental { pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } } + pub fn group_size(mut self, size: u8) -> Self { + self.group_size = size; + self + } + pub fn min_level_size(mut self, size: u8) -> Self { + self.min_level_size = size; + self + } + pub fn max_group_size(mut self, size: u8) -> Self { + self.max_group_size = size; + self + } } impl FacetsUpdateIncremental { fn find_insertion_key_value( @@ -178,12 +190,7 @@ impl FacetsUpdateIncremental { let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); updated_value.size += 1; - if updated_value.size as usize == max_group_size { - // need to split it - // recompute left element and right element - // replace current group by left element - // add one more group to the right - + if updated_value.size == max_group_size { let size_left = max_group_size / 2; let size_right = max_group_size - size_left; @@ -201,7 +208,7 @@ impl FacetsUpdateIncremental { )? .unwrap(); - let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size); + let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); let group_left = { let mut values_left = RoaringBitmap::new(); @@ -234,8 +241,11 @@ impl FacetsUpdateIncremental { values_right |= &value.bitmap; } - let key = - FacetGroupKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() }; + let key = FacetGroupKey { + field_id, + level, + left_bound: right_start_key.unwrap().to_vec(), + }; let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; (key, value) }; @@ -288,7 +298,7 @@ impl FacetsUpdateIncremental { .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? .count(); - if size_highest_level < self.group_size * self.min_level_size { + if size_highest_level < self.group_size as usize * self.min_level_size as usize { return Ok(()); } @@ -438,7 +448,7 @@ impl FacetsUpdateIncremental { .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? .count() - >= self.group_size + >= self.min_level_size as usize { return Ok(()); } @@ -450,7 +460,9 @@ impl FacetsUpdateIncremental { while let Some(el) = iter.next() { let (k, _) = el?; to_delete.push( - FacetGroupKeyCodec::::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(), + FacetGroupKeyCodec::::bytes_decode(k) + .ok_or(Error::Encoding)? 
+ .into_owned(), ); } drop(iter); @@ -469,9 +481,9 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::ordered_f64_codec::OrderedF64Codec; - use crate::heed_codec::facet::str_ref::StrRefCodec; - use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::facet::StrRefCodec; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::milli_snap; use crate::search::facet::get_highest_level; use crate::search::facet::test::FacetIndex; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 77b42f355..04810cb48 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -4,7 +4,6 @@ use crate::{ heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}, CboRoaringBitmapCodec, FieldId, Index, Result, }; -use grenad::CompressionType; use heed::BytesDecode; use roaring::RoaringBitmap; use std::{collections::HashMap, fs::File}; @@ -42,26 +41,17 @@ impl<'i> FacetsUpdate<'i> { } } - // /// The number of elements from the level below that are represented by a single element in the level above - // /// - // /// This setting is always greater than or equal to 2. - // pub fn level_group_size(&mut self, value: u8) -> &mut Self { - // self.level_group_size = std::cmp::max(value, 2); - // self - // } - - // /// The minimum number of elements that a level is allowed to have. - // pub fn min_level_size(&mut self, value: u8) -> &mut Self { - // self.min_level_size = std::cmp::max(value, 1); - // self - // } - pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + // here, come up with a better condition! if self.database.is_empty(wtxn)? 
{ - let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data); + let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data) + .level_group_size(self.level_group_size) + .min_level_size(self.min_level_size); bulk_update.execute(wtxn)?; } else { - let indexer = FacetsUpdateIncremental::new(self.database); + let indexer = FacetsUpdateIncremental::new(self.database) + .max_group_size(self.max_level_group_size) + .min_level_size(self.min_level_size); let mut new_faceted_docids = HashMap::::default(); diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap index 08534cbd4..e037c0295 100644 --- a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap @@ -16,8 +16,4 @@ source: milli/src/update/facet/incremental.rs 0 0 k12 1 "[12, ]" 0 0 k13 1 "[13, ]" 0 0 k14 1 "[14, ]" -0 1 k0 4 "[0, 1, 2, 3, ]" -0 1 k4 4 "[4, 5, 6, 7, ]" -0 1 k8 4 "[8, 9, 10, 11, ]" -0 1 k12 3 "[12, 13, 14, ]" From 9026867d17744a0a95ea4086d0efff48ddd323af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Sep 2022 17:31:26 +0200 Subject: [PATCH 1718/1889] Give same interface to bulk and incremental facet indexing types + cargo fmt, oops, sorry for the bad history :( --- milli/src/heed_codec/facet/mod.rs | 14 ++- milli/src/index.rs | 8 +- milli/src/search/criteria/asc_desc.rs | 2 +- milli/src/search/distinct/facet_distinct.rs | 3 +- milli/src/search/facet/facet_distribution.rs | 8 +- .../search/facet/facet_distribution_iter.rs | 14 ++- milli/src/search/facet/facet_range_search.rs | 7 +- .../src/search/facet/facet_sort_ascending.rs | 4 +- .../src/search/facet/facet_sort_descending.rs | 7 +- milli/src/search/facet/filter.rs | 5 +- milli/src/search/facet/mod.rs | 59 ++++++--- milli/src/snapshot_tests.rs | 2 +- milli/src/update/clear_documents.rs | 3 +- milli/src/update/delete_documents.rs | 2 +- milli/src/update/facet/bulk.rs | 118 ++++++++++++++++-- milli/src/update/facet/incremental.rs | 117 ++++++++++++----- milli/src/update/facet/mod.rs | 68 +++++----- .../default/facet_id_f64_docids.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 + .../default/facet_id_string_docids.hash.snap | 4 + .../facet_id_string_docids.hash.snap | 4 + .../extract/extract_facet_number_docids.rs | 6 +- .../extract/extract_facet_string_docids.rs | 3 +- .../index_documents/helpers/grenad_helpers.rs | 32 +---- .../src/update/index_documents/helpers/mod.rs | 4 +- milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/mod.rs | 2 +- 27 files changed, 333 insertions(+), 174 deletions(-) create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 299aeceb4..40e395881 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -3,17 +3,19 @@ mod field_doc_id_facet_string_codec; mod 
ordered_f64_codec; mod str_ref; +use std::borrow::Cow; +use std::convert::TryFrom; +use std::marker::PhantomData; + +use heed::types::OwnedType; +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; + pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; pub use self::ordered_f64_codec::OrderedF64Codec; pub use self::str_ref::StrRefCodec; use crate::{CboRoaringBitmapCodec, BEU16}; -use heed::types::OwnedType; -use heed::{BytesDecode, BytesEncode}; -use roaring::RoaringBitmap; -use std::borrow::Cow; -use std::convert::TryFrom; -use std::marker::PhantomData; pub type FieldIdCodec = OwnedType; diff --git a/milli/src/index.rs b/milli/src/index.rs index 66a53d98c..893817d59 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -14,10 +14,10 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec}; +use crate::heed_codec::facet::{ + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FieldIdCodec, OrderedF64Codec, StrRefCodec, +}; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 2908f0e78..bb2788cc8 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; use crate::search::facet::facet_sort_descending::descending_facet_sort; diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index b9d584eb6..1725346be 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,8 +6,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::error::InternalError; -use crate::heed_codec::facet::FacetGroupKey; -use crate::heed_codec::facet::*; +use crate::heed_codec::facet::{FacetGroupKey, *}; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 10b995d97..7c554d368 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -8,10 +8,10 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; -use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::heed_codec::facet::{ + ByteSliceRef, 
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, OrderedF64Codec, StrRefCodec, +}; use crate::search::facet::facet_distribution_iter; use crate::{FieldId, Index, Result}; diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 6eec64b25..2eebffbcd 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -112,17 +112,19 @@ where #[cfg(test)] mod tests { + use std::ops::ControlFlow; + + use heed::BytesDecode; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + use super::iterate_over_facet_distribution; use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::test::FacetIndex; - use heed::BytesDecode; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - use std::ops::ControlFlow; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -133,7 +135,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index d9a6c5fd4..bb555e1ab 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -261,14 +261,13 @@ mod tests { use roaring::RoaringBitmap; use super::find_docids_of_facet_within_bounds; - use crate::heed_codec::facet::FacetGroupKeyCodec; - use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -279,7 +278,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index e620f9f1d..fc5fd3d04 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -93,7 +93,7 @@ mod tests { use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -104,7 +104,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 5425a5051..42bae42a6 100644 --- 
a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -119,15 +119,14 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::OrderedF64Codec; - use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); @@ -138,7 +137,7 @@ mod tests { index } fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 1b40f6db1..15edafb03 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -9,8 +9,9 @@ use roaring::RoaringBitmap; use super::facet_range_search; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, +}; use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; /// The maximum number of filters the filter AST can process. diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ec5caa2a8..ef72658ec 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -3,7 +3,7 @@ use heed::{BytesDecode, RoTxn}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; mod facet_distribution; mod facet_distribution_iter; @@ -27,8 +27,8 @@ where db.as_polymorph().prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(first) = level0_iter_forward.next() { let (first_key, _) = first?; - let first_key = - FacetGroupKeyCodec::::bytes_decode(first_key).ok_or(heed::Error::Encoding)?; + let first_key = FacetGroupKeyCodec::::bytes_decode(first_key) + .ok_or(heed::Error::Encoding)?; Ok(Some(first_key.left_bound)) } else { Ok(None) @@ -50,8 +50,8 @@ where .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?; if let Some(last) = level0_iter_backward.next() { let (last_key, _) = last?; - let last_key = - FacetGroupKeyCodec::::bytes_decode(last_key).ok_or(heed::Error::Encoding)?; + let last_key = FacetGroupKeyCodec::::bytes_decode(last_key) + .ok_or(heed::Error::Encoding)?; Ok(Some(last_key.left_bound)) } else { Ok(None) @@ -85,11 +85,12 @@ pub mod test { use roaring::RoaringBitmap; use crate::heed_codec::facet::{ - FacetGroupValue, FacetGroupValueCodec, FacetGroupKey, FacetGroupKeyCodec, ByteSliceRef, + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::snapshot_tests::display_bitmap; - use 
crate::update::FacetsUpdateIncremental; + use crate::update::FacetsUpdateIncrementalInner; + // A dummy index that only contains the facet database, used for testing pub struct FacetIndex<BoundCodec> where for<'a> BoundCodec: @@ -100,10 +101,12 @@ pub mod test { _phantom: PhantomData<BoundCodec>, } + // The facet database and its settings pub struct Database { pub content: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>, - pub group_size: usize, - pub max_group_size: usize, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, _tempdir: Rc<tempfile::TempDir>, } @@ -117,9 +120,12 @@ pub mod test { tempdir: Rc<tempfile::TempDir>, group_size: u8, max_group_size: u8, + min_level_size: u8, ) -> FacetIndex<BoundCodec> { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; - let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 10 * 100); unsafe { @@ -129,14 +135,25 @@ pub mod test { let content = env.open_database(None).unwrap().unwrap(); FacetIndex { - db: Database { content, group_size, max_group_size, _tempdir: tempdir }, + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: tempdir, + }, env, _phantom: PhantomData, } } - pub fn new(group_size: u8, max_group_size: u8) -> FacetIndex<BoundCodec> { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)) as usize; - let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); + pub fn new( + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex<BoundCodec> { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 100); let tempdir = tempfile::TempDir::new().unwrap(); @@ -144,7 +161,13 @@ let content = env.create_database(None).unwrap(); FacetIndex { - db: Database { content, group_size, max_group_size, _tempdir: Rc::new(tempdir) }, + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: Rc::new(tempdir), + }, env, _phantom: PhantomData, } @@ -156,7 +179,7 @@ key: &'a <BoundCodec as BytesEncode<'a>>::EItem, docids: &RoaringBitmap, ) { - let update = FacetsUpdateIncremental::new(self.db.content); + let update = FacetsUpdateIncrementalInner::new(self.db.content); let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); } @@ -167,7 +190,7 @@ key: &'a <BoundCodec as BytesEncode<'a>>::EItem, value: u32, ) { - let update = FacetsUpdateIncremental::new(self.db.content); + let update = FacetsUpdateIncrementalInner::new(self.db.content); let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.delete(rwtxn, field_id, &key_bytes, value).unwrap(); } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index ab9dddaf2..9bc39d882 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -5,7 +5,7 @@ use std::path::Path; use roaring::RoaringBitmap; use crate::facet::FacetType; -use
crate::heed_codec::facet::{FacetGroupValue, FacetGroupKey}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; #[track_caller] diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 7d89ca89a..adeea11fa 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,7 +1,8 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{facet::FacetType, ExternalDocumentsIds, FieldDistribution, Index, Result}; +use crate::facet::FacetType; +use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 5b9e99d77..14ef5fd6a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -11,7 +11,7 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::{FacetGroupValueCodec, FacetGroupKeyCodec, ByteSliceRef}; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 70392b7db..ad97ed2de 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,18 +1,20 @@ +use std::borrow::Cow; +use std::cmp; +use std::fs::File; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; -use std::borrow::Cow; -use std::cmp; -use std::fs::File; -use time::OffsetDateTime; pub struct FacetsUpdateBulk<'i> { index: &'i Index, @@ -367,9 +369,7 @@ mod tests { documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); } let documents = documents_batch_reader_from_objects(documents); - dbg!(); index.add_documents(documents).unwrap(); - dbg!(); db_snap!(index, facet_id_f64_docids, name); }; @@ -421,4 +421,100 @@ mod tests { test("default", None, None); test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); } + + #[test] + fn test_facets_number_incremental_update() { + let test = + |name: &str, group_size: Option, min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + + index + .update_settings(|settings| { + settings.set_filterable_fields( + IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) + .collect(), + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push(serde_json::json!({ "facet": i 
}).as_object().unwrap().clone()); + } + for i in 0..100 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + let mut documents = vec![]; + for i in 1000..1010 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 100..110 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + db_snap!(index, facet_id_f64_docids, name); + }; + + test("default", None, None); + test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + } + + #[test] + fn test_facets_number_delete_facet_id_then_bulk_update() { + let test = + |name: &str, group_size: Option, min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + + index + .update_settings(|settings| { + settings.set_filterable_fields( + IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) + .collect(), + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 0..100 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + // 1100 facets -> how long is the DB? 
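// An editor's sketch (not part of this patch) answering the question above
// from a read transaction: the facet number database holds the 1100 level-0
// entries plus the grouped entries of every level built on top of them.
// `index` is the TempIndex from this test; `read_txn` and `len` are the
// heed helpers already used elsewhere in this series.
let rtxn = index.read_txn().unwrap();
let db_len = index.facet_id_f64_docids.len(&rtxn).unwrap();
println!("facet_id_f64_docids now holds {db_len} entries");
drop(rtxn);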
+ + let mut documents = vec![]; + for i in 1000..1010 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 100..110 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents_batch = documents_batch_reader_from_objects(documents.clone()); + + index.add_documents(documents_batch).unwrap(); + + db_snap!(index, facet_id_f64_docids, name); + }; + + test("default", None, None); + test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index bcde3bc53..75ca5d55b 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,12 +1,16 @@ +use std::collections::HashMap; +use std::fs::File; + use heed::types::ByteSlice; use heed::{BytesDecode, Error, RoTxn, RwTxn}; use roaring::RoaringBitmap; +use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; -use crate::Result; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; enum InsertionResult { InPlace, @@ -18,30 +22,79 @@ enum DeletionResult { Remove { prev: Option>, next: Option> }, } -pub struct FacetsUpdateIncremental { +pub struct FacetsUpdateIncremental<'i> { + index: &'i Index, + inner: FacetsUpdateIncrementalInner, + facet_type: FacetType, + new_data: grenad::Reader, +} + +impl<'i> FacetsUpdateIncremental<'i> { + pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { + FacetsUpdateIncremental { + index, + inner: FacetsUpdateIncrementalInner { + db: match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => index + .facet_id_f64_docids + .remap_key_type::>(), + }, + group_size: 4, + max_group_size: 8, + min_level_size: 5, + }, + facet_type, + new_data, + } + } + pub fn group_size(mut self, size: u8) -> Self { + self.inner.group_size = size; + self + } + pub fn min_level_size(mut self, size: u8) -> Self { + self.inner.min_level_size = size; + self + } + pub fn max_group_size(mut self, size: u8) -> Self { + self.inner.max_group_size = size; + self + } + pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = self.new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let key = FacetGroupKeyCodec::::bytes_decode(key) + .ok_or(heed::Error::Encoding)?; + let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + + for (field_id, new_docids) in new_faceted_docids { + let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; + docids |= new_docids; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; + } + Ok(()) + } +} + +pub struct FacetsUpdateIncrementalInner { db: heed::Database, FacetGroupValueCodec>, group_size: u8, min_level_size: u8, max_group_size: u8, } -impl FacetsUpdateIncremental { +impl FacetsUpdateIncrementalInner { pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } } - pub fn group_size(mut self, size: u8) -> Self { - self.group_size = size; - self - } - pub fn min_level_size(mut self, size: u8) -> Self { - self.min_level_size = size; - self - } - pub fn max_group_size(mut self, size: u8) -> Self { - self.max_group_size = size; - self - } } -impl FacetsUpdateIncremental { +impl FacetsUpdateIncrementalInner { fn find_insertion_key_value( &self, field_id: u16, @@ -481,9 +534,9 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::OrderedF64Codec; - use crate::heed_codec::facet::StrRefCodec; - use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; + use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, StrRefCodec, + }; use crate::milli_snap; use crate::search::facet::get_highest_level; use crate::search::facet::test::FacetIndex; @@ -534,7 +587,7 @@ mod tests { FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() }; - assert!(value.size > 0 && (value.size as usize) < db.max_group_size); + assert!(value.size > 0 && value.size < db.max_group_size); let mut actual_size = 0; let mut values_below = RoaringBitmap::new(); @@ -553,7 +606,7 @@ mod tests { } #[test] fn append() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -566,7 +619,7 @@ mod tests { } #[test] fn many_field_ids_append() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in 0..256u16 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -595,7 +648,7 @@ mod tests { } #[test] fn many_field_ids_prepend() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); for i in (0..256).into_iter().rev() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i as u32); @@ -625,7 +678,7 @@ mod tests { #[test] fn prepend() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); for i in (0..256).into_iter().rev() { @@ -640,7 +693,7 @@ mod tests { #[test] fn shuffled() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..256).into_iter().collect::>(); @@ -659,7 +712,7 @@ mod tests { #[test] fn merge_values() { - let index = FacetIndex::::new(4, 8); + let index = FacetIndex::::new(4, 8, 5); let mut keys = (0..256).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ 
-680,7 +733,7 @@ mod tests { #[test] fn delete_from_end() { - let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); @@ -745,7 +798,7 @@ mod tests { #[test] fn delete_from_start() { - let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); @@ -783,7 +836,7 @@ mod tests { #[test] fn delete_shuffled() { - let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); @@ -829,7 +882,7 @@ mod tests { #[test] fn in_place_level0_insert() { - let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); let mut keys = (0..16).into_iter().collect::<Vec<_>>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -849,7 +902,7 @@ mod tests { #[test] fn in_place_level0_delete() { - let index = FacetIndex::<OrderedF64Codec>::new(4, 8); + let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); let mut keys = (0..64).into_iter().collect::<Vec<_>>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -879,7 +932,7 @@ mod tests { #[test] fn shuffle_merge_string_and_delete() { - let index = FacetIndex::<StrRefCodec>::new(4, 8); + let index = FacetIndex::<StrRefCodec>::new(4, 8, 5); let mut keys = (1000..1064).into_iter().collect::<Vec<_>>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 04810cb48..3b46bb421 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,12 +1,9 @@ -use super::{FacetsUpdateBulk, FacetsUpdateIncremental}; -use crate::{ - facet::FacetType, - heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}, - CboRoaringBitmapCodec, FieldId, Index, Result, -}; -use heed::BytesDecode; -use roaring::RoaringBitmap; -use std::{collections::HashMap, fs::File}; +use self::incremental::FacetsUpdateIncremental; +use super::FacetsUpdateBulk; +use crate::facet::FacetType; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::{Index, Result}; +use std::fs::File; pub mod bulk; pub mod incremental; @@ -14,11 +11,13 @@ pub mod incremental; pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>, + facet_type: FacetType, + new_data: grenad::Reader<File>, + // Options: + // there's no way to change these for now level_group_size: u8, max_level_group_size: u8, min_level_size: u8, - facet_type: FacetType, - new_data: grenad::Reader<File>, } impl<'i> FacetsUpdate<'i> { pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self { @@ -42,36 +41,37 @@ impl<'i> FacetsUpdate<'i> { } pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + if self.new_data.is_empty() { + return Ok(()); + } // here, come up with a better condition! - if self.database.is_empty(wtxn)? { + // ideally we'd choose which method to use for each field id individually + // but I don't think it's worth the effort yet + // As a first requirement, we ask that the length of the new data is less + // than 1/50th of the length of the database in order to use the incremental + // method. + if self.new_data.len() >= (self.database.len(wtxn)?
as u64 / 50) { let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data) .level_group_size(self.level_group_size) .min_level_size(self.min_level_size); bulk_update.execute(wtxn)?; } else { - let indexer = FacetsUpdateIncremental::new(self.database) - .max_group_size(self.max_level_group_size) - .min_level_size(self.min_level_size); - - let mut new_faceted_docids = HashMap::::default(); - - let mut cursor = self.new_data.into_cursor()?; - while let Some((key, value)) = cursor.move_on_next()? { - let key = FacetGroupKeyCodec::::bytes_decode(key) - .ok_or(heed::Error::Encoding)?; - let docids = - CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; - indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; - *new_faceted_docids.entry(key.field_id).or_default() |= docids; - } - - for (field_id, new_docids) in new_faceted_docids { - let mut docids = - self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; - docids |= new_docids; - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; - } + let incremental_update = + FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data) + .group_size(self.level_group_size) + .max_group_size(self.max_level_group_size) + .min_level_size(self.min_level_size); + incremental_update.execute(wtxn)?; } Ok(()) } } + +#[cfg(test)] +mod tests { + // here I want to create a benchmark + // to find out at which point it is faster to do it incrementally + + #[test] + fn update() {} +} diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..c2b3896c4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..c2b3896c4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..c9f8951ac --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap new file mode 100644 index 000000000..c9f8951ac --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap @@ -0,0 +1,4 @@ 
+--- +source: milli/src/update/facet/bulk.rs +--- +b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 9a89691b1..1d415166d 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,9 +6,9 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::FieldDocIdFacetF64Codec; -use crate::heed_codec::facet::OrderedF64Codec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, +}; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 078a82335..e6a41067b 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,8 +4,7 @@ use std::io; use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; -use crate::heed_codec::facet::StrRefCodec; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, StrRefCodec}; use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index e18cb4e16..03f15945a 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::time::Instant; -use grenad::{CompressionType, Reader, Sorter}; +use grenad::{CompressionType, Sorter}; use heed::types::ByteSlice; use log::debug; @@ -208,36 +208,6 @@ pub fn grenad_obkv_into_chunks( Ok(std::iter::from_fn(move || transposer().transpose())) } -pub fn write_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - reader: Reader, - merge: MergeFn, -) -> Result<()> { - debug!("Writing MTBL stores..."); - let before = Instant::now(); - - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; - let val = merge(k, vals)?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? 
}; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - - debug!("MTBL stores merged in {:.02?}!", before.elapsed()); - Ok(()) -} - pub fn sorter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 7e2ebd2d3..8fb629cae 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -9,8 +9,8 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, - writer_into_reader, GrenadParameters, MergeableReader, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2a2511362..96bea9589 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,8 +27,7 @@ pub use self::enrich::{ pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, - ClonableMmap, MergeFn, + sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 8fba16d3d..b13118e09 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -2,7 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; -pub use self::facet::incremental::FacetsUpdateIncremental; +pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; From b2f01ad2042ce102fe03141cbf2b3ef65762aced Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Sep 2022 11:52:57 +0200 Subject: [PATCH 1719/1889] Refactor facet database tests --- .../search/facet/facet_distribution_iter.rs | 51 +-- milli/src/search/facet/facet_range_search.rs | 14 +- .../src/search/facet/facet_sort_ascending.rs | 4 +- .../src/search/facet/facet_sort_descending.rs | 4 +- milli/src/search/facet/mod.rs | 146 ------- milli/src/update/facet/bulk.rs | 407 ++++++++---------- milli/src/update/facet/incremental.rs | 272 +++++------- milli/src/update/facet/mod.rs | 244 ++++++++++- .../default.hash.snap} | 2 +- .../large_group_small_min_level.hash.snap} | 2 +- .../odd_group_odd_min_level.hash.snap} | 2 +- .../small_group_large_min_level.hash.snap} | 2 +- .../small_group_small_min_level.hash.snap | 4 + .../default.hash.snap | 4 + .../large_group_small_min_level.hash.snap | 4 + .../odd_group_odd_min_level.hash.snap | 4 + 
.../small_group_large_min_level.hash.snap | 4 + .../small_group_small_min_level.hash.snap | 4 + .../facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../default/facet_id_f64_docids.hash.snap | 4 - .../facet_id_f64_docids.hash.snap | 4 - .../default/facet_id_string_docids.hash.snap | 4 - .../facet_id_string_docids.hash.snap | 4 - .../default/facet_id_string_docids.hash.snap | 4 - .../facet_id_string_docids.hash.snap | 4 - milli/src/update/mod.rs | 2 +- .../word_pair_proximity_docids.hash.snap | 4 - 28 files changed, 568 insertions(+), 644 deletions(-) rename milli/src/update/facet/snapshots/bulk.rs/{test_facets_number/default/facet_id_f64_docids.hash.snap => insert/default.hash.snap} (58%) rename milli/src/update/facet/snapshots/bulk.rs/{test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap => insert/large_group_small_min_level.hash.snap} (58%) rename milli/src/update/facet/snapshots/bulk.rs/{test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap => insert/odd_group_odd_min_level.hash.snap} (58%) rename milli/src/update/facet/snapshots/bulk.rs/{test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap => insert/small_group_large_min_level.hash.snap} (58%) create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap delete mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 2eebffbcd..3379d1abe 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -114,14 +114,13 @@ where mod tests { use std::ops::ControlFlow; - use heed::BytesDecode; - use rand::{Rng, SeedableRng}; - use 
roaring::RoaringBitmap; - use super::iterate_over_facet_distribution; use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; - use crate::search::facet::test::FacetIndex; + use crate::update::facet::tests::FacetIndex; + use heed::BytesDecode; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -164,17 +163,11 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); - iterate_over_facet_distribution( - &txn, - index.db.content, - 0, - &candidates, - |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - }, - ) + iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + ControlFlow::Continue(()) + }) .unwrap(); milli_snap!(results, i); @@ -189,23 +182,17 @@ mod tests { let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); let mut nbr_facets = 0; - iterate_over_facet_distribution( - &txn, - index.db.content, - 0, - &candidates, - |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - if nbr_facets == 100 { - return ControlFlow::Break(()); - } else { - nbr_facets += 1; - results.push_str(&format!("{facet}: {count}\n")); + iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return ControlFlow::Break(()); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - } - }, - ) + ControlFlow::Continue(()) + } + }) .unwrap(); milli_snap!(results, i); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index bb555e1ab..cb5fd14d2 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -263,8 +263,8 @@ mod tests { use super::find_docids_of_facet_within_bounds; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -312,7 +312,7 @@ mod tests { let end = Bound::Included(i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -328,7 +328,7 @@ mod tests { let end = Bound::Excluded(i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -354,7 +354,7 @@ mod tests { let end = Bound::Included(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -373,7 +373,7 @@ mod tests { let end = Bound::Excluded(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -401,7 +401,7 @@ mod tests { let end = Bound::Included(255. 
- i); let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>( &txn, - index.db.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(), + index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(), 0, &start, &end, @@ -420,7 +420,7 @@ let end = Bound::Excluded(255. - i); let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>( &txn, - index.db.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(), + index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(), 0, &start, &end, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index fc5fd3d04..f320f9e77 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -89,8 +89,8 @@ mod tests { use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex<OrderedF64Codec> { let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); @@ -133,7 +133,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::<RoaringBitmap>(); let mut results = String::new(); - let iter = ascending_facet_sort(&txn, index.db.content, 0, candidates).unwrap(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); results.push_str(&display_bitmap(&docids)); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 42bae42a6..be5fe7841 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -122,8 +122,8 @@ mod tests { use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex<OrderedF64Codec> { let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); @@ -166,7 +166,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::<RoaringBitmap>(); let mut results = String::new(); - let db = index.db.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(); + let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(); let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ef72658ec..fc71acf37 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -74,149 +74,3 @@ pub(crate) fn get_highest_level<'t>( }) .unwrap_or(0)) } - -#[cfg(test)] -pub mod test { - use std::fmt::Display; - use std::marker::PhantomData; - use std::rc::Rc; - - use heed::{BytesDecode, BytesEncode, Env, RwTxn}; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, - }; - use crate::snapshot_tests::display_bitmap; - use crate::update::FacetsUpdateIncrementalInner; - - // A dummy index that only contains the facet database, used for testing - pub struct FacetIndex<BoundCodec> - where - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>, - { - pub env: Env, - pub db: Database, - _phantom: PhantomData<BoundCodec>, - } - - // The facet database and its settings - pub struct Database { - pub content: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>, - pub group_size: u8, - pub 
min_level_size: u8, - pub max_group_size: u8, - _tempdir: Rc<tempfile::TempDir>, - } - - impl<BoundCodec> FacetIndex<BoundCodec> - where - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>, - { - #[cfg(all(test, fuzzing))] - pub fn open_from_tempdir( - tempdir: Rc<tempfile::TempDir>, - group_size: u8, - max_group_size: u8, - min_level_size: u8, - ) -> FacetIndex<BoundCodec> { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 - let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 - let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf - - let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 10 * 100); - unsafe { - options.flag(heed::flags::Flags::MdbAlwaysFreePages); - } - let env = options.open(tempdir.path()).unwrap(); - let content = env.open_database(None).unwrap().unwrap(); - - FacetIndex { - db: Database { - content, - group_size, - max_group_size, - min_level_size, - _tempdir: tempdir, - }, - env, - _phantom: PhantomData, - } - } - pub fn new( - group_size: u8, - max_group_size: u8, - min_level_size: u8, - ) -> FacetIndex<BoundCodec> { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 - let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 - let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf - let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 100); - let tempdir = tempfile::TempDir::new().unwrap(); - let env = options.open(tempdir.path()).unwrap(); - let content = env.create_database(None).unwrap(); - - FacetIndex { - db: Database { - content, - group_size, - max_group_size, - min_level_size, - _tempdir: Rc::new(tempdir), - }, - env, - _phantom: PhantomData, - } - } - pub fn insert<'a>( - &self, - rwtxn: &'a mut RwTxn, - field_id: u16, - key: &'a <BoundCodec as BytesEncode<'a>>::EItem, - docids: &RoaringBitmap, - ) { - let update = FacetsUpdateIncrementalInner::new(self.db.content); - let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); - } - pub fn delete<'a>( - &self, - rwtxn: &'a mut RwTxn, - field_id: u16, - key: &'a <BoundCodec as BytesEncode<'a>>::EItem, - value: u32, - ) { - let update = FacetsUpdateIncrementalInner::new(self.db.content); - let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - update.delete(rwtxn, field_id, &key_bytes, value).unwrap(); - } - } - - impl<BoundCodec> Display for FacetIndex<BoundCodec> - where - for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized + Display, - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>, - { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let txn = self.env.read_txn().unwrap(); - let mut iter = self.db.content.iter(&txn).unwrap(); - while let Some(el) = iter.next() { - let (key, value) = el.unwrap(); - let FacetGroupKey { field_id, level, left_bound: bound } = key; - let bound = BoundCodec::bytes_decode(bound).unwrap(); - let FacetGroupValue { size, bitmap } = value; - writeln!( - f, - "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", - values = display_bitmap(&bitmap) - )?; - } - Ok(()) - } - } -} diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index ad97ed2de..321ae52d4 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -19,7 +19,7 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>, -
level_group_size: u8, + group_size: u8, min_level_size: u8, facet_type: FacetType, // None if level 0 does not need to be updated @@ -42,7 +42,7 @@ impl<'i> FacetsUpdateBulk<'i> { index.facet_id_f64_docids.remap_key_type::>() } }, - level_group_size: 4, + group_size: 4, min_level_size: 5, facet_type, new_data: Some(new_data), @@ -63,7 +63,7 @@ impl<'i> FacetsUpdateBulk<'i> { index.facet_id_f64_docids.remap_key_type::>() } }, - level_group_size: 4, + group_size: 4, min_level_size: 5, facet_type, new_data: None, @@ -74,61 +74,85 @@ impl<'i> FacetsUpdateBulk<'i> { /// /// This setting is always greater than or equal to 2. pub fn level_group_size(mut self, value: u8) -> Self { - self.level_group_size = cmp::max(value, 2); + self.group_size = cmp::max(value, 2); self } /// The minimum number of elements that a level is allowed to have. pub fn min_level_size(mut self, value: u8) -> Self { - self.min_level_size = cmp::max(value, 1); + self.min_level_size = cmp::max(value, 2); self } + #[logging_timer::time("FacetsUpdateBulk::{}")] + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + + let Self { index, database, group_size, min_level_size, facet_type, new_data } = self; + + index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size }; + + let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); + + inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { + index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; + Ok(()) + })?; + + Ok(()) + } +} + +pub(crate) struct FacetsUpdateBulkInner { + pub db: heed::Database, FacetGroupValueCodec>, + pub new_data: Option>, + pub group_size: u8, + pub min_level_size: u8, +} +impl FacetsUpdateBulkInner { + pub fn update( + mut self, + wtxn: &mut RwTxn, + field_ids: &[u16], + mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, + ) -> Result<()> { + self.update_level0(wtxn)?; + for &field_id in field_ids.iter() { + self.clear_levels(wtxn, field_id)?; + } + + for &field_id in field_ids.iter() { + let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; + + handle_all_docids(wtxn, field_id, all_docids)?; + + for level_reader in level_readers { + let mut cursor = level_reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + self.db.remap_types::().put(wtxn, k, v)?; + } + } + } + Ok(()) + } + fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; let range = left..=right; - self.database.delete_range(wtxn, &range).map(drop)?; + self.db.delete_range(wtxn, &range).map(drop)?; Ok(()) } - - #[logging_timer::time("FacetsUpdateBulk::{}")] - pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { - self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - - // We get the faceted fields to be able to create the facet levels. 
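// A hypothetical call site for the refactored bulk updater (an editor's
// sketch, not part of this patch): `index`, `new_data`, and `wtxn` are
// assumed to come from the surrounding indexing pipeline, and only the
// builder methods defined above are used.
let bulk_update = FacetsUpdateBulk::new(index, FacetType::Number, new_data)
    .level_group_size(4) // clamped to at least 2 by the setter
    .min_level_size(5); // also clamped to at least 2
bulk_update.execute(wtxn)?;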
- let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); - - for &field_id in faceted_fields.iter() { - self.clear_levels(wtxn, field_id)?; - } - self.update_level0(wtxn)?; - - for &field_id in faceted_fields.iter() { - let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; - - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &all_docids)?; - - for level_reader in level_readers { - let mut cursor = level_reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - self.database.remap_types::().put(wtxn, k, v)?; - } - } - } - - Ok(()) - } - fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { let new_data = match self.new_data.take() { Some(x) => x, None => return Ok(()), }; - if self.database.is_empty(wtxn)? { + if self.db.is_empty(wtxn)? { let mut buffer = Vec::new(); - let mut database = self.database.iter_mut(wtxn)?.remap_types::(); + let mut database = self.db.iter_mut(wtxn)?.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { buffer.clear(); @@ -140,7 +164,7 @@ impl<'i> FacetsUpdateBulk<'i> { } } else { let mut buffer = Vec::new(); - let database = self.database.remap_types::(); + let database = self.db.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { @@ -164,47 +188,29 @@ impl<'i> FacetsUpdateBulk<'i> { database.put(wtxn, key, &buffer)?; } } - Ok(()) } - fn compute_levels_for_field_id( &self, field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { // TODO: first check whether there is anything in level 0? - let algo = ComputeHigherLevels { - rtxn: txn, - db: &self.database, - field_id, - level_group_size: self.level_group_size, - min_level_size: self.min_level_size, - }; let mut all_docids = RoaringBitmap::new(); - let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| { + let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { for bitmap in bitmaps { all_docids |= bitmap; } Ok(()) })?; - drop(algo); Ok((subwriters, all_docids)) } -} - -struct ComputeHigherLevels<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, - field_id: u16, - level_group_size: u8, - min_level_size: u8, -} -impl<'t> ComputeHigherLevels<'t> { - fn read_level_0( + fn read_level_0<'t>( &self, + rtxn: &'t RoTxn, + field_id: u16, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result<()> { // we read the elements one by one and @@ -213,13 +219,13 @@ impl<'t> ComputeHigherLevels<'t> { let mut bitmaps = vec![]; let mut level_0_prefix = vec![]; - level_0_prefix.extend_from_slice(&self.field_id.to_be_bytes()); + level_0_prefix.extend_from_slice(&field_id.to_be_bytes()); level_0_prefix.push(0); let level_0_iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, level_0_prefix.as_slice())? + .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? .remap_types::, FacetGroupValueCodec>(); let mut left_bound: &[u8] = &[]; @@ -235,7 +241,7 @@ impl<'t> ComputeHigherLevels<'t> { } bitmaps.push(docids); - if bitmaps.len() == self.level_group_size as usize { + if bitmaps.len() == self.group_size as usize { handle_group(&bitmaps, left_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); @@ -254,13 +260,15 @@ impl<'t> ComputeHigherLevels<'t> { /// ## Returns: /// A vector of grenad::Reader. 
The reader at index `i` corresponds to the elements of level `i + 1` /// that must be inserted into the database. - fn compute_higher_levels( + fn compute_higher_levels<'t>( &self, + rtxn: &'t RoTxn, + field_id: u16, level: u8, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result<Vec<grenad::Reader<File>>> { if level == 0 { - self.read_level_0(handle_group)?; + self.read_level_0(rtxn, field_id, handle_group)?; // Level 0 is already in the database return Ok(vec![]); } @@ -270,7 +278,7 @@ impl<'t> ComputeHigherLevels<'t> { // of those elements, and their bitmaps, to the level above let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); - let mut cur_writer_len = 0; + let mut cur_writer_len: usize = 0; let mut group_sizes = vec![]; let mut left_bounds = vec![]; @@ -278,8 +286,13 @@ impl<'t> ComputeHigherLevels<'t> { // compute the levels below // in the callback, we fill `cur_writer` with the correct elements for this level - let mut sub_writers = - self.compute_higher_levels(level - 1, &mut |sub_bitmaps, left_bound| { + let mut sub_writers = self.compute_higher_levels( + rtxn, + field_id, + level - 1, + &mut |sub_bitmaps, left_bound| { + // TODO: is this done unnecessarily for all 32 levels? + println!("level: {level}"); let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; @@ -288,7 +301,7 @@ impl<'t> ComputeHigherLevels<'t> { left_bounds.push(left_bound); bitmaps.push(combined_bitmap); - if bitmaps.len() != self.level_group_size as usize { + if bitmaps.len() != self.group_size as usize { return Ok(()); } let left_bound = left_bounds.first().unwrap(); @@ -297,7 +310,7 @@ impl<'t> ComputeHigherLevels<'t> { for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id, level, left_bound }; let key = FacetGroupKeyCodec::<ByteSliceRef>::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; @@ -307,15 +320,26 @@ impl<'t> ComputeHigherLevels<'t> { cur_writer_len += 1; } Ok(()) - })?; + }, + )?; // don't forget to insert the leftover elements into the writer as well - if !bitmaps.is_empty() && cur_writer_len >= self.min_level_size { + + // but only do so if the current number of elements to be inserted into this + // level could grow to the minimum level size + + if !bitmaps.is_empty() && (cur_writer_len >= self.min_level_size as usize - 1) { + // the length of bitmaps is between 0 and group_size + assert!(bitmaps.len() < self.group_size as usize); + assert!(cur_writer_len > 0); + + let left_bound = left_bounds.first().unwrap(); handle_group(&bitmaps, left_bound)?; + + // Note: how many bitmaps are there here? 
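// Stepping back, the grouping rule driving this recursion: every `group_size`
// consecutive entries of the level below become one parent entry. In
// pseudocode, with `below` holding the ordered entries of `level - 1`:
//
//     for children in below.chunks(group_size) {
//         parent.left_bound = children[0].left_bound; // first child's bound
//         parent.size = children.len();               // at most group_size
//         parent.bitmap = union of the children's bitmaps;
//     }
//
// The final chunk may hold fewer than `group_size` children; that partial
// group is exactly the leftover case handled here.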
for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id, level, left_bound }; let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; @@ -324,9 +348,12 @@ impl<'t> ComputeHigherLevels<'t> { cur_writer_len += 1; } } - if cur_writer_len > self.min_level_size { + // if we inserted enough elements to reach the minimum level size, then we push the writer + if cur_writer_len as u8 >= self.min_level_size { sub_writers.push(writer_into_reader(cur_writer)?); } else { + // otherwise, if there are still leftover elements, we give them to the level above + // this is necessary in order to get the union of all docids if !bitmaps.is_empty() { handle_group(&bitmaps, left_bounds.first().unwrap())?; } @@ -337,184 +364,90 @@ impl<'t> ComputeHigherLevels<'t> { #[cfg(test)] mod tests { - use std::num::NonZeroUsize; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; + use roaring::RoaringBitmap; + use std::iter::once; #[test] - fn test_facets_number() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; + fn insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1_000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); - test("small_groups_small_levels", NonZeroUsize::new(2), NonZeroUsize::new(2)); - test("small_groups_large_levels", NonZeroUsize::new(2), NonZeroUsize::new(128)); - test("large_groups_small_levels", NonZeroUsize::new(16), NonZeroUsize::new(2)); - test("large_groups_large_levels", NonZeroUsize::new(16), NonZeroUsize::new(256)); - } - - #[test] - fn test_facets_string() { - let test = |name: &str, - group_size: Option, - min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; - - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..100 { - 
documents.push( - serde_json::json!({ "facet": format!("s{i:X}") }).as_object().unwrap().clone(), - ); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..1_000u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); } - for i in 0..10 { - documents.push( - serde_json::json!({ "facet2": format!("s{i:X}") }).as_object().unwrap().clone(), - ); + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); } - let documents = documents_batch_reader_from_objects(documents); + let mut wtxn = index.env.write_txn().unwrap(); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - index.add_documents(documents).unwrap(); + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); - db_snap!(index, facet_id_string_docids, name); + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); }; - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); } - #[test] - fn test_facets_number_incremental_update() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; + fn insert_delete_field_insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + let mut wtxn = index.env.write_txn().unwrap(); - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..100u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - let mut documents = vec![]; - for i in 0..1000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + // delete all the elements for the facet id 0 + for i in 0..100u32 { + index.delete(&mut wtxn, 0, &(i as f64), i); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); - index.add_documents(documents_batch).unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + // then add some elements again for the facet id 1 + for i in 0..110u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - let mut 
documents = vec![]; - for i in 1000..1010 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 100..110 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); + wtxn.commit().unwrap(); - index.add_documents(documents_batch).unwrap(); + milli_snap!(format!("{index}"), name); + }; - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); - } - - #[test] - fn test_facets_number_delete_facet_id_then_bulk_update() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; - - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); - - index.add_documents(documents_batch).unwrap(); - - // 1100 facets -> how long is the DB? - - let mut documents = vec![]; - for i in 1000..1010 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 100..110 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); - - index.add_documents(documents_batch).unwrap(); - - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 75ca5d55b..14b421242 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -84,15 +84,10 @@ impl<'i> FacetsUpdateIncremental<'i> { } pub struct FacetsUpdateIncrementalInner { - db: heed::Database, FacetGroupValueCodec>, - group_size: u8, - min_level_size: u8, - max_group_size: u8, -} -impl FacetsUpdateIncrementalInner { - pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { - Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } - } + pub db: heed::Database, FacetGroupValueCodec>, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, } impl FacetsUpdateIncrementalInner { fn find_insertion_key_value( @@ -528,82 +523,13 @@ impl FacetsUpdateIncrementalInner { #[cfg(test)] mod tests { - use heed::types::ByteSlice; - use heed::{BytesDecode, BytesEncode}; + use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; use rand::seq::SliceRandom; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use 
crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, StrRefCodec, - }; - use crate::milli_snap; - use crate::search::facet::get_highest_level; - use crate::search::facet::test::FacetIndex; - - pub fn verify_structure_validity(index: &FacetIndex, field_id: u16) - where - for<'a> C: BytesDecode<'a> + BytesEncode<'a, EItem = >::DItem>, - { - let FacetIndex { env, db, .. } = index; - - let txn = env.write_txn().unwrap(); - let mut field_id_prefix = vec![]; - field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); - - let highest_level = get_highest_level(&txn, index.db.content, field_id).unwrap(); - txn.commit().unwrap(); - - let txn = env.read_txn().unwrap(); - for level_no in (1..=highest_level).rev() { - let mut level_no_prefix = vec![]; - level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); - level_no_prefix.push(level_no); - - let mut iter = db - .content - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level_no_prefix) - .unwrap(); - while let Some(el) = iter.next() { - let (key, value) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); - - let mut prefix_start_below = vec![]; - prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); - prefix_start_below.push(level_no - 1); - prefix_start_below.extend_from_slice(&key.left_bound); - - let start_below = { - let mut start_below_iter = db - .content - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( - &txn, - &prefix_start_below, - ) - .unwrap(); - let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); - FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() - }; - - assert!(value.size > 0 && value.size < db.max_group_size); - - let mut actual_size = 0; - let mut values_below = RoaringBitmap::new(); - let mut iter_below = - db.content.range(&txn, &(start_below..)).unwrap().take(value.size as usize); - while let Some(el) = iter_below.next() { - let (_, value) = el.unwrap(); - actual_size += 1; - values_below |= value.bitmap; - } - assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); - - assert_eq!(value.bitmap, values_below); - } - } - } #[test] fn append() { let index = FacetIndex::::new(4, 8, 5); @@ -614,7 +540,9 @@ mod tests { index.insert(&mut txn, 0, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] @@ -641,9 +569,11 @@ mod tests { index.insert(&mut txn, 1, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); - verify_structure_validity(&index, 1); - verify_structure_validity(&index, 2); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] @@ -670,9 +600,11 @@ mod tests { index.insert(&mut txn, 1, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); - verify_structure_validity(&index, 1); - verify_structure_validity(&index, 2); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } @@ -686,8 +618,9 @@ mod tests { bitmap.insert(i); 
index.insert(&mut txn, 0, &(i as f64), &bitmap); } + + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - verify_structure_validity(&index, 0); milli_snap!(format!("{index}")); } @@ -705,146 +638,138 @@ mod tests { bitmap.insert(key); index.insert(&mut txn, 0, &(key as f64), &bitmap); } + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - verify_structure_validity(&index, 0); milli_snap!(format!("{index}")); } #[test] fn merge_values() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..256).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); + for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(rng.gen_range(256..512)); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] fn delete_from_end() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(&(i as f64)), &bitmap); - txn.commit().unwrap(); } for i in (200..256).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 200); + let mut txn = index.env.write_txn().unwrap(); for i in (150..200).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 150); - + let mut txn = index.env.write_txn().unwrap(); for i in (100..150).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 100); - + let mut txn = index.env.write_txn().unwrap(); for i in (17..100).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 17); - let mut txn = index.env.write_txn().unwrap(); for i in (15..17).into_iter().rev() { index.delete(&mut txn, 0, &(i as f64), i as u32); } + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - 
verify_structure_validity(&index, 0); milli_snap!(format!("{index}"), 15); + let mut txn = index.env.write_txn().unwrap(); for i in (0..15).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 0); } #[test] fn delete_from_start() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(i as f64), &bitmap); - txn.commit().unwrap(); } for i in 0..128 { - let mut txn = index.env.write_txn().unwrap(); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); for i in 128..216 { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 215); + let mut txn = index.env.write_txn().unwrap(); for i in 216..256 { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 255); } #[test] fn delete_shuffled() { let index = FacetIndex::::new(4, 8, 5); - + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(i as f64), &bitmap); - txn.commit().unwrap(); } let mut keys = (0..256).into_iter().collect::>(); @@ -853,36 +778,37 @@ mod tests { for i in 0..128 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); for i in 128..216 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + let mut txn = index.env.write_txn().unwrap(); milli_snap!(format!("{index}"), 215); for i in 216..256 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = 
index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 255); } #[test] fn in_place_level0_insert() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + let mut keys = (0..16).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -890,19 +816,19 @@ mod tests { for &key in keys.iter() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(rng.gen_range(i * 256..(i + 1) * 256)); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] fn in_place_level0_delete() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..64).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -912,27 +838,29 @@ mod tests { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "before_delete"); + let mut txn = index.env.write_txn().unwrap(); + for &key in keys.iter() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key + 100); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "after_delete"); } #[test] fn shuffle_merge_string_and_delete() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (1000..1064).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -942,21 +870,21 @@ mod tests { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &format!("{key:x}").as_str(), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "before_delete"); + let mut txn = index.env.write_txn().unwrap(); + for &key in keys.iter() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "after_delete"); } @@ -1083,7 +1011,7 @@ mod tests { // assert_eq!(key, &group_key.left_bound); // assert_eq!(values, 
&group_values.bitmap); // } -// verify_structure_validity(&index, *field_id); +// index.verify_structure_validity(*field_id); // } // index.db.content.clear(&mut txn).unwrap(); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 3b46bb421..7298fecc5 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -68,10 +68,244 @@ impl<'i> FacetsUpdate<'i> { } #[cfg(test)] -mod tests { - // here I want to create a benchmark - // to find out at which point it is faster to do it incrementally +pub(crate) mod tests { + use super::bulk::FacetsUpdateBulkInner; + use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + }; + use crate::search::facet::get_highest_level; + use crate::snapshot_tests::display_bitmap; + use crate::update::FacetsUpdateIncrementalInner; + use crate::CboRoaringBitmapCodec; + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; + use roaring::RoaringBitmap; + use std::fmt::Display; + use std::marker::PhantomData; + use std::rc::Rc; - #[test] - fn update() {} + // A dummy index that only contains the facet database, used for testing + pub struct FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub env: Env, + pub content: heed::Database, FacetGroupValueCodec>, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, + _tempdir: Rc, + _phantom: PhantomData, + } + + impl FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + #[cfg(all(test, fuzzing))] + pub fn open_from_tempdir( + tempdir: Rc, + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 10 * 100); + unsafe { + options.flag(heed::flags::Flags::MdbAlwaysFreePages); + } + let env = options.open(tempdir.path()).unwrap(); + let content = env.open_database(None).unwrap().unwrap(); + + FacetIndex { + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: tempdir, + }, + env, + _phantom: PhantomData, + } + } + pub fn new( + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 100); + let tempdir = tempfile::TempDir::new().unwrap(); + let env = options.open(tempdir.path()).unwrap(); + let content = env.create_database(None).unwrap(); + + FacetIndex { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: Rc::new(tempdir), + env, + _phantom: PhantomData, + } + } + pub fn insert<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size, + min_level_size: self.min_level_size, + 
max_group_size: self.max_group_size, + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); + } + pub fn delete<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + value: u32, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size, + min_level_size: self.min_level_size, + max_group_size: self.max_group_size, + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(wtxn, field_id, &key_bytes, value).unwrap(); + } + + pub fn bulk_insert<'a, 'b>( + &self, + wtxn: &'a mut RwTxn, + field_ids: &[u16], + els: impl IntoIterator< + Item = &'a ((u16, >::EItem), RoaringBitmap), + >, + ) where + for<'c> >::EItem: Sized, + { + let mut new_data = vec![]; + let mut writer = grenad::Writer::new(&mut new_data); + for ((field_id, left_bound), docids) in els { + let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); + let key: FacetGroupKey<&[u8]> = + FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; + let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let value = CboRoaringBitmapCodec::bytes_encode(&docids).unwrap(); + writer.insert(&key, &value).unwrap(); + } + writer.finish().unwrap(); + let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); + + let update = FacetsUpdateBulkInner { + db: self.content, + new_data: Some(reader), + group_size: self.group_size, + min_level_size: self.min_level_size, + }; + + update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); + } + + pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { + let mut field_id_prefix = vec![]; + field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); + + let highest_level = get_highest_level(txn, self.content, field_id).unwrap(); + + for level_no in (1..=highest_level).rev() { + let mut level_no_prefix = vec![]; + level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_no_prefix.push(level_no); + + let mut iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &level_no_prefix) + .unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); + + let mut prefix_start_below = vec![]; + prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); + prefix_start_below.push(level_no - 1); + prefix_start_below.extend_from_slice(&key.left_bound); + + let start_below = { + let mut start_below_iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + &prefix_start_below, + ) + .unwrap(); + let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); + FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() + }; + + assert!(value.size > 0 && value.size < self.max_group_size); + + let mut actual_size = 0; + let mut values_below = RoaringBitmap::new(); + let mut iter_below = self + .content + .range(txn, &(start_below..)) + .unwrap() + .take(value.size as usize); + while let Some(el) = iter_below.next() { + let (_, value) = el.unwrap(); + actual_size += 1; + values_below |= value.bitmap; + } + assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); + + assert_eq!(value.bitmap, values_below); + } + } + } + } + + impl Display for FacetIndex + where + for<'a> >::EItem: Sized + Display, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + fn 
fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let txn = self.env.read_txn().unwrap(); + let mut iter = self.content.iter(&txn).unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let FacetGroupKey { field_id, level, left_bound: bound } = key; + let bound = BoundCodec::bytes_decode(bound).unwrap(); + let FacetGroupValue { size, bitmap } = value; + writeln!( + f, + "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", + values = display_bitmap(&bitmap) + )?; + } + Ok(()) + } + } } diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap index 960843592..bef20823c 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +b40dd31a65e033ffc6b35c027ce19506 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap index 960843592..74c40e6a3 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +7ee22d8e9387e72758f00918eb67e4c6 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap index 960843592..6fb086d35 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +60f567359382507afdaf45fb075740c3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap index 960843592..0271a6c6b 100644 --- 
a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +b986d6e6cbf425685f409a8b417010e1 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..d801ef19f --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +ee10dd2ae2b5c6621a89a5d0a9aa8ccc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap new file mode 100644 index 000000000..e9988f527 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +fa877559eef78b383b496c15a364a2dc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..64f5012a4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +be1b08073b9d9788d18080c1320151d7 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..bb0e9aa69 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +32a45d555df2e001420fea149818d376 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 960843592..000000000 --- 
a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 960843592..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap deleted file mode 100644 index c2b3896c4..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index c2b3896c4..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap deleted file mode 100644 index 574a3c393..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index 574a3c393..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap deleted file mode 100644 index c9f8951ac..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap 
b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index c9f8951ac..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index b13118e09..952720725 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -17,7 +17,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; mod delete_documents; -mod facet; +pub(crate) mod facet; mod index_documents; mod indexer_config; mod prefix_word_pairs; diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap deleted file mode 100644 index e50e50347..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/word_prefix_pair_proximity_docids.rs ---- -6873ff1f78d08f2b1a13bb9e37349c01 From bee3c23b45c0a1a9212ba7269c82a4eca5ad6e45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Sep 2022 13:39:08 +0200 Subject: [PATCH 1720/1889] Add comparison benchmark between bulk and incremental facet indexing --- milli/src/update/facet/bulk.rs | 2 - milli/src/update/facet/mod.rs | 85 ++++++++++++++++++- .../src/update/index_documents/typed_chunk.rs | 2 + 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 321ae52d4..90e287f23 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -291,8 +291,6 @@ impl FacetsUpdateBulkInner { field_id, level - 1, &mut |sub_bitmaps, left_bound| { - // TODO: is this done unnecessarily for all 32 levels? - println!("level: {level}"); let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 7298fecc5..caf88671e 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -13,8 +13,6 @@ pub struct FacetsUpdate<'i> { database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, new_data: grenad::Reader, - // Options: - // there's no way to change these for now level_group_size: u8, max_level_group_size: u8, min_level_size: u8, @@ -40,6 +38,28 @@ impl<'i> FacetsUpdate<'i> { } } + // TODO: use the options below? + // but I don't actually see why they should be configurable + // /// The minimum number of elements that a level is allowed to have. + // pub fn level_max_group_size(mut self, value: u8) -> Self { + // self.max_level_group_size = std::cmp::max(value, 4); + // self + // } + + // /// The number of elements from the level below that are represented by a single element in the level above + // /// + // /// This setting is always greater than or equal to 2. + // pub fn level_group_size(mut self, value: u8) -> Self { + // self.level_group_size = std::cmp::max(value, 2); + // self + // } + + // /// The minimum number of elements that a level is allowed to have. 
+ // pub fn min_level_size(mut self, value: u8) -> Self { + // self.min_level_size = std::cmp::max(value, 2); + // self + // } + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { if self.new_data.is_empty() { return Ok(()); @@ -144,7 +164,7 @@ pub(crate) mod tests { let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 100); + let options = options.map_size(4096 * 4 * 1000); let tempdir = tempfile::TempDir::new().unwrap(); let env = options.open(tempdir.path()).unwrap(); let content = env.create_database(None).unwrap(); @@ -309,3 +329,62 @@ pub(crate) mod tests { } } } + +#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + + use super::tests::FacetIndex; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // It appears that the incremental indexer is about 50 times slower than the + // bulk indexer. + #[test] + fn benchmark_facet_indexing() { + // for each index size, we bulk-insert 50_000 * i facet values, then compare the speed of incrementally adding 1, 100, 1_000, and 10_000 documents to it + + let mut facet_value = 0; + + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, facet_value as f64), once(i).collect())); + facet_value += 1; + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // insert `nbr_doc` documents, one at a time + // + for _ in 0..nbr_doc { + index.insert(&mut txn, 0, &r.gen(), &once(1).collect()); + } + let time_spent = timer.elapsed().as_millis(); + println!(" add {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 16784bd92..f11414f20 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -138,11 +138,13 @@ pub(crate) fn write_typed_chunk_into_index( is_merged_database = true; } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { + // TODO indexer options for the facet level database let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?; is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { + // TODO indexer options for the facet level database let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); indexer.execute(wtxn)?; is_merged_database = true; From 27454e9828ef76d85bb530a63a73e4948b902809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 16:44:08 +0200 Subject: [PATCH 1721/1889] Document and refine facet indexing algorithms --- 
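The benchmark introduced in the previous patch measures the incremental indexer at roughly fifty times slower than the bulk indexer. That figure suggests picking the strategy from the relative size of an update. A possible rule of thumb, sketched under the assumption that both costs scale linearly with the number of entries; the helper name and the constant are illustrative only:

fn should_rebuild_in_bulk(existing_entries: u64, new_entries: u64) -> bool {
    // incremental cost grows like 50 * new_entries, while a bulk rebuild
    // touches every entry once: existing_entries + new_entries
    50 * new_entries > existing_entries + new_entries
}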
milli/src/heed_codec/facet/mod.rs | 21 +- milli/src/update/facet/bulk.rs | 88 ++--- milli/src/update/facet/incremental.rs | 440 ++++++++++++++---------- milli/src/update/facet/mod.rs | 125 ++++--- milli/src/update/index_documents/mod.rs | 4 +- 5 files changed, 387 insertions(+), 291 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 40e395881..2e9f0b212 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -29,31 +29,14 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { } } +/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`] +/// databases. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct FacetGroupKey<T> { pub field_id: u16, pub level: u8, pub left_bound: T, } -impl<'a> FacetGroupKey<&'a [u8]> { - pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> { - FacetGroupKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.to_vec(), - } - } -} - -impl<'a> FacetGroupKey<Vec<u8>> { - pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { - FacetGroupKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.as_slice(), - } - } -} #[derive(Debug)] pub struct FacetGroupValue { diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 90e287f23..83fa51003 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,24 +1,30 @@ -use std::borrow::Cow; -use std::cmp; -use std::fs::File; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; -use time::OffsetDateTime; - use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; +use std::borrow::Cow; +use std::fs::File; +use time::OffsetDateTime; +use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; + +/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases +/// by rebuilding the database "from scratch". +/// +/// First, the new elements are inserted into the level 0 of the database. Then, the +/// higher levels are cleared and recomputed from the content of level 0. +/// +/// Finally, the `faceted_documents_ids` value in the main database of `Index` +/// is updated to contain the new set of faceted documents.
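// A worked example of how deep the rebuilt structure gets, assuming the
// defaults used earlier in this series (group_size = 4, min_level_size = 5).
// Each level holds roughly ceil(n / group_size) entries and is only kept if
// it reaches min_level_size; this sketch ignores the leftover-group
// subtleties of `compute_higher_levels`:
fn count_levels_above_0(mut n: u32, group_size: u32, min_level_size: u32) -> u8 {
    let mut levels = 0;
    loop {
        n = (n + group_size - 1) / group_size; // size of the next level up
        if n < min_level_size {
            return levels;
        }
        levels += 1;
    }
}
// e.g. count_levels_above_0(1000, 4, 5) == 3: levels of 250, 63, and 16 keys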
pub struct FacetsUpdateBulk<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, group_size: u8, min_level_size: u8, facet_type: FacetType, @@ -31,22 +37,10 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, facet_type: FacetType, new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, ) -> FacetsUpdateBulk<'i> { - FacetsUpdateBulk { - index, - database: match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }, - group_size: 4, - min_level_size: 5, - facet_type, - new_data: Some(new_data), - } + FacetsUpdateBulk { index, group_size, min_level_size, facet_type, new_data: Some(new_data) } } pub fn new_not_updating_level_0( @@ -55,44 +49,31 @@ impl<'i> FacetsUpdateBulk<'i> { ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, - database: match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }, - group_size: 4, - min_level_size: 5, + group_size: FACET_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, new_data: None, } } - /// The number of elements from the level below that are represented by a single element in the level above - /// - /// This setting is always greater than or equal to 2. - pub fn level_group_size(mut self, value: u8) -> Self { - self.group_size = cmp::max(value, 2); - self - } - - /// The minimum number of elements that a level is allowed to have. - pub fn min_level_size(mut self, value: u8) -> Self { - self.min_level_size = cmp::max(value, 2); - self - } - #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let Self { index, database, group_size, min_level_size, facet_type, new_data } = self; + let Self { index, group_size, min_level_size, facet_type, new_data } = self; + + let db = match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size }; + let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); @@ -105,6 +86,7 @@ impl<'i> FacetsUpdateBulk<'i> { } } +/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { pub db: heed::Database, FacetGroupValueCodec>, pub new_data: Option>, diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 14b421242..6be2dbf03 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,16 +1,14 @@ -use std::collections::HashMap; -use std::fs::File; - -use heed::types::ByteSlice; -use heed::{BytesDecode, Error, RoTxn, RwTxn}; -use roaring::RoaringBitmap; - use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use 
roaring::RoaringBitmap; +use std::collections::HashMap; +use std::fs::File; enum InsertionResult { InPlace, @@ -18,10 +16,15 @@ } enum DeletionResult { InPlace, - Reduce { prev: Option>, next: Option> }, - Remove { prev: Option>, next: Option> }, + Reduce { next: Option> }, + Remove { next: Option> }, } +/// Algorithm to incrementally insert and delete elements into the +/// `facet_id_(string/f64)_docids` databases. +/// +/// The `faceted_documents_ids` value in the main database of `Index` +/// is also updated to contain the new set of faceted documents. pub struct FacetsUpdateIncremental<'i> { index: &'i Index, inner: FacetsUpdateIncrementalInner, @@ -30,7 +33,14 @@ } impl<'i> FacetsUpdateIncremental<'i> { - pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { + pub fn new( + index: &'i Index, + facet_type: FacetType, + new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, + max_group_size: u8, + ) -> Self { FacetsUpdateIncremental { index, inner: FacetsUpdateIncrementalInner { @@ -42,26 +52,15 @@ .facet_id_f64_docids .remap_key_type::>(), }, - group_size: 4, - max_group_size: 8, - min_level_size: 5, + group_size, + max_group_size, + min_level_size, }, facet_type, new_data, } } - pub fn group_size(mut self, size: u8) -> Self { - self.inner.group_size = size; - self - } - pub fn min_level_size(mut self, size: u8) -> Self { - self.inner.min_level_size = size; - self - } - pub fn max_group_size(mut self, size: u8) -> Self { - self.inner.max_group_size = size; - self - } + pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { let mut new_faceted_docids = HashMap::::default(); @@ -83,6 +82,7 @@ } } +/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type pub struct FacetsUpdateIncrementalInner { pub db: heed::Database, FacetGroupValueCodec>, pub group_size: u8, @@ -90,22 +90,36 @@ pub max_group_size: u8, } impl FacetsUpdateIncrementalInner { + /// Find the `FacetGroupKey`/`FacetGroupValue` in the database that + /// should be used to insert the new `facet_value` for the given `field_id` and `level` + /// where `level` must be strictly greater than 0. + /// + /// For example, when inserting the facet value `4`, there are two possibilities: + /// + /// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore, + /// we know that the implicit range of the first key is 3..6, which contains 4. + /// So the new facet value belongs in that first key/value pair. + /// + /// 2. The first key of the level has a lower bound of `5`. We return this key/value pair + /// but will need to change the lower bound of this key to `4` in order to insert this facet value.
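The two cases can be illustrated with a small, self-contained model; the integers stand in for the encoded byte bounds, and `find_insertion_index` is a hypothetical helper, not part of the codebase:

```rust
// Given the sorted left bounds of one level, return the index of the key that
// should receive `facet_value`: either the key whose implicit range contains
// it (case 1), or the first key, whose left bound will have to be lowered
// (case 2).
fn find_insertion_index(left_bounds: &[u64], facet_value: u64) -> usize {
    match left_bounds.binary_search(&facet_value) {
        Ok(i) => i,      // exact match on an existing left bound
        Err(0) => 0,     // case 2: smaller than every left bound
        Err(i) => i - 1, // case 1: inside the previous key's implicit range
    }
}

fn main() {
    assert_eq!(find_insertion_index(&[3, 6], 4), 0); // range 3..6 contains 4
    assert_eq!(find_insertion_index(&[5, 9], 4), 0); // left bound 5 must become 4
}
```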
fn find_insertion_key_value( &self, field_id: u16, level: u8, - search_key: &[u8], + facet_value: &[u8], txn: &RoTxn, ) -> Result<(FacetGroupKey>, FacetGroupValue)> { + assert!(level > 0); + let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); - prefix.extend_from_slice(search_key); + prefix.extend_from_slice(facet_value); let mut prefix_iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(txn, &prefix.as_slice())?; + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?; if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; Ok(( @@ -115,10 +129,10 @@ impl FacetsUpdateIncrementalInner { value, )) } else { - let key = FacetGroupKey { field_id, level, left_bound: search_key }; + let key = FacetGroupKey { field_id, level, left_bound: facet_value }; match self.db.get_lower_than(txn, &key)? { Some((key, value)) => { - if key.level != level || key.field_id != field_id { + if key.level != level { let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); @@ -126,7 +140,7 @@ impl FacetsUpdateIncrementalInner { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>( + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( txn, &prefix.as_slice(), )?; @@ -146,15 +160,19 @@ impl FacetsUpdateIncrementalInner { } } + /// Insert the given facet value and corresponding document ids in the level 0 of the database + /// + /// ## Return + /// See documentation of `insert_in_level` fn insert_in_level_0<'t>( &self, txn: &'t mut RwTxn, field_id: u16, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: new_key }; - let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 }; + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; + let value = FacetGroupValue { bitmap: docids.clone(), size: 1 }; let mut level0_prefix = vec![]; level0_prefix.extend_from_slice(&field_id.to_be_bytes()); @@ -163,7 +181,7 @@ impl FacetsUpdateIncrementalInner { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level0_prefix)?; + .prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, &level0_prefix)?; if iter.next().is_none() { drop(iter); @@ -186,143 +204,158 @@ impl FacetsUpdateIncrementalInner { } } } + + /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`. + /// This function works recursively. + /// + /// ## Return + /// Returns the effect of adding the facet value to the database on the given `level`. + /// + /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have + /// an effect on the number of keys in that level. Therefore, it did not increase the number of children + /// of the parent node. + /// + /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted + /// in the addition of a new key in that level, and that therefore the number of children + /// of the parent node should be incremented. 
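Read as a contract, only an `Insert` from the level below changes the parent's bookkeeping. A minimal sketch of that rule follows (illustrative re-statement with simplified types; the real code also merges bitmaps and may split nodes):

```rust
// How a parent node reacts to the result of inserting into the level below.
enum InsertionResult {
    InPlace,
    Insert,
}

fn updated_child_count(parent_children: u8, below: &InsertionResult) -> u8 {
    match below {
        InsertionResult::InPlace => parent_children, // no new key appeared below
        InsertionResult::Insert => parent_children + 1, // one more child to track
    }
}
```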
fn insert_in_level<'t>( &self, txn: &'t mut RwTxn, field_id: u16, level: u8, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result { if level == 0 { - return self.insert_in_level_0(txn, field_id, new_key, new_values); + return self.insert_in_level_0(txn, field_id, facet_value, docids); } let max_group_size = self.max_group_size; - let (insertion_key, insertion_value) = - self.find_insertion_key_value(field_id, level, new_key, txn)?; - - let result = self.insert_in_level(txn, field_id, level - 1, new_key.clone(), new_values)?; + let result = self.insert_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; // level below inserted an element - let insertion_key = { - let mut new_insertion_key = insertion_key.clone(); - let mut modified = false; - - if new_key < insertion_key.left_bound.as_slice() { - new_insertion_key.left_bound = new_key.to_vec(); - modified = true; - } - if modified { - let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; - assert!(is_deleted); - self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; - } - new_insertion_key - }; + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; match result { - // TODO: this could go above the block recomputing insertion key - // because we know that if we inserted in place, the key is not a new one - // thus it doesn't extend a group + // because we know that we inserted in place, the facet_value is not a new one + // thus it doesn't extend a group, and thus the insertion key computed above is + // still correct InsertionResult::InPlace => { - let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); - updated_value.bitmap |= new_values; + let mut updated_value = insertion_value; + updated_value.bitmap |= docids; self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; return Ok(InsertionResult::InPlace); } InsertionResult::Insert => {} } - let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + + // Here we know that inserting the facet value in the level below resulted in the creation + // of a new key. Therefore, it may be the case that we need to modify the left bound of the + // insertion key (see documentation of `find_insertion_key_value` for an example of when that + // could happen). + let insertion_key = { + let mut new_insertion_key = insertion_key.clone(); + let mut key_should_be_modified = false; + + if facet_value < insertion_key.left_bound.as_slice() { + new_insertion_key.left_bound = facet_value.to_vec(); + key_should_be_modified = true; + } + if key_should_be_modified { + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; + } + new_insertion_key + }; + // Now we know that the insertion key contains the `facet_value`. + + // We still need to update the insertion value by: + // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`) + // 2. 
Merge the previous docids with the new one + let mut updated_value = insertion_value; updated_value.size += 1; - if updated_value.size == max_group_size { - let size_left = max_group_size / 2; - let size_right = max_group_size - size_left; - let level_below = level - 1; + if updated_value.size < max_group_size { + updated_value.bitmap |= docids; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - let (start_key, _) = self - .db - .get_greater_than_or_equal_to( - &txn, - &FacetGroupKey { - field_id, - level: level_below, - left_bound: insertion_key.left_bound.as_slice(), - }, - )? - .unwrap(); - - let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); - - let group_left = { - let mut values_left = RoaringBitmap::new(); - - let mut i = 0; - while let Some(next) = iter.next() { - let (_key, value) = next?; - i += 1; - values_left |= &value.bitmap; - if i == size_left { - break; - } - } - - let key = - FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; - let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; - (key, value) - }; - - let group_right = { - let mut values_right = RoaringBitmap::new(); - let mut right_start_key = None; - - while let Some(next) = iter.next() { - let (key, value) = next?; - if right_start_key.is_none() { - right_start_key = Some(key.left_bound); - } - values_right |= &value.bitmap; - } - - let key = FacetGroupKey { - field_id, - level, - left_bound: right_start_key.unwrap().to_vec(), - }; - let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; - (key, value) - }; - drop(iter); - - let _ = self.db.delete(txn, &insertion_key.as_ref())?; - - self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; - self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; - - Ok(InsertionResult::Insert) - } else { - let mut value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); - value.bitmap |= new_values; - value.size += 1; - self.db.put(txn, &insertion_key.as_ref(), &value).unwrap(); - - Ok(InsertionResult::InPlace) + return Ok(InsertionResult::InPlace); } + + // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` + // Therefore it must be split into two nodes. + + let size_left = max_group_size / 2; + let size_right = max_group_size - size_left; + + let level_below = level - 1; + + let start_key = FacetGroupKey { + field_id, + level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }; + + let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); + + let group_left = { + let mut values_left = RoaringBitmap::new(); + + let mut i = 0; + while let Some(next) = iter.next() { + let (_key, value) = next?; + i += 1; + values_left |= &value.bitmap; + if i == size_left { + break; + } + } + + let key = + FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; + (key, value) + }; + + let group_right = { + let ( + FacetGroupKey { left_bound: right_left_bound, .. }, + FacetGroupValue { bitmap: mut values_right, .. 
}, + ) = iter.next().unwrap()?; + + while let Some(next) = iter.next() { + let (_, value) = next?; + values_right |= &value.bitmap; + } + + let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() }; + let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; + (key, value) + }; + drop(iter); + + let _ = self.db.delete(txn, &insertion_key.as_ref())?; + + self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; + self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; + + Ok(InsertionResult::Insert) } + /// Insert the given facet value and corresponding document ids in the database. pub fn insert<'a, 't>( &self, txn: &'t mut RwTxn, field_id: u16, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result<()> { - if new_values.is_empty() { + if docids.is_empty() { return Ok(()); } let group_size = self.group_size; @@ -330,12 +363,15 @@ impl FacetsUpdateIncrementalInner { let highest_level = get_highest_level(&txn, self.db, field_id)?; let result = - self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; + self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; match result { InsertionResult::InPlace => return Ok(()), InsertionResult::Insert => {} } + // Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`. + // If it has, we must build an additional level above it. + let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -384,36 +420,61 @@ Ok(()) } + /// Delete the given document id from the given facet value in the database, from level 0 to + /// the given level. + /// + /// ## Return + /// Returns the effect of removing the document id from the database on the given `level`. + /// + /// - `DeletionResult::InPlace` means that deleting the document id did not have + /// an effect on the keys in that level. + /// + /// - `DeletionResult::Remove` means that deleting the document id resulted in a change in the + /// number of keys in the level. For example, removing a document id from the facet value `3` could + /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted + /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must + /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. + /// + /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the + /// bounds of the keys of the level. For example, removing a document id from the facet value + /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, + /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). + /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust + /// its left bound as well.
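A minimal sketch of how a parent would consume these three outcomes (an illustrative re-statement with simplified types; the real code also rewrites bounds and bitmaps):

```rust
// `Remove` shrinks the parent's child count, `Reduce` only signals that a
// left bound moved (to `next`), and `InPlace` requires no structural change.
enum DeletionResult {
    InPlace,
    Reduce { next: Option<Vec<u8>> },
    Remove { next: Option<Vec<u8>> },
}

fn child_count_after(parent_children: u8, below: &DeletionResult) -> u8 {
    match below {
        // A key vanished below; saturating_sub stands in for the real code's
        // "delete the parent too when it reaches 0" handling.
        DeletionResult::Remove { .. } => parent_children.saturating_sub(1),
        DeletionResult::Reduce { .. } | DeletionResult::InPlace => parent_children,
    }
}
```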
fn delete_in_level<'t>( &self, txn: &'t mut RwTxn, field_id: u16, level: u8, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result { if level == 0 { - return self.delete_in_level_0(txn, field_id, key, value); + return self.delete_in_level_0(txn, field_id, facet_value, docid); } let (deletion_key, mut bitmap) = - self.find_insertion_key_value(field_id, level, key, txn)?; + self.find_insertion_key_value(field_id, level, facet_value, txn)?; - let result = self.delete_in_level(txn, field_id, level - 1, key.clone(), value)?; + let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docid)?; let mut decrease_size = false; - let (prev_key, next_key) = match result { + let next_key = match result { DeletionResult::InPlace => { - bitmap.bitmap.remove(value); + bitmap.bitmap.remove(docid); self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; return Ok(DeletionResult::InPlace); } - DeletionResult::Reduce { prev, next } => (prev, next), - DeletionResult::Remove { prev, next } => { + DeletionResult::Reduce { next } => next, + DeletionResult::Remove { next } => { decrease_size = true; - (prev, next) + next } }; + // If either DeletionResult::Reduce or DeletionResult::Remove was returned, + // then we may need to adjust the left_bound of the deletion key. + // If DeletionResult::Remove was returned, then we need to decrease the group + // size of the deletion key. let mut updated_value = bitmap; if decrease_size { updated_value.size -= 1; @@ -421,17 +482,21 @@ impl FacetsUpdateIncrementalInner { if updated_value.size == 0 { self.db.delete(txn, &deletion_key.as_ref())?; - Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + Ok(DeletionResult::Remove { next: next_key }) } else { let mut updated_deletion_key = deletion_key.clone(); - if key == deletion_key.left_bound { + let reduced_range = facet_value == deletion_key.left_bound; + if reduced_range { updated_deletion_key.left_bound = next_key.clone().unwrap(); } - updated_value.bitmap.remove(value); + updated_value.bitmap.remove(docid); let _ = self.db.delete(txn, &deletion_key.as_ref())?; self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; - - Ok(DeletionResult::Reduce { prev: prev_key, next: next_key }) + if reduced_range { + Ok(DeletionResult::Reduce { next: next_key }) + } else { + Ok(DeletionResult::InPlace) + } } } @@ -439,27 +504,24 @@ impl FacetsUpdateIncrementalInner { &self, txn: &'t mut RwTxn, field_id: u16, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: key }; + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; - bitmap.remove(value); + bitmap.remove(docid); if bitmap.is_empty() { - let mut prev_key = None; let mut next_key = None; - - if let Some(prev) = self.db.get_lower_than(&txn, &key)? { - prev_key = Some(prev.0.left_bound.to_vec()); - } - if let Some(next) = self.db.get_greater_than(&txn, &key)? { - if next.0.level == 0 { - next_key = Some(next.0.left_bound.to_vec()); + if let Some((next, _)) = + self.db.remap_data_type::().get_greater_than(&txn, &key)? 
+ { + if next.field_id == field_id && next.level == 0 { + next_key = Some(next.left_bound.to_vec()); } } self.db.delete(txn, &key)?; - Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + Ok(DeletionResult::Remove { next: next_key }) } else { self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; Ok(DeletionResult::InPlace) @@ -470,22 +532,30 @@ &self, txn: &'t mut RwTxn, field_id: u16, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result<()> { - if self.db.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: key })?.is_none() { + if self + .db + .remap_data_type::() + .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })? + .is_none() + { return Ok(()); } let highest_level = get_highest_level(&txn, self.db, field_id)?; - // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - - let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?; + let result = + self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docid)?; match result { DeletionResult::InPlace => return Ok(()), - DeletionResult::Reduce { .. } => {} + DeletionResult::Reduce { .. } => return Ok(()), DeletionResult::Remove { .. } => {} } + + // if we removed a key from the highest level, its size may have fallen + // below `min_level_size`, in which case we need to remove the entire level + let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -521,6 +591,26 @@ } } +impl<'a> FacetGroupKey<&'a [u8]> { + pub fn into_owned(self) -> FacetGroupKey> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetGroupKey> { + pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + #[cfg(test)] mod tests { use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index caf88671e..ea6468538 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,3 +1,79 @@ +/*! +This module implements two different algorithms for updating the `facet_id_string_docids` +and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that +it recreates the database from scratch when new elements are added to it. The second algorithm +is incremental: it modifies the database as little as possible. + +The databases must be able to return results for queries such as: +1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y +2. Min/Max : find the minimum/maximum facet value among these document ids +3. Sort : sort these document ids by increasing/decreasing facet values +4. Distribution : given some document ids, make a list of each facet value + found in these documents along with the number of documents that contain it + +The algorithms that implement these queries are found in the `src/search/facet` folder.
+ +To make these queries fast to compute, the database adopts a tree structure: +```ignore + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │ +│Level 1│ │ │ │ │ │ │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ +│Level 0│ │ │ │ │ │ │ │ │ │ │ │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ +``` +In the diagram above, each cell corresponds to a node in the tree. The first line of the cell +contains the left bound of the range of facet values as well as the number of children of the node. +The second line contains the document ids which have a facet value within the range of the node. +The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range. + +In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because +`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`. +These documents all contain a facet value that is contained within `ab .. gaf`. + +In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a +[`FacetGroupValue`], which have the following format: + +```ignore +FacetGroupKey: +- field id : u16 +- level : u8 +- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str + +FacetGroupValue: +- #children : u8 +- docids : RoaringBitmap +``` + +When the database is first created using the "bulk" method, each node has a fixed number of children +(except for possibly the last one) given by the `group_size` parameter (defaulting to `FACET_GROUP_SIZE`). +The tree is also built such that the highest level has more than `min_level_size` +(defaulting to `FACET_MIN_LEVEL_SIZE`) elements in it. + +When the database is incrementally updated, the number of children of a node can vary between +1 and `max_group_size`. This is done so that most incremental operations do not need to change +the structure of the tree. When the number of children of a node reaches `max_group_size`, +we split the node in two and update the number of children of its parent. + +When adding documents to the databases, it is important to determine which method to use to +minimise indexing time. The incremental method is faster when adding few new facet values, but the +bulk method is faster when a large part of the database is modified. Empirically, it seems that +it takes 50x more time to incrementally add N facet values to an existing database than to +construct a database of N facet values. This is the heuristic that is used to choose between the +two methods.
+*/ + +pub const FACET_MAX_GROUP_SIZE: u8 = 8; +pub const FACET_GROUP_SIZE: u8 = 4; +pub const FACET_MIN_LEVEL_SIZE: u8 = 5; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; @@ -13,8 +89,8 @@ pub struct FacetsUpdate<'i> { database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, new_data: grenad::Reader, - level_group_size: u8, - max_level_group_size: u8, + group_size: u8, + max_group_size: u8, min_level_size: u8, } impl<'i> FacetsUpdate<'i> { @@ -30,57 +106,24 @@ impl<'i> FacetsUpdate<'i> { Self { index, database, - level_group_size: 4, - max_level_group_size: 8, - min_level_size: 5, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, new_data, } } - // TODO: use the options below? - // but I don't actually see why they should be configurable - // /// The minimum number of elements that a level is allowed to have. - // pub fn level_max_group_size(mut self, value: u8) -> Self { - // self.max_level_group_size = std::cmp::max(value, 4); - // self - // } - - // /// The number of elements from the level below that are represented by a single element in the level above - // /// - // /// This setting is always greater than or equal to 2. - // pub fn level_group_size(mut self, value: u8) -> Self { - // self.level_group_size = std::cmp::max(value, 2); - // self - // } - - // /// The minimum number of elements that a level is allowed to have. - // pub fn min_level_size(mut self, value: u8) -> Self { - // self.min_level_size = std::cmp::max(value, 2); - // self - // } - pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { if self.new_data.is_empty() { return Ok(()); } - // here, come up with a better condition! - // ideally we'd choose which method to use for each field id individually - // but I dont' think it's worth the effort yet - // As a first requirement, we ask that the length of the new data is less - // than a 1/50th of the length of the database in order to use the incremental - // method. if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { - let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data) - .level_group_size(self.level_group_size) - .min_level_size(self.min_level_size); + let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size); bulk_update.execute(wtxn)?; } else { let incremental_update = - FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data) - .group_size(self.level_group_size) - .max_group_size(self.max_level_group_size) - .min_level_size(self.min_level_size); + FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size, self.max_group_size); incremental_update.execute(wtxn)?; } Ok(()) @@ -346,7 +389,7 @@ mod comparison_bench { // of the incremental vs. bulk indexer. // It appears that the incremental indexer is about 50 times slower than the // bulk indexer. 
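Stripped of the surrounding types, the decision made in `execute` above is just the comparison below; the constant 50 matches the empirical ratio quoted in the module documentation (a sketch, with a hypothetical helper name):

```rust
// Bulk rebuild when the new data is at least 1/50th of the database size,
// incremental insertion otherwise (mirrors the condition in `execute`).
fn should_use_bulk(new_data_len: u64, database_len: u64) -> bool {
    new_data_len >= database_len / 50
}
```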
- #[test] + // #[test] fn benchmark_facet_indexing() { // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 96bea9589..7b02fd1af 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -7,7 +7,7 @@ mod typed_chunk; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; use std::iter::FromIterator; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroU32; use std::result::Result as StdResult; use crossbeam_channel::{Receiver, Sender}; @@ -82,8 +82,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, F> { #[derive(Default, Debug, Clone)] pub struct IndexDocumentsConfig { - pub facet_level_group_size: Option, - pub facet_min_level_size: Option, pub words_prefix_threshold: Option, pub max_prefix_length: Option, pub words_positions_level_group_size: Option, From fca4577e233d943990757c7b4e1408f8bec7840f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 17:56:38 +0200 Subject: [PATCH 1722/1889] Return original string in facet distributions, work on facet tests --- milli/src/search/facet/facet_distribution.rs | 249 +++++++++++++++++- .../search/facet/facet_distribution_iter.rs | 104 +++----- milli/src/search/facet/facet_range_search.rs | 72 ++--- .../src/search/facet/facet_sort_ascending.rs | 41 +-- .../src/search/facet/facet_sort_descending.rs | 42 +-- milli/src/search/facet/filter.rs | 6 +- milli/src/search/facet/mod.rs | 37 +++ .../random_looking_index_snap.hash.snap | 4 - .../random_looking_index_snap.hash.snap | 4 - .../random_looking_index_snap.hash.snap | 4 - 10 files changed, 350 insertions(+), 213 deletions(-) delete mode 100644 milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap delete mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap delete mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7c554d368..7eb438a03 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -140,13 +140,13 @@ impl<'a> FacetDistribution<'a> { self.index.facet_id_f64_docids.remap_key_type::>(), field_id, candidates, - |facet_key, nbr_docids| { + |facet_key, nbr_docids, _| { let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap(); distribution.insert(facet_key.to_string(), nbr_docids); if distribution.len() == self.max_values_per_facet { - ControlFlow::Break(()) + Ok(ControlFlow::Break(())) } else { - ControlFlow::Continue(()) + Ok(ControlFlow::Continue(())) } }, ) @@ -163,13 +163,22 @@ impl<'a> FacetDistribution<'a> { self.index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, - |facet_key, nbr_docids| { + |facet_key, nbr_docids, any_docid| { let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); - distribution.insert(facet_key.to_string(), nbr_docids); + + let key: (FieldId, _, &str) = (field_id, any_docid, facet_key); + let original_string = self + .index + .field_id_docid_facet_strings + .get(self.rtxn, &key)? 
+ .unwrap() + .to_owned(); + + distribution.insert(original_string, nbr_docids); if distribution.len() == self.max_values_per_facet { - ControlFlow::Break(()) + Ok(ControlFlow::Break(())) } else { - ControlFlow::Continue(()) + Ok(ControlFlow::Continue(())) } }, ) @@ -186,7 +195,8 @@ impl<'a> FacetDistribution<'a> { let db = self.index.facet_id_f64_docids; let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); - prefix.push(0); + prefix.push(0); // read values from level 0 only + let iter = db .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? @@ -207,10 +217,15 @@ impl<'a> FacetDistribution<'a> { .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? .remap_types::, FacetGroupValueCodec>(); - // TODO: get the original value of the facet somewhere (in the documents DB?) for result in iter { let (key, value) = result?; - distribution.insert(key.left_bound.to_owned(), value.bitmap.len()); + + let docid = value.bitmap.iter().next().unwrap(); + let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound); + let original_string = + self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned(); + + distribution.insert(original_string, value.bitmap.len()); if distribution.len() == self.max_values_per_facet { break; } @@ -304,3 +319,217 @@ impl fmt::Debug for FacetDistribution<'_> { .finish() } } + +#[cfg(test)] +mod tests { + use big_s::S; + use maplit::hashset; + + use crate::{ + documents::documents_batch_reader_from_objects, index::tests::TempIndex, milli_snap, + FacetDistribution, + }; + + #[test] + fn few_candidates_few_facet_values() { + // All the tests here avoid using the code in `facet_distribution_iter` because there aren't + // enough candidates. + + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .unwrap(); + + let documents = documents!([ + { "colour": "Blue" }, + { "colour": " blue" }, + { "colour": "RED" } + ]); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([0, 1, 2].iter().copied().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([1, 2].iter().copied().collect()) + .execute() + .unwrap(); + + // I think it would be fine if " blue" was "Blue" instead. 
+ // We just need to get any non-normalised string I think, even if it's not in + // the candidates + milli_snap!(format!("{map:?}"), @r###"{"colour": {" blue": 1, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([2].iter().copied().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([0, 1, 2].iter().copied().collect()) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + } + + #[test] + fn many_candidates_few_facet_values() { + let mut index = TempIndex::new_with_map_size(4096 * 10_000); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .unwrap(); + + let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"]; + + let mut documents = vec![]; + for i in 0..10_000 { + let document = serde_json::json!({ + "colour": facet_values[i % 5], + }) + .as_object() + .unwrap() + .clone(); + documents.push(document); + } + + let documents = documents_batch_reader_from_objects(documents); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..10_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000}}"###); + } + + #[test] + fn many_candidates_many_facet_values() { + let mut index = TempIndex::new_with_map_size(4096 * 10_000); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! 
{ S("colour") })) + .unwrap(); + + let facet_values = (0..1000).into_iter().map(|x| format!("{x:x}")).collect::>(); + + let mut documents = vec![]; + for i in 0..10_000 { + let document = serde_json::json!({ + "colour": facet_values[i % 1000], + }) + .as_object() + .unwrap() + .clone(); + documents.push(document); + } + + let documents = documents_batch_reader_from_objects(documents); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "no_candidates", @"ac9229ed5964d893af96a7076e2f8af5"); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .max_values_per_facet(2) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "no_candidates_with_max_2", @r###"{"colour": {"0": 10, "1": 10}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..10_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "candidates_0_10_000", @"ac9229ed5964d893af96a7076e2f8af5"); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "candidates_0_5_000", @"825f23a4090d05756f46176987b7d992"); + } +} diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 3379d1abe..ad330b8db 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -4,8 +4,9 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, +use crate::{ + heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, + DocumentId, }; pub fn iterate_over_facet_distribution<'t, CB>( @@ -16,7 +17,7 @@ pub fn iterate_over_facet_distribution<'t, CB>( callback: CB, ) -> Result<()> where - CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; let highest_level = @@ -32,7 +33,7 @@ where struct FacetDistribution<'t, CB> where - CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, @@ -42,7 +43,7 @@ where impl<'t, CB> FacetDistribution<'t, CB> where - CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { fn iterate_level_0( &mut self, @@ -62,7 +63,8 @@ where } let docids_in_common = value.bitmap.intersection_len(candidates); if docids_in_common > 0 { - match (self.callback)(key.left_bound, docids_in_common) { + let any_docid = value.bitmap.iter().next().unwrap(); + match (self.callback)(key.left_bound, docids_in_common, any_docid)? 
{ ControlFlow::Continue(_) => {} ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } @@ -112,50 +114,14 @@ where #[cfg(test)] mod tests { + use super::iterate_over_facet_distribution; + use crate::milli_snap; + use crate::search::facet::tests::get_random_looking_index; + use crate::{heed_codec::facet::OrderedF64Codec, search::facet::tests::get_simple_index}; + use heed::BytesDecode; + use roaring::RoaringBitmap; use std::ops::ControlFlow; - use super::iterate_over_facet_distribution; - use crate::heed_codec::facet::OrderedF64Codec; - use crate::milli_snap; - use crate::update::facet::tests::FacetIndex; - use heed::BytesDecode; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - - fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - for i in 0..256u16 { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(i as u32); - index.insert(&mut txn, 0, &(i as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - - for (_i, key) in keys.into_iter().enumerate() { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - - #[test] - fn random_looking_index_snap() { - let index = get_random_looking_index(); - milli_snap!(format!("{index}")); - } #[test] fn filter_distribution_all() { let indexes = [get_simple_index(), get_random_looking_index()]; @@ -163,11 +129,17 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); - iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - }) + iterate_over_facet_distribution( + &txn, + index.content, + 0, + &candidates, + |facet, count, _| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + Ok(ControlFlow::Continue(())) + }, + ) .unwrap(); milli_snap!(results, i); @@ -182,17 +154,23 @@ mod tests { let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); let mut nbr_facets = 0; - iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - if nbr_facets == 100 { - return ControlFlow::Break(()); - } else { - nbr_facets += 1; - results.push_str(&format!("{facet}: {count}\n")); + iterate_over_facet_distribution( + &txn, + index.content, + 0, + &candidates, + |facet, count, _| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return Ok(ControlFlow::Break(())); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - } - }) + Ok(ControlFlow::Continue(())) + } + }, + ) .unwrap(); milli_snap!(results, i); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index cb5fd14d2..c99ac8e92 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ 
b/milli/src/search/facet/facet_range_search.rs @@ -15,7 +15,8 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, -) -> Result + docids: &mut RoaringBitmap, +) -> Result<()> where BoundCodec: for<'a> BytesEncode<'a>, for<'a> >::EItem: Sized, @@ -45,16 +46,15 @@ where Bound::Unbounded => Bound::Unbounded, }; let db = db.remap_key_type::>(); - let mut docids = RoaringBitmap::new(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; - Ok(docids) + Ok(()) } else { - return Ok(RoaringBitmap::new()); + return Ok(()); } } @@ -255,45 +255,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { #[cfg(test)] mod tests { - use std::ops::Bound; - - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - use super::find_docids_of_facet_within_bounds; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; use crate::snapshot_tests::display_bitmap; - use crate::update::facet::tests::FacetIndex; - - fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - for i in 0..256u16 { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(i as u32); - index.insert(&mut txn, 0, &(i as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - - for (_i, key) in keys.into_iter().enumerate() { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as f64), &bitmap); - } - txn.commit().unwrap(); - index - } + use roaring::RoaringBitmap; + use std::ops::Bound; #[test] fn random_looking_index_snap() { @@ -310,12 +278,14 @@ mod tests { let i = i as f64; let start = Bound::Included(0.); let end = Bound::Included(i); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -326,12 +296,14 @@ mod tests { let i = i as f64; let start = Bound::Excluded(0.); let end = Bound::Excluded(i); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -352,12 +324,14 @@ mod tests { let i = i as f64; let start = Bound::Included(i); let end = Bound::Included(255.); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) 
.unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -371,12 +345,14 @@ mod tests { let i = i as f64; let start = Bound::Excluded(i); let end = Bound::Excluded(255.); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -399,12 +375,14 @@ mod tests { let i = i as f64; let start = Bound::Included(i); let end = Bound::Included(255. - i); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); @@ -418,12 +396,14 @@ mod tests { let i = i as f64; let start = Bound::Excluded(i); let end = Bound::Excluded(255. - i); - let docids = find_docids_of_facet_within_bounds::( + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( &txn, index.content.remap_key_type::>(), 0, &start, &end, + &mut docids, ) .unwrap(); results.push_str(&format!("{}\n", display_bitmap(&docids))); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index f320f9e77..33ca7d1ce 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -83,49 +83,12 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { #[cfg(test)] mod tests { - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; use crate::snapshot_tests::display_bitmap; - use crate::update::facet::tests::FacetIndex; + use roaring::RoaringBitmap; - fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - for i in 0..256u16 { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(i as u32); - index.insert(&mut txn, 0, &(i as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - - for (_i, key) in keys.into_iter().enumerate() { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - - #[test] - fn random_looking_index_snap() { - let index = get_random_looking_index(); - milli_snap!(format!("{index}")); - } #[test] fn filter_sort() { let indexes = [get_simple_index(), get_random_looking_index()]; diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index be5fe7841..69f286886 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -116,49 +116,13 @@ impl<'t> Iterator for DescendingFacetSort<'t> { #[cfg(test)] mod tests { - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::{ByteSliceRef, 
FacetGroupKeyCodec, OrderedF64Codec}; + use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; use crate::snapshot_tests::display_bitmap; - use crate::update::facet::tests::FacetIndex; + use roaring::RoaringBitmap; - fn get_simple_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - for i in 0..256u16 { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(i as u32); - index.insert(&mut txn, 0, &(i as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - fn get_random_looking_index() -> FacetIndex { - let index = FacetIndex::::new(4, 8, 5); - let mut txn = index.env.write_txn().unwrap(); - - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - - for (_i, key) in keys.into_iter().enumerate() { - let mut bitmap = RoaringBitmap::new(); - bitmap.insert(key); - bitmap.insert(key + 100); - index.insert(&mut txn, 0, &(key as f64), &bitmap); - } - txn.commit().unwrap(); - index - } - - #[test] - fn random_looking_index_snap() { - let index = get_random_looking_index(); - milli_snap!(format!("{index}")); - } #[test] fn filter_sort_descending() { let indexes = [get_simple_index(), get_random_looking_index()]; diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 15edafb03..4263eea7b 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -278,11 +278,9 @@ impl<'a> Filter<'a> { (Excluded(l), Included(r)) if l >= r => return Ok(()), (_, _) => (), } - let x = facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, db, field_id, &left, &right, + facet_range_search::find_docids_of_facet_within_bounds::( + rtxn, db, field_id, &left, &right, output, )?; - // TODO: the facet range search should take a mutable roaring bitmap as argument - *output = x; Ok(()) } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index fc71acf37..415c2b51a 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -74,3 +74,40 @@ pub(crate) fn get_highest_level<'t>( }) .unwrap_or(0)) } + +#[cfg(test)] +pub(crate) mod tests { + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + + use crate::{heed_codec::facet::OrderedF64Codec, update::facet::tests::FacetIndex}; + + pub fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + txn.commit().unwrap(); + index + } +} diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap 
b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap deleted file mode 100644 index 661e1a35b..000000000 --- a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/search/facet/facet_distribution_iter.rs ---- -3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap deleted file mode 100644 index 64ff762db..000000000 --- a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/search/facet/facet_sort_ascending.rs ---- -3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap deleted file mode 100644 index 0649e3c5d..000000000 --- a/milli/src/search/facet/snapshots/facet_sort_descending.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/search/facet/facet_sort_descending.rs ---- -3256c76a7c1b768a013e78d5fa6e9ff9 From 3d7ed3263f3cfb4eed14b446de62f04dc9ef6efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 18:00:00 +0200 Subject: [PATCH 1723/1889] Fix bug in string facet distribution with few candidates --- milli/src/search/facet/facet_distribution.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7eb438a03..0eaeec399 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -96,7 +96,7 @@ impl<'a> FacetDistribution<'a> { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); let db = self.index.field_id_docid_facet_strings; - for docid in candidates.into_iter() { + 'outer: for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&docid.to_be_bytes()); let iter = db @@ -112,7 +112,7 @@ impl<'a> FacetDistribution<'a> { *count += 1; if normalized_distribution.len() == self.max_values_per_facet { - break; + break 'outer; } } } @@ -393,7 +393,7 @@ mod tests { .execute() .unwrap(); - milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 1}}"###); } #[test] From b1ab09196cdd97549a5e960fb33fd2f2018244d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 18:03:52 +0200 Subject: [PATCH 1724/1889] Remove outdated TODOs --- milli/src/update/facet/bulk.rs | 2 -- .../index_documents/extract/extract_facet_string_docids.rs | 2 +- milli/src/update/index_documents/extract/mod.rs | 2 +- milli/src/update/index_documents/typed_chunk.rs | 2 -- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 83fa51003..3a371995e 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -177,8 
         field_id: FieldId,
         txn: &RoTxn,
     ) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
-        // TODO: first check whether there is anything in level 0?
-
         let mut all_docids = RoaringBitmap::new();
         let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
             for bitmap in bitmaps {
diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index e6a41067b..a7b027ce3 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -21,7 +21,7 @@ pub fn extract_facet_string_docids(
 
     let mut facet_string_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
-        merge_cbo_roaring_bitmaps, // TODO: check that it is correct
+        merge_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 208dfc74d..5f557d812 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents(
         indexer,
         lmdb_writer_sx.clone(),
         extract_facet_string_docids,
-        merge_cbo_roaring_bitmaps, // TODO: check (cbo?)
+        merge_cbo_roaring_bitmaps,
         TypedChunk::FieldIdFacetStringDocids,
         "field-id-facet-string-docids",
     );
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index f11414f20..16784bd92 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -138,13 +138,11 @@ pub(crate) fn write_typed_chunk_into_index(
             is_merged_database = true;
         }
         TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
-            // TODO indexer options for the facet level database
             let indexer =
                 FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
             indexer.execute(wtxn)?;
             is_merged_database = true;
         }
         TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => {
-            // TODO indexer options for the facet level database
             let indexer =
                 FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter);
             indexer.execute(wtxn)?;
             is_merged_database = true;

From 985a94adfc6fbe4f333f39572b6b7e6f1f1a46b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Wed, 7 Sep 2022 18:04:07 +0200
Subject: [PATCH 1725/1889] cargo fmt
---
 milli/src/search/facet/facet_distribution.rs    |  7 ++-
 .../search/facet/facet_distribution_iter.rs     | 18 ++++----
 milli/src/search/facet/facet_range_search.rs    |  6 ++-
 .../src/search/facet/facet_sort_ascending.rs    |  3 +-
 .../src/search/facet/facet_sort_descending.rs   |  3 +-
 milli/src/search/facet/mod.rs                   |  3 +-
 milli/src/update/facet/bulk.rs                  | 27 +++++++-----
 milli/src/update/facet/incremental.rs           | 19 ++++----
 milli/src/update/facet/mod.rs                   | 44 ++++++++++++-------
 9 files changed, 78 insertions(+), 52 deletions(-)

diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs
index 0eaeec399..2e2e448c2 100644
--- a/milli/src/search/facet/facet_distribution.rs
+++ b/milli/src/search/facet/facet_distribution.rs
@@ -325,10 +325,9 @@ mod tests {
     use big_s::S;
     use maplit::hashset;
 
-    use crate::{
-        documents::documents_batch_reader_from_objects, index::tests::TempIndex, milli_snap,
-        FacetDistribution,
-    };
+    use crate::documents::documents_batch_reader_from_objects;
+    use crate::index::tests::TempIndex;
+    use crate::{milli_snap, FacetDistribution};
 
     #[test]
     fn few_candidates_few_facet_values() {
diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs
index ad330b8db..01266187a 100644
--- a/milli/src/search/facet/facet_distribution_iter.rs
+++ b/milli/src/search/facet/facet_distribution_iter.rs
@@ -4,10 +4,10 @@ use heed::Result;
 use roaring::RoaringBitmap;
 
 use super::{get_first_facet_value, get_highest_level};
-use crate::{
-    heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec},
-    DocumentId,
+use crate::heed_codec::facet::{
+    ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec,
 };
+use crate::DocumentId;
 
 pub fn iterate_over_facet_distribution<'t, CB>(
     rtxn: &'t heed::RoTxn<'t>,
     db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
@@ -114,13 +114,15 @@ where
 
 #[cfg(test)]
 mod tests {
-    use super::iterate_over_facet_distribution;
-    use crate::milli_snap;
-    use crate::search::facet::tests::get_random_looking_index;
-    use crate::{heed_codec::facet::OrderedF64Codec, search::facet::tests::get_simple_index};
+    use std::ops::ControlFlow;
+
     use heed::BytesDecode;
     use roaring::RoaringBitmap;
-    use std::ops::ControlFlow;
+
+    use super::iterate_over_facet_distribution;
+    use crate::heed_codec::facet::OrderedF64Codec;
+    use crate::milli_snap;
+    use crate::search::facet::tests::{get_random_looking_index, get_simple_index};
 
     #[test]
     fn filter_distribution_all() {
diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs
index c99ac8e92..8934873b7 100644
--- a/milli/src/search/facet/facet_range_search.rs
+++ b/milli/src/search/facet/facet_range_search.rs
@@ -255,13 +255,15 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
 
 #[cfg(test)]
 mod tests {
+    use std::ops::Bound;
+
+    use roaring::RoaringBitmap;
+
     use super::find_docids_of_facet_within_bounds;
     use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
     use crate::milli_snap;
     use crate::search::facet::tests::{get_random_looking_index, get_simple_index};
     use crate::snapshot_tests::display_bitmap;
-    use roaring::RoaringBitmap;
-    use std::ops::Bound;
 
     #[test]
     fn random_looking_index_snap() {
diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs
index 33ca7d1ce..6567fe95e 100644
--- a/milli/src/search/facet/facet_sort_ascending.rs
+++ b/milli/src/search/facet/facet_sort_ascending.rs
@@ -83,11 +83,12 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
 
 #[cfg(test)]
 mod tests {
+    use roaring::RoaringBitmap;
+
     use crate::milli_snap;
     use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
     use crate::search::facet::tests::{get_random_looking_index, get_simple_index};
     use crate::snapshot_tests::display_bitmap;
-    use roaring::RoaringBitmap;
 
     #[test]
     fn filter_sort() {
diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs
index 69f286886..2eab9fca1 100644
--- a/milli/src/search/facet/facet_sort_descending.rs
+++ b/milli/src/search/facet/facet_sort_descending.rs
@@ -116,12 +116,13 @@ impl<'t> Iterator for DescendingFacetSort<'t> {
 
 #[cfg(test)]
 mod tests {
+    use roaring::RoaringBitmap;
+
     use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec};
     use crate::milli_snap;
     use crate::search::facet::facet_sort_descending::descending_facet_sort;
     use crate::search::facet::tests::{get_random_looking_index, get_simple_index};
     use crate::snapshot_tests::display_bitmap;
-    use roaring::RoaringBitmap;
 
     #[test]
     fn filter_sort_descending() {
diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs
index 415c2b51a..18c3a652b 100644
--- a/milli/src/search/facet/mod.rs
+++ b/milli/src/search/facet/mod.rs
@@ -80,7 +80,8 @@ pub(crate) mod tests {
     use rand::{Rng, SeedableRng};
     use roaring::RoaringBitmap;
 
-    use crate::{heed_codec::facet::OrderedF64Codec, update::facet::tests::FacetIndex};
+    use crate::heed_codec::facet::OrderedF64Codec;
+    use crate::update::facet::tests::FacetIndex;
 
     pub fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
         let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs
index 3a371995e..e82af5d66 100644
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -1,19 +1,20 @@
+use std::borrow::Cow;
+use std::fs::File;
+
+use grenad::CompressionType;
+use heed::types::ByteSlice;
+use heed::{BytesEncode, Error, RoTxn, RwTxn};
+use log::debug;
+use roaring::RoaringBitmap;
+use time::OffsetDateTime;
+
+use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{
     ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 use crate::update::index_documents::{create_writer, writer_into_reader};
 use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-use heed::{BytesEncode, Error, RoTxn, RwTxn};
-use log::debug;
-use roaring::RoaringBitmap;
-use std::borrow::Cow;
-use std::fs::File;
-use time::OffsetDateTime;
-
-use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
 
 /// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
 /// by rebuilding the database "from scratch".
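A rough sketch of the level construction this doc comment refers to, under the simplifying assumption of in-memory vectors (`build_levels` is an invented helper name; the real implementation streams key/value pairs through grenad writers instead of building `Vec`s):

```rust
use roaring::RoaringBitmap;

// Group `group_size` children under one parent per level, bottom-up,
// stopping once the next level would hold fewer than `min_level_size`
// nodes. Each parent stores its first child's left bound and the union
// of its children's document ids.
fn build_levels(
    level0: Vec<(Vec<u8>, RoaringBitmap)>,
    group_size: usize,
    min_level_size: usize,
) -> Vec<Vec<(Vec<u8>, RoaringBitmap)>> {
    let mut levels = vec![level0];
    while levels.last().unwrap().len() / group_size >= min_level_size {
        let next: Vec<(Vec<u8>, RoaringBitmap)> = levels
            .last()
            .unwrap()
            .chunks(group_size)
            .map(|group| {
                let left_bound = group[0].0.clone();
                let mut docids = RoaringBitmap::new();
                for (_, bitmap) in group {
                    docids |= bitmap;
                }
                (left_bound, docids)
            })
            .collect();
        levels.push(next);
    }
    levels
}
```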
@@ -342,11 +343,13 @@ impl FacetsUpdateBulkInner {
 
 #[cfg(test)]
 mod tests {
+    use std::iter::once;
+
+    use roaring::RoaringBitmap;
+
     use crate::heed_codec::facet::OrderedF64Codec;
     use crate::milli_snap;
     use crate::update::facet::tests::FacetIndex;
-    use roaring::RoaringBitmap;
-    use std::iter::once;
 
     #[test]
     fn insert() {
diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs
index 6be2dbf03..a06c8e1c2 100644
--- a/milli/src/update/facet/incremental.rs
+++ b/milli/src/update/facet/incremental.rs
@@ -1,14 +1,16 @@
+use std::collections::HashMap;
+use std::fs::File;
+
+use heed::types::{ByteSlice, DecodeIgnore};
+use heed::{BytesDecode, Error, RoTxn, RwTxn};
+use roaring::RoaringBitmap;
+
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{
     ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 use crate::search::facet::get_highest_level;
 use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
-use heed::types::{ByteSlice, DecodeIgnore};
-use heed::{BytesDecode, Error, RoTxn, RwTxn};
-use roaring::RoaringBitmap;
-use std::collections::HashMap;
-use std::fs::File;
 
 enum InsertionResult {
     InPlace,
@@ -613,13 +615,14 @@ impl<'a> FacetGroupKey<&'a [u8]> {
 
 #[cfg(test)]
 mod tests {
-    use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec};
-    use crate::milli_snap;
-    use crate::update::facet::tests::FacetIndex;
     use rand::seq::SliceRandom;
     use rand::{Rng, SeedableRng};
     use roaring::RoaringBitmap;
 
+    use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec};
+    use crate::milli_snap;
+    use crate::update::facet::tests::FacetIndex;
+
     #[test]
     fn append() {
         let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index ea6468538..9263d3a6a 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -53,8 +53,8 @@ FacetGroupValue:
 ```
 
 When the database is first created using the "bulk" method, each node has a fixed number of children
-(except for possibly the last one) given by the `group_size` parameter (defaulting to `FACET_GROUP_SIZE`). 
-The tree is also built such that the highest level has more than `min_level_size` 
+(except for possibly the last one) given by the `group_size` parameter (defaulting to `FACET_GROUP_SIZE`).
+The tree is also built such that the highest level has more than `min_level_size`
 (defaulting to `FACET_MIN_LEVEL_SIZE`) elements in it.
 
 When the database is incrementally updated, the number of children of a node can vary between
@@ -66,7 +66,7 @@ When adding documents to the databases, it is important to determine which metho
 minimise indexing time. The incremental method is faster when adding few new facet values, but the
 bulk method is faster when a large part of the database is modified. Empirically, it seems that it
 takes 50x more time to incrementally add N facet values to an existing database than it is to
-construct a database of N facet values. This is the heuristic that is used to choose between the 
+construct a database of N facet values. This is the heuristic that is used to choose between the
 two methods.
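Schematically, the choice boils down to the following (an illustrative condensation of the dispatch performed in `FacetsUpdate::execute` below, not the literal code):

```ignore
// bulk rebuild is roughly 50x cheaper per element than incremental
// insertion, so rebuild once the new data reaches 1/50th of the database
if new_data.len() >= database.len(wtxn)? / 50 {
    FacetsUpdateBulk::new(index, facet_type, new_data, group_size, min_level_size)
        .execute(wtxn)?;
} else {
    FacetsUpdateIncremental::new(index, facet_type, new_data, group_size, min_level_size, max_group_size)
        .execute(wtxn)?;
}
```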
*/ @@ -74,12 +74,13 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8; pub const FACET_GROUP_SIZE: u8 = 4; pub const FACET_MIN_LEVEL_SIZE: u8 = 5; +use std::fs::File; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::{Index, Result}; -use std::fs::File; pub mod bulk; pub mod incremental; @@ -119,11 +120,23 @@ impl<'i> FacetsUpdate<'i> { return Ok(()); } if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { - let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size); + let bulk_update = FacetsUpdateBulk::new( + self.index, + self.facet_type, + self.new_data, + self.group_size, + self.min_level_size, + ); bulk_update.execute(wtxn)?; } else { - let incremental_update = - FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size, self.max_group_size); + let incremental_update = FacetsUpdateIncremental::new( + self.index, + self.facet_type, + self.new_data, + self.group_size, + self.min_level_size, + self.max_group_size, + ); incremental_update.execute(wtxn)?; } Ok(()) @@ -132,6 +145,14 @@ impl<'i> FacetsUpdate<'i> { #[cfg(test)] pub(crate) mod tests { + use std::fmt::Display; + use std::marker::PhantomData; + use std::rc::Rc; + + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; + use roaring::RoaringBitmap; + use super::bulk::FacetsUpdateBulkInner; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, @@ -140,12 +161,6 @@ pub(crate) mod tests { use crate::snapshot_tests::display_bitmap; use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; - use heed::types::ByteSlice; - use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; - use roaring::RoaringBitmap; - use std::fmt::Display; - use std::marker::PhantomData; - use std::rc::Rc; // A dummy index that only contains the facet database, used for testing pub struct FacetIndex @@ -381,9 +396,8 @@ mod comparison_bench { use rand::Rng; use roaring::RoaringBitmap; - use crate::heed_codec::facet::OrderedF64Codec; - use super::tests::FacetIndex; + use crate::heed_codec::facet::OrderedF64Codec; // This is a simple test to get an intuition on the relative speed // of the incremental vs. bulk indexer. 
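A minimal sketch of what such an intuition-gathering comparison could look like, reusing the `FacetIndex` test helper and its `insert` method shown above (`time_incremental_inserts` is an invented illustration, not the actual bench body):

```rust
use std::time::Instant;

use roaring::RoaringBitmap;

use crate::heed_codec::facet::OrderedF64Codec;
use crate::update::facet::tests::FacetIndex;

// Time `n` one-by-one incremental insertions into the dummy facet index;
// comparing this against a single bulk build of the same `n` values gives
// the ~50x ratio mentioned in the module documentation.
fn time_incremental_inserts(n: u32) {
    let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
    let mut txn = index.env.write_txn().unwrap();
    let start = Instant::now();
    for i in 0..n {
        let mut bitmap = RoaringBitmap::new();
        bitmap.insert(i);
        index.insert(&mut txn, 0, &(i as f64), &bitmap);
    }
    txn.commit().unwrap();
    println!("incremental insert of {n} values took {:?}", start.elapsed());
}
```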
From de52a9bf75e3fc9b2c8a7f86511daef356504711 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Thu, 8 Sep 2022 08:47:40 +0200
Subject: [PATCH 1726/1889] Improve documentation of some facet-related
 algorithms
---
 milli/src/search/criteria/asc_desc.rs           |  5 ++--
 .../search/facet/facet_distribution_iter.rs     | 19 ++++++++----
 .../src/search/facet/facet_sort_ascending.rs    | 22 ++++++++++++++++
 .../src/search/facet/facet_sort_descending.rs   |  3 +++
 milli/src/search/facet/mod.rs                   | 25 ++++++++++-------
 5 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs
index bb2788cc8..76dd3db29 100644
--- a/milli/src/search/criteria/asc_desc.rs
+++ b/milli/src/search/criteria/asc_desc.rs
@@ -9,9 +9,8 @@ use super::{Criterion, CriterionParameters, CriterionResult};
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec};
 use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
-use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
-use crate::search::facet::facet_sort_descending::descending_facet_sort;
-// use crate::search::facet::FacetStringIter;
+use crate::search::facet::ascending_facet_sort;
+use crate::search::facet::descending_facet_sort;
 use crate::search::query_tree::Operation;
 use crate::{FieldId, Index, Result};
 
diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs
index 01266187a..ab546f7a9 100644
--- a/milli/src/search/facet/facet_distribution_iter.rs
+++ b/milli/src/search/facet/facet_distribution_iter.rs
@@ -1,14 +1,23 @@
-use std::ops::ControlFlow;
-
-use heed::Result;
-use roaring::RoaringBitmap;
-
 use super::{get_first_facet_value, get_highest_level};
 use crate::heed_codec::facet::{
     ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec,
 };
 use crate::DocumentId;
+use heed::Result;
+use roaring::RoaringBitmap;
+use std::ops::ControlFlow;
 
+/// Call the given closure on the facet distribution of the candidate documents.
+///
+/// The arguments to the closure are:
+/// - the facet value, as a byte slice
+/// - the number of documents among the candidates that contain this facet value
+/// - the id of a document which contains the facet value. Note that this document
+/// is not necessarily from the list of candidates, it is simply *any* document which
+/// contains this facet value.
+///
+/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should
+/// keep iterating over the different facet values or stop.
 pub fn iterate_over_facet_distribution<'t, CB>(
     rtxn: &'t heed::RoTxn<'t>,
     db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs
index 6567fe95e..2b0a45e15 100644
--- a/milli/src/search/facet/facet_sort_ascending.rs
+++ b/milli/src/search/facet/facet_sort_ascending.rs
@@ -6,6 +6,28 @@ use crate::heed_codec::facet::{
     ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 
+/// Return an iterator which iterates over the given candidate documents in
+/// ascending order of their facet value for the given field id.
+///
+/// The documents returned by the iterator are grouped by the facet values that
+/// determined their rank. For example, given the documents:
+///
+/// ```ignore
+/// 0: { "colour": ["blue", "green"] }
+/// 1: { "colour": ["blue", "red"] }
+/// 2: { "colour": ["orange", "red"] }
+/// 3: { "colour": ["green", "red"] }
+/// 4: { "colour": ["blue", "orange", "red"] }
+/// ```
+/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
+/// over the following elements:
+/// ```ignore
+/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
+/// [3]    // same for "green"
+/// [2]    // same for "orange"
+/// END
+/// ```
+/// Note that once a document id is returned by the iterator, it is never returned again.
 pub fn ascending_facet_sort<'t>(
     rtxn: &'t heed::RoTxn<'t>,
     db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs
index 2eab9fca1..47d0f145b 100644
--- a/milli/src/search/facet/facet_sort_descending.rs
+++ b/milli/src/search/facet/facet_sort_descending.rs
@@ -8,6 +8,9 @@ use crate::heed_codec::facet::{
     ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 
+/// See the documentation for [`ascending_facet_sort`](super::ascending_facet_sort).
+///
+/// This function does the same thing, but in the opposite order.
 pub fn descending_facet_sort<'t>(
     rtxn: &'t heed::RoTxn<'t>,
     db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs
index 18c3a652b..b880c2e01 100644
--- a/milli/src/search/facet/mod.rs
+++ b/milli/src/search/facet/mod.rs
@@ -1,17 +1,19 @@
-use heed::types::ByteSlice;
-use heed::{BytesDecode, RoTxn};
-
 pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
 pub use self::filter::Filter;
 use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec};
+pub use facet_sort_ascending::ascending_facet_sort;
+pub use facet_sort_descending::descending_facet_sort;
+use heed::types::{ByteSlice, DecodeIgnore};
+use heed::{BytesDecode, RoTxn};
 
 mod facet_distribution;
 mod facet_distribution_iter;
 mod facet_range_search;
-pub mod facet_sort_ascending;
-pub mod facet_sort_descending;
+mod facet_sort_ascending;
+mod facet_sort_descending;
 mod filter;
 
+/// Get the first facet value in the facet database
 pub(crate) fn get_first_facet_value<'t, BoundCodec>(
     txn: &'t RoTxn,
     db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
@@ -23,8 +25,9 @@ where
     let mut level0prefix = vec![];
     level0prefix.extend_from_slice(&field_id.to_be_bytes());
     level0prefix.push(0);
-    let mut level0_iter_forward =
-        db.as_polymorph().prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?;
+    let mut level0_iter_forward = db
+        .as_polymorph()
+        .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?;
     if let Some(first) = level0_iter_forward.next() {
         let (first_key, _) = first?;
         let first_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(first_key)
@@ -34,6 +37,8 @@ where
         Ok(None)
     }
 }
+
+/// Get the last facet value in the facet database
 pub(crate) fn get_last_facet_value<'t, BoundCodec>(
     txn: &'t RoTxn,
     db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
@@ -47,7 +52,7 @@ where
     level0prefix.push(0);
     let mut level0_iter_backward = db
         .as_polymorph()
-        .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?;
+        .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?;
     if let Some(last) = level0_iter_backward.next() {
         let (last_key, _) = last?;
         let last_key =
FacetGroupKeyCodec::<BoundCodec>::bytes_decode(last_key)
@@ -57,6 +62,8 @@ where
         Ok(None)
     }
 }
+
+/// Get the height of the highest level in the facet database
 pub(crate) fn get_highest_level<'t>(
     txn: &'t RoTxn<'t>,
     db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>,
@@ -65,7 +72,7 @@ pub(crate) fn get_highest_level<'t>(
     let field_id_prefix = &field_id.to_be_bytes();
     Ok(db
         .as_polymorph()
-        .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix)?
+        .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, field_id_prefix)?
         .next()
         .map(|el| {
             let (key, _) = el.unwrap();

From 86d9f50b9c3d9456f1ba738a2b35fcfabbc688ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Thu, 8 Sep 2022 11:53:01 +0200
Subject: [PATCH 1727/1889] Fix bugs in incremental facet indexing with
 variable parameters

e.g. add one facet value incrementally with a group_size = X and then add
another one with group_size = Y

It is not actually possible to do so with the public API of milli, but I
wanted to make sure the algorithm worked well in those cases anyway.

The bugs were found by fuzzing the code with fuzzcheck, which I've added to
milli as a conditional dev-dependency. But it can be removed later.
---
 .gitignore                                      |   2 +
 milli/Cargo.toml                                |   3 +
 milli/src/lib.rs                                |   2 +
 milli/src/search/criteria/asc_desc.rs           |   3 +-
 .../search/facet/facet_distribution_iter.rs     |   8 +-
 milli/src/search/facet/mod.rs                   |   7 +-
 milli/src/update/facet/incremental.rs           | 614 +++++++++++-------
 milli/src/update/facet/mod.rs                   |  67 +-
 8 files changed, 435 insertions(+), 271 deletions(-)

diff --git a/.gitignore b/.gitignore
index cef7b7b4c..39623a232 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@
 /target
 /Cargo.lock
 
+milli/target/
+
 # datasets
 *.csv
 *.mmdb
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 658ef0d24..2f881fccb 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -56,6 +56,9 @@ maplit = "1.0.2"
 md5 = "0.7.0"
 rand = {version = "0.8.5", features = ["small_rng"] }
 
+[target.'cfg(fuzzing)'.dev-dependencies]
+fuzzcheck = { path = "../../fuzzcheck-rs/fuzzcheck" }
+
 [features]
 default = [ "charabia/default" ]
 
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index ffbe8f38f..630d13125 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -1,3 +1,5 @@
+#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
+
 #[macro_use]
 pub mod documents;
 
diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs
index 76dd3db29..586605116 100644
--- a/milli/src/search/criteria/asc_desc.rs
+++ b/milli/src/search/criteria/asc_desc.rs
@@ -9,8 +9,7 @@ use super::{Criterion, CriterionParameters, CriterionResult};
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec};
 use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
-use crate::search::facet::ascending_facet_sort;
-use crate::search::facet::descending_facet_sort;
+use crate::search::facet::{ascending_facet_sort, descending_facet_sort};
 use crate::search::query_tree::Operation;
 use crate::{FieldId, Index, Result};
 
diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs
index ab546f7a9..4c6dc75fa 100644
--- a/milli/src/search/facet/facet_distribution_iter.rs
+++ b/milli/src/search/facet/facet_distribution_iter.rs
@@ -1,11 +1,13 @@
+use std::ops::ControlFlow;
+
+use heed::Result;
+use roaring::RoaringBitmap;
+
 use super::{get_first_facet_value, get_highest_level};
 use crate::heed_codec::facet::{
     ByteSliceRef, FacetGroupKey,
FacetGroupValueCodec, }; use crate::DocumentId; -use heed::Result; -use roaring::RoaringBitmap; -use std::ops::ControlFlow; /// Call the given closure on the facet distribution of the candidate documents. /// diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index b880c2e01..be04fbd7f 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,11 +1,12 @@ -pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; -pub use self::filter::Filter; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; pub use facet_sort_ascending::ascending_facet_sort; pub use facet_sort_descending::descending_facet_sort; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, RoTxn}; +pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; +pub use self::filter::Filter; +use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; + mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index a06c8e1c2..c2115aee5 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -14,6 +14,7 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; enum InsertionResult { InPlace, + Expand, Insert, } enum DeletionResult { @@ -251,6 +252,7 @@ impl FacetsUpdateIncrementalInner { return Ok(InsertionResult::InPlace); } + InsertionResult::Expand => {} InsertionResult::Insert => {} } @@ -258,7 +260,7 @@ impl FacetsUpdateIncrementalInner { // of a new key. Therefore, it may be the case that we need to modify the left bound of the // insertion key (see documentation of `find_insertion_key_value` for an example of when that // could happen). - let insertion_key = { + let (insertion_key, insertion_key_was_modified) = { let mut new_insertion_key = insertion_key.clone(); let mut key_should_be_modified = false; @@ -271,7 +273,7 @@ impl FacetsUpdateIncrementalInner { assert!(is_deleted); self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; } - new_insertion_key + (new_insertion_key, key_should_be_modified) }; // Now we know that the insertion key contains the `facet_value`. @@ -280,20 +282,25 @@ impl FacetsUpdateIncrementalInner { // 2. Merge the previous docids with the new one let mut updated_value = insertion_value; - updated_value.size += 1; + if matches!(result, InsertionResult::Insert) { + updated_value.size += 1; + } if updated_value.size < max_group_size { updated_value.bitmap |= docids; self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - - return Ok(InsertionResult::InPlace); + if insertion_key_was_modified { + return Ok(InsertionResult::Expand); + } else { + return Ok(InsertionResult::InPlace); + } } // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` // Therefore it must be split into two nodes. 
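// For intuition, illustrative arithmetic only: if the node has just grown to
// `updated_value.size == 9`, the fixed split below yields `size_left == 4` and
// `size_right == 5`, so the two halves always cover the actual size of the
// node being split. The previous arithmetic, based on `max_group_size`, did
// not guarantee this once the parameters changed between calls, which is the
// class of bug this patch addresses.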
- let size_left = max_group_size / 2; - let size_right = max_group_size - size_left; + let size_left = updated_value.size / 2; + let size_right = updated_value.size - size_left; let level_below = level - 1; @@ -303,7 +310,8 @@ impl FacetsUpdateIncrementalInner { left_bound: insertion_key.left_bound.as_slice(), }; - let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); + let mut iter = + self.db.range(&txn, &(start_key..))?.take((size_left as usize) + (size_right as usize)); let group_left = { let mut values_left = RoaringBitmap::new(); @@ -368,6 +376,7 @@ impl FacetsUpdateIncrementalInner { self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; match result { InsertionResult::InPlace => return Ok(()), + InsertionResult::Expand => return Ok(()), InsertionResult::Insert => {} } @@ -393,8 +402,11 @@ impl FacetsUpdateIncrementalInner { .as_polymorph() .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?; + let nbr_new_groups = size_highest_level / self.group_size as usize; + let nbr_leftover_elements = size_highest_level % self.group_size as usize; + let mut to_add = vec![]; - for _ in 0..self.min_level_size { + for _ in 0..nbr_new_groups { let mut first_key = None; let mut values = RoaringBitmap::new(); for _ in 0..group_size { @@ -415,6 +427,30 @@ impl FacetsUpdateIncrementalInner { let value = FacetGroupValue { size: group_size as u8, bitmap: values }; to_add.push((key.into_owned(), value)); } + // now we add the rest of the level, in case its size is > group_size * min_level_size + // this can indeed happen if the min_level_size parameter changes between two calls to `insert` + if nbr_leftover_elements > 0 { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..nbr_leftover_elements { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetGroupKey { + field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + drop(groups_iter); for (key, value) in to_add { self.db.put(txn, &key.as_ref(), &value)?; @@ -983,243 +1019,345 @@ mod tests { // fuzz tests } -// #[cfg(all(test, fuzzing))] -// mod fuzz { -// use crate::codec::U16Codec; +#[cfg(all(test, fuzzing))] +mod fuzz { + use std::borrow::Cow; + use std::collections::{BTreeMap, HashMap}; + use std::convert::TryFrom; + use std::rc::Rc; -// use super::tests::verify_structure_validity; -// use super::*; -// use fuzzcheck::mutators::integer_within_range::U16WithinRangeMutator; -// use fuzzcheck::DefaultMutator; -// use roaring::RoaringBitmap; -// use std::collections::BTreeMap; -// use std::collections::HashMap; + use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator}; + use fuzzcheck::DefaultMutator; + use heed::BytesEncode; + use roaring::RoaringBitmap; + use tempfile::TempDir; -// #[derive(Default)] -// pub struct TrivialDatabase { -// pub elements: BTreeMap>, -// } -// impl TrivialDatabase -// where -// T: Ord + Clone + Copy + Eq + std::fmt::Debug, -// { -// pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) { -// if new_values.is_empty() { -// return; -// } -// let values_field_id = 
self.elements.entry(field_id).or_default(); -// let values = values_field_id.entry(new_key).or_default(); -// *values |= new_values; -// } -// pub fn delete(&mut self, field_id: u16, key: T, value: u32) { -// if let Some(values_field_id) = self.elements.get_mut(&field_id) { -// if let Some(values) = values_field_id.get_mut(&key) { -// values.remove(value); -// if values.is_empty() { -// values_field_id.remove(&key); -// } -// } -// if values_field_id.is_empty() { -// self.elements.remove(&field_id); -// } -// } -// } -// } -// #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] -// struct Operation { -// key: Key, -// #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })] -// field_id: u16, -// kind: OperationKind, -// } -// #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] -// enum OperationKind { -// Insert(Vec), -// Delete(u8), -// } + use super::*; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; -// fn compare_with_trivial_database( -// tempdir: Rc, -// group_size: u8, -// max_group_size: u8, -// operations: &[Operation], -// ) { -// let index = FacetIndex::::open_from_tempdir(tempdir, group_size, max_group_size); -// let mut trivial_db = TrivialDatabase::::default(); -// let mut value_to_keys = HashMap::>::new(); -// let mut txn = index.env.write_txn().unwrap(); -// for Operation { key, field_id, kind } in operations { -// match kind { -// OperationKind::Insert(values) => { -// let mut bitmap = RoaringBitmap::new(); -// for value in values { -// bitmap.insert(*value as u32); -// value_to_keys.entry(*value).or_default().push(*key); -// } -// index.insert(&mut txn, *field_id, key, &bitmap); -// trivial_db.insert(*field_id, *key, &bitmap); -// } -// OperationKind::Delete(value) => { -// if let Some(keys) = value_to_keys.get(value) { -// for key in keys { -// index.delete(&mut txn, *field_id, key, *value as u32); -// trivial_db.delete(*field_id, *key, *value as u32); -// } -// } -// } -// } -// } -// for (field_id, values_field_id) in trivial_db.elements.iter() { -// let level0iter = index -// .db -// .content -// .as_polymorph() -// .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( -// &mut txn, -// &field_id.to_be_bytes(), -// ) -// .unwrap(); + struct NEU16Codec; + impl<'a> BytesEncode<'a> for NEU16Codec { + type EItem = u16; + #[no_coverage] + fn bytes_encode(item: &'a Self::EItem) -> Option> { + Some(Cow::Owned(item.to_be_bytes().to_vec())) + } + } + impl<'a> BytesDecode<'a> for NEU16Codec { + type DItem = u16; + #[no_coverage] + fn bytes_decode(bytes: &'a [u8]) -> Option { + let bytes = <[u8; 2]>::try_from(&bytes[0..=1]).unwrap(); + Some(u16::from_be_bytes(bytes)) + } + } -// for ((key, values), group) in values_field_id.iter().zip(level0iter) { -// let (group_key, group_values) = group.unwrap(); -// let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); -// assert_eq!(key, &group_key.left_bound); -// assert_eq!(values, &group_values.bitmap); -// } -// } + #[derive(Default)] + pub struct TrivialDatabase { + pub elements: BTreeMap>, + } + impl TrivialDatabase + where + T: Ord + Clone + Copy + Eq + std::fmt::Debug, + { + #[no_coverage] + pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) { + if new_values.is_empty() { + return; + } + let values_field_id = self.elements.entry(field_id).or_default(); + let values = values_field_id.entry(new_key).or_default(); + *values |= new_values; + } + #[no_coverage] + pub fn delete(&mut self, field_id: u16, key: T, 
value: u32) { + if let Some(values_field_id) = self.elements.get_mut(&field_id) { + if let Some(values) = values_field_id.get_mut(&key) { + values.remove(value); + if values.is_empty() { + values_field_id.remove(&key); + } + } + if values_field_id.is_empty() { + self.elements.remove(&field_id); + } + } + } + } + #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] + struct Operation { + key: Key, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + group_size: u8, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + max_group_size: u8, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + min_level_size: u8, + #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })] + field_id: u16, + kind: OperationKind, + } + #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] + enum OperationKind { + Insert(Vec), + Delete(u8), + } -// txn.commit().unwrap(); -// let mut txn = index.env.write_txn().unwrap(); -// for (field_id, values_field_id) in trivial_db.elements.iter() { -// let level0iter = index -// .db -// .content -// .as_polymorph() -// .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) -// .unwrap(); + #[no_coverage] + fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { + let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten + // let mut txn = index.env.write_txn().unwrap(); + let mut txn = index.env.write_txn().unwrap(); -// for ((key, values), group) in values_field_id.iter().zip(level0iter) { -// let (group_key, group_values) = group.unwrap(); -// let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); -// assert_eq!(key, &group_key.left_bound); -// assert_eq!(values, &group_values.bitmap); -// } -// index.verify_structure_validity(*field_id); -// } + let mut trivial_db = TrivialDatabase::::default(); + let mut value_to_keys = HashMap::>::new(); + for Operation { key, group_size, max_group_size, min_level_size, field_id, kind } in + operations + { + index.set_group_size(*group_size); + index.set_max_group_size(*max_group_size); + index.set_min_level_size(*min_level_size); + match kind { + OperationKind::Insert(values) => { + let mut bitmap = RoaringBitmap::new(); + for value in values { + bitmap.insert(*value as u32); + value_to_keys.entry(*value).or_default().push(*key); + } + index.insert(&mut txn, *field_id, key, &bitmap); + trivial_db.insert(*field_id, *key, &bitmap); + } + OperationKind::Delete(value) => { + if let Some(keys) = value_to_keys.get(value) { + for key in keys { + index.delete(&mut txn, *field_id, key, *value as u32); + trivial_db.delete(*field_id, *key, *value as u32); + } + } + } + } + } -// index.db.content.clear(&mut txn).unwrap(); -// txn.commit().unwrap(); -// } + for (field_id, values_field_id) in trivial_db.elements.iter() { + let level0iter = index + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + &mut txn, + &field_id.to_be_bytes(), + ) + .unwrap(); -// #[test] -// fn fuzz() { -// let tempdir = Rc::new(TempDir::new().unwrap()); -// let tempdir_cloned = tempdir.clone(); -// let result = fuzzcheck::fuzz_test(move |x: &(u8, u8, Vec>)| { -// compare_with_trivial_database(tempdir_cloned.clone(), x.0, x.1, &x.2) -// }) -// .default_mutator() -// .serde_serializer() -// .default_sensor_and_pool_with_custom_filter(|file, function| { -// if file.is_relative() -// && 
!function.contains("serde") -// && !function.contains("tests::") -// && !function.contains("fuzz::") -// && !function.contains("display_bitmap") -// { -// true -// } else { -// false -// } -// }) -// .arguments_from_cargo_fuzzcheck() -// .launch(); -// assert!(!result.found_test_failure); -// } + for ((key, values), group) in values_field_id.iter().zip(level0iter) { + let (group_key, group_values) = group.unwrap(); + let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + assert_eq!(key, &group_key.left_bound); + assert_eq!(values, &group_values.bitmap); + } + } -// #[test] -// fn reproduce_bug() { -// let operations = r#" -// [ -// {"key":0, "field_id": 0, "kind":{"Insert":[109]}}, -// {"key":143, "field_id": 0, "kind":{"Insert":[243]}}, -// {"key":90, "field_id": 0, "kind":{"Insert":[217]}}, -// {"key":172, "field_id": 0, "kind":{"Insert":[94]}}, -// {"key":27, "field_id": 0, "kind":{"Insert":[4]}}, -// {"key":124, "field_id": 0, "kind":{"Insert":[0]}}, -// {"key":123, "field_id": 0, "kind":{"Insert":[0]}}, -// {"key":67, "field_id": 0, "kind":{"Insert":[109]}}, -// {"key":13, "field_id": 0, "kind":{"Insert":[0]}}, -// {"key":162, "field_id": 0, "kind":{"Insert":[213]}}, -// {"key":235, "field_id": 0, "kind":{"Insert":[67]}}, -// {"key":251, "field_id": 0, "kind":{"Insert":[50]}}, -// {"key":218, "field_id": 0, "kind":{"Insert":[164]}}, -// {"key":166, "field_id": 0, "kind":{"Insert":[67]}}, -// {"key":64, "field_id": 0, "kind":{"Insert":[61]}}, -// {"key":183, "field_id": 0, "kind":{"Insert":[210]}}, -// {"key":250, "field_id": 0, "kind":{"Delete":50}} -// ] -// "#; -// let operations: Vec> = serde_json::from_str(operations).unwrap(); -// let tempdir = TempDir::new().unwrap(); -// compare_with_trivial_database(Rc::new(tempdir), 4, 8, &operations); -// } + for (field_id, values_field_id) in trivial_db.elements.iter() { + let level0iter = index + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) + .unwrap(); -// #[test] -// fn reproduce_bug2() { -// let operations = r#" -// [ -// {"key":102, "field_id": 0, "kind":{"Insert":[122]}}, -// {"key":73, "field_id": 0, "kind":{"Insert":[132]}}, -// {"key":20, "field_id": 0, "kind":{"Insert":[215]}}, -// {"key":39, "field_id": 0, "kind":{"Insert":[152]}}, -// {"key":151, "field_id": 0, "kind":{"Insert":[226]}}, -// {"key":17, "field_id": 0, "kind":{"Insert":[101]}}, -// {"key":74, "field_id": 0, "kind":{"Insert":[210]}}, -// {"key":2, "field_id": 0, "kind":{"Insert":[130]}}, -// {"key":64, "field_id": 0, "kind":{"Insert":[180]}}, -// {"key":83, "field_id": 0, "kind":{"Insert":[250]}}, -// {"key":80, "field_id": 0, "kind":{"Insert":[210]}}, -// {"key":113, "field_id": 0, "kind":{"Insert":[63]}}, -// {"key":201, "field_id": 0, "kind":{"Insert":[210]}}, -// {"key":200, "field_id": 0, "kind":{"Insert":[5]}}, -// {"key":93, "field_id": 0, "kind":{"Insert":[98]}}, -// {"key":162, "field_id": 0, "kind":{"Insert":[5]}}, -// {"key":80, "field_id": 0, "kind":{"Delete":210}} -// ] -// "#; -// let operations: Vec> = serde_json::from_str(operations).unwrap(); -// let tempdir = TempDir::new().unwrap(); -// compare_with_trivial_database(Rc::new(tempdir), 4, 8, &operations); -// } -// #[test] -// fn reproduce_bug3() { -// let operations = r#" -// [ -// {"key":27488, "field_id": 0, "kind":{"Insert":[206]}}, -// {"key":64716, "field_id": 0, "kind":{"Insert":[216]}}, -// {"key":60886, "field_id": 0, "kind":{"Insert":[206]}}, -// {"key":59509, "field_id": 0, "kind":{"Insert":[187,231]}}, 
-// {"key":55057, "field_id": 0, "kind":{"Insert":[37]}}, -// {"key":45200, "field_id": 0, "kind":{"Insert":[206]}}, -// {"key":55056, "field_id": 0, "kind":{"Insert":[37]}}, -// {"key":63679, "field_id": 0, "kind":{"Insert":[206]}}, -// {"key":52155, "field_id": 0, "kind":{"Insert":[74]}}, -// {"key":20648, "field_id": 0, "kind":{"Insert":[47,138,157]}} -// ] -// "#; -// let operations: Vec> = serde_json::from_str(operations).unwrap(); -// let tempdir = TempDir::new().unwrap(); -// compare_with_trivial_database(Rc::new(tempdir), 0, 7, &operations); -// } + for ((key, values), group) in values_field_id.iter().zip(level0iter) { + let (group_key, group_values) = group.unwrap(); + let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + assert_eq!(key, &group_key.left_bound); + assert_eq!(values, &group_values.bitmap); + } + index.verify_structure_validity(&txn, *field_id); + } + txn.abort().unwrap(); + } -// #[test] -// fn reproduce_bug4() { -// let operations = r#" -// [{"key":63499, "field_id": 0, "kind":{"Insert":[87]}},{"key":25374, "field_id": 0, "kind":{"Insert":[14]}},{"key":64481, "field_id": 0, "kind":{"Delete":87}},{"key":23038, "field_id": 0, "kind":{"Insert":[173]}},{"key":14862, "field_id": 0, "kind":{"Insert":[8]}},{"key":13145, "field_id": 0, "kind":{"Insert":[5,64]}},{"key":23446, "field_id": 0, "kind":{"Insert":[86,59]}},{"key":17972, "field_id": 0, "kind":{"Insert":[58,137]}},{"key":21273, "field_id": 0, "kind":{"Insert":[121,132,81,147]}},{"key":28264, "field_id": 0, "kind":{"Insert":[36]}},{"key":46659, "field_id": 0, "kind":{"Insert":[]}}] -// "#; -// let operations: Vec> = serde_json::from_str(operations).unwrap(); -// let tempdir = TempDir::new().unwrap(); -// compare_with_trivial_database(Rc::new(tempdir), 2, 1, &operations); -// } -// } + #[test] + #[no_coverage] + fn fuzz() { + let tempdir = Rc::new(TempDir::new().unwrap()); + let tempdir_cloned = tempdir.clone(); + let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { + compare_with_trivial_database(tempdir_cloned.clone(), operations) + }) + .default_mutator() + .serde_serializer() + .default_sensor_and_pool_with_custom_filter(|file, function| { + file == std::path::Path::new("milli/src/update/facet/incremental.rs") + && !function.contains("serde") + && !function.contains("tests::") + && !function.contains("fuzz::") + && !function.contains("display_bitmap") + }) + .arguments_from_cargo_fuzzcheck() + .launch(); + assert!(!result.found_test_failure); + } + + #[test] + #[no_coverage] + fn reproduce_bug1() { + let operations = r#" + [ + {"key":0, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, + {"key":143, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[243]}}, + {"key":90, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[217]}}, + {"key":172, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[94]}}, + {"key":27, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[4]}}, + {"key":124, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, + {"key":123, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, + {"key":67, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, + {"key":13, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, 
"kind":{"Insert":[0]}}, + {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[213]}}, + {"key":235, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, + {"key":251, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[50]}}, + {"key":218, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[164]}}, + {"key":166, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, + {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[61]}}, + {"key":183, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":50}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug2() { + let operations = r#" + [ + {"key":102, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[122]}}, + {"key":73, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[132]}}, + {"key":20, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[215]}}, + {"key":39, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[152]}}, + {"key":151, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[226]}}, + {"key":17, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[101]}}, + {"key":74, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":2, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[130]}}, + {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[180]}}, + {"key":83, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[250]}}, + {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":113, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[63]}}, + {"key":201, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":200, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, + {"key":93, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[98]}}, + {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, + {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":210}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + #[test] + #[no_coverage] + fn reproduce_bug3() { + let operations = r#" + [ + {"key":27488, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":64716, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[216]}}, + {"key":60886, "field_id": 
0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":59509, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[187,231]}}, + {"key":55057, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, + {"key":45200, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":55056, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, + {"key":63679, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":52155, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[74]}}, + {"key":20648, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[47,138,157]}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug4() { + let operations = r#"[ + {"key":63499, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[87]}}, + {"key":25374, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[14]}}, + {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":87}}, + {"key":23038, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[173]}}, + {"key":14862, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[8]}}, + {"key":13145, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[5,64]}}, + {"key":23446, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[86,59]}}, + {"key":17972, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[58,137]}}, + {"key":21273, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[121,132,81,147]}}, + {"key":28264, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[36]}}, + {"key":46659, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[]}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug5() { + let input = r#" + [ + { + "key":3438, + "group_size":11, + "max_group_size":0, + "min_level_size":17, + "field_id":3, + "kind":{"Insert":[198]} + }, + + { + "key":47098, + "group_size":0, + "max_group_size":8, + "min_level_size":0, + "field_id":3, + "kind":{"Insert":[11]} + }, + { + "key":22453, + "group_size":0, + "max_group_size":0, + "min_level_size":0, + "field_id":3, + "kind":{"Insert":[145]} + }, + { + "key":14105, + "group_size":14, + "max_group_size":4, + "min_level_size":25, + "field_id":3, + "kind":{"Delete":11} + } + ] + "#; + let operations: Vec> = serde_json::from_str(input).unwrap(); + let tmpdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tmpdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug6() { + let input = r#" + [ + {"key":45720,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[120]}}, + 
{"key":37463,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[187]}}, + {"key":21512,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, + {"key":21511,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, + {"key":37737,"group_size":12,"max_group_size":0,"min_level_size":6,"field_id":0,"kind":{"Insert":[181]}}, + {"key":53042,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}} + ] + "#; + let operations: Vec> = serde_json::from_str(input).unwrap(); + let tmpdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tmpdir), &operations); + } +} diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 9263d3a6a..e7d14c788 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -145,6 +145,7 @@ impl<'i> FacetsUpdate<'i> { #[cfg(test)] pub(crate) mod tests { + use std::cell::Cell; use std::fmt::Display; use std::marker::PhantomData; use std::rc::Rc; @@ -170,9 +171,9 @@ pub(crate) mod tests { { pub env: Env, pub content: heed::Database, FacetGroupValueCodec>, - pub group_size: u8, - pub min_level_size: u8, - pub max_group_size: u8, + pub group_size: Cell, + pub min_level_size: Cell, + pub max_group_size: Cell, _tempdir: Rc, _phantom: PhantomData, } @@ -189,9 +190,9 @@ pub(crate) mod tests { max_group_size: u8, min_level_size: u8, ) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 - let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 - let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let group_size = std::cmp::min(16, std::cmp::max(group_size, 2)); // 2 <= x <= 16 + let max_group_size = std::cmp::min(16, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 16 + let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17 let mut options = heed::EnvOpenOptions::new(); let options = options.map_size(4096 * 4 * 10 * 100); @@ -202,13 +203,11 @@ pub(crate) mod tests { let content = env.open_database(None).unwrap().unwrap(); FacetIndex { - db: Database { - content, - group_size, - max_group_size, - min_level_size, - _tempdir: tempdir, - }, + content, + group_size: Cell::new(group_size), + max_group_size: Cell::new(max_group_size), + min_level_size: Cell::new(min_level_size), + _tempdir: tempdir, env, _phantom: PhantomData, } @@ -229,14 +228,32 @@ pub(crate) mod tests { FacetIndex { content, - group_size, - max_group_size, - min_level_size, + group_size: Cell::new(group_size), + max_group_size: Cell::new(max_group_size), + min_level_size: Cell::new(min_level_size), _tempdir: Rc::new(tempdir), env, _phantom: PhantomData, } } + + pub fn set_group_size(&self, group_size: u8) { + // 2 <= x <= 64 + self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2))); + } + pub fn set_max_group_size(&self, max_group_size: u8) { + // 2*group_size <= x <= 128 + let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size)); + self.max_group_size.set(max_group_size); + if self.group_size.get() < max_group_size / 2 { + self.group_size.set(max_group_size / 2); + } + } + pub fn set_min_level_size(&self, min_level_size: u8) { + // 1 <= x <= inf + self.min_level_size.set(std::cmp::max(1, min_level_size)); + } + pub fn insert<'a>( &self, wtxn: &'a mut RwTxn, @@ -246,9 +263,9 @@ pub(crate) mod tests 
{ ) { let update = FacetsUpdateIncrementalInner { db: self.content, - group_size: self.group_size, - min_level_size: self.min_level_size, - max_group_size: self.max_group_size, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); @@ -262,9 +279,9 @@ pub(crate) mod tests { ) { let update = FacetsUpdateIncrementalInner { db: self.content, - group_size: self.group_size, - min_level_size: self.min_level_size, - max_group_size: self.max_group_size, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.delete(wtxn, field_id, &key_bytes, value).unwrap(); @@ -296,8 +313,8 @@ pub(crate) mod tests { let update = FacetsUpdateBulkInner { db: self.content, new_data: Some(reader), - group_size: self.group_size, - min_level_size: self.min_level_size, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), }; update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); @@ -341,7 +358,7 @@ pub(crate) mod tests { FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() }; - assert!(value.size > 0 && value.size < self.max_group_size); + assert!(value.size > 0); let mut actual_size = 0; let mut values_below = RoaringBitmap::new(); From 3baa34d84214924854e297b8fbf9028d25822454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 13:10:45 +0200 Subject: [PATCH 1728/1889] Fix compiler errors/warnings --- milli/src/update/facet/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index e7d14c788..c5046784f 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -237,10 +237,12 @@ pub(crate) mod tests { } } + #[cfg(all(test, fuzzing))] pub fn set_group_size(&self, group_size: u8) { // 2 <= x <= 64 self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2))); } + #[cfg(all(test, fuzzing))] pub fn set_max_group_size(&self, max_group_size: u8) { // 2*group_size <= x <= 128 let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size)); @@ -249,6 +251,7 @@ pub(crate) mod tests { self.group_size.set(max_group_size / 2); } } + #[cfg(all(test, fuzzing))] pub fn set_min_level_size(&self, min_level_size: u8) { // 1 <= x <= inf self.min_level_size.set(std::cmp::max(1, min_level_size)); From cb8442a119c7bb8e7acaeeb433cf7124597d8b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 13:28:17 +0200 Subject: [PATCH 1729/1889] Further unify facet databases of f64s and strings --- ...4_codec.rs => field_doc_id_facet_codec.rs} | 30 +++--- .../facet/field_doc_id_facet_string_codec.rs | 50 ---------- milli/src/heed_codec/facet/mod.rs | 12 ++- milli/src/search/mod.rs | 2 +- milli/src/update/delete_documents.rs | 98 +++++++------------ 5 files changed, 63 insertions(+), 129 deletions(-) rename milli/src/heed_codec/facet/{field_doc_id_facet_f64_codec.rs => field_doc_id_facet_codec.rs} (54%) delete mode 100644 milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs similarity index 54% rename from milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs rename to 
milli/src/heed_codec/facet/field_doc_id_facet_codec.rs index 22159601c..7c636e98a 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -1,13 +1,15 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; use crate::{try_split_array_at, DocumentId, FieldId}; +use heed::{BytesDecode, BytesEncode}; +use std::borrow::Cow; +use std::marker::PhantomData; -pub struct FieldDocIdFacetF64Codec; +pub struct FieldDocIdFacetCodec(PhantomData); -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { - type DItem = (FieldId, DocumentId, f64); +impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec +where + C: BytesDecode<'a>, +{ + type DItem = (FieldId, DocumentId, C::DItem); fn bytes_decode(bytes: &'a [u8]) -> Option { let (field_id_bytes, bytes) = try_split_array_at(bytes)?; @@ -16,22 +18,24 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = u32::from_be_bytes(document_id_bytes); - let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; + let value = C::bytes_decode(&bytes[8..])?; Some((field_id, document_id, value)) } } -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec { - type EItem = (FieldId, DocumentId, f64); +impl<'a, C> BytesEncode<'a> for FieldDocIdFacetCodec +where + C: BytesEncode<'a>, +{ + type EItem = (FieldId, DocumentId, C::EItem); - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8); bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.extend_from_slice(&document_id.to_be_bytes()); - let value_bytes = f64_into_bytes(*value)?; + let value_bytes = C::bytes_encode(value)?; bytes.extend_from_slice(&value_bytes); - bytes.extend_from_slice(&value.to_be_bytes()); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs deleted file mode 100644 index 178bb21c1..000000000 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldDocIdFacetStringCodec; - -impl FieldDocIdFacetStringCodec { - pub fn serialize_into( - field_id: FieldId, - document_id: DocumentId, - normalized_value: &str, - out: &mut Vec, - ) { - out.reserve(2 + 4 + normalized_value.len()); - out.extend_from_slice(&field_id.to_be_bytes()); - out.extend_from_slice(&document_id.to_be_bytes()); - out.extend_from_slice(normalized_value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { - type DItem = (FieldId, DocumentId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let (document_id_bytes, bytes) = try_split_array_at(bytes)?; - let document_id = u32::from_be_bytes(document_id_bytes); - - let normalized_value = str::from_utf8(bytes).ok()?; - Some((field_id, document_id, normalized_value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { - type EItem = (FieldId, DocumentId, &'a str); - - fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option> { - let mut bytes = 
Vec::new(); - FieldDocIdFacetStringCodec::serialize_into( - *field_id, - *document_id, - normalized_value, - &mut bytes, - ); - Some(Cow::Owned(bytes)) - } -}
diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 2e9f0b212..8db8b7df1 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,5 +1,4 @@ -mod field_doc_id_facet_f64_codec; -mod field_doc_id_facet_string_codec; +mod field_doc_id_facet_codec; mod ordered_f64_codec; mod str_ref; @@ -7,16 +6,19 @@ use std::borrow::Cow; use std::convert::TryFrom; use std::marker::PhantomData; -use heed::types::OwnedType; +use heed::types::{DecodeIgnore, OwnedType}; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; -pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; -pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; pub use self::ordered_f64_codec::OrderedF64Codec; pub use self::str_ref::StrRefCodec; use crate::{CboRoaringBitmapCodec, BEU16}; +pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>; +pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>; +pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>; + pub type FieldIdCodec = OwnedType<BEU16>; /// Tries to split a slice in half at the given middle point,
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index e6651737c..f62a37c1b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, /* FacetNumberIter,*/ Filter, DEFAULT_VALUES_PER_FACET,}; +pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 14ef5fd6a..a56a61026 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,7 +1,7 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; -use heed::types::{ByteSlice, Str}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -11,11 +11,13 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetIgnoreCodec, +}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, + ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, }; @@ -187,10 +189,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_position_docids, word_prefix_position_docids, facet_id_f64_docids: _, - facet_id_exists_docids, facet_id_string_docids: _, - field_id_docid_facet_f64s, - field_id_docid_facet_strings, + field_id_docid_facet_f64s: _, + field_id_docid_facet_strings: _, + facet_id_exists_docids, documents, } = self.index; @@ -449,6 +451,21 @@ impl<'t, 'u, 'i>
DeleteDocuments<'t, 'u, 'i> { fields_ids_map.clone(), facet_type, )?; + for field_id in self.index.faceted_fields_ids(self.wtxn)? { + // Remove docids from the number faceted documents ids + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?; + docids -= &self.to_delete_docids; + self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; + + remove_docids_from_field_id_docid_facet_value( + &self.index, + self.wtxn, + facet_type, + field_id, + &self.to_delete_docids, + )?; + } } // We delete the documents ids that are under the facet field id values. @@ -458,47 +475,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &self.to_delete_docids, )?; - // Remove the documents ids from the faceted documents ids. - for field_id in self.index.faceted_fields_ids(self.wtxn)? { - // Remove docids from the number faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::Number)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::Number, - &docids, - )?; - - remove_docids_from_field_id_docid_facet_value( - self.wtxn, - field_id_docid_facet_f64s, - field_id, - &self.to_delete_docids, - |(_fid, docid, _value)| docid, - )?; - - // Remove docids from the string faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::String)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::String, - &docids, - )?; - - remove_docids_from_field_id_docid_facet_value( - self.wtxn, - field_id_docid_facet_strings, - field_id, - &self.to_delete_docids, - |(_fid, docid, _value)| docid, - )?; - } - Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), @@ -564,26 +540,28 @@ fn remove_from_word_docids( Ok(()) } -fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( +fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( + index: &'i Index, wtxn: &'a mut heed::RwTxn, - db: &heed::Database, + facet_type: FacetType, field_id: FieldId, to_remove: &RoaringBitmap, - convert: F, -) -> heed::Result<()> -where - C: heed::BytesDecode<'a, DItem = K>, - DC: heed::BytesDecode<'a, DItem = V>, - F: Fn(K) -> DocumentId, -{ +) -> heed::Result<()> { + let db = match facet_type { + FacetType::String => { + index.field_id_docid_facet_strings.remap_types::() + } + FacetType::Number => { + index.field_id_docid_facet_f64s.remap_types::() + } + }; let mut iter = db - .remap_key_type::() .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::(); while let Some(result) = iter.next() { - let (key, _) = result?; - if to_remove.contains(convert(key)) { + let ((_, docid, _), _) = result?; + if to_remove.contains(docid) { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? 
}; } From 51961e10645135d0f7cfc76db9bc98d8ec9a1dc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 13:41:01 +0200 Subject: [PATCH 1730/1889] Polish some details --- .../facet/field_doc_id_facet_codec.rs | 6 ++-- milli/src/heed_codec/facet/mod.rs | 4 +++ milli/src/heed_codec/facet/str_ref.rs | 2 ++ .../extract/extract_facet_string_docids.rs | 5 +--- .../helpers/merge_functions.rs | 28 ------------------- 5 files changed, 11 insertions(+), 34 deletions(-) diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs index 7c636e98a..4e18a0145 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -1,8 +1,10 @@ -use crate::{try_split_array_at, DocumentId, FieldId}; -use heed::{BytesDecode, BytesEncode}; use std::borrow::Cow; use std::marker::PhantomData; +use heed::{BytesDecode, BytesEncode}; + +use crate::{try_split_array_at, DocumentId, FieldId}; + pub struct FieldDocIdFacetCodec(PhantomData); impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 8db8b7df1..35ec925dc 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -40,6 +40,8 @@ pub struct FacetGroupKey { pub left_bound: T, } +/// The value in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`] +/// databases. #[derive(Debug)] pub struct FacetGroupValue { pub size: u8, @@ -102,6 +104,8 @@ impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { } } +/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated +/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure. pub struct ByteSliceRef; impl<'a> BytesEncode<'a> for ByteSliceRef { diff --git a/milli/src/heed_codec/facet/str_ref.rs b/milli/src/heed_codec/facet/str_ref.rs index 80a51c803..36e702627 100644 --- a/milli/src/heed_codec/facet/str_ref.rs +++ b/milli/src/heed_codec/facet/str_ref.rs @@ -2,6 +2,8 @@ use std::borrow::Cow; use heed::{BytesDecode, BytesEncode}; +/// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated +/// types are equivalent (= `&'a str`) and these values can reside within another structure. 
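The EItem = DItem property described here is what allows StrRefCodec, and likewise ByteSliceRef, to be nested inside the generic FieldDocIdFacetCodec<C> introduced earlier in this series: the outer codec owns a fixed six-byte prefix (a big-endian u16 field id followed by a big-endian u32 document id) and hands every remaining byte to the inner codec C. A standalone sketch of that layout, not part of the patch, with plain functions standing in for the codec traits:

    use std::convert::TryInto;

    // [ field_id: u16 BE | document_id: u32 BE | value bytes, variable length ]
    fn encode(field_id: u16, document_id: u32, value: &[u8]) -> Vec<u8> {
        let mut bytes = Vec::with_capacity(2 + 4 + value.len());
        bytes.extend_from_slice(&field_id.to_be_bytes());
        bytes.extend_from_slice(&document_id.to_be_bytes());
        bytes.extend_from_slice(value); // an inner codec would produce these bytes
        bytes
    }

    fn decode(bytes: &[u8]) -> Option<(u16, u32, &[u8])> {
        if bytes.len() < 6 {
            return None;
        }
        let field_id = u16::from_be_bytes(bytes[..2].try_into().ok()?);
        let document_id = u32::from_be_bytes(bytes[2..6].try_into().ok()?);
        // Everything after the prefix belongs to the inner codec, which is why
        // the inner decoded item must be able to borrow from this same slice.
        Some((field_id, document_id, &bytes[6..]))
    }

    fn main() {
        let bytes = encode(1, 42, b"red");
        assert_eq!(decode(&bytes), Some((1, 42, &b"red"[..])));
    }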
pub struct StrRefCodec; impl<'a> BytesEncode<'a> for StrRefCodec { type EItem = &'a str; diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index a7b027ce3..bf523cbb3 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -33,10 +33,6 @@ pub fn extract_facet_string_docids( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - // document_id_bytes is a big-endian u32 - // merge_cbo_roaring_bitmap works with native endian u32s - // that is a problem, I think - let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -45,6 +41,7 @@ pub fn extract_facet_string_docids( let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + // document id is encoded in native-endian because of the CBO roaring bitmap codec facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index cef27ab30..37af7ab6a 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -5,7 +5,6 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use super::read_u32_ne_bytes; -// use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -49,33 +48,6 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul } } -// pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( -// _key: &[u8], -// values: &[Cow<'a, [u8]>], -// ) -> Result> { -// if values.len() == 1 { -// Ok(values[0].clone()) -// } else { -// let original = decode_prefix_string(&values[0]).unwrap().0; -// let merged_bitmaps = values -// .iter() -// .map(AsRef::as_ref) -// .map(decode_prefix_string) -// .map(Option::unwrap) -// .map(|(_, bitmap_bytes)| bitmap_bytes) -// .map(RoaringBitmap::deserialize_from) -// .map(StdResult::unwrap) -// .reduce(|a, b| a | b) -// .unwrap(); - -// let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); -// let mut buffer = Vec::with_capacity(cap); -// encode_prefix_string(original, &mut buffer)?; -// merged_bitmaps.serialize_into(&mut buffer)?; -// Ok(Cow::Owned(buffer)) -// } -// } - pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { Ok(values[0].clone()) } From 1ecd3bb8227b1d389e8f71d2d7140ee6c54fac8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 11:02:30 +0200 Subject: [PATCH 1731/1889] Fix bug in FieldDocIdFacetCodec --- milli/src/heed_codec/facet/field_doc_id_facet_codec.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs index 4e18a0145..cc9919ad2 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -20,7 +20,7 @@ where let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = 
u32::from_be_bytes(document_id_bytes); - let value = C::bytes_decode(&bytes[8..])?; + let value = C::bytes_decode(bytes)?; Some((field_id, document_id, value)) } @@ -33,10 +33,11 @@ where type EItem = (FieldId, DocumentId, C::EItem); fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option<Cow<'a, [u8]>> { - let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.extend_from_slice(&document_id.to_be_bytes()); + let mut bytes = Vec::with_capacity(32); + bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes + bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes let value_bytes = C::bytes_encode(value)?; + // variable length: 16 bytes for an f64, potentially much larger for a string bytes.extend_from_slice(&value_bytes); Some(Cow::Owned(bytes)) }
From a2270b7432d2921603df502f6befc88d58f75118 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 11:07:05 +0200 Subject: [PATCH 1732/1889] Change fuzzcheck dependency to point to git repository --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 2f881fccb..49988da0b 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,7 +57,7 @@ md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } [target.'cfg(fuzzing)'.dev-dependencies] -fuzzcheck = { path = "../../fuzzcheck-rs/fuzzcheck" } +fuzzcheck = { git = "https://github.com/loiclec/fuzzcheck-rs", branch = "main" } [features] default = [ "charabia/default" ]
From d0109627b901178182f0ec0102d365080c683618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 14:39:11 +0200 Subject: [PATCH 1733/1889] Fix a bug in facet_range_search and add documentation --- milli/src/search/facet/facet_range_search.rs | 122 ++++++++++++++---- milli/src/search/facet/mod.rs | 31 +++++ .../excluded_2.hash.snap | 4 + .../excluded_3.hash.snap | 4 + .../included_2.hash.snap | 4 + .../included_3.hash.snap | 4 + .../excluded_2.hash.snap | 4 + .../excluded_3.hash.snap | 4 + .../included_2.hash.snap | 4 + .../included_3.hash.snap | 4 + .../filter_range_pinch/excluded_2.hash.snap | 4 + .../filter_range_pinch/excluded_3.hash.snap | 4 + .../filter_range_pinch/included_2.hash.snap | 4 + .../filter_range_pinch/included_3.hash.snap | 4 + 14 files changed, 173 insertions(+), 28 deletions(-) create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap create mode 100644
milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap
diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 8934873b7..a7b4674f1 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -9,6 +9,8 @@ use crate::heed_codec::facet::{ }; use crate::Result; +/// Find all the document ids for which the given field contains a value contained within +/// the two bounds. pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( rtxn: &'t heed::RoTxn<'t>, db: heed::Database<FacetGroupKeyCodec<BoundCodec>, FacetGroupValueCodec>, field_id: u16, @@ -24,11 +26,11 @@ where let inner; let left = match left { Bound::Included(left) => { - inner = BoundCodec::bytes_encode(left).unwrap(); + inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; Bound::Included(inner.as_ref()) } Bound::Excluded(left) => { - inner = BoundCodec::bytes_encode(left).unwrap(); + inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; Bound::Excluded(inner.as_ref()) } Bound::Unbounded => Bound::Unbounded, }; let inner; let right = match right { Bound::Included(right) => { - inner = BoundCodec::bytes_encode(right).unwrap(); + inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; Bound::Included(inner.as_ref()) } Bound::Excluded(right) => { - inner = BoundCodec::bytes_encode(right).unwrap(); + inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; Bound::Excluded(inner.as_ref()) } Bound::Unbounded => Bound::Unbounded, }; let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::<ByteSliceRef>(rtxn, db, field_id)? { - let last_bound = get_last_facet_value::<ByteSliceRef>(rtxn, db, field_id)?.unwrap(); - f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; + if let Some(starting_left_bound) = get_first_facet_value::<ByteSliceRef>(rtxn, db, field_id)? { + let rightmost_bound = + Bound::Included(get_last_facet_value::<ByteSliceRef>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + let group_size = usize::MAX; + f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) } else { return Ok(()); } @@ -107,7 +111,25 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { Ok(()) } - /// Recursive part of the algorithm for level > 0 + /// Recursive part of the algorithm for level > 0. + /// + /// It works by visiting a slice of a level and checking whether the range associated + /// with each visited element is contained within the bounds. + /// + /// 1. So long as the element's range is less than the left bound, we do nothing and keep iterating. + /// 2. If the element's range is fully contained by the bounds, then all of its docids are added to + /// the roaring bitmap. + /// 3. If the element's range merely intersects the bounds, then we call the algorithm recursively + /// on the children of the element from the level below. + /// 4. If the element's range is greater than the right bound, we do nothing and stop iterating.
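A standalone sketch of the four cases above, not part of the patch, simplified to inclusive u32 bounds (the real code also handles excluded and unbounded ends):

    enum Action {
        Skip,      // case 1: the element lies entirely below the queried range
        TakeWhole, // case 2: fully contained, add all of its docids
        Recurse,   // case 3: straddles a bound, inspect its children one level down
        Stop,      // case 4: entirely above the range, nothing further can match
    }

    // One element groups the values element_left..=element_right; the query
    // asks for left..=right.
    fn visit(element_left: u32, element_right: u32, left: u32, right: u32) -> Action {
        if element_right < left {
            Action::Skip
        } else if left <= element_left && element_right <= right {
            Action::TakeWhole
        } else if element_left > right {
            Action::Stop
        } else {
            Action::Recurse
        }
    }

    fn main() {
        assert!(matches!(visit(0, 3, 10, 20), Action::Skip));
        assert!(matches!(visit(12, 15, 10, 20), Action::TakeWhole));
        assert!(matches!(visit(18, 25, 10, 20), Action::Recurse));
        assert!(matches!(visit(30, 40, 10, 20), Action::Stop));
    }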
+ /// Note that the right bound is found through either the `left_bound` of the *next* element, + /// or from the `rightmost_bound` argument + /// + /// ## Arguments + /// - `level`: the level being visited + /// - `starting_left_bound`: the left_bound of the first element to visit + /// - `rightmost_bound`: the right bound of the last element that should be visited + /// - `group_size`: the number of elements that should be visited fn run( &mut self, level: u8, @@ -123,13 +145,14 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound }; let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + // We iterate over the range while keeping in memory the previous value let (mut previous_key, mut previous_value) = iter.next().unwrap()?; for el in iter { let (next_key, next_value) = el?; - // the right of the iter range is unbounded, so we need to make sure that we are not iterating - // on the next field id + // the right of the iter range is potentially unbounded (e.g. if `group_size` is usize::MAX), + // so we need to make sure that we are not iterating on the next field id if next_key.field_id != self.field_id { - return Ok(()); + break; } // now, do we skip, stop, or visit? let should_skip = { @@ -176,6 +199,8 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { previous_value = next_value; continue; } + // from here, we should visit the children of the previous element and + // call the function recursively let level = level - 1; let starting_left_bound = previous_key.left_bound; @@ -187,7 +212,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { previous_key = next_key; previous_value = next_value; } - // previous_key/previous_value are the last element + // previous_key/previous_value are the last element's key/value // now, do we skip, stop, or visit? let should_skip = { @@ -224,18 +249,41 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { Bound::Unbounded => true, }; let right_condition = match (self.right, rightmost_bound) { - (Bound::Included(right), Bound::Included(rightmost)) => rightmost <= right, - (Bound::Included(right), Bound::Excluded(rightmost)) => rightmost < right, - // e.g. x < 8 and rightmost is <= y - // condition met if rightmost < 8 - (Bound::Excluded(right), Bound::Included(rightmost)) => rightmost < right, - // e.g. x < 8 and rightmost is < y - // condition met only if y <= 8? - (Bound::Excluded(right), Bound::Excluded(rightmost)) => rightmost <= right, - // e.g. x < inf. , so yes we take the whole thing - (Bound::Unbounded, _) => true, - // e.g. x < 7 , righmost is inf - (_, Bound::Unbounded) => false, // panic? 
+ (Bound::Included(right), Bound::Included(rightmost)) => { + // we need to stay within the bound ..=right + // the element's range goes to ..=rightmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Included(right), Bound::Excluded(rightmost)) => { + // we need to stay within the bound ..=right + // the element's range goes to ..rightmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Excluded(right), Bound::Included(rightmost)) => { + // we need to stay within the bound ..right + // the element's range goes to ..=rightmost + // so the element fits entirely within the bound if rightmost < right + rightmost < right + } + (Bound::Excluded(right), Bound::Excluded(rightmost)) => { + // we need to stay within the bound ..right + // the element's range goes to ..rightmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Unbounded, _) => { + // we need to stay within the bound ..inf + // so the element always fits entirely within the bound + true + } + (_, Bound::Unbounded) => { + // we need to stay within a finite bound + // but the element's range goes to ..inf + // so the element never fits entirely within the bound + false + } }; left_condition && right_condition }; @@ -262,7 +310,10 @@ mod tests { use super::find_docids_of_facet_within_bounds; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; - use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_index_with_multiple_field_ids, + get_simple_index, get_simple_index_with_multiple_field_ids, + }; use crate::snapshot_tests::display_bitmap; #[test] fn filter_range_increasing() { - let indexes = [get_simple_index(), get_random_looking_index()]; + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let mut results = String::new(); @@ -316,7 +372,12 @@ } #[test] fn filter_range_decreasing() { - let indexes = [get_simple_index(), get_random_looking_index()]; + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); @@ -367,7 +428,12 @@ } #[test] fn filter_range_pinch() { - let indexes = [get_simple_index(), get_random_looking_index()]; + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap();
diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index be04fbd7f..c854b546d 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -119,4 +119,35 @@ pub(crate) mod tests { txn.commit().unwrap(); index } + pub fn get_simple_index_with_multiple_field_ids() -> FacetIndex<OrderedF64Codec> { + let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for fid in 0..2 { + for i in 0..256u16 { + let mut
bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, fid, &(i as f64), &bitmap); + } + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + for fid in 0..2 { + for (_i, &key) in keys.iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, fid, &(key as f64), &bitmap); + } + } + txn.commit().unwrap(); + index + } } diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap new file mode 100644 index 000000000..7bf13e05c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +fcedc563a82c1c61f50174a5f3f982b6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap new file mode 100644 index 000000000..100b928d7 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +6cc26e77fc6bd9145deedf14cf422b03 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap new file mode 100644 index 000000000..be0b06ded --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +57d35cfa419a19a1a1f8d7c8ef096e0f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap new file mode 100644 index 000000000..93fe17b0c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3dbe0547b42759795e9b16989df72cee diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap new file mode 100644 index 000000000..db11ce952 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +c1c7a0bb91d53d33724583b6d4a99f16 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap new file mode 100644 index 000000000..f5a81c121 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +12213d3f1047a0c3d08e4670a7d688e7 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap new file mode 100644 index 000000000..fa7242056 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ca59f20e043a4d52c49e15b10adf96bb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap new file mode 100644 index 000000000..a7611d8c1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +cb69e0fe10fb299bafe77514204379cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap new file mode 100644 index 000000000..07664807e --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3456db9a1bb94c33c1e9f656184ee711 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap new file mode 100644 index 000000000..ef530faa1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2127cd818b457e0611e0c8e1a871602a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap new file mode 100644 index 000000000..db8a314b0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +b976551ceff412bfb2ec9bfbda320bbb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap new file mode 100644 index 000000000..2b82e07e8 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7620ca1a96882c7147d3fd996570f9b3 From 0ade6998735e943dcaba9844814556d858a93319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 14:39:43 +0200 Subject: [PATCH 1734/1889] Don't crash when failing to decode using StrRef codec --- milli/src/heed_codec/facet/str_ref.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/heed_codec/facet/str_ref.rs b/milli/src/heed_codec/facet/str_ref.rs 
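Context for the one-line fix below: heed's BytesDecode::bytes_decode returns an Option, and a None is reported to the caller as a decoding error on the failing database operation, so returning None instead of unwrapping turns a process abort into a recoverable error. A paraphrased sketch of the changed method body, not the patch itself:

    use std::str;

    // Before: str::from_utf8(bytes).unwrap() panicked on invalid UTF-8.
    // After: the failure is propagated as None, so the caller sees an error.
    fn bytes_decode(bytes: &[u8]) -> Option<&str> {
        str::from_utf8(bytes).ok()
    }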
index 36e702627..ced5cc65e 100644 --- a/milli/src/heed_codec/facet/str_ref.rs +++ b/milli/src/heed_codec/facet/str_ref.rs @@ -16,7 +16,7 @@ impl<'a> BytesDecode<'a> for StrRefCodec { type DItem = &'a str; fn bytes_decode(bytes: &'a [u8]) -> Option { - let s = std::str::from_utf8(bytes).unwrap(); + let s = std::str::from_utf8(bytes).ok()?; Some(s) } } From 1165ba217197f7abae6ee4e9d9b159bc09cdf275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 15:53:39 +0200 Subject: [PATCH 1735/1889] Make facet deletion incremental --- milli/src/update/delete_documents.rs | 92 ++++++++------------------- milli/src/update/facet/bulk.rs | 19 ++++-- milli/src/update/facet/delete.rs | 92 +++++++++++++++++++++++++++ milli/src/update/facet/incremental.rs | 48 +++++++------- milli/src/update/facet/mod.rs | 31 +++++++-- 5 files changed, 182 insertions(+), 100 deletions(-) create mode 100644 milli/src/update/facet/delete.rs diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index a56a61026..de2f4480c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,4 +1,5 @@ use std::collections::btree_map::Entry; +use std::collections::{HashMap, HashSet}; use fst::IntoStreamer; use heed::types::{ByteSlice, DecodeIgnore, Str}; @@ -8,17 +9,16 @@ use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; -use super::{ClearDocuments, FacetsUpdateBulk}; +use super::facet::delete::FacetsDelete; +use super::ClearDocuments; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetIgnoreCodec, -}; +use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, - RoaringBitmapCodec, SmallString32, BEU32, + ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, + SmallString32, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -444,13 +444,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } for facet_type in [FacetType::Number, FacetType::String] { - remove_docids_from_facet_id_docids( - self.wtxn, - self.index, - &self.to_delete_docids, - fields_ids_map.clone(), - facet_type, - )?; + let mut affected_facet_values = HashMap::new(); for field_id in self.index.faceted_fields_ids(self.wtxn)? { // Remove docids from the number faceted documents ids let mut docids = @@ -458,14 +452,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docids -= &self.to_delete_docids; self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; - remove_docids_from_field_id_docid_facet_value( + let facet_values = remove_docids_from_field_id_docid_facet_value( &self.index, self.wtxn, facet_type, field_id, &self.to_delete_docids, )?; + if !facet_values.is_empty() { + affected_facet_values.insert(field_id, facet_values); + } } + FacetsDelete::new( + self.index, + facet_type, + affected_facet_values, + &self.to_delete_docids, + ) + .execute(self.wtxn)?; } // We delete the documents ids that are under the facet field id values. 
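The new milli/src/update/facet/delete.rs introduced further below chooses a deletion strategy per field: when a deletion touches at least one fiftieth of the entries of the facet database, the upper tree levels are rebuilt in bulk; otherwise each affected facet value is removed incrementally. A standalone sketch of that decision rule, with invented names, not the patch itself:

    enum Strategy {
        BulkRebuild, // rewrite the upper levels wholesale from level 0
        Incremental, // patch the tree one affected facet value at a time
    }

    // Mirrors `affected_facet_values.len() >= database.len() / 50`: past roughly
    // 2% of the database, a wholesale rebuild is assumed cheaper than many
    // individual tree updates.
    fn choose_strategy(affected_facet_values: u64, database_entries: u64) -> Strategy {
        if affected_facet_values >= database_entries / 50 {
            Strategy::BulkRebuild
        } else {
            Strategy::Incremental
        }
    }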
@@ -546,7 +550,7 @@ fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( facet_type: FacetType, field_id: FieldId, to_remove: &RoaringBitmap, -) -> heed::Result<()> { +) -> heed::Result>> { let db = match facet_type { FacetType::String => { index.field_id_docid_facet_strings.remap_types::() @@ -555,19 +559,23 @@ fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( index.field_id_docid_facet_f64s.remap_types::() } }; + let mut all_affected_facet_values = HashSet::default(); let mut iter = db .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::>(); while let Some(result) = iter.next() { - let ((_, docid, _), _) = result?; + let ((_, docid, facet_value), _) = result?; if to_remove.contains(docid) { + if !all_affected_facet_values.contains(facet_value) { + all_affected_facet_values.insert(facet_value.to_owned()); + } // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; } } - Ok(()) + Ok(all_affected_facet_values) } fn remove_docids_from_facet_id_exists_docids<'a, C>( @@ -595,54 +603,6 @@ where Ok(()) } -fn remove_docids_from_facet_id_docids<'a>( - wtxn: &'a mut heed::RwTxn, - index: &Index, - to_remove: &RoaringBitmap, - fields_ids_map: FieldsIdsMap, - facet_type: FacetType, -) -> Result<()> { - let db = match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }; - let mut modified = false; - for field_id in fields_ids_map.ids() { - let mut level0_prefix = vec![]; - level0_prefix.extend_from_slice(&field_id.to_be_bytes()); - level0_prefix.push(0); - let mut iter = db - .as_polymorph() - .prefix_iter_mut::<_, ByteSlice, FacetGroupValueCodec>(wtxn, &level0_prefix)?; - - while let Some(result) = iter.next() { - let (bytes, mut value) = result?; - let previous_len = value.bitmap.len(); - value.bitmap -= to_remove; - if value.bitmap.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - modified = true; - } else if value.bitmap.len() != previous_len { - let bytes = bytes.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&bytes, &value)? 
}; - modified = true; - } - } - } - if !modified { - return Ok(()); - } - let builder = FacetsUpdateBulk::new_not_updating_level_0(index, facet_type); - builder.execute(wtxn)?; - - Ok(()) -} #[cfg(test)] mod tests { diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index e82af5d66..d3db0a0fa 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -29,6 +29,7 @@ pub struct FacetsUpdateBulk<'i> { group_size: u8, min_level_size: u8, facet_type: FacetType, + field_ids: Vec, // None if level 0 does not need to be updated new_data: Option>, } @@ -36,20 +37,30 @@ pub struct FacetsUpdateBulk<'i> { impl<'i> FacetsUpdateBulk<'i> { pub fn new( index: &'i Index, + field_ids: Vec, facet_type: FacetType, new_data: grenad::Reader, group_size: u8, min_level_size: u8, ) -> FacetsUpdateBulk<'i> { - FacetsUpdateBulk { index, group_size, min_level_size, facet_type, new_data: Some(new_data) } + FacetsUpdateBulk { + index, + field_ids, + group_size, + min_level_size, + facet_type, + new_data: Some(new_data), + } } pub fn new_not_updating_level_0( index: &'i Index, + field_ids: Vec, facet_type: FacetType, ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, + field_ids, group_size: FACET_GROUP_SIZE, min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, @@ -61,7 +72,7 @@ impl<'i> FacetsUpdateBulk<'i> { pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let Self { index, group_size, min_level_size, facet_type, new_data } = self; + let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; let db = match facet_type { FacetType::String => { @@ -76,8 +87,6 @@ impl<'i> FacetsUpdateBulk<'i> { let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; - let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); - inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; Ok(()) @@ -405,7 +414,7 @@ mod tests { index.verify_structure_validity(&wtxn, 1); // delete all the elements for the facet id 0 for i in 0..100u32 { - index.delete(&mut wtxn, 0, &(i as f64), i); + index.delete_single_docid(&mut wtxn, 0, &(i as f64), i); } index.verify_structure_validity(&wtxn, 0); index.verify_structure_validity(&wtxn, 1); diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs new file mode 100644 index 000000000..efe1d800a --- /dev/null +++ b/milli/src/update/facet/delete.rs @@ -0,0 +1,92 @@ +use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use crate::{ + facet::FacetType, + heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, + update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}, + FieldId, Index, Result, +}; +use heed::RwTxn; +use roaring::RoaringBitmap; +use std::collections::{HashMap, HashSet}; + +pub struct FacetsDelete<'i, 'b> { + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + facet_type: FacetType, + affected_facet_values: HashMap>>, + docids_to_delete: &'b RoaringBitmap, + group_size: u8, + max_group_size: u8, + min_level_size: u8, +} +impl<'i, 'b> FacetsDelete<'i, 'b> { + pub fn new( + index: &'i Index, + facet_type: FacetType, + affected_facet_values: HashMap>>, + docids_to_delete: &'b RoaringBitmap, + ) -> Self { + let database = match facet_type { + FacetType::String => { + 
index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + Self { + index, + database, + facet_type, + affected_facet_values, + docids_to_delete, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, + } + } + + pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { + for (field_id, affected_facet_values) in self.affected_facet_values { + if affected_facet_values.len() >= (self.database.len(wtxn)? / 50) { + // Bulk delete + let mut modified = false; + + for facet_value in affected_facet_values { + let key = + FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; + let mut old = self.database.get(wtxn, &key)?.unwrap(); + let previous_len = old.bitmap.len(); + old.bitmap -= self.docids_to_delete; + if old.bitmap.is_empty() { + modified = true; + self.database.delete(wtxn, &key)?; + } else if old.bitmap.len() != previous_len { + modified = true; + self.database.put(wtxn, &key, &old)?; + } + } + if modified { + let builder = FacetsUpdateBulk::new_not_updating_level_0( + self.index, + vec![field_id], + self.facet_type, + ); + builder.execute(wtxn)?; + } + } else { + // Incremental + let inc = FacetsUpdateIncrementalInner { + db: self.database, + group_size: self.group_size, + min_level_size: self.min_level_size, + max_group_size: self.max_group_size, + }; + for facet_value in affected_facet_values { + inc.delete(wtxn, field_id, facet_value.as_slice(), &self.docids_to_delete)?; + } + } + } + Ok(()) + } +} diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index c2115aee5..895713d43 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -485,20 +485,20 @@ impl FacetsUpdateIncrementalInner { field_id: u16, level: u8, facet_value: &[u8], - docid: u32, + docids: &RoaringBitmap, ) -> Result { if level == 0 { - return self.delete_in_level_0(txn, field_id, facet_value, docid); + return self.delete_in_level_0(txn, field_id, facet_value, docids); } let (deletion_key, mut bitmap) = self.find_insertion_key_value(field_id, level, facet_value, txn)?; - let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docid)?; + let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; let mut decrease_size = false; let next_key = match result { DeletionResult::InPlace => { - bitmap.bitmap.remove(docid); + bitmap.bitmap -= docids; self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; return Ok(DeletionResult::InPlace); } @@ -527,7 +527,7 @@ impl FacetsUpdateIncrementalInner { if reduced_range { updated_deletion_key.left_bound = next_key.clone().unwrap(); } - updated_value.bitmap.remove(docid); + updated_value.bitmap -= docids; let _ = self.db.delete(txn, &deletion_key.as_ref())?; self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; if reduced_range { @@ -543,11 +543,11 @@ impl FacetsUpdateIncrementalInner { txn: &'t mut RwTxn, field_id: u16, facet_value: &[u8], - docid: u32, + docids: &RoaringBitmap, ) -> Result { let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; - bitmap.remove(docid); + bitmap -= docids; if bitmap.is_empty() { let mut next_key = None; @@ -571,7 +571,7 @@ impl FacetsUpdateIncrementalInner { txn: &'t mut RwTxn, field_id: u16, facet_value: &[u8], - docid: u32, + docids: &RoaringBitmap, ) -> Result<()> { if self .db @@ 
-584,7 +584,7 @@ impl FacetsUpdateIncrementalInner { let highest_level = get_highest_level(&txn, self.db, field_id)?; let result = - self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docid)?; + self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; match result { DeletionResult::InPlace => return Ok(()), DeletionResult::Reduce { .. } => return Ok(()), @@ -807,7 +807,7 @@ mod tests { for i in (200..256).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -816,7 +816,7 @@ mod tests { for i in (150..200).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -824,7 +824,7 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in (100..150).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -832,14 +832,14 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in (17..100).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); milli_snap!(format!("{index}"), 17); let mut txn = index.env.write_txn().unwrap(); for i in (15..17).into_iter().rev() { - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -847,7 +847,7 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in (0..15).into_iter().rev() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -867,7 +867,7 @@ mod tests { } for i in 0..128 { - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -875,7 +875,7 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in 128..216 { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -883,7 +883,7 @@ mod tests { let mut txn = index.env.write_txn().unwrap(); for i in 216..256 { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(i as f64), i as u32); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -908,7 +908,7 @@ mod tests { for i in 0..128 { let key = keys[i]; index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(key as f64), key as u32); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -917,7 +917,7 @@ mod tests { for i 
in 128..216 { let key = keys[i]; index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(key as f64), key as u32); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -926,7 +926,7 @@ mod tests { for i in 216..256 { let key = keys[i]; index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(key as f64), key as u32); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -979,7 +979,7 @@ mod tests { for &key in keys.iter() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &(key as f64), key + 100); + index.delete_single_docid(&mut txn, 0, &(key as f64), key + 100); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -1010,7 +1010,7 @@ mod tests { for &key in keys.iter() { index.verify_structure_validity(&txn, 0); - index.delete(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); + index.delete_single_docid(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); } index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); @@ -1131,7 +1131,7 @@ mod fuzz { OperationKind::Delete(value) => { if let Some(keys) = value_to_keys.get(value) { for key in keys { - index.delete(&mut txn, *field_id, key, *value as u32); + index.delete_single_docid(&mut txn, *field_id, key, *value as u32); trivial_db.delete(*field_id, *key, *value as u32); } } diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index c5046784f..c75713158 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -74,15 +74,15 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8; pub const FACET_GROUP_SIZE: u8 = 4; pub const FACET_MIN_LEVEL_SIZE: u8 = 5; -use std::fs::File; - use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::{Index, Result}; +use std::fs::File; pub mod bulk; +pub mod delete; pub mod incremental; pub struct FacetsUpdate<'i> { @@ -120,8 +120,11 @@ impl<'i> FacetsUpdate<'i> { return Ok(()); } if self.new_data.len() >= (self.database.len(wtxn)? 
as u64 / 50) { + let field_ids = + self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); let bulk_update = FacetsUpdateBulk::new( self.index, + field_ids, self.facet_type, self.new_data, self.group_size, @@ -273,12 +276,12 @@ pub(crate) mod tests { let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); } - pub fn delete<'a>( + pub fn delete_single_docid<'a>( &self, wtxn: &'a mut RwTxn, field_id: u16, key: &'a >::EItem, - value: u32, + docid: u32, ) { let update = FacetsUpdateIncrementalInner { db: self.content, @@ -287,7 +290,25 @@ pub(crate) mod tests { max_group_size: self.max_group_size.get(), }; let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - update.delete(wtxn, field_id, &key_bytes, value).unwrap(); + let mut docids = RoaringBitmap::new(); + docids.insert(docid); + update.delete(wtxn, field_id, &key_bytes, &docids).unwrap(); + } + pub fn delete<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(wtxn, field_id, &key_bytes, docids).unwrap(); } pub fn bulk_insert<'a, 'b>( From a034a1e628175fcc046741037670bf030bda056c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 09:42:55 +0200 Subject: [PATCH 1736/1889] Move StrRefCodec and ByteSliceRefCodec to their own files --- milli/Cargo.toml | 2 +- milli/src/heed_codec/byte_slice_ref.rs | 23 ++++++++++++++++ milli/src/heed_codec/facet/mod.rs | 26 +++---------------- milli/src/heed_codec/mod.rs | 4 +++ milli/src/heed_codec/{facet => }/str_ref.rs | 0 milli/src/index.rs | 3 ++- milli/src/search/criteria/asc_desc.rs | 7 ++--- milli/src/search/facet/facet_distribution.rs | 13 +++++++--- .../search/facet/facet_distribution_iter.rs | 22 +++++++++------- milli/src/search/facet/facet_range_search.rs | 18 +++++++------ .../src/search/facet/facet_sort_ascending.rs | 13 ++++++---- .../src/search/facet/facet_sort_descending.rs | 20 +++++++------- milli/src/search/facet/mod.rs | 12 ++++----- milli/src/update/facet/bulk.rs | 19 +++++++------- milli/src/update/facet/delete.rs | 13 +++++----- milli/src/update/facet/incremental.rs | 25 ++++++++++-------- milli/src/update/facet/mod.rs | 24 +++++++++-------- .../extract/extract_facet_string_docids.rs | 3 ++- 18 files changed, 140 insertions(+), 107 deletions(-) create mode 100644 milli/src/heed_codec/byte_slice_ref.rs rename milli/src/heed_codec/{facet => }/str_ref.rs (100%) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 49988da0b..b768476e3 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,7 +57,7 @@ md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } [target.'cfg(fuzzing)'.dev-dependencies] -fuzzcheck = { git = "https://github.com/loiclec/fuzzcheck-rs", branch = "main" } +fuzzcheck = { git = "https://github.com/loiclec/fuzzcheck-rs", branch = "main" } # TODO: use released version [features] default = [ "charabia/default" ] diff --git a/milli/src/heed_codec/byte_slice_ref.rs b/milli/src/heed_codec/byte_slice_ref.rs new file mode 100644 index 000000000..48eda63c5 --- /dev/null +++ b/milli/src/heed_codec/byte_slice_ref.rs @@ -0,0 +1,23 @@ +use std::borrow::Cow; + +use heed::{BytesDecode, BytesEncode}; + +/// A codec for values of type 
+/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated
+/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure.
+pub struct ByteSliceRefCodec;
+
+impl<'a> BytesEncode<'a> for ByteSliceRefCodec {
+    type EItem = &'a [u8];
+
+    fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+        Some(Cow::Borrowed(item))
+    }
+}
+
+impl<'a> BytesDecode<'a> for ByteSliceRefCodec {
+    type DItem = &'a [u8];
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        Some(bytes)
+    }
+}

diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs
index 35ec925dc..a727b148f 100644
--- a/milli/src/heed_codec/facet/mod.rs
+++ b/milli/src/heed_codec/facet/mod.rs
@@ -1,6 +1,5 @@
 mod field_doc_id_facet_codec;
 mod ordered_f64_codec;
-mod str_ref;

 use std::borrow::Cow;
 use std::convert::TryFrom;
@@ -12,9 +11,10 @@ use roaring::RoaringBitmap;

 pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
 pub use self::ordered_f64_codec::OrderedF64Codec;
-pub use self::str_ref::StrRefCodec;
 use crate::{CboRoaringBitmapCodec, BEU16};

+use super::StrRefCodec;
+
 pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
 pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
 pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec;
@@ -33,7 +33,7 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {

 /// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
 /// databases.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] // TODO: try removing PartialOrd and Ord
 pub struct FacetGroupKey<T> {
     pub field_id: u16,
     pub level: u8,
@@ -103,23 +103,3 @@ impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
         Some(FacetGroupValue { size, bitmap })
     }
 }
-
-/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated
-/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure.
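
As an aside, a minimal round-trip through the renamed codec (a sketch, not part
of the patch; it assumes the `milli::heed_codec::ByteSliceRefCodec` re-export
added below and heed's `BytesEncode`/`BytesDecode` traits):

    use std::borrow::Cow;
    use heed::{BytesDecode, BytesEncode};
    use milli::heed_codec::ByteSliceRefCodec;

    // EItem == DItem == &'a [u8]: encoding borrows the input slice and
    // decoding hands the same bytes back, with no copy in either direction.
    fn roundtrip(bytes: &[u8]) {
        let encoded: Cow<[u8]> = ByteSliceRefCodec::bytes_encode(&bytes).unwrap();
        let decoded: &[u8] = ByteSliceRefCodec::bytes_decode(&encoded).unwrap();
        assert_eq!(bytes, decoded);
    }
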
-pub struct ByteSliceRef; - -impl<'a> BytesEncode<'a> for ByteSliceRef { - type EItem = &'a [u8]; - - fn bytes_encode(item: &'a Self::EItem) -> Option> { - Some(Cow::Borrowed(item)) - } -} - -impl<'a> BytesDecode<'a> for ByteSliceRef { - type DItem = &'a [u8]; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - Some(bytes) - } -} diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index e07e47c79..6a058f95f 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -1,10 +1,12 @@ mod beu32_str_codec; +mod byte_slice_ref; pub mod facet; mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; mod str_beu32_codec; +mod str_ref; mod str_str_u8_codec; pub use self::beu32_str_codec::BEU32StrCodec; @@ -16,3 +18,5 @@ pub use self::roaring_bitmap_length::{ }; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; +pub use byte_slice_ref::ByteSliceRefCodec; +pub use str_ref::StrRefCodec; diff --git a/milli/src/heed_codec/facet/str_ref.rs b/milli/src/heed_codec/str_ref.rs similarity index 100% rename from milli/src/heed_codec/facet/str_ref.rs rename to milli/src/heed_codec/str_ref.rs diff --git a/milli/src/index.rs b/milli/src/index.rs index 893817d59..7c5e92d05 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -16,8 +16,9 @@ use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, - FieldIdCodec, OrderedF64Codec, StrRefCodec, + FieldIdCodec, OrderedF64Codec, }; +use crate::heed_codec::StrRefCodec; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 586605116..fd03b1b60 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,8 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; +use crate::heed_codec::facet::FacetGroupKeyCodec; +use crate::heed_codec::ByteSliceRefCodec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::search::query_tree::Operation; @@ -194,14 +195,14 @@ fn facet_ordered<'t>( let number_iter = make_iter( rtxn, - index.facet_id_f64_docids.remap_key_type::>(), + index.facet_id_f64_docids.remap_key_type::>(), field_id, candidates.clone(), )?; let string_iter = make_iter( rtxn, - index.facet_id_string_docids.remap_key_type::>(), + index.facet_id_string_docids.remap_key_type::>(), field_id, candidates, )?; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 2e2e448c2..f6a53dbd4 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -9,9 +9,10 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, OrderedF64Codec, StrRefCodec, + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, 
FieldDocIdFacetStringCodec, + OrderedF64Codec, }; +use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; use crate::search::facet::facet_distribution_iter; use crate::{FieldId, Index, Result}; @@ -137,7 +138,9 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - self.index.facet_id_f64_docids.remap_key_type::>(), + self.index + .facet_id_f64_docids + .remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids, _| { @@ -160,7 +163,9 @@ impl<'a> FacetDistribution<'a> { ) -> heed::Result<()> { facet_distribution_iter::iterate_over_facet_distribution( self.rtxn, - self.index.facet_id_string_docids.remap_key_type::>(), + self.index + .facet_id_string_docids + .remap_key_type::>(), field_id, candidates, |facet_key, nbr_docids, any_docid| { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 4c6dc75fa..0fdca4118 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -4,9 +4,8 @@ use heed::Result; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; -use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, -}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; use crate::DocumentId; /// Call the given closure on the facet distribution of the candidate documents. @@ -22,7 +21,7 @@ use crate::DocumentId; /// keep iterating over the different facet values or stop. pub fn iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: &RoaringBitmap, callback: CB, @@ -31,10 +30,13 @@ where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { let mut fd = FacetDistribution { rtxn, db, field_id, callback }; - let highest_level = - get_highest_level(rtxn, db.remap_key_type::>(), field_id)?; + let highest_level = get_highest_level( + rtxn, + db.remap_key_type::>(), + field_id, + )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; return Ok(()); } else { @@ -47,7 +49,7 @@ where CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, callback: CB, } @@ -72,11 +74,13 @@ where if key.field_id != self.field_id { return Ok(ControlFlow::Break(())); } + // TODO: use real intersection and then take min()? let docids_in_common = value.bitmap.intersection_len(candidates); if docids_in_common > 0 { + // TODO: use min() let any_docid = value.bitmap.iter().next().unwrap(); match (self.callback)(key.left_bound, docids_in_common, any_docid)? 
{ - ControlFlow::Continue(_) => {} + ControlFlow::Continue(_) => (), // TODO use unit instead of empty scope ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index a7b4674f1..07300e920 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,9 +4,8 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, -}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; use crate::Result; /// Find all the document ids for which the given field contains a value contained within @@ -47,13 +46,16 @@ where } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); + let db = db.remap_key_type::>(); let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { - let rightmost_bound = - Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? + { + let rightmost_bound = Bound::Included( + get_last_facet_value::(rtxn, db, field_id)?.unwrap(), + ); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) @@ -65,7 +67,7 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 2b0a45e15..2f1f73db3 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -3,8 +3,9 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level}; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +use crate::heed_codec::ByteSliceRefCodec; /// Return an iterator which iterates over the given candidate documents in /// ascending order of their facet value for the given field id. @@ -30,12 +31,12 @@ use crate::heed_codec::facet::{ /// Note that once a document id is returned by the iterator, it is never returned again. pub fn ascending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); @@ -47,11 +48,13 @@ pub fn ascending_facet_sort<'t>( struct AscendingFacetSort<'t, 'e> { rtxn: &'t heed::RoTxn<'e>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, - std::iter::Take, FacetGroupValueCodec>>, + std::iter::Take< + heed::RoRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + >, )>, } diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 47d0f145b..5f09d708b 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -5,22 +5,23 @@ use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +use crate::heed_codec::ByteSliceRefCodec; /// See documentationg for [`ascending_facet_sort`](super::ascending_facet_sort). /// /// This function does the same thing, but in the opposite order. pub fn descending_facet_sort<'t>( rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: RoaringBitmap, ) -> Result> + 't>> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(Box::new(DescendingFacetSort { @@ -36,12 +37,12 @@ pub fn descending_facet_sort<'t>( struct DescendingFacetSort<'t> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, stack: Vec<( RoaringBitmap, std::iter::Take< - heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, >, Bound<&'t [u8]>, )>, @@ -97,7 +98,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { *right_bound = Bound::Excluded(left_bound); let iter = match self .db - .remap_key_type::>() + .remap_key_type::>() .rev_range( &self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow), @@ -121,7 +122,8 @@ impl<'t> Iterator for DescendingFacetSort<'t> { mod tests { use roaring::RoaringBitmap; - use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec}; + use crate::heed_codec::facet::FacetGroupKeyCodec; + use crate::heed_codec::ByteSliceRefCodec; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; @@ -134,7 +136,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let db = index.content.remap_key_type::>(); + let db = 
index.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index c854b546d..ccf40d6aa 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -5,8 +5,8 @@ use heed::{BytesDecode, RoTxn}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::Filter; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; - +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; @@ -17,7 +17,7 @@ mod filter; /// Get the first facet value in the facet database pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -42,7 +42,7 @@ where /// Get the last facet value in the facet database pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result> where @@ -67,7 +67,7 @@ where /// Get the height of the highest level in the facet database pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupValueCodec>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); @@ -77,7 +77,7 @@ pub(crate) fn get_highest_level<'t>( .next() .map(|el| { let (key, _) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); key.level }) .unwrap_or(0)) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index d3db0a0fa..4e10c22dd 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -11,8 +11,9 @@ use time::OffsetDateTime; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::facet::FacetType; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +use crate::heed_codec::ByteSliceRefCodec; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; @@ -75,11 +76,11 @@ impl<'i> FacetsUpdateBulk<'i> { let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; let db = match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; @@ -98,7 +99,7 @@ impl<'i> FacetsUpdateBulk<'i> { /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { - pub db: heed::Database, FacetGroupValueCodec>, + pub db: heed::Database, FacetGroupValueCodec>, pub new_data: Option>, pub group_size: u8, pub min_level_size: u8, @@ -216,7 +217,7 @@ impl FacetsUpdateBulkInner { .db .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? 
- .remap_types::, FacetGroupValueCodec>(); + .remap_types::, FacetGroupValueCodec>(); let mut left_bound: &[u8] = &[]; let mut first_iteration_for_new_group = true; @@ -299,7 +300,7 @@ impl FacetsUpdateBulkInner { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id, level, left_bound }; - let key = FacetGroupKeyCodec::::bytes_encode(&key) + let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = @@ -328,7 +329,7 @@ impl FacetsUpdateBulkInner { bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { let key = FacetGroupKey { field_id, level, left_bound }; - let key = FacetGroupKeyCodec::::bytes_encode(&key) + let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index efe1d800a..74c17e8f2 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -1,7 +1,8 @@ use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::{ facet::FacetType, - heed_codec::facet::{ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, + heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, + heed_codec::ByteSliceRefCodec, update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}, FieldId, Index, Result, }; @@ -11,7 +12,7 @@ use std::collections::{HashMap, HashSet}; pub struct FacetsDelete<'i, 'b> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, affected_facet_values: HashMap>>, docids_to_delete: &'b RoaringBitmap, @@ -27,11 +28,11 @@ impl<'i, 'b> FacetsDelete<'i, 'b> { docids_to_delete: &'b RoaringBitmap, ) -> Self { let database = match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; Self { diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 895713d43..9dda86a46 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -7,8 +7,9 @@ use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; +use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; @@ -50,10 +51,10 @@ impl<'i> FacetsUpdateIncremental<'i> { db: match facet_type { FacetType::String => index .facet_id_string_docids - .remap_key_type::>(), + .remap_key_type::>(), FacetType::Number => index .facet_id_f64_docids - .remap_key_type::>(), + .remap_key_type::>(), }, group_size, max_group_size, @@ -69,7 +70,7 @@ impl<'i> FacetsUpdateIncremental<'i> { let mut cursor = self.new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ - let key = FacetGroupKeyCodec::::bytes_decode(key) + let key = FacetGroupKeyCodec::::bytes_decode(key) .ok_or(heed::Error::Encoding)?; let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; @@ -87,7 +88,7 @@ impl<'i> FacetsUpdateIncremental<'i> { /// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type pub struct FacetsUpdateIncrementalInner { - pub db: heed::Database, FacetGroupValueCodec>, + pub db: heed::Database, FacetGroupValueCodec>, pub group_size: u8, pub min_level_size: u8, pub max_group_size: u8, @@ -126,7 +127,7 @@ impl FacetsUpdateIncrementalInner { if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; Ok(( - FacetGroupKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? .into_owned(), value, @@ -149,7 +150,7 @@ impl FacetsUpdateIncrementalInner { )?; let (key_bytes, value) = iter.next().unwrap()?; Ok(( - FacetGroupKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)? .into_owned(), value, @@ -411,7 +412,7 @@ impl FacetsUpdateIncrementalInner { let mut values = RoaringBitmap::new(); for _ in 0..group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)?; if first_key.is_none() { @@ -434,7 +435,7 @@ impl FacetsUpdateIncrementalInner { let mut values = RoaringBitmap::new(); for _ in 0..nbr_leftover_elements { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) .ok_or(Error::Encoding)?; if first_key.is_none() { @@ -616,7 +617,7 @@ impl FacetsUpdateIncrementalInner { while let Some(el) = iter.next() { let (k, _) = el?; to_delete.push( - FacetGroupKeyCodec::::bytes_decode(k) + FacetGroupKeyCodec::::bytes_decode(k) .ok_or(Error::Encoding)? 
.into_owned(), ); @@ -655,7 +656,8 @@ mod tests { use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::StrRefCodec; use crate::milli_snap; use crate::update::facet::tests::FacetIndex; @@ -1019,6 +1021,7 @@ mod tests { // fuzz tests } + #[cfg(all(test, fuzzing))] mod fuzz { use std::borrow::Cow; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index c75713158..a6d8c3d60 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -77,7 +77,8 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5; use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; use crate::{Index, Result}; use std::fs::File; @@ -87,7 +88,7 @@ pub mod incremental; pub struct FacetsUpdate<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, + database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, new_data: grenad::Reader, group_size: u8, @@ -97,11 +98,11 @@ pub struct FacetsUpdate<'i> { impl<'i> FacetsUpdate<'i> { pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { let database = match facet_type { - FacetType::String => { - index.facet_id_string_docids.remap_key_type::>() - } + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() + index.facet_id_f64_docids.remap_key_type::>() } }; Self { @@ -159,8 +160,9 @@ pub(crate) mod tests { use super::bulk::FacetsUpdateBulkInner; use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; + use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; use crate::snapshot_tests::display_bitmap; use crate::update::FacetsUpdateIncrementalInner; @@ -173,7 +175,7 @@ pub(crate) mod tests { BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, { pub env: Env, - pub content: heed::Database, FacetGroupValueCodec>, + pub content: heed::Database, FacetGroupValueCodec>, pub group_size: Cell, pub min_level_size: Cell, pub max_group_size: Cell, @@ -327,7 +329,7 @@ pub(crate) mod tests { let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); let key: FacetGroupKey<&[u8]> = FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; - let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); let value = CboRoaringBitmapCodec::bytes_encode(&docids).unwrap(); writer.insert(&key, &value).unwrap(); } @@ -362,7 +364,7 @@ pub(crate) mod tests { .unwrap(); while let Some(el) = iter.next() { let (key, value) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); let mut prefix_start_below = vec![]; prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); @@ -379,7 +381,7 @@ pub(crate) mod tests { ) .unwrap(); let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); - FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() + 
FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(&key_bytes).unwrap()
             };

             assert!(value.size > 0);

diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index bf523cbb3..221356ba0 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -4,7 +4,8 @@ use std::io;
 use heed::BytesEncode;

 use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
-use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, StrRefCodec};
+use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
+use crate::heed_codec::StrRefCodec;
 use crate::update::index_documents::merge_cbo_roaring_bitmaps;
 use crate::{FieldId, Result};

From acc8caebe62f758794e25cdeb71ac88dd380ee3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Wed, 12 Oct 2022 09:46:31 +0200
Subject: [PATCH 1737/1889] Add link to GitHub PR to the documentation of the update/facet module

---
 milli/src/update/facet/mod.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index a6d8c3d60..a5d527282 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -68,6 +68,8 @@ bulk method is faster when a large part of the database is modified.
 Empirically, it takes 50x more time to incrementally add N facet values to an
 existing database than to construct a database of N facet values from scratch.
 This is the heuristic used to choose between the two methods.
+
+Related PR: https://github.com/meilisearch/milli/pull/619
 */

 pub const FACET_MAX_GROUP_SIZE: u8 = 8;

From 2295e0e3ce32d72c9960d1ebfc04b637d07b5047 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Wed, 12 Oct 2022 10:23:40 +0200
Subject: [PATCH 1738/1889] Use real delete function in facet indexing fuzz tests

By deleting multiple docids at once instead of one-by-one

---
 milli/src/update/facet/incremental.rs | 49 ++++++++++++++++++---------
 milli/src/update/facet/mod.rs         | 18 ++++------
 2 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs
index 9dda86a46..a4c756aec 100644
--- a/milli/src/update/facet/incremental.rs
+++ b/milli/src/update/facet/incremental.rs
@@ -1018,25 +1018,26 @@ mod tests {
         txn.commit().unwrap();
         milli_snap!(format!("{index}"), "after_delete");
     }
-
-    // fuzz tests
 }

+// fuzz tests
 #[cfg(all(test, fuzzing))]
 mod fuzz {
     use std::borrow::Cow;
     use std::collections::{BTreeMap, HashMap};
     use std::convert::TryFrom;
+    use std::iter::FromIterator;
     use std::rc::Rc;

+    use fuzzcheck::mutators::integer::U8Mutator;
     use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator};
+    use fuzzcheck::mutators::vector::VecMutator;
     use fuzzcheck::DefaultMutator;
     use heed::BytesEncode;
     use roaring::RoaringBitmap;
     use tempfile::TempDir;

     use super::*;
-    use crate::milli_snap;
     use crate::update::facet::tests::FacetIndex;

     struct NEU16Codec;
@@ -1074,10 +1075,10 @@
             *values |= new_values;
         }
         #[no_coverage]
-        pub fn delete(&mut self, field_id: u16, key: T, value: u32) {
+        pub fn delete(&mut self, field_id: u16, key: T, values_to_remove: &RoaringBitmap) {
             if let Some(values_field_id) = self.elements.get_mut(&field_id) {
                 if let Some(values) = values_field_id.get_mut(&key) {
-                    values.remove(value);
+                    *values -= values_to_remove;
                    if
values.is_empty() { values_field_id.remove(&key); } @@ -1103,8 +1104,14 @@ mod fuzz { } #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] enum OperationKind { - Insert(Vec), - Delete(u8), + Insert( + #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] + Vec, + ), + Delete( + #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] + Vec, + ), } #[no_coverage] @@ -1131,13 +1138,23 @@ mod fuzz { index.insert(&mut txn, *field_id, key, &bitmap); trivial_db.insert(*field_id, *key, &bitmap); } - OperationKind::Delete(value) => { - if let Some(keys) = value_to_keys.get(value) { - for key in keys { - index.delete_single_docid(&mut txn, *field_id, key, *value as u32); - trivial_db.delete(*field_id, *key, *value as u32); + OperationKind::Delete(values) => { + let values = RoaringBitmap::from_iter(values.iter().copied().map(|x| x as u32)); + let mut values_per_key = HashMap::new(); + + for value in values { + if let Some(keys) = value_to_keys.get(&(value as u8)) { + for key in keys { + let values: &mut RoaringBitmap = + values_per_key.entry(key).or_default(); + values.insert(value); + } } } + for (key, values) in values_per_key { + index.delete(&mut txn, *field_id, &key, &values); + trivial_db.delete(*field_id, *key, &values); + } } } } @@ -1221,7 +1238,7 @@ mod fuzz { {"key":166, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[61]}}, {"key":183, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":50}} + {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[50]}} ] "#; let operations: Vec> = serde_json::from_str(operations).unwrap(); @@ -1250,7 +1267,7 @@ mod fuzz { {"key":200, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, {"key":93, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[98]}}, {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, - {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":210}} + {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[210]}} ] "#; let operations: Vec> = serde_json::from_str(operations).unwrap(); @@ -1285,7 +1302,7 @@ mod fuzz { let operations = r#"[ {"key":63499, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[87]}}, {"key":25374, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[14]}}, - {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":87}}, + {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":[87]}}, {"key":23038, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[173]}}, {"key":14862, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[8]}}, {"key":13145, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[5,64]}}, @@ -1337,7 +1354,7 @@ mod fuzz { "max_group_size":4, "min_level_size":25, "field_id":3, - "kind":{"Delete":11} + 
"kind":{"Delete":[11]} } ] "#; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index a5d527282..5fb5c9e48 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -76,13 +76,14 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8; pub const FACET_GROUP_SIZE: u8 = 4; pub const FACET_MIN_LEVEL_SIZE: u8 = 5; +use std::fs::File; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::ByteSliceRefCodec; use crate::{Index, Result}; -use std::fs::File; pub mod bulk; pub mod delete; @@ -153,6 +154,7 @@ impl<'i> FacetsUpdate<'i> { pub(crate) mod tests { use std::cell::Cell; use std::fmt::Display; + use std::iter::FromIterator; use std::marker::PhantomData; use std::rc::Rc; @@ -170,7 +172,7 @@ pub(crate) mod tests { use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; - // A dummy index that only contains the facet database, used for testing + /// A dummy index that only contains the facet database, used for testing pub struct FacetIndex where for<'a> BoundCodec: @@ -287,17 +289,9 @@ pub(crate) mod tests { key: &'a >::EItem, docid: u32, ) { - let update = FacetsUpdateIncrementalInner { - db: self.content, - group_size: self.group_size.get(), - min_level_size: self.min_level_size.get(), - max_group_size: self.max_group_size.get(), - }; - let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - let mut docids = RoaringBitmap::new(); - docids.insert(docid); - update.delete(wtxn, field_id, &key_bytes, &docids).unwrap(); + self.delete(wtxn, field_id, key, &RoaringBitmap::from_iter(std::iter::once(docid))) } + pub fn delete<'a>( &self, wtxn: &'a mut RwTxn, From ee1abfd1c18291a5bf7d9513c36ddb76663e4135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 10:25:56 +0200 Subject: [PATCH 1739/1889] Ignore files generated by fuzzcheck --- .gitignore | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 39623a232..edd3e675c 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ milli/target/ # Snapshots ## ... large *.full.snap - -# ... unreviewed +## ... 
unreviewed *.snap.new + +# Fuzzcheck data for the facet indexing fuzz test +milli/fuzz/update::facet::incremental::fuzz::fuzz/ From d885de16002e4c8aaf58602078cf8f88240b4d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Sep 2022 17:16:11 +0200 Subject: [PATCH 1740/1889] Add option to avoid soft deletion of documents --- milli/src/update/delete_documents.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index de2f4480c..2626c1555 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -26,6 +26,8 @@ pub struct DeleteDocuments<'t, 'u, 'i> { index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, to_delete_docids: RoaringBitmap, + #[cfg(test)] + disable_soft_delete: bool, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -46,9 +48,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index, external_documents_ids, to_delete_docids: RoaringBitmap::new(), + #[cfg(test)] + disable_soft_delete: false, }) } + #[cfg(test)] + fn disable_soft_delete(&mut self, disable: bool) { + self.disable_soft_delete = disable; + } + pub fn delete_document(&mut self, docid: u32) { self.to_delete_docids.insert(docid); } @@ -147,7 +156,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We run the deletion. // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents // We run the deletion. - if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { + let disable_soft_delete = { + #[cfg(not(test))] + { + false + } + #[cfg(test)] + { + self.disable_soft_delete + } + }; + if !disable_soft_delete + && percentage_available > 10 + && percentage_used_by_soft_deleted_documents < 10 + { self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; return Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), From ab5e56fd169dacaddc99b8abf39610d932222d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 25 Aug 2022 14:51:50 +0200 Subject: [PATCH 1741/1889] Add document deletion snapshot tests and tests for hard-deletion --- milli/src/update/delete_documents.rs | 179 ++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 29 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 2626c1555..cece56f4d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -27,7 +27,7 @@ pub struct DeleteDocuments<'t, 'u, 'i> { external_documents_ids: ExternalDocumentsIds<'static>, to_delete_docids: RoaringBitmap, #[cfg(test)] - disable_soft_delete: bool, + disable_soft_deletion: bool, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -49,13 +49,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { external_documents_ids, to_delete_docids: RoaringBitmap::new(), #[cfg(test)] - disable_soft_delete: false, + disable_soft_deletion: false, }) } #[cfg(test)] - fn disable_soft_delete(&mut self, disable: bool) { - self.disable_soft_delete = disable; + fn disable_soft_deletion(&mut self, disable: bool) { + self.disable_soft_deletion = disable; } pub fn delete_document(&mut self, docid: u32) { @@ -156,17 +156,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We run the deletion. // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents // We run the deletion. 
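
The heuristic being toggled here reads as a small predicate (a sketch, not
milli's exact code; the two percentages are computed just above from the disk
space used by the index and its soft-deleted documents):

    // Soft-delete only while more than 10% of the disk is still available and
    // soft-deleted documents account for less than 10% of what is used;
    // otherwise, or when the test-only escape hatch is set, fall through to a
    // real (hard) deletion.
    fn use_soft_deletion(
        disable_soft_deletion: bool,
        percentage_available: u64,
        percentage_used_by_soft_deleted_documents: u64,
    ) -> bool {
        !disable_soft_deletion
            && percentage_available > 10
            && percentage_used_by_soft_deleted_documents < 10
    }
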
- let disable_soft_delete = { + let disable_soft_deletion = { #[cfg(not(test))] { false } #[cfg(test)] { - self.disable_soft_delete + self.disable_soft_deletion } }; - if !disable_soft_delete + if !disable_soft_deletion && percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { @@ -634,7 +634,7 @@ mod tests { use super::*; use crate::index::tests::TempIndex; - use crate::Filter; + use crate::{db_snap, Filter}; fn delete_documents<'t>( wtxn: &mut RwTxn<'t, '_>, @@ -680,6 +680,10 @@ mod tests { wtxn.commit().unwrap(); + db_snap!(index, documents_ids, @"[]"); + db_snap!(index, word_docids, @""); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + let rtxn = index.read_txn().unwrap(); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); @@ -689,6 +693,10 @@ mod tests { fn delete_documents_with_strange_primary_key() { let index = TempIndex::new(); + index + .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) + .unwrap(); + let mut wtxn = index.write_txn().unwrap(); index .add_documents_using_wtxn( @@ -700,14 +708,32 @@ mod tests { ]), ) .unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); + db_snap!(index, word_docids, @r###" + benoit [2, ] + kevin [0, ] + kevina [1, ] + "###); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + + let mut wtxn = index.write_txn().unwrap(); // Delete not all of the documents but some of them. let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("0"); builder.delete_external_id("1"); builder.execute().unwrap(); - wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[2, ]"); + db_snap!(index, word_docids, @r###" + benoit [2, ] + kevin [0, ] + kevina [1, ] + "###); + db_snap!(index, soft_deleted_documents_ids, @"[0, 1, ]"); } #[test] @@ -727,26 +753,29 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] 
}, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); @@ -759,6 +788,86 @@ mod tests { assert!(results.documents_ids.is_empty()); wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[0, ]"); + db_snap!(index, word_docids, @"e89cd44832e960519823e12b1e7e28af"); + db_snap!(index, facet_id_f64_docids, @""); + db_snap!(index, facet_id_string_docids, @"720ee1ba8c18342f3714c5863bc6c1f5"); + } + #[test] + fn facet_hard_deletion() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + settings.set_filterable_fields(hashset! { S("label") }); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + // Delete not all of the documents but some of them. 
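
For reference, the call pattern exercised immediately below, written out as a
standalone sketch (assuming an open index and write transaction;
`disable_soft_deletion` is the test-only knob introduced in the previous patch):

    // Queue documents by external id, force a hard deletion, and execute.
    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
    builder.disable_soft_deletion(true);
    builder.delete_external_id("1_4");
    builder.delete_external_id("1_5");
    let result = builder.execute().unwrap();
    // `deleted_documents` counts the queued docids: 2 here if both ids exist.
    assert_eq!(result.deleted_documents, 2);
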
+ let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_external_id("1_4"); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, word_docids, 1, @"999733c2461093d4873321902fc8dcd7"); + db_snap!(index, facet_id_f64_docids, 1, @""); + db_snap!(index, facet_id_string_docids, 1, @"a12e80655ed5f0f8e869bb9c32af61e9"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete more than one document + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_external_id("1_5"); + builder.delete_external_id("1_7"); + builder.delete_external_id("1_70"); + builder.delete_external_id("1_72"); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); + db_snap!(index, word_docids, 2, @"b892636eaff43c917d5aa8b09c107a02"); + db_snap!(index, facet_id_f64_docids, 2, @""); + db_snap!(index, facet_id_string_docids, 2, @"b9946a9cb0ed2df40352e98d6836c8d0"); } #[test] @@ -814,6 +923,8 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[0, ]"); } #[test] @@ -869,6 +980,8 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); } #[test] @@ -923,6 +1036,10 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[4, 5, 6, 11, 16, 18, ]"); + db_snap!(index, facet_id_f64_docids, @"20727a38c0b1e1a20a44526b85cf2cbc"); + db_snap!(index, facet_id_string_docids, @""); } #[test] @@ -995,6 +1112,8 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); } #[test] @@ -1045,5 +1164,7 @@ mod tests { assert_eq!(Some(&2), results.get("number")); wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); } } From e3ba1fc88383da4eb265bdca07e277e869c33772 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 22 Sep 2022 14:01:13 +0200 Subject: [PATCH 1742/1889] Make deletion tests for both soft-deletion and hard-deletion --- milli/src/snapshot_tests.rs | 9 + milli/src/update/delete_documents.rs | 336 ++++++++---------- .../false/documents_ids.snap | 4 + .../false/facet_id_exists_docids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../false/word_docids.snap | 4 + .../false/word_pair_proximity_docids.snap | 4 + .../true/documents_ids.snap | 4 + .../true/facet_id_exists_docids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../true/word_docids.snap | 4 + .../true/word_pair_proximity_docids.snap | 4 + .../false/documents_ids.snap | 4 + .../false/facet_id_exists_docids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../false/word_docids.snap | 7 + .../false/word_pair_proximity_docids.snap | 4 + .../true/documents_ids.snap | 4 + .../true/facet_id_exists_docids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../true/word_docids.snap | 5 + .../true/word_pair_proximity_docids.snap | 4 + .../false/facet_id_exists_docids.snap | 6 + .../false/facet_id_f64_docids.snap | 6 + .../false/facet_id_string_docids.snap | 19 + .../false/soft_deleted_documents_ids.snap | 4 + .../false/word_docids.snap | 42 +++ .../false/word_pair_proximity_docids.snap | 4 + .../true/facet_id_exists_docids.snap | 6 + .../true/facet_id_f64_docids.snap | 6 + .../true/facet_id_string_docids.snap | 18 + 
.../true/soft_deleted_documents_ids.snap | 4 + .../true/word_docids.snap | 40 +++ .../true/word_pair_proximity_docids.snap | 4 + .../false/facet_id_f64_docids.snap | 48 +++ .../false/facet_id_string_docids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../true/facet_id_f64_docids.snap | 36 ++ .../true/facet_id_string_docids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + .../false/soft_deleted_documents_ids.snap | 4 + .../true/soft_deleted_documents_ids.snap | 4 + 46 files changed, 533 insertions(+), 179 deletions(-) create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap create 
mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap 
create mode 100644 milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 9bc39d882..389d7b7a2 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -287,6 +287,12 @@ pub fn snap_facet_id_f64_docids(index: &Index) -> String { }); snap } +pub fn snap_facet_id_exists_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, facet_id_exists_docids, |(facet_id, docids)| { + &format!("{facet_id:<3} {}", display_bitmap(&docids)) + }); + snap +} pub fn snap_facet_id_string_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, facet_id_string_docids, |( FacetGroupKey { field_id, level, left_bound }, @@ -488,6 +494,9 @@ macro_rules! full_snap_of_db { }}; ($index:ident, field_id_docid_facet_strings) => {{ $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) + }}; + ($index:ident, facet_id_exists_docids) => {{ + $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index cece56f4d..432e1497f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -640,6 +640,7 @@ mod tests { wtxn: &mut RwTxn<'t, '_>, index: &'t Index, external_ids: &[&str], + disable_soft_deletion: bool, ) -> Vec<u32> { let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); let ids_to_delete: Vec<u32> = external_ids @@ -649,14 +650,14 @@ mod tests { // Delete some documents.
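The refactor that begins here repeats one pattern across every test in the file: each former `#[test]` body becomes a private `_`-suffixed helper taking `disable_soft_deletion: bool`, and a thin `#[test]` wrapper calls it once with `true` and once with `false`. The inline `@"..."` snapshot literals are dropped at the same time, because passing the flag as the trailing `db_snap!` argument routes each run to its own snapshot file, which is what produces the `true/` and `false/` directories in the long list of created `.snap` files above. Below is a minimal sketch of the pattern; the helper name and document are made up, and it assumes the `TempIndex`, `DeleteDocuments`, `documents!`, and `db_snap!` test helpers behave exactly as shown in the surrounding diff.

```rust
// Sketch only: `toggle_deletion_` is hypothetical; the milli test helpers
// are assumed to work as in the surrounding patch.
fn toggle_deletion_(disable_soft_deletion: bool) {
    let index = TempIndex::new();
    let mut wtxn = index.write_txn().unwrap();
    index
        .add_documents_using_wtxn(&mut wtxn, documents!([{ "id": 0, "label": ["sign"] }]))
        .unwrap();

    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
    // The flag under test: hard deletion when true, soft deletion when false.
    builder.disable_soft_deletion(disable_soft_deletion);
    builder.delete_external_id("0");
    builder.execute().unwrap();
    wtxn.commit().unwrap();

    // The trailing argument becomes a path segment of the stored snapshot,
    // so the `true` and `false` runs are checked against distinct files.
    db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion);
}

#[test]
fn toggle_deletion() {
    // One test entry point exercises both deletion strategies.
    toggle_deletion_(true);
    toggle_deletion_(false);
}
```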
let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); + builder.disable_soft_deletion(disable_soft_deletion); external_ids.iter().for_each(|id| drop(builder.delete_external_id(id))); builder.execute().unwrap(); ids_to_delete } - #[test] - fn delete_documents_with_numbers_as_primary_key() { + fn delete_documents_with_numbers_as_primary_key_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -676,13 +677,17 @@ mod tests { builder.delete_document(0); builder.delete_document(1); builder.delete_document(2); + builder.disable_soft_deletion(disable_soft_deletion); builder.execute().unwrap(); wtxn.commit().unwrap(); - db_snap!(index, documents_ids, @"[]"); - db_snap!(index, word_docids, @""); - db_snap!(index, soft_deleted_documents_ids, @"[]"); + // All these snapshots should be empty since the database was cleared + db_snap!(index, documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, facet_id_exists_docids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); let rtxn = index.read_txn().unwrap(); @@ -690,7 +695,12 @@ mod tests { } #[test] - fn delete_documents_with_strange_primary_key() { + fn delete_documents_with_numbers_as_primary_key() { + delete_documents_with_numbers_as_primary_key_(true); + delete_documents_with_numbers_as_primary_key_(false); + } + + fn delete_documents_with_strange_primary_key_(disable_soft_deletion: bool) { let index = TempIndex::new(); index @@ -710,34 +720,31 @@ mod tests { .unwrap(); wtxn.commit().unwrap(); - db_snap!(index, documents_ids, @"[0, 1, 2, ]"); - db_snap!(index, word_docids, @r###" - benoit [2, ] - kevin [0, ] - kevina [1, ] - "###); - db_snap!(index, soft_deleted_documents_ids, @"[]"); - let mut wtxn = index.write_txn().unwrap(); // Delete not all of the documents but some of them. let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("0"); builder.delete_external_id("1"); + builder.disable_soft_deletion(disable_soft_deletion); builder.execute().unwrap(); wtxn.commit().unwrap(); - db_snap!(index, documents_ids, @"[2, ]"); - db_snap!(index, word_docids, @r###" - benoit [2, ] - kevin [0, ] - kevina [1, ] - "###); - db_snap!(index, soft_deleted_documents_ids, @"[0, 1, ]"); + db_snap!(index, documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn filtered_placeholder_search_should_not_return_deleted_documents() { + fn delete_documents_with_strange_primary_key() { + delete_documents_with_strange_primary_key_(true); + delete_documents_with_strange_primary_key_(false); + } + + fn filtered_placeholder_search_should_not_return_deleted_documents_( + disable_soft_deletion: bool, + ) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -745,7 +752,7 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label") }); + settings.set_filterable_fields(hashset! 
{ S("label"), S("label2") }); }) .unwrap(); @@ -780,7 +787,7 @@ mod tests { ) .unwrap(); - delete_documents(&mut wtxn, &index, &["1_4"]); + delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], disable_soft_deletion); // Placeholder search with filter let filter = Filter::from_str("label = sign").unwrap().unwrap(); @@ -789,21 +796,27 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[0, ]"); - db_snap!(index, word_docids, @"e89cd44832e960519823e12b1e7e28af"); - db_snap!(index, facet_id_f64_docids, @""); - db_snap!(index, facet_id_string_docids, @"720ee1ba8c18342f3714c5863bc6c1f5"); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, facet_id_f64_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, facet_id_exists_docids, disable_soft_deletion); + db_snap!(index, facet_id_string_docids, disable_soft_deletion); } + #[test] - fn facet_hard_deletion() { + fn filtered_placeholder_search_should_not_return_deleted_documents() { + filtered_placeholder_search_should_not_return_deleted_documents_(true); + filtered_placeholder_search_should_not_return_deleted_documents_(false); + } + + fn placeholder_search_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); - index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label") }); }) .unwrap(); @@ -838,78 +851,8 @@ mod tests { ) .unwrap(); - // Delete not all of the documents but some of them. - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.disable_soft_deletion(true); - builder.delete_external_id("1_4"); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - db_snap!(index, word_docids, 1, @"999733c2461093d4873321902fc8dcd7"); - db_snap!(index, facet_id_f64_docids, 1, @""); - db_snap!(index, facet_id_string_docids, 1, @"a12e80655ed5f0f8e869bb9c32af61e9"); - - let mut wtxn = index.write_txn().unwrap(); - - // Delete more than one document - let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.disable_soft_deletion(true); - builder.delete_external_id("1_5"); - builder.delete_external_id("1_7"); - builder.delete_external_id("1_70"); - builder.delete_external_id("1_72"); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); - db_snap!(index, word_docids, 2, @"b892636eaff43c917d5aa8b09c107a02"); - db_snap!(index, facet_id_f64_docids, 2, @""); - db_snap!(index, facet_id_string_docids, 2, @"b9946a9cb0ed2df40352e98d6836c8d0"); - } - - #[test] - fn placeholder_search_should_not_return_deleted_documents() { - let index = TempIndex::new(); - - let mut wtxn = index.write_txn().unwrap(); - index - .update_settings_using_wtxn(&mut wtxn, |settings| { - settings.set_primary_key(S("docid")); - }) - .unwrap(); - - index - .add_documents_using_wtxn( - &mut wtxn, - documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": 
"abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } - ]), - ) - .unwrap(); - - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &["1_4"], disable_soft_deletion); // Placeholder search let results = index.search(&wtxn).execute().unwrap(); @@ -923,12 +866,15 @@ mod tests { } wtxn.commit().unwrap(); - - db_snap!(index, soft_deleted_documents_ids, @"[0, ]"); } #[test] - fn search_should_not_return_deleted_documents() { + fn placeholder_search_should_not_return_deleted_documents() { + placeholder_search_should_not_return_deleted_documents_(true); + placeholder_search_should_not_return_deleted_documents_(false); + } + + fn search_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -942,31 +888,35 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - {"docid": "1_4", "label": "sign"}, - {"docid": "1_5", "label": "letter"}, - {"docid": "1_7", "label": "abstract,cartoon,design,pattern"}, - {"docid": "1_36","label": "drawing,painting,pattern"}, - {"docid": "1_37","label": "art,drawing,outdoor"}, - {"docid": "1_38","label": "aquarium,art,drawing"}, - {"docid": "1_39","label": "abstract"}, - {"docid": "1_40","label": "cartoon"}, - {"docid": "1_41","label": "art,drawing"}, - {"docid": "1_42","label": "art,pattern"}, - {"docid": "1_43","label": "abstract,art,drawing,pattern"}, - {"docid": "1_44","label": "drawing"}, - {"docid": "1_45","label": "art"}, - {"docid": "1_46","label": "abstract,colorfulness,pattern"}, - {"docid": "1_47","label": "abstract,pattern"}, - {"docid": "1_52","label": "abstract,cartoon"}, - {"docid": "1_57","label": "abstract,drawing,pattern"}, - {"docid": "1_58","label": "abstract,art,cartoon"}, - {"docid": "1_68","label": "design"}, - {"docid": "1_69","label": "geometry"} + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", 
"label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], disable_soft_deletion); // search for abstract let results = index.search(&wtxn).query("abstract").execute().unwrap(); @@ -981,11 +931,18 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + fn search_should_not_return_deleted_documents() { + search_should_not_return_deleted_documents_(true); + search_should_not_return_deleted_documents_(false); + } + + fn geo_filtered_placeholder_search_should_not_return_deleted_documents_( + disable_soft_deletion: bool, + ) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1021,7 +978,8 @@ mod tests { ])).unwrap(); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &external_ids_to_delete, disable_soft_deletion); // Placeholder search with geo filter let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); @@ -1037,13 +995,18 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[4, 5, 6, 11, 16, 18, ]"); - db_snap!(index, facet_id_f64_docids, @"20727a38c0b1e1a20a44526b85cf2cbc"); - db_snap!(index, facet_id_string_docids, @""); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, facet_id_f64_docids, disable_soft_deletion); + db_snap!(index, facet_id_string_docids, disable_soft_deletion); } #[test] - fn get_documents_should_not_return_deleted_documents() { + fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + geo_filtered_placeholder_search_should_not_return_deleted_documents_(true); + geo_filtered_placeholder_search_should_not_return_deleted_documents_(false); + } + + fn get_documents_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1057,32 +1020,36 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": 
"1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); let deleted_external_ids = ["1_7", "1_52"]; - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &deleted_external_ids, disable_soft_deletion); // list all documents let results = index.all_documents(&wtxn).unwrap(); @@ -1113,11 +1080,16 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn stats_should_not_return_deleted_documents() { + fn get_documents_should_not_return_deleted_documents() { + get_documents_should_not_return_deleted_documents_(true); + get_documents_should_not_return_deleted_documents_(false); + } + + fn stats_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1129,29 +1101,29 @@ mod tests { .unwrap(); index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "docid": "1_4", "label": "sign"}, - { "docid": "1_5", "label": "letter"}, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, - { "docid": "1_36", "label": "drawing,painting,pattern"}, - { "docid": "1_37", "label": "art,drawing,outdoor"}, - { "docid": "1_38", "label": "aquarium,art,drawing", "title": "Nemo"}, - { "docid": "1_39", "label": "abstract"}, - { "docid": "1_40", "label": "cartoon"}, - { "docid": "1_41", "label": "art,drawing"}, - { "docid": "1_42", "label": "art,pattern"}, - { "docid": "1_43", "label": "abstract,art,drawing,pattern", "number": 32i32}, - { "docid": "1_44", "label": "drawing", "number": 44i32}, - { "docid": "1_45", "label": "art"}, - { "docid": "1_46", "label": "abstract,colorfulness,pattern"}, - { "docid": "1_47", "label": "abstract,pattern"}, - { "docid": "1_52", "label": "abstract,cartoon"}, - { "docid": "1_57", "label": "abstract,drawing,pattern"}, - { "docid": "1_58", "label": "abstract,art,cartoon"}, - { "docid": "1_68", "label": "design"}, - { "docid": "1_69", "label": "geometry"} + { "docid": "1_4", "label": ["sign"]}, + { "docid": "1_5", "label": ["letter"]}, + { "docid": "1_7", "label": 
["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, + { "docid": "1_36", "label": ["drawing","painting","pattern"]}, + { "docid": "1_37", "label": ["art","drawing","outdoor"]}, + { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, + { "docid": "1_39", "label": ["abstract"]}, + { "docid": "1_40", "label": ["cartoon"]}, + { "docid": "1_41", "label": ["art","drawing"]}, + { "docid": "1_42", "label": ["art","pattern"]}, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, + { "docid": "1_44", "label": ["drawing"], "number": 44i32}, + { "docid": "1_45", "label": ["art"]}, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, + { "docid": "1_47", "label": ["abstract","pattern"]}, + { "docid": "1_52", "label": ["abstract","cartoon"]}, + { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, + { "docid": "1_58", "label": ["abstract","art","cartoon"]}, + { "docid": "1_68", "label": ["design"]}, + { "docid": "1_69", "label": ["geometry"]} ])).unwrap(); - delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], disable_soft_deletion); // count internal documents let results = index.number_of_documents(&wtxn).unwrap(); @@ -1165,6 +1137,12 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, @"[2, 15, ]"); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + } + + #[test] + fn stats_should_not_return_deleted_documents() { + stats_should_not_return_deleted_documents_(true); + stats_should_not_return_deleted_documents_(false); } } diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap new file mode 100644 index 000000000..6d69b2ffb --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..9139b7a05 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[0, 1, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap new file mode 100644 index 000000000..15c881e87 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/delete_documents.rs +--- +benoit [2, ] +kevin [0, ] +kevina [1, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap new file mode 100644 index 000000000..6d69b2ffb --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff 
--git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap new file mode 100644 index 000000000..88d3a98aa --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/delete_documents.rs +--- +benoit [2, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap new file mode 100644 index 000000000..a7ee4348d --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [20, 21, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap new file mode 100644 index 000000000..565fadcb3 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +2 0 1.2 1.2 [20, 22, ] +2 0 2.2 2.2 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap new file mode 100644 index 000000000..019836089 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap @@ -0,0 +1,19 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +1 aquarium aquarium [5, 
] +1 art art [4, 5, 8, 9, 10, 12, 17, ] +1 cartoon cartoon [2, 7, 15, 17, ] +1 colorfulness colorfulness [13, ] +1 design design [2, 18, ] +1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ] +1 geometry geometry [19, ] +1 letter letter [1, ] +1 outdoor outdoor [4, ] +1 painting painting [3, ] +1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ] +1 sign sign [0, ] +2 design design [21, ] +2 geometry geometry [20, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..1145cbd56 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[0, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap new file mode 100644 index 000000000..7909d9b06 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap @@ -0,0 +1,42 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1.2 [20, 22, ] +1_36 [3, ] +1_37 [4, ] +1_38 [5, ] +1_39 [6, ] +1_4 [0, ] +1_40 [7, ] +1_41 [8, ] +1_42 [9, ] +1_43 [10, ] +1_44 [11, ] +1_45 [12, ] +1_46 [13, ] +1_47 [14, ] +1_5 [1, ] +1_52 [15, ] +1_57 [16, ] +1_58 [17, ] +1_68 [18, ] +1_69 [19, ] +1_7 [2, ] +1_70 [20, ] +1_71 [21, ] +1_72 [22, ] +2.2 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, 20, 22, ] +letter [1, ] +outdoor [4, ] +painting [3, ] +pattern [2, 3, 9, 10, 13, 14, 16, ] +sign [0, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap new file mode 100644 index 000000000..7299bc214 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [20, 21, 22, ] + diff --git 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap new file mode 100644 index 000000000..565fadcb3 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +2 0 1.2 1.2 [20, 22, ] +2 0 2.2 2.2 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap new file mode 100644 index 000000000..9f8541607 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap @@ -0,0 +1,18 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +1 aquarium aquarium [5, ] +1 art art [4, 5, 8, 9, 10, 12, 17, ] +1 cartoon cartoon [2, 7, 15, 17, ] +1 colorfulness colorfulness [13, ] +1 design design [2, 18, ] +1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ] +1 geometry geometry [19, ] +1 letter letter [1, ] +1 outdoor outdoor [4, ] +1 painting painting [3, ] +1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ] +2 design design [21, ] +2 geometry geometry [20, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap new file mode 100644 index 000000000..c7e0c2d7a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap @@ -0,0 +1,40 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1.2 [20, 22, ] +1_36 [3, ] +1_37 [4, ] +1_38 [5, ] +1_39 [6, ] +1_40 [7, ] +1_41 [8, ] +1_42 [9, ] +1_43 [10, ] +1_44 [11, ] +1_45 [12, ] +1_46 [13, ] +1_47 [14, ] +1_5 [1, ] +1_52 [15, ] +1_57 [16, ] +1_58 [17, ] +1_68 [18, ] +1_69 [19, ] +1_7 [2, ] +1_70 [20, ] +1_71 [21, ] +1_72 [22, ] +2.2 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, 20, 22, ] +letter [1, ] +outdoor [4, ] +painting [3, ] +pattern [2, 3, 9, 10, 13, 14, 16, ] + diff --git 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap new file mode 100644 index 000000000..4d3786e09 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -0,0 +1,48 @@ +--- +source: milli/src/update/delete_documents.rs +--- +3 0 48.9021 48.9021 [19, ] +3 0 49.4449 49.4449 [18, ] +3 0 49.9314 49.9314 [17, ] +3 0 50.1112 50.1112 [16, ] +3 0 50.1793 50.1793 [15, ] +3 0 50.2844 50.2844 [14, ] +3 0 50.3518 50.3518 [13, ] +3 0 50.4095 50.4095 [11, ] +3 0 50.4502 50.4502 [12, ] +3 0 50.6053 50.6053 [8, ] +3 0 50.6224 50.6224 [3, ] +3 0 50.6299 50.6299 [0, ] +3 0 50.6312 50.6312 [2, ] +3 0 50.6415 50.6415 [1, ] +3 0 50.6552 50.6552 [4, ] +3 0 50.6924 50.6924 [5, ] +3 0 50.7263 50.7263 [6, ] +3 0 50.7453 50.7453 [7, ] +3 0 50.8466 50.8466 [10, ] +3 0 51.0537 51.0537 [9, ] +3 1 48.9021 50.1112 [16, 17, 18, 19, ] +3 1 50.1793 50.4095 [11, 13, 14, 15, ] +3 1 50.4502 50.6299 [0, 3, 8, 12, ] +3 1 50.6312 50.6924 [1, 2, 4, 5, ] +3 1 50.7263 51.0537 [6, 7, 9, 10, ] +4 0 2.271 2.271 [17, ] +4 0 2.3708 2.3708 [19, ] +4 0 2.7637 2.7637 [14, ] +4 0 2.7913 2.7913 [18, ] +4 0 2.8547 2.8547 [16, ] +4 0 3.0569 3.0569 [0, ] +4 0 3.1106 3.1106 [1, 2, ] +4 0 3.1476 3.1476 [3, ] +4 0 3.1541 3.1541 [6, ] +4 0 3.1763 3.1763 [5, ] +4 0 3.1897 3.1897 [4, ] +4 0 3.2189 3.2189 [15, ] +4 0 3.2206 3.2206 [7, ] +4 0 3.3758 3.3758 [8, ] +4 0 3.5326 3.5326 [13, ] +4 0 3.6957 3.6957 [9, ] +4 0 3.9623 3.9623 [12, ] +4 0 4.337 4.337 [10, ] +4 0 4.4347 4.4347 [11, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..1260b12de --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[4, 5, 6, 11, 16, 18, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap new file mode 100644 index 000000000..d380cf29c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -0,0 +1,36 @@ +--- +source: milli/src/update/delete_documents.rs +--- +3 0 48.9021 48.9021 [19, ] +3 0 49.9314 49.9314 [17, ] +3 0 50.1793 50.1793 [15, ] +3 0 50.2844 50.2844 [14, ] +3 0 50.3518 50.3518 [13, ] +3 0 50.4502 50.4502 [12, ] +3 0 50.6053 50.6053 [8, ] +3 0 50.6224 50.6224 [3, ] +3 0 50.6299 50.6299 [0, ] +3 0 50.6312 50.6312 [2, ] +3 0 50.6415 50.6415 [1, ] +3 0 50.7453 50.7453 [7, ] +3 0 50.8466 50.8466 [10, ] +3 0 51.0537 51.0537 [9, ] +3 1 48.9021 50.1112 [17, 19, ] +3 1 50.1793 50.4095 [13, 14, 15, ] +3 1 50.4502 50.6299 [0, 3, 8, 12, ] +3 1 50.6312 50.6924 [1, 2, ] +3 1 50.7263 51.0537 [7, 9, 10, ] +4 0 2.271 2.271 [17, ] +4 0 2.3708 2.3708 [19, ] +4 0 2.7637 2.7637 [14, ] +4 0 3.0569 3.0569 [0, ] +4 0 3.1106 3.1106 [1, 2, ] +4 0 3.1476 3.1476 [3, ] +4 0 3.2189 3.2189 [15, ] +4 0 3.2206 3.2206 [7, ] +4 0 3.3758 3.3758 [8, ] +4 0 3.5326 3.5326 [13, ] +4 0 3.6957 3.6957 [9, ] +4 0 3.9623 3.9623 [12, ] +4 0 4.337 4.337 [10, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git 
a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] From f198b20c4280b9414a8fd069c9efbd50b20767c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 12:32:33 +0200 Subject: [PATCH 1743/1889] Add facet deletion tests that use both the incremental and bulk methods + update deletion snapshots to the new database format --- milli/src/update/delete_documents.rs | 2 +- milli/src/update/facet/bulk.rs | 6 - milli/src/update/facet/delete.rs | 168 ++++++++++++++++-- milli/src/update/facet/mod.rs | 23 ++- .../1/facet_id_f64_docids.hash.snap | 4 + .../1/number_faceted_documents_ids.hash.snap | 4 + .../2/facet_id_f64_docids.hash.snap | 4 + .../2/number_faceted_documents_ids.hash.snap | 4 + .../false/facet_id_exists_docids.snap | 4 - .../true/facet_id_exists_docids.snap | 4 - .../false/facet_id_f64_docids.snap | 4 +- 
.../false/facet_id_string_docids.snap | 30 ++-- .../false/soft_deleted_documents_ids.snap | 2 +- .../true/facet_id_exists_docids.snap | 2 +- .../true/facet_id_f64_docids.snap | 3 +- .../true/facet_id_string_docids.snap | 27 ++- .../true/word_docids.snap | 5 +- .../false/facet_id_f64_docids.snap | 93 +++++----- .../true/facet_id_f64_docids.snap | 59 +++--- 19 files changed, 302 insertions(+), 146 deletions(-) create mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap create mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap create mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 432e1497f..6ff41ccbb 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -54,7 +54,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } #[cfg(test)] - fn disable_soft_deletion(&mut self, disable: bool) { + pub fn disable_soft_deletion(&mut self, disable: bool) { self.disable_soft_deletion = disable; } diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 4e10c22dd..ea0a7d3d7 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -4,9 +4,7 @@ use std::fs::File; use grenad::CompressionType; use heed::types::ByteSlice; use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; use roaring::RoaringBitmap; -use time::OffsetDateTime; use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::facet::FacetType; @@ -71,8 +69,6 @@ impl<'i> FacetsUpdateBulk<'i> { #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; let db = match facet_type { @@ -84,8 +80,6 @@ impl<'i> FacetsUpdateBulk<'i> { } }; - index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index 74c17e8f2..2bc54c7c1 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -1,15 +1,21 @@ -use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; -use crate::{ - facet::FacetType, - heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}, - heed_codec::ByteSliceRefCodec, - update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}, - FieldId, Index, Result, -}; -use heed::RwTxn; -use roaring::RoaringBitmap; use std::collections::{HashMap, HashSet}; +use heed::RwTxn; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + +use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use 
crate::facet::FacetType; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}; +use crate::{FieldId, Index, Result}; + +/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. +/// +/// Depending on the number of removed elements and the existing size of the database, we use either +/// a bulk delete method or an incremental delete method. pub struct FacetsDelete<'i, 'b> { index: &'i Index, database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, @@ -48,8 +54,18 @@ impl<'i, 'b> FacetsDelete<'i, 'b> { } pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + for (field_id, affected_facet_values) in self.affected_facet_values { - if affected_facet_values.len() >= (self.database.len(wtxn)? / 50) { + // This is an incorrect condition, since we assume that the length of the database is equal + // to the number of facet values for the given field_id. It means that in some cases, we might + // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could + // really be a performance problem is when we fully delete a large ratio of all facet values for + // each field id. This would almost never happen. Still, to be overly cautious, I have added a + // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance + // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead. + if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) { // Bulk delete let mut modified = false; @@ -91,3 +107,133 @@ impl<'i, 'b> FacetsDelete<'i, 'b> { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::iter::FromIterator; + + use big_s::S; + use maplit::hashset; + use roaring::RoaringBitmap; + + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + use crate::update::DeleteDocuments; + + #[test] + fn delete_mixed_incremental_and_bulk() { + // The point of this test is to create an index populated with documents + // containing different filterable attributes. Then, we delete a bunch of documents + // such that a mix of the incremental and bulk indexer is used (depending on the field id) + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json!
{ + { + "id": i, + "label": i / 10, + "colour": i / 100, + "timestamp": i / 2, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, 1); + db_snap!(index, number_faceted_documents_ids, 1); + + let mut wtxn = index.env.write_txn().unwrap(); + + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_documents(&RoaringBitmap::from_iter(0..100)); + // by deleting the first 100 documents, we expect that: + // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) + // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 + // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 + // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 + // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, facet_id_f64_docids, 2); + db_snap!(index, number_faceted_documents_ids, 2); + } +} + +#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::update::facet::tests::FacetIndex; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // + // The benchmark shows the worst-case scenario for the incremental indexer, since + // each facet value contains only one document ID. + // + // In that scenario, it appears that the incremental indexer is about 70 times slower than the + // bulk indexer. 
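Editorial aside: to make the `/ 150` heuristic and the test expectations above concrete, here is a minimal, self-contained sketch. The `use_bulk_delete` helper and the `database_len` of 2000 are hypothetical stand-ins (the real code compares `affected_facet_values.len()` against `self.database.len(wtxn)? / 150`), chosen so the threshold comes out at the 13 quoted in the comments:

```rust
// Hypothetical stand-in for the decision made in FacetsDelete::execute.
fn use_bulk_delete(affected_facet_values: u64, database_len: u64) -> bool {
    // With the 2x safety penalty, bulk deletion wins as soon as at least
    // 1/150th of the database entries are affected.
    affected_facet_values >= database_len / 150
}

fn main() {
    let database_len = 2000; // assumed size; 2000 / 150 = 13, matching the comments above
    for (field, affected) in [("id", 100u64), ("label", 10), ("colour", 1), ("timestamp", 50)] {
        let method = if use_bulk_delete(affected, database_len) { "bulk" } else { "incremental" };
        println!("{field}: {affected} affected -> {method}");
    }
}
```

Running this prints "bulk" for `id` and `timestamp` and "incremental" for `label` and `colour`, matching the four expectations listed in the test.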
+ // #[test] + fn benchmark_facet_indexing_delete() { + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // delete one document + // + for _ in 0..nbr_doc { + let deleted_u32 = r.gen::<u32>() % size; + let deleted_f64 = deleted_u32 as f64; + index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) + } + let time_spent = timer.elapsed().as_millis(); + println!(" delete {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 5fb5c9e48..76e5514a1 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -78,6 +78,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5; use std::fs::File; +use log::debug; +use time::OffsetDateTime; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; @@ -89,6 +92,10 @@ pub mod bulk; pub mod delete; pub mod incremental; +/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. +/// +/// Depending on the number of new elements and the existing size of the database, we use either +/// a bulk update method or an incremental update method. pub struct FacetsUpdate<'i> { index: &'i Index, database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, @@ -123,6 +130,10 @@ impl<'i> FacetsUpdate<'i> { if self.new_data.is_empty() { return Ok(()); } + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + // See self::comparison_bench::benchmark_facet_indexing if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>(); @@ -204,7 +215,7 @@ pub(crate) mod tests { let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17 let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 10 * 100); + let options = options.map_size(4096 * 4 * 10 * 1000); unsafe { options.flag(heed::flags::Flags::MdbAlwaysFreePages); } @@ -230,7 +241,7 @@ let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 1000); + let options = options.map_size(4096 * 4 * 1000 * 100); let tempdir = tempfile::TempDir::new().unwrap(); let env = options.open(tempdir.path()).unwrap(); let content = env.create_database(None).unwrap(); @@ -440,12 +451,14 @@ mod comparison_bench { // This is a simple test to get an intuition on the relative speed // of the incremental vs. bulk indexer. 
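Editorial aside: the "worst-case scenario" both benchmark comments refer to is the database shape in which every facet value owns exactly one document id, which maximises the number of incremental operations per unit of data. A hedged, standalone sketch of that shape, mirroring the `elements` vector built in the benchmark above:

```rust
use std::iter::once;

use roaring::RoaringBitmap;

// Worst case for the incremental indexer: one document id per facet value,
// i.e. field id 0, left_bound = i, docids = [i], exactly as in the benchmark.
fn worst_case_elements(size: u32) -> Vec<((u16, f64), RoaringBitmap)> {
    (0..size).map(|i| ((0u16, i as f64), once(i).collect())).collect()
}

fn main() {
    let elements = worst_case_elements(5);
    assert!(elements.iter().all(|(_, docids)| docids.len() == 1));
}
```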
- // It appears that the incremental indexer is about 50 times slower than the + // + // The benchmark shows the worst-case scenario for the incremental indexer, since + // each facet value contains only one document ID. + // + // In that scenario, it appears that the incremental indexer is about 50 times slower than the // bulk indexer. // #[test] fn benchmark_facet_indexing() { - // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it - let mut facet_value = 0; let mut r = rand::thread_rng(); diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..fee486bab --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +550cd138d6fe31ccdd42cd5392fbd576 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap new file mode 100644 index 000000000..fcf957004 --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +9a0ea88e7c9dcf6dc0ef0b601736ffcf diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..29ceb250e --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +d4d5f14e7f1e1f09b86821a0b6defcc6 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap new file mode 100644 index 000000000..bbaf6d2a2 --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +3570e0ac0fdb21be9ebe433f59264b56 diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap deleted file mode 100644 index 88031d24a..000000000 --- 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/facet_id_exists_docids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/delete_documents.rs ---- - diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap index 565fadcb3..cfa649653 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -1,6 +1,6 @@ --- source: milli/src/update/delete_documents.rs --- -2 0 1.2 1.2 [20, 22, ] -2 0 2.2 2.2 [21, ] +2 0 1.2 1 [20, 22, ] +2 0 2.2 1 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap index 019836089..8336bd712 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap @@ -1,19 +1,19 @@ --- source: milli/src/update/delete_documents.rs --- -1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ] -1 aquarium aquarium [5, ] -1 art art [4, 5, 8, 9, 10, 12, 17, ] -1 cartoon cartoon [2, 7, 15, 17, ] -1 colorfulness colorfulness [13, ] -1 design design [2, 18, ] -1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ] -1 geometry geometry [19, ] -1 letter letter [1, ] -1 outdoor outdoor [4, ] -1 painting painting [3, ] -1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ] -1 sign sign [0, ] -2 design design [21, ] -2 geometry geometry [20, 22, ] +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +1 0 sign 1 [0, ] +2 0 design 1 [21, ] +2 0 geometry 1 [20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap index 1145cbd56..dfac98e59 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/delete_documents.rs --- -[0, ] +[0, 20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap index 7299bc214..7481b11c4 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap @@ -2,5 +2,5 @@ source: milli/src/update/delete_documents.rs --- 1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] -2 [20, 21, 22, ] +2 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap index 565fadcb3..87856f6dc 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -1,6 +1,5 @@ --- source: milli/src/update/delete_documents.rs --- -2 0 1.2 1.2 [20, 22, ] -2 0 2.2 2.2 [21, ] +2 0 2.2 1 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap index 9f8541607..ab1d2175f 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap @@ -1,18 +1,17 @@ --- source: milli/src/update/delete_documents.rs --- -1 abstract abstract [2, 6, 10, 13, 14, 15, 16, 17, ] -1 aquarium aquarium [5, ] -1 art art [4, 5, 8, 9, 10, 12, 17, ] -1 cartoon cartoon [2, 7, 15, 17, ] -1 colorfulness colorfulness [13, ] -1 design design [2, 18, ] -1 drawing drawing [3, 4, 5, 8, 10, 11, 16, ] -1 geometry geometry [19, ] -1 letter letter [1, ] -1 outdoor outdoor [4, ] -1 painting painting [3, ] -1 pattern pattern [2, 3, 9, 10, 13, 14, 16, ] -2 design design [21, ] -2 geometry geometry [20, 22, ] +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +2 0 design 1 [21, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap index c7e0c2d7a..d8125dfcf 100644 --- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap +++ 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap @@ -1,7 +1,6 @@ --- source: milli/src/update/delete_documents.rs --- -1.2 [20, 22, ] 1_36 [3, ] 1_37 [4, ] 1_38 [5, ] @@ -21,9 +20,7 @@ source: milli/src/update/delete_documents.rs 1_68 [18, ] 1_69 [19, ] 1_7 [2, ] -1_70 [20, ] 1_71 [21, ] -1_72 [22, ] 2.2 [21, ] abstract [2, 6, 10, 13, 14, 15, 16, 17, ] aquarium [5, ] @@ -32,7 +29,7 @@ cartoon [2, 7, 15, 17, ] colorfulness [13, ] design [2, 18, 21, ] drawing [3, 4, 5, 8, 10, 11, 16, ] -geometry [19, 20, 22, ] +geometry [19, ] letter [1, ] outdoor [4, ] painting [3, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap index 4d3786e09..c909a3cd8 100644 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -1,48 +1,53 @@ --- source: milli/src/update/delete_documents.rs --- -3 0 48.9021 48.9021 [19, ] -3 0 49.4449 49.4449 [18, ] -3 0 49.9314 49.9314 [17, ] -3 0 50.1112 50.1112 [16, ] -3 0 50.1793 50.1793 [15, ] -3 0 50.2844 50.2844 [14, ] -3 0 50.3518 50.3518 [13, ] -3 0 50.4095 50.4095 [11, ] -3 0 50.4502 50.4502 [12, ] -3 0 50.6053 50.6053 [8, ] -3 0 50.6224 50.6224 [3, ] -3 0 50.6299 50.6299 [0, ] -3 0 50.6312 50.6312 [2, ] -3 0 50.6415 50.6415 [1, ] -3 0 50.6552 50.6552 [4, ] -3 0 50.6924 50.6924 [5, ] -3 0 50.7263 50.7263 [6, ] -3 0 50.7453 50.7453 [7, ] -3 0 50.8466 50.8466 [10, ] -3 0 51.0537 51.0537 [9, ] -3 1 48.9021 50.1112 [16, 17, 18, 19, ] -3 1 50.1793 50.4095 [11, 13, 14, 15, ] -3 1 50.4502 50.6299 [0, 3, 8, 12, ] -3 1 50.6312 50.6924 [1, 2, 4, 5, ] -3 1 50.7263 51.0537 [6, 7, 9, 10, ] -4 0 2.271 2.271 [17, ] -4 0 2.3708 2.3708 [19, ] -4 0 2.7637 2.7637 [14, ] -4 0 2.7913 2.7913 [18, ] -4 0 2.8547 2.8547 [16, ] -4 0 3.0569 3.0569 [0, ] -4 0 3.1106 3.1106 [1, 2, ] -4 0 3.1476 3.1476 [3, ] -4 0 3.1541 3.1541 [6, ] -4 0 3.1763 3.1763 [5, ] -4 0 3.1897 3.1897 [4, ] -4 0 3.2189 3.2189 [15, ] -4 0 3.2206 3.2206 [7, ] -4 0 3.3758 3.3758 [8, ] -4 0 3.5326 3.5326 [13, ] -4 0 3.6957 3.6957 [9, ] -4 0 3.9623 3.9623 [12, ] -4 0 4.337 4.337 [10, ] -4 0 4.4347 4.4347 [11, ] +3 0 48.9021 1 [19, ] +3 0 49.4449 1 [18, ] +3 0 49.9314 1 [17, ] +3 0 50.1112 1 [16, ] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 50.3518 1 [13, ] +3 0 50.4095 1 [11, ] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.6552 1 [4, ] +3 0 50.6924 1 [5, ] +3 0 50.7263 1 [6, ] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +3 1 48.9021 4 [16, 17, 18, 19, ] +3 1 50.1793 4 [11, 13, 14, 15, ] +3 1 50.4502 4 [0, 3, 8, 12, ] +3 1 50.6312 4 [1, 2, 4, 5, ] +3 1 50.7263 4 [6, 7, 9, 10, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 2.7913 1 [18, ] +4 0 2.8547 1 [16, ] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.1541 1 [6, ] +4 0 3.1763 1 [5, ] +4 0 3.1897 1 [4, ] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 
[10, ] +4 0 4.4347 1 [11, ] +4 1 2.271 4 [14, 17, 18, 19, ] +4 1 2.8547 4 [0, 1, 2, 3, 16, ] +4 1 3.1541 4 [4, 5, 6, 15, ] +4 1 3.2206 4 [7, 8, 9, 13, ] +4 1 3.9623 3 [10, 11, 12, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap index d380cf29c..18a9d9309 100644 --- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -1,36 +1,31 @@ --- source: milli/src/update/delete_documents.rs --- -3 0 48.9021 48.9021 [19, ] -3 0 49.9314 49.9314 [17, ] -3 0 50.1793 50.1793 [15, ] -3 0 50.2844 50.2844 [14, ] -3 0 50.3518 50.3518 [13, ] -3 0 50.4502 50.4502 [12, ] -3 0 50.6053 50.6053 [8, ] -3 0 50.6224 50.6224 [3, ] -3 0 50.6299 50.6299 [0, ] -3 0 50.6312 50.6312 [2, ] -3 0 50.6415 50.6415 [1, ] -3 0 50.7453 50.7453 [7, ] -3 0 50.8466 50.8466 [10, ] -3 0 51.0537 51.0537 [9, ] -3 1 48.9021 50.1112 [17, 19, ] -3 1 50.1793 50.4095 [13, 14, 15, ] -3 1 50.4502 50.6299 [0, 3, 8, 12, ] -3 1 50.6312 50.6924 [1, 2, ] -3 1 50.7263 51.0537 [7, 9, 10, ] -4 0 2.271 2.271 [17, ] -4 0 2.3708 2.3708 [19, ] -4 0 2.7637 2.7637 [14, ] -4 0 3.0569 3.0569 [0, ] -4 0 3.1106 3.1106 [1, 2, ] -4 0 3.1476 3.1476 [3, ] -4 0 3.2189 3.2189 [15, ] -4 0 3.2206 3.2206 [7, ] -4 0 3.3758 3.3758 [8, ] -4 0 3.5326 3.5326 [13, ] -4 0 3.6957 3.6957 [9, ] -4 0 3.9623 3.9623 [12, ] -4 0 4.337 4.337 [10, ] +3 0 48.9021 1 [19, ] +3 0 49.9314 1 [17, ] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 50.3518 1 [13, ] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 [10, ] From 206a3e00e5ca68075581c64fe8d4d50aaad8b695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 12 Oct 2022 12:35:44 +0200 Subject: [PATCH 1744/1889] cargo fmt --- milli/src/heed_codec/facet/mod.rs | 3 +-- milli/src/snapshot_tests.rs | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index a727b148f..4609bfe7f 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -11,9 +11,8 @@ use roaring::RoaringBitmap; pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; pub use self::ordered_f64_codec::OrderedF64Codec; -use crate::{CboRoaringBitmapCodec, BEU16}; - use super::StrRefCodec; +use crate::{CboRoaringBitmapCodec, BEU16}; pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec; pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec; diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 389d7b7a2..bcb9805ea 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -495,7 +495,7 @@ macro_rules! 
full_snap_of_db { ($index:ident, field_id_docid_facet_strings) => {{ $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) }}; - ($index:ident, facet_id_exists_docids) => {{ + ($index:ident, facet_id_exists_docids) => {{ $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) }}; ($index:ident, documents_ids) => {{ From 14ca8048a8539d2b658f1d5ed0269ea19a980d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 17 Oct 2022 12:42:12 +0200 Subject: [PATCH 1745/1889] Add some documentation on how to run the facet db fuzzer --- milli/src/update/facet/incremental.rs | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index a4c756aec..2558c81a3 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1022,6 +1022,35 @@ mod tests { // fuzz tests #[cfg(all(test, fuzzing))] +/** +Fuzz test for the incremental indxer. + +The fuzz test uses fuzzcheck, a coverage-guided fuzzer. +See https://github.com/loiclec/fuzzcheck-rs and https://fuzzcheck.neocities.org +for more information. + +It is only run when using the `cargo fuzzcheck` command line tool, which can be installed with: +```sh +cargo install cargo-fuzzcheck +``` +To start the fuzz test, run (from the base folder or from milli/): +```sh +cargo fuzzcheck update::facet::incremental::fuzz::fuzz +``` +and wait a couple minutes to make sure the code was thoroughly tested, then +hit `Ctrl-C` to stop the fuzzer. The corpus generated by the fuzzer is located in milli/fuzz. + +To work on this module with rust-analyzer working properly, add the following to your .cargo/config.toml file: +```toml +[build] +rustflags = ["--cfg", "fuzzing"] +``` + +The fuzz test generates sequences of additions and deletions to the facet database and +ensures that: +1. its structure is still internally valid +2. its content is the same as a trivially correct implementation of the same database +*/ mod fuzz { use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; From 3b1f908e5e80335d517c8eb1dd5e35952d45b49a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 17 Oct 2022 12:48:10 +0200 Subject: [PATCH 1746/1889] Revert behaviour of facet distribution to what it was before Where the docid that is used to get the original facet string value definitely belongs to the candidates --- milli/src/search/facet/facet_distribution_iter.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 0fdca4118..9cd85b667 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -74,13 +74,12 @@ where if key.field_id != self.field_id { return Ok(ControlFlow::Break(())); } - // TODO: use real intersection and then take min()? - let docids_in_common = value.bitmap.intersection_len(candidates); - if docids_in_common > 0 { - // TODO: use min() - let any_docid = value.bitmap.iter().next().unwrap(); - match (self.callback)(key.left_bound, docids_in_common, any_docid)? { - ControlFlow::Continue(_) => (), // TODO use unit instead of empty scope + let docids_in_common = value.bitmap & candidates; + if !docids_in_common.is_empty() { + let any_docid_in_common = docids_in_common.min().unwrap(); + match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)? 
+ { + ControlFlow::Continue(_) => (), ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), } } From b7f2428961198cfaee5f601d94925099723d070c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Oct 2022 13:49:33 +0200 Subject: [PATCH 1747/1889] Fix formatting and warning after rebasing from main --- milli/src/heed_codec/mod.rs | 5 +++-- milli/src/update/index_documents/mod.rs | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 6a058f95f..702dcf661 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -9,6 +9,9 @@ mod str_beu32_codec; mod str_ref; mod str_str_u8_codec; +pub use byte_slice_ref::ByteSliceRefCodec; +pub use str_ref::StrRefCodec; + pub use self::beu32_str_codec::BEU32StrCodec; pub use self::field_id_word_count_codec::FieldIdWordCountCodec; pub use self::obkv_codec::ObkvCodec; @@ -18,5 +21,3 @@ pub use self::roaring_bitmap_length::{ }; pub use self::str_beu32_codec::StrBEU32Codec; pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; -pub use byte_slice_ref::ByteSliceRefCodec; -pub use str_ref::StrRefCodec; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7b02fd1af..10a831ddf 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -35,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, FacetsUpdateBulk, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, - PrefixWordPairsProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, + self, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, + WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; From 631e9910da878ec410ecb215d853617c93524ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Oct 2022 14:06:59 +0200 Subject: [PATCH 1748/1889] Depend on released version of fuzzcheck from crates.io --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index b768476e3..52fdf2374 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -57,7 +57,7 @@ md5 = "0.7.0" rand = {version = "0.8.5", features = ["small_rng"] } [target.'cfg(fuzzing)'.dev-dependencies] -fuzzcheck = { git = "https://github.com/loiclec/fuzzcheck-rs", branch = "main" } # TODO: use released version +fuzzcheck = "0.12.1" [features] default = [ "charabia/default" ] From 2fa85a24ec700f4c6823e283fae20c8c79833a8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 26 Oct 2022 14:09:35 +0200 Subject: [PATCH 1749/1889] Remove outdated files from http-ui/ and infos/ ... 
that were reintroduced after a rebase --- http-ui/src/main.rs | 1 - infos/src/main.rs | 1 - 2 files changed, 2 deletions(-) delete mode 100644 http-ui/src/main.rs delete mode 100644 infos/src/main.rs diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs deleted file mode 100644 index 8b1378917..000000000 --- a/http-ui/src/main.rs +++ /dev/null @@ -1 +0,0 @@ - diff --git a/infos/src/main.rs b/infos/src/main.rs deleted file mode 100644 index 8b1378917..000000000 --- a/infos/src/main.rs +++ /dev/null @@ -1 +0,0 @@ - From 62816dddded714266c4af396ac995def5d0d0379 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Wed, 26 Oct 2022 19:08:06 +0530 Subject: [PATCH 1750/1889] [WIP] Fix phrase search containing stop words Fixes #661 and meilisearch/meilisearch#2905 --- milli/src/search/criteria/attribute.rs | 1 + milli/src/search/criteria/exactness.rs | 4 ++-- milli/src/search/criteria/mod.rs | 12 +++++++--- milli/src/search/criteria/proximity.rs | 6 ++++- milli/src/search/criteria/typo.rs | 4 ++-- milli/src/search/query_tree.rs | 32 +++++++++++++++++++------- 6 files changed, 43 insertions(+), 16 deletions(-) diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 7e55a1038..679381838 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { Phrase(words) => { let queries = words .iter() + .filter_map(|w| w.as_ref()) .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) .collect(); vec![queries] diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index d5b2ff0ee..0f0c24723 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -298,7 +298,7 @@ fn attribute_start_with_docids( pos += 1; } Phrase(phrase) => { - for word in phrase { + for word in phrase.iter().filter_map(|w| w.as_ref()) { let wc = ctx.word_position_docids(word, pos)?; if let Some(word_candidates) = wc { attribute_candidates_array.push(word_candidates); @@ -323,7 +323,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap { #[derive(Debug, Clone)] pub enum ExactQueryPart { - Phrase(Vec), + Phrase(Vec>), Synonyms(Vec), } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1b46c8441..96ed0bf6c 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -418,15 +418,21 @@ pub fn resolve_query_tree( resolve_operation(ctx, query_tree, wdcache) } -pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { +pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result { let mut candidates = RoaringBitmap::new(); let mut first_iter = true; let winsize = phrase.len().min(3); for win in phrase.windows(winsize) { // Get all the documents with the matching distance for each word pairs. let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win.iter().enumerate() { - for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + for (offset, s1) in win.iter().filter_map(|w| w.as_ref()).enumerate() { + for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| { + if let Some(word) = word { + Some((index, word)) + } else { + None + } + }) { if dist == 0 { match ctx.word_pair_proximity_docids(s1, s2, 1)? 
{ Some(m) => bitmaps.push(m), diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b7c10a2e0..db8592a1d 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -188,9 +188,13 @@ fn resolve_candidates<'t>( if proximity == 0 { let most_left = words .first() + .map(|o| o.as_ref()) + .flatten() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); let most_right = words .last() + .map(|o| o.as_ref()) + .flatten() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); match (most_left, most_right) { @@ -473,7 +477,7 @@ fn resolve_plane_sweep_candidates( } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); - for word in words { + for word in words.iter().filter_map(|w| w.as_ref()) { let positions = match words_positions.get(word) { Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), None => return Ok(vec![]), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 76bd04d20..758069642 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::HashMap; use std::mem::take; +use itertools::Itertools; use log::debug; use roaring::RoaringBitmap; @@ -259,8 +260,7 @@ fn resolve_candidates<'t>( Phrase(words) => { let mut candidates = RoaringBitmap::new(); let mut first_loop = true; - for slice in words.windows(2) { - let (left, right) = (&slice[0], &slice[1]); + for (left, right) in words.iter().filter_map(|w| w.as_ref()).tuple_windows() { match ctx.word_pair_proximity_docids(left, right, 1)? { Some(pair_docids) => { if pair_docids.is_empty() { diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9b4b38f76..4da4b3317 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -18,8 +18,9 @@ type IsPrefix = bool; #[derive(Clone, PartialEq, Eq, Hash)] pub enum Operation { And(Vec), - // serie of consecutive non prefix and exact words - Phrase(Vec), + // series of consecutive non prefix and exact words + // `None` means a stop word. + Phrase(Vec>), Or(IsOptionalWord, Vec), Query(Query), } @@ -75,9 +76,13 @@ impl Operation { } } - fn phrase(mut words: Vec) -> Self { + fn phrase(mut words: Vec>) -> Self { if words.len() == 1 { - Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) }) + if let Some(word) = words.pop().unwrap() { + Self::Query(Query { prefix: false, kind: QueryKind::exact(word) }) + } else { + Self::Phrase(words) + } } else { Self::Phrase(words) } @@ -370,7 +375,10 @@ fn create_query_tree( PrimitiveQueryPart::Word(word, prefix) => { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); if let Some((left, right)) = split_best_frequency(ctx, &word)? 
{ - children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); + children.push(Operation::Phrase(vec![ + Some(left.to_string()), + Some(right.to_string()), + ])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let exact_words = ctx.exact_words(); @@ -583,7 +591,11 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = words.into_iter().map(|w| MatchingWord::new(w, 0, false)).collect(); + let words = words + .into_iter() + .filter_map(|w| w) + .map(|w| MatchingWord::new(w, 0, false)) + .collect(); matching_words.push((words, ids)); } } @@ -685,7 +697,7 @@ pub type PrimitiveQuery = Vec; #[derive(Debug, Clone)] pub enum PrimitiveQueryPart { - Phrase(Vec), + Phrase(Vec>), Word(String, IsPrefix), } @@ -735,7 +747,11 @@ where // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { - phrase.push(token.lemma().to_string()); + if stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { + phrase.push(None) + } else { + phrase.push(Some(token.lemma().to_string())); + } } else if peekable.peek().is_some() { if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { primitive_query From 6a10b679ca3330f2cc083b6c5b1674c654153b7e Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 19:13:40 +0530 Subject: [PATCH 1751/1889] Add test for phrase search with stop words Originally written by ManyTheFish here: https://gist.github.com/ManyTheFish/f840e37cb2d2e029ce05396b4d540762 Co-authored-by: ManyTheFish --- milli/tests/search/mod.rs | 1 + milli/tests/search/phrase_search.rs | 35 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 milli/tests/search/phrase_search.rs diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c8b01648c..78640cfb9 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -18,6 +18,7 @@ mod filters; mod query_criteria; mod sort; mod typo_tolerance; +mod phrase_search; pub const TEST_QUERY: &'static str = "hello world america"; diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs new file mode 100644 index 000000000..87b3fd511 --- /dev/null +++ b/milli/tests/search/phrase_search.rs @@ -0,0 +1,35 @@ +use milli::update::{IndexerConfig, Settings}; +use milli::{Index, Search, TermsMatchingStrategy}; + +fn set_stop_words(index: &Index, stop_words: &[&str]) { + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + let mut builder = Settings::new(&mut wtxn, &index, &config); + let stop_words = stop_words.into_iter().map(|s| s.to_string()).collect(); + builder.set_stop_words(stop_words); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); +} + +#[test] +fn test_phrase_search_with_stop_words() { + let criteria = []; + let index = super::setup_search_index_with_criteria(&criteria); + + // Add stop_words + set_stop_words(&index, &["a", "an", "the", "of"]); + + // Phrase search containing stop words + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("\"the use of force\""); + search.limit(10); + search.authorize_typos(false); + search.terms_matching_strategy(TermsMatchingStrategy::All); + + let result = search.execute().unwrap(); + // 1 
document should match + assert_eq!(result.documents_ids.len(), 1); +} From ef13c6a5b602192ddd2424036424884a1be31a4a Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 22:39:55 +0530 Subject: [PATCH 1752/1889] Perform filter after enumerate to keep origin indices --- milli/src/search/criteria/mod.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 96ed0bf6c..631dd2385 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -425,7 +425,13 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result Date: Thu, 13 Oct 2022 22:40:25 +0530 Subject: [PATCH 1753/1889] Increment position even when it's a stop word in exactness criteria --- milli/src/search/criteria/exactness.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 0f0c24723..580031697 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -298,10 +298,12 @@ fn attribute_start_with_docids( pos += 1; } Phrase(phrase) => { - for word in phrase.iter().filter_map(|w| w.as_ref()) { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - attribute_candidates_array.push(word_candidates); + for word in phrase { + if let Some(word) = word { + let wc = ctx.word_position_docids(word, pos)?; + if let Some(word_candidates) = wc { + attribute_candidates_array.push(word_candidates); + } } pos += 1; } From 3e190503e6a3b73126476def1be8e866f9c63464 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 22:47:41 +0530 Subject: [PATCH 1754/1889] Search for closest non-stop words in proximity criteria --- milli/src/search/criteria/proximity.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index db8592a1d..5aa3cc8b3 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -187,14 +187,15 @@ fn resolve_candidates<'t>( Phrase(words) => { if proximity == 0 { let most_left = words - .first() - .map(|o| o.as_ref()) - .flatten() + .iter() + .filter_map(|o| o.as_ref()) + .next() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); let most_right = words - .last() - .map(|o| o.as_ref()) - .flatten() + .iter() + .rev() + .filter_map(|o| o.as_ref()) + .next() .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); match (most_left, most_right) { From c8c666c6a6e93a28122b034b5f386fe42e94c0b1 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 23:17:28 +0530 Subject: [PATCH 1755/1889] Use resolve_phrase in exactness and typo criteria --- milli/src/search/criteria/typo.rs | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 758069642..2ae35e418 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -2,7 +2,6 @@ use std::borrow::Cow; use std::collections::HashMap; use std::mem::take; -use itertools::Itertools; use log::debug; use roaring::RoaringBitmap; @@ -10,6 +9,7 @@ use super::{ query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, CriterionResult, }; +use crate::search::criteria::resolve_phrase; use 
crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::Result; @@ -257,26 +257,7 @@ fn resolve_candidates<'t>( match query_tree { And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), - Phrase(words) => { - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for (left, right) in words.iter().filter_map(|w| w.as_ref()).tuple_windows() { - match ctx.word_pair_proximity_docids(left, right, 1)? { - Some(pair_docids) => { - if pair_docids.is_empty() { - return Ok(RoaringBitmap::new()); - } else if first_loop { - candidates = pair_docids; - first_loop = false; - } else { - candidates &= pair_docids; - } - } - None => return Ok(RoaringBitmap::new()), - } - } - Ok(candidates) - } + Phrase(words) => resolve_phrase(ctx, words), Or(_, ops) => { let mut candidates = RoaringBitmap::new(); for op in ops { From d187b32a2847f9a5887ffe7806b19def927ad7c3 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 23:30:58 +0530 Subject: [PATCH 1756/1889] Fix snapshots to use new phrase type --- milli/src/search/query_tree.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4da4b3317..25366461c 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1081,7 +1081,7 @@ mod test { OR AND OR - PHRASE ["word", "split"] + PHRASE [Some("word"), Some("split")] Tolerant { word: "wordsplit", max typo: 2 } Exact { word: "fish" } Tolerant { word: "wordsplitfish", max typo: 1 } @@ -1100,7 +1100,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" OR - PHRASE ["quickbrown", "fox"] + PHRASE [Some("quickbrown"), Some("fox")] PrefixTolerant { word: "quickbrownfox", max typo: 2 } "###); } @@ -1117,7 +1117,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "friends"] + PHRASE [Some("hey"), Some("friends")] Exact { word: "wooop" } "###); } @@ -1154,8 +1154,8 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "friends"] - PHRASE ["wooop", "wooop"] + PHRASE [Some("hey"), Some("friends")] + PHRASE [Some("wooop"), Some("wooop")] "###); } @@ -1203,7 +1203,7 @@ mod test { .unwrap(); insta::assert_debug_snapshot!(query_tree, @r###" - PHRASE ["hey", "my"] + PHRASE [Some("hey"), Some("my")] "###); } @@ -1268,7 +1268,7 @@ mod test { insta::assert_debug_snapshot!(query_tree, @r###" AND - PHRASE ["hey", "my"] + PHRASE [Some("hey"), Some("my")] Exact { word: "good" } "###); } From bb9ce3c5c57a7507c84af9a237b3e227d8832b97 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 13 Oct 2022 23:34:17 +0530 Subject: [PATCH 1757/1889] Run cargo fmt --- milli/src/search/criteria/mod.rs | 12 ++++++------ milli/tests/search/mod.rs | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 631dd2385..a6a0c7f92 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -426,12 +426,12 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result Date: Thu, 13 Oct 2022 23:54:49 +0530 Subject: [PATCH 1758/1889] Fix panic when phrase contains only one stop word and nothing else --- milli/src/search/criteria/mod.rs | 5 +++++ milli/tests/search/phrase_search.rs | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 
a6a0c7f92..3657df73e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -422,6 +422,11 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result Date: Thu, 20 Oct 2022 18:35:39 +0530 Subject: [PATCH 1759/1889] Simplify stop word checking in create_primitive_query --- milli/src/search/query_tree.rs | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 25366461c..5042f4762 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -4,7 +4,6 @@ use std::{fmt, mem}; use charabia::classifier::ClassifiedTokenIter; use charabia::{SeparatorKind, TokenKind}; -use fst::Set; use roaring::RoaringBitmap; use slice_group_by::GroupBy; @@ -269,8 +268,7 @@ impl<'a> QueryTreeBuilder<'a> { &self, query: ClassifiedTokenIter, ) -> Result> { - let stop_words = self.index.stop_words(self.rtxn)?; - let primitive_query = create_primitive_query(query, stop_words, self.words_limit); + let primitive_query = create_primitive_query(query, self.words_limit); if !primitive_query.is_empty() { let qt = create_query_tree( self, @@ -722,7 +720,6 @@ impl PrimitiveQueryPart { /// the primitive query is an intermediate state to build the query tree. fn create_primitive_query( query: ClassifiedTokenIter, - stop_words: Option>, words_limit: Option, ) -> PrimitiveQuery where @@ -747,13 +744,14 @@ where // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. if quoted { - if stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { + if let TokenKind::StopWord = token.kind { phrase.push(None) } else { phrase.push(Some(token.lemma().to_string())); } } else if peekable.peek().is_some() { - if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) { + if let TokenKind::StopWord = token.kind { + } else { primitive_query .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false)); } @@ -836,7 +834,7 @@ mod test { words_limit: Option, query: ClassifiedTokenIter, ) -> Result> { - let primitive_query = create_primitive_query(query, None, words_limit); + let primitive_query = create_primitive_query(query, words_limit); if !primitive_query.is_empty() { let qt = create_query_tree( self, From f1da623af3edbf0e5f5ffee9e00f4a7db0babbf0 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 20 Oct 2022 18:41:37 +0530 Subject: [PATCH 1760/1889] Add test for phrase search with stop words and all criteria at once Moved the actual test into a separate function used by both the existing test and the new test. 
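Editorial aside: the last few commits all revolve around the `Phrase(Vec<Option<String>>)` representation, where `None` marks a position previously occupied by a stop word. A toy model (hypothetical standalone code, not milli's actual types) of why a phrase made only of stop words needs the early return guarded above:

```rust
// Toy model of the phrase representation: `None` is a removed stop word.
fn searchable_words(phrase: &[Option<String>]) -> usize {
    phrase.iter().flatten().count()
}

fn main() {
    // "the quick fox" with "the" as a stop word:
    let phrase = vec![None, Some("quick".to_string()), Some("fox".to_string())];
    assert_eq!(searchable_words(&phrase), 2);

    // "the of" contains no searchable word at all; resolving such a phrase
    // is the case the panic fix above guards against.
    let only_stop_words: Vec<Option<String>> = vec![None, None];
    assert_eq!(searchable_words(&only_stop_words), 0);
}
```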
--- milli/tests/search/phrase_search.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs index 313833543..e255927d8 100644 --- a/milli/tests/search/phrase_search.rs +++ b/milli/tests/search/phrase_search.rs @@ -1,5 +1,6 @@ +use crate::search::Criterion::{Attribute, Exactness, Proximity}; use milli::update::{IndexerConfig, Settings}; -use milli::{Index, Search, TermsMatchingStrategy}; +use milli::{Criterion, Index, Search, TermsMatchingStrategy}; fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut wtxn = index.write_txn().unwrap(); @@ -12,9 +13,7 @@ fn set_stop_words(index: &Index, stop_words: &[&str]) { wtxn.commit().unwrap(); } -#[test] -fn test_phrase_search_with_stop_words() { - let criteria = []; +fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) { let index = super::setup_search_index_with_criteria(&criteria); // Add stop_words @@ -42,3 +41,15 @@ fn test_phrase_search_with_stop_words() { let result = search.execute().unwrap(); assert_eq!(result.documents_ids.len(), 0); } + +#[test] +fn test_phrase_search_with_stop_words_no_criteria() { + let criteria = []; + test_phrase_search_with_stop_words_given_criteria(&criteria); +} + +#[test] +fn test_phrase_search_with_stop_words_all_criteria() { + let criteria = [Proximity, Attribute, Exactness]; + test_phrase_search_with_stop_words_given_criteria(&criteria); +} From af33d22f2582f4ef40e9280100f8b72db6514daa Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 20 Oct 2022 19:02:08 +0530 Subject: [PATCH 1761/1889] Consecutive is false when at least 1 stop word is surrounded by words --- milli/src/search/criteria/proximity.rs | 29 +++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 5aa3cc8b3..5f414d84c 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -478,14 +478,29 @@ fn resolve_plane_sweep_candidates( } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); - for word in words.iter().filter_map(|w| w.as_ref()) { - let positions = match words_positions.get(word) { - Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), - None => return Ok(vec![]), - }; - groups_positions.push(positions); + let mut consecutive = true; + let mut was_last_word_a_stop_word = false; + for word in words.iter() { + if let Some(word) = word { + let positions = match words_positions.get(word) { + Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), + None => return Ok(vec![]), + }; + groups_positions.push(positions); + + if was_last_word_a_stop_word { + consecutive = false; + } + was_last_word_a_stop_word = false; + } else { + if !was_last_word_a_stop_word { + consecutive = false; + } + + was_last_word_a_stop_word = true; + } } - plane_sweep(groups_positions, true)? + plane_sweep(groups_positions, consecutive)? 
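// Editorial illustration (hedged, not part of the original patch): with the
// stop words {a, an, the, of} from the test above, the phrase
// "the use of force" reaches this point as
// [None, Some("use"), None, Some("force")]. Any `None` encountered by the
// loop below forces `consecutive` to end up false, so the plane sweep
// tolerates the positional gaps that the removed stop words leave around
// "use" and "force".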
} Or(_, ops) => { let mut result = Vec::new(); From 488d31ecdf7ce4a060e507505634e2a752b13cb0 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Thu, 20 Oct 2022 19:08:21 +0530 Subject: [PATCH 1762/1889] Run cargo fmt --- milli/tests/search/phrase_search.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs index e255927d8..e4f5d73f9 100644 --- a/milli/tests/search/phrase_search.rs +++ b/milli/tests/search/phrase_search.rs @@ -1,7 +1,8 @@ -use crate::search::Criterion::{Attribute, Exactness, Proximity}; use milli::update::{IndexerConfig, Settings}; use milli::{Criterion, Index, Search, TermsMatchingStrategy}; +use crate::search::Criterion::{Attribute, Exactness, Proximity}; + fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); From d8fed1f7a9778218b8cefdecaaa6a037e3289e84 Mon Sep 17 00:00:00 2001 From: unvalley Date: Mon, 10 Oct 2022 22:08:34 +0900 Subject: [PATCH 1763/1889] Add clippy job Add Run Clippy to bors.toml --- .github/workflows/rust.yml | 19 +++++++++++++++++++ bors.toml | 1 + 2 files changed, 20 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9939d3f24..e1e09211a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -48,6 +48,25 @@ jobs: command: test args: --release + clippy: + name: Run Clippy + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + components: clippy + - name: Cache dependencies + uses: Swatinem/rust-cache@v2.0.0 + - name: Run cargo clippy + uses: actions-rs/cargo@v1 + with: + command: clippy + args: --all-targets + fmt: name: Run Rustfmt runs-on: ubuntu-20.04 diff --git a/bors.toml b/bors.toml index 73324892f..8ba0eed94 100644 --- a/bors.toml +++ b/bors.toml @@ -2,6 +2,7 @@ status = [ 'Tests on ubuntu-20.04', 'Tests on macos-latest', 'Tests on windows-latest', + 'Run Clippy', 'Run Rustfmt', ] # 3 hours timeout From 811f156031bd74cac2d5a92acb66c89feb452217 Mon Sep 17 00:00:00 2001 From: unvalley Date: Mon, 10 Oct 2022 22:28:03 +0900 Subject: [PATCH 1764/1889] Execute cargo clippy --fix --- milli/src/index.rs | 2 +- milli/src/search/criteria/mod.rs | 6 +++++- milli/src/search/distinct/mod.rs | 4 ++-- milli/src/snapshot_tests.rs | 12 ++++++------ milli/src/update/delete_documents.rs | 8 ++++---- milli/src/update/index_documents/mod.rs | 4 ++-- milli/src/update/prefix_word_pairs/word_prefix.rs | 2 +- milli/tests/search/mod.rs | 8 ++++---- milli/tests/search/query_criteria.rs | 6 ++---- 9 files changed, 27 insertions(+), 25 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 8b1e4d8ff..5910a305c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1234,7 +1234,7 @@ pub(crate) mod tests { { let builder = IndexDocuments::new( wtxn, - &self, + self, &self.indexer_config, self.index_documents_config.clone(), |_| (), diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 1b46c8441..ab1823779 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -610,7 +610,11 @@ fn query_pair_proximity_docids( } (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_derivations(right, prefix, *typo, ctx.words_fst(), wdcache)?; +<<<<<<< HEAD all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], r_words, proximity) +======= + all_word_pair_proximity_docids(ctx, &[(left, 0)], r_words, proximity) +>>>>>>> 08fe530b (Execute cargo clippy --fix) } ( QueryKind::Tolerant { typo: l_typo, word: left }, @@ -866,7 +870,7 @@ pub mod test { let mut keys = word_docids.keys().collect::>(); keys.sort_unstable(); - let words_fst = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap(); + let words_fst = fst::Set::from_iter(keys).unwrap().map_data(Cow::Owned).unwrap(); TestContext { words_fst, diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index b6ed26917..3a46bb469 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -110,7 +110,7 @@ mod test { addition.execute().unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); - let fid = fields_map.id(&distinct).unwrap(); + let fid = fields_map.id(distinct).unwrap(); let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); let map = (0..documents.documents_count() as u32).collect(); @@ -133,7 +133,7 @@ mod test { let s = value.to_string(); assert!(seen.insert(s)); } - Value::Array(values) => values.into_iter().for_each(|value| test(seen, value)), + Value::Array(values) => values.iter().for_each(|value| test(seen, value)), } } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index bcb9805ea..d3fbfc285 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -314,8 +314,8 @@ pub fn snap_field_id_docid_facet_strings(index: &Index) -> String { pub fn snap_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let documents_ids = index.documents_ids(&rtxn).unwrap(); - let snap = display_bitmap(&documents_ids); - snap + + display_bitmap(&documents_ids) } pub fn snap_stop_words(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); @@ -326,8 +326,8 @@ pub fn snap_stop_words(index: &Index) -> String { pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); - let soft_deleted_documents_ids = display_bitmap(&soft_deleted_documents_ids); - soft_deleted_documents_ids + + display_bitmap(&soft_deleted_documents_ids) } pub fn snap_field_distributions(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); @@ -350,8 +350,8 @@ pub fn snap_fields_ids_map(index: &Index) -> String { pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); - let snap = display_bitmap(&geo_faceted_documents_ids); - snap + + display_bitmap(&geo_faceted_documents_ids) } pub fn snap_external_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index f1341c48c..1dd3f423b 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -641,7 +641,7 @@ mod tests { external_ids: &[&str], disable_soft_deletion: bool, ) -> Vec { - let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); + let external_document_ids = index.external_documents_ids(wtxn).unwrap(); let ids_to_delete: 
Vec = external_ids .iter() .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) @@ -858,7 +858,7 @@ mod tests { assert!(!results.documents_ids.is_empty()); for id in results.documents_ids.iter() { assert!( - !deleted_internal_ids.contains(&id), + !deleted_internal_ids.contains(id), "The document {} was supposed to be deleted", id ); @@ -922,7 +922,7 @@ mod tests { assert!(!results.documents_ids.is_empty()); for id in results.documents_ids.iter() { assert!( - !deleted_internal_ids.contains(&id), + !deleted_internal_ids.contains(id), "The document {} was supposed to be deleted", id ); @@ -986,7 +986,7 @@ mod tests { assert!(!results.documents_ids.is_empty()); for id in results.documents_ids.iter() { assert!( - !deleted_internal_ids.contains(&id), + !deleted_internal_ids.contains(id), "The document {} was supposed to be deleted", id ); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a121d3ae0..468a8c56f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -772,7 +772,7 @@ mod tests { let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); let (_id, obkv) = docs.iter().find(|(_id, kv)| kv.get(0) == Some(br#""kevin""#)).unwrap(); - let kevin_uuid: String = serde_json::from_slice(&obkv.get(1).unwrap()).unwrap(); + let kevin_uuid: String = serde_json::from_slice(obkv.get(1).unwrap()).unwrap(); drop(rtxn); // Second we send 1 document with the generated uuid, to erase the previous ones. @@ -1811,7 +1811,7 @@ mod tests { let long_word = "lol".repeat(1000); let doc1 = documents! {[{ "id": "1", - "title": long_word.clone(), + "title": long_word, }]}; index.add_documents(doc1).unwrap(); diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 53e421fac..62d4d4c03 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -574,7 +574,7 @@ mod tests { expected_prefixes: &[&str], ) { let mut actual_prefixes = vec![]; - trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), &search_start, |x| { + trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| { let s = String::from_utf8(x.to_owned()).unwrap(); actual_prefixes.push(s); }); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index c8b01648c..d5e321158 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -19,7 +19,7 @@ mod query_criteria; mod sort; mod typo_tolerance; -pub const TEST_QUERY: &'static str = "hello world america"; +pub const TEST_QUERY: &str = "hello world america"; pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; @@ -177,7 +177,7 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { { id = Some(document.id.clone()) } - } else if let Some((field, filter)) = filter.split_once("=") { + } else if let Some((field, filter)) = filter.split_once('=') { if field == "tag" && document.tag == filter { id = Some(document.id.clone()) } else if field == "asc_desc_rank" @@ -185,11 +185,11 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { { id = Some(document.id.clone()) } - } else if let Some(("asc_desc_rank", filter)) = filter.split_once("<") { + } else if let Some(("asc_desc_rank", filter)) = filter.split_once('<') { if document.asc_desc_rank < filter.parse().unwrap() { id = Some(document.id.clone()) } - } else if let 
Some(("asc_desc_rank", filter)) = filter.split_once(">") { + } else if let Some(("asc_desc_rank", filter)) = filter.split_once('>') { if document.asc_desc_rank > filter.parse().unwrap() { id = Some(document.id.clone()) } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 90b4d6362..3007a83ea 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -200,14 +200,12 @@ test_criterion!( #[test] fn criteria_mixup() { use Criterion::*; - let index = search::setup_search_index_with_criteria(&vec![ - Words, + let index = search::setup_search_index_with_criteria(&[Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, - Typo, - ]); + Typo]); #[rustfmt::skip] let criteria_mix = { From c7322f704c3049d3223caf7821b2c522f4be81ad Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 14 Oct 2022 23:44:10 +0900 Subject: [PATCH 1765/1889] Fix cargo clippy errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dont apply clippy for tests for now Fix clippy warnings of filter-parser package parent 8352febd646ec4bcf56a44161e5c4dce0e55111f author unvalley <38400669+unvalley@users.noreply.github.com> 1666325847 +0900 committer unvalley 1666791316 +0900 Update .github/workflows/rust.yml Co-authored-by: Clémentine Urquizar - curqui Allow clippy lint too_many_argments Allow clippy lint needless_collect Allow clippy lint too_many_arguments and type_complexity Fix for clippy warnings comparison_chains Fix for clippy warnings vec_init_then_push Allow clippy lint should_implement_trait Allow clippy lint drop_non_drop Fix lifetime clipy warnings in filter-paprser Execute cargo fmt Fix clippy remaining warnings Fix clippy remaining warnings again and allow lint on each place --- .github/workflows/rust.yml | 1 - filter-parser/src/condition.rs | 7 ++----- filter-parser/src/lib.rs | 14 +++++++------- filter-parser/src/value.rs | 2 +- milli/src/lib.rs | 2 -- milli/src/search/criteria/asc_desc.rs | 2 ++ milli/src/search/criteria/attribute.rs | 1 + milli/src/search/criteria/mod.rs | 5 +---- milli/src/search/distinct/facet_distinct.rs | 1 + milli/src/search/facet/filter.rs | 9 +++++---- milli/src/search/matches/mod.rs | 5 +---- milli/src/snapshot_tests.rs | 6 +++--- milli/src/update/available_documents_ids.rs | 1 + .../index_documents/extract/extract_geo_points.rs | 1 + milli/src/update/index_documents/extract/mod.rs | 4 ++++ milli/src/update/index_documents/mod.rs | 1 + milli/src/update/prefix_word_pairs/prefix_word.rs | 7 +++---- milli/src/update/prefix_word_pairs/word_prefix.rs | 2 ++ milli/tests/search/query_criteria.rs | 6 ++++-- 19 files changed, 40 insertions(+), 37 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e1e09211a..e640ee1ef 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -65,7 +65,6 @@ jobs: uses: actions-rs/cargo@v1 with: command: clippy - args: --all-targets fmt: name: Run Rustfmt diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index e967bd074..735ffec0e 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -48,17 +48,14 @@ pub fn parse_condition(input: Span) -> IResult { pub fn parse_exists(input: Span) -> IResult { let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; - Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) + Ok((input, FilterCondition::Condition { fid: key, op: Exists })) } /// exist = value "NOT" WS+ 
"EXISTS" pub fn parse_not_exists(input: Span) -> IResult { let (input, key) = parse_value(input)?; let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; - Ok(( - input, - FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key.into(), op: Exists })), - )) + Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Exists })))) } /// to = value value "TO" WS+ value diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 33025e6e9..9a3e0f1f8 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -168,7 +168,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) } /// value_list = (value ("," value)* ","?)? -fn parse_value_list<'a>(input: Span<'a>) -> IResult>> { +fn parse_value_list(input: Span) -> IResult> { let (input, first_value) = opt(parse_value)(input)?; if let Some(first_value) = first_value { let value_list_el_parser = preceded(ws(tag(",")), parse_value); @@ -335,17 +335,17 @@ fn parse_error_reserved_keyword(input: Span) -> IResult { Ok(result) => Ok(result), Err(nom::Err::Error(inner) | nom::Err::Failure(inner)) => match inner.kind() { ErrorKind::ExpectedValue(ExpectedValueKind::ReservedKeyword) => { - return Err(nom::Err::Failure(inner)); + Err(nom::Err::Failure(inner)) } - _ => return Err(nom::Err::Error(inner)), + _ => Err(nom::Err::Error(inner)), }, - Err(e) => { - return Err(e); - } + Err(e) => Err(e), } } -/// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to +/** +primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to +*/ fn parse_primary(input: Span, depth: usize) -> IResult { if depth > MAX_FILTER_DEPTH { return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index d015018c1..73ef61480 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -78,7 +78,7 @@ pub fn word_exact<'a, 'b: 'a>(tag: &'b str) -> impl Fn(Span<'a>) -> IResult<'a, } /// value = WS* ( word | singleQuoted | doubleQuoted) WS+ -pub fn parse_value<'a>(input: Span<'a>) -> IResult> { +pub fn parse_value(input: Span) -> IResult { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 28f048b8a..c33aae9eb 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,6 +1,4 @@ #![cfg_attr(all(test, fuzzing), feature(no_coverage))] -#![allow(clippy::reversed_empty_ranges)] -#![allow(clippy::too_many_arguments)] #[macro_use] pub mod documents; diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 92c73709b..fbcf1d3fe 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -242,6 +242,7 @@ fn iterative_facet_number_ordered_iter<'t>( // The itertools GroupBy iterator doesn't provide an owned version, we are therefore // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 + #[allow(clippy::needless_collect)] let vec: Vec<_> = iter .group_by(|(_, v)| *v) .into_iter() @@ -284,6 +285,7 @@ fn iterative_facet_string_ordered_iter<'t>( // The itertools GroupBy iterator doesn't provide an owned version, we are therefore // required to collect the result into an owned collection (a Vec). 
// https://github.com/rust-itertools/itertools/issues/499 + #[allow(clippy::needless_collect)] let vec: Vec<_> = iter .group_by(|(_, v)| *v) .into_iter() diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 7e55a1038..4d2437027 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -179,6 +179,7 @@ impl<'t> Criterion for Attribute<'t> { /// QueryPositionIterator is an Iterator over positions of a Query, /// It contains iterators over words positions. struct QueryPositionIterator<'t> { + #[allow(clippy::type_complexity)] inner: Vec> + 't>>>, } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index ab1823779..09d0908e1 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -96,6 +96,7 @@ pub trait Context<'c> { &self, docid: DocumentId, ) -> heed::Result>; + #[allow(clippy::type_complexity)] fn word_position_iterator( &self, word: &str, @@ -610,11 +611,7 @@ fn query_pair_proximity_docids( } (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_derivations(right, prefix, *typo, ctx.words_fst(), wdcache)?; -<<<<<<< HEAD all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], r_words, proximity) -======= - all_word_pair_proximity_docids(ctx, &[(left, 0)], r_words, proximity) ->>>>>>> 08fe530b (Execute cargo clippy --fix) } ( QueryKind::Tolerant { typo: l_typo, word: left }, diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 1725346be..3ed683823 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -123,6 +123,7 @@ impl<'a> FacetDistinctIter<'a> { } } +#[allow(clippy::drop_non_drop)] fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 1dc01566e..40986fea0 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -100,10 +100,10 @@ impl<'a> Filter<'a> { } } - if ors.len() > 1 { - ands.push(FilterCondition::Or(ors)); - } else if ors.len() == 1 { - ands.push(ors.pop().unwrap()); + match ors.len() { + 1 => ands.push(ors.pop().unwrap()), + n if n > 1 => ands.push(FilterCondition::Or(ors)), + _ => (), } } Either::Right(rule) => { @@ -128,6 +128,7 @@ impl<'a> Filter<'a> { Ok(Some(Self { condition: and })) } + #[allow(clippy::should_implement_trait)] pub fn from_str(expression: &'a str) -> Result> { let condition = match FilterCondition::parse(expression) { Ok(Some(fc)) => Ok(fc), diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index b76ddef99..ec47f848d 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -125,10 +125,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { words_positions: &mut impl Iterator)>, matches: &mut Vec, ) -> bool { - let mut potential_matches = Vec::new(); - - // Add first match to potential matches. 
- potential_matches.push((token_position, word_position, partial.char_len())); + let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; for (token_position, word_position, word) in words_positions { partial = match partial.match_token(word) { diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index d3fbfc285..46972deba 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -314,7 +314,7 @@ pub fn snap_field_id_docid_facet_strings(index: &Index) -> String { pub fn snap_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let documents_ids = index.documents_ids(&rtxn).unwrap(); - + display_bitmap(&documents_ids) } pub fn snap_stop_words(index: &Index) -> String { @@ -326,7 +326,7 @@ pub fn snap_stop_words(index: &Index) -> String { pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); - + display_bitmap(&soft_deleted_documents_ids) } pub fn snap_field_distributions(index: &Index) -> String { @@ -350,7 +350,7 @@ pub fn snap_fields_ids_map(index: &Index) -> String { pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); - + display_bitmap(&geo_faceted_documents_ids) } pub fn snap_external_documents_ids(index: &Index) -> String { diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs index 3e4ec5600..784bee5a7 100644 --- a/milli/src/update/available_documents_ids.rs +++ b/milli/src/update/available_documents_ids.rs @@ -21,6 +21,7 @@ impl AvailableDocumentsIds { let iter = match last_id.checked_add(1) { Some(id) => id..=u32::max_value(), + #[allow(clippy::reversed_empty_ranges)] None => 1..=0, // empty range iterator }; diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index c75b60c60..55044e712 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -51,6 +51,7 @@ pub fn extract_geo_points( ) .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; + #[allow(clippy::drop_non_drop)] let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else if lat.is_none() && lng.is_some() { diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c0f12e9ee..e696ed44b 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -33,6 +33,7 @@ use crate::{FieldId, Result}; /// Extract data for each databases from obkv documents in parallel. /// Send data in grenad file over provided Sender. 
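Before the extractor below, a note on the lint it allows: clippy's `too_many_arguments` fires on functions taking more than seven parameters, and this series silences it per item instead of crate-wide (the `#![allow]` lines were removed from milli/src/lib.rs above). A hedged, self-contained illustration, not the real extractor signature:

    // Scoping the allowance to one function keeps the lint active everywhere else.
    #[allow(clippy::too_many_arguments)]
    fn spawn_extraction(a: u8, b: u8, c: u8, d: u8, e: u8, f: u8, g: u8, h: u8) -> u16 {
        // eight parameters, one over clippy's default limit of seven
        [a, b, c, d, e, f, g, h].iter().map(|&x| u16::from(x)).sum()
    }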
+#[allow(clippy::too_many_arguments)] pub(crate) fn data_from_obkv_documents( original_obkv_chunks: impl Iterator>> + Send, flattened_obkv_chunks: impl Iterator>> + Send, @@ -53,6 +54,7 @@ pub(crate) fn data_from_obkv_documents( }) .collect::>()?; + #[allow(clippy::type_complexity)] let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks .par_bridge() .map(|flattened_obkv_chunks| { @@ -217,6 +219,8 @@ fn send_original_documents_data( /// - docid_fid_facet_numbers /// - docid_fid_facet_strings /// - docid_fid_facet_exists +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] fn send_and_extract_flattened_documents_data( flattened_documents_chunk: Result>, indexer: GrenadParameters, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 468a8c56f..e9f5c2d38 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -598,6 +598,7 @@ where } /// Run the word prefix docids update operation. +#[allow(clippy::too_many_arguments)] fn execute_word_prefix_docids( txn: &mut heed::RwTxn, reader: grenad::Reader>, diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 952e02558..60e2e554e 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -12,6 +12,7 @@ use crate::update::prefix_word_pairs::{ }; use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; +#[allow(clippy::too_many_arguments)] #[logging_timer::time] pub fn index_prefix_word_database( wtxn: &mut heed::RwTxn, @@ -38,8 +39,7 @@ pub fn index_prefix_word_database( for proximity in 1..max_proximity { for prefix in common_prefixes.iter() { - let mut prefix_key = vec![]; - prefix_key.push(proximity); + let mut prefix_key = vec![proximity]; prefix_key.extend_from_slice(prefix.as_bytes()); let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; // This is the core of the algorithm @@ -84,8 +84,7 @@ pub fn index_prefix_word_database( for proximity in 1..max_proximity { for prefix in new_prefixes.iter() { - let mut prefix_key = vec![]; - prefix_key.push(proximity); + let mut prefix_key = vec![proximity]; prefix_key.extend_from_slice(prefix.as_bytes()); let mut db_iter = word_pair_proximity_docids .as_polymorph() diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 62d4d4c03..71a2a2915 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -176,6 +176,7 @@ use crate::update::prefix_word_pairs::{ }; use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; +#[allow(clippy::too_many_arguments)] #[logging_timer::time] pub fn index_word_prefix_database( wtxn: &mut heed::RwTxn, @@ -385,6 +386,7 @@ can be inserted into the database in sorted order. 
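The `vec![proximity]` rewrites above are instances of clippy's `vec_init_then_push`: a `Vec::new()` followed by a single `push` becomes a `vec![...]` literal. The key layout is unchanged, as this simplified standalone sketch of the pattern shows:

    // One proximity byte followed by the prefix's UTF-8 bytes, so a prefix
    // iteration only visits pairs for that (proximity, prefix) combination.
    fn make_prefix_key(proximity: u8, prefix: &str) -> Vec<u8> {
        let mut prefix_key = vec![proximity]; // was: Vec::new() then push(proximity)
        prefix_key.extend_from_slice(prefix.as_bytes());
        prefix_key
    }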
When it is flushed, it calls struct PrefixAndProximityBatch { proximity: u8, word1: Vec, + #[allow(clippy::type_complexity)] batch: Vec<(Vec, Vec>)>, } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 3007a83ea..d4aa859a4 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -200,12 +200,14 @@ test_criterion!( #[test] fn criteria_mixup() { use Criterion::*; - let index = search::setup_search_index_with_criteria(&[Words, + let index = search::setup_search_index_with_criteria(&[ + Words, Attribute, Desc(S("asc_desc_rank")), Exactness, Proximity, - Typo]); + Typo, + ]); #[rustfmt::skip] let criteria_mix = { From 752d031010fecdf48f823bee809fb2dfdae8be26 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Wed, 26 Oct 2022 23:07:20 +0530 Subject: [PATCH 1766/1889] Update phrase search to use new `execute` method --- milli/tests/search/phrase_search.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs index e4f5d73f9..ca5eaad48 100644 --- a/milli/tests/search/phrase_search.rs +++ b/milli/tests/search/phrase_search.rs @@ -10,7 +10,7 @@ fn set_stop_words(index: &Index, stop_words: &[&str]) { let mut builder = Settings::new(&mut wtxn, &index, &config); let stop_words = stop_words.into_iter().map(|s| s.to_string()).collect(); builder.set_stop_words(stop_words); - builder.execute(|_| ()).unwrap(); + builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); } From d35afa0cf532b357a710bbe53983e3508a11be02 Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak Date: Wed, 26 Oct 2022 23:10:48 +0530 Subject: [PATCH 1767/1889] Change consecutive phrase search grouping logic Co-authored-by: ManyTheFish --- milli/src/search/criteria/proximity.rs | 42 ++++++++++++++------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 5f414d84c..b9cf47c8e 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -4,6 +4,7 @@ use std::mem::take; use log::debug; use roaring::RoaringBitmap; +use slice_group_by::GroupBy; use super::{ query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, @@ -478,29 +479,30 @@ fn resolve_plane_sweep_candidates( } Phrase(words) => { let mut groups_positions = Vec::with_capacity(words.len()); - let mut consecutive = true; - let mut was_last_word_a_stop_word = false; - for word in words.iter() { - if let Some(word) = word { - let positions = match words_positions.get(word) { - Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(), - None => return Ok(vec![]), - }; - groups_positions.push(positions); - if was_last_word_a_stop_word { - consecutive = false; - } - was_last_word_a_stop_word = false; - } else { - if !was_last_word_a_stop_word { - consecutive = false; - } - - was_last_word_a_stop_word = true; + // group stop_words together. + for words in words.linear_group_by_key(Option::is_none) { + // skip if it's a group of stop words. + if matches!(words.first(), None | Some(None)) { + continue; } + // make a consecutive plane-sweep on the subgroup of words. 
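To make the new grouping logic concrete before the rest of the hunk, here is a hedged sketch of only the splitting step, assuming the `slice_group_by::GroupBy` trait the patch imports; the plane-sweep calls themselves are omitted:

    use slice_group_by::GroupBy;

    // A phrase is a slice of Option<String>, where None marks a removed stop
    // word. linear_group_by_key(Option::is_none) yields maximal consecutive
    // runs of real words and runs of stop words; each run of real words is
    // then resolved with its own consecutive plane sweep.
    fn real_word_runs(words: &[Option<String>]) -> Vec<Vec<&str>> {
        let mut runs = Vec::new();
        for group in words.linear_group_by_key(Option::is_none) {
            if matches!(group.first(), None | Some(None)) {
                continue; // a run of stop words
            }
            runs.push(group.iter().map(|w| w.as_deref().unwrap()).collect());
        }
        runs
    }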
+ let mut subgroup = Vec::with_capacity(words.len()); + for word in words.into_iter().map(|w| w.as_deref().unwrap()) { + match words_positions.get(word) { + Some(positions) => { + subgroup.push(positions.iter().map(|p| (p, 0, p)).collect()) + } + None => return Ok(vec![]), + } + } + groups_positions.push(plane_sweep(subgroup, true)?); + } + match groups_positions.len() { + 0 => vec![], + 1 => groups_positions.pop().unwrap(), + _ => plane_sweep(groups_positions, false)?, + } - plane_sweep(groups_positions, consecutive)? } Or(_, ops) => { let mut result = Vec::new(); From f4ec1abb9bd3264401793492dc03cad964df5029 Mon Sep 17 00:00:00 2001 From: unvalley Date: Thu, 27 Oct 2022 23:58:13 +0900 Subject: [PATCH 1768/1889] Fix all clippy errors after conflicts --- filter-parser/src/lib.rs | 4 +- milli/src/heed_codec/facet/mod.rs | 3 +- .../search/facet/facet_distribution_iter.rs | 13 +++-- milli/src/search/facet/facet_range_search.rs | 8 +-- .../src/search/facet/facet_sort_ascending.rs | 5 +- .../src/search/facet/facet_sort_descending.rs | 7 +-- milli/src/search/facet/mod.rs | 7 +-- milli/src/update/delete_documents.rs | 4 +- milli/src/update/facet/bulk.rs | 4 +- milli/src/update/facet/delete.rs | 2 +- milli/src/update/facet/incremental.rs | 50 +++++++++---------- 11 files changed, 53 insertions(+), 54 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 9a3e0f1f8..c595cf827 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -401,7 +401,7 @@ pub mod tests { fn parse() { use FilterCondition as Fc; - fn p<'a>(s: &'a str) -> impl std::fmt::Display + 'a { + fn p(s: &str) -> impl std::fmt::Display + '_ { Fc::parse(s).unwrap().unwrap() } @@ -494,7 +494,7 @@ pub mod tests { fn error() { use FilterCondition as Fc; - fn p<'a>(s: &'a str) -> impl std::fmt::Display + 'a { + fn p(s: &str) -> impl std::fmt::Display + '_ { Fc::parse(s).unwrap_err().to_string() } diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 4609bfe7f..d36ec8434 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -88,8 +88,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { type EItem = FacetGroupValue; fn bytes_encode(value: &'a Self::EItem) -> Option> { - let mut v = vec![]; - v.push(value.size); + let mut v = vec![value.size]; CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); Some(Cow::Owned(v)) } } diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 9cd85b667..6e209c7aa 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -38,9 +38,9 @@ where if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)?
{ fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; - return Ok(()); + Ok(()) } else { - return Ok(()); + Ok(()) } } @@ -84,7 +84,7 @@ where } } } - return Ok(ControlFlow::Continue(())); + Ok(ControlFlow::Continue(())) } fn iterate( &mut self, @@ -98,7 +98,7 @@ where } let starting_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound }; - let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size); + let iter = self.db.range(self.rtxn, &(&starting_key..)).unwrap().take(group_size); for el in iter { let (key, value) = el.unwrap(); @@ -108,7 +108,7 @@ where return Ok(ControlFlow::Break(())); } let docids_in_common = value.bitmap & candidates; - if docids_in_common.len() > 0 { + if !docids_in_common.is_empty() { let cf = self.iterate( &docids_in_common, level - 1, @@ -121,8 +121,7 @@ where } } } - - return Ok(ControlFlow::Continue(())); + Ok(ControlFlow::Continue(())) } } diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 07300e920..e8eeab1cc 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -60,7 +60,7 @@ where f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) } else { - return Ok(()); + Ok(()) } } @@ -77,7 +77,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; - let iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size); for el in iter { let (key, value) = el?; // the right side of the iter range is unbounded, so we need to make sure that we are not iterating @@ -145,7 +145,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let left_key = FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound }; - let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + let mut iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size); // We iterate over the range while keeping in memory the previous value let (mut previous_key, mut previous_value) = iter.next().unwrap()?; @@ -348,6 +348,7 @@ mod tests { &mut docids, ) .unwrap(); + #[allow(clippy::format_push_string)] results.push_str(&format!("{}\n", display_bitmap(&docids))); } milli_snap!(results, format!("included_{i}")); @@ -366,6 +367,7 @@ mod tests { &mut docids, ) .unwrap(); + #[allow(clippy::format_push_string)] results.push_str(&format!("{}\n", display_bitmap(&docids))); } milli_snap!(results, format!("excluded_{i}")); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 2f1f73db3..552795981 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -50,6 +50,7 @@ struct AscendingFacetSort<'t, 'e> { rtxn: &'t heed::RoTxn<'e>, db: heed::Database, FacetGroupValueCodec>, field_id: u16, + #[allow(clippy::type_complexity)] stack: Vec<( RoaringBitmap, std::iter::Take< @@ -91,9 +92,9 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { } let starting_key_below = FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound }; - let iter = match self.db.range(&self.rtxn, &(starting_key_below..)) { + let iter = match self.db.range(self.rtxn, &(starting_key_below..)) { 
Ok(iter) => iter, - Err(e) => return Some(Err(e.into())), + Err(e) => return Some(Err(e)), } .take(group_size as usize); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 5f09d708b..6f073b62a 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -39,6 +39,7 @@ struct DescendingFacetSort<'t> { rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, field_id: u16, + #[allow(clippy::type_complexity)] stack: Vec<( RoaringBitmap, std::iter::Take< @@ -54,7 +55,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { fn next(&mut self) -> Option { 'outer: loop { let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?; - while let Some(result) = deepest_iter.next() { + for result in deepest_iter.by_ref() { let ( FacetGroupKey { level, left_bound, field_id }, FacetGroupValue { size: group_size, mut bitmap }, @@ -100,11 +101,11 @@ impl<'t> Iterator for DescendingFacetSort<'t> { .db .remap_key_type::>() .rev_range( - &self.rtxn, + self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow), ) { Ok(iter) => iter, - Err(e) => return Some(Err(e.into())), + Err(e) => return Some(Err(e)), } .take(group_size as usize); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ccf40d6aa..7dfdcdb94 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -73,7 +73,7 @@ pub(crate) fn get_highest_level<'t>( let field_id_prefix = &field_id.to_be_bytes(); Ok(db .as_polymorph() - .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, field_id_prefix)? + .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, field_id_prefix)? .next() .map(|el| { let (key, _) = el.unwrap(); @@ -105,12 +105,9 @@ pub(crate) mod tests { pub fn get_random_looking_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); let mut txn = index.env.write_txn().unwrap(); - let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); - let keys = - std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); - for (_i, key) in keys.into_iter().enumerate() { + for (_i, key) in std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 1dd3f423b..a6a4ea609 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -138,7 +138,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // the `soft_deleted_documents_ids` bitmap and early exit. let size_used = self.index.used_size()?; let map_size = self.index.env.map_size()? 
as u64; - let nb_documents = self.index.number_of_documents(&self.wtxn)?; + let nb_documents = self.index.number_of_documents(self.wtxn)?; let nb_soft_deleted = soft_deleted_docids.len(); let percentage_available = 100 - (size_used * 100 / map_size); @@ -474,7 +474,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; let facet_values = remove_docids_from_field_id_docid_facet_value( - &self.index, + self.index, self.wtxn, facet_type, field_id, diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index ea0a7d3d7..2a4c7f49a 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -111,7 +111,7 @@ impl FacetsUpdateBulkInner { } for &field_id in field_ids.iter() { - let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; + let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?; handle_all_docids(wtxn, field_id, all_docids)?; @@ -341,7 +341,7 @@ impl FacetsUpdateBulkInner { handle_group(&bitmaps, left_bounds.first().unwrap())?; } } - return Ok(sub_writers); + Ok(sub_writers) } } diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index 2bc54c7c1..9bec2d911 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -100,7 +100,7 @@ impl<'i, 'b> FacetsDelete<'i, 'b> { max_group_size: self.max_group_size, }; for facet_value in affected_facet_values { - inc.delete(wtxn, field_id, facet_value.as_slice(), &self.docids_to_delete)?; + inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?; } } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 2558c81a3..04d702987 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -127,7 +127,7 @@ impl FacetsUpdateIncrementalInner { if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; Ok(( - FacetGroupKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(key_bytes) .ok_or(Error::Encoding)? .into_owned(), value, @@ -146,11 +146,11 @@ impl FacetsUpdateIncrementalInner { .as_polymorph() .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( txn, - &prefix.as_slice(), + prefix.as_slice(), )?; let (key_bytes, value) = iter.next().unwrap()?; Ok(( - FacetGroupKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(key_bytes) .ok_or(Error::Encoding)? 
.into_owned(), value, @@ -185,15 +185,15 @@ impl FacetsUpdateIncrementalInner { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, &level0_prefix)?; + .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, &level0_prefix)?; if iter.next().is_none() { drop(iter); self.db.put(txn, &key, &value)?; - return Ok(InsertionResult::Insert); + Ok(InsertionResult::Insert) } else { drop(iter); - let old_value = self.db.get(&txn, &key)?; + let old_value = self.db.get(txn, &key)?; match old_value { Some(mut updated_value) => { // now merge the two @@ -236,7 +236,7 @@ impl FacetsUpdateIncrementalInner { let max_group_size = self.max_group_size; - let result = self.insert_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; + let result = self.insert_in_level(txn, field_id, level - 1, &(*facet_value), docids)?; // level below inserted an element let (insertion_key, insertion_value) = @@ -312,13 +312,13 @@ impl FacetsUpdateIncrementalInner { }; let mut iter = - self.db.range(&txn, &(start_key..))?.take((size_left as usize) + (size_right as usize)); + self.db.range(txn, &(start_key..))?.take((size_left as usize) + (size_right as usize)); let group_left = { let mut values_left = RoaringBitmap::new(); let mut i = 0; - while let Some(next) = iter.next() { + for next in iter.by_ref() { let (_key, value) = next?; i += 1; values_left |= &value.bitmap; @@ -339,7 +339,7 @@ impl FacetsUpdateIncrementalInner { FacetGroupValue { bitmap: mut values_right, .. }, ) = iter.next().unwrap()?; - while let Some(next) = iter.next() { + for next in iter.by_ref() { let (_, value) = next?; values_right |= &value.bitmap; } @@ -359,7 +359,7 @@ impl FacetsUpdateIncrementalInner { } /// Insert the given facet value and corresponding document ids in the database. - pub fn insert<'a, 't>( + pub fn insert<'t>( &self, txn: &'t mut RwTxn, field_id: u16, @@ -371,7 +371,7 @@ impl FacetsUpdateIncrementalInner { } let group_size = self.group_size; - let highest_level = get_highest_level(&txn, self.db, field_id)?; + let highest_level = get_highest_level(txn, self.db, field_id)?; let result = self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; @@ -391,7 +391,7 @@ impl FacetsUpdateIncrementalInner { let size_highest_level = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)? 
.count(); if size_highest_level < self.group_size as usize * self.min_level_size as usize { @@ -401,7 +401,7 @@ impl FacetsUpdateIncrementalInner { let mut groups_iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?; + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &highest_level_prefix)?; let nbr_new_groups = size_highest_level / self.group_size as usize; let nbr_leftover_elements = size_highest_level % self.group_size as usize; @@ -412,7 +412,7 @@ impl FacetsUpdateIncrementalInner { let mut values = RoaringBitmap::new(); for _ in 0..group_size { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) .ok_or(Error::Encoding)?; if first_key.is_none() { @@ -435,7 +435,7 @@ impl FacetsUpdateIncrementalInner { let mut values = RoaringBitmap::new(); for _ in 0..nbr_leftover_elements { let (key_bytes, value_i) = groups_iter.next().unwrap()?; - let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) .ok_or(Error::Encoding)?; if first_key.is_none() { @@ -494,7 +494,7 @@ impl FacetsUpdateIncrementalInner { let (deletion_key, mut bitmap) = self.find_insertion_key_value(field_id, level, facet_value, txn)?; - let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; + let result = self.delete_in_level(txn, field_id, level - 1, &(*facet_value), docids)?; let mut decrease_size = false; let next_key = match result { @@ -547,13 +547,13 @@ impl FacetsUpdateIncrementalInner { docids: &RoaringBitmap, ) -> Result { let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; - let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; + let mut bitmap = self.db.get(txn, &key)?.unwrap().bitmap; bitmap -= docids; if bitmap.is_empty() { let mut next_key = None; if let Some((next, _)) = - self.db.remap_data_type::().get_greater_than(&txn, &key)? + self.db.remap_data_type::().get_greater_than(txn, &key)? { if next.field_id == field_id && next.level == 0 { next_key = Some(next.left_bound.to_vec()); @@ -567,7 +567,7 @@ impl FacetsUpdateIncrementalInner { } } - pub fn delete<'a, 't>( + pub fn delete<'t>( &self, txn: &'t mut RwTxn, field_id: u16, @@ -582,7 +582,7 @@ impl FacetsUpdateIncrementalInner { { return Ok(()); } - let highest_level = get_highest_level(&txn, self.db, field_id)?; + let highest_level = get_highest_level(txn, self.db, field_id)?; let result = self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; @@ -603,7 +603,7 @@ impl FacetsUpdateIncrementalInner { || self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)? 
.count() >= self.min_level_size as usize { @@ -614,7 +614,7 @@ impl FacetsUpdateIncrementalInner { .db .as_polymorph() .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)?; - while let Some(el) = iter.next() { + for el in iter.by_ref() { let (k, _) = el?; to_delete.push( FacetGroupKeyCodec::::bytes_decode(k) @@ -640,7 +640,7 @@ impl<'a> FacetGroupKey<&'a [u8]> { } } -impl<'a> FacetGroupKey> { +impl FacetGroupKey> { pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { FacetGroupKey { field_id: self.field_id, @@ -804,7 +804,7 @@ mod tests { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); index.verify_structure_validity(&txn, 0); - index.insert(&mut txn, 0, &(&(i as f64)), &bitmap); + index.insert(&mut txn, 0, &(i as f64), &bitmap); } for i in (200..256).into_iter().rev() { From f3c0b05ae8cc2767e4989f5426356de1f0496d4a Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 28 Oct 2022 09:32:31 +0900 Subject: [PATCH 1769/1889] Fix rust fmt --- milli/src/search/facet/facet_sort_descending.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 6f073b62a..12767c64d 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -100,10 +100,8 @@ impl<'t> Iterator for DescendingFacetSort<'t> { let iter = match self .db .remap_key_type::>() - .rev_range( - self.rtxn, - &(Bound::Included(starting_key_below), end_key_kelow), - ) { + .rev_range(self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow)) + { Ok(iter) => iter, Err(e) => return Some(Err(e)), } From a1d7ed1258f052c3a15610aa5fec885c263d1516 Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 28 Oct 2022 22:07:18 +0900 Subject: [PATCH 1770/1889] fix clippy error and remove clippy job from ci Remove clippy job Fix clippy error type_complexity Restore ambiguous change --- .github/workflows/rust.yml | 18 ------------------ bors.toml | 1 - milli/src/update/facet/bulk.rs | 10 ++++++++-- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index e640ee1ef..9939d3f24 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -48,24 +48,6 @@ jobs: command: test args: --release - clippy: - name: Run Clippy - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable - override: true - components: clippy - - name: Cache dependencies - uses: Swatinem/rust-cache@v2.0.0 - - name: Run cargo clippy - uses: actions-rs/cargo@v1 - with: - command: clippy - fmt: name: Run Rustfmt runs-on: ubuntu-20.04 diff --git a/bors.toml b/bors.toml index 8ba0eed94..73324892f 100644 --- a/bors.toml +++ b/bors.toml @@ -2,7 +2,6 @@ status = [ 'Tests on ubuntu-20.04', 'Tests on macos-latest', 'Tests on windows-latest', - 'Run Clippy', 'Run Rustfmt', ] # 3 hours timeout diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 2a4c7f49a..9dd62f49f 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -196,7 +196,10 @@ impl FacetsUpdateBulkInner { &self, rtxn: &'t RoTxn, field_id: u16, - handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + #[allow(clippy::type_complexity)] handle_group: &mut dyn FnMut( + &[RoaringBitmap], + &'t [u8], + ) -> Result<()>, ) -> Result<()> { // we read the elements one by one and // 1. 
keep track of the left bound @@ -250,7 +253,10 @@ impl FacetsUpdateBulkInner { rtxn: &'t RoTxn, field_id: u16, level: u8, - handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + #[allow(clippy::type_complexity)] handle_group: &mut dyn FnMut( + &[RoaringBitmap], + &'t [u8], + ) -> Result<()>, ) -> Result>> { if level == 0 { self.read_level_0(rtxn, field_id, handle_group)?; From 03eb5d87c17c545dc28aa6d886e5e6ce0befb9b7 Mon Sep 17 00:00:00 2001 From: Samyak Sarnayak Date: Fri, 28 Oct 2022 19:32:05 +0530 Subject: [PATCH 1771/1889] Only call plane_sweep on subgroups when 2 or more are present --- milli/src/search/criteria/proximity.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index b9cf47c8e..6b09ee2fe 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -496,7 +496,11 @@ fn resolve_plane_sweep_candidates( None => return Ok(vec![]), } } - groups_positions.push(plane_sweep(subgroup, true)?); + match subgroup.len() { + 0 => {}, + 1 => groups_positions.push(subgroup.pop().unwrap()), + _ => groups_positions.push(plane_sweep(subgroup, true)?), + } } match groups_positions.len() { 0 => vec![], From ecb88143f9e5b1be49db8a0d1d59fa15b47a0c99 Mon Sep 17 00:00:00 2001 From: Samyak Sarnayak Date: Fri, 28 Oct 2022 19:37:02 +0530 Subject: [PATCH 1772/1889] Run cargo fmt --- milli/src/search/criteria/proximity.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 6b09ee2fe..d51047821 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -497,7 +497,7 @@ fn resolve_plane_sweep_candidates( } } match subgroup.len() { - 0 => {}, + 0 => {} 1 => groups_positions.push(subgroup.pop().unwrap()), _ => groups_positions.push(plane_sweep(subgroup, true)?), } From d53a80b408617c1a64c565afc427ff3c1de4a66b Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 28 Oct 2022 23:41:35 +0900 Subject: [PATCH 1773/1889] Fix clippy error --- milli/src/update/facet/bulk.rs | 6 ++++-- milli/src/update/facet/incremental.rs | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 9dd62f49f..01a59c1f3 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -192,11 +192,12 @@ impl FacetsUpdateBulkInner { Ok((subwriters, all_docids)) } + #[allow(clippy::type_complexity)] fn read_level_0<'t>( &self, rtxn: &'t RoTxn, field_id: u16, - #[allow(clippy::type_complexity)] handle_group: &mut dyn FnMut( + handle_group: &mut dyn FnMut( &[RoaringBitmap], &'t [u8], ) -> Result<()>, @@ -248,12 +249,13 @@ impl FacetsUpdateBulkInner { /// ## Returns: /// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` /// that must be inserted into the database. 
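Since the bulk facet writer above is dense, a simplified sketch of the level construction its doc comment describes may help; plain vectors stand in for grenad readers and LMDB, so this is illustrative only:

    use roaring::RoaringBitmap;

    // Every `group_size` consecutive entries of level N fold into one entry
    // of level N + 1 that keeps the group's left bound and the union of its
    // bitmaps, so higher levels summarize ever-wider ranges of the level below.
    fn compute_next_level(
        level_below: &[(Vec<u8>, RoaringBitmap)],
        group_size: usize,
    ) -> Vec<(Vec<u8>, RoaringBitmap)> {
        level_below
            .chunks(group_size)
            .map(|group| {
                let left_bound = group[0].0.clone();
                let mut union = RoaringBitmap::new();
                for (_key, bitmap) in group {
                    union |= bitmap;
                }
                (left_bound, union)
            })
            .collect()
    }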
+ #[allow(clippy::type_complexity)] fn compute_higher_levels<'t>( &self, rtxn: &'t RoTxn, field_id: u16, level: u8, - #[allow(clippy::type_complexity)] handle_group: &mut dyn FnMut( + handle_group: &mut dyn FnMut( &[RoaringBitmap], &'t [u8], ) -> Result<()>, diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 04d702987..c6735224d 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -236,7 +236,7 @@ impl FacetsUpdateIncrementalInner { let max_group_size = self.max_group_size; - let result = self.insert_in_level(txn, field_id, level - 1, &(*facet_value), docids)?; + let result = self.insert_in_level(txn, field_id, level - 1, facet_value, docids)?; // level below inserted an element let (insertion_key, insertion_value) = @@ -494,7 +494,7 @@ impl FacetsUpdateIncrementalInner { let (deletion_key, mut bitmap) = self.find_insertion_key_value(field_id, level, facet_value, txn)?; - let result = self.delete_in_level(txn, field_id, level - 1, &(*facet_value), docids)?; + let result = self.delete_in_level(txn, field_id, level - 1, facet_value, docids)?; let mut decrease_size = false; let next_key = match result { From d55f0e2e5335859ad77b77da4c78829482c533d9 Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 28 Oct 2022 23:42:23 +0900 Subject: [PATCH 1774/1889] Execute cargo fmt --- milli/src/update/facet/bulk.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 01a59c1f3..317a7af9b 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -197,10 +197,7 @@ impl FacetsUpdateBulkInner { &self, rtxn: &'t RoTxn, field_id: u16, - handle_group: &mut dyn FnMut( - &[RoaringBitmap], - &'t [u8], - ) -> Result<()>, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result<()> { // we read the elements one by one and // 1. 
keep track of the left bound @@ -255,10 +252,7 @@ impl FacetsUpdateBulkInner { rtxn: &'t RoTxn, field_id: u16, level: u8, - handle_group: &mut dyn FnMut( - &[RoaringBitmap], - &'t [u8], - ) -> Result<()>, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result>> { if level == 0 { self.read_level_0(rtxn, field_id, handle_group)?; From 2dec6e86e9e17433cc06d656fa6835c4fbf4f9d3 Mon Sep 17 00:00:00 2001 From: Shashank Kashyap <50551759+SKVKPandey@users.noreply.github.com> Date: Sun, 30 Oct 2022 08:58:36 +0530 Subject: [PATCH 1775/1889] Delete facet_string_level_zero_value_codec.rs --- milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs deleted file mode 100644 index e69de29bb..000000000 From a07f0a4a435c793f5af2794034464e8245e017d2 Mon Sep 17 00:00:00 2001 From: Shashank Kashyap <50551759+SKVKPandey@users.noreply.github.com> Date: Sun, 30 Oct 2022 08:59:04 +0530 Subject: [PATCH 1776/1889] Delete facet_string_zero_bounds_value_codec.rs --- .../src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs deleted file mode 100644 index e69de29bb..000000000 From 3b35ebda50ff6b34979dbaa76ce37c52929db0e5 Mon Sep 17 00:00:00 2001 From: Pranav Yadav Date: Mon, 31 Oct 2022 15:01:14 +0000 Subject: [PATCH 1777/1889] chore: added `IN`,`NOT IN` to `invalid_filter` msg --- filter-parser/src/error.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index d5d36bd8e..e28685c7a 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -141,10 +141,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? 
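For context on the `IN`/`NOT IN` wording added above, a hedged sketch of filter strings the parser accepts, using the public `FilterCondition::parse` entry point seen in the crate's tests; the field names and values are made up:

    use filter_parser::FilterCondition;

    fn main() {
        for filter in ["color IN [red, blue]", "color NOT IN [green]", "subscribers 100 TO 1000"] {
            match FilterCondition::parse(filter) {
                Ok(Some(condition)) => println!("parsed: {condition}"),
                Ok(None) => println!("empty filter"),
                Err(error) => println!("rejected: {error}"),
            }
        }
    }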
From 3950ec8d3c8cdd501fb5c9e47abf78271d3613c4 Mon Sep 17 00:00:00 2001 From: Pranav Yadav Date: Mon, 31 Oct 2022 15:41:49 +0000 Subject: [PATCH 1778/1889] chore: update tests for `invalid_filter` msg --- filter-parser/src/lib.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 33025e6e9..3af270557 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -519,7 +519,7 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("'OR'"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `\'OR\'`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `\'OR\'`. 1:5 'OR' "###); @@ -529,12 +529,12 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("channel Ponce"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`. 1:14 channel Ponce "###); insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing. 19:19 channel = Ponce OR "###); @@ -584,12 +584,12 @@ pub mod tests { "###); insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`. 1:17 colour NOT EXIST "###); insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`. + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`. 
1:23 subscribers 100 TO1000 "###); From 0d43ddbd85d654c30f6abda3ecdadc1ffac13a5e Mon Sep 17 00:00:00 2001 From: unvalley <38400669+unvalley@users.noreply.github.com> Date: Tue, 1 Nov 2022 01:32:54 +0900 Subject: [PATCH 1779/1889] Update filter-parser/src/lib.rs Co-authored-by: Tamo --- filter-parser/src/lib.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index c595cf827..4a247356c 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -343,9 +343,7 @@ fn parse_error_reserved_keyword(input: Span) -> IResult { } } -/** -primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to -*/ +/// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to fn parse_primary(input: Span, depth: usize) -> IResult { if depth > MAX_FILTER_DEPTH { return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); From 5ff066c3e7f2d6864e4f60f3120e2e4dcd9b31d1 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Mon, 31 Oct 2022 18:38:48 +0100 Subject: [PATCH 1780/1889] run the tests in all workspaces --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9939d3f24..ce51255bf 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -46,7 +46,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: test - args: --release + args: --release --all fmt: name: Run Rustfmt From 4492605a789f1fabfc5e9a3002b2e286e7bfe56a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 1 Nov 2022 10:19:45 +0000 Subject: [PATCH 1781/1889] Bump Swatinem/rust-cache from 2.0.0 to 2.0.1 Bumps [Swatinem/rust-cache](https://github.com/Swatinem/rust-cache) from 2.0.0 to 2.0.1. - [Release notes](https://github.com/Swatinem/rust-cache/releases) - [Changelog](https://github.com/Swatinem/rust-cache/blob/master/CHANGELOG.md) - [Commits](https://github.com/Swatinem/rust-cache/compare/v2.0.0...v2.0.1) --- updated-dependencies: - dependency-name: Swatinem/rust-cache dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index ce51255bf..f6449c1e0 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -36,7 +36,7 @@ jobs: toolchain: stable override: true - name: Cache dependencies - uses: Swatinem/rust-cache@v2.0.0 + uses: Swatinem/rust-cache@v2.0.1 - name: Run cargo check uses: actions-rs/cargo@v1 with: @@ -60,7 +60,7 @@ jobs: override: true components: rustfmt - name: Cache dependencies - uses: Swatinem/rust-cache@v2.0.0 + uses: Swatinem/rust-cache@v2.0.1 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. 
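One more recurring lint worth spelling out before the remaining patches: most of the mechanical `&txn` to `txn` and `&id` to `id` changes in this series fix clippy's `needless_borrow`, reproduced here in a minimal standalone form:

    fn byte_len(text: &str) -> usize {
        text.len()
    }

    fn caller(text: &str) {
        let _ = byte_len(&text); // needless_borrow: &&str auto-derefs to &str
        let _ = byte_len(text); // the form the patches rewrite to
    }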
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate From 13175f2339789a21985c931bd1f848781f2c0c28 Mon Sep 17 00:00:00 2001 From: unvalley Date: Thu, 3 Nov 2022 17:34:33 +0900 Subject: [PATCH 1782/1889] refactor: match for filterCondition --- milli/src/search/facet/filter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 40986fea0..5da1ba7fd 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -101,9 +101,9 @@ impl<'a> Filter<'a> { } match ors.len() { + 0 => (), 1 => ands.push(ors.pop().unwrap()), - n if n > 1 => ands.push(FilterCondition::Or(ors)), - _ => (), + _ => ands.push(FilterCondition::Or(ors)), } } Either::Right(rule) => { From 48eafc546ff68f81fd57ac77b27c7d3742c36e3d Mon Sep 17 00:00:00 2001 From: azzamsa Date: Fri, 4 Nov 2022 00:03:53 +0700 Subject: [PATCH 1783/1889] ci: Use pre-compiled binaries for faster CI --- .github/workflows/manual_benchmarks.yml | 4 +++- .github/workflows/push_benchmarks_indexing.yml | 4 +++- .github/workflows/push_benchmarks_search_geo.yml | 4 +++- .github/workflows/push_benchmarks_search_songs.yml | 4 +++- .github/workflows/push_benchmarks_search_wiki.yml | 4 +++- .github/workflows/update-cargo-toml-version.yml | 6 ++++-- 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml index d85a6c07b..e7cbfe68b 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/manual_benchmarks.yml @@ -50,7 +50,9 @@ jobs: # Generate critcmp files - name: Install critcmp - run: cargo install critcmp + uses: taiki-e/install-action@v1 + with: + tool: critcmp - name: Export cripcmp file run: | critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml index c53de93da..1011f2461 100644 --- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -48,7 +48,9 @@ jobs: # Generate critcmp files - name: Install critcmp - run: cargo install critcmp + uses: taiki-e/install-action@v1 + with: + tool: critcmp - name: Export cripcmp file run: | critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml index 8a79ce14d..7aa98ca58 100644 --- a/.github/workflows/push_benchmarks_search_geo.yml +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -47,7 +47,9 @@ jobs: # Generate critcmp files - name: Install critcmp - run: cargo install critcmp + uses: taiki-e/install-action@v1 + with: + tool: critcmp - name: Export cripcmp file run: | critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml index 8e6f2de75..aa0fcacd4 100644 --- a/.github/workflows/push_benchmarks_search_songs.yml +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -47,7 +47,9 @@ jobs: # Generate critcmp files - name: Install critcmp - run: cargo install critcmp + uses: taiki-e/install-action@v1 + with: + tool: critcmp - name: Export cripcmp file run: | critcmp --export ${{ steps.file.outputs.basename }} > ${{ 
steps.file.outputs.basename }}.json diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml index 91718b1ce..2f266941d 100644 --- a/.github/workflows/push_benchmarks_search_wiki.yml +++ b/.github/workflows/push_benchmarks_search_wiki.yml @@ -47,7 +47,9 @@ jobs: # Generate critcmp files - name: Install critcmp - run: cargo install critcmp + uses: taiki-e/install-action@v1 + with: + tool: critcmp - name: Export cripcmp file run: | critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json diff --git a/.github/workflows/update-cargo-toml-version.yml b/.github/workflows/update-cargo-toml-version.yml index 45c611c29..0854e265b 100644 --- a/.github/workflows/update-cargo-toml-version.yml +++ b/.github/workflows/update-cargo-toml-version.yml @@ -23,8 +23,10 @@ jobs: profile: minimal toolchain: stable override: true - - name: Install sd - run: cargo install sd + - name: Install sd + uses: taiki-e/install-action@v1 + with: + tool: sd - name: Update all Cargo.toml files run: | raw_new_version=$(echo $NEW_VERSION | cut -d 'v' -f 2) From 401e956128f866d6ea21318cc73019c047364c14 Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 28 Oct 2022 22:14:12 +0900 Subject: [PATCH 1784/1889] Add clippy job Add clippy job to CI --- .github/workflows/rust.yml | 18 ++++++++++++++++++ bors.toml | 1 + 2 files changed, 19 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f6449c1e0..d35e78c70 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -48,6 +48,24 @@ jobs: command: test args: --release --all + clippy: + name: Run Clippy + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + components: clippy + - name: Cache dependencies + uses: Swatinem/rust-cache@v2.0.0 + - name: Run cargo clippy + uses: actions-rs/cargo@v1 + with: + command: clippy + fmt: name: Run Rustfmt runs-on: ubuntu-20.04 diff --git a/bors.toml b/bors.toml index 73324892f..8ba0eed94 100644 --- a/bors.toml +++ b/bors.toml @@ -2,6 +2,7 @@ status = [ 'Tests on ubuntu-20.04', 'Tests on macos-latest', 'Tests on windows-latest', + 'Run Clippy', 'Run Rustfmt', ] # 3 hours timeout From 3009981d31d7adc0d87d60f008155fe21f0a00fa Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 4 Nov 2022 08:52:54 +0900 Subject: [PATCH 1785/1889] Fix clippy errors Add clippy job Add clippy job to CI --- .github/workflows/rust.yml | 18 ++++++++++++++++++ milli/src/search/criteria/mod.rs | 12 ++---------- milli/src/search/criteria/proximity.rs | 2 +- milli/src/search/query_tree.rs | 2 +- .../extract/extract_docid_word_positions.rs | 2 +- .../extract/extract_facet_string_docids.rs | 2 +- .../extract_word_pair_proximity_docids.rs | 2 +- .../extract/extract_word_position_docids.rs | 2 +- milli/src/update/index_documents/transform.rs | 4 ++-- 9 files changed, 28 insertions(+), 18 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index d35e78c70..5be5a506b 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -66,6 +66,24 @@ jobs: with: command: clippy + clippy: + name: Run Clippy + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + components: clippy + - name: Cache dependencies + uses: Swatinem/rust-cache@v2.0.0 + - name: Run cargo clippy + uses: 
actions-rs/cargo@v1 + with: + command: clippy + fmt: name: Run Rustfmt runs-on: ubuntu-20.04 diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 8d0e3af05..d1c482b38 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -432,18 +432,10 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result { subgroup.push(positions.iter().map(|p| (p, 0, p)).collect()) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 5042f4762..3237bb9d3 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -591,7 +591,7 @@ fn create_matching_words( (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); let words = words .into_iter() - .filter_map(|w| w) + .flatten() .map(|w| MatchingWord::new(w, 0, false)) .collect(); matching_words.push((words, ids)); diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index f1d595039..8eae0caee 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -80,7 +80,7 @@ pub fn extract_docid_word_positions( .map_err(|_| SerializationError::InvalidNumberSerialization)?; let position = absolute_from_relative_position(field_id, position); docid_word_positions_sorter - .insert(&key_buffer, &position.to_ne_bytes())?; + .insert(&key_buffer, position.to_ne_bytes())?; } } } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 221356ba0..182538683 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -43,7 +43,7 @@ pub fn extract_facet_string_docids( let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); // document id is encoded in native-endian because of the CBO roaring bitmap codec - facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; + facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 0c7700a33..6707fc268 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -145,7 +145,7 @@ fn document_word_positions_into_sorter( key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); - word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; + word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } Ok(()) diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index d4a3eda2c..b7015d2ce 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -41,7 +41,7 @@ pub fn extract_word_position_docids( key_buffer.extend_from_slice(word_bytes); key_buffer.extend_from_slice(&position.to_be_bytes()); - 
word_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; + word_position_docids_sorter.insert(&key_buffer,document_id.to_ne_bytes())?; } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 7c9a912b3..59f18b22d 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -248,7 +248,7 @@ impl<'a, 'i> Transform<'a, 'i> { skip_insertion = true; } else { // we associate the base document with the new key, everything will get merged later. - self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; + self.original_sorter.insert(docid.to_be_bytes(), base_obkv)?; match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { Some(buffer) => { self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)? @@ -261,7 +261,7 @@ impl<'a, 'i> Transform<'a, 'i> { if !skip_insertion { self.new_documents_ids.insert(docid); // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; + self.original_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?; match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, From 70465aa5ce2a3640fdcc4f0dc8c9a025bd8e056b Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 4 Nov 2022 08:59:58 +0900 Subject: [PATCH 1786/1889] Execute cargo fmt --- milli/src/search/criteria/mod.rs | 17 +++++++++++------ milli/src/search/query_tree.rs | 7 ++----- .../extract/extract_word_position_docids.rs | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d1c482b38..76718c8ec 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -431,12 +431,17 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result bitmaps.push(m), diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 3237bb9d3..a9c1ac29f 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -589,11 +589,8 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = words - .into_iter() - .flatten() - .map(|w| MatchingWord::new(w, 0, false)) - .collect(); + let words = + words.into_iter().flatten().map(|w| MatchingWord::new(w, 0, false)).collect(); matching_words.push((words, ids)); } } diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index b7015d2ce..d95db4157 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -41,7 +41,7 @@ pub fn extract_word_position_docids( key_buffer.extend_from_slice(word_bytes); key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter.insert(&key_buffer,document_id.to_ne_bytes())?; + word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } } From b09676779de7bdee978d9586099bf12aa818a87e Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 4 Nov 2022 09:13:01 +0900 Subject: [PATCH 1787/1889] Use nightly for clippy and remove conflict mistake --- .github/workflows/rust.yml | 20 +------------------- 1 file changed, 1 
insertion(+), 19 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 5be5a506b..f7acfbebd 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -56,25 +56,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: stable - override: true - components: clippy - - name: Cache dependencies - uses: Swatinem/rust-cache@v2.0.0 - - name: Run cargo clippy - uses: actions-rs/cargo@v1 - with: - command: clippy - - clippy: - name: Run Clippy - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - uses: actions-rs/toolchain@v1 - with: - profile: minimal - toolchain: stable + toolchain: nightly override: true components: clippy - name: Cache dependencies From abf1cf9cd5b07be868e86f3dfec4be76d7dd707e Mon Sep 17 00:00:00 2001 From: unvalley Date: Fri, 4 Nov 2022 09:27:46 +0900 Subject: [PATCH 1788/1889] Fix clippy errors --- milli/src/search/matches/matching_words.rs | 2 +- milli/src/update/facet/incremental.rs | 12 +++++------- milli/src/update/words_prefixes_fst.rs | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 1f6ead8a9..25d447d0c 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -225,7 +225,7 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize { for (col, char_t) in target.chars().enumerate() { let col = col + 1; let last_match_row = *last_row.get(&char_t).unwrap_or(&0); - let cost = if char_s == char_t { 0 } else { 1 }; + let cost = usize::from(char_s != char_t); let dist_add = matrix[(row, col + 1)] + 1; let dist_del = matrix[(row + 1, col)] + 1; diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index c6735224d..fd253b146 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -329,7 +329,7 @@ impl FacetsUpdateIncrementalInner { let key = FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; - let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; + let value = FacetGroupValue { size: size_left, bitmap: values_left }; (key, value) }; @@ -345,7 +345,7 @@ impl FacetsUpdateIncrementalInner { } let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() }; - let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; + let value = FacetGroupValue { size: size_right, bitmap: values_right }; (key, value) }; drop(iter); @@ -373,8 +373,7 @@ impl FacetsUpdateIncrementalInner { let highest_level = get_highest_level(txn, self.db, field_id)?; - let result = - self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; + let result = self.insert_in_level(txn, field_id, highest_level, facet_value, docids)?; match result { InsertionResult::InPlace => return Ok(()), InsertionResult::Expand => return Ok(()), @@ -425,7 +424,7 @@ impl FacetsUpdateIncrementalInner { level: highest_level + 1, left_bound: first_key.unwrap().left_bound, }; - let value = FacetGroupValue { size: group_size as u8, bitmap: values }; + let value = FacetGroupValue { size: group_size, bitmap: values }; to_add.push((key.into_owned(), value)); } // now we add the rest of the level, in case its size is > group_size * min_level_size @@ -584,8 +583,7 @@ impl FacetsUpdateIncrementalInner { } let highest_level = get_highest_level(txn, self.db, field_id)?; - let result = - self.delete_in_level(txn, 
field_id, highest_level as u8, facet_value, docids)?; + let result = self.delete_in_level(txn, field_id, highest_level, facet_value, docids)?; match result { DeletionResult::InPlace => return Ok(()), DeletionResult::Reduce { .. } => return Ok(()), diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs index 193956c7a..57fed0922 100644 --- a/milli/src/update/words_prefixes_fst.rs +++ b/milli/src/update/words_prefixes_fst.rs @@ -36,7 +36,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped /// to these bounds, otherwise. pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] + self.max_prefix_length = value.clamp(1, 25); self } From 332856078823690cb14c3dfb1071321967cb6d63 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 8 Nov 2022 10:17:16 +0100 Subject: [PATCH 1789/1889] fix: allow filters on = inf, = NaN, return InvalidFilter for < inf, < NaN Fixes meilisearch/meilisearch#3000 --- filter-parser/src/error.rs | 4 ++++ filter-parser/src/lib.rs | 14 +++++++------- milli/src/search/facet/filter.rs | 25 +++++++++++++++++-------- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index e28685c7a..8a628156a 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -65,6 +65,7 @@ pub enum ErrorKind<'a> { MalformedValue, InOpeningBracket, InClosingBracket, + NonFiniteFloat, InExpectedValue(ExpectedValueKind), ReservedKeyword(String), MissingClosingDelimiter(char), @@ -167,6 +168,9 @@ impl<'a> Display for Error<'a> { ErrorKind::InClosingBracket => { writeln!(f, "Expected matching `]` after the list of field names given to `IN[`")? } + ErrorKind::NonFiniteFloat => { + writeln!(f, "Non finite floats are not supported")? + } ErrorKind::InExpectedValue(ExpectedValueKind::ReservedKeyword) => { writeln!(f, "Expected only comma-separated field names inside `IN[..]` but instead found `{escaped_input}`, which is a keyword. To use `{escaped_input}` as a field name or a value, surround it by quotes.")? } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 8c1431d93..a9bd9b3d7 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -44,7 +44,6 @@ mod error; mod value; use std::fmt::Debug; -use std::str::FromStr; pub use condition::{parse_condition, parse_to, Condition}; use condition::{parse_exists, parse_not_exists}; @@ -100,12 +99,13 @@ impl<'a> Token<'a> { Error::new_from_external(self.span, error) } - pub fn parse(&self) -> Result - where - T: FromStr, - T::Err: std::error::Error, - { - self.span.parse().map_err(|e| self.as_external_error(e)) + pub fn parse_finite_float(&self) -> Result { + let value: f64 = self.span.parse().map_err(|e| self.as_external_error(e))?; + if value.is_finite() { + Ok(value) + } else { + Err(Error::new_from_kind(self.span, ErrorKind::NonFiniteFloat)) + } } } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 5da1ba7fd..ef293ee41 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -169,11 +169,19 @@ impl<'a> Filter<'a> { // field id and the level. 
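// Each comparison operator below is lowered to a range over the f64 facet
// values: for instance `> n` becomes (Excluded(n), Included(f64::MAX)) and
// `<= n` becomes (Included(f64::MIN), Included(n)). Going through
// `parse_finite_float` instead of a plain `parse` is what rejects `< inf` or
// `< NaN` with an invalid-filter error, while `= inf` keeps working: the
// equality path further down calls `.ok()` on the parse result and still
// matches the raw value against the facet strings database.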
let (left, right) = match operator { - Condition::GreaterThan(val) => (Excluded(val.parse()?), Included(f64::MAX)), - Condition::GreaterThanOrEqual(val) => (Included(val.parse()?), Included(f64::MAX)), - Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), - Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), - Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), + Condition::GreaterThan(val) => { + (Excluded(val.parse_finite_float()?), Included(f64::MAX)) + } + Condition::GreaterThanOrEqual(val) => { + (Included(val.parse_finite_float()?), Included(f64::MAX)) + } + Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse_finite_float()?)), + Condition::LowerThanOrEqual(val) => { + (Included(f64::MIN), Included(val.parse_finite_float()?)) + } + Condition::Between { from, to } => { + (Included(from.parse_finite_float()?), Included(to.parse_finite_float()?)) + } Condition::Exists => { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); @@ -190,7 +198,7 @@ impl<'a> Filter<'a> { )? .map(|v| v.bitmap) .unwrap_or_default(); - let number = val.parse::().ok(); + let number = val.parse_finite_float().ok(); let number_docids = match number { Some(n) => { let n = Included(n); @@ -389,7 +397,8 @@ impl<'a> Filter<'a> { } FilterCondition::GeoLowerThan { point, radius } => { if filterable_fields.contains("_geo") { - let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?]; + let base_point: [f64; 2] = + [point[0].parse_finite_float()?, point[1].parse_finite_float()?]; if !(-90.0..=90.0).contains(&base_point[0]) { return Err( point[0].as_external_error(FilterError::BadGeoLat(base_point[0])) @@ -400,7 +409,7 @@ impl<'a> Filter<'a> { point[1].as_external_error(FilterError::BadGeoLng(base_point[1])) )?; } - let radius = radius.parse()?; + let radius = radius.parse_finite_float()?; let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, None => return Ok(RoaringBitmap::new()), From a836b8e703f80899490e1496e8685b125cb0a756 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 8 Nov 2022 10:21:54 +0100 Subject: [PATCH 1790/1889] tests: Tests filter with non-finite floats --- milli/src/search/facet/filter.rs | 56 ++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index ef293ee41..9b87353b0 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -698,4 +698,60 @@ mod tests { let option = Filter::from_str(" ").unwrap(); assert_eq!(option, None); } + + #[test] + fn non_finite_float() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("price")]); // to keep the fields order + settings.set_filterable_fields(hashset! 
{ S("price") }); + }) + .unwrap(); + index + .add_documents(documents!([ + { + "id": "test_1", + "price": "inf" + }, + { + "id": "test_2", + "price": "2000" + }, + { + "id": "test_3", + "price": "infinity" + }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let filter = Filter::from_str("price = inf").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert!(result.contains(0)); + let filter = Filter::from_str("price < inf").unwrap().unwrap(); + assert!(matches!( + filter.evaluate(&rtxn, &index), + Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_))) + )); + + let filter = Filter::from_str("price = NaN").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert!(result.is_empty()); + let filter = Filter::from_str("price < NaN").unwrap().unwrap(); + assert!(matches!( + filter.evaluate(&rtxn, &index), + Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_))) + )); + + let filter = Filter::from_str("price = infinity").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert!(result.contains(2)); + let filter = Filter::from_str("price < infinity").unwrap().unwrap(); + assert!(matches!( + filter.evaluate(&rtxn, &index), + Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_))) + )); + } } From 1b1ad1923b8ef6e4d07173ef6f5b98d7ffc0b417 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 8 Nov 2022 14:23:14 +0100 Subject: [PATCH 1791/1889] Add a test to check that we take care of soft deleted documents --- milli/src/update/settings.rs | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index f82a57cbc..6da32d73f 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -719,6 +719,7 @@ mod tests { use super::*; use crate::error::Error; use crate::index::tests::TempIndex; + use crate::update::DeleteDocuments; use crate::{Criterion, Filter, SearchResult}; #[test] @@ -1494,4 +1495,34 @@ mod tests { }) .unwrap(); } + + #[test] + fn settings_must_ignore_soft_deleted() { + use serde_json::json; + + let index = TempIndex::new(); + + let mut docs = vec![]; + for i in 0..10 { + docs.push(json!({ "id": i, "title": format!("{:x}", i) })); + } + index.add_documents(documents! 
{ docs }).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + (0..5).for_each(|id| drop(builder.delete_external_id(&id.to_string()))); + builder.execute().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_searchable_fields(vec!["id".to_string()]); + }) + .unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.write_txn().unwrap(); + let docs: StdResult, _> = index.all_documents(&rtxn).unwrap().collect(); + let docs = docs.unwrap(); + assert_eq!(docs.len(), 5); + } } From 37b3c5c323ccb8b3a192c03a9fc59c797b195330 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 8 Nov 2022 14:01:28 +0100 Subject: [PATCH 1792/1889] Fix transform to use all_documents and ignore soft_deleted documents --- milli/src/update/index_documents/transform.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 59f18b22d..57aa02e04 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -578,9 +578,8 @@ impl<'a, 'i> Transform<'a, 'i> { ); let mut obkv_buffer = Vec::new(); - for result in self.index.documents.iter(wtxn)? { + for result in self.index.all_documents(wtxn)? { let (docid, obkv) = result?; - let docid = docid.get(); obkv_buffer.clear(); let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); From bd12989610a5cac8d032309b265bffd3e0ab4860 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 8 Nov 2022 14:31:39 +0000 Subject: [PATCH 1793/1889] Update version for the next release (v0.35.1) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index b5fee6640..de4d6dad8 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.35.0" +version = "0.35.1" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 30fab7851..0aa56d3cb 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.35.0" +version = "0.35.1" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index b22fdaad5..b7fd00d08 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.35.0" +version = "0.35.1" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index aa0787eed..aa7b8a5aa 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.35.0" +version = "0.35.1" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index db6132fe8..bcd23e988 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.35.0" +version = "0.35.1" edition = "2021" description = "A library that 
indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index c7c780dd4..60fc78390 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.35.0" +version = "0.35.1" authors = ["Kerollmops "] edition = "2018" From 8ce8bbcdfc99177818e28e6cc6c7db25791644a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar=20-=20curqui?= Date: Tue, 8 Nov 2022 15:49:45 +0100 Subject: [PATCH 1794/1889] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 131b7ad3b..b0fe1c913 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -111,7 +111,7 @@ This project integrates a tool to create automated changelogs: the [release-draf ### How to Publish the Release -Make a PR modifying all the `Cargo.toml` files with the right version. +Make a PR modifying all the `Cargo.toml` files with the right version by using our automation -> Go to [this GitHub Action](https://github.com/meilisearch/milli/actions/workflows/update-cargo-toml-version.yml), click on `Run workflow`, and fill the appropriate version before validating. A PR updating all the versions in the `Cargo.toml` files will be created. Once the changes are merged on `main`, you can publish the current draft release via the [GitHub interface](https://github.com/meilisearch/milli/releases): on this page, click on `Edit` (related to the draft release) > update the description if needed > when you are ready, click on `Publish release`. From c7711daca349c6ebcca616f790f0217313438651 Mon Sep 17 00:00:00 2001 From: Irevoire Date: Tue, 8 Nov 2022 16:28:01 +0100 Subject: [PATCH 1795/1889] use the lmdb-master.3 branch --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index c7c780dd4..9adc5fbe2 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,7 +18,7 @@ fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] } -heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/meilisearch/heed", branch = "lmdb-master-3", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memmap2 = "0.5.7" From d00d2aab3f693617e8e53b265442f721075989ff Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 9 Nov 2022 11:03:09 +0000 Subject: [PATCH 1796/1889] Update version for the next release (v0.36.0) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index de4d6dad8..a49c08d39 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.35.1" +version = "0.36.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 0aa56d3cb..8c9cc6c5e 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.35.1" +version = "0.36.0" edition = "2018" description = "A CLI to interact with a milli index" publish = 
false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index b7fd00d08..50961e01f 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.35.1" +version = "0.36.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index aa7b8a5aa..3fd52a9fe 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.35.1" +version = "0.36.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index bcd23e988..3e7c2fae9 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.35.1" +version = "0.36.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index e0d7cbdd7..72e31f6c5 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.35.1" +version = "0.36.0" authors = ["Kerollmops "] edition = "2018" From 6dc6a5d874ef9248e6e0308865a3a303cbfd6116 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Nov 2022 17:17:51 +0100 Subject: [PATCH 1797/1889] Force using vendored version of LMDB - don't use lmdb master3 branch anymore --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 72e31f6c5..6867edb79 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -18,7 +18,7 @@ fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.5.1" grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] } -heed = { git = "https://github.com/meilisearch/heed", branch = "lmdb-master-3", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.4", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memmap2 = "0.5.7" From 87576cf26c2baa5a0a185cb54a319fda863d362c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 15 Nov 2022 10:25:02 +0100 Subject: [PATCH 1798/1889] Perform cargo check on the release artifacts --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f7acfbebd..b0d15a6e0 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -41,7 +41,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: check - args: --workspace --all-targets + args: --workspace --all-targets --release - name: Run cargo test uses: actions-rs/cargo@v1 with: From 92cc3550d84e22d6ef6f95d3f5fd3e16642a60e9 Mon Sep 17 00:00:00 2001 From: meili-bot <74670311+meili-bot@users.noreply.github.com> Date: Tue, 15 Nov 2022 16:16:40 +0100 Subject: [PATCH 1799/1889] Update CONTRIBUTING.md --- CONTRIBUTING.md | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b0fe1c913..83bfc5a5f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,25 +5,12 @@ First, thank you for contributing to Meilisearch! 
The goal of this document is t Remember that there are many ways to contribute other than writing code: writing [tutorials or blog posts](https://github.com/meilisearch/awesome-meilisearch), improving [the documentation](https://github.com/meilisearch/documentation), submitting [bug reports](https://github.com/meilisearch/milli/issues/new) and [feature requests](https://github.com/meilisearch/product/discussions/categories/feedback-feature-proposal)... ## Table of Contents -- [Hacktoberfest](#hacktoberfest-2022) - [Assumptions](#assumptions) - [How to Contribute](#how-to-contribute) - [Development Workflow](#development-workflow) - [Git Guidelines](#git-guidelines) - [Release Process (for internal team only)](#release-process-for-internal-team-only) -## Hacktoberfest 2022 - -It's [Hacktoberfest month](https://hacktoberfest.com)! 🥳 - -Thanks so much for participating with Meilisearch this year! - -1. We will follow the quality standards set by the organizers of Hacktoberfest (see detail on their [website](https://hacktoberfest.com/participation/#spam)). Our reviewers will not consider any PR that doesn’t match that standard. -2. PRs reviews will take place from Monday to Thursday, during usual working hours, CEST time. If you submit outside of these hours, there’s no need to panic; we will get around to your contribution. -3. There will be no issue assignment as we don’t want people to ask to be assigned specific issues and never return, discouraging the volunteer contributors from opening a PR to fix this issue. We take the liberty to choose the PR that best fixes the issue, so we encourage you to get to it as soon as possible and do your best! - -You can check out the longer, more complete guideline documentation [here](https://github.com/meilisearch/.github/blob/main/Hacktoberfest_2022_contributors_guidelines.md). - ## Assumptions 1. **You're familiar with [GitHub](https://github.com) and the [Pull Requests](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests)(PR) workflow.** From f7c8730d0984f3e19f6ed8e915a1abb9f453025e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 14 Nov 2022 15:19:00 +0100 Subject: [PATCH 1800/1889] Fix bug in prefix DB indexing Where the batch's information was not properly updated in cases where only the proximity changed between two consecutive word pair proximities. 
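Concretely, this is the case exercised by the regression test added in this patch: with the documents `x y` and `x a y`, the pair (x, y) is seen with proximity 1 and then with proximity 2. Since neither word1 nor the first letter of word2 changes between the two, the old code flushed the batch without refreshing its proximity, so the proximity-2 pair was recorded under the stale proximity value.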
Closes https://github.com/meilisearch/meilisearch/issues/3043 --- milli/src/update/prefix_word_pairs/mod.rs | 47 +++++++++++++++++++ .../update/prefix_word_pairs/word_prefix.rs | 9 ++-- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 03abdbb6e..6030a82f2 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -238,4 +238,51 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "update"); db_snap!(index, prefix_word_pair_proximity_docids, "update"); } + #[test] + fn test_batch_bug_3034() { + // https://github.com/meilisearch/meilisearch/issues/3043 + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["y"]); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "text": "x y" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "text": "x a y" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, word_prefix_pair_proximity_docids); + db_snap!(index, prefix_word_pair_proximity_docids); + } } diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 71a2a2915..db607e56c 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -44,7 +44,7 @@ word2 : doggo 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key `prefix` and the value (`docids`) to a sorted map which we call the “batch”. For example, -at the end of the first inner loop, we may have: +at the end of the first outer loop, we may have: ```text Outer loop 1: ------------------------------ @@ -85,7 +85,7 @@ end of the batch. 4. 
On the third iteration of the outer loop, we have: ```text -Outer loop 4: +Outer loop 3: ------------------------------ proximity: 1 word1 : good @@ -340,17 +340,16 @@ fn execute_on_word_pairs_and_prefixes( if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev { batch.flush(&mut merge_buffer, &mut insert)?; + batch.proximity = proximity; // don't forget to reset the value of batch.word1 and prev_word2_start if word1_different_than_prev { - prefix_search_start.0 = 0; batch.word1.clear(); batch.word1.extend_from_slice(word1); - batch.proximity = proximity; } if word2_start_different_than_prev { - // word2_start_different_than_prev == true prev_word2_start = word2[0]; } + prefix_search_start.0 = 0; // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); } From f00108d2ec665681db49315c98f1b1c66c47e91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 16 Nov 2022 12:12:49 +0100 Subject: [PATCH 1801/1889] Fix name of bug in reproduction test --- milli/src/update/prefix_word_pairs/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 6030a82f2..10ea850af 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -239,7 +239,7 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids, "update"); } #[test] - fn test_batch_bug_3034() { + fn test_batch_bug_3043() { // https://github.com/meilisearch/meilisearch/issues/3043 let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); From d95d02cb8a54937b71a8d80cf76a2257b7017041 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 14 Nov 2022 14:16:14 +0100 Subject: [PATCH 1802/1889] Fix Facet Indexing bugs 1. Handle keys with variable length correctly This fixes https://github.com/meilisearch/meilisearch/issues/3042 and is easily reproducible with the updated fuzz tests, which now generate keys with variable lengths. 2. Prevent adding facets to the database if their encoded value does not satisfy `valid_lmdb_key`. This fixes an indexing failure when a document had a filterable attribute containing a value whose length is higher than ~500 bytes. --- milli/src/update/facet/bulk.rs | 8 +- milli/src/update/facet/incremental.rs | 302 ++++-------------- .../extract/extract_facet_number_docids.rs | 1 - .../extract/extract_facet_string_docids.rs | 9 +- 4 files changed, 66 insertions(+), 254 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 317a7af9b..30660d5af 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -12,7 +12,7 @@ use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::heed_codec::ByteSliceRefCodec; -use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases @@ -142,6 +142,9 @@ impl FacetsUpdateBulkInner { let mut database = self.db.iter_mut(wtxn)?.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ + if !valid_lmdb_key(key) { + continue; + } buffer.clear(); // the group size for level 0 buffer.push(1); @@ -155,6 +158,9 @@ impl FacetsUpdateBulkInner { let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + if !valid_lmdb_key(key) { + continue; + } // the value is a CboRoaringBitmap, but I still need to prepend the // group size for level 0 (= 1) to it buffer.clear(); diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index fd253b146..b07b675c5 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -11,6 +11,7 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::ByteSliceRefCodec; use crate::search::facet::get_highest_level; +use crate::update::index_documents::valid_lmdb_key; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; enum InsertionResult { @@ -70,6 +71,9 @@ impl<'i> FacetsUpdateIncremental<'i> { let mut cursor = self.new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { + if !valid_lmdb_key(key) { + continue; + } let key = FacetGroupKeyCodec::::bytes_decode(key) .ok_or(heed::Error::Encoding)?; let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; @@ -114,52 +118,37 @@ impl FacetsUpdateIncrementalInner { txn: &RoTxn, ) -> Result<(FacetGroupKey>, FacetGroupValue)> { assert!(level > 0); + match self.db.get_lower_than_or_equal_to( + txn, + &FacetGroupKey { field_id, level, left_bound: facet_value }, + )? { + Some((key, value)) => { + if key.level != level { + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); - let mut prefix = vec![]; - prefix.extend_from_slice(&field_id.to_be_bytes()); - prefix.push(level); - prefix.extend_from_slice(facet_value); - - let mut prefix_iter = self - .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?; - if let Some(e) = prefix_iter.next() { - let (key_bytes, value) = e?; - Ok(( - FacetGroupKeyCodec::::bytes_decode(key_bytes) - .ok_or(Error::Encoding)? - .into_owned(), - value, - )) - } else { - let key = FacetGroupKey { field_id, level, left_bound: facet_value }; - match self.db.get_lower_than(txn, &key)? { - Some((key, value)) => { - if key.level != level { - let mut prefix = vec![]; - prefix.extend_from_slice(&field_id.to_be_bytes()); - prefix.push(level); - - let mut iter = self - .db - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( - txn, - prefix.as_slice(), - )?; - let (key_bytes, value) = iter.next().unwrap()?; - Ok(( - FacetGroupKeyCodec::::bytes_decode(key_bytes) - .ok_or(Error::Encoding)? - .into_owned(), - value, - )) - } else { - Ok((key.into_owned(), value)) - } + let mut iter = + self.db.as_polymorph().prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + &prefix.as_slice(), + )?; + let (key_bytes, value) = iter.next().unwrap()?; + Ok(( + FacetGroupKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)? + .into_owned(), + value, + )) + } else { + Ok((key.into_owned(), value)) } - None => panic!(), + } + None => { + // We checked that the level is > 0 + // Since all keys of level 1 are greater than those of level 0, + // we are guaranteed that db.get_lower_than_or_equal_to(key) exists + panic!() } } } @@ -1050,9 +1039,7 @@ ensures that: 2. 
its content is the same as a trivially correct implementation of the same database */ mod fuzz { - use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; - use std::convert::TryFrom; use std::iter::FromIterator; use std::rc::Rc; @@ -1060,49 +1047,30 @@ mod fuzz { use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator}; use fuzzcheck::mutators::vector::VecMutator; use fuzzcheck::DefaultMutator; - use heed::BytesEncode; use roaring::RoaringBitmap; use tempfile::TempDir; use super::*; use crate::update::facet::tests::FacetIndex; - - struct NEU16Codec; - impl<'a> BytesEncode<'a> for NEU16Codec { - type EItem = u16; - #[no_coverage] - fn bytes_encode(item: &'a Self::EItem) -> Option> { - Some(Cow::Owned(item.to_be_bytes().to_vec())) - } - } - impl<'a> BytesDecode<'a> for NEU16Codec { - type DItem = u16; - #[no_coverage] - fn bytes_decode(bytes: &'a [u8]) -> Option { - let bytes = <[u8; 2]>::try_from(&bytes[0..=1]).unwrap(); - Some(u16::from_be_bytes(bytes)) - } - } - #[derive(Default)] pub struct TrivialDatabase { pub elements: BTreeMap>, } impl TrivialDatabase where - T: Ord + Clone + Copy + Eq + std::fmt::Debug, + T: Ord + Clone + Eq + std::fmt::Debug, { #[no_coverage] - pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) { + pub fn insert(&mut self, field_id: u16, new_key: &T, new_values: &RoaringBitmap) { if new_values.is_empty() { return; } let values_field_id = self.elements.entry(field_id).or_default(); - let values = values_field_id.entry(new_key).or_default(); + let values = values_field_id.entry(new_key.clone()).or_default(); *values |= new_values; } #[no_coverage] - pub fn delete(&mut self, field_id: u16, key: T, values_to_remove: &RoaringBitmap) { + pub fn delete(&mut self, field_id: u16, key: &T, values_to_remove: &RoaringBitmap) { if let Some(values_field_id) = self.elements.get_mut(&field_id) { if let Some(values) = values_field_id.get_mut(&key) { *values -= values_to_remove; @@ -1117,8 +1085,9 @@ mod fuzz { } } #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] - struct Operation { - key: Key, + struct Operation { + #[field_mutator(VecMutator = { VecMutator::new(u8::default_mutator(), 0 ..= 5) })] + key: Vec, #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] group_size: u8, #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] @@ -1142,13 +1111,12 @@ mod fuzz { } #[no_coverage] - fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { - let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten - // let mut txn = index.env.write_txn().unwrap(); + fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { + let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten let mut txn = index.env.write_txn().unwrap(); - let mut trivial_db = TrivialDatabase::::default(); - let mut value_to_keys = HashMap::>::new(); + let mut trivial_db = TrivialDatabase::>::default(); + let mut value_to_keys = HashMap::>>::new(); for Operation { key, group_size, max_group_size, min_level_size, field_id, kind } in operations { @@ -1160,10 +1128,10 @@ mod fuzz { let mut bitmap = RoaringBitmap::new(); for value in values { bitmap.insert(*value as u32); - value_to_keys.entry(*value).or_default().push(*key); + value_to_keys.entry(*value).or_default().push(key.clone()); } - index.insert(&mut txn, *field_id, key, &bitmap); - 
trivial_db.insert(*field_id, *key, &bitmap); + index.insert(&mut txn, *field_id, &key.as_slice(), &bitmap); + trivial_db.insert(*field_id, &key, &bitmap); } OperationKind::Delete(values) => { let values = RoaringBitmap::from_iter(values.iter().copied().map(|x| x as u32)); @@ -1179,8 +1147,8 @@ mod fuzz { } } for (key, values) in values_per_key { - index.delete(&mut txn, *field_id, &key, &values); - trivial_db.delete(*field_id, *key, &values); + index.delete(&mut txn, *field_id, &key.as_slice(), &values); + trivial_db.delete(*field_id, &key, &values); } } } @@ -1198,7 +1166,8 @@ mod fuzz { for ((key, values), group) in values_field_id.iter().zip(level0iter) { let (group_key, group_values) = group.unwrap(); - let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + let group_key = + FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); assert_eq!(key, &group_key.left_bound); assert_eq!(values, &group_values.bitmap); } @@ -1213,7 +1182,8 @@ mod fuzz { for ((key, values), group) in values_field_id.iter().zip(level0iter) { let (group_key, group_values) = group.unwrap(); - let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + let group_key = + FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); assert_eq!(key, &group_key.left_bound); assert_eq!(values, &group_values.bitmap); } @@ -1227,7 +1197,7 @@ mod fuzz { fn fuzz() { let tempdir = Rc::new(TempDir::new().unwrap()); let tempdir_cloned = tempdir.clone(); - let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { + let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { compare_with_trivial_database(tempdir_cloned.clone(), operations) }) .default_mutator() @@ -1243,168 +1213,4 @@ mod fuzz { .launch(); assert!(!result.found_test_failure); } - - #[test] - #[no_coverage] - fn reproduce_bug1() { - let operations = r#" - [ - {"key":0, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, - {"key":143, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[243]}}, - {"key":90, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[217]}}, - {"key":172, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[94]}}, - {"key":27, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[4]}}, - {"key":124, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, - {"key":123, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, - {"key":67, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, - {"key":13, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, - {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[213]}}, - {"key":235, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, - {"key":251, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[50]}}, - {"key":218, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[164]}}, - {"key":166, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, - {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[61]}}, - {"key":183, "field_id": 0, "group_size":4, 
"max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[50]}} - ] - "#; - let operations: Vec> = serde_json::from_str(operations).unwrap(); - let tempdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tempdir), &operations); - } - - #[test] - #[no_coverage] - fn reproduce_bug2() { - let operations = r#" - [ - {"key":102, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[122]}}, - {"key":73, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[132]}}, - {"key":20, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[215]}}, - {"key":39, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[152]}}, - {"key":151, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[226]}}, - {"key":17, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[101]}}, - {"key":74, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":2, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[130]}}, - {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[180]}}, - {"key":83, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[250]}}, - {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":113, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[63]}}, - {"key":201, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, - {"key":200, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, - {"key":93, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[98]}}, - {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, - {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[210]}} - ] - "#; - let operations: Vec> = serde_json::from_str(operations).unwrap(); - let tempdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tempdir), &operations); - } - #[test] - #[no_coverage] - fn reproduce_bug3() { - let operations = r#" - [ - {"key":27488, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, - {"key":64716, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[216]}}, - {"key":60886, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, - {"key":59509, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[187,231]}}, - {"key":55057, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, - {"key":45200, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, - {"key":55056, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, - {"key":63679, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, - {"key":52155, "field_id": 0, "group_size":0, "max_group_size":7, 
"min_level_size":0, "kind":{"Insert":[74]}}, - {"key":20648, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[47,138,157]}} - ] - "#; - let operations: Vec> = serde_json::from_str(operations).unwrap(); - let tempdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tempdir), &operations); - } - - #[test] - #[no_coverage] - fn reproduce_bug4() { - let operations = r#"[ - {"key":63499, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[87]}}, - {"key":25374, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[14]}}, - {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":[87]}}, - {"key":23038, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[173]}}, - {"key":14862, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[8]}}, - {"key":13145, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[5,64]}}, - {"key":23446, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[86,59]}}, - {"key":17972, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[58,137]}}, - {"key":21273, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[121,132,81,147]}}, - {"key":28264, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[36]}}, - {"key":46659, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[]}} - ] - "#; - let operations: Vec> = serde_json::from_str(operations).unwrap(); - let tempdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tempdir), &operations); - } - - #[test] - #[no_coverage] - fn reproduce_bug5() { - let input = r#" - [ - { - "key":3438, - "group_size":11, - "max_group_size":0, - "min_level_size":17, - "field_id":3, - "kind":{"Insert":[198]} - }, - - { - "key":47098, - "group_size":0, - "max_group_size":8, - "min_level_size":0, - "field_id":3, - "kind":{"Insert":[11]} - }, - { - "key":22453, - "group_size":0, - "max_group_size":0, - "min_level_size":0, - "field_id":3, - "kind":{"Insert":[145]} - }, - { - "key":14105, - "group_size":14, - "max_group_size":4, - "min_level_size":25, - "field_id":3, - "kind":{"Delete":[11]} - } - ] - "#; - let operations: Vec> = serde_json::from_str(input).unwrap(); - let tmpdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tmpdir), &operations); - } - - #[test] - #[no_coverage] - fn reproduce_bug6() { - let input = r#" - [ - {"key":45720,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[120]}}, - {"key":37463,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[187]}}, - {"key":21512,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, - {"key":21511,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, - {"key":37737,"group_size":12,"max_group_size":0,"min_level_size":6,"field_id":0,"kind":{"Insert":[181]}}, - {"key":53042,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}} - ] - "#; - let operations: Vec> = serde_json::from_str(input).unwrap(); - let tmpdir = TempDir::new().unwrap(); - compare_with_trivial_database(Rc::new(tmpdir), &operations); - } } diff --git 
a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 1d415166d..33dd5ce5b 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -38,7 +38,6 @@ pub fn extract_facet_number_docids( let key = FacetGroupKey { field_id, level: 0, left_bound: number }; let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap(); - facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 182538683..8b02a6008 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -6,7 +6,7 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::merge_cbo_roaring_bitmaps; +use crate::update::index_documents::{merge_cbo_roaring_bitmaps, valid_lmdb_key}; use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -41,9 +41,10 @@ pub fn extract_facet_string_docids( let normalised_value = std::str::from_utf8(normalized_value_bytes)?; let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); - - // document id is encoded in native-endian because of the CBO roaring bitmap codec - facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; + if valid_lmdb_key(&key_bytes) { + // document id is encoded in native-endian because of the CBO roaring bitmap codec + facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; + } } sorter_into_reader(facet_string_docids_sorter, indexer) From 990a8612413cd225c5088d86557e574fa1077089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 16 Nov 2022 15:19:55 +0100 Subject: [PATCH 1803/1889] Add test for indexing a document with a long facet value --- milli/src/update/index_documents/mod.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e9f5c2d38..af99a230b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1821,4 +1821,24 @@ mod tests { let words_fst = index.words_fst(&rtxn).unwrap(); assert!(!words_fst.contains(&long_word)); } + + #[test] + fn long_facet_values_must_not_crash() { + let index = TempIndex::new(); + + // this is obviously too long + let long_word = "lol".repeat(1000); + let doc1 = documents! {[{ + "id": "1", + "title": long_word, + }]}; + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!
{ S("title") }); + }) + .unwrap(); + + index.add_documents(doc1).unwrap(); + } } From ac3baafbe85914a4020eae06f43f6b8394eff5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 16 Nov 2022 14:03:27 +0100 Subject: [PATCH 1804/1889] Truncate facet values that are too long before indexing them --- .../extract/extract_facet_string_docids.rs | 21 +++++++++++++------ .../extract/extract_fid_docid_facet_values.rs | 11 ++++++++-- .../src/update/index_documents/helpers/mod.rs | 14 ++++++++++++- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 8b02a6008..3a0af3c96 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -6,7 +6,8 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::{merge_cbo_roaring_bitmaps, valid_lmdb_key}; +use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -38,13 +39,21 @@ pub fn extract_facet_string_docids( try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); - let normalised_value = std::str::from_utf8(normalized_value_bytes)?; + let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; + + let normalised_truncated_value: String; + if normalised_value.len() > MAX_FACET_VALUE_LENGTH { + normalised_truncated_value = normalised_value + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + normalised_value = normalised_truncated_value.as_str(); + } let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); - if valid_lmdb_key(&key_bytes) { - // document id is encoded in native-endian because of the CBO roaring bitmap codec - facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; - } + // document id is encoded in native-endian because of the CBO roaring bitmap codec + facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 44afcde6c..b37cd90d3 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -12,6 +12,7 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; +use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32}; @@ -85,10 +86,16 @@ pub fn extract_fid_docid_facet_values( } } - // insert normalized and
original facet string in sorter + // insert normalized and original facet string in sorter for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { + let normalised_truncated_value: String = normalized + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + key_buffer.truncate(size_of::() + size_of::()); - key_buffer.extend_from_slice(normalized.as_bytes()); + key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; } } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 8fb629cae..e1f112858 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -18,8 +18,20 @@ pub use merge_functions::{ serialize_roaring_bitmap, MergeFn, }; +/// The maximum length a LMDB key can be. +/// +/// Note that the actual allowed length is a little bit higher, but +/// we keep a margin of safety. +const MAX_LMDB_KEY_LENGTH: usize = 500; + +/// The maximum length a field value can be when inserted in an LMDB key. +/// +/// This number is determined by the keys of the different facet databases +/// and adding a margin of safety. +pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; + /// The maximum length a word can be -pub const MAX_WORD_LENGTH: usize = 250; +pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() From 0caadedd3b9e74d18905715f74afc6a5b0bd4544 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 17 Nov 2022 12:17:53 +0100 Subject: [PATCH 1805/1889] Make clippy happy --- milli/src/update/facet/incremental.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index b07b675c5..ddf55b06c 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -131,11 +131,11 @@ impl FacetsUpdateIncrementalInner { let mut iter = self.db.as_polymorph().prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( txn, - &prefix.as_slice(), + prefix.as_slice(), )?; let (key_bytes, value) = iter.next().unwrap()?; Ok(( - FacetGroupKeyCodec::::bytes_decode(&key_bytes) + FacetGroupKeyCodec::::bytes_decode(key_bytes) .ok_or(Error::Encoding)? 
.into_owned(), value, From 777eb3fa006443b787cab58ce6705128b8219bac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 17 Nov 2022 12:21:27 +0100 Subject: [PATCH 1806/1889] Add insta-snaps for test of bug 3043 --- .../prefix_word_pair_proximity_docids.snap | 4 ++++ .../test_batch_bug_3043/word_pair_proximity_docids.snap | 8 ++++++++ .../word_prefix_pair_proximity_docids.snap | 7 +++++++ 3 files changed, 19 insertions(+) create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..d212999bb --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap new file mode 100644 index 000000000..816895dcf --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap @@ -0,0 +1,8 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a y [51, ] +1 x a [51, ] +1 x y [50, ] +2 x y [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..03530a2f1 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a y [51, ] +1 x y [50, ] +2 x y [51, ] + From cd5aaa3a9fb45fe7211add394e514c7f52f59395 Mon Sep 17 00:00:00 2001 From: curquiza Date: Thu, 17 Nov 2022 12:50:07 +0000 Subject: [PATCH 1807/1889] Update version for the next release (v0.37.0) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index a49c08d39..fdef63729 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.36.0" +version = "0.37.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 8c9cc6c5e..0148ac87e 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.36.0" +version = "0.37.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 50961e01f..d50563ec0 100644 --- a/filter-parser/Cargo.toml +++ 
b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.36.0" +version = "0.37.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 3fd52a9fe..972cfc3a7 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.36.0" +version = "0.37.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 3e7c2fae9..dc36c1a3f 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.36.0" +version = "0.37.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 6867edb79..60d45730c 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.36.0" +version = "0.37.0" authors = ["Kerollmops "] edition = "2018" From d19c8672bb66eaf6405ca726f6b6eaa0021ea61c Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Wed, 23 Nov 2022 15:50:53 -0500 Subject: [PATCH 1808/1889] perf: limit reindex to when exact_attributes changes --- milli/src/index.rs | 5 ++--- milli/src/update/settings.rs | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 5910a305c..d9636634d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1145,9 +1145,8 @@ impl Index { } /// Clears the exact attributes from the store. - pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<()> { - self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?; - Ok(()) + pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result { + Ok(self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?) } pub fn max_values_per_facet(&self, txn: &RoTxn) -> heed::Result> { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 6da32d73f..8220ed3ab 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -465,14 +465,23 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_exact_attributes(&mut self) -> Result { match self.exact_attributes { Setting::Set(ref attrs) => { - let attrs = attrs.iter().map(String::as_str).collect::>(); - self.index.put_exact_attributes(self.wtxn, &attrs)?; - Ok(true) - } - Setting::Reset => { - self.index.delete_exact_attributes(self.wtxn)?; - Ok(true) + let old_attrs = self + .index + .exact_attributes(self.wtxn)? 
+ .iter() + .cloned() + .map(String::from) + .collect::>(); + + if attrs != &old_attrs { + let attrs = attrs.iter().map(String::as_str).collect::>(); + self.index.put_exact_attributes(self.wtxn, &attrs)?; + Ok(true) + } else { + Ok(false) + } } + Setting::Reset => Ok(self.index.delete_exact_attributes(self.wtxn)?), Setting::NotSet => Ok(false), } } From 7c0e544839d1cea94f45f07757251c8d25f6366e Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Wed, 23 Nov 2022 21:18:58 -0500 Subject: [PATCH 1809/1889] feat: Add all_obkv_to_json function --- cli/src/main.rs | 2 +- milli/src/lib.rs | 30 +++++++++++++++++++++++++++++- milli/src/update/settings.rs | 2 +- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index dd5489ebc..f2bbc0b23 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -450,7 +450,7 @@ impl Search { let documents = index.documents(&txn, result.documents_ids)?; let mut jsons = Vec::new(); for (_, obkv) in documents { - let json = milli::obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?; + let json = milli::obkv_to_json(&displayed_fields, &fields_ids_map, &obkv)?; jsons.push(json); } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index c33aae9eb..7d74637f2 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -88,7 +88,7 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, - obkv: obkv::KvReaderU16, + obkv: &obkv::KvReaderU16, ) -> Result { displayed_fields .iter() @@ -105,6 +105,12 @@ pub fn obkv_to_json( .collect() } +/// Transform every field of a raw obkv store into a JSON Object. +pub fn all_obkv_to_json(obkv: &obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result { + let all_keys = obkv.iter().map(|(k, _v)| k).collect::>(); + obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv) +} + /// Transform a JSON value into a string that can be indexed. 
pub fn json_to_string(value: &Value) -> Option { fn inner(value: &Value, output: &mut String) -> bool { @@ -285,4 +291,26 @@ mod tests { assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678)); assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF)); } + + #[test] + fn test_all_obkv_to_json() { + let mut fields_ids_map = FieldsIdsMap::new(); + let id1 = fields_ids_map.insert("field1").unwrap(); + let id2 = fields_ids_map.insert("field2").unwrap(); + + let mut writer = obkv::KvWriterU16::memory(); + writer.insert(id1, b"1234").unwrap(); + writer.insert(id2, b"4321").unwrap(); + let contents = writer.into_inner().unwrap(); + let obkv = obkv::KvReaderU16::new(&contents); + + let expected = json!({ + "field1": 1234, + "field2": 4321, + }); + let expected = expected.as_object().unwrap(); + let actual = all_obkv_to_json(&obkv, &fields_ids_map).unwrap(); + + assert_eq!(&actual, expected); + } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 6da32d73f..eb5f56c45 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -920,7 +920,7 @@ mod tests { let fidmap = index.fields_ids_map(&rtxn).unwrap(); for document in index.all_documents(&rtxn).unwrap() { let document = document.unwrap(); - let json = crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, document.1) + let json = crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, &document.1) .unwrap(); println!("json: {:?}", json); } From bb9e33bf85f5fd69d49b72cd6fc43b97f951d4ff Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Wed, 23 Nov 2022 22:01:46 -0500 Subject: [PATCH 1810/1889] perf: Prevent reindex in searchable reset case when not needed --- milli/src/update/settings.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 8220ed3ab..586198c52 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -373,13 +373,11 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { &new_fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; + Ok(true) } - Setting::Reset => { - self.index.delete_all_searchable_fields(self.wtxn)?; - } + Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), Setting::NotSet => return Ok(false), } - Ok(true) } fn update_stop_words(&mut self) -> Result { From ed29cceae940d580ccdf5407c7dea0c07d168d86 Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Wed, 23 Nov 2022 22:33:06 -0500 Subject: [PATCH 1811/1889] perf: Prevent reindex in searchable set case when not needed --- milli/src/update/settings.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 586198c52..aed2d951e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -349,6 +349,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_searchable(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { + let did_change = self + .index + .searchable_fields(self.wtxn)? + .map(|f| f.into_iter().map(String::from).collect::>()) + .map(|old_fields| fields != &old_fields) + .unwrap_or(true); // if old_fields was None before, it was changed + if !did_change { + return Ok(false); + } + // every time the searchable attributes are updated, we need to update the // ids for any settings that uses the facets. (distinct_fields, filterable_fields). 
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -376,7 +386,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), - Setting::NotSet => return Ok(false), + Setting::NotSet => Ok(false), } } From 935a724c570b3b851da544b96734ce20cf280f2c Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Thu, 24 Nov 2022 10:08:23 -0500 Subject: [PATCH 1812/1889] revert: Revert pass by reference API change --- cli/src/main.rs | 2 +- milli/src/lib.rs | 6 +++--- milli/src/update/settings.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index f2bbc0b23..dd5489ebc 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -450,7 +450,7 @@ impl Search { let documents = index.documents(&txn, result.documents_ids)?; let mut jsons = Vec::new(); for (_, obkv) in documents { - let json = milli::obkv_to_json(&displayed_fields, &fields_ids_map, &obkv)?; + let json = milli::obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?; jsons.push(json); } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 7d74637f2..21851430e 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -88,7 +88,7 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, - obkv: &obkv::KvReaderU16, + obkv: obkv::KvReaderU16, ) -> Result { displayed_fields .iter() @@ -106,7 +106,7 @@ pub fn obkv_to_json( } /// Transform every field of a raw obkv store into a JSON Object. -pub fn all_obkv_to_json(obkv: &obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result { +pub fn all_obkv_to_json(obkv: obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result { let all_keys = obkv.iter().map(|(k, _v)| k).collect::>(); obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv) } @@ -309,7 +309,7 @@ mod tests { "field2": 4321, }); let expected = expected.as_object().unwrap(); - let actual = all_obkv_to_json(&obkv, &fields_ids_map).unwrap(); + let actual = all_obkv_to_json(obkv, &fields_ids_map).unwrap(); assert_eq!(&actual, expected); } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index eb5f56c45..6da32d73f 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -920,7 +920,7 @@ mod tests { let fidmap = index.fields_ids_map(&rtxn).unwrap(); for document in index.all_documents(&rtxn).unwrap() { let document = document.unwrap(); - let json = crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, &document.1) + let json = crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, document.1) .unwrap(); println!("json: {:?}", json); } From 3958db4b17334e96e9eb1f904ee981e7dee26c6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 24 Nov 2022 16:26:48 +0100 Subject: [PATCH 1813/1889] Update the CI to use Rust Stable --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index b0d15a6e0..9d1e60baa 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -56,7 +56,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: nightly + toolchain: stable override: true components: clippy - name: Cache dependencies @@ -74,7 +74,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: nightly + toolchain: stable override: true components: rustfmt - name: Cache dependencies 
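Aside: patches 1808, 1810 and 1811 above share one pattern: each update_* settings method now returns Ok(bool) to tell the caller whether a reindex is needed, and it compares the incoming setting with the stored one before doing any work. The following is a minimal sketch of that pattern, not the actual milli API: SettingsStore and its in-memory field are hypothetical stand-ins for the real heed-backed Index, whose methods take write transactions instead.

// A minimal sketch, not the milli API: SettingsStore stands in for the
// heed-backed Index, and the Setting enum mirrors the one used above.
enum Setting<T> {
    Set(T),
    Reset,
    NotSet,
}

#[derive(Default)]
struct SettingsStore {
    searchable_fields: Option<Vec<String>>,
}

impl SettingsStore {
    // Returns true only when the stored value actually changed, so the
    // caller can skip the expensive reindexing step otherwise.
    fn update_searchable(&mut self, new: Setting<Vec<String>>) -> bool {
        match new {
            Setting::Set(fields) => {
                // Same check as patch 1811: an identical input means no change.
                if self.searchable_fields.as_ref() == Some(&fields) {
                    return false;
                }
                self.searchable_fields = Some(fields);
                true
            }
            // Same idea as patches 1808 and 1810: a reset only counts as a
            // change if a value was actually stored before.
            Setting::Reset => self.searchable_fields.take().is_some(),
            Setting::NotSet => false,
        }
    }
}

fn main() {
    let mut store = SettingsStore::default();
    assert!(store.update_searchable(Setting::Set(vec!["title".into()])));
    // Re-sending the same value must not trigger a reindex.
    assert!(!store.update_searchable(Setting::Set(vec!["title".into()])));
    assert!(store.update_searchable(Setting::Reset));
    assert!(!store.update_searchable(Setting::NotSet));
}

The NotSet arm is what keeps partial settings updates cheap: fields the user did not touch report no change, and the caller only triggers the costly reindex when at least one update reports true.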
From 3d06ea41eaeb5312d2e7f2b5cdcc384044a7726f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 24 Nov 2022 16:54:40 +0100 Subject: [PATCH 1814/1889] Keep a nightly for rustfmt Co-authored-by: Tamo --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9d1e60baa..0962e4511 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -74,7 +74,7 @@ jobs: - uses: actions-rs/toolchain@v1 with: profile: minimal - toolchain: stable + toolchain: nightly override: true components: rustfmt - name: Cache dependencies From 2db738dbac9b3e35b69a989b835e27b58f44d5a1 Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Sat, 26 Nov 2022 13:26:39 -0500 Subject: [PATCH 1815/1889] refactor: rewrite method chain to be more readable --- milli/src/update/settings.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index aed2d951e..7d281262a 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -349,12 +349,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_searchable(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { - let did_change = self - .index - .searchable_fields(self.wtxn)? - .map(|f| f.into_iter().map(String::from).collect::>()) - .map(|old_fields| fields != &old_fields) - .unwrap_or(true); // if old_fields was None before, it was changed + // Check to see if the searchable fields changed before doing anything else + let old_fields = self.index.searchable_fields(self.wtxn)?; + let did_change = match old_fields { + // If old_fields is Some, let's check to see if the fields actually changed + Some(old_fields) => { + let new_fields = fields.iter().map(String::as_str).collect::>(); + new_fields != old_fields + } + // If old_fields is None, the fields have changed (because they are being set) + None => true, + }; if !did_change { return Ok(false); } From e0d24104a3c66a9a597288032212f7ae1cc06e9a Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Sat, 26 Nov 2022 13:33:19 -0500 Subject: [PATCH 1816/1889] refactor: Rewrite another method chain to be more readable --- milli/src/update/settings.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 7d281262a..fc7e6bc03 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -478,13 +478,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fn update_exact_attributes(&mut self) -> Result { match self.exact_attributes { Setting::Set(ref attrs) => { - let old_attrs = self - .index - .exact_attributes(self.wtxn)? 
- .iter() - .cloned() - .map(String::from) - .collect::>(); + let old_attrs = self.index.exact_attributes(self.wtxn)?; + let old_attrs = old_attrs.into_iter().map(String::from).collect::>(); if attrs != &old_attrs { let attrs = attrs.iter().map(String::as_str).collect::>(); From eba7af1d2ca92dcc3a3be520dc930722bafeac28 Mon Sep 17 00:00:00 2001 From: Minh Pham Date: Sun, 27 Nov 2022 06:47:08 +0700 Subject: [PATCH 1817/1889] Replace deprecated gh actions --- .github/workflows/manual_benchmarks.yml | 8 ++++---- .github/workflows/push_benchmarks_indexing.yml | 8 ++++---- .github/workflows/push_benchmarks_search_geo.yml | 8 ++++---- .github/workflows/push_benchmarks_search_songs.yml | 8 ++++---- .github/workflows/push_benchmarks_search_wiki.yml | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml index e7cbfe68b..17e006224 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/manual_benchmarks.yml @@ -27,19 +27,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $BENCH_NAME;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml index 1011f2461..9b53a3940 100644 --- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -25,19 +25,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ 
steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> ${BENCH_NAME};]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml index 7aa98ca58..a1b18f6bd 100644 --- a/.github/workflows/push_benchmarks_search_geo.yml +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $BENCH_NAME;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml index aa0fcacd4..4f1f631ba 100644 --- a/.github/workflows/push_benchmarks_search_songs.yml +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $BENCH_NAME;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml index 2f266941d..7abf0661f 100644 --- a/.github/workflows/push_benchmarks_search_wiki.yml +++ b/.github/workflows/push_benchmarks_search_wiki.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current 
branch name shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[set-output name=name;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[set-output name=short;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[set-output name=basename;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $BENCH_NAME;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks From 86c34a996b5713915215d107d5cc22735ef238ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 31 Oct 2022 13:33:49 +0100 Subject: [PATCH 1818/1889] Deduplicate matching words --- milli/src/search/matches/matching_words.rs | 21 ++-- milli/src/search/matches/mod.rs | 50 +++++---- milli/src/search/query_tree.rs | 124 ++++++++++++++++++--- 3 files changed, 151 insertions(+), 44 deletions(-) diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 25d447d0c..5bd6c222d 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -2,6 +2,7 @@ use std::cmp::{min, Reverse}; use std::collections::BTreeMap; use std::fmt; use std::ops::{Index, IndexMut}; +use std::rc::Rc; use charabia::Token; use levenshtein_automata::{Distance, DFA}; @@ -14,11 +15,11 @@ type IsPrefix = bool; /// referencing words that match the given query tree. #[derive(Default)] pub struct MatchingWords { - inner: Vec<(Vec, Vec)>, + inner: Vec<(Vec>, Vec)>, } impl MatchingWords { - pub fn new(mut matching_words: Vec<(Vec, Vec)>) -> Self { + pub fn new(mut matching_words: Vec<(Vec>, Vec)>) -> Self { // Sort word by len in DESC order prioritizing the longuest matches, // in order to highlight the longuest part of the matched word. matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); @@ -35,7 +36,8 @@ impl MatchingWords { /// Iterator over terms that match the given token, /// This allow to lazily evaluate matches. pub struct MatchesIter<'a, 'b> { - inner: Box, Vec)> + 'a>, + #[allow(clippy::type_complexity)] + inner: Box>, Vec)> + 'a>, token: &'b Token<'b>, } @@ -126,7 +128,7 @@ pub enum MatchType<'a> { /// Structure helper to match several tokens in a row in order to complete a partial match. 
#[derive(Debug, PartialEq)] pub struct PartialMatch<'a> { - matching_words: &'a [MatchingWord], + matching_words: &'a [Rc], ids: &'a [PrimitiveWordId], char_len: usize, } @@ -332,10 +334,15 @@ mod tests { #[test] fn matching_words() { + let all = vec![ + Rc::new(MatchingWord::new("split".to_string(), 1, true)), + Rc::new(MatchingWord::new("this".to_string(), 0, false)), + Rc::new(MatchingWord::new("world".to_string(), 1, true)), + ]; let matching_words = vec![ - (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]), - (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]), - (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone()], vec![1]), + (vec![all[2].clone()], vec![2]), ]; let matching_words = MatchingWords::new(matching_words); diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index ec47f848d..0e515fde6 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -494,16 +494,23 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { #[cfg(test)] mod tests { + use std::rc::Rc; + use charabia::TokenizerBuilder; use super::*; use crate::search::matches::matching_words::MatchingWord; fn matching_words() -> MatchingWords { + let all = vec![ + Rc::new(MatchingWord::new("split".to_string(), 0, false)), + Rc::new(MatchingWord::new("the".to_string(), 0, false)), + Rc::new(MatchingWord::new("world".to_string(), 1, true)), + ]; let matching_words = vec![ - (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]), - (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]), - (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone()], vec![1]), + (vec![all[2].clone()], vec![2]), ]; MatchingWords::new(matching_words) @@ -587,10 +594,11 @@ mod tests { #[test] fn highlight_unicode() { - let matching_words = vec![ - (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]), - (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]), + let all = vec![ + Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)), + Rc::new(MatchingWord::new("world".to_string(), 1, true)), ]; + let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; let matching_words = MatchingWords::new(matching_words); @@ -823,24 +831,20 @@ mod tests { #[test] fn partial_matches() { + let all = vec![ + Rc::new(MatchingWord::new("the".to_string(), 0, false)), + Rc::new(MatchingWord::new("t".to_string(), 0, false)), + Rc::new(MatchingWord::new("he".to_string(), 0, false)), + Rc::new(MatchingWord::new("door".to_string(), 0, false)), + Rc::new(MatchingWord::new("do".to_string(), 0, false)), + Rc::new(MatchingWord::new("or".to_string(), 0, false)), + ]; let matching_words = vec![ - (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), - ( - vec![ - MatchingWord::new("t".to_string(), 0, false), - MatchingWord::new("he".to_string(), 0, false), - ], - vec![0], - ), - (vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]), - ( - vec![ - MatchingWord::new("do".to_string(), 0, false), - MatchingWord::new("or".to_string(), 0, false), - ], - vec![1], - ), - (vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]), + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone(), all[2].clone()], vec![0]), + (vec![all[3].clone()], vec![1]), + (vec![all[4].clone(), all[5].clone()], vec![1]), + (vec![all[4].clone()], vec![2]), ]; let matching_words = 
MatchingWords::new(matching_words); diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index a9c1ac29f..acb326022 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1,5 +1,9 @@ use std::borrow::Cow; use std::cmp::max; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::hash::Hash; +use std::rc::Rc; use std::{fmt, mem}; use charabia::classifier::ClassifiedTokenIter; @@ -540,6 +544,30 @@ fn create_query_tree( Ok(Operation::or(true, operation_children)) } +#[derive(Default, Debug)] +struct MatchingWordCache { + all: Vec>, + map: HashMap<(String, u8, bool), Rc>, +} +impl MatchingWordCache { + fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Rc { + // Toggle the (un)commented code to switch between cached and non-cached + // implementations. + + // self.all.push(MatchingWord::new(word, typo, prefix)); + // self.all.len() - 1 + match self.map.entry((word.clone(), typo, prefix)) { + Entry::Occupied(idx) => idx.get().clone(), + Entry::Vacant(vacant) => { + let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)); + self.all.push(matching_word.clone()); + vacant.insert(matching_word.clone()); + matching_word + } + } + } +} + /// Main function that matchings words used for crop and highlight. fn create_matching_words( ctx: &impl Context, @@ -551,7 +579,8 @@ fn create_matching_words( ctx: &impl Context, authorize_typos: bool, part: PrimitiveQueryPart, - matching_words: &mut Vec<(Vec, Vec)>, + matching_words: &mut Vec<(Vec>, Vec)>, + matching_word_cache: &mut MatchingWordCache, id: PrimitiveWordId, ) -> Result<()> { match part { @@ -562,15 +591,15 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| MatchingWord::new(syn, 0, false)) + .map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, vec![id])); } } if let Some((left, right)) = split_best_frequency(ctx, &word)? { - let left = MatchingWord::new(left.to_string(), 0, false); - let right = MatchingWord::new(right.to_string(), 0, false); + let left = matching_word_cache.insert(left.to_string(), 0, false); + let right = matching_word_cache.insert(right.to_string(), 0, false); matching_words.push((vec![left, right], vec![id])); } @@ -580,8 +609,10 @@ fn create_matching_words( TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; let matching_word = match typos(word, authorize_typos, config) { - QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix), - QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix), + QueryKind::Exact { word, .. 
} => matching_word_cache.insert(word, 0, prefix), + QueryKind::Tolerant { typo, word } => { + matching_word_cache.insert(word, typo, prefix) + } }; matching_words.push((vec![matching_word], vec![id])); } @@ -589,8 +620,11 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = - words.into_iter().flatten().map(|w| MatchingWord::new(w, 0, false)).collect(); + let words = words + .into_iter() + .flatten() + .map(|w| matching_word_cache.insert(w, 0, false)) + .collect(); matching_words.push((words, ids)); } } @@ -603,7 +637,8 @@ fn create_matching_words( ctx: &impl Context, authorize_typos: bool, query: &[PrimitiveQueryPart], - matching_words: &mut Vec<(Vec, Vec)>, + matching_words: &mut Vec<(Vec>, Vec)>, + matching_word_cache: &mut MatchingWordCache, mut id: PrimitiveWordId, ) -> Result<()> { const MAX_NGRAM: usize = 3; @@ -621,6 +656,7 @@ fn create_matching_words( authorize_typos, part.clone(), matching_words, + matching_word_cache, id, )?; } @@ -645,7 +681,7 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| MatchingWord::new(syn, 0, false)) + .map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, ids.clone())); } @@ -662,10 +698,10 @@ fn create_matching_words( }; let matching_word = match typos(word, authorize_typos, config) { QueryKind::Exact { word, .. } => { - MatchingWord::new(word, 0, is_prefix) + matching_word_cache.insert(word, 0, is_prefix) } QueryKind::Tolerant { typo, word } => { - MatchingWord::new(word, typo, is_prefix) + matching_word_cache.insert(word, typo, is_prefix) } }; matching_words.push((vec![matching_word], ids)); @@ -673,7 +709,14 @@ fn create_matching_words( } if !is_last { - ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?; + ngrams( + ctx, + authorize_typos, + tail, + matching_words, + matching_word_cache, + id + 1, + )?; } } } @@ -683,8 +726,9 @@ fn create_matching_words( Ok(()) } + let mut matching_word_cache = MatchingWordCache::default(); let mut matching_words = Vec::new(); - ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?; + ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?; Ok(MatchingWords::new(matching_words)) } @@ -806,7 +850,9 @@ pub fn maximum_proximity(operation: &Operation) -> usize { #[cfg(test)] mod test { + use std::alloc::{GlobalAlloc, System}; use std::collections::HashMap; + use std::sync::atomic::{self, AtomicI64}; use charabia::Tokenize; use maplit::hashmap; @@ -814,6 +860,7 @@ mod test { use rand::{Rng, SeedableRng}; use super::*; + use crate::index::tests::TempIndex; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; #[derive(Debug)] @@ -1310,4 +1357,53 @@ mod test { Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. 
} }) )); } + + #[global_allocator] + static ALLOC: CountingAlloc = + CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; + + pub struct CountingAlloc { + pub resident: AtomicI64, + pub allocated: AtomicI64, + } + unsafe impl GlobalAlloc for CountingAlloc { + unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { + self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); + self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); + + System.alloc(layout) + } + + unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { + self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::SeqCst); + System.dealloc(ptr, layout) + } + } + + // This test must be run + #[test] + fn ten_words() { + let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); + let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); + + let index = TempIndex::new(); + let rtxn = index.read_txn().unwrap(); + let query = "a beautiful summer house by the beach overlooking what seems"; + let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + builder.words_limit(10); + let x = builder.build(query.tokenize()).unwrap().unwrap(); + let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); + let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); + + insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4521710"); + insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7259092"); + + // Note, if the matching word cache is deactivated, the memory usage is: + // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91311265"); + // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125948410"); + // or about 20x more resident memory (90MB vs 4.5MB) + + // Use x + let _x = x; + } } From 8d0ace2d64aa37558c354249e6104e7d407f3a1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 24 Nov 2022 09:00:53 +0100 Subject: [PATCH 1819/1889] Avoid creating a MatchingWord for words that exceed the length limit --- milli/src/lib.rs | 15 ++++ milli/src/search/matches/matching_words.rs | 25 ++++-- milli/src/search/matches/mod.rs | 22 ++--- milli/src/search/query_tree.rs | 82 +++++++++++++------ .../extract/extract_docid_word_positions.rs | 8 +- .../extract/extract_facet_string_docids.rs | 3 +- .../extract/extract_fid_docid_facet_values.rs | 3 +- .../src/update/index_documents/helpers/mod.rs | 15 +--- 8 files changed, 111 insertions(+), 62 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index c33aae9eb..40e36092f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -70,6 +70,21 @@ pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; /// expressed in term of latitude and longitude. pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>; +/// The maximum length a LMDB key can be. +/// +/// Note that the actual allowed length is a little bit higher, but +/// we keep a margin of safety. +const MAX_LMDB_KEY_LENGTH: usize = 500; + +/// The maximum length a field value can be when inserted in an LMDB key. +/// +/// This number is determined by the keys of the different facet databases +/// and adding a margin of safety. 
+pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; + +/// The maximum length a word can be +pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; + pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; // Convert an absolute word position into a relative position. diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 5bd6c222d..22ba973b5 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -8,6 +8,7 @@ use charabia::Token; use levenshtein_automata::{Distance, DFA}; use crate::search::build_dfa; +use crate::MAX_WORD_LENGTH; type IsPrefix = bool; @@ -18,6 +19,17 @@ pub struct MatchingWords { inner: Vec<(Vec>, Vec)>, } +impl fmt::Debug for MatchingWords { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "[")?; + for (matching_words, primitive_word_id) in self.inner.iter() { + writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?; + } + writeln!(f, "]")?; + Ok(()) + } +} + impl MatchingWords { pub fn new(mut matching_words: Vec<(Vec>, Vec)>) -> Self { // Sort word by len in DESC order prioritizing the longuest matches, @@ -93,10 +105,13 @@ impl PartialEq for MatchingWord { } impl MatchingWord { - pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self { + pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option { + if word.len() > MAX_WORD_LENGTH { + return None; + } let dfa = build_dfa(&word, typo, prefix); - Self { dfa, word, typo, prefix } + Some(Self { dfa, word, typo, prefix }) } /// Returns the lenght in chars of the match in case of the token matches the term. @@ -335,9 +350,9 @@ mod tests { #[test] fn matching_words() { let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 1, true)), - Rc::new(MatchingWord::new("this".to_string(), 0, false)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()), + Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 0e515fde6..25ee52ab1 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -503,9 +503,9 @@ mod tests { fn matching_words() -> MatchingWords { let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 0, false)), - Rc::new(MatchingWord::new("the".to_string(), 0, false)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), @@ -595,8 +595,8 @@ mod tests { #[test] fn highlight_unicode() { let all = vec![ - Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)), - Rc::new(MatchingWord::new("world".to_string(), 1, true)), + Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), ]; let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; @@ -832,12 +832,12 @@ mod tests { #[test] fn partial_matches() { let all = vec![ - Rc::new(MatchingWord::new("the".to_string(), 0, false)), - Rc::new(MatchingWord::new("t".to_string(), 
0, false)), - Rc::new(MatchingWord::new("he".to_string(), 0, false)), - Rc::new(MatchingWord::new("door".to_string(), 0, false)), - Rc::new(MatchingWord::new("do".to_string(), 0, false)), - Rc::new(MatchingWord::new("or".to_string(), 0, false)), + Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()), ]; let matching_words = vec![ (vec![all[0].clone()], vec![0]), diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index acb326022..74b244f9a 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -550,21 +550,20 @@ struct MatchingWordCache { map: HashMap<(String, u8, bool), Rc<MatchingWord>>, } impl MatchingWordCache { - fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Rc<MatchingWord> { - // Toggle the (un)commented code to switch between cached and non-cached - // implementations. - - // self.all.push(MatchingWord::new(word, typo, prefix)); - // self.all.len() - 1 + fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option<Rc<MatchingWord>> { match self.map.entry((word.clone(), typo, prefix)) { - Entry::Occupied(idx) => idx.get().clone(), + Entry::Occupied(idx) => Some(idx.get().clone()), Entry::Vacant(vacant) => { - let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)); + let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); self.all.push(matching_word.clone()); vacant.insert(matching_word.clone()); - matching_word + Some(matching_word) } } + // To deactivate the cache, for testing purposes, use the following instead: + // let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); + // self.all.push(matching_word.clone()); + // Some(matching_word) } } @@ -591,16 +590,19 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| matching_word_cache.insert(syn, 0, false)) + .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, vec![id])); } } if let Some((left, right)) = split_best_frequency(ctx, &word)?
{ - let left = matching_word_cache.insert(left.to_string(), 0, false); - let right = matching_word_cache.insert(right.to_string(), 0, false); - matching_words.push((vec![left, right], vec![id])); + if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) { + if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false) + { + matching_words.push((vec![left, right], vec![id])); + } + } } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; @@ -614,7 +616,9 @@ fn create_matching_words( matching_word_cache.insert(word, typo, prefix) } }; - matching_words.push((vec![matching_word], vec![id])); + if let Some(matching_word) = matching_word { + matching_words.push((vec![matching_word], vec![id])); + } } // create a CONSECUTIVE matchings words wrapping all word in the phrase PrimitiveQueryPart::Phrase(words) => { @@ -623,7 +627,7 @@ fn create_matching_words( let words = words .into_iter() .flatten() - .map(|w| matching_word_cache.insert(w, 0, false)) + .flat_map(|w| matching_word_cache.insert(w, 0, false)) .collect(); matching_words.push((words, ids)); } @@ -681,7 +685,7 @@ fn create_matching_words( for synonym in synonyms { let synonym = synonym .into_iter() - .map(|syn| matching_word_cache.insert(syn, 0, false)) + .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) .collect(); matching_words.push((synonym, ids.clone())); } @@ -704,7 +708,9 @@ fn create_matching_words( matching_word_cache.insert(word, typo, is_prefix) } }; - matching_words.push((vec![matching_word], ids)); + if let Some(matching_word) = matching_word { + matching_words.push((vec![matching_word], ids)); + } } } @@ -1341,6 +1347,27 @@ mod test { ); } + #[test] + fn test_dont_create_matching_word_for_long_words() { + let index = TempIndex::new(); + let rtxn = index.read_txn().unwrap(); + let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house"; + let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + builder.words_limit(10); + let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); + insta::assert_snapshot!(format!("{matching_words:?}"), @r###" + [ + ([MatchingWord { word: "house", typo: 1, prefix: true }], [3]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) + ([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [1]) + ([MatchingWord { word: "what", typo: 0, prefix: false }], [0]) + ([MatchingWord { word: "a", typo: 0, prefix: false }], [1]) + ] + "###); + } + #[test] fn disable_typo_on_word() { let query = "goodbye"; @@ -1380,9 +1407,8 @@ mod test { } } - // This test must be run #[test] - fn ten_words() { + fn memory_usage_of_ten_word_query() { let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); @@ -1395,12 +1421,20 @@ mod test { let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); - insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4521710"); - insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7259092"); + // Weak check 
on the memory usage + // Don't keep more than 5MB. (Arguably 5MB is already too high) + assert!(resident_after - resident_before < 5_000_000); + // Don't allocate more than 10MB. + assert!(allocated_after - allocated_before < 10_000_000); - // Note, if the matching word cache is deactivated, the memory usage is: - // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91311265"); - // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125948410"); + // Use these snapshots to measure the exact memory usage. + // The values below were correct at the time I wrote them. + // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); + // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); + + // Note, with the matching word cache deactivated, the memory usage was: + // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); + // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); // or about 20x more resident memory (90MB vs 4.5MB) // Use x diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 8eae0caee..be9b479bb 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -7,11 +7,11 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{ - concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH, -}; +use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{ + absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, +}; /// Extracts the word and positions where this word appear and /// prefixes it by the document id. diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index 3a0af3c96..0d9c0981e 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -6,9 +6,8 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::StrRefCodec; -use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; use crate::update::index_documents::merge_cbo_roaring_bitmaps; -use crate::{FieldId, Result}; +use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. 
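As background on why these length constants exist at all: LMDB refuses keys longer than its compiled-in maximum (511 bytes by default, hence MAX_LMDB_KEY_LENGTH = 500 with a safety margin), so any facet string that ends up inside an LMDB key must first be clamped to MAX_FACET_VALUE_LENGTH, and the cut has to land on a char boundary to keep the value valid UTF-8. A minimal sketch of that clamping, assuming a helper of this shape (the exact truncation code used by the extractors is not shown in this patch):

fn truncate_facet_value(value: &str) -> &str {
    if value.len() <= MAX_FACET_VALUE_LENGTH {
        return value;
    }
    // Walk back to the nearest char boundary so the slice stays valid UTF-8.
    let mut end = MAX_FACET_VALUE_LENGTH;
    while !value.is_char_boundary(end) {
        end -= 1;
    }
    &value[..end]
}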
/// diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index b37cd90d3..0a7dfbeb1 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -12,9 +12,8 @@ use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::index_documents::helpers::MAX_FACET_VALUE_LENGTH; use crate::update::index_documents::{create_writer, writer_into_reader}; -use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32}; +use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet values of each faceted field of each document. /// diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index e1f112858..a496ccd6e 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -18,20 +18,7 @@ pub use merge_functions::{ serialize_roaring_bitmap, MergeFn, }; -/// The maximum length a LMDB key can be. -/// -/// Note that the actual allowed length is a little bit higher, but -/// we keep a margin of safety. -const MAX_LMDB_KEY_LENGTH: usize = 500; - -/// The maximum length a field value can be when inserted in an LMDB key. -/// -/// This number is determined by the keys of the different facet databases -/// and adding a margin of safety. -pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; - -/// The maximum length a word can be -pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; +use crate::MAX_WORD_LENGTH; pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() From 8284bd760f601da6c22b76bea61dba9460fec4ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 24 Nov 2022 09:29:10 +0100 Subject: [PATCH 1820/1889] Relax memory ordering of operations within the test CountingAlloc --- milli/src/search/query_tree.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 74b244f9a..9d0ca5633 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -1395,14 +1395,14 @@ mod test { } unsafe impl GlobalAlloc for CountingAlloc { unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { - self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); - self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); + self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); + self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); System.alloc(layout) } unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { - self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::SeqCst); + self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); System.dealloc(ptr, layout) } } From e2ebed62b1f20b36d1cf802938867ec3046e5a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 10:19:43 +0100 Subject: [PATCH 1821/1889] Don't create partial matching words for synonyms, split words, phrases --- milli/src/search/query_tree.rs | 26 
++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 9d0ca5633..b218b48e2 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -588,15 +588,21 @@ fn create_matching_words( PrimitiveQueryPart::Word(word, prefix) => { if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? { for synonym in synonyms { - let synonym = synonym + // Require that all words of the synonym have a corresponding MatchingWord + // before adding any of its words to the matching_words result. + if let Some(synonym_matching_words) = synonym .into_iter() - .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) - .collect(); - matching_words.push((synonym, vec![id])); + .map(|word| matching_word_cache.insert(word, 0, false)) + .collect() + { + matching_words.push((synonym_matching_words, vec![id])); + } } } if let Some((left, right)) = split_best_frequency(ctx, &word)? { + // Require that both left and right words have a corresponding MatchingWord + // before adding them to the matching_words result if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) { if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false) { @@ -624,12 +630,16 @@ fn create_matching_words( PrimitiveQueryPart::Phrase(words) => { let ids: Vec<_> = (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); - let words = words + // Require that all words of the phrase have a corresponding MatchingWord + // before adding any of them to the matching_words result + if let Some(phrase_matching_words) = words .into_iter() .flatten() - .flat_map(|w| matching_word_cache.insert(w, 0, false)) - .collect(); - matching_words.push((words, ids)); + .map(|w| matching_word_cache.insert(w, 0, false)) + .collect() + { + matching_words.push((phrase_matching_words, ids)); + } } } From 80588daae516127b8aa6b2db9f9f55014f357900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 10:27:15 +0100 Subject: [PATCH 1822/1889] Fix compilation error in formatting benches --- benchmarks/benches/formatting.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs index 25c5a0ba8..3479029f4 100644 --- a/benchmarks/benches/formatting.rs +++ b/benchmarks/benches/formatting.rs @@ -1,3 +1,5 @@ +use std::rc::Rc; + use criterion::{criterion_group, criterion_main}; use milli::tokenizer::TokenizerBuilder; use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; @@ -18,14 +20,14 @@ fn bench_formatting(c: &mut criterion::Criterion) { name: "'the door d'", text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. 
But Theodor said that the doors don't work."#, matching_words: MatcherBuilder::new(MatchingWords::new(vec![ - (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]), - (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), - (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]), - (vec![MatchingWord::new("do".to_string(), 0, false), MatchingWord::new("or".to_string(), 0, false)], vec![0]), - (vec![MatchingWord::new("thedoor".to_string(), 1, false)], vec![0, 1]), - (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]), - (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]), - (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]), + (vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]), + (vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]), + (vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]), + (vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]), + (vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]), ] ), TokenizerBuilder::default().build()), }, From f70856bab1505b808261e6d889bcfb3dd9eaad61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 12:39:43 +0100 Subject: [PATCH 1823/1889] Remove memory usage test that fails when many tests are run in parallel --- milli/src/search/query_tree.rs | 117 +++++++++++++++++---------------- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index b218b48e2..6ea82f165 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -866,9 +866,7 @@ pub fn maximum_proximity(operation: &Operation) -> usize { #[cfg(test)] mod test { - use std::alloc::{GlobalAlloc, System}; use std::collections::HashMap; - use std::sync::atomic::{self, AtomicI64}; use charabia::Tokenize; use maplit::hashmap; @@ -1395,59 +1393,66 @@ mod test { )); } - #[global_allocator] - static ALLOC: CountingAlloc = - CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; + // The memory usage test below is disabled because `cargo test` runs multiple tests in parallel, + // which invalidates the measurements of memory usage. Nevertheless, it is a useful test to run + // manually from time to time, so I kept it here, commented-out. 
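(If the test is ever re-enabled, it only gives meaningful numbers when run alone on a single thread; with the standard libtest harness that is done with something like `cargo test -p milli memory_usage_of_ten_word_query -- --test-threads=1`.)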
- pub struct CountingAlloc { - pub resident: AtomicI64, - pub allocated: AtomicI64, - } - unsafe impl GlobalAlloc for CountingAlloc { - unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { - self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); - self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); - - System.alloc(layout) - } - - unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { - self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); - System.dealloc(ptr, layout) - } - } - - #[test] - fn memory_usage_of_ten_word_query() { - let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); - let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); - - let index = TempIndex::new(); - let rtxn = index.read_txn().unwrap(); - let query = "a beautiful summer house by the beach overlooking what seems"; - let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); - builder.words_limit(10); - let x = builder.build(query.tokenize()).unwrap().unwrap(); - let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); - let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); - - // Weak check on the memory usage - // Don't keep more than 5MB. (Arguably 5MB is already too high) - assert!(resident_after - resident_before < 5_000_000); - // Don't allocate more than 10MB. - assert!(allocated_after - allocated_before < 10_000_000); - - // Use these snapshots to measure the exact memory usage. - // The values below were correct at the time I wrote them. - // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); - // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); - - // Note, with the matching word cache deactivated, the memory usage was: - // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); - // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); - // or about 20x more resident memory (90MB vs 4.5MB) - - // Use x - let _x = x; - } + // use std::alloc::{GlobalAlloc, System}; + // use std::sync::atomic::{self, AtomicI64}; + // + // #[global_allocator] + // static ALLOC: CountingAlloc = + // CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; + // + // pub struct CountingAlloc { + // pub resident: AtomicI64, + // pub allocated: AtomicI64, + // } + // unsafe impl GlobalAlloc for CountingAlloc { + // unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { + // self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); + // self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); + // + // System.alloc(layout) + // } + // + // unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { + // self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); + // System.dealloc(ptr, layout) + // } + // } + // + // #[test] + // fn memory_usage_of_ten_word_query() { + // let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); + // let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); + // + // let index = TempIndex::new(); + // let rtxn = index.read_txn().unwrap(); + // let query = "a beautiful summer house by the beach overlooking what seems"; + // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + // builder.words_limit(10); + // let x = builder.build(query.tokenize()).unwrap().unwrap(); + // let resident_after 
= ALLOC.resident.load(atomic::Ordering::SeqCst); + // let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); + // + // // Weak check on the memory usage + // // Don't keep more than 5MB. (Arguably 5MB is already too high) + // assert!(resident_after - resident_before < 5_000_000); + // // Don't allocate more than 10MB. + // assert!(allocated_after - allocated_before < 10_000_000); + // + // // Use these snapshots to measure the exact memory usage. + // // The values below were correct at the time I wrote them. + // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); + // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); + // + // // Note, with the matching word cache deactivated, the memory usage was: + // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); + // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); + // // or about 20x more resident memory (90MB vs 4.5MB) + // + // // Use x + // let _x = x; + // } } From d3182f38307dc8be91f604a9f300a8b777ceb7f9 Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Mon, 28 Nov 2022 10:02:03 -0500 Subject: [PATCH 1824/1889] refactor: Change return type to keep consistency with others --- milli/src/index.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index d9636634d..9e4e56de0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1145,8 +1145,8 @@ impl Index { } /// Clears the exact attributes from the store. - pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> Result<bool> { - Ok(self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES)?) + pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> heed::Result<bool> { + self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES) } pub fn max_values_per_facet(&self, txn: &RoTxn) -> heed::Result<Option<usize>> { From 61b58b115a5dcfe7b0dc701c722f4832abdc747a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 16:32:28 +0100 Subject: [PATCH 1825/1889] Don't create partial matching words for synonyms in ngrams --- milli/src/search/query_tree.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 6ea82f165..e689ae440 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -693,11 +693,13 @@ fn create_matching_words( if let Some(synonyms) = ctx.synonyms(&words)?
{ for synonym in synonyms { - let synonym = synonym + if let Some(synonym) = synonym .into_iter() - .flat_map(|syn| matching_word_cache.insert(syn, 0, false)) - .collect(); - matching_words.push((synonym, ids.clone())); + .map(|syn| matching_word_cache.insert(syn, 0, false)) + .collect() + { + matching_words.push((synonym, ids.clone())); + } } } let word = words.concat(); From 87e2bc3beda573a72c87647d614393d524bd5b44 Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Mon, 28 Nov 2022 13:12:19 -0500 Subject: [PATCH 1826/1889] fix(reindex): reindex in a few more cases Cases: whenever searchable_fields OR user_defined_searchable_fields is modified --- milli/src/index.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 9e4e56de0..33c04789d 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -560,8 +560,9 @@ impl Index { } pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { - self.delete_searchable_fields(wtxn)?; - self.delete_user_defined_searchable_fields(wtxn) + let did_delete_searchable = self.delete_searchable_fields(wtxn)?; + let did_delete_user_defined = self.delete_user_defined_searchable_fields(wtxn)?; + Ok(did_delete_searchable || did_delete_user_defined) } /// Writes the searchable fields, when this list is specified, only these are indexed. From 5f785220447df20519b244cc0f0bafa94a5309fc Mon Sep 17 00:00:00 2001 From: Minh Pham Date: Tue, 29 Nov 2022 10:11:38 +0700 Subject: [PATCH 1827/1889] Update --- .github/workflows/manual_benchmarks.yml | 8 ++++---- .github/workflows/push_benchmarks_indexing.yml | 8 ++++---- .github/workflows/push_benchmarks_search_geo.yml | 8 ++++---- .github/workflows/push_benchmarks_search_songs.yml | 8 ++++---- .github/workflows/push_benchmarks_search_wiki.yml | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml index 17e006224..615f98d56 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/manual_benchmarks.yml @@ -27,19 +27,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $BENCH_NAME;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml index 9b53a3940..9491b30df 100644
--- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -25,19 +25,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> ${BENCH_NAME};]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml index a1b18f6bd..6f6695f17 100644 --- a/.github/workflows/push_benchmarks_search_geo.yml +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $BENCH_NAME;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml index 4f1f631ba..c913e2c39 100644 --- a/.github/workflows/push_benchmarks_search_songs.yml +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by 
`_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $BENCH_NAME;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml index 7abf0661f..0b43e5ec1 100644 --- a/.github/workflows/push_benchmarks_search_wiki.yml +++ b/.github/workflows/push_benchmarks_search_wiki.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_REF;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_SHA;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $BENCH_NAME;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" id: file # Run benchmarks From 9dd4b33a9af95f53da65cdb1065ec01f4c6f53d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 30 Nov 2022 14:27:36 +0100 Subject: [PATCH 1828/1889] Fix bulk facet indexing bug --- milli/src/update/facet/bulk.rs | 52 +++++++++++++++++++++++++-- milli/src/update/facet/incremental.rs | 2 ++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 30660d5af..b1065c0bc 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -288,6 +288,8 @@ impl FacetsUpdateBulkInner { for bitmap in sub_bitmaps { combined_bitmap |= bitmap; } + // The conversion of sub_bitmaps.len() to a u8 will always be correct + // since its length is bounded by max_group_size, which is a u8. 
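// The narrowing cast on the next line is only sound because of that bound; the
// check fixed further down in this patch went the other way and was lossy. A
// self-contained illustration of the hazard, using hypothetical values rather
// than anything taken from this patch:
//
//     let min_level_size: u8 = 5;
//     let level_size: usize = 256;
//     assert_eq!(level_size as u8, 0); // 256 wraps around to 0 in a u8
//     assert!((level_size as u8) < min_level_size); // lossy check: a real level is dropped
//     assert!(level_size >= min_level_size as usize); // widening the u8 compares correctly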
group_sizes.push(sub_bitmaps.len() as u8); left_bounds.push(left_bound); @@ -340,7 +342,7 @@ impl FacetsUpdateBulkInner { } } // if we inserted enough elements to reach the minimum level size, then we push the writer - if cur_writer_len as u8 >= self.min_level_size { + if cur_writer_len >= self.min_level_size as usize { sub_writers.push(writer_into_reader(cur_writer)?); } else { // otherwise, if there are still leftover elements, we give them to the level above @@ -357,11 +359,15 @@ mod tests { use std::iter::once; + use big_s::S; + use maplit::hashset; use roaring::RoaringBitmap; + use crate::documents::documents_batch_reader_from_objects; use crate::heed_codec::facet::OrderedF64Codec; - use crate::milli_snap; + use crate::index::tests::TempIndex; use crate::update::facet::tests::FacetIndex; + use crate::{db_snap, milli_snap}; #[test] fn insert() { @@ -443,4 +449,46 @@ test("large_group_small_min_level", 16, 2); test("odd_group_odd_min_level", 7, 3); } + + #[test] + fn bug_3165() { + // Indexing a number of facet values that falls within certain ranges (e.g. 22_540 qualifies) + // would lead to a facet DB which was missing some levels. + // That was because before writing a level into the database, we would + // check that its size was higher than the minimum level size using + // a lossy integer conversion: `level_size as u8 >= min_level_size`. + // + // This missing level in the facet DBs would make the incremental indexer + // (and other search algorithms) crash. + // + // https://github.com/meilisearch/meilisearch/issues/3165 + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("id") }); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..=22_540 { + documents.push( + serde_json::json! { + { + "id": i as u64, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); + db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); + } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index ddf55b06c..223d4fc63 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -436,6 +436,8 @@ impl FacetsUpdateIncrementalInner { level: highest_level + 1, left_bound: first_key.unwrap().left_bound, }; + // Note: nbr_leftover_elements can be cast to a u8 since it is bounded by `max_group_size` + // when it is created above. let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; to_add.push((key.into_owned(), value)); } From 282b2e3b9818e71608457b32c897187ce2f4ba2c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 1 Dec 2022 10:02:54 +0000 Subject: [PATCH 1829/1889] Bump Swatinem/rust-cache from 2.0.1 to 2.2.0 Bumps [Swatinem/rust-cache](https://github.com/Swatinem/rust-cache) from 2.0.1 to 2.2.0.
- [Release notes](https://github.com/Swatinem/rust-cache/releases) - [Changelog](https://github.com/Swatinem/rust-cache/blob/master/CHANGELOG.md) - [Commits](https://github.com/Swatinem/rust-cache/compare/v2.0.1...v2.2.0) --- updated-dependencies: - dependency-name: Swatinem/rust-cache dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/rust.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 0962e4511..abe227db0 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -36,7 +36,7 @@ jobs: toolchain: stable override: true - name: Cache dependencies - uses: Swatinem/rust-cache@v2.0.1 + uses: Swatinem/rust-cache@v2.2.0 - name: Run cargo check uses: actions-rs/cargo@v1 with: @@ -60,7 +60,7 @@ jobs: override: true components: clippy - name: Cache dependencies - uses: Swatinem/rust-cache@v2.0.0 + uses: Swatinem/rust-cache@v2.2.0 - name: Run cargo clippy uses: actions-rs/cargo@v1 with: @@ -78,7 +78,7 @@ jobs: override: true components: rustfmt - name: Cache dependencies - uses: Swatinem/rust-cache@v2.0.1 + uses: Swatinem/rust-cache@v2.2.0 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. # Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate From 5bdf5c0aaf9cc8046c5fdb65436575975981b6b4 Mon Sep 17 00:00:00 2001 From: curquiza Date: Thu, 1 Dec 2022 10:59:20 +0100 Subject: [PATCH 1830/1889] Update the steps to set variables --- .github/workflows/manual_benchmarks.yml | 8 ++++---- .github/workflows/push_benchmarks_indexing.yml | 8 ++++---- .github/workflows/push_benchmarks_search_geo.yml | 8 ++++---- .github/workflows/push_benchmarks_search_songs.yml | 8 ++++---- .github/workflows/push_benchmarks_search_wiki.yml | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml index 615f98d56..7adf05bcf 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/manual_benchmarks.yml @@ -27,19 +27,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_indexing.yml 
b/.github/workflows/push_benchmarks_indexing.yml index 9491b30df..2ddfc8614 100644 --- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -25,19 +25,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml index 6f6695f17..625b55ff1 100644 --- a/.github/workflows/push_benchmarks_search_geo.yml +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml index c913e2c39..5bed67152 100644 --- a/.github/workflows/push_benchmarks_search_songs.yml +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues 
when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT id: file # Run benchmarks diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml index 0b43e5ec1..69a58a56e 100644 --- a/.github/workflows/push_benchmarks_search_wiki.yml +++ b/.github/workflows/push_benchmarks_search_wiki.yml @@ -24,19 +24,19 @@ jobs: # Set variables - name: Set current branch name shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/})" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT id: current_branch - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 shell: bash - run: echo "##[echo "name=name" >> $GITHUB_OUTPUT;]$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT id: normalized_current_branch - name: Set shorter commit SHA shell: bash - run: echo "##[echo "name=short" >> $GITHUB_OUTPUT;]$(echo $GITHUB_SHA | cut -c1-8)" + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT id: commit_sha - name: Set file basename with format "dataset_branch_commitSHA" shell: bash - run: echo "##[echo "name=basename" >> $GITHUB_OUTPUT;]$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT id: file # Run benchmarks From 1b5b5778c166e4c7a2f85d22c92d0f0be5711701 Mon Sep 17 00:00:00 2001 From: Gregory Conrad Date: Sat, 3 Dec 2022 13:13:41 -0500 Subject: [PATCH 1831/1889] feat: Add From<&str> implementation for Token --- filter-parser/src/lib.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index a9bd9b3d7..61801e3d4 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -115,6 +115,13 @@ impl<'a> From<Span<'a>> for Token<'a> { } } +/// Allow [Token] to be constructed from &[str] +impl<'a> From<&'a str> for Token<'a> { + fn from(s: &'a str) -> Self { + Token::from(Span::new_extra(s, s)) + } +} + #[derive(Debug, Clone, PartialEq, Eq)] pub enum FilterCondition<'a> { Not(Box<Self>), @@ -664,6 +671,13 @@ pub mod tests { assert!(filter.token_at_depth(2).is_some()); assert!(filter.token_at_depth(3).is_none()); } + + #[test] + fn token_from_str() { + let s = "test string that should not be parsed"; + let token: Token = s.into(); + assert_eq!(token.value(), s); + } } impl<'a> std::fmt::Display for FilterCondition<'a> { From 50954d31fac5727a0a4dc55bf02eeb20e25ee84d Mon Sep 17 00:00:00 2001 From:
Gregory Conrad Date: Sat, 3 Dec 2022 13:37:33 -0500 Subject: [PATCH 1832/1889] feat: Re-export Span and Token to milli:: --- milli/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 9facdaa2d..b17be8f1f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -22,7 +22,7 @@ use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; -pub use filter_parser::{Condition, FilterCondition}; +pub use filter_parser::{Condition, FilterCondition, Span, Token}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; use serde_json::Value; From f2cf981641183a98a5a0e0f376e6d27e1b85323d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Dec 2022 10:33:31 +0100 Subject: [PATCH 1833/1889] Add more tests and allow disabling of soft-deletion outside of tests Also allow disabling soft-deletion in the IndexDocumentsConfig --- .../src/search/facet/facet_sort_ascending.rs | 93 ++++- .../src/search/facet/facet_sort_descending.rs | 97 +++++- milli/src/search/facet/mod.rs | 42 ++- .../0.snap | 0 .../1.snap | 0 .../0-0.snap | 33 ++ .../0-1.snap | 33 ++ .../1-0.snap | 27 ++ .../1-1.snap | 27 ++ .../filter_sort_descending/2.snap | 60 ++++ .../0-0.snap | 33 ++ .../0-1.snap | 33 ++ .../1-0.snap | 27 ++ .../1-1.snap | 27 ++ milli/src/update/delete_documents.rs | 16 +- milli/src/update/facet/bulk.rs | 40 ++- milli/src/update/facet/delete.rs | 131 +++++++- milli/src/update/facet/incremental.rs | 4 +- milli/src/update/facet/mod.rs | 113 ++++++- .../bulk.rs/insert_string/default.hash.snap | 4 + .../large_group_small_min_level.hash.snap | 4 + .../odd_group_odd_min_level.hash.snap | 4 + .../small_group_large_min_level.hash.snap | 4 + .../small_group_small_min_level.hash.snap | 4 + .../1/facet_id_f64_docids.hash.snap | 4 - .../1/number_faceted_documents_ids.hash.snap | 4 - .../2/facet_id_f64_docids.hash.snap | 4 - .../2/number_faceted_documents_ids.hash.snap | 4 - milli/src/update/index_documents/mod.rs | 7 + .../initial/word_docids.snap | 54 +++ .../updated/soft_deleted_documents_ids.snap | 4 + .../updated/word_docids.snap | 58 ++++ milli/src/update/prefix_word_pairs/mod.rs | 318 +++++++++++++++++- .../prefix_word_pair_proximity_docids.snap | 20 ++ .../word_prefix_pair_proximity_docids.snap | 0 .../prefix_word_pair_proximity_docids.snap | 0 .../update/word_pair_proximity_docids.snap | 0 .../word_prefix_pair_proximity_docids.snap | 0 .../prefix_word_pair_proximity_docids.snap | 0 .../word_pair_proximity_docids.snap | 0 .../word_prefix_pair_proximity_docids.snap | 0 .../first_delete/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 6 + .../first_delete/word_docids.snap | 60 ++++ .../word_prefix_pair_proximity_docids.snap | 11 + .../initial/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 14 + .../initial/word_docids.snap | 65 ++++ .../word_prefix_pair_proximity_docids.snap | 16 + .../reupdate/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 6 + .../reupdate/word_docids.snap | 60 ++++ .../word_prefix_pair_proximity_docids.snap | 5 + .../second_delete/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 6 + .../second_delete/word_docids.snap | 10 + .../word_prefix_pair_proximity_docids.snap | 11 + .../initial/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 9 + .../initial/word_docids.snap | 61 ++++ .../word_prefix_pair_proximity_docids.snap | 7 + 
.../replaced/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 5 + .../replaced/word_docids.snap | 61 ++++ .../word_prefix_pair_proximity_docids.snap | 5 + .../initial/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 9 + .../initial/word_docids.snap | 61 ++++ .../word_prefix_pair_proximity_docids.snap | 7 + .../replaced/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 10 + .../replaced/word_docids.hash.snap | 4 + .../word_prefix_pair_proximity_docids.snap | 8 + .../first_delete/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 14 + .../first_delete/word_docids.snap | 65 ++++ .../word_prefix_pair_proximity_docids.snap | 16 + .../initial/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 14 + .../initial/word_docids.snap | 65 ++++ .../word_prefix_pair_proximity_docids.snap | 16 + .../reupdate/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 17 + .../reupdate/word_docids.hash.snap | 4 + .../word_prefix_pair_proximity_docids.snap | 21 ++ .../second_delete/documents_ids.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 14 + .../second_delete/word_docids.snap | 65 ++++ .../word_prefix_pair_proximity_docids.snap | 16 + 89 files changed, 2171 insertions(+), 54 deletions(-) rename milli/src/search/facet/snapshots/facet_sort_ascending.rs/{filter_sort => filter_sort_ascending}/0.snap (100%) rename milli/src/search/facet/snapshots/facet_sort_ascending.rs/{filter_sort => filter_sort_ascending}/1.snap (100%) create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap create mode 100644 milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap delete mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap delete mode 100644 
milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap delete mode 100644 milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_update => add_new_documents}/initial/word_prefix_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_update => add_new_documents}/update/prefix_word_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_update => add_new_documents}/update/word_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_update => add_new_documents}/update/word_prefix_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_batch_bug_3043 => batch_bug_3043}/prefix_word_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_batch_bug_3043 => batch_bug_3043}/word_pair_proximity_docids.snap (100%) rename milli/src/update/prefix_word_pairs/snapshots/mod.rs/{test_batch_bug_3043 => batch_bug_3043}/word_prefix_pair_proximity_docids.snap (100%) create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap create mode 100644 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap create mode 100644 
milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 552795981..32cf5c355 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -80,6 +80,8 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { // that we found all the documents in the sub level iterations already, // we can pop this level iterator. if documents_ids.is_empty() { + // break out of the for loop to the end of the 'outer loop, which + // pops the stack break; } @@ -113,11 +115,14 @@ mod tests { use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; - use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids, + get_simple_index, get_simple_string_index_with_multiple_field_ids, + }; use crate::snapshot_tests::display_bitmap; #[test] - fn filter_sort() { + fn filter_sort_ascending() { let indexes = [get_simple_index(), get_random_looking_index()]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::<RoaringBitmap>(); @@ -134,4 +139,88 @@ mod tests { txn.commit().unwrap(); } } + + #[test] + fn filter_sort_ascending_multiple_field_ids() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::<RoaringBitmap>(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-0")); + + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-1")); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_with_no_candidates() { + let indexes = [ + 
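// an empty candidates bitmap must yield no groups at all, for either fixture +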
get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_with_inexisting_field_id() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } } diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 12767c64d..4d1fdd1e7 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -125,12 +125,20 @@ mod tests { use crate::heed_codec::ByteSliceRefCodec; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; - use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids, + get_simple_index, get_simple_index_with_multiple_field_ids, + get_simple_string_index_with_multiple_field_ids, + }; use crate::snapshot_tests::display_bitmap; #[test] fn filter_sort_descending() { - let indexes = [get_simple_index(), get_random_looking_index()]; + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::<RoaringBitmap>(); @@ -147,4 +155,89 @@ mod tests { txn.commit().unwrap(); } } + + #[test] + fn filter_sort_descending_multiple_field_ids() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::<RoaringBitmap>(); + let mut results = String::new(); + let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(); + let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-0")); + + let mut results = String::new(); + + let iter = descending_facet_sort(&txn, db, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + 
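// each `el` is the bitmap of candidate docids sharing the next facet value, here iterated in descending order +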
results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-1")); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_sort_descending_with_no_candidates() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_descending_with_inexisting_field_id() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 7dfdcdb94..73054b84a 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -89,7 +89,8 @@ pub(crate) mod tests { use roaring::RoaringBitmap; use crate::heed_codec::facet::OrderedF64Codec; - use crate::update::facet::tests::FacetIndex; + use crate::heed_codec::StrRefCodec; + use crate::update::facet::test_helpers::FacetIndex; pub fn get_simple_index() -> FacetIndex<OrderedF64Codec> { let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); @@ -147,4 +148,43 @@ pub(crate) mod tests { txn.commit().unwrap(); index } + pub fn get_simple_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> { + let index = FacetIndex::<StrRefCodec>::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for fid in 0..2 { + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + if i % 2 == 0 { + index.insert(&mut txn, fid, &format!("{i}").as_str(), &bitmap); + } else { + index.insert(&mut txn, fid, &"", &bitmap); + } + } + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> { + let index = FacetIndex::<StrRefCodec>::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>(); + for fid in 0..2 { + for (_i, &key) in keys.iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + if key % 2 == 0 { + index.insert(&mut txn, fid, &format!("{key}").as_str(), &bitmap); + } else { + index.insert(&mut txn, fid, &"", &bitmap); + } + } + } + txn.commit().unwrap(); + index + } } diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap 
b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/0.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap rename to milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/0.snap diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/1.snap similarity index 100% rename from milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap rename to milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/1.snap diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap new file mode 100644 index 000000000..ef207f888 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] +[200, ] +[202, ] +[204, ] +[206, ] +[208, ] +[210, ] +[212, ] +[214, ] +[216, ] +[218, ] +[220, ] +[222, ] +[224, ] +[226, ] +[228, ] +[230, ] +[232, ] +[234, ] +[236, ] +[238, ] +[240, ] +[242, ] +[244, ] +[246, ] +[248, ] +[250, ] +[252, ] +[254, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap new file mode 100644 index 000000000..ef207f888 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] +[200, ] +[202, ] +[204, ] +[206, ] +[208, ] +[210, ] +[212, ] +[214, ] +[216, ] +[218, ] +[220, ] +[222, ] +[224, ] +[226, ] +[228, ] +[230, ] +[232, ] +[234, ] +[236, ] +[238, ] +[240, ] +[242, ] +[244, ] +[246, ] +[248, ] +[250, ] +[252, ] +[254, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap new file mode 100644 index 000000000..52d3d0de0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] +[202, ] +[224, ] +[230, ] +[236, ] +[244, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[264, ] +[278, ] +[282, ] +[286, ] +[292, ] +[206, ] +[208, ] +[210, ] +[216, ] +[220, ] +[226, ] +[238, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap new file mode 100644 index 000000000..52d3d0de0 
--- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] +[202, ] +[224, ] +[230, ] +[236, ] +[244, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[264, ] +[278, ] +[282, ] +[286, ] +[292, ] +[206, ] +[208, ] +[210, ] +[216, ] +[220, ] +[226, ] +[238, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap new file mode 100644 index 000000000..032763c74 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[255, ] +[254, ] +[253, ] +[252, ] +[251, ] +[250, ] +[249, ] +[248, ] +[247, ] +[246, ] +[245, ] +[244, ] +[243, ] +[242, ] +[241, ] +[240, ] +[239, ] +[238, ] +[237, ] +[236, ] +[235, ] +[234, ] +[233, ] +[232, ] +[231, ] +[230, ] +[229, ] +[228, ] +[227, ] +[226, ] +[225, ] +[224, ] +[223, ] +[222, ] +[221, ] +[220, ] +[219, ] +[218, ] +[217, ] +[216, ] +[215, ] +[214, ] +[213, ] +[212, ] +[211, ] +[210, ] +[209, ] +[208, ] +[207, ] +[206, ] +[205, ] +[204, ] +[203, ] +[202, ] +[201, ] +[200, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap new file mode 100644 index 000000000..b833cae97 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[254, ] +[252, ] +[250, ] +[248, ] +[246, ] +[244, ] +[242, ] +[240, ] +[238, ] +[236, ] +[234, ] +[232, ] +[230, ] +[228, ] +[226, ] +[224, ] +[222, ] +[220, ] +[218, ] +[216, ] +[214, ] +[212, ] +[210, ] +[208, ] +[206, ] +[204, ] +[202, ] +[200, ] +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap new file mode 100644 index 000000000..b833cae97 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[254, ] +[252, ] +[250, ] +[248, ] +[246, ] +[244, ] +[242, ] +[240, ] +[238, ] +[236, ] +[234, ] +[232, ] +[230, ] +[228, ] +[226, ] +[224, ] +[222, ] +[220, ] +[218, ] +[216, ] +[214, ] +[212, ] +[210, ] +[208, ] +[206, ] +[204, ] +[202, ] +[200, ] +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap new file mode 100644 index 
000000000..2623a8807 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[238, ] +[236, ] +[226, ] +[220, ] +[216, ] +[210, ] +[208, ] +[206, ] +[292, ] +[286, ] +[282, ] +[278, ] +[264, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[244, ] +[230, ] +[224, ] +[202, ] +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap new file mode 100644 index 000000000..2623a8807 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[238, ] +[236, ] +[226, ] +[220, ] +[216, ] +[210, ] +[208, ] +[206, ] +[292, ] +[286, ] +[282, ] +[278, ] +[264, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[244, ] +[230, ] +[224, ] +[202, ] +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] + diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index a6a4ea609..88ec78420 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -26,7 +26,6 @@ pub struct DeleteDocuments<'t, 'u, 'i> { index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, to_delete_docids: RoaringBitmap, - #[cfg(test)] disable_soft_deletion: bool, } @@ -48,12 +47,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index, external_documents_ids, to_delete_docids: RoaringBitmap::new(), - #[cfg(test)] disable_soft_deletion: false, }) } - #[cfg(test)] pub fn disable_soft_deletion(&mut self, disable: bool) { self.disable_soft_deletion = disable; } @@ -156,17 +153,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We run the deletion. // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents // We run the deletion. 
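
The hunk below replaces the test-only `disable_soft_deletion` toggle with a plain runtime field, so the soft-deletion decision collapses into a single condition. As a reading aid, that condition can be sketched as a standalone predicate; this is an illustration with assumed integer percentage inputs, not the actual milli code:

```rust
/// Sketch of the soft-deletion heuristic visible in the hunk below: soft-delete
/// only while the disk has comfortable headroom and soft-deleted documents are
/// still cheap. The two percentage parameters are assumed to be precomputed
/// from the LMDB environment statistics, as in the surrounding code.
fn should_soft_delete(
    disable_soft_deletion: bool,
    percentage_available: u64,
    percentage_used_by_soft_deleted_documents: u64,
) -> bool {
    // more than 10% of the disk is still available...
    !disable_soft_deletion
        && percentage_available > 10
        // ...and soft-deleted documents occupy less than 10% of the used space
        && percentage_used_by_soft_deleted_documents < 10
}
```
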
- let disable_soft_deletion = { - #[cfg(not(test))] - { - false - } - #[cfg(test)] - { - self.disable_soft_deletion - } - }; - if !disable_soft_deletion + + if !self.disable_soft_deletion && percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index b1065c0bc..0270205a6 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -365,9 +365,9 @@ mod tests { use crate::documents::documents_batch_reader_from_objects; use crate::heed_codec::facet::OrderedF64Codec; - use crate::index::tests::TempIndex; - use crate::update::facet::tests::FacetIndex; - use crate::{db_snap, milli_snap}; + use crate::heed_codec::StrRefCodec; + use crate::milli_snap; + use crate::update::facet::test_helpers::{ordered_string, FacetIndex}; #[test] fn insert() { @@ -491,4 +491,38 @@ mod tests { db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); } + + + #[test] + fn insert_string() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = FacetIndex::<StrRefCodec>::new(group_size, 0 /*NA*/, min_level_size); + + let strings = (0..1_000).map(|i| ordered_string(i as usize)).collect::<Vec<String>>(); + let mut elements = Vec::<((u16, &str), RoaringBitmap)>::new(); + for i in 0..1_000u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, &strings[i as usize]), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, &strings[i as usize]), once(i).collect())); + } + let mut wtxn = index.env.write_txn().unwrap(); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); + }; + + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); + } } diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index 9bec2d911..4030f10da 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -114,11 +114,14 @@ mod tests { use big_s::S; use maplit::hashset; + use rand::seq::SliceRandom; + use rand::SeedableRng; use roaring::RoaringBitmap; use crate::db_snap; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::update::facet::test_helpers::ordered_string; use crate::update::DeleteDocuments; #[test] @@ -156,8 +159,8 @@ mod tests { let documents = documents_batch_reader_from_objects(documents); index.add_documents(documents).unwrap(); - db_snap!(index, facet_id_f64_docids, 1); - db_snap!(index, number_faceted_documents_ids, 1); + db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); + db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf"); let mut wtxn = index.env.write_txn().unwrap(); @@ -174,8 +177,126 @@ mod tests { wtxn.commit().unwrap(); db_snap!(index, soft_deleted_documents_ids, @"[]"); - db_snap!(index, facet_id_f64_docids, 2); - db_snap!(index, number_faceted_documents_ids, 2); + db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); + db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56"); 
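+        // the inline @"..." hashes above match the standalone .hash.snap files that this patch deletes further down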
+ } + + // Same test as above but working with string values for the facets + #[test] + fn delete_mixed_incremental_and_bulk_string() { + // The point of this test is to create an index populated with documents + // containing different filterable attributes. Then, we delete a bunch of documents + // such that a mix of the incremental and bulk indexer is used (depending on the field id) + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "label": ordered_string(i / 10), + "colour": ordered_string(i / 100), + "timestamp": ordered_string(i / 2), + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) + db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); + db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); + + let mut wtxn = index.env.write_txn().unwrap(); + + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_documents(&RoaringBitmap::from_iter(0..100)); + // by deleting the first 100 documents, we expect that: + // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) + // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 + // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 + // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 + // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); + db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f"); + } + + #[test] + fn delete_almost_all_incrementally_string() { + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! 
{ + { + "id": i, + "label": ordered_string(i / 10), + "colour": ordered_string(i / 100), + "timestamp": ordered_string(i / 2), + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) + db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); + db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + + let mut docids_to_delete = (0..1000).collect::<Vec<u32>>(); + docids_to_delete.shuffle(&mut rng); + for docid in docids_to_delete.into_iter().take(990) { + let mut wtxn = index.env.write_txn().unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_documents(&RoaringBitmap::from_iter([docid])); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + } + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); + db_snap!(index, string_faceted_documents_ids, 2, @r###" + 0 [] + 1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] + 2 [292, 324, 358, 381, 493, 839, 852, ] + 3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] + "###); } } @@ -188,7 +309,7 @@ mod comparison_bench { use roaring::RoaringBitmap; use crate::heed_codec::facet::OrderedF64Codec; - use crate::update::facet::tests::FacetIndex; + use crate::update::facet::test_helpers::FacetIndex; // This is a simple test to get an intuition on the relative speed // of the incremental vs. bulk indexer. diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 223d4fc63..cffce5525 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -648,7 +648,7 @@ mod tests { use crate::heed_codec::facet::OrderedF64Codec; use crate::heed_codec::StrRefCodec; use crate::milli_snap; - use crate::update::facet::tests::FacetIndex; + use crate::update::facet::test_helpers::FacetIndex; #[test] fn append() { @@ -1053,7 +1053,7 @@ mod fuzz { use tempfile::TempDir; use super::*; - use crate::update::facet::tests::FacetIndex; + use crate::update::facet::test_helpers::FacetIndex; #[derive(Default)] pub struct TrivialDatabase<T> { pub elements: BTreeMap<u16, BTreeMap<T, RoaringBitmap>>, diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 76e5514a1..fd55204c3 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -162,7 +162,7 @@ impl<'i> FacetsUpdate<'i> { } #[cfg(test)] -pub(crate) mod tests { +pub(crate) mod test_helpers { use std::cell::Cell; use std::fmt::Display; use std::iter::FromIterator; @@ -183,6 +183,23 @@ pub(crate) mod tests { use crate::update::FacetsUpdateIncrementalInner; use crate::CboRoaringBitmapCodec; + /// Utility function to generate a string whose position in a lexicographically + /// ordered list is `i`. 
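+    /// For example, `ordered_string(0) == ""`, `ordered_string(1) == "aaaab"`, + /// `ordered_string(26) == "aaaba"` and `ordered_string(27) == "aaabb"`, so + /// `i < j` implies `ordered_string(i) < ordered_string(j)` for all `i, j < 26^5`.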
+ pub fn ordered_string(mut i: usize) -> String { + // The first string is empty + if i == 0 { + return String::new(); + } + // The others are 5 char long, each between 'a' and 'z' + let mut s = String::new(); + for _ in 0..5 { + let (digit, next) = (i % 26, i / 26); + s.insert(0, char::from_u32('a' as u32 + digit as u32).unwrap()); + i = next; + } + s + } + /// A dummy index that only contains the facet database, used for testing pub struct FacetIndex<BoundCodec> where @@ -438,6 +455,98 @@ pub(crate) mod tests { } } +#[cfg(test)] +mod tests { + use big_s::S; + use maplit::hashset; + + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + + #[test] + fn replace_all_identical_soft_deletion_then_hard_deletion() { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("size") }); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "size": i % 250, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); + db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9"); + db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); + + let mut documents = vec![]; + for i in 0..999 { + documents.push( + serde_json::json! { + { + "id": i, + "size": i % 250, + "other": 0, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); + db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06"); + db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); + + // Then replace the last document while disabling soft_deletion + index.index_documents_config.disable_soft_deletion = true; + let mut documents = vec![]; + for i in 999..1000 { + documents.push( + serde_json::json! 
{ + { + "id": i, + "size": i % 250, + "other": 0, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); + db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028"); + db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); + } +} + #[allow(unused)] #[cfg(test)] mod comparison_bench { @@ -446,7 +555,7 @@ mod comparison_bench { use rand::Rng; use roaring::RoaringBitmap; - use super::tests::FacetIndex; + use super::test_helpers::FacetIndex; use crate::heed_codec::facet::OrderedF64Codec; // This is a simple test to get an intuition on the relative speed diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap new file mode 100644 index 000000000..b7705b72e --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +353d70f52eea66e5031dca989ea8a037 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..15030a1ea --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +52a093c909133d84023a4a7b83864808 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..949ec6647 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9d86c72ddb241d0aeca2995d61a3648a diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..d8797f1ab --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +c0943177594534bfe5527cbf40fe388e diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..f7949c5f3 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +6ed86f234028ae3df5881bee5512f11e diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap deleted file mode 100644 index fee486bab..000000000 --- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/delete.rs ---- 
-550cd138d6fe31ccdd42cd5392fbd576 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap deleted file mode 100644 index fcf957004..000000000 --- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/delete.rs ---- -9a0ea88e7c9dcf6dc0ef0b601736ffcf diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap deleted file mode 100644 index 29ceb250e..000000000 --- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/delete.rs ---- -d4d5f14e7f1e1f09b86821a0b6defcc6 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap deleted file mode 100644 index bbaf6d2a2..000000000 --- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/delete.rs ---- -3570e0ac0fdb21be9ebe433f59264b56 diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index af99a230b..db6ffedc1 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -88,6 +88,7 @@ pub struct IndexDocumentsConfig { pub words_positions_level_group_size: Option<NonZeroU32>, pub words_positions_min_level_size: Option<NonZeroU32>, pub update_method: IndexDocumentsMethod, + pub disable_soft_deletion: bool, pub autogenerate_docids: bool, } @@ -331,6 +332,7 @@ where // able to simply insert all the documents even if they already exist in the database. 
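
The branch below is where a document replacement internally deletes the old versions of the replaced documents, and where the new `disable_soft_deletion` field of `IndexDocumentsConfig` is forwarded to that internal `DeleteDocuments` builder. A caller opts in the way the `replace_hard_deletion` test added later in this patch does:

```rust
// Usage sketch taken from the shape of the tests below; TempIndex is the
// test-only wrapper around Index used throughout this patch.
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
// Replaced document versions are now hard-deleted instead of soft-deleted:
index.index_documents_config.disable_soft_deletion = true;
```
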
if !replaced_documents_ids.is_empty() { let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; + deletion_builder.disable_soft_deletion(self.config.disable_soft_deletion); debug!("documents to delete {:?}", replaced_documents_ids); deletion_builder.delete_documents(&replaced_documents_ids); let deleted_documents_count = deletion_builder.execute()?; @@ -906,6 +908,8 @@ mod tests { { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } ])).unwrap(); + db_snap!(index, word_docids, "initial"); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -928,6 +932,9 @@ mod tests { let count = index.all_documents(&rtxn).unwrap().count(); assert_eq!(count, 6); + db_snap!(index, word_docids, "updated"); + db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]"); + drop(rtxn); } diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap new file mode 100644 index 000000000..5b424356a --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [2, ] +10.0 [1, ] +12 [0, ] +1344 [3, ] +2 [0, ] +23 [5, ] +25.99 [2, ] +3.5 [0, ] +35 [5, ] +4 [4, ] +42 [0, 5, ] +456 [1, ] +adams [5, ] +adventure [1, ] +alice [2, ] +and [0, 4, ] +antoine [1, ] +austin [0, ] +blood [4, ] +carroll [2, ] +de [1, ] +douglas [5, ] +exupery [1, ] +fantasy [2, 3, 4, ] +galaxy [5, ] +guide [5, ] +half [4, ] +harry [4, ] +hitchhiker' [5, ] +hobbit [3, ] +in [2, ] +j [3, 4, ] +jane [0, ] +k [4, ] +le [1, ] +lewis [2, ] +petit [1, ] +potter [4, ] +prejudice [0, ] +pride [0, ] +prince [1, 4, ] +r [3, ] +romance [0, ] +rowling [4, ] +s [5, ] +saint [1, ] +the [3, 4, 5, ] +to [5, ] +tolkien [3, ] +wonderland [2, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..9228ad265 --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +[0, 1, 4, ] diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap new file mode 100644 index 000000000..4f4a9e33a --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap @@ -0,0 +1,58 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [2, ] +10.0 [1, 7, ] +12 [0, 8, ] +1344 [3, ] +1813 [8, ] +2 [0, 8, ] +23 [5, ] +25.99 [2, ] +3.5 [0, 8, ] +35 [5, ] +4 [4, 6, ] +42 [0, 5, 8, ] +456 [1, 7, ] +adams [5, ] +adventure [1, 7, ] +alice [2, ] +and [0, 4, 6, 8, ] +antoine [1, 7, ] +austen [8, ] +austin [0, ] +blood [4, 6, ] +carroll [2, ] +de [1, 7, ] +douglas [5, ] +exupery [1, 7, ] +fantasy [2, 3, 4, 6, ] +galaxy [5, ] +guide [5, ] +half [4, 6, ] +harry [4, 6, ] +hitchhiker' [5, ] +hobbit [3, ] +in [2, ] +j [3, 4, 6, 8, ] +jane [0, ] +k [4, 6, ] +le [1, ] +lewis [2, ] 
+little [7, ] +petit [1, ] +potter [4, 6, ] +prejudice [0, 8, ] +pride [0, 8, ] +prince [1, 4, 7, ] +princess [6, ] +r [3, ] +romance [0, 8, ] +rowling [4, 6, ] +s [5, ] +saint [1, 7, ] +the [3, 4, 5, 6, 7, ] +to [5, ] +tolkien [3, ] +wonderland [2, ] + diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 10ea850af..de429435b 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -66,6 +66,9 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { common_prefix_fst_words: &[&'a [String]], del_prefix_fst_words: &HashSet<Vec<u8>>, ) -> Result<()> { + println!("{new_prefix_fst_words:?}"); + println!("{common_prefix_fst_words:?}"); + println!("{del_prefix_fst_words:?}"); index_word_prefix_database( self.wtxn, self.index.word_pair_proximity_docids, @@ -156,30 +159,40 @@ pub fn write_into_lmdb_database_without_merging( #[cfg(test)] mod tests { use std::io::Cursor; + use std::iter::FromIterator; + + use roaring::RoaringBitmap; use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; + use crate::update::{DeleteDocuments, IndexDocumentsMethod}; - fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> { + fn documents_with_enough_different_words_for_prefixes( + prefixes: &[&str], + start_id: usize, + ) -> Vec<crate::Object> { let mut documents = Vec::new(); + let mut id = start_id; for prefix in prefixes { for i in 0..50 { documents.push( serde_json::json!({ + "id": id, "text": format!("{prefix}{i:x}"), }) .as_object() .unwrap() .clone(), - ) + ); + id += 1; } } documents } #[test] - fn test_update() { + fn add_new_documents() { let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); index.index_documents_config.autogenerate_docids = true; @@ -198,10 +211,11 @@ mod tests { DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() }; - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); + let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0); // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database documents.push( serde_json::json!({ + "id": "9000", "text": "At an amazing and beautiful house" }) .as_object() .unwrap() .clone(), ); documents.push( serde_json::json!({ + "id": "9001", "text": "The bell rings at 5 am" }) .as_object() .unwrap() .clone(), ); @@ -221,10 +236,12 @@ mod tests { index.add_documents(documents).unwrap(); db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); + let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100); documents.push( serde_json::json!({ + "id": "9002", "text": "At an extraordinary house" }) .as_object() .unwrap() .clone(), ); @@ -239,7 +256,7 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids, "update"); } #[test] - fn test_batch_bug_3043() { + fn batch_bug_3043() { // https://github.com/meilisearch/meilisearch/issues/3043 let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); @@ -259,7 +276,7 @@ mod tests { DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() }; - let 
mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0); // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database documents.push( serde_json::json!({ @@ -285,4 +302,291 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids); db_snap!(index, prefix_word_pair_proximity_docids); } + + #[test] + fn hard_delete_and_reupdate() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.disable_soft_deletion(true); + delete.delete_documents(&RoaringBitmap::from_iter([50])); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "first_delete"); + db_snap!(index, word_docids, "first_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.disable_soft_deletion(true); + delete.delete_documents(&RoaringBitmap::from_iter(0..50)); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "second_delete"); + db_snap!(index, word_docids, "second_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "reupdate"); + db_snap!(index, word_docids, "reupdate"); + db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); + db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); + } + + #[test] + fn soft_delete_and_reupdate() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + 
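// unlike `add_new_documents` above, these tests address documents through explicit ids rather than autogenerated docids, hence the explicit primary key +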
settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_documents(&RoaringBitmap::from_iter([50])); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "first_delete"); + db_snap!(index, word_docids, "first_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_documents(&RoaringBitmap::from_iter(0..50)); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "second_delete"); + db_snap!(index, word_docids, "second_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "reupdate"); + db_snap!(index, word_docids, "reupdate"); + db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); + db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); + } + + #[test] + fn replace_soft_deletion() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": 
"At an amazing house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "replaced"); + db_snap!(index, word_docids, "replaced"); + db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); + db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); + db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]"); + } + + #[test] + fn replace_hard_deletion() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.disable_soft_deletion = true; + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "replaced"); + db_snap!(index, word_docids, "replaced"); + db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); + db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); + db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]"); + } } diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..6609786a3 --- /dev/null +++ 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,20 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [101, ] +1 a amazing [100, ] +1 a an [100, ] +1 a and [100, ] +1 a beautiful [100, ] +1 b house [100, ] +1 b rings [101, ] +1 be house [100, ] +1 be rings [101, ] +2 a am [101, ] +2 a amazing [100, ] +2 a and [100, ] +2 a beautiful [100, ] +2 a house [100, ] +2 b at [101, ] +2 be at [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap diff --git 
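Aside on reading the snapshot above: each row is a (proximity, prefix, word) key followed by the internal ids of the documents that contain that pair at that proximity. A minimal sketch of how such a row is laid out — the helper below is illustrative only, not milli's actual codec or snapshot code:

```rust
use roaring::RoaringBitmap;

/// Renders one `prefix_word_pair_proximity_docids` row, e.g. `1 a amazing [100, ]`:
/// the proximity, the prefix, the word, then the matching internal document ids.
fn snapshot_row(proximity: u8, prefix: &str, word: &str, docids: &RoaringBitmap) -> String {
    let ids: String = docids.iter().map(|id| format!("{id}, ")).collect();
    format!("{proximity} {prefix} {word} [{ids}]")
}

fn main() {
    let docids: RoaringBitmap = [100u32].into_iter().collect();
    assert_eq!(snapshot_row(1, "a", "amazing", &docids), "1 a amazing [100, ]");
}
```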
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap rename to milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap new file mode 100644 index 000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..61987fd4a --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +2 a am [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap new file mode 100644 index 000000000..1caf1a9a3 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +at [51, ] +bell [51, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..41c71ea59 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,11 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 rings a [51, ] +2 at a [51, ] +2 bell a [51, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap new file mode 100644 index 
000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..267a1c01d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 b rings [51, ] +2 b at [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap new file mode 100644 index 000000000..e5336d58c --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +am [51, ] +at [51, ] +b0 [0, ] +b1 [1, ] +b10 [16, ] +b11 [17, ] +b12 [18, ] +b13 [19, ] +b14 [20, ] +b15 [21, ] +b16 [22, ] +b17 [23, ] +b18 [24, ] +b19 [25, ] +b1a [26, ] +b1b [27, ] +b1c [28, ] +b1d [29, ] +b1e [30, ] +b1f [31, ] +b2 [2, ] +b20 [32, ] +b21 [33, ] +b22 [34, ] +b23 [35, ] +b24 [36, ] +b25 [37, ] +b26 [38, ] +b27 [39, ] +b28 [40, ] +b29 [41, ] +b2a [42, ] +b2b [43, ] +b2c [44, ] +b2d [45, ] +b2e [46, ] +b2f [47, ] +b3 [3, ] +b30 [48, ] +b31 [49, ] +b4 [4, ] +b5 [5, ] +b6 [6, ] +b7 [7, ] +b8 [8, ] +b9 [9, ] +ba [10, ] +bb [11, ] +bc [12, ] +bd [13, ] +be [14, ] +bell [51, ] +bf [15, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..4cdf756ac --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 the b [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap new file mode 100644 index 000000000..4dca775e6 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..61987fd4a --- /dev/null +++ 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +2 a am [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap new file mode 100644 index 000000000..7949d464e --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap @@ -0,0 +1,10 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +am [51, ] +at [51, ] +bell [51, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..41c71ea59 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,11 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 rings a [51, ] +2 at a [51, ] +2 bell a [51, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..78b6a3885 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,9 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 a house [50, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap new file mode 100644 index 000000000..8c7809973 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a 
[42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +amazing [50, ] +an [50, ] +at [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..65d8b806b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap new file mode 100644 index 000000000..775d41a3d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..54c9e4b9b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 b rings [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap new file mode 100644 index 000000000..f86fdcb8b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +amazing [50, ] +an [50, ] +at [50, ] +b0 [52, ] +b1 [53, ] +b10 [68, ] +b11 [69, ] +b12 [70, ] +b13 [71, ] +b14 [72, ] +b15 [73, ] +b16 [74, ] +b17 [75, ] +b18 [76, ] +b19 [77, ] +b1a [78, ] +b1b [79, ] +b1c [80, ] +b1d [81, ] +b1e [82, ] +b1f [83, ] +b2 [54, ] +b20 [84, ] +b21 [85, ] +b22 [86, ] +b23 [87, ] +b24 [88, ] +b25 [89, ] +b26 [90, ] +b27 [91, ] +b28 [92, ] +b29 [93, ] +b2a [94, ] +b2b [95, ] +b2c [96, ] +b2d [97, ] +b2e [98, ] +b2f [99, ] +b3 [55, ] +b30 [100, ] +b31 [101, ] +b4 [56, ] +b5 [57, ] +b6 [58, ] +b7 [59, ] +b8 [60, ] +b9 [61, ] +ba [62, ] +bb [63, ] +bc [64, ] +bd [65, ] +be [66, ] +bell [51, ] +bf [67, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap new file mode 100644 index 
000000000..4cdf756ac --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 the b [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..78b6a3885 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,9 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 a house [50, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap new file mode 100644 index 000000000..8c7809973 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +amazing [50, ] +an [50, ] +at [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..65d8b806b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap new file mode 100644 index 000000000..775d41a3d --- /dev/null +++ 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..0241f26a5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,10 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 a house [50, ] +1 b rings [51, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap new file mode 100644 index 000000000..6a481eeee --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5f6443e54fae188aa96d4f27fce28939 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..d20582970 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,8 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +1 the b [51, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap new file mode 100644 index 000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff --git 
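Note on the `replaced` snapshots above: with soft deletion enabled, rows that still reference documents 0 through 49 are stale rather than wrong, because every candidate set is expected to have the soft-deleted ids subtracted at query time. A minimal sketch of that filtering step, with `candidates` and `soft_deleted` standing in for bitmaps read from the database — the subtraction is the illustrative point, not milli's exact search code:

```rust
use roaring::RoaringBitmap;

/// Stale postings under soft deletion are harmless as long as every
/// candidate set has the soft-deleted internal ids removed.
fn live_candidates(candidates: &RoaringBitmap, soft_deleted: &RoaringBitmap) -> RoaringBitmap {
    candidates - soft_deleted
}

fn main() {
    let candidates: RoaringBitmap = (0u32..52).collect();
    let soft_deleted: RoaringBitmap = (0u32..50).collect();
    // Only the two documents that were not replaced stay visible.
    assert_eq!(live_candidates(&candidates, &soft_deleted), (50u32..52).collect());
}
```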
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap new 
file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap new file mode 100644 index 000000000..c8a1e54b4 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..db62b6566 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,17 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +1 b house [50, ] +1 b rings [51, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] +2 b at [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap new file mode 100644 index 000000000..7fd726325 --- /dev/null +++ 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +9f4866b80177e321a33ce434992022b5 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..8a684b16d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,21 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 and b [50, ] +1 at a [50, ] +1 rings a [51, ] +1 the b [51, ] +2 amazing b [50, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 an b [50, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 at b [50, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap new file mode 100644 index 000000000..4dca775e6 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git 
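Aside on the `word_docids.hash.snap` above: it stores a single 32-character digest instead of a full dump, presumably because the snapshot harness falls back to hashing once the rendered content grows past a size threshold. A sketch of that behavior — the `md5` crate and the threshold value are assumptions here, not necessarily what `db_snap!` does internally:

```rust
/// Returns either the full snapshot dump or, past a size limit,
/// a hex digest of it (matching the shape of `*.hash.snap` files).
fn snapshot_contents(dump: &str, max_len: usize) -> String {
    if dump.len() <= max_len {
        dump.to_owned()
    } else {
        format!("{:x}", md5::compute(dump.as_bytes()))
    }
}

fn main() {
    let small = "5 [51, ]\n";
    assert_eq!(snapshot_contents(small, 1024), small);
    // A large dump collapses to a 32-character digest.
    assert_eq!(snapshot_contents(&"x".repeat(4096), 1024).len(), 32);
}
```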
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + From ae59d37b75765a07f8871e60f121f187fc272969 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 30 Nov 2022 15:58:41 +0100 Subject: [PATCH 1834/1889] Improve insta-snap of the external document ids --- milli/src/snapshot_tests.rs | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 46972deba..9ad5fe425 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -356,19 +356,22 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { pub fn snap_external_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap(); + let mut snap = String::new(); - let soft_bytes = soft.into_fst().as_bytes().to_owned(); - let mut hex_soft = String::new(); - for byte in soft_bytes { - write!(&mut hex_soft, "{:x}", byte).unwrap(); + + writeln!(&mut snap, "soft:").unwrap(); + let stream_soft = soft.stream(); + let soft_external_ids = stream_soft.into_str_vec().unwrap(); + for (key, id) in soft_external_ids { + writeln!(&mut snap, "{key:<24} {id}").unwrap(); } - writeln!(&mut snap, "soft: {hex_soft}").unwrap(); - let hard_bytes = hard.into_fst().as_bytes().to_owned(); - let mut hex_hard = String::new(); - for byte in hard_bytes { - write!(&mut hex_hard, "{:x}", byte).unwrap(); + writeln!(&mut snap, "hard:").unwrap(); + let stream_hard = hard.stream(); + let hard_external_ids = stream_hard.into_str_vec().unwrap(); + for (key, id) in hard_external_ids { + writeln!(&mut snap, "{key:<24} {id}").unwrap(); } - writeln!(&mut snap, "hard: {hex_hard}").unwrap(); + snap } pub fn snap_number_faceted_documents_ids(index: &Index) -> String { From cda4ba2bb6ca5ba52018cc6a2aef152d9c00ed90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 5 Dec 2022 10:26:53 +0100 Subject: [PATCH 1835/1889] Add document import tests --- milli/src/index.rs | 547 +++++++++++++++++++++- milli/src/update/facet/bulk.rs | 4 +- milli/src/update/prefix_word_pairs/mod.rs | 3 - 3 files changed, 547 insertions(+), 7 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 33c04789d..a98247b6e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1190,8 +1190,10 @@ pub(crate) mod tests { use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; - use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; - use crate::{db_snap, Index}; + use crate::update::{ + self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, + }; + use crate::{db_snap, 
obkv_to_json, Index}; pub(crate) struct TempIndex { pub inner: Index, @@ -1477,4 +1479,545 @@ pub(crate) mod tests { let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); assert_eq!(user_defined, &["doggo", "name"]); } + + #[test] + fn replace_documents_external_ids_and_soft_deletion_check() { + use big_s::S; + use maplit::hashset; + + let mut index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("doggo") }); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..4 { + docs.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + index.add_documents(documents!(docs)).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 0 + 1 1 + 2 2 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, ] + 1 0 2 1 [2, ] + 1 0 3 1 [3, ] + "###); + + let mut docs = vec![]; + for i in 0..3 { + docs.push(serde_json::json!( + { "id": i, "doggo": i + 1 } + )); + } + index.add_documents(documents!(docs)).unwrap(); + + db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[0, 1, 2, ]"); + db_snap!(index, facet_id_f64_docids, 2, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, 4, ] + 1 0 2 1 [2, 5, ] + 1 0 3 1 [3, 6, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index + .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) + .unwrap(); + + db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[0, 1, 2, 3, ]"); + db_snap!(index, facet_id_f64_docids, 3, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, 4, ] + 1 0 2 1 [2, 5, ] + 1 0 3 1 [3, 6, ] + 1 0 4 1 [7, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index + .update_settings(|settings| { + settings.set_distinct_field("id".to_owned()); + }) + .unwrap(); + + db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + db_snap!(index, facet_id_f64_docids, 3, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [7, ] + "###); + + index.index_documents_config.disable_soft_deletion = true; + index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + db_snap!(index, facet_id_f64_docids, 3, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [7, ] + "###); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + let docid = delete.delete_external_id("3").unwrap(); + insta::assert_snapshot!(format!("{docid}"), @"7"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[4, 5, 6, ]"); + 
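Note on the `external_documents_ids` snapshots in this test: the soft map shadows the hard map (here external id `3` resolves to internal id 7, not 3), and a hit that lands in the soft-deleted set must be discarded. A hedged sketch of that resolution order, using plain `HashMap`s where the real `ExternalDocumentsIds` uses fst maps — the type and method below are illustrative, not the actual API:

```rust
use std::collections::HashMap;
use roaring::RoaringBitmap;

struct ExternalIds {
    soft: HashMap<String, u32>,
    hard: HashMap<String, u32>,
    soft_deleted: RoaringBitmap,
}

impl ExternalIds {
    /// The soft map wins over the hard map, and soft-deleted hits are dropped.
    fn get(&self, external_id: &str) -> Option<u32> {
        self.soft
            .get(external_id)
            .or_else(|| self.hard.get(external_id))
            .copied()
            .filter(|id| !self.soft_deleted.contains(*id))
    }
}

fn main() {
    // Mirrors the snapshot above: soft `3 -> 7`; hard `0 -> 4`, `1 -> 5`, `2 -> 6`, `3 -> 3`;
    // internal ids 0..=3 are soft-deleted.
    let ids = ExternalIds {
        soft: HashMap::from([("3".to_owned(), 7)]),
        hard: HashMap::from([
            ("0".to_owned(), 4),
            ("1".to_owned(), 5),
            ("2".to_owned(), 6),
            ("3".to_owned(), 3),
        ]),
        soft_deleted: [0u32, 1, 2, 3].into_iter().collect(),
    };
    assert_eq!(ids.get("3"), Some(7)); // the soft entry shadows hard's stale `3 -> 3`
    assert_eq!(ids.get("0"), Some(4)); // internal id 4 is live
}
```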
db_snap!(index, external_documents_ids, 4, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [7, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); + + db_snap!(index, external_documents_ids, 4, @r###" + soft: + 3 0 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [0, 7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [0, 7, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index.add_documents(documents!([{ "id": 3, "doggo": 5 }])).unwrap(); + + db_snap!(index, external_documents_ids, 4, @r###" + soft: + 3 1 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [6, ] + 0 0 3 1 [0, 1, 7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [0, 7, ] + 1 0 5 1 [1, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index.add_documents(documents!([{ "id": 3, "doggo": 5, "id": 2, "doggo": 4 }])).unwrap(); + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 0 4 + 1 5 + 2 2 + 3 1 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [2, 6, ] + 0 0 3 1 [0, 1, 7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [0, 2, 7, ] + 1 0 5 1 [1, ] + "###); + + index.index_documents_config.disable_soft_deletion = false; + index + .add_documents(documents!([{ "id": 4, "doggo": 5 }, { "id": 3, "doggo": 5 }])) + .unwrap(); + + db_snap!(index, external_documents_ids, 4, @r###" + soft: + 4 3 + hard: + 0 4 + 1 5 + 2 2 + 3 1 + "###); + + db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); + db_snap!(index, facet_id_f64_docids, 4, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] + 0 0 2 1 [2, 6, ] + 0 0 3 1 [0, 1, 7, ] + 0 0 4 1 [3, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [0, 2, 7, ] + 1 0 5 1 [1, 3, ] + "###); + } + + #[test] + fn replace_documents_in_batches_external_ids_and_soft_deletion_check() { + use big_s::S; + use maplit::hashset; + + let mut index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! 
{ S("doggo") }); + }) + .unwrap(); + + let add_documents = |index: &TempIndex, docs: Vec>| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = IndexDocuments::new( + &mut wtxn, + index, + &index.indexer_config, + index.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + for docs in docs { + (builder, _) = builder.add_documents(documents!(docs)).unwrap(); + } + builder.execute().unwrap(); + wtxn.commit().unwrap(); + }; + // First Batch + { + let mut docs1 = vec![]; + for i in 0..4 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1]); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 0 + 1 1 + 2 2 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, ] + 1 0 2 1 [2, ] + 1 0 3 1 [3, ] + "###); + } + // Second Batch: replace the documents with soft-deletion + { + index.index_documents_config.disable_soft_deletion = false; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+1 } + )); + } + let mut docs2 = vec![]; + for i in 0..3 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, ]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, 4, ] + 1 0 1 1 [1, 5, ] + 1 0 2 1 [2, 6, ] + 1 0 3 1 [3, ] + "###); + } + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [4]).unwrap()[0]; + + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [5]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(1), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [6]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(2), + } + "###); + drop(rtxn); + // Third Batch: replace the documents with soft-deletion again + { + index.index_documents_config.disable_soft_deletion = false; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+1 } + )); + } + let mut docs2 = vec![]; + for i in 0..4 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 7, 8, 9, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 7 + 1 8 + 2 9 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, 4, 5, 6, ]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, 4, 7, ] + 1 0 1 1 [1, 5, 8, ] + 1 0 2 1 [2, 6, 9, ] 
+ 1 0 3 1 [3, ] + "###); + } + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [7]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [8]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(1), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [9]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(2), + } + "###); + drop(rtxn); + + // Fourth Batch: replace the documents without soft-deletion + { + index.index_documents_config.disable_soft_deletion = true; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+2 } + )); + } + let mut docs2 = vec![]; + for i in 0..1 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 10, 11, 12, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 10 + 1 11 + 2 12 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [10, ] + 1 0 3 1 [3, 11, ] + 1 0 4 1 [12, ] + "###); + + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [10]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [11]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [12]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(4), + } + "###); + drop(rtxn); + } + } } diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 0270205a6..30f15ebab 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -366,8 +366,9 @@ mod tests { use crate::documents::documents_batch_reader_from_objects; use crate::heed_codec::facet::OrderedF64Codec; use crate::heed_codec::StrRefCodec; - use crate::milli_snap; + use crate::index::tests::TempIndex; use crate::update::facet::test_helpers::{ordered_string, FacetIndex}; + use crate::{db_snap, milli_snap}; #[test] fn insert() { @@ -492,7 +493,6 @@ mod tests { db_snap!(index, 
number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); } - #[test] fn insert_string() { let test = |name: &str, group_size: u8, min_level_size: u8| { diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index de429435b..01a4de35e 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -66,9 +66,6 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { common_prefix_fst_words: &[&'a [String]], del_prefix_fst_words: &HashSet>, ) -> Result<()> { - println!("{new_prefix_fst_words:?}"); - println!("{common_prefix_fst_words:?}"); - println!("{del_prefix_fst_words:?}"); index_word_prefix_database( self.wtxn, self.index.word_pair_proximity_docids, From 456da5de9c8b1120866c22ac8bffbccda19eb7be Mon Sep 17 00:00:00 2001 From: amab8901 Date: Wed, 30 Nov 2022 19:44:26 +0100 Subject: [PATCH 1836/1889] Geosearch for zero radius --- milli/src/search/facet/filter.rs | 54 ++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 9b87353b0..d7e1e500d 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -420,7 +420,8 @@ impl<'a> Filter<'a> { let result = rtree .nearest_neighbor_iter(&xyz_base_point) .take_while(|point| { - distance_between_two_points(&base_point, &point.data.1) < radius + distance_between_two_points(&base_point, &point.data.1) + <= radius + f64::EPSILON }) .map(|point| point.data.0) .collect(); @@ -457,10 +458,9 @@ mod tests { #[test] fn empty_db() { let index = TempIndex::new(); - // Set the filterable fields to be the channel. + //Set the filterable fields to be the channel. index .update_settings(|settings| { - settings.set_searchable_fields(vec![S("PrIcE")]); // to keep the fields order settings.set_filterable_fields(hashset! { S("PrIcE") }); }) .unwrap(); @@ -626,6 +626,53 @@ mod tests { assert_eq!(documents_ids, vec![2]); } + #[test] + fn zero_radius() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("_geo")]); + settings.set_filterable_fields(hashset! { S("_geo") }); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 1, + "name": "Nàpiz' Milano", + "address": "Viale Vittorio Veneto, 30, 20124, Milan, Italy", + "type": "pizza", + "rating": 9, + "_geo": { + "lat": 45.4777599, + "lng": 9.1967508 + } + }, + { + "id": 2, + "name": "Artico Gelateria Tradizionale", + "address": "Via Dogana, 1, 20123 Milan, Italy", + "type": "ice cream", + "rating": 10, + "_geo": { + "lat": 45.4632046, + "lng": 9.1719421 + } + }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = crate::Search::new(&rtxn, &index); + + search.filter(Filter::from_str("_geoRadius(45.4777599, 9.1967508, 0)").unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + } + #[test] fn geo_radius_error() { let index = TempIndex::new(); @@ -638,6 +685,7 @@ mod tests { .unwrap(); let rtxn = index.read_txn().unwrap(); + // georadius have a bad latitude let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); let error = filter.evaluate(&rtxn, &index).unwrap_err(); From 212dbfa3b5352c5477067fb444ee38e023beaf55 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 5 Dec 2022 20:56:21 +0100 Subject: [PATCH 1837/1889] Update milli/src/search/facet/filter.rs --- milli/src/search/facet/filter.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d7e1e500d..3842a5f56 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -632,7 +632,6 @@ mod tests { index .update_settings(|settings| { - settings.set_searchable_fields(vec![S("_geo")]); settings.set_filterable_fields(hashset! { S("_geo") }); }) .unwrap(); From 67d8cec20903a37c339fc1b460a32a672932dfcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Dec 2022 11:38:15 +0100 Subject: [PATCH 1838/1889] Fix bug in handling of soft deleted documents when updating settings --- milli/src/external_documents_ids.rs | 24 + milli/src/index.rs | 417 +++++++++++------- milli/src/update/index_documents/transform.rs | 34 +- milli/src/update/settings.rs | 17 +- 4 files changed, 310 insertions(+), 182 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 6029722af..64b294541 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -71,6 +71,30 @@ impl<'a> ExternalDocumentsIds<'a> { self.merge_soft_into_hard() } + /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they + /// don't contain any soft deleted document id. + pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { + let mut new_hard_builder = fst::MapBuilder::memory(); + + let union_op = self.hard.op().add(&self.soft).r#union(); + let mut iter = union_op.into_stream(); + while let Some((external_id, docids)) = iter.next() { + // prefer selecting the ids from soft, always + let id = indexed_last_value(docids).unwrap(); + if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) { + new_hard_builder.insert(external_id, id)?; + } + } + drop(iter); + + // Delete soft map completely + self.soft = fst::Map::default().map_data(Cow::Owned)?; + // We save the new map as the new hard map. 
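+ // Anything that survived the filtering loop above is live, so from
+ // now on no external id can resolve to a soft-deleted internal docid.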
+ self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; + + Ok(()) + } + pub fn insert_ids>(&mut self, other: &fst::Map) -> fst::Result<()> { let union_op = self.soft.op().add(other).r#union(); diff --git a/milli/src/index.rs b/milli/src/index.rs index a98247b6e..e9d66a3ae 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1185,13 +1185,15 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; + use maplit::hashset; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, + self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, + IndexerConfig, Settings, }; use crate::{db_snap, obkv_to_json, Index}; @@ -1485,7 +1487,7 @@ pub(crate) mod tests { use big_s::S; use maplit::hashset; - let mut index = TempIndex::new(); + let index = TempIndex::new(); index .update_settings(|settings| { @@ -1544,7 +1546,6 @@ pub(crate) mod tests { 1 0 3 1 [3, 6, ] "###); - index.index_documents_config.disable_soft_deletion = false; index .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) .unwrap(); @@ -1568,7 +1569,6 @@ pub(crate) mod tests { 1 0 4 1 [7, ] "###); - index.index_documents_config.disable_soft_deletion = false; index .update_settings(|settings| { settings.set_distinct_field("id".to_owned()); @@ -1576,37 +1576,13 @@ pub(crate) mod tests { .unwrap(); db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); - db_snap!(index, external_documents_ids, 3, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); - db_snap!(index, facet_id_f64_docids, 3, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [7, ] - "###); - - index.index_documents_config.disable_soft_deletion = true; - index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); db_snap!(index, external_documents_ids, 3, @r###" soft: - 3 7 hard: 0 4 1 5 2 6 - 3 3 + 3 7 "###); db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); db_snap!(index, facet_id_f64_docids, 3, @r###" @@ -1619,140 +1595,6 @@ pub(crate) mod tests { 1 0 3 1 [6, ] 1 0 4 1 [7, ] "###); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - let docid = delete.delete_external_id("3").unwrap(); - insta::assert_snapshot!(format!("{docid}"), @"7"); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, @"[4, 5, 6, ]"); - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [7, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 0 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [0, 7, ] - 1 0 1 
1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 7, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 5 }])).unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 1 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [0, 1, 7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 7, ] - 1 0 5 1 [1, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 5, "id": 2, "doggo": 4 }])).unwrap(); - db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: - 0 4 - 1 5 - 2 2 - 3 1 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [2, 6, ] - 0 0 3 1 [0, 1, 7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 2, 7, ] - 1 0 5 1 [1, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index - .add_documents(documents!([{ "id": 4, "doggo": 5 }, { "id": 3, "doggo": 5 }])) - .unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 4 3 - hard: - 0 4 - 1 5 - 2 2 - 3 1 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [2, 6, ] - 0 0 3 1 [0, 1, 7, ] - 0 0 4 1 [3, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 2, 7, ] - 1 0 5 1 [1, 3, ] - "###); } #[test] @@ -2020,4 +1862,253 @@ pub(crate) mod tests { drop(rtxn); } } + + #[test] + fn bug_3021_first() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 38 }, + { "primary_key": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("34"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec![]); + }) + .unwrap(); + + // The key point of the test is to verify that the external documents ids + // do not contain any entry for previously soft-deleted document ids + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + + // So that this document addition works correctly now. 
+ // It would be wrongly interpreted as a replacement before + index.add_documents(documents!({ "primary_key": 34 })).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); + + // We do the test again, but deleting the document with id 0 instead of id 1 now + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("38"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[1, ]"); + db_snap!(index, external_documents_ids, 5, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec!["primary_key".to_owned()]); + }) + .unwrap(); + + db_snap!(index, documents_ids, @"[1, ]"); + db_snap!(index, external_documents_ids, 6, @r###" + soft: + hard: + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); + + // And adding lots of documents afterwards instead of just one. + // These extra subtests don't add much, but it's better than nothing. + index.add_documents(documents!([{ "primary_key": 38 }, { "primary_key": 39 }, { "primary_key": 41 }, { "primary_key": 40 }, { "primary_key": 41 }, { "primary_key": 42 }])).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); + db_snap!(index, external_documents_ids, 7, @r###" + soft: + hard: + 34 1 + 38 0 + 39 2 + 40 4 + 41 3 + 42 5 + "###); + db_snap!(index, soft_deleted_documents_ids, 7, @"[]"); + } + + #[test] + fn bug_3021_second() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 30 }, + { "primary_key": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 30 0 + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("34"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 30 0 + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec![]); + }) + .unwrap(); + + // The key point of the test is to verify that the external documents ids + // do not contain any entry for previously soft-deleted document ids + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 30 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + + // So that when we add a new document + index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + // The external documents ids don't have several external ids pointing to the same + // internal document id + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 30 0 + 35 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 4, 
@"[]"); + + // And when we add 34 again, we don't replace document 35 + index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap(); + + // And document 35 still exists, is not deleted + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); + db_snap!(index, external_documents_ids, 5, @r###" + soft: + hard: + 30 0 + 34 2 + 35 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 5, @"[]"); + + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0]; + let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "primary_key": Number(30), + } + "###); + + // Furthermore, when we retrieve document 34, it is not the result of merging 35 with 34 + let (_docid, obkv) = index.documents(&rtxn, [2]).unwrap()[0]; + let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "primary_key": Number(34), + "a": Number(1), + } + "###); + + drop(rtxn); + + // Add new documents again + index + .add_documents( + documents!([{ "primary_key": 37 }, { "primary_key": 38 }, { "primary_key": 39 }]), + ) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); + db_snap!(index, external_documents_ids, 6, @r###" + soft: + hard: + 30 0 + 34 2 + 35 1 + 37 3 + 38 4 + 39 5 + "###); + db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); + } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 57aa02e04..f414569b9 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -17,7 +17,7 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; -use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, @@ -546,12 +546,13 @@ impl<'a, 'i> Transform<'a, 'i> { }) } - /// Returns a `TransformOutput` with a file that contains the documents of the index - /// with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. + /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents + /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. + /// // TODO this can be done in parallel by using the rayon `ThreadPool`. 
- pub fn remap_index_documents( + pub fn prepare_for_documents_reindexing( self, - wtxn: &mut heed::RwTxn, + wtxn: &mut heed::RwTxn<'i, '_>, old_fields_ids_map: FieldsIdsMap, mut new_fields_ids_map: FieldsIdsMap, ) -> Result { @@ -559,7 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> { let primary_key = self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string(); let field_distribution = self.index.field_distribution(wtxn)?; - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + + // Delete the soft deleted document ids from the maps inside the external_document_ids structure + let new_external_documents_ids = { + let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; + external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + external_documents_ids + }; + let documents_ids = self.index.documents_ids(wtxn)?; let documents_count = documents_ids.len() as usize; @@ -638,17 +646,25 @@ impl<'a, 'i> Transform<'a, 'i> { let mut flattened_documents = flattened_writer.into_inner()?; flattened_documents.seek(SeekFrom::Start(0))?; - Ok(TransformOutput { + let output = TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, field_distribution, - external_documents_ids: external_documents_ids.into_static(), + external_documents_ids: new_external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), documents_count, original_documents, flattened_documents, - }) + }; + + let new_facets = output.compute_real_facets(wtxn, self.index)?; + self.index.put_faceted_fields(wtxn, &new_facets)?; + + // We clear the full database (words-fst, documents ids and documents content). + ClearDocuments::new(wtxn, self.index).execute()?; + + Ok(output) } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index fc7e6bc03..b66893ee3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -12,7 +12,7 @@ use crate::criterion::Criterion; use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::index_documents::IndexDocumentsMethod; -use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; +use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] @@ -291,15 +291,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { false, )?; - // We remap the documents fields based on the new `FieldsIdsMap`. - let output = - transform.remap_index_documents(self.wtxn, old_fields_ids_map, fields_ids_map)?; - - let new_facets = output.compute_real_facets(self.wtxn, self.index)?; - self.index.put_faceted_fields(self.wtxn, &new_facets)?; - - // We clear the full database (words-fst, documents ids and documents content). - ClearDocuments::new(self.wtxn, self.index).execute()?; + // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`. + let output = transform.prepare_for_documents_reindexing( + self.wtxn, + old_fields_ids_map, + fields_ids_map, + )?; // We index the generated `TransformOutput` which must contain // all the documents with fields in the newly defined searchable order. 
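The core of this fix is the union-and-filter pass performed by `delete_soft_deleted_documents_ids_from_fsts` above. For reference, here is a minimal standalone sketch of that pattern against the `fst` crate; the names (`merge_without_deleted`, `hard`, `soft`, `deleted`) are hypothetical, and the deleted set is a plain `HashSet` instead of milli's `RoaringBitmap`:

```rust
use std::collections::HashSet;

use fst::{IntoStreamer, Map, MapBuilder, Streamer};

/// Union two FST maps into a fresh one, preferring the value coming from
/// `soft` (the map added last to the op) and dropping every entry whose
/// id belongs to `deleted`.
fn merge_without_deleted(
    hard: &Map<Vec<u8>>,
    soft: &Map<Vec<u8>>,
    deleted: &HashSet<u64>,
) -> fst::Result<Map<Vec<u8>>> {
    let mut builder = MapBuilder::memory();
    let mut stream = hard.op().add(soft).union().into_stream();
    while let Some((key, values)) = stream.next() {
        // `values` is ordered by the index of the map each entry comes
        // from, so the last one belongs to `soft` whenever both maps
        // contain the key.
        let id = values.last().unwrap().value;
        if !deleted.contains(&id) {
            // The union streams keys in lexicographic order, which is
            // exactly the order `MapBuilder::insert` requires.
            builder.insert(key, id)?;
        }
    }
    Ok(builder.into_map())
}
```

The real implementation additionally skips the `DELETED_ID` sentinel, then installs the result as the new hard map and resets the soft one, so the rebuilt structure never hands back a soft-deleted docid.
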
From 80c7a00567596191d6155ae84764b7d7666ac2db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Dec 2022 15:19:26 +0100 Subject: [PATCH 1839/1889] Fix compilation error in tests of settings update --- milli/src/update/settings.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index b66893ee3..db6bbf602 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -726,15 +726,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { #[cfg(test)] mod tests { - use big_s::S; - use heed::types::ByteSlice; - use maplit::{btreeset, hashmap, hashset}; - use super::*; use crate::error::Error; use crate::index::tests::TempIndex; - use crate::update::DeleteDocuments; + use crate::update::{ClearDocuments, DeleteDocuments}; use crate::{Criterion, Filter, SearchResult}; + use big_s::S; + use heed::types::ByteSlice; + use maplit::{btreeset, hashmap, hashset}; #[test] fn set_and_reset_searchable_fields() { From a993b686845340851468bcd44156ab13c90d060f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Dec 2022 15:22:10 +0100 Subject: [PATCH 1840/1889] Cargo fmt >:-( --- milli/src/update/settings.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index db6bbf602..5f75910dc 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -726,14 +726,15 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { #[cfg(test)] mod tests { + use big_s::S; + use heed::types::ByteSlice; + use maplit::{btreeset, hashmap, hashset}; + use super::*; use crate::error::Error; use crate::index::tests::TempIndex; use crate::update::{ClearDocuments, DeleteDocuments}; use crate::{Criterion, Filter, SearchResult}; - use big_s::S; - use heed::types::ByteSlice; - use maplit::{btreeset, hashmap, hashset}; #[test] fn set_and_reset_searchable_fields() { From f53bdc43202859b81cbb56fbde64d54bf547cdb5 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 6 Dec 2022 17:41:05 +0100 Subject: [PATCH 1841/1889] update the contributing.md --- CONTRIBUTING.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 83bfc5a5f..a9a0c95ff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -31,6 +31,8 @@ Remember that there are many ways to contribute other than writing code: writing ## Development Workflow +We're using rust stable for the tests and clippy but rust nightly for the formatting of the code. + ### Test ```bash @@ -55,6 +57,28 @@ Don't forget to specify the `id` of the documents. Also, note that it supports J streaming: you can send them to the engine by using the `content-type:application/json` and `content-type:application/x-ndjson` headers respectively. +### Format + +For your first run you'll need to run this command: + +```bash +touch benchmarks/benches/datasets_paths.rs +``` + +Then you can format your code BUT you need to do it with rust-fmt. 
+ +```bash +cargo +nightly fmt --all +``` + +### Clippy + +```bash +cargo clippy +``` + + + ## Git Guidelines ### Git Branches From 0e5c3b1f64ea4b765cfd72d9a67b145cee3e961e Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 7 Dec 2022 12:23:06 +0100 Subject: [PATCH 1842/1889] Update CONTRIBUTING.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Urquizar - curqui --- CONTRIBUTING.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a9a0c95ff..e44e502e0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -76,9 +76,6 @@ cargo +nightly fmt --all ```bash cargo clippy ``` - - - ## Git Guidelines ### Git Branches From 5eecb8489df3649793197451c2232b76806e4465 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 7 Dec 2022 12:23:12 +0100 Subject: [PATCH 1843/1889] Update CONTRIBUTING.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clémentine Urquizar - curqui --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e44e502e0..d78627580 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -31,7 +31,7 @@ Remember that there are many ways to contribute other than writing code: writing ## Development Workflow -We're using rust stable for the tests and clippy but rust nightly for the formatting of the code. +We're using a stable version of Rust for the tests and Clippy but the nightly version of Rust for the formatting of the code. ### Test From 250743885df3a7fcd04303e21a9311221f4df9e5 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 7 Dec 2022 12:31:43 +0100 Subject: [PATCH 1844/1889] add a sentence about installing rust-nightly --- CONTRIBUTING.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d78627580..377aadea0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -66,6 +66,15 @@ touch benchmarks/benches/datasets_paths.rs ``` Then you can format your code BUT you need to do it with rust-fmt. +If you have not installed the rust nightly version yet here is how to do it: + +```bash +rustup toolchain install nightly +``` + +You can read more about it here: https://rust-lang.github.io/rustup/concepts/channels.html + +And finally, to format your code you can run: ```bash cargo +nightly fmt --all From 303d740245812d112b4152672407a8523146b0df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Dec 2022 12:03:33 +0100 Subject: [PATCH 1845/1889] Prepare fix within facet range search By creating snapshots and updating the format of the existing snapshots. 
The next commit will apply the fix, which will show its effects cleanly on the old and new snapshot tests --- milli/src/search/facet/facet_range_search.rs | 162 +++++++++++++++++- .../excluded_0.hash.snap | 2 +- .../excluded_1.hash.snap | 2 +- .../excluded_2.hash.snap | 2 +- .../excluded_3.hash.snap | 2 +- .../included_0.hash.snap | 2 +- .../included_1.hash.snap | 2 +- .../included_2.hash.snap | 2 +- .../included_3.hash.snap | 2 +- .../excluded_0.hash.snap | 2 +- .../excluded_1.hash.snap | 2 +- .../excluded_2.hash.snap | 2 +- .../excluded_3.hash.snap | 2 +- .../included_0.hash.snap | 2 +- .../included_1.hash.snap | 2 +- .../included_2.hash.snap | 2 +- .../included_3.hash.snap | 2 +- .../filter_range_pinch/excluded_0.hash.snap | 2 +- .../filter_range_pinch/excluded_1.hash.snap | 2 +- .../filter_range_pinch/excluded_2.hash.snap | 2 +- .../filter_range_pinch/excluded_3.hash.snap | 2 +- .../filter_range_pinch/included_0.hash.snap | 2 +- .../filter_range_pinch/included_1.hash.snap | 2 +- .../filter_range_pinch/included_2.hash.snap | 2 +- .../filter_range_pinch/included_3.hash.snap | 2 +- .../end_at_included_0.hash.snap} | 2 +- .../end_at_included_1.hash.snap | 4 + .../end_at_included_2.hash.snap | 4 + .../end_at_included_3.hash.snap | 4 + .../start_from_included_0.hash.snap | 4 + .../start_from_included_1.hash.snap | 4 + .../start_from_included_2.hash.snap | 4 + .../start_from_included_3.hash.snap | 4 + .../unbounded_field_id_0_0.snap | 5 + .../unbounded_field_id_0_1.snap | 5 + .../unbounded_field_id_0_2.snap | 5 + .../unbounded_field_id_0_3.snap | 5 + .../unbounded_field_id_1_0.snap | 5 + .../unbounded_field_id_1_1.snap | 5 + .../unbounded_field_id_1_2.snap | 5 + .../unbounded_field_id_1_3.snap | 5 + .../updated/soft_deleted_documents_ids.snap | 4 - 42 files changed, 248 insertions(+), 36 deletions(-) rename milli/src/search/facet/snapshots/facet_range_search.rs/{random_looking_index_snap/random_looking_index_snap.hash.snap => filter_range_unbounded/end_at_included_0.hash.snap} (64%) create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_0.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_1.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_2.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_3.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_0.snap create mode 100644 
milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_1.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_2.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_3.snap delete mode 100644 milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index e8eeab1cc..8c6a6c073 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -171,6 +171,8 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } // should we stop? + // We should if the the search range doesn't include any + // element from the previous key or its successors let should_stop = { match self.right { Bound::Included(right) => right < previous_key.left_bound, @@ -233,6 +235,8 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } // should we stop? + // We should if the the search range doesn't include any + // element from the previous key or its successors let should_stop = { match self.right { Bound::Included(right) => right <= previous_key.left_bound, @@ -321,8 +325,27 @@ mod tests { #[test] fn random_looking_index_snap() { let index = get_random_looking_index(); - milli_snap!(format!("{index}")); + milli_snap!(format!("{index}"), @"3256c76a7c1b768a013e78d5fa6e9ff9"); } + + #[test] + fn random_looking_index_with_multiple_field_ids_snap() { + let index = get_random_looking_index_with_multiple_field_ids(); + milli_snap!(format!("{index}"), @"c3e5fe06a8f1c404ed4935b32c90a89b"); + } + + #[test] + fn simple_index_snap() { + let index = get_simple_index(); + milli_snap!(format!("{index}"), @"5dbfa134cc44abeb3ab6242fc182e48e"); + } + + #[test] + fn simple_index_with_multiple_field_ids_snap() { + let index = get_simple_index_with_multiple_field_ids(); + milli_snap!(format!("{index}"), @"a4893298218f682bc76357f46777448c"); + } + #[test] fn filter_range_increasing() { let indexes = [ @@ -349,7 +372,7 @@ mod tests { ) .unwrap(); #[allow(clippy::format_push_string)] - results.push_str(&format!("{}\n", display_bitmap(&docids))); + results.push_str(&format!("0 <= . <= {i} : {}\n", display_bitmap(&docids))); } milli_snap!(results, format!("included_{i}")); let mut results = String::new(); @@ -368,7 +391,7 @@ mod tests { ) .unwrap(); #[allow(clippy::format_push_string)] - results.push_str(&format!("{}\n", display_bitmap(&docids))); + results.push_str(&format!("0 < . < {i} : {}\n", display_bitmap(&docids))); } milli_snap!(results, format!("excluded_{i}")); txn.commit().unwrap(); @@ -401,7 +424,7 @@ mod tests { &mut docids, ) .unwrap(); - results.push_str(&format!("{}\n", display_bitmap(&docids))); + results.push_str(&format!("{i} <= . <= 255 : {}\n", display_bitmap(&docids))); } milli_snap!(results, format!("included_{i}")); @@ -422,7 +445,7 @@ mod tests { &mut docids, ) .unwrap(); - results.push_str(&format!("{}\n", display_bitmap(&docids))); + results.push_str(&format!("{i} < . < 255 : {}\n", display_bitmap(&docids))); } milli_snap!(results, format!("excluded_{i}")); @@ -457,7 +480,11 @@ mod tests { &mut docids, ) .unwrap(); - results.push_str(&format!("{}\n", display_bitmap(&docids))); + results.push_str(&format!( + "{i} <= . <= {r} : {docids}\n", + r = 255. 
- i, + docids = display_bitmap(&docids) + )); } milli_snap!(results, format!("included_{i}")); @@ -478,7 +505,11 @@ mod tests { &mut docids, ) .unwrap(); - results.push_str(&format!("{}\n", display_bitmap(&docids))); + results.push_str(&format!( + "{i} < . < {r} {docids}\n", + r = 255. - i, + docids = display_bitmap(&docids) + )); } milli_snap!(results, format!("excluded_{i}")); @@ -486,4 +517,121 @@ mod tests { txn.commit().unwrap(); } } + + #[test] + fn filter_range_unbounded() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Unbounded; + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results.push_str(&format!(">= {i}: {}\n", display_bitmap(&docids))); + } + milli_snap!(results, format!("start_from_included_{i}")); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Unbounded; + let end = Bound::Included(i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results.push_str(&format!("<= {i}: {}\n", display_bitmap(&docids))); + } + milli_snap!(results, format!("end_at_included_{i}")); + + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &Bound::Unbounded, + &Bound::Unbounded, + &mut docids, + ) + .unwrap(); + milli_snap!( + &format!("all field_id 0: {}\n", display_bitmap(&docids)), + format!("unbounded_field_id_0_{i}") + ); + + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 1, + &Bound::Unbounded, + &Bound::Unbounded, + &mut docids, + ) + .unwrap(); + milli_snap!( + &format!("all field_id 1: {}\n", display_bitmap(&docids)), + format!("unbounded_field_id_1_{i}") + ); + + drop(txn); + } + } + + #[test] + fn filter_range_exact() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results.push_str(&format!("{i}: {}\n", display_bitmap(&docids))); + } + milli_snap!(results, format!("exact_{i}")); + + drop(txn); + } + } } diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap index 7bf13e05c..e14520141 100644 --- 
a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -fcedc563a82c1c61f50174a5f3f982b6 +adf484f467a31ee9460dec539621938a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap index 100b928d7..f3743e045 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -6cc26e77fc6bd9145deedf14cf422b03 +c9939aa4977fcd4bfd35852e102dbc82 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap index 7bf13e05c..e14520141 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -fcedc563a82c1c61f50174a5f3f982b6 +adf484f467a31ee9460dec539621938a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap index 100b928d7..f3743e045 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -6cc26e77fc6bd9145deedf14cf422b03 +c9939aa4977fcd4bfd35852e102dbc82 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap index be0b06ded..25347579e 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -57d35cfa419a19a1a1f8d7c8ef096e0f +618738d28ff1386b6e93d171a5acb08f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap index 93fe17b0c..ec14f2cf6 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3dbe0547b42759795e9b16989df72cee +ffb62ab3eef55c2254c13dc0f4099849 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap index be0b06ded..25347579e 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -57d35cfa419a19a1a1f8d7c8ef096e0f +618738d28ff1386b6e93d171a5acb08f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap index 93fe17b0c..ec14f2cf6 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3dbe0547b42759795e9b16989df72cee +ffb62ab3eef55c2254c13dc0f4099849 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap index db11ce952..b7b156c65 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -c1c7a0bb91d53d33724583b6d4a99f16 +e849066b0e43d5c456f086c552372afc diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap index f5a81c121..92bba2433 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -12213d3f1047a0c3d08e4670a7d688e7 +8cc5e82995b0443b660f419bb9ea2e85 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap index db11ce952..b7b156c65 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -c1c7a0bb91d53d33724583b6d4a99f16 +e849066b0e43d5c456f086c552372afc diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap index f5a81c121..92bba2433 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -12213d3f1047a0c3d08e4670a7d688e7 +8cc5e82995b0443b660f419bb9ea2e85 diff --git 
a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap index fa7242056..a94ac51ac 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -ca59f20e043a4d52c49e15b10adf96bb +a50f49405717ef9f08829ff742d51cbb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap index a7611d8c1..8aaf2b8db 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -cb69e0fe10fb299bafe77514204379cb +3a5954e37c6f575b88026179c466c4b7 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap index fa7242056..a94ac51ac 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -ca59f20e043a4d52c49e15b10adf96bb +a50f49405717ef9f08829ff742d51cbb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap index a7611d8c1..8aaf2b8db 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -cb69e0fe10fb299bafe77514204379cb +3a5954e37c6f575b88026179c466c4b7 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap index 07664807e..b6a9b6bfa 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3456db9a1bb94c33c1e9f656184ee711 +c3f8b0b858a4820a508b25b42328cedd diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap index ef530faa1..76a0589f1 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -2127cd818b457e0611e0c8e1a871602a 
+38a42f5dc25e99d7a5312a63ce94ed30 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap index 07664807e..b6a9b6bfa 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3456db9a1bb94c33c1e9f656184ee711 +c3f8b0b858a4820a508b25b42328cedd diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap index ef530faa1..76a0589f1 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -2127cd818b457e0611e0c8e1a871602a +38a42f5dc25e99d7a5312a63ce94ed30 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap index db8a314b0..c81622e87 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -b976551ceff412bfb2ec9bfbda320bbb +d53339a9ec9edf5d9b5e0e1d665c4a34 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap index 2b82e07e8..d37df9e45 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -7620ca1a96882c7147d3fd996570f9b3 +a1806ad3f0dfd826e7645107ba413b1d diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap index db8a314b0..c81622e87 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -b976551ceff412bfb2ec9bfbda320bbb +d53339a9ec9edf5d9b5e0e1d665c4a34 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap index 2b82e07e8..d37df9e45 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -7620ca1a96882c7147d3fd996570f9b3 +a1806ad3f0dfd826e7645107ba413b1d diff 
--git a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap similarity index 64% rename from milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap rename to milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap index 67a2f6bd9..bbd3315bc 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3256c76a7c1b768a013e78d5fa6e9ff9 +b41507892dd4468a821a4da411ef1d9d diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap new file mode 100644 index 000000000..a76e5468b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3b78bbb7a06c258a52afb332a04c7838 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap new file mode 100644 index 000000000..bbd3315bc --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +b41507892dd4468a821a4da411ef1d9d diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap new file mode 100644 index 000000000..a76e5468b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3b78bbb7a06c258a52afb332a04c7838 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_0.hash.snap new file mode 100644 index 000000000..f9becb30d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9a8c7343b4735d37704748cabcd51ff2 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_1.hash.snap new file mode 100644 index 000000000..e495229a3 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +898a7dc25a1441bc3e7e2a8a62d99090 diff --git 
a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_2.hash.snap new file mode 100644 index 000000000..f9becb30d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9a8c7343b4735d37704748cabcd51ff2 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_3.hash.snap new file mode 100644 index 000000000..e495229a3 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +898a7dc25a1441bc3e7e2a8a62d99090 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_0.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_0.snap new file mode 100644 index 000000000..b86eebd09 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_0.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_1.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_1.snap new file mode 100644 index 000000000..778a5b488 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_1.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 0: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 
149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_2.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_2.snap new file mode 100644 index 000000000..b86eebd09 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_2.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_3.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_3.snap new file mode 100644 index 000000000..778a5b488 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_3.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 0: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ] 
+ diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_0.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_0.snap new file mode 100644 index 000000000..1773ad84b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_0.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 1: [] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_1.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_1.snap new file mode 100644 index 000000000..1773ad84b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_1.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 1: [] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_2.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_2.snap new file mode 100644 index 000000000..02c2c6318 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_2.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_3.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_3.snap new file mode 100644 index 000000000..d9c9cb699 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_3.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 1: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 
159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap deleted file mode 100644 index 9228ad265..000000000 --- a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/soft_deleted_documents_ids.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/index_documents/mod.rs ---- -[0, 1, 4, ] From 1c9555566e9cc096b048ddc84a0a39da567f3016 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Dec 2022 12:07:26 +0100 Subject: [PATCH 1846/1889] Fix bug in facet range search --- milli/src/search/facet/facet_range_search.rs | 4 ++-- .../filter_range_increasing/included_0.hash.snap | 2 +- .../filter_range_increasing/included_1.hash.snap | 2 +- .../filter_range_increasing/included_2.hash.snap | 2 +- .../filter_range_increasing/included_3.hash.snap | 2 +- .../filter_range_pinch/included_0.hash.snap | 2 +- .../filter_range_pinch/included_1.hash.snap | 2 +- .../filter_range_pinch/included_2.hash.snap | 2 +- .../filter_range_pinch/included_3.hash.snap | 2 +- .../filter_range_unbounded/end_at_included_0.hash.snap | 2 +- .../filter_range_unbounded/end_at_included_1.hash.snap | 2 +- .../filter_range_unbounded/end_at_included_2.hash.snap | 2 +- .../filter_range_unbounded/end_at_included_3.hash.snap | 2 +- 13 files changed, 14 insertions(+), 14 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 8c6a6c073..81f9bba77 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -239,8 +239,8 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { // element from the previous key or its successors let should_stop = { match self.right { - Bound::Included(right) => right <= previous_key.left_bound, - Bound::Excluded(right) => right < previous_key.left_bound, + Bound::Included(right) => right < previous_key.left_bound, + Bound::Excluded(right) => right <= previous_key.left_bound, Bound::Unbounded => false, } }; diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap index a94ac51ac..fc48b6ddd 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -a50f49405717ef9f08829ff742d51cbb +73b48005dc57b04f0939bbf21a68dab6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap index 8aaf2b8db..a16d93d8d 100644 --- 
a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3a5954e37c6f575b88026179c466c4b7 +3c23d35627667dcee98468bfdecf09d3 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap index a94ac51ac..fc48b6ddd 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -a50f49405717ef9f08829ff742d51cbb +73b48005dc57b04f0939bbf21a68dab6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap index 8aaf2b8db..a16d93d8d 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3a5954e37c6f575b88026179c466c4b7 +3c23d35627667dcee98468bfdecf09d3 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap index c81622e87..558740f8a 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -d53339a9ec9edf5d9b5e0e1d665c4a34 +2049930204498b323885c91de88e44ca diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap index d37df9e45..48eb244c8 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -a1806ad3f0dfd826e7645107ba413b1d +7f0ca8c0fc6494f3dba46e8eb9699045 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap index c81622e87..558740f8a 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -d53339a9ec9edf5d9b5e0e1d665c4a34 +2049930204498b323885c91de88e44ca diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap index d37df9e45..48eb244c8 
100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -a1806ad3f0dfd826e7645107ba413b1d +7f0ca8c0fc6494f3dba46e8eb9699045 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap index bbd3315bc..c75a7aafc 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -b41507892dd4468a821a4da411ef1d9d +ad8fc873747aaf1d3590e7ccab735985 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap index a76e5468b..440494b37 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3b78bbb7a06c258a52afb332a04c7838 +7c6cc88697da835d33877b2df41fa1cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap index bbd3315bc..c75a7aafc 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -b41507892dd4468a821a4da411ef1d9d +ad8fc873747aaf1d3590e7ccab735985 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap index a76e5468b..440494b37 100644 --- a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/search/facet/facet_range_search.rs --- -3b78bbb7a06c258a52afb332a04c7838 +7c6cc88697da835d33877b2df41fa1cb From 4ac8f96342a7430c174c9d026bb217f3a10b34e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Dec 2022 12:19:32 +0100 Subject: [PATCH 1847/1889] Simplify implementation of equality condition in filters --- milli/src/search/facet/filter.rs | 63 ++++---------------------------- 1 file changed, 7 insertions(+), 56 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 3842a5f56..7449f828b 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -4,7 +4,6 @@ use std::ops::Bound::{self, Excluded, Included}; use either::Either; pub use filter_parser::{Condition, Error as 
FPError, FilterCondition, Span, Token}; -use heed::types::DecodeIgnore; use roaring::RoaringBitmap; use super::facet_range_search; @@ -200,20 +199,10 @@ impl<'a> Filter<'a> { .unwrap_or_default(); let number = val.parse_finite_float().ok(); let number_docids = match number { - Some(n) => { - let n = Included(n); - let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels( - rtxn, - numbers_db, - field_id, - 0, - n, - n, - &mut output, - )?; - output - } + Some(n) => numbers_db + .get(rtxn, &FacetGroupKey { field_id, level: 0, left_bound: n })? + .map(|v| v.bitmap) + .unwrap_or_default(), None => RoaringBitmap::new(), }; return Ok(string_docids | number_docids); @@ -226,40 +215,9 @@ impl<'a> Filter<'a> { } }; - // Ask for the biggest value that can exist for this specific field, if it exists - // that's fine if it don't, the value just before will be returned instead. - let biggest_level = numbers_db - .remap_data_type::() - .get_lower_than_or_equal_to( - rtxn, - &FacetGroupKey { field_id, level: u8::MAX, left_bound: f64::MAX }, - )? - .and_then( - |(FacetGroupKey { field_id: id, level, .. }, _)| { - if id == field_id { - Some(level) - } else { - None - } - }, - ); - - match biggest_level { - Some(level) => { - let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels( - rtxn, - numbers_db, - field_id, - level, - left, - right, - &mut output, - )?; - Ok(output) - } - None => Ok(RoaringBitmap::new()), - } + let mut output = RoaringBitmap::new(); + Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?; + Ok(output) } /// Aggregates the documents ids that are part of the specified range automatically @@ -268,18 +226,11 @@ impl<'a> Filter<'a> { rtxn: &heed::RoTxn, db: heed::Database, FacetGroupValueCodec>, field_id: FieldId, - level: u8, left: Bound, right: Bound, output: &mut RoaringBitmap, ) -> Result<()> { match (left, right) { - // If the request is an exact value we must go directly to the deepest level. 
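// Aside: the exact-value special case removed in this hunk is no longer needed because
// equality is now resolved before explore_facet_number_levels is ever called. Level 0 of
// the facet number database stores one FacetGroupKey per (field_id, value) pair, while the
// higher levels only group runs of consecutive level-0 entries, so a `field = value`
// condition is a single point lookup. A minimal, illustrative sketch of that path,
// assuming the FacetGroupKey/FacetGroupValue codec types used elsewhere in this module;
// the helper name `equality_docids` is hypothetical and not part of the patch:
fn equality_docids(
    rtxn: &heed::RoTxn,
    numbers_db: heed::Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
    field_id: FieldId,
    value: f64,
) -> heed::Result<RoaringBitmap> {
    // A `get` at level 0 either finds the exact facet value and returns its bitmap of
    // document ids, or yields an empty bitmap when no document holds that value.
    Ok(numbers_db
        .get(rtxn, &FacetGroupKey { field_id, level: 0, left_bound: value })?
        .map(|group_value| group_value.bitmap)
        .unwrap_or_default())
}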
- (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_number_levels( - rtxn, db, field_id, 0, left, right, output, - ); - } // lower TO upper when lower > upper must return no result (Included(l), Included(r)) if l > r => return Ok(()), (Included(l), Excluded(r)) if l >= r => return Ok(()), From e688581c364893c9fdd025ad78a0387f3336e4ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Dec 2022 12:22:41 +0100 Subject: [PATCH 1848/1889] Add tests for facet range search on different field ids --- milli/src/search/facet/facet_range_search.rs | 21 +- .../field_id_0_exact_0.hash.snap | 4 + .../field_id_0_exact_1.hash.snap | 4 + .../field_id_0_exact_2.hash.snap | 4 + .../field_id_0_exact_3.hash.snap | 4 + .../field_id_1_exact_0.snap | 260 ++++++++++++++++++ .../field_id_1_exact_1.snap | 260 ++++++++++++++++++ .../field_id_1_exact_2.hash.snap | 4 + .../field_id_1_exact_3.hash.snap | 4 + 9 files changed, 562 insertions(+), 3 deletions(-) create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_0.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_1.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_3.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_0.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_1.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_2.hash.snap create mode 100644 milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_3.hash.snap diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 81f9bba77..b1ab6f71f 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -611,7 +611,8 @@ mod tests { ]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); - let mut results = String::new(); + let mut results_0 = String::new(); + let mut results_1 = String::new(); for i in 0..=255 { let i = i as f64; let start = Bound::Included(i); @@ -627,9 +628,23 @@ mod tests { ) .unwrap(); #[allow(clippy::format_push_string)] - results.push_str(&format!("{i}: {}\n", display_bitmap(&docids))); + results_0.push_str(&format!("{i}: {}\n", display_bitmap(&docids))); + + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 1, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results_1.push_str(&format!("{i}: {}\n", display_bitmap(&docids))); } - milli_snap!(results, format!("exact_{i}")); + milli_snap!(results_0, format!("field_id_0_exact_{i}")); + milli_snap!(results_1, format!("field_id_1_exact_{i}")); drop(txn); } diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_0.hash.snap new file mode 100644 index 000000000..67965fcd4 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9c25261cec7275cb5cfd85835904d023 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_1.hash.snap new file mode 100644 index 000000000..c43ba2152 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2f97f18c15e915853e4df879be6e1f63 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_2.hash.snap new file mode 100644 index 000000000..67965fcd4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9c25261cec7275cb5cfd85835904d023 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_3.hash.snap new file mode 100644 index 000000000..c43ba2152 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2f97f18c15e915853e4df879be6e1f63 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_0.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_0.snap new file mode 100644 index 000000000..6cf7aa46c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_0.snap @@ -0,0 +1,260 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +0: [] +1: [] +2: [] +3: [] +4: [] +5: [] +6: [] +7: [] +8: [] +9: [] +10: [] +11: [] +12: [] +13: [] +14: [] +15: [] +16: [] +17: [] +18: [] +19: [] +20: [] +21: [] +22: [] +23: [] +24: [] +25: [] +26: [] +27: [] +28: [] +29: [] +30: [] +31: [] +32: [] +33: [] +34: [] +35: [] +36: [] +37: [] +38: [] +39: [] +40: [] +41: [] +42: [] +43: [] +44: [] +45: [] +46: [] +47: [] +48: [] +49: [] +50: [] +51: [] +52: [] +53: [] +54: [] +55: [] +56: [] +57: [] +58: [] +59: [] +60: [] +61: [] +62: [] +63: [] +64: [] +65: [] +66: [] +67: [] +68: [] +69: [] +70: [] +71: [] +72: [] +73: [] +74: [] +75: [] +76: [] +77: [] +78: [] +79: [] +80: [] +81: [] +82: [] +83: [] +84: [] +85: [] +86: [] +87: [] +88: [] +89: [] +90: [] +91: [] +92: [] +93: [] +94: [] +95: [] +96: [] +97: [] +98: [] +99: [] +100: [] +101: [] +102: [] +103: [] +104: [] +105: [] +106: [] +107: [] +108: [] +109: [] +110: [] +111: [] +112: [] +113: [] +114: [] +115: [] +116: [] +117: [] +118: [] +119: [] +120: [] +121: [] +122: [] +123: [] +124: [] +125: [] +126: [] +127: [] +128: [] +129: [] +130: [] +131: [] +132: [] +133: [] +134: [] +135: [] +136: [] +137: [] +138: [] +139: [] +140: [] +141: [] +142: [] +143: [] +144: [] +145: [] +146: [] +147: [] +148: [] +149: [] +150: [] +151: [] +152: [] +153: [] +154: [] +155: [] +156: [] +157: [] +158: [] +159: [] +160: [] +161: [] +162: [] 
+163: [] +164: [] +165: [] +166: [] +167: [] +168: [] +169: [] +170: [] +171: [] +172: [] +173: [] +174: [] +175: [] +176: [] +177: [] +178: [] +179: [] +180: [] +181: [] +182: [] +183: [] +184: [] +185: [] +186: [] +187: [] +188: [] +189: [] +190: [] +191: [] +192: [] +193: [] +194: [] +195: [] +196: [] +197: [] +198: [] +199: [] +200: [] +201: [] +202: [] +203: [] +204: [] +205: [] +206: [] +207: [] +208: [] +209: [] +210: [] +211: [] +212: [] +213: [] +214: [] +215: [] +216: [] +217: [] +218: [] +219: [] +220: [] +221: [] +222: [] +223: [] +224: [] +225: [] +226: [] +227: [] +228: [] +229: [] +230: [] +231: [] +232: [] +233: [] +234: [] +235: [] +236: [] +237: [] +238: [] +239: [] +240: [] +241: [] +242: [] +243: [] +244: [] +245: [] +246: [] +247: [] +248: [] +249: [] +250: [] +251: [] +252: [] +253: [] +254: [] +255: [] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_1.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_1.snap new file mode 100644 index 000000000..6cf7aa46c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_1.snap @@ -0,0 +1,260 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +0: [] +1: [] +2: [] +3: [] +4: [] +5: [] +6: [] +7: [] +8: [] +9: [] +10: [] +11: [] +12: [] +13: [] +14: [] +15: [] +16: [] +17: [] +18: [] +19: [] +20: [] +21: [] +22: [] +23: [] +24: [] +25: [] +26: [] +27: [] +28: [] +29: [] +30: [] +31: [] +32: [] +33: [] +34: [] +35: [] +36: [] +37: [] +38: [] +39: [] +40: [] +41: [] +42: [] +43: [] +44: [] +45: [] +46: [] +47: [] +48: [] +49: [] +50: [] +51: [] +52: [] +53: [] +54: [] +55: [] +56: [] +57: [] +58: [] +59: [] +60: [] +61: [] +62: [] +63: [] +64: [] +65: [] +66: [] +67: [] +68: [] +69: [] +70: [] +71: [] +72: [] +73: [] +74: [] +75: [] +76: [] +77: [] +78: [] +79: [] +80: [] +81: [] +82: [] +83: [] +84: [] +85: [] +86: [] +87: [] +88: [] +89: [] +90: [] +91: [] +92: [] +93: [] +94: [] +95: [] +96: [] +97: [] +98: [] +99: [] +100: [] +101: [] +102: [] +103: [] +104: [] +105: [] +106: [] +107: [] +108: [] +109: [] +110: [] +111: [] +112: [] +113: [] +114: [] +115: [] +116: [] +117: [] +118: [] +119: [] +120: [] +121: [] +122: [] +123: [] +124: [] +125: [] +126: [] +127: [] +128: [] +129: [] +130: [] +131: [] +132: [] +133: [] +134: [] +135: [] +136: [] +137: [] +138: [] +139: [] +140: [] +141: [] +142: [] +143: [] +144: [] +145: [] +146: [] +147: [] +148: [] +149: [] +150: [] +151: [] +152: [] +153: [] +154: [] +155: [] +156: [] +157: [] +158: [] +159: [] +160: [] +161: [] +162: [] +163: [] +164: [] +165: [] +166: [] +167: [] +168: [] +169: [] +170: [] +171: [] +172: [] +173: [] +174: [] +175: [] +176: [] +177: [] +178: [] +179: [] +180: [] +181: [] +182: [] +183: [] +184: [] +185: [] +186: [] +187: [] +188: [] +189: [] +190: [] +191: [] +192: [] +193: [] +194: [] +195: [] +196: [] +197: [] +198: [] +199: [] +200: [] +201: [] +202: [] +203: [] +204: [] +205: [] +206: [] +207: [] +208: [] +209: [] +210: [] +211: [] +212: [] +213: [] +214: [] +215: [] +216: [] +217: [] +218: [] +219: [] +220: [] +221: [] +222: [] +223: [] +224: [] +225: [] +226: [] +227: [] +228: [] +229: [] +230: [] +231: [] +232: [] +233: [] +234: [] +235: [] +236: [] +237: [] +238: [] +239: [] +240: [] +241: [] +242: [] +243: [] +244: [] +245: [] +246: [] +247: [] +248: [] +249: [] +250: [] +251: [] +252: [] +253: [] +254: [] +255: [] + diff --git 
a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_2.hash.snap new file mode 100644 index 000000000..67965fcd4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9c25261cec7275cb5cfd85835904d023 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_3.hash.snap new file mode 100644 index 000000000..c43ba2152 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2f97f18c15e915853e4df879be6e1f63 From d38cc73630187c03f1a113e01e41225df7b519d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Dec 2022 14:11:20 +0100 Subject: [PATCH 1849/1889] Add one more filter "integration" test --- milli/src/search/facet/filter.rs | 83 ++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7449f828b..23cbb280c 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -398,10 +398,12 @@ impl<'a> From> for Filter<'a> { #[cfg(test)] mod tests { use std::fmt::Write; + use std::iter::FromIterator; use big_s::S; use either::Either; use maplit::hashset; + use roaring::RoaringBitmap; use crate::index::tests::TempIndex; use crate::Filter; @@ -752,4 +754,85 @@ mod tests { Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_))) )); } + + #[test] + fn filter_number() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! 
{ S("id"), S("one"), S("two") }); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..100 { + docs.push(serde_json::json!({ "id": i, "two": i % 10 })); + } + + index.add_documents(documents!(docs)).unwrap(); + + let rtxn = index.read_txn().unwrap(); + for i in 0..100 { + let filter_str = format!("id = {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter([i])); + } + for i in 0..100 { + let filter_str = format!("id > {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter((i + 1)..100)); + } + for i in 0..100 { + let filter_str = format!("id < {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter(0..i)); + } + for i in 0..100 { + let filter_str = format!("id <= {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter(0..=i)); + } + for i in 0..100 { + let filter_str = format!("id >= {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter(i..100)); + } + for i in 0..100 { + for j in i..100 { + let filter_str = format!("id {i} TO {j}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter(i..=j)); + } + } + let filter = Filter::from_str("one >= 0 OR one <= 0").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::default()); + + let filter = Filter::from_str("one = 0").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::default()); + + for i in 0..10 { + for j in i..10 { + let filter_str = format!("two {i} TO {j}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!( + result, + RoaringBitmap::from_iter((0..100).filter(|x| (i..=j).contains(&(x % 10)))) + ); + } + } + let filter = Filter::from_str("two != 0").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter((0..100).filter(|x| x % 10 != 0))); + } } From f37c86e0b20dbfe2e638ba1cb3e192c9273d0f7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Dec 2022 15:44:06 +0100 Subject: [PATCH 1850/1889] Add some integration tests on the sort criterion --- milli/src/search/criteria/asc_desc.rs | 199 ++++++++++++++++++++++++++ 1 file changed, 199 insertions(+) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index fbcf1d3fe..036bd0b63 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -294,3 +294,202 @@ fn iterative_facet_string_ordered_iter<'t>( Ok(vec.into_iter()) } + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use big_s::S; + use maplit::hashset; + + use crate::index::tests::TempIndex; + use crate::{AscDesc, Filter, Search, SearchResult}; + + // Note that in this test, only the iterative sort algorithms are used. 
Set the CANDIDATES_THRESHOLD + constant to 0 to ensure that the other sort algorithms are also correct. + #[test] + fn sort_criterion_placeholder() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings + .set_sortable_fields(maplit::hashset! { S("id"), S("mod_10"), S("mod_20") }); + settings.set_criteria(vec!["sort".to_owned()]); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..100 { + docs.push( + serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }), + ); + } + + index.add_documents(documents!(docs)).unwrap(); + + let all_ids = (0..100).collect::<Vec<_>>(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![AscDesc::from_str("mod_10:desc").unwrap()]); + search.limit(100); + + let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 19, 29, 39, 49, 59, 69, 79, 89, 99, 8, 18, 28, 38, 48, 58, 68, 78, 88, 98, 7, 17, 27, 37, 47, 57, 67, 77, 87, 97, 6, 16, 26, 36, 46, 56, 66, 76, 86, 96, 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 4, 14, 24, 34, 44, 54, 64, 74, 84, 94, 3, 13, 23, 33, 43, 53, 63, 73, 83, 93, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("id:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 89, 79, 69, 59, 49, 39, 29, 19, 9, 98, 88, 78, 68, 58, 48, 38, 28, 18, 8, 97, 87, 77, 67, 57, 47, 37, 27, 17, 7, 96, 86, 76, 66, 56, 46, 36, 26, 16, 6, 95, 85, 75, 65, 55, 45, 35, 25, 15, 5, 94, 84, 74, 64, 54, 44, 34, 24, 14, 4, 93, 83, 73, 63, 53, 43, 33, 23, 13, 3, 92, 82, 72, 62, 52, 42, 32, 22, 12, 2, 91, 81, 71, 61, 51, 41, 31, 21, 11, 1, 90, 80, 70, 60, 50, 40, 30, 20, 10, 0]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("mod_20:asc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 29, 49, 69, 89, 19, 39, 59, 79, 99, 8, 28, 48, 68, 88, 18, 38, 58, 78, 98, 7, 27, 47, 67, 87, 17, 37, 57, 77, 97, 6, 26, 46, 66, 86, 16, 36, 56, 76, 96, 5, 25, 45, 65, 85, 15, 35, 55, 75, 95, 4, 24, 44, 64, 84, 14, 34, 54, 74, 94, 3, 23, 43, 63, 83, 13, 33, 53, 73, 93, 2, 22, 42, 62, 82, 12, 32, 52, 72, 92, 1, 21, 41, 61, 81, 11, 31, 51, 71, 91, 0, 20, 40, 60, 80, 10, 30, 50, 70, 90]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("mod_20:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, ..
} = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 39, 59, 79, 99, 9, 29, 49, 69, 89, 18, 38, 58, 78, 98, 8, 28, 48, 68, 88, 17, 37, 57, 77, 97, 7, 27, 47, 67, 87, 16, 36, 56, 76, 96, 6, 26, 46, 66, 86, 15, 35, 55, 75, 95, 5, 25, 45, 65, 85, 14, 34, 54, 74, 94, 4, 24, 44, 64, 84, 13, 33, 53, 73, 93, 3, 23, 43, 63, 83, 12, 32, 52, 72, 92, 2, 22, 42, 62, 82, 11, 31, 51, 71, 91, 1, 21, 41, 61, 81, 10, 30, 50, 70, 90, 0, 20, 40, 60, 80]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("mod_20:desc").unwrap(), + AscDesc::from_str("id:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 79, 59, 39, 19, 89, 69, 49, 29, 9, 98, 78, 58, 38, 18, 88, 68, 48, 28, 8, 97, 77, 57, 37, 17, 87, 67, 47, 27, 7, 96, 76, 56, 36, 16, 86, 66, 46, 26, 6, 95, 75, 55, 35, 15, 85, 65, 45, 25, 5, 94, 74, 54, 34, 14, 84, 64, 44, 24, 4, 93, 73, 53, 33, 13, 83, 63, 43, 23, 3, 92, 72, 52, 32, 12, 82, 62, 42, 22, 2, 91, 71, 51, 31, 11, 81, 61, 41, 21, 1, 90, 70, 50, 30, 10, 80, 60, 40, 20, 0]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + } + + // Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THRESHOLD + constant to 0 to ensure that the other sort algorithms are also correct. + #[test] + fn sort_criterion_non_placeholder() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); + settings.set_sortable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); + settings.set_criteria(vec!["sort".to_owned()]); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..100 { + docs.push( + serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }), + ); + } + + index.add_documents(documents!(docs)).unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.filter( + Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]") + .unwrap() + .unwrap(), + ); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("mod_20:asc").unwrap(), + AscDesc::from_str("id:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, ..
} = search.execute().unwrap(); + // The order should be in decreasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 93, 73, 53, 33, 13, 82, 62, 42, 22, 2, 92, 72, 52, 32, 12, 81, 61, 41, 21, 1, 91, 71, 51, 31, 11, 80, 60, 40, 20, 0, 90, 70, 50, 30, 10]"); + let expected_ids = (0..100) + .filter(|id| { + [1, 0, 2].contains(&(id % 10)) + || [10, 13].contains(&(id % 20)) + || [5, 6].contains(id) + }) + .collect::<Vec<_>>(); + documents_ids.sort(); + assert_eq!(expected_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.filter( + Filter::from_str("mod_10 IN [7, 8, 0] OR mod_20 IN [1, 15, 16] OR id IN [0, 4]") + .unwrap() + .unwrap(), + ); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:asc").unwrap(), + AscDesc::from_str("mod_20:asc").unwrap(), + AscDesc::from_str("id:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); + // The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[80, 60, 40, 20, 0, 90, 70, 50, 30, 10, 81, 61, 41, 21, 1, 4, 95, 75, 55, 35, 15, 96, 76, 56, 36, 16, 87, 67, 47, 27, 7, 97, 77, 57, 37, 17, 88, 68, 48, 28, 8, 98, 78, 58, 38, 18]"); + let expected_ids = (0..100) + .filter(|id| { + [7, 8, 0].contains(&(id % 10)) + || [1, 15, 16].contains(&(id % 20)) + || [0, 4].contains(id) + }) + .collect::<Vec<_>>(); + documents_ids.sort(); + assert_eq!(expected_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.filter( + Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]") + .unwrap() + .unwrap(), + ); + search.sort_criteria(vec![AscDesc::from_str("id:desc").unwrap()]); + search.limit(100); + + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + // The order should be in decreasing value of the id + let mut expected_ids = (0..100) + .filter(|id| { + [1, 0, 2].contains(&(id % 10)) + || [10, 13].contains(&(id % 20)) + || [5, 6].contains(id) + }) + .collect::<Vec<_>>(); + expected_ids.sort(); + expected_ids.reverse(); + assert_eq!(expected_ids, documents_ids); + } +} From 6d50ea0830faf554267ad8e27f83d8b5f9b95548 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 7 Dec 2022 16:41:23 +0100 Subject: [PATCH 1851/1889] add tests --- milli/tests/search/distinct.rs | 117 +++++++++++++++++++++++++++++---- 1 file changed, 105 insertions(+), 12 deletions(-) diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index c2b7e2c1e..f1e57d288 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -8,7 +8,7 @@ use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; macro_rules!
test_distinct { Some(d.id) } }) + .take($limit) .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); @@ -54,25 +56,116 @@ macro_rules! test_distinct { }; } +test_distinct!( + exhaustive_distinct_string_default_criteria, + tag, + true, + 1, + vec![Words, Typo, Proximity, Attribute, Exactness], + 3 +); +test_distinct!( + exhaustive_distinct_number_default_criteria, + asc_desc_rank, + true, + 1, + vec![Words, Typo, Proximity, Attribute, Exactness], + 7 +); + test_distinct!( distinct_string_default_criteria, tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), vec![Words, Typo, Proximity, Attribute, Exactness], 3 ); test_distinct!( distinct_number_default_criteria, asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), vec![Words, Typo, Proximity, Attribute, Exactness], 7 ); -test_distinct!(distinct_string_criterion_words, tag, vec![Words], 3); -test_distinct!(distinct_number_criterion_words, asc_desc_rank, vec![Words], 7); -test_distinct!(distinct_string_criterion_words_typo, tag, vec![Words, Typo], 3); -test_distinct!(distinct_number_criterion_words_typo, asc_desc_rank, vec![Words, Typo], 7); -test_distinct!(distinct_string_criterion_words_proximity, tag, vec![Words, Proximity], 3); -test_distinct!(distinct_number_criterion_words_proximity, asc_desc_rank, vec![Words, Proximity], 7); -test_distinct!(distinct_string_criterion_words_attribute, tag, vec![Words, Attribute], 3); -test_distinct!(distinct_number_criterion_words_attribute, asc_desc_rank, vec![Words, Attribute], 7); -test_distinct!(distinct_string_criterion_words_exactness, tag, vec![Words, Exactness], 3); -test_distinct!(distinct_number_criterion_words_exactness, asc_desc_rank, vec![Words, Exactness], 7); +test_distinct!( + distinct_string_criterion_words, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words], + 3 +); +test_distinct!( + distinct_number_criterion_words, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words], + 7 +); +test_distinct!( + distinct_string_criterion_words_typo, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Typo], + 3 +); +test_distinct!( + distinct_number_criterion_words_typo, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Typo], + 7 +); +test_distinct!( + distinct_string_criterion_words_proximity, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Proximity], + 3 +); +test_distinct!( + distinct_number_criterion_words_proximity, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Proximity], + 7 +); +test_distinct!( + distinct_string_criterion_words_attribute, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Attribute], + 3 +); +test_distinct!( + distinct_number_criterion_words_attribute, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Attribute], + 7 +); +test_distinct!( + distinct_string_criterion_words_exactness, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Exactness], + 3 +); +test_distinct!( + distinct_number_criterion_words_exactness, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Exactness], + 7 +); From 55724f24120c55ed2e293ee7438dd8489e77edbe Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 7 Dec 2022 18:29:25 +0100 Subject: [PATCH 1852/1889] Introduce an initial candidates set that makes the difference between an exhaustive count and an estimation --- milli/src/search/criteria/asc_desc.rs | 20 ++++---- milli/src/search/criteria/attribute.rs | 28 ++++++----- 
milli/src/search/criteria/exactness.rs | 21 ++++---- milli/src/search/criteria/final.rs | 10 ++-- milli/src/search/criteria/geo.rs | 20 ++++---- milli/src/search/criteria/initial.rs | 20 ++++---- milli/src/search/criteria/mod.rs | 69 +++++++++++++++++++++++++- milli/src/search/criteria/proximity.rs | 22 ++++---- milli/src/search/criteria/typo.rs | 39 +++++++-------- milli/src/search/criteria/words.rs | 21 ++++---- milli/src/search/mod.rs | 11 ++-- 11 files changed, 180 insertions(+), 101 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index fbcf1d3fe..fd01e806d 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -9,7 +9,7 @@ use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; use crate::heed_codec::facet::FacetGroupKeyCodec; use crate::heed_codec::ByteSliceRefCodec; -use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -27,7 +27,7 @@ pub struct AscDesc<'t> { query_tree: Option, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, faceted_candidates: RoaringBitmap, parent: Box, } @@ -81,7 +81,7 @@ impl<'t> AscDesc<'t> { candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), faceted_candidates, - bucket_candidates: RoaringBitmap::new(), + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, }) } @@ -106,7 +106,7 @@ impl<'t> Criterion for AscDesc<'t> { query_tree: self.query_tree.clone(), candidates: Some(take(&mut self.allowed_candidates)), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } None => match self.parent.next(params)? 
{ @@ -114,7 +114,7 @@ impl<'t> Criterion for AscDesc<'t> { query_tree, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { self.query_tree = query_tree; let mut candidates = match (&self.query_tree, candidates) { @@ -130,9 +130,11 @@ impl<'t> Criterion for AscDesc<'t> { candidates &= filtered_candidates; } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } if candidates.is_empty() { @@ -160,7 +162,7 @@ impl<'t> Criterion for AscDesc<'t> { query_tree: self.query_tree.clone(), candidates: Some(candidates), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index fd567a7ac..9da868e1a 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -7,7 +7,7 @@ use std::mem::take; use roaring::RoaringBitmap; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::Query; +use crate::search::criteria::{InitialCandidates, Query}; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; use crate::Result; @@ -26,7 +26,7 @@ type FlattenedQueryTree = Vec>>; pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, parent: Box, linear_buckets: Option>, set_buckets: Option>>, @@ -37,7 +37,7 @@ impl<'t> Attribute<'t> { Attribute { ctx, state: None, - bucket_candidates: RoaringBitmap::new(), + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, linear_buckets: None, set_buckets: None, @@ -60,7 +60,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { @@ -84,7 +84,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } } @@ -109,7 +109,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } } @@ -124,7 +124,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates: Some(found_candidates), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } None => match self.parent.next(params)? 
{ @@ -132,7 +132,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match candidates { Some(candidates) => candidates, @@ -148,9 +148,11 @@ impl<'t> Criterion for Attribute<'t> { let flattened_query_tree = flatten_query_tree(&query_tree); - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } self.state = Some((query_tree, flattened_query_tree, candidates)); @@ -160,13 +162,13 @@ impl<'t> Criterion for Attribute<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 580031697..b389a5d1e 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -8,6 +8,7 @@ use roaring::RoaringBitmap; use crate::search::criteria::{ resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, + InitialCandidates, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; use crate::{absolute_from_relative_position, FieldId, Result}; @@ -16,7 +17,7 @@ pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, state: Option, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, parent: Box, query: Vec, } @@ -36,7 +37,7 @@ impl<'t> Exactness<'t> { ctx, query_tree: None, state: None, - bucket_candidates: RoaringBitmap::new(), + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, query, }) @@ -68,7 +69,7 @@ impl<'t> Criterion for Exactness<'t> { query_tree: self.query_tree.clone(), candidates: Some(candidates), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } None => match self.parent.next(params)? 
{ @@ -76,7 +77,7 @@ impl<'t> Criterion for Exactness<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match candidates { Some(candidates) => candidates, @@ -90,9 +91,11 @@ impl<'t> Criterion for Exactness<'t> { candidates &= filtered_candidates; } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } self.state = Some(State::new(candidates)); @@ -102,13 +105,13 @@ impl<'t> Criterion for Exactness<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index bd3244143..9f7a147b8 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -2,6 +2,7 @@ use log::debug; use roaring::RoaringBitmap; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::InitialCandidates; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use crate::Result; @@ -14,7 +15,7 @@ pub struct FinalResult { /// The candidates of the current bucket of the last criterion. pub candidates: RoaringBitmap, /// Candidates that comes from the current bucket of the initial criterion. - pub bucket_candidates: RoaringBitmap, + pub initial_candidates: InitialCandidates, } pub struct Final<'t> { @@ -49,7 +50,7 @@ impl<'t> Final<'t> { query_tree, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match (candidates, query_tree.as_ref()) { (Some(candidates), _) => candidates, @@ -63,11 +64,12 @@ impl<'t> Final<'t> { candidates &= filtered_candidates; } - let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); + let initial_candidates = initial_candidates + .unwrap_or_else(|| InitialCandidates::Estimated(candidates.clone())); self.returned_candidates |= &candidates; - Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) + Ok(Some(FinalResult { query_tree, candidates, initial_candidates })) } None => Ok(None), } diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index 1b08cfac8..0b33e6b2f 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -4,7 +4,7 @@ use roaring::RoaringBitmap; use rstar::RTree; use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; use crate::{lat_lng_to_xyz, GeoPoint, Index, Result}; pub struct Geo<'t> { @@ -14,7 +14,7 @@ pub struct Geo<'t> { parent: Box, candidates: Box>, allowed_candidates: RoaringBitmap, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, rtree: Option>, point: [f64; 2], } @@ -47,7 +47,7 @@ impl<'t> Geo<'t> { ) -> Result { let candidates = Box::new(iter::empty()); let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?; - let bucket_candidates = 
RoaringBitmap::new(); + let initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new()); let rtree = index.geo_rtree(rtxn)?; Ok(Self { @@ -57,7 +57,7 @@ impl<'t> Geo<'t> { parent, candidates, allowed_candidates, - bucket_candidates, + initial_candidates, rtree, point, }) @@ -77,7 +77,7 @@ impl Criterion for Geo<'_> { query_tree: None, candidates: Some(candidates), filtered_candidates: None, - bucket_candidates: Some(self.bucket_candidates.clone()), + initial_candidates: Some(self.initial_candidates.clone()), })); } None => match self.parent.next(params)? { @@ -85,7 +85,7 @@ impl Criterion for Geo<'_> { query_tree, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, @@ -100,9 +100,11 @@ impl Criterion for Geo<'_> { candidates &= filtered_candidates; } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } if candidates.is_empty() { diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 85daa813b..44c08dc06 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,7 +1,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::{resolve_query_tree, Context}; +use crate::search::criteria::{resolve_query_tree, Context, InitialCandidates}; use crate::search::query_tree::Operation; use crate::search::Distinct; use crate::Result; @@ -27,7 +27,7 @@ impl<'t, D> Initial<'t, D> { query_tree, candidates: None, filtered_candidates, - bucket_candidates: None, + initial_candidates: None, }; Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct } } @@ -41,32 +41,34 @@ impl Criterion for Initial<'_, D> { .map(|mut answer| { if self.exhaustive_number_hits && answer.query_tree.is_some() { // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. + // then remove the potential soft deleted documents. let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), params.wdcache, - )?; + )? - params.excluded_candidates; // Apply the filters on the documents retrieved with the query tree. if let Some(ref filtered_candidates) = answer.filtered_candidates { candidates &= filtered_candidates; } - // because the bucket_candidates should be an exhaustive count of the matching documents, + // because the initial_candidates should be an exhaustive count of the matching documents, // we precompute the distinct attributes. 
- let bucket_candidates = match &mut self.distinct { + let initial_candidates = match &mut self.distinct { Some(distinct) => { - let mut bucket_candidates = RoaringBitmap::new(); + let mut initial_candidates = RoaringBitmap::new(); for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { - bucket_candidates.insert(c?); + initial_candidates.insert(c?); } - bucket_candidates + initial_candidates } None => candidates.clone(), }; answer.candidates = Some(candidates); - answer.bucket_candidates = Some(bucket_candidates); + answer.initial_candidates = + Some(InitialCandidates::Exhaustive(initial_candidates)); } Ok(answer) }) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 76718c8ec..eb83f5515 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,5 +1,7 @@ use std::borrow::Cow; use std::collections::HashMap; +use std::mem::take; +use std::ops::{BitOr, BitOrAssign}; use roaring::RoaringBitmap; @@ -41,7 +43,7 @@ pub struct CriterionResult { /// The candidates, coming from facet filters, that this criterion is allowed to return subsets of. filtered_candidates: Option<RoaringBitmap>, /// Candidates that comes from the current bucket of the initial criterion. - bucket_candidates: Option<RoaringBitmap>, + initial_candidates: Option<InitialCandidates>, } #[derive(Debug, PartialEq)] @@ -65,6 +67,71 @@ impl Default for Candidates { } } +/// Either a set of candidates that defines the estimated set of candidates +/// that could be returned, +/// or the Exhaustive set of candidates that will be returned if all possible results are fetched. +#[derive(Debug, Clone, PartialEq)] +pub enum InitialCandidates { + Estimated(RoaringBitmap), + Exhaustive(RoaringBitmap), +} + +impl InitialCandidates { + fn take(&mut self) -> Self { + match self { + Self::Estimated(c) => Self::Estimated(take(c)), + Self::Exhaustive(c) => Self::Exhaustive(take(c)), + } + } + + /// Modify the contained roaring bitmap in place if the set isn't already Exhaustive. + pub fn map_inplace<F>(&mut self, f: F) + where + F: FnOnce(RoaringBitmap) -> RoaringBitmap, + { + if let Self::Estimated(c) = self { + *c = f(take(c)) + } + } + + pub fn into_inner(self) -> RoaringBitmap { + match self { + Self::Estimated(c) => c, + Self::Exhaustive(c) => c, + } + } +} + +impl BitOrAssign for InitialCandidates { + /// Make a union between the contained roaring bitmaps if the set isn't already Exhaustive. + /// If rhs is Exhaustive and self is not, then rhs replaces self. + fn bitor_assign(&mut self, rhs: Self) { + if let Self::Estimated(c) = self { + *self = match rhs { + Self::Estimated(rhs) => Self::Estimated(rhs | &*c), + Self::Exhaustive(rhs) => Self::Exhaustive(rhs), + } + } + } +} + +impl BitOr for InitialCandidates { + type Output = Self; + + /// Make a union between the contained roaring bitmaps if the set isn't already Exhaustive. + /// If rhs is Exhaustive and self is not, then rhs replaces self.
+ fn bitor(self, rhs: Self) -> Self::Output { + if let Self::Estimated(c) = self { + match rhs { + Self::Estimated(rhs) => Self::Estimated(rhs | c), + Self::Exhaustive(rhs) => Self::Exhaustive(rhs), + } + } else { + self.clone() + } + } +} + pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 880b3e1ba..d44ba25dd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,6 +1,5 @@ use std::collections::btree_map::{self, BTreeMap}; use std::collections::hash_map::HashMap; -use std::mem::take; use log::debug; use roaring::RoaringBitmap; @@ -10,6 +9,7 @@ use super::{ query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; +use crate::search::criteria::InitialCandidates; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; use crate::search::{build_dfa, WordDerivationsCache}; use crate::{Position, Result}; @@ -29,7 +29,7 @@ pub struct Proximity<'t> { /// (max_proximity, query_tree, allowed_candidates) state: Option<(u8, Operation, RoaringBitmap)>, proximity: u8, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, parent: Box, candidates_cache: Cache, plane_sweep_cache: Option>, @@ -41,7 +41,7 @@ impl<'t> Proximity<'t> { ctx, state: None, proximity: 0, - bucket_candidates: RoaringBitmap::new(), + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, candidates_cache: Cache::new(), plane_sweep_cache: None, @@ -115,7 +115,7 @@ impl<'t> Criterion for Proximity<'t> { query_tree: Some(query_tree.clone()), candidates: Some(new_candidates), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } None => match self.parent.next(params)? 
{ @@ -123,7 +123,7 @@ impl<'t> Criterion for Proximity<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match candidates { Some(candidates) => candidates, @@ -137,9 +137,11 @@ impl<'t> Criterion for Proximity<'t> { candidates &= filtered_candidates; } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } let maximum_proximity = maximum_proximity(&query_tree); @@ -151,13 +153,13 @@ impl<'t> Criterion for Proximity<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 2ae35e418..56cffd232 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -9,7 +9,7 @@ use super::{ query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, CriterionResult, }; -use crate::search::criteria::resolve_phrase; +use crate::search::criteria::{resolve_phrase, InitialCandidates}; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::Result; @@ -22,7 +22,7 @@ pub struct Typo<'t> { /// (max_typos, query_tree, candidates) state: Option<(u8, Operation, Candidates)>, typos: u8, - bucket_candidates: Option, + initial_candidates: Option, parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } @@ -33,7 +33,7 @@ impl<'t> Typo<'t> { ctx, state: None, typos: 0, - bucket_candidates: None, + initial_candidates: None, parent, candidates_cache: HashMap::new(), } @@ -120,9 +120,9 @@ impl<'t> Criterion for Typo<'t> { } } - let bucket_candidates = match self.bucket_candidates.as_mut() { - Some(bucket_candidates) => take(bucket_candidates), - None => candidates.clone(), + let initial_candidates = match self.initial_candidates.as_mut() { + Some(initial_candidates) => initial_candidates.take(), + None => InitialCandidates::Estimated(candidates.clone()), }; self.typos += 1; @@ -131,7 +131,7 @@ impl<'t> Criterion for Typo<'t> { query_tree: Some(new_query_tree), candidates: Some(candidates), filtered_candidates: None, - bucket_candidates: Some(bucket_candidates), + initial_candidates: Some(initial_candidates), })); } None => match self.parent.next(params)? 
{ @@ -139,14 +139,9 @@ impl<'t> Criterion for Typo<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { - self.bucket_candidates = - match (self.bucket_candidates.take(), bucket_candidates) { - (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), - (self_bc, parent_bc) => self_bc.or(parent_bc), - }; - + self.initial_candidates = initial_candidates; let candidates = match candidates.or(filtered_candidates) { Some(candidates) => { Candidates::Allowed(candidates - params.excluded_candidates) @@ -162,13 +157,13 @@ impl<'t> Criterion for Typo<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), @@ -356,7 +351,7 @@ mod test { let result = display_criteria(criteria, criterion_parameters); insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, bucket_candidates: None } + CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, initial_candidates: None } "###); } @@ -399,7 +394,7 @@ mod test { Exact { word: "split" } Exact { word: "this" } Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } CriterionResult { query_tree: Some(OR AND @@ -408,7 +403,7 @@ mod test { OR Exact { word: "word" } Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } "###); } @@ -434,7 +429,7 @@ mod test { let result = display_criteria(criteria, criterion_parameters); insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), bucket_candidates: None } + CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), initial_candidates: None } "###); } @@ -482,7 +477,7 @@ mod test { Exact { word: "split" } Exact { word: "this" } Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } CriterionResult { query_tree: Some(OR AND @@ -491,7 +486,7 @@ mod test { OR Exact { word: "word" } Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } "###); } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index b67b7f6b4..181749b60 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,9 +1,8 @@ -use std::mem::take; - use log::debug; use roaring::RoaringBitmap; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; +use 
crate::search::criteria::InitialCandidates; use crate::search::query_tree::Operation; use crate::Result; @@ -11,7 +10,7 @@ pub struct Words<'t> { ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, - bucket_candidates: Option, + initial_candidates: Option, filtered_candidates: Option, parent: Box, } @@ -22,7 +21,7 @@ impl<'t> Words<'t> { ctx, query_trees: Vec::default(), candidates: None, - bucket_candidates: None, + initial_candidates: None, parent, filtered_candidates: None, } @@ -53,13 +52,13 @@ impl<'t> Criterion for Words<'t> { None => None, }; - let bucket_candidates = self.bucket_candidates.as_mut().map(take); + let initial_candidates = self.initial_candidates.clone(); return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates: self.filtered_candidates.clone(), - bucket_candidates, + initial_candidates, })); } None => match self.parent.next(params)? { @@ -67,14 +66,14 @@ impl<'t> Criterion for Words<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { self.query_trees = explode_query_tree(query_tree); self.candidates = candidates; self.filtered_candidates = filtered_candidates; - self.bucket_candidates = - match (self.bucket_candidates.take(), bucket_candidates) { + self.initial_candidates = + match (self.initial_candidates.take(), initial_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; @@ -83,13 +82,13 @@ impl<'t> Criterion for Words<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f62a37c1b..96cf1e0f1 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -23,6 +23,7 @@ pub use self::matches::{ use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; +use crate::search::criteria::InitialCandidates; use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result}; // Building these factories is not free. @@ -235,11 +236,11 @@ impl<'a> Search<'a> { mut criteria: Final, ) -> Result { let mut offset = self.offset; - let mut initial_candidates = RoaringBitmap::new(); + let mut initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new()); let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?; let mut documents_ids = Vec::new(); - while let Some(FinalResult { candidates, bucket_candidates, .. }) = + while let Some(FinalResult { candidates, initial_candidates: ic, .. }) = criteria.next(&excluded_candidates)? 
{ debug!("Number of candidates found {}", candidates.len()); @@ -247,7 +248,7 @@ impl<'a> Search<'a> { let excluded = take(&mut excluded_candidates); let mut candidates = distinct.distinct(candidates, excluded); - initial_candidates |= bucket_candidates; + initial_candidates |= ic; if offset != 0 { let discarded = candidates.by_ref().take(offset).count(); @@ -265,9 +266,11 @@ impl<'a> Search<'a> { } } + initial_candidates.map_inplace(|c| c - excluded_candidates); + Ok(SearchResult { matching_words, - candidates: initial_candidates - excluded_candidates, + candidates: initial_candidates.into_inner(), documents_ids, }) } From bebd050961f5ecd61404dbd59aac2bfee820a0c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Dec 2022 19:18:25 +0100 Subject: [PATCH 1853/1889] Add new test for bug 3021 --- milli/src/index.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/milli/src/index.rs b/milli/src/index.rs index e9d66a3ae..8855d51cd 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2111,4 +2111,72 @@ pub(crate) mod tests { "###); db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); } + + #[test] + fn bug_3021_third() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 3 }, + { "primary_key": 4 }, + { "primary_key": 5 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 3 0 + 4 1 + 5 2 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("3"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[1, 2, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 3 0 + 4 1 + 5 2 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]"); + + index.index_documents_config.disable_soft_deletion = true; + + index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap(); + + db_snap!(index, documents_ids, @"[2, 3, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 3 0 + 4 3 + 5 2 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); + + // boom + index + .add_documents(documents!([ + { "primary_key": "3" }, + ])) + .unwrap(); + } } From e3ee553dcca16e5ff5e688da6230acdab8eacc67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 12 Dec 2022 12:42:55 +0100 Subject: [PATCH 1854/1889] Remove soft deleted ids from ExternalDocumentIds during document import If the document import replaces a document using hard deletion --- milli/src/index.rs | 12 ++++++++-- milli/src/update/delete_documents.rs | 31 +++++++++++++++++++++---- milli/src/update/index_documents/mod.rs | 9 ++++--- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 8855d51cd..2d489fbd1 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2166,17 +2166,25 @@ pub(crate) mod tests { db_snap!(index, external_documents_ids, 2, @r###" soft: hard: - 3 0 4 3 5 2 "###); db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); - // 
boom index .add_documents(documents!([ { "primary_key": "3" }, ])) .unwrap(); + + db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 3 0 + 4 3 + 5 2 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); } } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 88ec78420..0f77e2b13 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -34,6 +34,12 @@ pub struct DocumentDeletionResult { pub deleted_documents: u64, pub remaining_documents: u64, } +#[derive(Debug)] +pub struct DetailedDocumentDeletionResult { + pub deleted_documents: u64, + pub remaining_documents: u64, + pub used_soft_deletion: bool, +} impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { pub fn new( @@ -68,8 +74,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.delete_document(docid); Some(docid) } + pub fn execute(self) -> Result { + let DetailedDocumentDeletionResult { + deleted_documents, + remaining_documents, + used_soft_deletion: _, + } = self.execute_inner()?; - pub fn execute(mut self) -> Result { + Ok(DocumentDeletionResult { deleted_documents, remaining_documents }) + } + pub(crate) fn execute_inner(mut self) -> Result { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; // We retrieve the current documents ids that are in the database. @@ -83,7 +97,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if !soft_deleted_docids.is_empty() { ClearDocuments::new(self.wtxn, self.index).execute()?; } - return Ok(DocumentDeletionResult { deleted_documents: 0, remaining_documents: 0 }); + return Ok(DetailedDocumentDeletionResult { + deleted_documents: 0, + remaining_documents: 0, + used_soft_deletion: false, + }); } // We remove the documents ids that we want to delete @@ -95,9 +113,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // to delete is exactly the number of documents in the database. 
if current_documents_ids_len == self.to_delete_docids.len() { let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; - return Ok(DocumentDeletionResult { + return Ok(DetailedDocumentDeletionResult { deleted_documents: current_documents_ids_len, remaining_documents, + used_soft_deletion: false, }); } @@ -159,9 +178,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { && percentage_used_by_soft_deleted_documents < 10 { self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; - return Ok(DocumentDeletionResult { + return Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), + used_soft_deletion: true, }); } @@ -488,9 +508,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &self.to_delete_docids, )?; - Ok(DocumentDeletionResult { + Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), + used_soft_deletion: false, }) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index db6ffedc1..478a74065 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -210,7 +210,7 @@ where primary_key, fields_ids_map, field_distribution, - external_documents_ids, + mut external_documents_ids, new_documents_ids, replaced_documents_ids, documents_count, @@ -335,8 +335,11 @@ where deletion_builder.disable_soft_deletion(self.config.disable_soft_deletion); debug!("documents to delete {:?}", replaced_documents_ids); deletion_builder.delete_documents(&replaced_documents_ids); - let deleted_documents_count = deletion_builder.execute()?; - debug!("{} documents actually deleted", deleted_documents_count.deleted_documents); + let deleted_documents_result = deletion_builder.execute_inner()?; + debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); + if !deleted_documents_result.used_soft_deletion { + external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + } } let index_documents_ids = self.index.documents_ids(self.wtxn)?; From 80d34a41699d33bc03f7d9e47984fd0bbfe306a0 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 12 Dec 2022 19:02:48 +0100 Subject: [PATCH 1855/1889] Fix typo initial candidates computation --- milli/src/search/criteria/typo.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 154aa2d8e..154aa2d8e 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -141,7 +141,12 @@ impl<'t> Criterion for Typo<'t> { filtered_candidates, initial_candidates, }) => { - self.initial_candidates = initial_candidates; + self.initial_candidates = + match (self.initial_candidates.take(), initial_candidates) { + (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), + (self_bc, parent_bc) => self_bc.or(parent_bc), + }; + let candidates = match candidates.or(filtered_candidates) { Some(candidates) => { Candidates::Allowed(candidates - params.excluded_candidates) From be3b00350c9fed3c6d1bd2e94e78b8af5e81897a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 13 Dec 2022 10:15:22 +0100 Subject: [PATCH 1856/1889] Apply review suggestions: naming and documentation --- milli/src/update/delete_documents.rs | 20 +++++++++++++------- milli/src/update/index_documents/mod.rs | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git
a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 0f77e2b13..6c0f66685 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -29,16 +29,22 @@ pub struct DeleteDocuments<'t, 'u, 'i> { disable_soft_deletion: bool, } +/// Result of a [`DeleteDocuments`] operation. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct DocumentDeletionResult { pub deleted_documents: u64, pub remaining_documents: u64, } + +/// Result of a [`DeleteDocuments`] operation, used for internal purposes. +/// +/// It is a superset of the [`DocumentDeletionResult`] structure, giving +/// additional information about the algorithm used to delete the documents. #[derive(Debug)] -pub struct DetailedDocumentDeletionResult { +pub(crate) struct DetailedDocumentDeletionResult { pub deleted_documents: u64, pub remaining_documents: u64, - pub used_soft_deletion: bool, + pub soft_deletion_used: bool, } impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { @@ -78,7 +84,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let DetailedDocumentDeletionResult { deleted_documents, remaining_documents, - used_soft_deletion: _, + soft_deletion_used: _, } = self.execute_inner()?; Ok(DocumentDeletionResult { deleted_documents, remaining_documents }) @@ -100,7 +106,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { return Ok(DetailedDocumentDeletionResult { deleted_documents: 0, remaining_documents: 0, - used_soft_deletion: false, + soft_deletion_used: false, }); } @@ -116,7 +122,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { return Ok(DetailedDocumentDeletionResult { deleted_documents: current_documents_ids_len, remaining_documents, - used_soft_deletion: false, + soft_deletion_used: false, }); } @@ -181,7 +187,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { return Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), - used_soft_deletion: true, + soft_deletion_used: true, }); } @@ -511,7 +517,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), - used_soft_deletion: false, + soft_deletion_used: false, }) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 478a74065..74a8d2779 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -337,7 +337,7 @@ where deletion_builder.delete_documents(&replaced_documents_ids); let deleted_documents_result = deletion_builder.execute_inner()?; debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); - if !deleted_documents_result.used_soft_deletion { + if !deleted_documents_result.soft_deletion_used { external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; } } From 2d8d0af1a6549012bd04cf6472b234e431a26fbe Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 13 Dec 2022 10:56:38 +0100 Subject: [PATCH 1857/1889] Rename short name bc by ic for initial_candidates --- milli/src/search/criteria/typo.rs | 4 ++-- milli/src/search/criteria/words.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 154aa2d8e..20bc718fd 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -143,8 +143,8 @@ impl<'t> Criterion for Typo<'t> { }) => { self.initial_candidates = 
match (self.initial_candidates.take(), initial_candidates) { - (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), - (self_bc, parent_bc) => self_bc.or(parent_bc), + (Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic), + (self_ic, parent_ic) => self_ic.or(parent_ic), }; let candidates = match candidates.or(filtered_candidates) { diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 181749b60..4c5f8b45b 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -74,8 +74,8 @@ impl<'t> Criterion for Words<'t> { self.initial_candidates = match (self.initial_candidates.take(), initial_candidates) { - (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), - (self_bc, parent_bc) => self_bc.or(parent_bc), + (Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic), + (self_ic, parent_ic) => self_ic.or(parent_ic), }; } Some(CriterionResult { From 2c47500bc3b9f0d78f1ff29f3331f062ccf33b4e Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 13 Dec 2022 15:29:52 +0100 Subject: [PATCH 1858/1889] fix two nightly errors --- filter-parser/src/error.rs | 2 +- filter-parser/src/lib.rs | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 8a628156a..ea95caba7 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -33,7 +33,7 @@ impl NomErrorExt for nom::Err { /// cut a parser and map the error pub fn cut_with_err<'a, O>( - mut parser: impl FnMut(Span<'a>) -> IResult, + mut parser: impl FnMut(Span<'a>) -> IResult<'a, O>, mut with: impl FnMut(Error<'a>) -> Error<'a>, ) -> impl FnMut(Span<'a>) -> IResult { move |input| match parser.parse(input) { diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 61801e3d4..d10136ace 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -170,7 +170,9 @@ impl<'a> FilterCondition<'a> { } /// remove OPTIONAL whitespaces before AND after the provided parser. 
-fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) -> IResult { +fn ws<'a, O>( + inner: impl FnMut(Span<'a>) -> IResult<'a, O>, +) -> impl FnMut(Span<'a>) -> IResult<'a, O> { delimited(multispace0, inner, multispace0) } From 739da9fd4dd1aae5d8b6c1f7ff624b12016d85b0 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 13 Dec 2022 15:54:43 +0100 Subject: [PATCH 1859/1889] Add test --- milli/tests/search/distinct.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index f1e57d288..3c6dd8cc0 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -72,6 +72,14 @@ test_distinct!( vec![Words, Typo, Proximity, Attribute, Exactness], 7 ); +test_distinct!( + exhaustive_distinct_number_weird_order_criteria, + asc_desc_rank, + true, + 0, + vec![Desc(S("attribute_rank")), Desc(S("exactness_rank")), Exactness, Typo], + 7 +); test_distinct!( distinct_string_default_criteria, From 3322018c066824ac5a112a3abf258c9dda78d77f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 14 Dec 2022 20:09:47 +0100 Subject: [PATCH 1860/1889] Fix placeholder search --- milli/src/search/criteria/initial.rs | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 44c08dc06..0826a9f68 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -39,19 +39,24 @@ impl Criterion for Initial<'_, D> { self.answer .take() .map(|mut answer| { - if self.exhaustive_number_hits && answer.query_tree.is_some() { + if self.exhaustive_number_hits { // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. - // then remove the potential soft deleted documents. - let mut candidates = resolve_query_tree( - self.ctx, - answer.query_tree.as_ref().unwrap(), - params.wdcache, - )? - params.excluded_candidates; + let candidates = answer + .query_tree + .as_ref() + .map(|query_tree| resolve_query_tree(self.ctx, query_tree, params.wdcache)) + .transpose()?; - // Apply the filters on the documents retrieved with the query tree. - if let Some(ref filtered_candidates) = answer.filtered_candidates { - candidates &= filtered_candidates; - } + // then intersect the candidates with the potential filtered candidates. + let mut candidates = match (candidates, answer.filtered_candidates.take()) { + (Some(candidates), Some(filtered)) => candidates & filtered, + (Some(candidates), None) => candidates, + (None, Some(filtered)) => filtered, + (None, None) => self.ctx.documents_ids()?, + }; + + // then remove the potential soft deleted documents. + candidates -= params.excluded_candidates; // because the initial_candidates should be an exhaustive count of the matching documents, // we precompute the distinct attributes. 
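The patches above give `InitialCandidates` one consistent merge rule: an `Estimated` set keeps growing as parent and child buckets are unioned into it, while an `Exhaustive` set is final and absorbs any `Estimated` set it meets. The following standalone sketch is an editor's illustration, not part of the patch series; it only mirrors the `BitOr` semantics introduced in `milli/src/search/criteria/mod.rs` above and assumes nothing beyond the `roaring` crate:

use roaring::RoaringBitmap;

#[derive(Debug, Clone, PartialEq)]
enum InitialCandidates {
    Estimated(RoaringBitmap),
    Exhaustive(RoaringBitmap),
}

impl std::ops::BitOr for InitialCandidates {
    type Output = Self;

    fn bitor(self, rhs: Self) -> Self {
        match (self, rhs) {
            // two estimates merge into a larger estimate
            (Self::Estimated(l), Self::Estimated(r)) => Self::Estimated(l | r),
            // an exhaustive count replaces a mere estimate
            (Self::Estimated(_), Self::Exhaustive(r)) => Self::Exhaustive(r),
            // an already exhaustive set ignores whatever is merged into it
            (lhs, _) => lhs,
        }
    }
}

fn main() {
    let a = InitialCandidates::Estimated([1u32, 2].into_iter().collect());
    let b = InitialCandidates::Estimated([2u32, 3].into_iter().collect());
    let e = InitialCandidates::Exhaustive([9u32].into_iter().collect());

    // estimates grow: {1, 2} | {2, 3} == {1, 2, 3}
    assert_eq!(
        a.clone() | b,
        InitialCandidates::Estimated([1u32, 2, 3].into_iter().collect())
    );
    // the exhaustive count wins regardless of the merge direction
    assert_eq!(a.clone() | e.clone(), e.clone());
    assert_eq!(e.clone() | a, e);
}

This is the property the `Typo` and `Words` criteria rely on when merging their own bucket with the parent's (`self_ic | parent_ic`): once `Initial` has produced an `Exhaustive` count, no later union can dilute it back into an estimate.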
From 96d4242b9372d06adc63537dd22de8bf73b34c56 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 12 Dec 2022 14:53:08 +0100 Subject: [PATCH 1861/1889] Update charabia --- milli/Cargo.toml | 6 +++++- milli/src/search/matches/mod.rs | 6 +++--- milli/src/search/query_tree.rs | 8 ++++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 60d45730c..871716ecb 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -9,7 +9,7 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "1.0.1" byteorder = "1.4.3" -charabia = { version = "0.6.0", default-features = false } +charabia = { version = "0.7.0", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.6" either = "1.8.0" @@ -70,6 +70,10 @@ hebrew = ["charabia/hebrew"] # allow japanese specialized tokenization japanese = ["charabia/japanese"] +japanese-transliteration = ["charabia/japanese-transliteration"] + +# allow korean specialized tokenization +korean = ["charabia/korean"] # allow thai specialized tokenization thai = ["charabia/thai"] diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 25ee52ab1..6ac5123a8 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -14,14 +14,14 @@ const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; /// Structure used to build a Matcher allowing to customize formating tags. pub struct MatcherBuilder<'a, A> { matching_words: MatchingWords, - tokenizer: Tokenizer<'a, A>, + tokenizer: Tokenizer<'a, 'a, A>, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, } impl<'a, A> MatcherBuilder<'a, A> { - pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self { + pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { Self { matching_words, tokenizer, @@ -106,7 +106,7 @@ pub struct MatchBounds { pub struct Matcher<'t, 'm, A> { text: &'t str, matching_words: &'m MatchingWords, - tokenizer: &'m Tokenizer<'m, A>, + tokenizer: &'m Tokenizer<'m, 'm, A>, crop_marker: &'m str, highlight_prefix: &'m str, highlight_suffix: &'m str, diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index e689ae440..b5399f6e6 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -6,7 +6,7 @@ use std::hash::Hash; use std::rc::Rc; use std::{fmt, mem}; -use charabia::classifier::ClassifiedTokenIter; +use charabia::normalizer::NormalizedTokenIter; use charabia::{SeparatorKind, TokenKind}; use roaring::RoaringBitmap; use slice_group_by::GroupBy; @@ -270,7 +270,7 @@ impl<'a> QueryTreeBuilder<'a> { /// (the criterion `typo` will be ignored) pub fn build>( &self, - query: ClassifiedTokenIter, + query: NormalizedTokenIter, ) -> Result> { let primitive_query = create_primitive_query(query, self.words_limit); if !primitive_query.is_empty() { @@ -778,7 +778,7 @@ impl PrimitiveQueryPart { /// Create primitive query from tokenized query string, /// the primitive query is an intermediate state to build the query tree. 
fn create_primitive_query( - query: ClassifiedTokenIter, + query: NormalizedTokenIter, words_limit: Option, ) -> PrimitiveQuery where @@ -892,7 +892,7 @@ mod test { terms_matching_strategy: TermsMatchingStrategy, authorize_typos: bool, words_limit: Option, - query: ClassifiedTokenIter, + query: NormalizedTokenIter, ) -> Result> { let primitive_query = create_primitive_query(query, words_limit); if !primitive_query.is_empty() { From 7f88c4ff2f90cabb1fd8d3d93dbd81edee2318df Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 13 Dec 2022 16:21:31 +0100 Subject: [PATCH 1862/1889] Fix #1714 test --- milli/src/update/index_documents/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 74a8d2779..e6d387a0d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1575,11 +1575,11 @@ mod tests { let rtxn = index.read_txn().unwrap(); // Only the first document should match. - let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len(); + let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len(); assert_eq!(count, 1); // Only the second document should match. - let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len(); + let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len(); assert_eq!(count, 1); let mut search = crate::Search::new(&rtxn, &index); From fc7618d49b1d1b3eb8004eb7ac7432cf2ea148c5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 09:47:54 +0100 Subject: [PATCH 1863/1889] Add DeletionStrategy --- milli/src/update/delete_documents.rs | 38 ++++++++++++++++++++++--- milli/src/update/index_documents/mod.rs | 8 +++--- milli/src/update/mod.rs | 2 +- 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 6c0f66685..25dc9fa12 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -26,7 +26,7 @@ pub struct DeleteDocuments<'t, 'u, 'i> { index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, to_delete_docids: RoaringBitmap, - disable_soft_deletion: bool, + strategy: DeletionStrategy, } /// Result of a [`DeleteDocuments`] operation. @@ -36,6 +36,36 @@ pub struct DocumentDeletionResult { pub remaining_documents: u64, } +/// Strategy for deleting documents. +/// +/// - Soft-deleted documents are simply marked as deleted without being actually removed from DB. +/// - Hard-deleted documents are definitely suppressed from the DB. +/// +/// Soft-deleted documents trade disk space for runtime performance. +/// +/// Note that any of these variants can be used at any given moment for any indexation in a database. +/// For instance, you can use an [`AlwaysSoft`] followed by an [`AlwaysHard`] option without issue. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub enum DeletionStrategy { + #[default] + /// Definitely suppress documents according to the number and size of soft-deleted documents + Dynamic, + /// Never definitely suppress documents + AlwaysSoft, + /// Always definitely suppress documents + AlwaysHard, +} + +impl std::fmt::Display for DeletionStrategy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DeletionStrategy::Dynamic => write!(f, "dynamic"), + DeletionStrategy::AlwaysSoft => write!(f, "always_soft"), + DeletionStrategy::AlwaysHard => write!(f, "always_hard"), + } + } +} + /// Result of a [`DeleteDocuments`] operation, used for internal purposes. /// /// It is a superset of the [`DocumentDeletionResult`] structure, giving @@ -59,12 +89,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index, external_documents_ids, to_delete_docids: RoaringBitmap::new(), - disable_soft_deletion: false, + strategy: Default::default(), }) } - pub fn disable_soft_deletion(&mut self, disable: bool) { - self.disable_soft_deletion = disable; + pub fn strategy(&mut self, strategy: DeletionStrategy) { + self.strategy = strategy; } pub fn delete_document(&mut self, docid: u32) { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 74a8d2779..7b8408fe4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -35,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, - WordPrefixPositionDocids, WordsPrefixesFst, + self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, + WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -88,7 +88,7 @@ pub struct IndexDocumentsConfig { pub words_positions_level_group_size: Option, pub words_positions_min_level_size: Option, pub update_method: IndexDocumentsMethod, - pub disable_soft_deletion: bool, + pub deletion_strategy: DeletionStrategy, pub autogenerate_docids: bool, } @@ -332,7 +332,7 @@ where // able to simply insert all the documents even if they already exist in the database.
if !replaced_documents_ids.is_empty() { let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; - deletion_builder.disable_soft_deletion(self.config.disable_soft_deletion); + deletion_builder.strategy(self.config.deletion_strategy); debug!("documents to delete {:?}", replaced_documents_ids); deletion_builder.delete_documents(&replaced_documents_ids); let deleted_documents_result = deletion_builder.execute_inner()?; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 952720725..2dda24172 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,6 +1,6 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; -pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; +pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult}; pub use self::facet::bulk::FacetsUpdateBulk; pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ From e2ae3b24aaf28ae034fa25ba14fd486710860267 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 09:38:59 +0100 Subject: [PATCH 1864/1889] Hard or soft delete according to the deletion strategy --- milli/src/update/delete_documents.rs | 54 +++++++++++++++------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 25dc9fa12..52b8f5f4a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -186,33 +186,39 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { soft_deleted_docids |= &self.to_delete_docids; - // if we have less documents to delete than the threshold we simply save them in - // the `soft_deleted_documents_ids` bitmap and early exit. - let size_used = self.index.used_size()?; - let map_size = self.index.env.map_size()? as u64; - let nb_documents = self.index.number_of_documents(self.wtxn)?; - let nb_soft_deleted = soft_deleted_docids.len(); + // decide for a hard or soft deletion depending on the strategy + let soft_deletion = match self.strategy { + DeletionStrategy::Dynamic => { + // if we have less documents to delete than the threshold we simply save them in + // the `soft_deleted_documents_ids` bitmap and early exit. + let size_used = self.index.used_size()?; + let map_size = self.index.env.map_size()? as u64; + let nb_documents = self.index.number_of_documents(self.wtxn)?; + let nb_soft_deleted = soft_deleted_docids.len(); - let percentage_available = 100 - (size_used * 100 / map_size); - let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); - let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted; - let percentage_used_by_soft_deleted_documents = - estimated_size_used_by_soft_deleted * 100 / map_size; + let percentage_available = 100 - (size_used * 100 / map_size); + let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); + let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted; + let percentage_used_by_soft_deleted_documents = + estimated_size_used_by_soft_deleted * 100 / map_size; - // if we have more than 10% of disk space available and the soft deleted - // documents uses less than 10% of the total space available, - // we skip the deletion. Eg. - // - With 100Go of disk and 20Go used including 5Go of soft-deleted documents - // We don’t delete anything. 
- // - With 100Go of disk and 95Go used including 1mo of soft-deleted documents - // We run the deletion. - // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents - // We run the deletion. + // if we have more than 10% of disk space available and the soft deleted + // documents uses less than 10% of the total space available, + // we skip the deletion. Eg. + // - With 100Go of disk and 20Go used including 5Go of soft-deleted documents + // We don’t delete anything. + // - With 100Go of disk and 95Go used including 1mo of soft-deleted documents + // We run the deletion. + // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents + // We run the deletion. + percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 + } + DeletionStrategy::AlwaysSoft => true, + DeletionStrategy::AlwaysHard => false, + }; - if !self.disable_soft_deletion - && percentage_available > 10 - && percentage_used_by_soft_deleted_documents < 10 - { + if soft_deletion { + // Keep the soft-deleted in the DB self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; return Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), From 171c942282152f86aa53af47af5c07b2f39143c9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 15 Dec 2022 12:04:46 +0100 Subject: [PATCH 1865/1889] Soft-deletion computation no longer takes into account the mapsize Implemented solution 2.3 from https://github.com/meilisearch/meilisearch/issues/3231#issuecomment-1348628824 --- milli/src/update/delete_documents.rs | 33 ++++++++++++---------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 52b8f5f4a..dbe095dd5 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -189,29 +189,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // decide for a hard or soft deletion depending on the strategy let soft_deletion = match self.strategy { DeletionStrategy::Dynamic => { - // if we have less documents to delete than the threshold we simply save them in - // the `soft_deleted_documents_ids` bitmap and early exit. + // decide to keep the soft deleted in the DB for now if they meet 2 criteria: + // 1. There is less than a fixed rate of 50% of soft-deleted to actual documents, *and* + // 2. Soft-deleted occupy an average of less than a fixed size on disk + let size_used = self.index.used_size()?; - let map_size = self.index.env.map_size()? as u64; let nb_documents = self.index.number_of_documents(self.wtxn)?; let nb_soft_deleted = soft_deleted_docids.len(); - let percentage_available = 100 - (size_used * 100 / map_size); - let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); - let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted; - let percentage_used_by_soft_deleted_documents = - estimated_size_used_by_soft_deleted * 100 / map_size; + (nb_soft_deleted < nb_documents) && { + const SOFT_DELETED_SIZE_BYTE_THRESHOLD: u64 = 1_073_741_824; // 1GiB - // if we have more than 10% of disk space available and the soft deleted - // documents uses less than 10% of the total space available, - // we skip the deletion. Eg. - // - With 100Go of disk and 20Go used including 5Go of soft-deleted documents - // We don’t delete anything. - // - With 100Go of disk and 95Go used including 1mo of soft-deleted documents - // We run the deletion. 
- // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents - // We run the deletion. - percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 + // nb_documents + nb_soft_deleted !=0 because if nb_documents is 0 we short-circuit earlier, and then we moved the documents to delete + // from the documents_docids to the soft_deleted_docids. + let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); + let estimated_size_used_by_soft_deleted = + estimated_document_size * nb_soft_deleted; + estimated_size_used_by_soft_deleted < SOFT_DELETED_SIZE_BYTE_THRESHOLD + } } DeletionStrategy::AlwaysSoft => true, DeletionStrategy::AlwaysHard => false, @@ -227,7 +222,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { }); } - // There is more than documents to delete than the threshold we needs to delete them all + // Erase soft-deleted from DB self.to_delete_docids = soft_deleted_docids; // and we can reset the soft deleted bitmap self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; From ad9937c75541a815d0ef1303a08b8014bbfaae37 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 09:47:29 +0100 Subject: [PATCH 1866/1889] Fix tests after adding DeletionStrategy --- milli/src/index.rs | 55 +++++----- milli/src/update/delete_documents.rs | 121 ++++++++++++---------- milli/src/update/facet/delete.rs | 8 +- milli/src/update/facet/mod.rs | 5 +- milli/src/update/prefix_word_pairs/mod.rs | 12 ++- 5 files changed, 109 insertions(+), 92 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 2d489fbd1..1747a45fa 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1192,8 +1192,8 @@ pub(crate) mod tests { use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, - IndexerConfig, Settings, + self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig, + IndexDocumentsMethod, IndexerConfig, Settings, }; use crate::{db_snap, obkv_to_json, Index}; @@ -1282,6 +1282,17 @@ pub(crate) mod tests { builder.execute(drop, || false)?; Ok(()) } + + pub fn delete_document(&self, external_document_id: &str) { + let mut wtxn = self.write_txn().unwrap(); + + let mut delete = DeleteDocuments::new(&mut wtxn, &self).unwrap(); + delete.strategy(self.index_documents_config.deletion_strategy); + + delete.delete_external_id(external_document_id); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + } } #[test] @@ -1487,7 +1498,9 @@ pub(crate) mod tests { use big_s::S; use maplit::hashset; - let index = TempIndex::new(); + let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + let index = index; index .update_settings(|settings| { @@ -1657,7 +1670,8 @@ pub(crate) mod tests { } // Second Batch: replace the documents with soft-deletion { - index.index_documents_config.disable_soft_deletion = false; + index.index_documents_config.deletion_strategy = + crate::update::DeletionStrategy::AlwaysSoft; let mut docs1 = vec![]; for i in 0..3 { docs1.push(serde_json::json!( @@ -1726,7 +1740,7 @@ pub(crate) mod tests { drop(rtxn); // Third Batch: replace the documents with soft-deletion again { - index.index_documents_config.disable_soft_deletion = false; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; let mut docs1 = vec![]; for i in 0..3 { 
docs1.push(serde_json::json!( @@ -1795,7 +1809,7 @@ pub(crate) mod tests { // Fourth Batch: replace the documents without soft-deletion { - index.index_documents_config.disable_soft_deletion = true; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; let mut docs1 = vec![]; for i in 0..3 { docs1.push(serde_json::json!( @@ -1867,6 +1881,7 @@ pub(crate) mod tests { fn bug_3021_first() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; index @@ -1891,11 +1906,7 @@ pub(crate) mod tests { "###); db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.delete_external_id("34"); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_document("34"); db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 2, @r###" @@ -1936,11 +1947,7 @@ pub(crate) mod tests { db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); // We do the test again, but deleting the document with id 0 instead of id 1 now - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.delete_external_id("38"); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_document("38"); db_snap!(index, documents_ids, @"[1, ]"); db_snap!(index, external_documents_ids, 5, @r###" @@ -1987,6 +1994,7 @@ pub(crate) mod tests { fn bug_3021_second() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -2011,11 +2019,7 @@ pub(crate) mod tests { "###); db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.delete_external_id("34"); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_document("34"); db_snap!(index, documents_ids, @"[0, ]"); db_snap!(index, external_documents_ids, 2, @r###" @@ -2116,6 +2120,7 @@ pub(crate) mod tests { fn bug_3021_third() { // https://github.com/meilisearch/meilisearch/issues/3021 let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; index @@ -2142,11 +2147,7 @@ pub(crate) mod tests { "###); db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.delete_external_id("3"); - delete.execute().unwrap(); - wtxn.commit().unwrap(); + index.delete_document("3"); db_snap!(index, documents_ids, @"[1, 2, ]"); db_snap!(index, external_documents_ids, 2, @r###" @@ -2158,7 +2159,7 @@ pub(crate) mod tests { "###); db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]"); - index.index_documents_config.disable_soft_deletion = true; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap(); diff --git a/milli/src/update/delete_documents.rs 
b/milli/src/update/delete_documents.rs index dbe095dd5..85bd8636a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -685,7 +685,7 @@ mod tests { wtxn: &mut RwTxn<'t, '_>, index: &'t Index, external_ids: &[&str], - disable_soft_deletion: bool, + strategy: DeletionStrategy, ) -> Vec { let external_document_ids = index.external_documents_ids(wtxn).unwrap(); let ids_to_delete: Vec = external_ids @@ -695,14 +695,14 @@ mod tests { // Delete some documents. let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); - builder.disable_soft_deletion(disable_soft_deletion); + builder.strategy(strategy); external_ids.iter().for_each(|id| drop(builder.delete_external_id(id))); builder.execute().unwrap(); ids_to_delete } - fn delete_documents_with_numbers_as_primary_key_(disable_soft_deletion: bool) { + fn delete_documents_with_numbers_as_primary_key_(deletion_strategy: DeletionStrategy) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -722,17 +722,17 @@ mod tests { builder.delete_document(0); builder.delete_document(1); builder.delete_document(2); - builder.disable_soft_deletion(disable_soft_deletion); + builder.strategy(deletion_strategy); builder.execute().unwrap(); wtxn.commit().unwrap(); // All these snapshots should be empty since the database was cleared - db_snap!(index, documents_ids, disable_soft_deletion); - db_snap!(index, word_docids, disable_soft_deletion); - db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); - db_snap!(index, facet_id_exists_docids, disable_soft_deletion); - db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, documents_ids, deletion_strategy); + db_snap!(index, word_docids, deletion_strategy); + db_snap!(index, word_pair_proximity_docids, deletion_strategy); + db_snap!(index, facet_id_exists_docids, deletion_strategy); + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); let rtxn = index.read_txn().unwrap(); @@ -741,11 +741,11 @@ mod tests { #[test] fn delete_documents_with_numbers_as_primary_key() { - delete_documents_with_numbers_as_primary_key_(true); - delete_documents_with_numbers_as_primary_key_(false); + delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysHard); + delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysSoft); } - fn delete_documents_with_strange_primary_key_(disable_soft_deletion: bool) { + fn delete_documents_with_strange_primary_key_(strategy: DeletionStrategy) { let index = TempIndex::new(); index @@ -771,24 +771,24 @@ mod tests { let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("0"); builder.delete_external_id("1"); - builder.disable_soft_deletion(disable_soft_deletion); + builder.strategy(strategy); builder.execute().unwrap(); wtxn.commit().unwrap(); - db_snap!(index, documents_ids, disable_soft_deletion); - db_snap!(index, word_docids, disable_soft_deletion); - db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); - db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, documents_ids, strategy); + db_snap!(index, word_docids, strategy); + db_snap!(index, word_pair_proximity_docids, strategy); + db_snap!(index, soft_deleted_documents_ids, strategy); } #[test] fn delete_documents_with_strange_primary_key() { - delete_documents_with_strange_primary_key_(true); - delete_documents_with_strange_primary_key_(false); + 
delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysHard); + delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysSoft); } fn filtered_placeholder_search_should_not_return_deleted_documents_( - disable_soft_deletion: bool, + deletion_strategy: DeletionStrategy, ) { let index = TempIndex::new(); @@ -832,7 +832,7 @@ mod tests { ) .unwrap(); - delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], disable_soft_deletion); + delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], deletion_strategy); // Placeholder search with filter let filter = Filter::from_str("label = sign").unwrap().unwrap(); @@ -841,21 +841,27 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); - db_snap!(index, word_docids, disable_soft_deletion); - db_snap!(index, facet_id_f64_docids, disable_soft_deletion); - db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); - db_snap!(index, facet_id_exists_docids, disable_soft_deletion); - db_snap!(index, facet_id_string_docids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); + db_snap!(index, word_docids, deletion_strategy); + db_snap!(index, facet_id_f64_docids, deletion_strategy); + db_snap!(index, word_pair_proximity_docids, deletion_strategy); + db_snap!(index, facet_id_exists_docids, deletion_strategy); + db_snap!(index, facet_id_string_docids, deletion_strategy); } #[test] fn filtered_placeholder_search_should_not_return_deleted_documents() { - filtered_placeholder_search_should_not_return_deleted_documents_(true); - filtered_placeholder_search_should_not_return_deleted_documents_(false); + filtered_placeholder_search_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysHard, + ); + filtered_placeholder_search_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysSoft, + ); } - fn placeholder_search_should_not_return_deleted_documents_(disable_soft_deletion: bool) { + fn placeholder_search_should_not_return_deleted_documents_( + deletion_strategy: DeletionStrategy, + ) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -896,8 +902,7 @@ mod tests { ) .unwrap(); - let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &["1_4"], disable_soft_deletion); + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"], deletion_strategy); // Placeholder search let results = index.search(&wtxn).execute().unwrap(); @@ -915,11 +920,11 @@ mod tests { #[test] fn placeholder_search_should_not_return_deleted_documents() { - placeholder_search_should_not_return_deleted_documents_(true); - placeholder_search_should_not_return_deleted_documents_(false); + placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); + placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); } - fn search_should_not_return_deleted_documents_(disable_soft_deletion: bool) { + fn search_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -961,7 +966,7 @@ mod tests { .unwrap(); let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &["1_7", "1_52"], disable_soft_deletion); + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); // search for abstract let results = index.search(&wtxn).query("abstract").execute().unwrap(); @@ -976,17 +981,17 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, 
soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); } #[test] fn search_should_not_return_deleted_documents() { - search_should_not_return_deleted_documents_(true); - search_should_not_return_deleted_documents_(false); + search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); + search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); } fn geo_filtered_placeholder_search_should_not_return_deleted_documents_( - disable_soft_deletion: bool, + deletion_strategy: DeletionStrategy, ) { let index = TempIndex::new(); @@ -1024,7 +1029,7 @@ mod tests { let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &external_ids_to_delete, disable_soft_deletion); + delete_documents(&mut wtxn, &index, &external_ids_to_delete, deletion_strategy); // Placeholder search with geo filter let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); @@ -1040,18 +1045,22 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); - db_snap!(index, facet_id_f64_docids, disable_soft_deletion); - db_snap!(index, facet_id_string_docids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); + db_snap!(index, facet_id_f64_docids, deletion_strategy); + db_snap!(index, facet_id_string_docids, deletion_strategy); } #[test] fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { - geo_filtered_placeholder_search_should_not_return_deleted_documents_(true); - geo_filtered_placeholder_search_should_not_return_deleted_documents_(false); + geo_filtered_placeholder_search_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysHard, + ); + geo_filtered_placeholder_search_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysSoft, + ); } - fn get_documents_should_not_return_deleted_documents_(disable_soft_deletion: bool) { + fn get_documents_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1094,7 +1103,7 @@ mod tests { let deleted_external_ids = ["1_7", "1_52"]; let deleted_internal_ids = - delete_documents(&mut wtxn, &index, &deleted_external_ids, disable_soft_deletion); + delete_documents(&mut wtxn, &index, &deleted_external_ids, deletion_strategy); // list all documents let results = index.all_documents(&wtxn).unwrap(); @@ -1125,16 +1134,16 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); } #[test] fn get_documents_should_not_return_deleted_documents() { - get_documents_should_not_return_deleted_documents_(true); - get_documents_should_not_return_deleted_documents_(false); + get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); + get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); } - fn stats_should_not_return_deleted_documents_(disable_soft_deletion: bool) { + fn stats_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1168,7 +1177,7 @@ mod tests { { "docid": "1_69", "label": ["geometry"]} ])).unwrap(); - delete_documents(&mut wtxn, &index, &["1_7", "1_52"], disable_soft_deletion); + delete_documents(&mut wtxn, &index, &["1_7", 
"1_52"], deletion_strategy); // count internal documents let results = index.number_of_documents(&wtxn).unwrap(); @@ -1182,12 +1191,12 @@ mod tests { wtxn.commit().unwrap(); - db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); } #[test] fn stats_should_not_return_deleted_documents() { - stats_should_not_return_deleted_documents_(true); - stats_should_not_return_deleted_documents_(false); + stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); + stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); } } diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs index 4030f10da..883abc8ca 100644 --- a/milli/src/update/facet/delete.rs +++ b/milli/src/update/facet/delete.rs @@ -122,7 +122,7 @@ mod tests { use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; use crate::update::facet::test_helpers::ordered_string; - use crate::update::DeleteDocuments; + use crate::update::{DeleteDocuments, DeletionStrategy}; #[test] fn delete_mixed_incremental_and_bulk() { @@ -165,7 +165,7 @@ mod tests { let mut wtxn = index.env.write_txn().unwrap(); let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.disable_soft_deletion(true); + builder.strategy(DeletionStrategy::AlwaysHard); builder.delete_documents(&RoaringBitmap::from_iter(0..100)); // by deleting the first 100 documents, we expect that: // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) @@ -224,7 +224,7 @@ mod tests { let mut wtxn = index.env.write_txn().unwrap(); let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.disable_soft_deletion(true); + builder.strategy(DeletionStrategy::AlwaysHard); builder.delete_documents(&RoaringBitmap::from_iter(0..100)); // by deleting the first 100 documents, we expect that: // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) @@ -283,7 +283,7 @@ mod tests { for docid in docids_to_delete.into_iter().take(990) { let mut wtxn = index.env.write_txn().unwrap(); let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - builder.disable_soft_deletion(true); + builder.strategy(DeletionStrategy::AlwaysHard); builder.delete_documents(&RoaringBitmap::from_iter([docid])); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index fd55204c3..e2895919f 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -463,11 +463,14 @@ mod tests { use crate::db_snap; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::update::DeletionStrategy; #[test] fn replace_all_identical_soft_deletion_then_hard_deletion() { let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); @@ -521,7 +524,7 @@ mod tests { db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); // Then replace the last document while disabling soft_deletion - index.index_documents_config.disable_soft_deletion = true; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; let mut documents = vec![]; for i in 
999..1000 { documents.push( diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 01a4de35e..49874993c 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -163,7 +163,7 @@ mod tests { use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; - use crate::update::{DeleteDocuments, IndexDocumentsMethod}; + use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod}; fn documents_with_enough_different_words_for_prefixes( prefixes: &[&str], @@ -351,7 +351,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.disable_soft_deletion(true); + delete.strategy(DeletionStrategy::AlwaysHard); delete.delete_documents(&RoaringBitmap::from_iter([50])); delete.execute().unwrap(); wtxn.commit().unwrap(); @@ -363,7 +363,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - delete.disable_soft_deletion(true); + delete.strategy(DeletionStrategy::AlwaysHard); delete.delete_documents(&RoaringBitmap::from_iter(0..50)); delete.execute().unwrap(); wtxn.commit().unwrap(); @@ -435,6 +435,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.strategy(DeletionStrategy::AlwaysSoft); delete.delete_documents(&RoaringBitmap::from_iter([50])); delete.execute().unwrap(); wtxn.commit().unwrap(); @@ -446,6 +447,8 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.strategy(DeletionStrategy::AlwaysSoft); + delete.delete_documents(&RoaringBitmap::from_iter(0..50)); delete.execute().unwrap(); wtxn.commit().unwrap(); @@ -471,6 +474,7 @@ mod tests { let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; index .update_settings(|settings| { @@ -530,7 +534,7 @@ mod tests { fn replace_hard_deletion() { let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.disable_soft_deletion = true; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; index From 916c23e7be72e8b4dedcc4025bbb03b172fec311 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 09:36:56 +0100 Subject: [PATCH 1867/1889] Tests: rename snapshots --- .../{false => always_hard}/documents_ids.snap | 0 .../{false => always_hard}/facet_id_exists_docids.snap | 0 .../{false => always_hard}/soft_deleted_documents_ids.snap | 0 .../{false => always_hard}/word_docids.snap | 0 .../{false => always_hard}/word_pair_proximity_docids.snap | 0 .../{true => always_soft}/documents_ids.snap | 0 .../{true => always_soft}/facet_id_exists_docids.snap | 0 .../{true => always_soft}/soft_deleted_documents_ids.snap | 0 .../{true => always_soft}/word_docids.snap | 0 .../{true => always_soft}/word_pair_proximity_docids.snap | 0 .../{false => always_hard}/documents_ids.snap | 0 .../{true => always_hard}/soft_deleted_documents_ids.snap | 0 .../{true => always_hard}/word_docids.snap | 0 .../{false => 
always_hard}/word_pair_proximity_docids.snap | 0 .../{true => always_soft}/documents_ids.snap | 0 .../{false => always_soft}/soft_deleted_documents_ids.snap | 0 .../{false => always_soft}/word_docids.snap | 0 .../{true => always_soft}/word_pair_proximity_docids.snap | 0 .../{true => always_hard}/facet_id_exists_docids.snap | 0 .../{true => always_hard}/facet_id_f64_docids.snap | 0 .../{true => always_hard}/facet_id_string_docids.snap | 0 .../{true => always_hard}/soft_deleted_documents_ids.snap | 0 .../{true => always_hard}/word_docids.snap | 0 .../{false => always_hard}/word_pair_proximity_docids.snap | 0 .../{false => always_soft}/facet_id_exists_docids.snap | 0 .../{false => always_soft}/facet_id_f64_docids.snap | 0 .../{false => always_soft}/facet_id_string_docids.snap | 0 .../{false => always_soft}/soft_deleted_documents_ids.snap | 0 .../{false => always_soft}/word_docids.snap | 0 .../{true => always_soft}/word_pair_proximity_docids.snap | 0 .../{true => always_hard}/facet_id_f64_docids.snap | 0 .../{false => always_hard}/facet_id_string_docids.snap | 0 .../{true => always_hard}/soft_deleted_documents_ids.snap | 0 .../{false => always_soft}/facet_id_f64_docids.snap | 0 .../{true => always_soft}/facet_id_string_docids.snap | 0 .../{false => always_soft}/soft_deleted_documents_ids.snap | 0 .../{true => always_hard}/soft_deleted_documents_ids.snap | 0 .../{false => always_soft}/soft_deleted_documents_ids.snap | 0 .../{true => always_hard}/soft_deleted_documents_ids.snap | 0 .../{false => always_soft}/soft_deleted_documents_ids.snap | 0 .../{true => always_hard}/soft_deleted_documents_ids.snap | 0 .../{false => always_soft}/soft_deleted_documents_ids.snap | 0 42 files changed, 0 insertions(+), 0 deletions(-) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{false => always_hard}/documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{false => always_hard}/facet_id_exists_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{false => always_hard}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{false => always_hard}/word_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{false => always_hard}/word_pair_proximity_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{true => always_soft}/documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{true => always_soft}/facet_id_exists_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{true => always_soft}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{true => always_soft}/word_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/{true => always_soft}/word_pair_proximity_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/{false => always_hard}/documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/{true => 
always_hard}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/{true => always_hard}/word_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/{false => always_hard}/word_pair_proximity_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/{true => always_soft}/documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/{false => always_soft}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/{false => always_soft}/word_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/{true => always_soft}/word_pair_proximity_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{true => always_hard}/facet_id_exists_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{true => always_hard}/facet_id_f64_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{true => always_hard}/facet_id_string_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{true => always_hard}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{true => always_hard}/word_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{false => always_hard}/word_pair_proximity_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{false => always_soft}/facet_id_exists_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{false => always_soft}/facet_id_f64_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{false => always_soft}/facet_id_string_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{false => always_soft}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{false => always_soft}/word_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/{true => always_soft}/word_pair_proximity_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/{true => always_hard}/facet_id_f64_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/{false => always_hard}/facet_id_string_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/{true => 
always_hard}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/{false => always_soft}/facet_id_f64_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/{true => always_soft}/facet_id_string_docids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/{false => always_soft}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/{true => always_hard}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/{false => always_soft}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/{true => always_hard}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/{false => always_soft}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/{true => always_hard}/soft_deleted_documents_ids.snap (100%) rename milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/{false => always_soft}/soft_deleted_documents_ids.snap (100%) diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap similarity index 100% rename from 
milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap rename to 
milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap diff --git 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap rename to 
milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap similarity index 100% rename from 
milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap rename to milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap diff --git 
a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap similarity index 100% rename from milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap rename to milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap From c72535531bbe9320610de5e9de679289cbe375f7 Mon Sep 17 00:00:00 2001 From: curquiza Date: Mon, 19 Dec 2022 16:35:38 +0000 Subject: [PATCH 1868/1889] Update version for the next release (v0.38.0) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index fdef63729..be3544bc0 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.37.0" +version = "0.38.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 0148ac87e..26c8ff681 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.37.0" +version = "0.38.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index d50563ec0..95a9b0062 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.37.0" +version = 
"0.38.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 972cfc3a7..909c2702c 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.37.0" +version = "0.38.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index dc36c1a3f..45b6b6ec9 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.37.0" +version = "0.38.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 60d45730c..f5277a8fa 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.37.0" +version = "0.38.0" authors = ["Kerollmops "] edition = "2018" From 69edbf9f6d1d5a33779e08ea529fa67c3832015e Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 19 Dec 2022 18:23:50 +0100 Subject: [PATCH 1869/1889] Update milli/src/update/delete_documents.rs --- milli/src/update/delete_documents.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 85bd8636a..e442117d0 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -48,7 +48,7 @@ pub struct DocumentDeletionResult { #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] pub enum DeletionStrategy { #[default] - /// Definitely suppress documents according to the number of size of soft-deleted documents + /// Definitely suppress documents according to the number or size of soft-deleted documents Dynamic, /// Never definitely suppress documents AlwaysSoft, From fc0e7382fe5f951ce43a643c489aa4159be606f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 20 Dec 2022 10:37:50 +0100 Subject: [PATCH 1870/1889] Fix hard-deletion of an external id that was soft-deleted --- milli/src/external_documents_ids.rs | 97 ---------------------------- milli/src/index.rs | 96 ++++++++++++++++++++++++++- milli/src/update/delete_documents.rs | 77 ++++++---------------- 3 files changed, 116 insertions(+), 154 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 64b294541..2cecd1abe 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -47,30 +47,6 @@ impl<'a> ExternalDocumentsIds<'a> { } } - pub fn delete_ids>(&mut self, other: fst::Set) -> fst::Result<()> { - let other = fst::Map::from(other.into_fst()); - let union_op = self.soft.op().add(&other).r#union(); - - let mut iter = union_op.into_stream(); - let mut new_soft_builder = fst::MapBuilder::memory(); - while let Some((external_id, docids)) = iter.next() { - if docids.iter().any(|v| v.index == 1) { - // If the `other` set returns a value here it means - // that it must be marked as deleted. - new_soft_builder.insert(external_id, DELETED_ID)?; - } else { - let value = docids.iter().find(|v| v.index == 0).unwrap().value; - new_soft_builder.insert(external_id, value)?; - } - } - - drop(iter); - - // We save this new map as the new soft map. 
- self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; - self.merge_soft_into_hard() - } - /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they /// don't contain any soft deleted document id. pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { @@ -173,76 +149,3 @@ impl Default for ExternalDocumentsIds<'static> { fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> { indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_insert_delete_ids() { - let mut external_documents_ids = ExternalDocumentsIds::default(); - - let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - - assert_eq!(external_documents_ids.get("a"), Some(1)); - assert_eq!(external_documents_ids.get("b"), Some(2)); - assert_eq!(external_documents_ids.get("c"), Some(3)); - assert_eq!(external_documents_ids.get("d"), Some(4)); - - let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - - assert_eq!(external_documents_ids.get("a"), Some(1)); - assert_eq!(external_documents_ids.get("b"), Some(2)); - assert_eq!(external_documents_ids.get("c"), Some(3)); - assert_eq!(external_documents_ids.get("d"), Some(4)); - assert_eq!(external_documents_ids.get("e"), Some(5)); - assert_eq!(external_documents_ids.get("f"), Some(6)); - assert_eq!(external_documents_ids.get("g"), Some(7)); - - let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap(); - external_documents_ids.delete_ids(del_ids).unwrap(); - - assert_eq!(external_documents_ids.get("a"), None); - assert_eq!(external_documents_ids.get("b"), Some(2)); - assert_eq!(external_documents_ids.get("c"), None); - assert_eq!(external_documents_ids.get("d"), Some(4)); - assert_eq!(external_documents_ids.get("e"), Some(5)); - assert_eq!(external_documents_ids.get("f"), None); - assert_eq!(external_documents_ids.get("g"), Some(7)); - - let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - - assert_eq!(external_documents_ids.get("a"), Some(5)); - assert_eq!(external_documents_ids.get("b"), Some(6)); - assert_eq!(external_documents_ids.get("c"), None); - assert_eq!(external_documents_ids.get("d"), Some(4)); - assert_eq!(external_documents_ids.get("e"), Some(5)); - assert_eq!(external_documents_ids.get("f"), None); - assert_eq!(external_documents_ids.get("g"), Some(7)); - assert_eq!(external_documents_ids.get("h"), Some(8)); - } - - #[test] - fn strange_delete_insert_ids() { - let mut external_documents_ids = ExternalDocumentsIds::default(); - - let new_ids = - fst::Map::from_iter(vec![("1", 0), ("123", 1), ("30", 2), ("456", 3)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - assert_eq!(external_documents_ids.get("1"), Some(0)); - assert_eq!(external_documents_ids.get("123"), Some(1)); - assert_eq!(external_documents_ids.get("30"), Some(2)); - assert_eq!(external_documents_ids.get("456"), Some(3)); - - let deleted_ids = fst::Set::from_iter(vec!["30"]).unwrap(); - external_documents_ids.delete_ids(deleted_ids).unwrap(); - assert_eq!(external_documents_ids.get("30"), None); - - let new_ids = fst::Map::from_iter(vec![("30", 2)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - assert_eq!(external_documents_ids.get("30"), Some(2)); -
} -} diff --git a/milli/src/index.rs b/milli/src/index.rs index 1747a45fa..50a4e909f 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1181,6 +1181,7 @@ impl Index { #[cfg(test)] pub(crate) mod tests { + use std::collections::HashSet; use std::ops::Deref; use big_s::S; @@ -1195,7 +1196,7 @@ pub(crate) mod tests { self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; - use crate::{db_snap, obkv_to_json, Index}; + use crate::{db_snap, obkv_to_json, Index, Search, SearchResult}; pub(crate) struct TempIndex { pub inner: Index, @@ -2188,4 +2189,97 @@ pub(crate) mod tests { "###); db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); } + + #[test] + fn bug_3021_fourth() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 11 }, + { "primary_key": 4 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 11 0 + 4 1 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + + index + .add_documents(documents!([ + { "primary_key": 4, "a": 0 }, + { "primary_key": 1 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 3 + 11 0 + 4 2 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[1, ]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.strategy(DeletionStrategy::AlwaysHard); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 3 + 11 0 + 4 2 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + + index + .add_documents(documents!([ + { "primary_key": 4, "a": 1 }, + { "primary_key": 1, "a": 0 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 4, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 4 + 11 0 + 4 1 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]"); + + let rtxn = index.read_txn().unwrap(); + let search = Search::new(&rtxn, &index); + let SearchResult { matching_words: _, candidates: _, mut documents_ids } = + search.execute().unwrap(); + let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap(); + documents_ids.sort_unstable(); + let docs = index.documents(&rtxn, documents_ids).unwrap(); + let mut all_ids = HashSet::new(); + for (_docid, obkv) in docs { + let id = obkv.get(primary_key_id).unwrap(); + assert!(all_ids.insert(id)); + } + } } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e442117d0..635ce85be 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -6,16 +6,14 @@ use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; -use serde_json::Value; use time::OffsetDateTime; use super::facet::delete::FacetsDelete; use super::ClearDocuments; -use crate::error::{InternalError, UserError}; 
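// [Sketch] For orientation before the detailed hunks below: this is how a
// caller drives the deletion API once this patch series is applied. Every call
// shown here appears verbatim elsewhere in these diffs; the external id "34"
// is just an illustrative value.
let mut wtxn = index.write_txn().unwrap();
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
// AlwaysSoft only records the ids in `soft_deleted_documents_ids`, AlwaysHard
// purges them from every database, and Dynamic (the default) decides based on
// the number or size of soft-deleted documents.
builder.strategy(DeletionStrategy::AlwaysHard);
builder.delete_external_id("34");
builder.execute().unwrap();
wtxn.commit().unwrap();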
+use crate::error::InternalError; use crate::facet::FacetType; use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::CboRoaringBitmapCodec; -use crate::index::{db_name, main_key}; use crate::{ ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, @@ -186,6 +184,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { soft_deleted_docids |= &self.to_delete_docids; + // We always soft-delete the documents, even if they will be permanently + // deleted immediately after. + self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; + // decide for a hard or soft deletion depending on the strategy let soft_deletion = match self.strategy { DeletionStrategy::Dynamic => { @@ -214,7 +216,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { if soft_deletion { // Keep the soft-deleted in the DB - self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; return Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), @@ -222,23 +223,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { }); } - // Erase soft-deleted from DB self.to_delete_docids = soft_deleted_docids; - // and we can reset the soft deleted bitmap - self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; - - let primary_key = - self.index.primary_key(self.wtxn)?.ok_or(InternalError::DatabaseMissingEntry { - db_name: db_name::MAIN, - key: Some(main_key::PRIMARY_KEY_KEY), - })?; - - // Since we already checked if the DB was empty, if we can't find the primary key, then - // something is wrong, and we must return an error. - let id_field = match fields_ids_map.id(primary_key) { - Some(field) => field, - None => return Err(UserError::MissingPrimaryKey.into()), - }; let Index { env: _env, @@ -262,33 +247,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { documents, } = self.index; - // Retrieve the words and the external documents ids contained in the documents. + // Retrieve the words contained in the documents. let mut words = Vec::new(); - let mut external_ids = Vec::new(); for docid in &self.to_delete_docids { - // We create an iterator to be able to get the content and delete the document - // content itself. It's faster to acquire a cursor to get and delete, - // as we avoid traversing the LMDB B-Tree two times but only once. - let key = BEU32::new(docid); - let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; - if let Some((_key, obkv)) = iter.next().transpose()? { - if let Some(content) = obkv.get(id_field) { - let external_id = match serde_json::from_slice(content).unwrap() { - Value::String(string) => SmallString32::from(string.as_str()), - Value::Number(number) => SmallString32::from(number.to_string()), - document_id => { - return Err(UserError::InvalidDocumentId { document_id }.into()) - } - }; - external_ids.push(external_id); - } - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } - drop(iter); + documents.delete(self.wtxn, &BEU32::new(docid))?; - // We iterate through the words positions of the document id, - // retrieve the word and delete the positions. + // We iterate through the words positions of the document id, retrieve the word and delete the positions. + // We create an iterator to be able to get the content and delete the key-value itself. 
+ // It's faster to acquire a cursor to get and delete, as we traverse the LMDB B-Tree only once instead of twice. let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?; while let Some(result) = iter.next() { let ((_docid, word), _positions) = result?; @@ -298,17 +264,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { unsafe { iter.del_current()? }; } } - - // We create the FST map of the external ids that we must delete. - external_ids.sort_unstable(); - let external_ids_to_delete = fst::Set::from_iter(external_ids)?; - // We acquire the current external documents ids map... + // Note that its soft-deleted document ids field will be equal to the `to_delete_docids` let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - // ...and remove the to-delete external ids. - new_external_documents_ids.delete_ids(external_ids_to_delete)?; - - // We write the new external ids into the main database. + // We then remove the soft-deleted docids from it + new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + // and write it back to the main database. let new_external_documents_ids = new_external_documents_ids.into_static(); self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; @@ -545,6 +506,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &self.to_delete_docids, )?; + self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; + Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), @@ -1125,14 +1088,16 @@ mod tests { id ); } + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); // get internal docids from deleted external document ids - let results = index.external_documents_ids(&wtxn).unwrap(); + let results = index.external_documents_ids(&rtxn).unwrap(); for id in deleted_external_ids { assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); } - - wtxn.commit().unwrap(); + drop(rtxn); db_snap!(index, soft_deleted_documents_ids, deletion_strategy); } From 229405aeb927c1fd700cb4cb091d849ee4c8e689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 12 Dec 2022 16:54:31 +0100 Subject: [PATCH 1871/1889] Choose implementation strategy of criterion at runtime --- cli/src/main.rs | 4 +- milli/src/lib.rs | 5 +- milli/src/search/criteria/asc_desc.rs | 100 +++++++++++++++++-------- milli/src/search/criteria/attribute.rs | 22 +++++- milli/src/search/criteria/mod.rs | 32 ++++++-- milli/src/search/criteria/proximity.rs | 21 +++++- milli/src/search/mod.rs | 22 ++++++ 7 files changed, 156 insertions(+), 50 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index dd5489ebc..6b7f2078b 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -13,7 +13,7 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; -use milli::{heed, Index, Object}; +use milli::{heed, CriterionImplementationStrategy, Index, Object}; use structopt::StructOpt; #[global_allocator] @@ -441,7 +441,7 @@ impl Search { if let Some(limit) = limit { search.limit(*limit); } - + search.criterion_implementation_strategy(CriterionImplementationStrategy::OnlyIterative); let result = search.execute()?; let fields_ids_map = index.fields_ids_map(&txn)?; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index
b17be8f1f..865195df5 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -42,8 +42,9 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, - MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, + CriterionImplementationStrategy, FacetDistribution, Filter, FormatOptions, MatchBounds, + MatcherBuilder, MatchingWord, MatchingWords, Search, SearchResult, TermsMatchingStrategy, + DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6b2199b28..c0fdbada3 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -12,6 +12,7 @@ use crate::heed_codec::ByteSliceRefCodec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::search::query_tree::Operation; +use crate::search::CriterionImplementationStrategy; use crate::{FieldId, Index, Result}; /// Threshold on the number of candidates that will make @@ -29,6 +30,7 @@ pub struct AscDesc<'t> { allowed_candidates: RoaringBitmap, initial_candidates: InitialCandidates, faceted_candidates: RoaringBitmap, + implementation_strategy: CriterionImplementationStrategy, parent: Box, } @@ -38,8 +40,9 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, + implementation_strategy: CriterionImplementationStrategy, ) -> Result { - Self::new(index, rtxn, parent, field_name, true) + Self::new(index, rtxn, parent, field_name, true, implementation_strategy) } pub fn desc( @@ -47,8 +50,9 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, + implementation_strategy: CriterionImplementationStrategy, ) -> Result { - Self::new(index, rtxn, parent, field_name, false) + Self::new(index, rtxn, parent, field_name, false, implementation_strategy) } fn new( @@ -57,6 +61,7 @@ impl<'t> AscDesc<'t> { parent: Box, field_name: String, is_ascending: bool, + implementation_strategy: CriterionImplementationStrategy, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map.id(&field_name); @@ -82,6 +87,7 @@ impl<'t> AscDesc<'t> { allowed_candidates: RoaringBitmap::new(), faceted_candidates, initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), + implementation_strategy, parent, }) } @@ -149,6 +155,7 @@ impl<'t> Criterion for AscDesc<'t> { field_id, self.is_ascending, candidates & &self.faceted_candidates, + self.implementation_strategy, )?, None => Box::new(std::iter::empty()), }; @@ -170,6 +177,51 @@ impl<'t> Criterion for AscDesc<'t> { } } +fn facet_ordered_iterative<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let number_iter = iterative_facet_number_ordered_iter( + index, + rtxn, + field_id, + is_ascending, + candidates.clone(), + )?; + let string_iter = + iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; + Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) +} + +fn facet_ordered_set_based<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let make_iter = if is_ascending { ascending_facet_sort } else { 
descending_facet_sort }; + + let number_iter = make_iter( + rtxn, + index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates.clone(), + )?; + + let string_iter = make_iter( + rtxn, + index.facet_id_string_docids.remap_key_type::>(), + field_id, + candidates, + )?; + + Ok(Box::new(number_iter.chain(string_iter))) +} + /// Returns an iterator over groups of the given candidates in ascending or descending order. /// /// It will either use an iterative or a recursive method on the whole facet database depending @@ -180,36 +232,22 @@ fn facet_ordered<'t>( field_id: FieldId, is_ascending: bool, candidates: RoaringBitmap, + implementation_strategy: CriterionImplementationStrategy, ) -> Result> + 't>> { - if candidates.len() <= CANDIDATES_THRESHOLD { - let number_iter = iterative_facet_number_ordered_iter( - index, - rtxn, - field_id, - is_ascending, - candidates.clone(), - )?; - let string_iter = - iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; - Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) - } else { - let make_iter = if is_ascending { ascending_facet_sort } else { descending_facet_sort }; - - let number_iter = make_iter( - rtxn, - index.facet_id_f64_docids.remap_key_type::>(), - field_id, - candidates.clone(), - )?; - - let string_iter = make_iter( - rtxn, - index.facet_id_string_docids.remap_key_type::>(), - field_id, - candidates, - )?; - - Ok(Box::new(number_iter.chain(string_iter))) + match implementation_strategy { + CriterionImplementationStrategy::OnlyIterative => { + facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) + } + CriterionImplementationStrategy::OnlySetBased => { + facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) + } + CriterionImplementationStrategy::Dynamic => { + if candidates.len() <= CANDIDATES_THRESHOLD { + facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) + } else { + facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) + } + } } } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 9da868e1a..d7ec0d382 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -9,7 +9,9 @@ use roaring::RoaringBitmap; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{InitialCandidates, Query}; use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; +use crate::search::{ + build_dfa, word_derivations, CriterionImplementationStrategy, WordDerivationsCache, +}; use crate::Result; /// To be able to divide integers by the number of words in the query @@ -30,10 +32,15 @@ pub struct Attribute<'t> { parent: Box, linear_buckets: Option>, set_buckets: Option>>, + implementation_strategy: CriterionImplementationStrategy, } impl<'t> Attribute<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + implementation_strategy: CriterionImplementationStrategy, + ) -> Self { Attribute { ctx, state: None, @@ -41,6 +48,7 @@ impl<'t> Attribute<'t> { parent, linear_buckets: None, set_buckets: None, + implementation_strategy, } } } @@ -64,7 +72,15 @@ impl<'t> Criterion for Attribute<'t> { })); } Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { - let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { + let 
found_candidates = if matches!( + self.implementation_strategy, + CriterionImplementationStrategy::OnlyIterative + ) || (matches!( + self.implementation_strategy, + CriterionImplementationStrategy::Dynamic + ) && allowed_candidates.len() + < CANDIDATES_THRESHOLD) + { let linear_buckets = match self.linear_buckets.as_mut() { Some(linear_buckets) => linear_buckets, None => { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index eb83f5515..26d1e243f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,6 +14,7 @@ use self::r#final::Final; use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; +use super::CriterionImplementationStrategy; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, Distinct, WordDerivationsCache}; use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; @@ -377,6 +378,7 @@ impl<'t> CriteriaBuilder<'t> { sort_criteria: Option>, exhaustive_number_hits: bool, distinct: Option, + implementation_strategy: CriterionImplementationStrategy, ) -> Result> { use crate::criterion::Criterion as Name; @@ -402,12 +404,14 @@ impl<'t> CriteriaBuilder<'t> { self.rtxn, criterion, field.to_string(), + implementation_strategy, )?), AscDescName::Desc(Member::Field(field)) => Box::new(AscDesc::desc( self.index, self.rtxn, criterion, field.to_string(), + implementation_strategy, )?), AscDescName::Asc(Member::Geo(point)) => { Box::new(Geo::asc(self.index, self.rtxn, criterion, *point)?) @@ -421,15 +425,27 @@ impl<'t> CriteriaBuilder<'t> { } None => criterion, }, - Name::Proximity => Box::new(Proximity::new(self, criterion)), - Name::Attribute => Box::new(Attribute::new(self, criterion)), + Name::Proximity => { + Box::new(Proximity::new(self, criterion, implementation_strategy)) + } + Name::Attribute => { + Box::new(Attribute::new(self, criterion, implementation_strategy)) + } Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), - Name::Asc(field) => { - Box::new(AscDesc::asc(self.index, self.rtxn, criterion, field)?) - } - Name::Desc(field) => { - Box::new(AscDesc::desc(self.index, self.rtxn, criterion, field)?) 
- } + Name::Asc(field) => Box::new(AscDesc::asc( + self.index, + self.rtxn, + criterion, + field, + implementation_strategy, + )?), + Name::Desc(field) => Box::new(AscDesc::desc( + self.index, + self.rtxn, + criterion, + field, + implementation_strategy, + )?), }; } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index d44ba25dd..2072d0133 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -11,7 +11,7 @@ use super::{ }; use crate::search::criteria::InitialCandidates; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; -use crate::search::{build_dfa, WordDerivationsCache}; +use crate::search::{build_dfa, CriterionImplementationStrategy, WordDerivationsCache}; use crate::{Position, Result}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; @@ -33,10 +33,15 @@ pub struct Proximity<'t> { parent: Box, candidates_cache: Cache, plane_sweep_cache: Option>, + implementation_strategy: CriterionImplementationStrategy, } impl<'t> Proximity<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + implementation_strategy: CriterionImplementationStrategy, + ) -> Self { Proximity { ctx, state: None, @@ -45,6 +50,7 @@ impl<'t> Proximity<'t> { parent, candidates_cache: Cache::new(), plane_sweep_cache: None, + implementation_strategy, } } } @@ -72,8 +78,15 @@ impl<'t> Criterion for Proximity<'t> { self.state = None; // reset state } Some((_, query_tree, allowed_candidates)) => { - let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD - && self.proximity > PROXIMITY_THRESHOLD + let mut new_candidates = if matches!( + self.implementation_strategy, + CriterionImplementationStrategy::OnlyIterative + ) || (matches!( + self.implementation_strategy, + CriterionImplementationStrategy::Dynamic + ) && allowed_candidates.len() + <= CANDIDATES_THRESHOLD + && self.proximity > PROXIMITY_THRESHOLD) { if let Some(cache) = self.plane_sweep_cache.as_mut() { match cache.next() { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 96cf1e0f1..df59634bb 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -49,6 +49,7 @@ pub struct Search<'a> { authorize_typos: bool, words_limit: usize, exhaustive_number_hits: bool, + criterion_implementation_strategy: CriterionImplementationStrategy, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -65,6 +66,7 @@ impl<'a> Search<'a> { authorize_typos: true, exhaustive_number_hits: false, words_limit: 10, + criterion_implementation_strategy: CriterionImplementationStrategy::default(), rtxn, index, } @@ -117,6 +119,14 @@ impl<'a> Search<'a> { self } + pub fn criterion_implementation_strategy( + &mut self, + strategy: CriterionImplementationStrategy, + ) -> &mut Search<'a> { + self.criterion_implementation_strategy = strategy; + self + } + fn is_typo_authorized(&self) -> Result { let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; // only authorize typos if both the index and the query allow it. 
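// A minimal usage sketch of the strategy setter above, assuming an open `Index` named
// `index` and a read transaction `rtxn`, as in this crate's tests; any of the three
// variants defined later in this file can be passed:
//
//     let mut search = Search::new(&rtxn, &index);
//     search.query("zero config");
//     search.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased);
//     let SearchResult { documents_ids, .. } = search.execute()?;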
@@ -204,6 +214,7 @@ impl<'a> Search<'a> { self.sort_criteria.clone(), self.exhaustive_number_hits, None, + self.criterion_implementation_strategy, )?; self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria) } @@ -220,6 +231,7 @@ impl<'a> Search<'a> { self.sort_criteria.clone(), self.exhaustive_number_hits, Some(distinct.clone()), + self.criterion_implementation_strategy, )?; self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) } @@ -288,6 +300,7 @@ impl fmt::Debug for Search<'_> { authorize_typos, words_limit, exhaustive_number_hits, + criterion_implementation_strategy, rtxn: _, index: _, } = self; @@ -300,6 +313,7 @@ impl fmt::Debug for Search<'_> { .field("terms_matching_strategy", terms_matching_strategy) .field("authorize_typos", authorize_typos) .field("exhaustive_number_hits", exhaustive_number_hits) + .field("criterion_implementation_strategy", criterion_implementation_strategy) .field("words_limit", words_limit) .finish() } @@ -313,6 +327,14 @@ pub struct SearchResult { pub documents_ids: Vec, } +#[derive(Debug, Default, Clone, Copy)] +pub enum CriterionImplementationStrategy { + OnlyIterative, + OnlySetBased, + #[default] + Dynamic, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum TermsMatchingStrategy { // remove last word first From 904fd2f6d1b162dab5a20505b1bd23377ed708d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Dec 2022 11:54:12 +0100 Subject: [PATCH 1872/1889] Add a search strategy option to the cli --- cli/src/main.rs | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 6b7f2078b..c28d3de59 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,4 +1,5 @@ use std::collections::BTreeMap; +use std::fmt::Display; use std::fs::File; use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; use std::path::PathBuf; @@ -349,6 +350,29 @@ fn documents_from_csv(reader: impl Read) -> Result> { documents.into_inner().map_err(Into::into) } +#[derive(Debug, Clone, Copy)] +struct SearchStrategyOption(CriterionImplementationStrategy); +impl FromStr for SearchStrategyOption { + type Err = String; + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "dynamic" => Ok(SearchStrategyOption(CriterionImplementationStrategy::Dynamic)), + "set" => Ok(SearchStrategyOption(CriterionImplementationStrategy::OnlySetBased)), + "iterative" => Ok(SearchStrategyOption(CriterionImplementationStrategy::OnlyIterative)), + _ => Err(format!("could not parse {s} as a criterion implementation strategy, available options are `dynamic`, `set`, and `iterative`")), + } + } +} +impl Display for SearchStrategyOption { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.0 { + CriterionImplementationStrategy::OnlyIterative => Display::fmt("iterative", f), + CriterionImplementationStrategy::OnlySetBased => Display::fmt("set", f), + CriterionImplementationStrategy::Dynamic => Display::fmt("dynamic", f), + } + } +} + #[derive(Debug, StructOpt)] struct Search { query: Option, #[structopt(short, long)] filter: Option, #[structopt(short, long)] offset: Option, #[structopt(short, long)] limit: Option, #[structopt(short, long, conflicts_with = "query")] interactive: bool, + #[structopt(short, long)] + strategy: Option, } impl Performer for Search { @@ -379,6 +405,7 @@ impl Performer for Search { &self.filter, &self.offset, &self.limit, + &self.strategy, )?; let time = now.elapsed(); @@ -386,6 +413,7 @@ impl Performer for Search { let hits =
serde_json::to_string_pretty(&jsons)?; println!("{}", hits); + eprintln!("found {} results in {:.02?}", jsons.len(), time); } _ => break, @@ -399,6 +427,7 @@ impl Performer for Search { &self.filter, &self.offset, &self.limit, + &self.strategy, )?; let time = now.elapsed(); @@ -420,6 +449,7 @@ impl Search { filter: &Option, offset: &Option, limit: &Option, + strategy: &Option, ) -> Result> { let txn = index.read_txn()?; let mut search = index.search(&txn); @@ -441,7 +471,10 @@ impl Search { if let Some(limit) = limit { search.limit(*limit); } - search.criterion_implementation_strategy(CriterionImplementationStrategy::OnlyIterative); + if let Some(strategy) = strategy { + search.criterion_implementation_strategy(strategy.0); + } + let result = search.execute()?; let fields_ids_map = index.fields_ids_map(&txn)?; From 339a4b0789f9873d00c093bbd826a554625369db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 21 Dec 2022 12:49:34 +0100 Subject: [PATCH 1873/1889] Make clippy happy --- milli/src/search/criteria/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 26d1e243f..98b9d928e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -370,6 +370,7 @@ impl<'t> CriteriaBuilder<'t> { Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) } + #[allow(clippy::too_many_arguments)] pub fn build( &'t self, query_tree: Option, From 13c95d25aa94abd33cfab9dedd06e59fe6637505 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 15:59:22 +0100 Subject: [PATCH 1874/1889] Remove uses of UserError::MissingPrimaryKey not related to inference --- milli/src/update/index_documents/enrich.rs | 6 +++++- milli/src/update/index_documents/transform.rs | 17 +++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 7eda5dca4..8874b836c 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -21,6 +21,10 @@ const DEFAULT_PRIMARY_KEY: &str = "id"; /// - all the document ids exist and are extracted, /// - their validity, but also, /// - the validity of the `_geo` field depending on the settings.
+/// +/// # Panics +/// +/// - if `reader.is_empty()`, this function may panic in some cases pub fn enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, @@ -49,7 +53,7 @@ pub fn enrich_documents_batch( primary_key: primary_key.to_string(), document: obkv_to_object(&first_document, &documents_batch_index)?, })), - None => Ok(Err(UserError::MissingPrimaryKey)), + None => unreachable!("Called with reader.is_empty()"), }; } }, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f414569b9..68ef2b7ee 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -16,7 +16,7 @@ use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; -use crate::index::db_name; +use crate::index::{db_name, main_key}; use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, @@ -459,7 +459,10 @@ impl<'a, 'i> Transform<'a, 'i> { let primary_key = self .index .primary_key(wtxn)? - .ok_or(Error::UserError(UserError::MissingPrimaryKey))? + .ok_or(Error::InternalError(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + }))? .to_string(); let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; @@ -557,8 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> { mut new_fields_ids_map: FieldsIdsMap, ) -> Result { // There has already been a document addition, so the primary key should be set by now. - let primary_key = - self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string(); + let primary_key = self + .index + .primary_key(wtxn)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + })? + .to_string(); let field_distribution = self.index.field_distribution(wtxn)?; // Delete the soft deleted document ids from the maps inside the external_document_ids structure From 402dcd6b2fb8f21c89d3eac8d94297c3023af4b3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 19 Dec 2022 17:37:44 +0100 Subject: [PATCH 1875/1889] Simplify primary key inference --- milli/src/error.rs | 6 ++-- milli/src/update/index_documents/enrich.rs | 33 +++++++++++++++++----- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index bd691ab1d..0b8649067 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -130,8 +130,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MissingDocumentId { primary_key: String, document: Object }, #[error("Document has too many matching `{}` attributes: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] TooManyDocumentIds { primary_key: String, document: Object }, #[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name.
Please specify the primary key manually using the `primaryKey` query parameter.")] + NoPrimaryKeyCandidateFound, + #[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())] + MultiplePrimaryKeyCandidatesFound { candidates: Vec }, #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] NoSpaceLeftOnDevice, #[error("Index already has a primary key: `{0}`.")] diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 8874b836c..0d6a8dcbf 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -58,17 +58,36 @@ pub fn enrich_documents_batch( } }, None => { - let guessed = documents_batch_index + let mut guesses: Vec<(u16, &str)> = documents_batch_index .iter() - .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY)) - .min_by_key(|(fid, _)| *fid); - match guessed { - Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), - None if autogenerate_docids => PrimaryKey::flat( + .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) + .map(|(field_id, name)| (*field_id, name.as_str())) + .collect(); + + // sort the keys in a deterministic, obvious way, so that fields are always in the same order. + guesses.sort_by(|(_, left_name), (_, right_name)| { + // shortest name first + left_name.len().cmp(&right_name.len()).then_with( + // then alphabetical order + || left_name.cmp(right_name), + ) + }); + + match guesses.as_slice() { + [] if autogenerate_docids => PrimaryKey::flat( DEFAULT_PRIMARY_KEY, documents_batch_index.insert(DEFAULT_PRIMARY_KEY), ), - None => return Ok(Err(UserError::MissingPrimaryKey)), + [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + [(field_id, name)] => PrimaryKey::flat(name, *field_id), + multiple => { + return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { + candidates: multiple + .iter() + .map(|(_, candidate)| candidate.to_string()) + .collect(), + })); + } } } }; From b24def328180af74daf9b2a562fcdb24b224f873 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 21 Dec 2022 10:20:39 +0100 Subject: [PATCH 1876/1889] Add logging when inference took place. Displays log message in the form: ``` [2022-12-21T09:19:42Z INFO milli::update::index_documents::enrich] Primary key was not specified in index. Inferred to 'id' ``` --- milli/src/update/index_documents/enrich.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 0d6a8dcbf..3331497c9 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -79,7 +79,10 @@ pub fn enrich_documents_batch( documents_batch_index.insert(DEFAULT_PRIMARY_KEY), ), [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), - [(field_id, name)] => PrimaryKey::flat(name, *field_id), + [(field_id, name)] => { + log::info!("Primary key was not specified in index. 
Inferred to '{name}'"); + PrimaryKey::flat(name, *field_id) + } multiple => { return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { candidates: multiple From 59431007542cfd7851c49cda9fd0bba504ce8322 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 20 Dec 2022 11:21:29 +0100 Subject: [PATCH 1877/1889] Fix existing tests --- milli/src/update/index_documents/mod.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 7b8408fe4..3656a3bc4 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1658,6 +1658,12 @@ mod tests { "branch_id_number": 0 }]}; + { + let mut wtxn = index.write_txn().unwrap(); + index.put_primary_key(&mut wtxn, "id").unwrap(); + wtxn.commit().unwrap(); + } + index.add_documents(doc1).unwrap(); index.add_documents(doc2).unwrap(); From 4b166bea2b7198fc0124bf85c06aeab4b6db01f3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 20 Dec 2022 11:21:42 +0100 Subject: [PATCH 1878/1889] Add primary_key_inference test --- milli/src/update/index_documents/mod.rs | 50 +++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3656a3bc4..9e55318ca 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1820,6 +1820,56 @@ mod tests { index.add_documents(doc4).unwrap_err(); } + #[test] + fn primary_key_inference() { + let index = TempIndex::new(); + + let doc_no_id = documents! {[{ + "title": "asdsad", + "state": "automated", + "priority": "normal", + "branch_id_number": 0 + }]}; + assert!(matches!( + index.add_documents(doc_no_id), + Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound)) + )); + + let doc_multiple_ids = documents! {[{ + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid": "39c6499b", + "project_id": 78207, + "branch_id_number": 0 + }]}; + + let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound { + candidates + })) = + index.add_documents(doc_multiple_ids) else { panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)") }; + + assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]); + + let doc_inferable = documents! 
{[{ + "video": "test.mp4", + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid_": "39c6499b", + "project_id_": 78207, + "branch_id_number": 0 + }]}; + + index.add_documents(doc_inferable).unwrap(); + + let txn = index.read_txn().unwrap(); + + assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id"); + } + #[test] fn long_words_must_be_skipped() { let index = TempIndex::new(); From b0f3dc2c06bc6bdb6c4d9ad3a53d8baf98048f55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 29 Nov 2022 13:49:12 +0100 Subject: [PATCH 1879/1889] Interpret synonyms as phrases --- milli/src/search/query_tree.rs | 45 +++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index e689ae440..7ac4fded4 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -339,18 +339,18 @@ fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { /// and create the list of operations for the query tree fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result>> { let synonyms = ctx.synonyms(word)?; - Ok(synonyms.map(|synonyms| { synonyms .into_iter() .map(|synonym| { - let words = synonym - .into_iter() - .map(|word| { - Operation::Query(Query { prefix: false, kind: QueryKind::exact(word) }) + if synonym.len() == 1 { + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(synonym[0].clone()), }) - .collect(); - Operation::and(words) + } else { + Operation::Phrase(synonym.into_iter().map(Some).collect()) + } }) .collect() })) @@ -1058,9 +1058,7 @@ mod test { AND OR Exact { word: "hi" } - AND - Exact { word: "good" } - Exact { word: "morning" } + PHRASE [Some("good"), Some("morning")] Tolerant { word: "hello", max typo: 1 } OR Exact { word: "earth" } @@ -1070,6 +1068,24 @@ mod test { "###); } + #[test] + fn simple_synonyms() { + let query = "nyc"; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::Last, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + PHRASE [Some("new"), Some("york")] + PHRASE [Some("new"), Some("york"), Some("city")] + PrefixExact { word: "nyc" } + "###); + } + #[test] fn complex_synonyms() { let query = "new york city "; @@ -1092,16 +1108,11 @@ mod test { AND OR Exact { word: "nyc" } - AND - Exact { word: "new" } - Exact { word: "york" } - Exact { word: "city" } + PHRASE [Some("new"), Some("york"), Some("city")] Tolerant { word: "newyork", max typo: 1 } Exact { word: "city" } Exact { word: "nyc" } - AND - Exact { word: "new" } - Exact { word: "york" } + PHRASE [Some("new"), Some("york")] Tolerant { word: "newyorkcity", max typo: 1 } "###); } From 777b387dc4227fca3c2052662ea6d19b0f1a009d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Dec 2022 11:53:21 +0100 Subject: [PATCH 1880/1889] Avoid a prefix-related worst-case scenario in the proximity criterion --- milli/src/search/criteria/mod.rs | 124 +++++++++++++++++----- milli/src/search/criteria/proximity.rs | 3 + milli/src/update/mod.rs | 5 +- milli/src/update/prefix_word_pairs/mod.rs | 26 +---- 4 files changed, 108 insertions(+), 50 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 98b9d928e..0a5bfd664 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -17,6 +17,7 @@ use 
super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use super::CriterionImplementationStrategy; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, Distinct, WordDerivationsCache}; +use crate::update::{MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB}; use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; mod asc_desc; @@ -653,14 +654,30 @@ fn query_pair_proximity_docids( match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix { - match word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? { - Some(docids) => Ok(docids), - None => { + // There are three distinct cases which we need to distinguish regarding the prefix `right`: + // + // 1. `right` is not in any prefix cache because it is not the prefix of many words + // (and thus, it doesn't have many word derivations) + // 2. `right` is in the prefix cache but cannot be found in the "word prefix pair proximity" databases either + // because it is too long or because the given proximity is too high. + // 3. `right` is in the prefix cache and can be found in the "word prefix pair proximity" databases + // + // The three cases are handled as follows: + // 1. We manually retrieve all the word derivations of `right` and check the `word_pair_proximity` + // database for each of them. + // 2. It would be too expensive to apply the same strategy as (1), therefore, we "disable" the + // proximity ranking rule for the prefixes of the right word. This is done as follows: + // 1. Only find the documents where left is in proximity to the exact (ie non-prefix) right word + // 2. Otherwise, assume that their proximity in all the documents in which they coexist is >= 8 + // + // 3. Query the prefix proximity databases. + match ( + ctx.in_prefix_cache(right), + right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB + && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + ) { + // Case 1: not in prefix cache + (false, _) => { let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_overall_proximity_docids( ctx, @@ -669,40 +686,91 @@ fn query_pair_proximity_docids( proximity, ) } + // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to + // query the prefix proximity databases. + (true, false) => { + // To "save" the relevancy a little bit, we still find the documents where the + // exact (i.e. non-prefix) right word is in the given proximity to the left word. + Ok(word_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()) + } + // Case 3: in prefix cache, short enough, and proximity is low enough + (true, true) => Ok(word_prefix_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()), } } else { - Ok(ctx - .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? - .unwrap_or_default()) + Ok(word_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()) } } (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. 
}) => { let l_words = word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - let current_docids = match word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? { - Some(docids) => Ok(docids), - None => { - let r_words = - word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids( + // The logic here is almost identical to the one in the previous match branch. + // The difference is that we fetch the docids for each derivation of the left word. + match ( + ctx.in_prefix_cache(right), + right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB + && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + ) { + // Case 1: not in prefix cache + (false, _) => { + let mut docids = RoaringBitmap::new(); + let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; + for (left, _) in l_words { + docids |= all_word_pair_overall_proximity_docids( ctx, &[(left, 0)], r_words, proximity, - ) + )?; } - }?; - docids |= current_docids; + Ok(docids) + } + // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to + // query the prefix proximity databases. + (true, false) => { + // To "save" the relevancy a little bit, we still find the documents where the + // exact (i.e. non-prefix) right word is in proximity to any derivation of the left word. + let mut candidates = RoaringBitmap::new(); + for (left, _) in l_words { + candidates |= ctx + .word_pair_proximity_docids(&left, right, proximity)? + .unwrap_or_default(); + } + Ok(candidates) + } + // Case 3: in prefix cache, short enough, and proximity is low enough + (true, true) => { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + docids |= word_prefix_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? 
+ .unwrap_or_default(); + } + Ok(docids) + } } - Ok(docids) } else { all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 2072d0133..20e540f7f 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -590,3 +590,6 @@ fn resolve_plane_sweep_candidates( Ok(candidates) } + +#[cfg(test)] +mod tests {} diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 2dda24172..948811a6b 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -7,7 +7,10 @@ pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; -pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids; +pub use self::prefix_word_pairs::{ + PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, + MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, +}; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 49874993c..bed542bdb 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -14,6 +14,9 @@ mod word_prefix; pub use prefix_word::index_prefix_word_database; pub use word_prefix::index_word_prefix_database; +pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; +pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; + pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, @@ -32,31 +35,12 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { Self { wtxn, index, - max_proximity: 4, - max_prefix_length: 2, + max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, chunk_compression_type, chunk_compression_level, } } - /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are too far from the threshold the associated documents will - /// not be part of the prefix database. - /// - /// Default value is 4. This value must be lower or equal than 7 and will be clamped - /// to this bound otherwise. - pub fn max_proximity(&mut self, value: u8) -> &mut Self { - self.max_proximity = value.max(7); - self - } - /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words - /// prefixes database. If the prefix length is higher than the threshold, the associated documents - /// will not be part of the prefix database. - /// - /// Default value is 2. 
- pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value; - self - } #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute<'a>( From f097aafa1c79ac52da56c717c4bc8a668772229d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 16:10:17 +0100 Subject: [PATCH 1881/1889] Add unit test for prefix handling by the proximity criterion --- milli/src/search/criteria/proximity.rs | 99 +++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 20e540f7f..160c4908b 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -592,4 +592,101 @@ fn resolve_plane_sweep_candidates( } #[cfg(test)] -mod tests {} +mod tests { + use std::io::Cursor; + + use big_s::S; + + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use crate::SearchResult; + + fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { + let mut documents = Vec::new(); + for prefix in prefixes { + for i in 0..500 { + documents.push( + serde_json::json!({ + "text": format!("{prefix}{i:x}"), + }) + .as_object() + .unwrap() + .clone(), + ) + } + } + documents + } + + #[test] + fn test_proximity_criterion_prefix_handling() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_primary_key(S("id")); + settings.set_criteria(vec![ + "words".to_owned(), + "typo".to_owned(), + "proximity".to_owned(), + ]); + }) + .unwrap(); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + for doc in [ + // 0 + serde_json::json!({ "text": "zero is exactly the amount of configuration I want" }), + // 1 + serde_json::json!({ "text": "zero bad configuration" }), + // 2 + serde_json::json!({ "text": "zero configuration" }), + // 3 + serde_json::json!({ "text": "zero config" }), + // 4 + serde_json::json!({ "text": "zero conf" }), + // 5 + serde_json::json!({ "text": "zero bad conf" }), + ] { + documents.append_json_object(doc.as_object().unwrap()).unwrap(); + } + for doc in documents_with_enough_different_words_for_prefixes(&["conf"]) { + documents.append_json_object(&doc).unwrap(); + } + let documents = + DocumentsBatchReader::from_reader(Cursor::new(documents.into_inner().unwrap())) + .unwrap(); + + index.add_documents(documents).unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero c").execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero co").execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero con").execute().unwrap(); + // Here search results are degraded because `con` is in the prefix cache but it is too + // long to be stored in the prefix proximity databases, and we don't want to iterate over + // all of its word derivations + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } =
index.search(&rtxn).query("zero conf").execute().unwrap(); + // Here search results are degraded as well, but we can still rank correctly documents + // that contain `conf` exactly, and not as a prefix. + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero config").execute().unwrap(); + // `config` is not a common prefix, so the normal methods are used + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]"); + } +} From 32c6062e65931a67541493c62a25c8ca2a7d5d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 3 Nov 2022 15:22:38 +0100 Subject: [PATCH 1882/1889] Optimise exactness criterion 1. Cache some results between calls to next() 2. Compute the combinations of exact words more efficiently --- milli/src/search/criteria/exactness.rs | 538 ++++++++++++++++++++++--- 1 file changed, 478 insertions(+), 60 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index b389a5d1e..29e8ce87a 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -1,10 +1,10 @@ +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; use std::convert::TryFrom; use std::mem::take; -use std::ops::BitOr; -use itertools::Itertools; use log::debug; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use crate::search::criteria::{ resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, @@ -20,6 +20,7 @@ pub struct Exactness<'t> { initial_candidates: InitialCandidates, parent: Box, query: Vec, + cache: Option, } impl<'t> Exactness<'t> { @@ -40,6 +41,7 @@ impl<'t> Exactness<'t> { initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, query, + cache: None, }) } } @@ -51,7 +53,6 @@ impl<'t> Criterion for Exactness<'t> { if let Some(state) = self.state.as_mut() { state.difference_with(params.excluded_candidates); } - loop { debug!("Exactness at state {:?}", self.state); @@ -60,9 +61,12 @@ impl<'t> Criterion for Exactness<'t> { // reset state self.state = None; self.query_tree = None; + // we don't need to reset the combinations cache since it only depends on + // the primitive query, which does not change } Some(state) => { - let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?; + let (candidates, state) = + resolve_state(self.ctx, take(state), &self.query, &mut self.cache)?; self.state = state; return Ok(Some(CriterionResult { @@ -166,12 +170,12 @@ impl Default for State { Self::Remainings(vec![]) } } - #[logging_timer::time("Exactness::{}")] fn resolve_state( ctx: &dyn Context, state: State, query: &[ExactQueryPart], + cache: &mut Option, ) -> Result<(RoaringBitmap, Option)> { use State::*; match state { @@ -186,6 +190,7 @@ fn resolve_state( let mut attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; attribute_candidates_array.push(attribute_allowed_docids); + candidates |= intersection_of(attribute_candidates_array.iter().collect()); } } @@ -212,59 +217,26 @@ fn resolve_state( allowed_candidates -= &candidates; Ok((candidates, Some(ExactWords(allowed_candidates)))) } - ExactWords(mut allowed_candidates) => { - let number_of_part = query.len(); - let mut parts_candidates_array = Vec::with_capacity(number_of_part); + ExactWords(allowed_candidates) => { + // Retrieve the cache if it already exist, otherwise 
create it. + let owned_cache = if let Some(cache) = cache.take() { + cache + } else { + compute_combinations(ctx, query)? + }; + // The cache contains the sets of documents which contain exactly 1,2,3,.. exact words + // from the query. It cannot be empty. All the candidates in it are disjoint. - for part in query { - let mut candidates = RoaringBitmap::new(); - use ExactQueryPart::*; - match part { - Synonyms(synonyms) => { - for synonym in synonyms { - if let Some(synonym_candidates) = ctx.word_docids(synonym)? { - candidates |= synonym_candidates; - } - } - } - // compute intersection on pair of words with a proximity of 0. - Phrase(phrase) => { - candidates |= resolve_phrase(ctx, phrase)?; - } - } - parts_candidates_array.push(candidates); + let mut candidates_array = owned_cache.combinations.clone(); + for candidates in candidates_array.iter_mut() { + *candidates &= &allowed_candidates; } + *cache = Some(owned_cache); - let mut candidates_array = Vec::new(); + let best_candidates = candidates_array.pop().unwrap(); - // compute documents that contain all exact words. - let mut all_exact_candidates = intersection_of(parts_candidates_array.iter().collect()); - all_exact_candidates &= &allowed_candidates; - allowed_candidates -= &all_exact_candidates; - - // push the result of combinations of exact words grouped by the number of exact words contained by documents. - for c_count in (1..number_of_part).rev() { - let mut combinations_candidates = parts_candidates_array - .iter() - // create all `c_count` combinations of exact words - .combinations(c_count) - // intersect each word candidates in combinations - .map(intersection_of) - // union combinations of `c_count` exact words - .fold(RoaringBitmap::new(), RoaringBitmap::bitor); - // only keep allowed candidates - combinations_candidates &= &allowed_candidates; - // remove current candidates from allowed candidates - allowed_candidates -= &combinations_candidates; - candidates_array.push(combinations_candidates); - } - - // push remainings allowed candidates as the worst valid candidates - candidates_array.push(allowed_candidates); - // reverse the array to be able to pop candidates from the best to the worst. - candidates_array.reverse(); - - Ok((all_exact_candidates, Some(Remainings(candidates_array)))) + candidates_array.insert(0, allowed_candidates); + Ok((best_candidates, Some(Remainings(candidates_array)))) } // pop the remaining candidates until they are exhausted Remainings(mut candidates_array) => { @@ -317,13 +289,10 @@ fn attribute_start_with_docids( Ok(attribute_candidates_array) } +#[inline(never)] fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap { rbs.sort_unstable_by_key(|rb| rb.len()); - let mut iter = rbs.into_iter(); - match iter.next() { - Some(first) => iter.fold(first.clone(), |acc, rb| acc & rb), - None => RoaringBitmap::new(), - } + roaring::MultiOps::intersection(rbs.into_iter()) } #[derive(Debug, Clone)] @@ -363,3 +332,452 @@ impl ExactQueryPart { Ok(part) } } + +struct ExactWordsCombinationCache { + // index 0 is only 1 word + combinations: Vec, +} + +fn compute_combinations( + ctx: &dyn Context, + query: &[ExactQueryPart], +) -> Result { + let number_of_part = query.len(); + let mut parts_candidates_array = Vec::with_capacity(number_of_part); + for part in query { + let mut candidates = RoaringBitmap::new(); + use ExactQueryPart::*; + match part { + Synonyms(synonyms) => { + for synonym in synonyms { + if let Some(synonym_candidates) = ctx.word_docids(synonym)?
{ + candidates |= synonym_candidates; + } + } + } + // compute intersection on pair of words with a proximity of 0. + Phrase(phrase) => { + candidates |= resolve_phrase(ctx, phrase)?; + } + } + parts_candidates_array.push(candidates); + } + let combinations = create_disjoint_combinations(parts_candidates_array); + + Ok(ExactWordsCombinationCache { combinations }) +} + +/// Given a list of bitmaps `b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn` +/// such that `Xi` contains all the elements that are contained in **at least** `i+1` bitmaps among `b0,b1,...,bn`. +/// +/// The returned vector is guaranteed to be of length `n`. It is equal to `vec![X0, X1, ..., Xn]`. +/// +/// ## Implementation +/// +/// We do so by iteratively building a map containing the union of all the different ways to intersect `J` bitmaps among `b0,b1,...,bn`. +/// - The key of the map is the index `i` of the last bitmap in the intersections +/// - The value is the union of all the possible intersections of J bitmaps such that the last bitmap in the intersection is `bi` +/// +/// For example, with the bitmaps `b0,b1,b2,b3`, this map should look like this +/// ```text +/// Map 0: (first iteration, contains all the combinations of 1 bitmap) +/// // What follows are unions of intersections of bitmaps associated with the index of their last component +/// 0: [b0] +/// 1: [b1] +/// 2: [b2] +/// 3: [b3] +/// Map 1: (second iteration, combinations of 2 bitmaps) +/// 1: [b0&b1] +/// 2: [b0&b2 | b1&b2] +/// 3: [b0&b3 | b1&b3 | b2&b3] +/// Map 2: (third iteration, combinations of 3 bitmaps) +/// 2: [b0&b1&b2] +/// 3: [b0&b2&b3 | b1&b2&b3] +/// Map 3: (fourth iteration, combinations of 4 bitmaps) +/// 3: [b0&b1&b2&b3] +/// ``` +/// +/// These maps are built one by one from the content of the preceding map. +/// For example, to create Map 2, we look at each line of Map 1, for example: +/// ```text +/// 2: [b0&b2 | b1&b2] +/// ``` +/// And then for each i > 2, we compute `(b0&b2 | b1&b2) & bi = b0&b2&bi | b1&b2&bi` +/// and then add it to the new map (Map 2) under the key `i` (if it is not empty): +/// ```text +/// 3: [b0&b2&b3 | b1&b2&b3] +/// 4: [b0&b2&b4 | b1&b2&b4] +/// 5: [b0&b2&b5 | b1&b2&b5] +/// etc. +/// ``` +/// We only keep two maps in memory at any one point. As soon as Map J is built, we flatten Map J-1 into +/// a single bitmap by taking the union of all of its values. This union gives us Xj-1. +/// +/// ## Memory Usage +/// This function is expected to be called on a maximum of 10 bitmaps. The worst case thus happens when +/// 10 identical large bitmaps are given. +/// +/// In the context of Meilisearch, let's imagine that we are given 10 bitmaps containing all +/// the document ids. If the dataset contains 16 million documents, then each bitmap will take +/// around 2MB of memory. +/// +/// When creating Map 3, we will have, in memory: +/// 1. The 10 original bitmaps (20MB) +/// 2. X0 : 2MB +/// 3. Map 1, containing 9 bitmaps: 18MB +/// 4. Map 2, containing 8 bitmaps: 16MB +/// 5. X1: 2MB +/// for a total of around 60MB of memory. This roughly represents the maximum memory usage of this function. +/// +/// ## Time complexity +/// Let N be the size of the given list of bitmaps and M the length of each individual bitmap. +/// +/// We need to create N new bitmaps. The most expensive one to create is the second one, where we need to +/// iterate over the N keys of Map 1, and for each of those keys `k_i`, we perform `N-k_i` bitmap unions. +/// Unioning two bitmaps is O(M), and we need to do it O(N^2) times.
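+/// For instance, with N = 4, Map 1 holds the keys 1, 2 and 3, so building Map 2 performs
+/// (4-1) + (4-2) + (4-3) = 6 such unions, and every later level performs fewer.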
+///
+/// Therefore the time complexity is O(N^3 * M).
+fn create_non_disjoint_combinations(bitmaps: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
+    let nbr_parts = bitmaps.len();
+    if nbr_parts == 1 {
+        let flattened_base_level = MultiOps::union(bitmaps.into_iter());
+        return vec![flattened_base_level];
+    }
+    let mut flattened_levels = vec![];
+    let mut last_level: BTreeMap<usize, RoaringBitmap> =
+        bitmaps.clone().into_iter().enumerate().collect();
+
+    for _ in 2..=nbr_parts {
+        let mut new_level = BTreeMap::new();
+        for (last_part_index, base_combination) in last_level.iter() {
+            #[allow(clippy::needless_range_loop)]
+            for new_last_part_index in last_part_index + 1..nbr_parts {
+                let new_combination = base_combination & &bitmaps[new_last_part_index];
+                if !new_combination.is_empty() {
+                    match new_level.entry(new_last_part_index) {
+                        Entry::Occupied(mut b) => {
+                            *b.get_mut() |= new_combination;
+                        }
+                        Entry::Vacant(entry) => {
+                            entry.insert(new_combination);
+                        }
+                    }
+                }
+            }
+        }
+        // Now flatten the last level to save memory
+        let flattened_last_level = MultiOps::union(last_level.values());
+        flattened_levels.push(flattened_last_level);
+        last_level = new_level;
+    }
+    // Flatten the last level
+    let flattened_last_level = MultiOps::union(last_level.values());
+    flattened_levels.push(flattened_last_level);
+    flattened_levels
+}
+
+/// Given a list of bitmaps `b0,b1,...,bn`, compute the list of bitmaps `X0,X1,...,Xn`
+/// such that `Xi` contains all the elements that are contained in **exactly** `i+1` bitmaps among `b0,b1,...,bn`.
+///
+/// The returned vector is guaranteed to have the same length as the given list of bitmaps.
+/// It is equal to `vec![X0, X1, ..., Xn]`.
+///
+/// ## Implementation
+/// 1. We first create `Y0,Y1,...Yn` such that `Yi` contains all the elements that are contained in
+/// **at least** `i+1` bitmaps among `b0,b1,...,bn`. This is done using `create_non_disjoint_combinations`.
+///
+/// 2. We create a set of "forbidden" elements, `Fn`, which is initialised to the empty set.
+///
+/// 3. We then iterate from `i = n` down to `i = 0`, computing:
+/// - `Xi = Yi - Fi`
+/// - `Fi-1 = Fi | Xi`
+fn create_disjoint_combinations(parts_candidates_array: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
+    let non_disjoint_combinations = create_non_disjoint_combinations(parts_candidates_array);
+
+    let mut disjoint_combinations = vec![];
+    let mut forbidden = RoaringBitmap::new();
+    for mut combination in non_disjoint_combinations.into_iter().rev() {
+        combination -= &forbidden;
+        forbidden |= &combination;
+        disjoint_combinations.push(combination)
+    }
+    disjoint_combinations.reverse();
+    disjoint_combinations
+}
+
+#[cfg(test)]
+mod tests {
+    use big_s::S;
+    use roaring::RoaringBitmap;
+
+    use crate::index::tests::TempIndex;
+    use crate::search::criteria::exactness::{
+        create_disjoint_combinations, create_non_disjoint_combinations,
+    };
+    use crate::snapshot_tests::display_bitmap;
+    use crate::SearchResult;
+
+    #[test]
+    fn test_exact_words_subcriterion() {
+        let index = TempIndex::new();
+
+        index
+            .update_settings(|settings| {
+                settings.set_primary_key(S("id"));
+                settings.set_criteria(vec!["exactness".to_owned()]);
+            })
+            .unwrap();
+
+        index
+            .add_documents(documents!([
+                // not relevant
+                { "id": "0", "text": "cat good dog bad" },
+                // 1 exact word
+                { "id": "1", "text": "they said: cats arebetter thandogs" },
+                // 3 exact words
+                { "id": "2", "text": "they said: cats arebetter than dogs" },
+                // 5 exact words
+                { "id": "3", "text": "they said: cats are better than dogs" },
+                // attribute starts with the exact words
+                { "id": "4", "text": "cats are better than dogs except on Saturday" },
+                // attribute equal to the exact words
+                { "id": "5", "text": "cats are better than dogs" },
+            ]))
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let SearchResult { matching_words: _, candidates: _, documents_ids } =
+            index.search(&rtxn).query("cats are better than dogs").execute().unwrap();
+
+        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4, 3, 2, 1]");
+    }
+
+    fn print_combinations(rbs: &[RoaringBitmap]) -> String {
+        let mut s = String::new();
+        for rb in rbs {
+            s.push_str(&format!("{}\n", &display_bitmap(rb)));
+        }
+        s
+    }
+
+    // In these unit tests, the test bitmaps always contain all the multiples of a certain number.
+    // This makes it easy to check the validity of the results of `create_disjoint_combinations` by
+    // counting the number of dividers of elements in the returned bitmaps.
+    fn assert_correct_combinations(combinations: &[RoaringBitmap], dividers: &[u32]) {
+        for (i, set) in combinations.iter().enumerate() {
+            let expected_nbr_dividers = i + 1;
+            for el in set {
+                let nbr_dividers = dividers.iter().map(|d| usize::from(el % d == 0)).sum::<usize>();
+                assert_eq!(
+                    nbr_dividers, expected_nbr_dividers,
+                    "{el} is divisible by {nbr_dividers} elements, not {expected_nbr_dividers}."
+ ); + } + } + } + + #[test] + fn compute_combinations_1() { + let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + + let parts_candidates = vec![b0]; + + let combinations = create_disjoint_combinations(parts_candidates); + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, ] + "###); + + assert_correct_combinations(&combinations, &[2]); + } + + #[test] + fn compute_combinations_2() { + let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); + + let parts_candidates = vec![b0, b1]; + + let combinations = create_disjoint_combinations(parts_candidates); + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [2, 3, 4, 8, 9, 10, 14, 15, 16, 20, 21, 22, 26, 27, 28, 32, 33, 34, 38, 39, 40, 44, 45, 46, 50, 51, 52, 56, 57, 58, 62, 63, 64, 68, 69, 70, 74, 75, 76, 80, 81, 82, 86, 87, 88, 92, 93, 94, 98, 99, 100, 104, 105, 106, 110, 111, 112, 116, 117, 118, 122, 123, 124, 128, 129, 130, 134, 135, 136, 140, 141, 142, 146, 147, 148, ] + [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, ] + "###); + } + + #[test] + fn compute_combinations_4() { + let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); + let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); + let b3: RoaringBitmap = (0..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect(); + + let parts_candidates = vec![b0, b1, b2, b3]; + + let combinations = create_disjoint_combinations(parts_candidates); + + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ] + [6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ] + [30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ] + [0, ] + "###); + + // But we also check it programmatically + assert_correct_combinations(&combinations, &[2, 3, 5, 7]); + } + #[test] + fn compute_combinations_4_with_empty_results_at_end() { + let b0: RoaringBitmap = (1..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + let b1: RoaringBitmap = (1..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); + let b2: RoaringBitmap = (1..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); + let b3: RoaringBitmap = (1..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect(); + + let parts_candidates = vec![b0, b1, b2, b3]; + + let combinations = create_disjoint_combinations(parts_candidates); + + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 
55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ]
+        [6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ]
+        [30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ]
+        []
+        "###);
+
+        // But we also check it programmatically
+        assert_correct_combinations(&combinations, &[2, 3, 5, 7]);
+    }
+
+    #[test]
+    fn compute_combinations_4_with_some_equal_bitmaps() {
+        let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
+        let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
+        let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect();
+        // b3 == b1
+        let b3: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
+
+        let parts_candidates = vec![b0, b1, b2, b3];
+
+        let combinations = create_disjoint_combinations(parts_candidates);
+
+        insta::assert_snapshot!(print_combinations(&combinations), @r###"
+        [2, 4, 5, 8, 14, 16, 22, 25, 26, 28, 32, 34, 35, 38, 44, 46, 52, 55, 56, 58, 62, 64, 65, 68, 74, 76, 82, 85, 86, 88, 92, 94, 95, 98, 104, 106, 112, 115, 116, 118, 122, 124, 125, 128, 134, 136, 142, 145, 146, 148, ]
+        [3, 9, 10, 20, 21, 27, 33, 39, 40, 50, 51, 57, 63, 69, 70, 80, 81, 87, 93, 99, 100, 110, 111, 117, 123, 129, 130, 140, 141, 147, ]
+        [6, 12, 15, 18, 24, 36, 42, 45, 48, 54, 66, 72, 75, 78, 84, 96, 102, 105, 108, 114, 126, 132, 135, 138, 144, ]
+        [0, 30, 60, 90, 120, ]
+        "###);
+
+        // But we also check it programmatically
+        assert_correct_combinations(&combinations, &[2, 3, 5, 3]);
+    }
+
+    #[test]
+    fn compute_combinations_10() {
+        let dividers = [2, 3, 5, 7, 11, 6, 15, 35, 18, 14];
+        let parts_candidates: Vec<RoaringBitmap> = dividers
+            .iter()
+            .map(|&divider| {
+                (0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 210).collect()
+            })
+            .collect();
+
+        let combinations = create_disjoint_combinations(parts_candidates);
+        insta::assert_snapshot!(print_combinations(&combinations), @r###"
+        [2, 3, 4, 5, 7, 8, 9, 11, 16, 25, 26, 27, 32, 34, 38, 39, 46, 49, 51, 52, 57, 58, 62, 64, 65, 68, 69, 74, 76, 81, 82, 85, 86, 87, 91, 92, 93, 94, 95, 104, 106, 111, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 143, 145, 146, 148, 152, 153, 155, 158, 159, 161, 164, 166, 171, 172, 177, 178, 183, 184, 185, 187, 188, 194, 201, 202, 203, 205, 206, 207, 208, 209, ]
+        [10, 20, 21, 22, 33, 40, 44, 50, 55, 63, 77, 80, 88, 99, 100, 130, 147, 160, 170, 176, 189, 190, 200, ]
+        [6, 12, 14, 15, 24, 28, 35, 45, 48, 56, 75, 78, 96, 98, 102, 110, 112, 114, 135, 138, 156, 174, 175, 182, 186, 192, 195, 196, 204, ]
+        [18, 36, 54, 66, 72, 108, 132, 144, 154, 162, 165, ]
+        [30, 42, 60, 70, 84, 105, 120, 140, 150, 168, 198, ]
+        [90, 126, 180, ]
+        []
+        [210, ]
+        []
+        [0, ]
+        "###);
+
+        assert_correct_combinations(&combinations, &dividers);
+    }
+
+    #[test]
+    fn compute_combinations_30() {
+        let dividers: [u32; 30] = [
+            1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,
+            5,
+        ];
+        let parts_candidates: Vec<RoaringBitmap> = dividers
+            .iter()
+            .map(|divider| {
+                (0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 100).collect()
+            })
+            .collect();
+
+        let combinations = create_non_disjoint_combinations(parts_candidates.clone());
+        insta::assert_snapshot!(print_combinations(&combinations),
@r###" + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 
87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
+        [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
+        [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
+        [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
+        [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
+        [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
+        [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
+        [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
+        [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
+        [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
+        [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
+        [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
+        [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
+        [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
+        [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
+        [0, 60, ]
+        [0, 60, ]
+        [0, 60, ]
+        [0, 60, ]
+        [0, 60, ]
+        [0, 60, ]
+        "###);
+
+        let combinations = create_disjoint_combinations(parts_candidates);
+        insta::assert_snapshot!(print_combinations(&combinations), @r###"
+        []
+        []
+        []
+        []
+        []
+        [1, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61, 67, 71, 73, 77, 79, 83, 89, 91, 97, ]
+        []
+        []
+        []
+        []
+        []
+        [2, 3, 5, 9, 14, 21, 22, 25, 26, 27, 33, 34, 35, 38, 39, 46, 51, 55, 57, 58, 62, 63, 65, 69, 74, 81, 82, 85, 86, 87, 93, 94, 95, 98, 99, ]
+        []
+        []
+        []
+        []
+        []
+        [4, 6, 8, 10, 15, 16, 18, 28, 32, 42, 44, 45, 50, 52, 54, 56, 64, 66, 68, 70, 75, 76, 78, 88, 92, ]
+        []
+        []
+        []
+        []
+        []
+        [12, 20, 24, 30, 36, 40, 48, 72, 80, 84, 90, 96, 100, ]
+        []
+        []
+        []
+        []
+        []
+        [0, 60, ]
+        "###);
+
+        assert_correct_combinations(&combinations, &dividers);
+    }
+}

From 939e7faf31248172908f35df1915b0c968e1fea0 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sun, 1 Jan 2023 10:02:00 +0000
Subject: [PATCH 1883/1889] Bump taiki-e/install-action from 1 to 2

Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 1 to 2.
- [Release notes](https://github.com/taiki-e/install-action/releases)
- [Changelog](https://github.com/taiki-e/install-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/taiki-e/install-action/compare/v1...v2)

---
updated-dependencies:
- dependency-name: taiki-e/install-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...
Signed-off-by: dependabot[bot] --- .github/workflows/manual_benchmarks.yml | 2 +- .github/workflows/push_benchmarks_indexing.yml | 2 +- .github/workflows/push_benchmarks_search_geo.yml | 2 +- .github/workflows/push_benchmarks_search_songs.yml | 2 +- .github/workflows/push_benchmarks_search_wiki.yml | 2 +- .github/workflows/update-cargo-toml-version.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml index 7adf05bcf..76c6fe0fe 100644 --- a/.github/workflows/manual_benchmarks.yml +++ b/.github/workflows/manual_benchmarks.yml @@ -50,7 +50,7 @@ jobs: # Generate critcmp files - name: Install critcmp - uses: taiki-e/install-action@v1 + uses: taiki-e/install-action@v2 with: tool: critcmp - name: Export cripcmp file diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml index 2ddfc8614..12f9f6eda 100644 --- a/.github/workflows/push_benchmarks_indexing.yml +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -48,7 +48,7 @@ jobs: # Generate critcmp files - name: Install critcmp - uses: taiki-e/install-action@v1 + uses: taiki-e/install-action@v2 with: tool: critcmp - name: Export cripcmp file diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml index 625b55ff1..02661061f 100644 --- a/.github/workflows/push_benchmarks_search_geo.yml +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -47,7 +47,7 @@ jobs: # Generate critcmp files - name: Install critcmp - uses: taiki-e/install-action@v1 + uses: taiki-e/install-action@v2 with: tool: critcmp - name: Export cripcmp file diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml index 5bed67152..92684a907 100644 --- a/.github/workflows/push_benchmarks_search_songs.yml +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -47,7 +47,7 @@ jobs: # Generate critcmp files - name: Install critcmp - uses: taiki-e/install-action@v1 + uses: taiki-e/install-action@v2 with: tool: critcmp - name: Export cripcmp file diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml index 69a58a56e..0f6511337 100644 --- a/.github/workflows/push_benchmarks_search_wiki.yml +++ b/.github/workflows/push_benchmarks_search_wiki.yml @@ -47,7 +47,7 @@ jobs: # Generate critcmp files - name: Install critcmp - uses: taiki-e/install-action@v1 + uses: taiki-e/install-action@v2 with: tool: critcmp - name: Export cripcmp file diff --git a/.github/workflows/update-cargo-toml-version.yml b/.github/workflows/update-cargo-toml-version.yml index 0854e265b..e1cf6cf92 100644 --- a/.github/workflows/update-cargo-toml-version.yml +++ b/.github/workflows/update-cargo-toml-version.yml @@ -24,7 +24,7 @@ jobs: toolchain: stable override: true - name: Install critcmp - uses: taiki-e/install-action@v1 + uses: taiki-e/install-action@v2 with: tool: sd - name: Update all Cargo.toml files From 8d36570958070e0bbf8f185691f6f20d5a125db9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 2 Jan 2023 10:37:01 +0100 Subject: [PATCH 1884/1889] Add explicit criterion impl strategy to proximity search tests --- milli/src/search/criteria/proximity.rs | 42 +++++++++++++++++++------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 160c4908b..1d86f4da1 100644 
--- a/milli/src/search/criteria/proximity.rs
+++ b/milli/src/search/criteria/proximity.rs
@@ -599,7 +599,7 @@ mod tests {

     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
     use crate::index::tests::TempIndex;
-    use crate::SearchResult;
+    use crate::{CriterionImplementationStrategy, SearchResult};

     fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
         let mut documents = Vec::new();
@@ -663,29 +663,49 @@ mod tests {

         let rtxn = index.read_txn().unwrap();

-        let SearchResult { matching_words: _, candidates: _, documents_ids } =
-            index.search(&rtxn).query("zero c").execute().unwrap();
+        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
+            .search(&rtxn)
+            .query("zero c")
+            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
+            .execute()
+            .unwrap();
         insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");

-        let SearchResult { matching_words: _, candidates: _, documents_ids } =
-            index.search(&rtxn).query("zero co").execute().unwrap();
+        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
+            .search(&rtxn)
+            .query("zero co")
+            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
+            .execute()
+            .unwrap();
         insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");

-        let SearchResult { matching_words: _, candidates: _, documents_ids } =
-            index.search(&rtxn).query("zero con").execute().unwrap();
+        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
+            .search(&rtxn)
+            .query("zero con")
+            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
+            .execute()
+            .unwrap();
         // Here search results are degraded because `con` is in the prefix cache but it is too
         // long to be stored in the prefix proximity databases, and we don't want to iterate over
         // all of its word derivations
         insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]");

-        let SearchResult { matching_words: _, candidates: _, documents_ids } =
-            index.search(&rtxn).query("zero conf").execute().unwrap();
+        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
+            .search(&rtxn)
+            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
+            .query("zero conf")
+            .execute()
+            .unwrap();
         // Here search results are degraded as well, but we can still rank correctly documents
         // that contain `conf` exactly, and not as a prefix.
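         // In the snapshot below, documents 4 and 5 (the ones that contain `conf` as a
         // whole word rather than only as a prefix) are therefore ranked first.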
         insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]");

-        let SearchResult { matching_words: _, candidates: _, documents_ids } =
-            index.search(&rtxn).query("zero config").execute().unwrap();
+        let SearchResult { matching_words: _, candidates: _, documents_ids } = index
+            .search(&rtxn)
+            .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
+            .query("zero config")
+            .execute()
+            .unwrap();
         // `config` is not a common prefix, so the normal methods are used
         insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]");
     }

From b5df889dcb3db58e6c06c27103f5c9cda7d02974 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Mon, 2 Jan 2023 10:46:35 +0100
Subject: [PATCH 1885/1889] Apply review suggestions: simplify implementation
 of exactness criterion

---
 milli/src/search/criteria/exactness.rs | 39 ++++++++------------------
 1 file changed, 11 insertions(+), 28 deletions(-)

diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs
index 29e8ce87a..d4a90576c 100644
--- a/milli/src/search/criteria/exactness.rs
+++ b/milli/src/search/criteria/exactness.rs
@@ -191,7 +191,7 @@ fn resolve_state(
                     attribute_start_with_docids(ctx, id, query)?;
                 attribute_candidates_array.push(attribute_allowed_docids);

-                candidates |= intersection_of(attribute_candidates_array.iter().collect());
+                candidates |= MultiOps::intersection(attribute_candidates_array);
             }
         }

@@ -208,7 +208,7 @@ fn resolve_state(
             let attributes_ids = ctx.searchable_fields_ids()?;
             for id in attributes_ids {
                 let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?;
-                candidates |= intersection_of(attribute_candidates_array.iter().collect());
+                candidates |= MultiOps::intersection(attribute_candidates_array);
             }

             // only keep allowed candidates
@@ -289,12 +289,6 @@ fn attribute_start_with_docids(
     Ok(attribute_candidates_array)
 }

-#[inline(never)]
-fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap {
-    rbs.sort_unstable_by_key(|rb| rb.len());
-    roaring::MultiOps::intersection(rbs.into_iter())
-}
-
 #[derive(Debug, Clone)]
 pub enum ExactQueryPart {
     Phrase(Vec<Option<String>>),
@@ -440,8 +434,7 @@ fn compute_combinations(
 fn create_non_disjoint_combinations(bitmaps: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
     let nbr_parts = bitmaps.len();
     if nbr_parts == 1 {
-        let flattened_base_level = MultiOps::union(bitmaps.into_iter());
-        return vec![flattened_base_level];
+        return bitmaps;
     }
     let mut flattened_levels = vec![];
     let mut last_level: BTreeMap<usize, RoaringBitmap> =
@@ -466,12 +459,12 @@ fn create_non_disjoint_combinations(bitmaps: Vec<RoaringBitmap>) -> Vec<RoaringBitmap>
             }
         }
         // Now flatten the last level to save memory
-        let flattened_last_level = MultiOps::union(last_level.values());
+        let flattened_last_level = MultiOps::union(last_level.into_values());
         flattened_levels.push(flattened_last_level);
         last_level = new_level;
     }
     // Flatten the last level
-    let flattened_last_level = MultiOps::union(last_level.values());
+    let flattened_last_level = MultiOps::union(last_level.into_values());
     flattened_levels.push(flattened_last_level);
     flattened_levels
 }
@@ -480,18 +473,14 @@ fn create_non_disjoint_combinations(bitmaps: Vec<RoaringBitmap>) -> Vec<RoaringBitmap>
 fn create_disjoint_combinations(parts_candidates_array: Vec<RoaringBitmap>) -> Vec<RoaringBitmap> {
     let non_disjoint_combinations = create_non_disjoint_combinations(parts_candidates_array);
     let mut disjoint_combinations = vec![];
-    let mut forbidden = RoaringBitmap::new();
-    for mut combination in non_disjoint_combinations.into_iter().rev() {
-        combination -= &forbidden;
-        forbidden |= &combination;
+    let mut combinations = non_disjoint_combinations.into_iter().peekable();
+    while let Some(mut combination) = combinations.next() {
+        if let Some(forbidden) = combinations.peek() {
+            combination -= forbidden;
+        }
         disjoint_combinations.push(combination)
     }
-    disjoint_combinations.reverse();
+
     disjoint_combinations
 }

From be9786bed9cde854a76ce4b56f1a3d29a44ff385 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Thu, 5 Jan 2023 10:40:09 +0100
Subject: [PATCH 1886/1889] Change primary key inference error messages

---
 milli/src/error.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/error.rs b/milli/src/error.rs
index 0b8649067..8734cb540 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -130,9 +130,9 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
     MissingDocumentId { primary_key: String, document: Object },
     #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())]
     TooManyDocumentIds { primary_key: String, document: Object },
-    #[error("The primary key inference process failed because the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
+    #[error("The primary key inference failed as the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")]
     NoPrimaryKeyCandidateFound,
-    #[error("The primary key inference process failed because the engine found {} fields ending with `id` in their name, such as '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())]
+    #[error("The primary key inference failed as the engine found {} fields ending with `id` in their names: '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())]
     MultiplePrimaryKeyCandidatesFound { candidates: Vec<String> },
     #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")]
     NoSpaceLeftOnDevice,

From 00746b32c0c28bd8f0dbb114b380fb10f6121d54 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Tue, 10 Jan 2023 09:46:28 +0100
Subject: [PATCH 1887/1889] Add Index::map_size

---
 milli/src/index.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 50a4e909f..46f8eb6a3 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -255,6 +255,16 @@ impl Index {
         Ok(self.env.real_disk_size()?)
     }

+    /// Returns the map size the underlying environment was opened with, in bytes.
+    ///
+    /// This value does not represent the current on-disk size of the index.
+    ///
+    /// This value is the maximum between the map size passed during the opening of the index
+    /// and the on-disk size of the index at the time of opening.
+    pub fn map_size(&self) -> Result<usize> {
+        Ok(self.env.map_size()?)
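+        // Illustrative example: an index opened with a 100 GiB map size reports
+        // 100 GiB here, even if only a few MiB are actually used on disk.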
+    }
+
     pub fn copy_to_path<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> {
         self.env.copy_to_path(path, option).map_err(Into::into)
     }

From 02fd06ea0bab560ada6491ed898c2b6cbe4cf065 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Wed, 11 Jan 2023 12:14:17 +0100
Subject: [PATCH 1888/1889] Integrate deserr

---
 benchmarks/benches/utils.rs            |  5 ++--
 cli/src/main.rs                        |  2 +-
 milli/Cargo.toml                       |  1 +
 milli/src/search/criteria/asc_desc.rs  |  6 ++--
 milli/src/search/criteria/exactness.rs |  4 +--
 milli/src/search/criteria/proximity.rs |  8 ++---
 milli/src/update/settings.rs           | 41 ++++++++++++++++++--------
 milli/tests/search/mod.rs              |  3 +-
 milli/tests/search/query_criteria.rs   |  4 +--
 9 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs
index 511b3b8d5..470d2030d 100644
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -4,6 +4,7 @@ use std::fs::{create_dir_all, remove_dir_all, File};
 use std::io::{self, BufRead, BufReader, Cursor, Read, Seek};
 use std::num::ParseFloatError;
 use std::path::Path;
+use std::str::FromStr;

 use criterion::BenchmarkId;
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
@@ -11,7 +12,7 @@ use milli::heed::EnvOpenOptions;
 use milli::update::{
     IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
 };
-use milli::{Filter, Index, Object, TermsMatchingStrategy};
+use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy};
 use serde_json::Value;

 pub struct Conf<'a> {
@@ -80,7 +81,7 @@ pub fn base_setup(conf: &Conf) -> Index {
         builder.reset_criteria();
         builder.reset_stop_words();

-        let criterion = criterion.iter().map(|s| s.to_string()).collect();
+        let criterion = criterion.iter().map(|s| Criterion::from_str(s).unwrap()).collect();
         builder.set_criteria(criterion);
     }

diff --git a/cli/src/main.rs b/cli/src/main.rs
index c28d3de59..09ee7f984 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -521,7 +521,7 @@ impl Performer for SettingsUpdate {
         if let Some(criteria) = self.criteria {
             if !criteria.is_empty() {
-                update.set_criteria(criteria);
+                update.set_criteria(criteria.iter().map(|c| c.parse()).collect::<Result<Vec<_>, _>>()?);
             } else {
                 update.reset_criteria();
             }

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 4e4fdc483..5bbd7a8ff 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -12,6 +12,7 @@ byteorder = "1.4.3"
 charabia = { version = "0.7.0", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.6"
+deserr = "0.1.4"
 either = "1.8.0"
 flatten-serde-json = { path = "../flatten-serde-json" }
 fst = "0.4.7"

diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs
index c0fdbada3..b5afe6778 100644
--- a/milli/src/search/criteria/asc_desc.rs
+++ b/milli/src/search/criteria/asc_desc.rs
@@ -343,7 +343,7 @@ mod tests {
     use maplit::hashset;

     use crate::index::tests::TempIndex;
-    use crate::{AscDesc, Filter, Search, SearchResult};
+    use crate::{AscDesc, Criterion, Filter, Search, SearchResult};

     // Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THRESHOLD
     // constant to 0 to ensure that the other sort algorithms are also correct.
@@ -356,7 +356,7 @@ mod tests {
                 settings.set_primary_key("id".to_owned());
                 settings
                     .set_sortable_fields(maplit::hashset! { S("id"), S("mod_10"), S("mod_20") });
{ S("id"), S("mod_10"), S("mod_20") }); - settings.set_criteria(vec!["sort".to_owned()]); + settings.set_criteria(vec![Criterion::Sort]); }) .unwrap(); @@ -443,7 +443,7 @@ mod tests { settings.set_primary_key("id".to_owned()); settings.set_filterable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); settings.set_sortable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); - settings.set_criteria(vec!["sort".to_owned()]); + settings.set_criteria(vec![Criterion::Sort]); }) .unwrap(); diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index d4a90576c..078a9cd6c 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -497,7 +497,7 @@ mod tests { create_disjoint_combinations, create_non_disjoint_combinations, }; use crate::snapshot_tests::display_bitmap; - use crate::SearchResult; + use crate::{Criterion, SearchResult}; #[test] fn test_exact_words_subcriterion() { @@ -506,7 +506,7 @@ mod tests { index .update_settings(|settings| { settings.set_primary_key(S("id")); - settings.set_criteria(vec!["exactness".to_owned()]); + settings.set_criteria(vec![Criterion::Exactness]); }) .unwrap(); diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 1d86f4da1..66e5c95bf 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -599,7 +599,7 @@ mod tests { use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; - use crate::{CriterionImplementationStrategy, SearchResult}; + use crate::{Criterion, CriterionImplementationStrategy, SearchResult}; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); @@ -627,9 +627,9 @@ mod tests { .update_settings(|settings| { settings.set_primary_key(S("id")); settings.set_criteria(vec![ - "words".to_owned(), - "typo".to_owned(), - "proximity".to_owned(), + Criterion::Words, + Criterion::Typo, + Criterion::Proximity, ]); }) .unwrap(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5f75910dc..f10bfe4e9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -2,6 +2,7 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::result::Result as StdResult; use charabia::{Tokenizer, TokenizerBuilder}; +use deserr::{DeserializeError, DeserializeFromValue}; use itertools::Itertools; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -22,6 +23,25 @@ pub enum Setting { NotSet, } +impl DeserializeFromValue for Setting +where + T: DeserializeFromValue, + E: DeserializeError, +{ + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> std::result::Result { + match value { + deserr::Value::Null => Ok(Setting::Reset), + _ => T::deserialize_from_value(value, location).map(Setting::Set), + } + } + fn default() -> Option { + Some(Self::NotSet) + } +} + impl Default for Setting { fn default() -> Self { Self::NotSet @@ -93,7 +113,7 @@ pub struct Settings<'a, 't, 'u, 'i> { displayed_fields: Setting>, filterable_fields: Setting>, sortable_fields: Setting>, - criteria: Setting>, + criteria: Setting>, stop_words: Setting>, distinct_field: Setting, synonyms: Setting>>, @@ -173,7 +193,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.criteria = Setting::Reset; } - pub fn set_criteria(&mut self, criteria: Vec) { + pub fn set_criteria(&mut self, criteria: Vec) { 
self.criteria = Setting::Set(criteria); } @@ -526,14 +546,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } fn update_criteria(&mut self) -> Result<()> { - match self.criteria { - Setting::Set(ref fields) => { - let mut new_criteria = Vec::new(); - for name in fields { - let criterion: Criterion = name.parse()?; - new_criteria.push(criterion); - } - self.index.put_criteria(self.wtxn, &new_criteria)?; + match &self.criteria { + Setting::Set(criteria) => { + self.index.put_criteria(self.wtxn, criteria)?; } Setting::Reset => { self.index.delete_criteria(self.wtxn)?; @@ -977,7 +992,7 @@ mod tests { index .update_settings(|settings| { settings.set_displayed_fields(vec![S("name")]); - settings.set_criteria(vec![S("age:asc")]); + settings.set_criteria(vec![Criterion::Asc("age".to_owned())]); }) .unwrap(); @@ -1246,7 +1261,7 @@ mod tests { .update_settings(|settings| { settings.set_displayed_fields(vec!["hello".to_string()]); settings.set_filterable_fields(hashset! { S("age"), S("toto") }); - settings.set_criteria(vec!["toto:asc".to_string()]); + settings.set_criteria(vec![Criterion::Asc(S("toto"))]); }) .unwrap(); @@ -1280,7 +1295,7 @@ mod tests { .update_settings(|settings| { settings.set_displayed_fields(vec!["hello".to_string()]); // It is only Asc(toto), there is a facet database but it is denied to filter with toto. - settings.set_criteria(vec!["toto:asc".to_string()]); + settings.set_criteria(vec![Criterion::Asc(S("toto"))]); }) .unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index d63df96ec..c2f8acd4d 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -38,8 +38,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = Settings::new(&mut wtxn, &index, &config); - let criteria = criteria.iter().map(|c| c.to_string()).collect(); - builder.set_criteria(criteria); + builder.set_criteria(criteria.to_vec()); builder.set_filterable_fields(hashset! 
{ S("tag"), S("asc_desc_rank"), diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index d4aa859a4..16058e941 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -344,7 +344,7 @@ fn criteria_mixup() { //update criteria let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_criteria(criteria.iter().map(ToString::to_string).collect()); + builder.set_criteria(criteria.clone()); builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); @@ -436,7 +436,7 @@ fn criteria_ascdesc() { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_criteria(vec![criterion.to_string()]); + builder.set_criteria(vec![criterion.clone()]); builder.execute(|_| (), || false).unwrap(); wtxn.commit().unwrap(); From 9e32ac7cb2f2dbc5e1eab82ca7c2964084c208a5 Mon Sep 17 00:00:00 2001 From: curquiza Date: Wed, 11 Jan 2023 15:05:06 +0000 Subject: [PATCH 1889/1889] Update version for the next release (v0.39.0) in Cargo.toml files --- benchmarks/Cargo.toml | 2 +- cli/Cargo.toml | 2 +- filter-parser/Cargo.toml | 2 +- flatten-serde-json/Cargo.toml | 2 +- json-depth-checker/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index be3544bc0..1cb63db4a 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "benchmarks" -version = "0.38.0" +version = "0.39.0" edition = "2018" publish = false diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 26c8ff681..7ecc3fa33 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.38.0" +version = "0.39.0" edition = "2018" description = "A CLI to interact with a milli index" publish = false diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml index 95a9b0062..9202c3875 100644 --- a/filter-parser/Cargo.toml +++ b/filter-parser/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "filter-parser" -version = "0.38.0" +version = "0.39.0" edition = "2021" description = "The parser for the Meilisearch filter syntax" publish = false diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml index 909c2702c..2fb668f86 100644 --- a/flatten-serde-json/Cargo.toml +++ b/flatten-serde-json/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "flatten-serde-json" -version = "0.38.0" +version = "0.39.0" edition = "2021" description = "Flatten serde-json objects like elastic search" readme = "README.md" diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml index 45b6b6ec9..feb245e5e 100644 --- a/json-depth-checker/Cargo.toml +++ b/json-depth-checker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "json-depth-checker" -version = "0.38.0" +version = "0.39.0" edition = "2021" description = "A library that indicates if a JSON must be flattened" publish = false diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 5bbd7a8ff..e23051b69 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.38.0" +version = "0.39.0" authors = ["Kerollmops "] edition = "2018"